From 4a798fa9e4017d2133e4cf7cc11db309af1ab89d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Thu, 11 Apr 2024 12:33:46 +0900 Subject: [PATCH 01/23] repr.rs --- crates/hstr/src/lib.rs | 1 + crates/hstr/src/repr/heap.rs | 1 + crates/hstr/src/repr/inline.rs | 1 + crates/hstr/src/repr/interned.rs | 1 + crates/hstr/src/repr/mod.rs | 5 +++++ 5 files changed, 9 insertions(+) create mode 100644 crates/hstr/src/repr/heap.rs create mode 100644 crates/hstr/src/repr/inline.rs create mode 100644 crates/hstr/src/repr/interned.rs create mode 100644 crates/hstr/src/repr/mod.rs diff --git a/crates/hstr/src/lib.rs b/crates/hstr/src/lib.rs index 9b9be33..73a8656 100644 --- a/crates/hstr/src/lib.rs +++ b/crates/hstr/src/lib.rs @@ -19,6 +19,7 @@ use crate::dynamic::Entry; mod dynamic; mod global_store; +mod repr; mod tagged_value; #[cfg(test)] mod tests; diff --git a/crates/hstr/src/repr/heap.rs b/crates/hstr/src/repr/heap.rs new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/crates/hstr/src/repr/heap.rs @@ -0,0 +1 @@ + diff --git a/crates/hstr/src/repr/inline.rs b/crates/hstr/src/repr/inline.rs new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/crates/hstr/src/repr/inline.rs @@ -0,0 +1 @@ + diff --git a/crates/hstr/src/repr/interned.rs b/crates/hstr/src/repr/interned.rs new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/crates/hstr/src/repr/interned.rs @@ -0,0 +1 @@ + diff --git a/crates/hstr/src/repr/mod.rs b/crates/hstr/src/repr/mod.rs new file mode 100644 index 0000000..778cf4a --- /dev/null +++ b/crates/hstr/src/repr/mod.rs @@ -0,0 +1,5 @@ +mod heap; +mod inline; +mod interned; + +pub(crate) struct Repr {} From 6ceaa4d3c46df2c58f29667898bda4cd603c1773 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Thu, 11 Apr 2024 12:34:40 +0900 Subject: [PATCH 02/23] fixup --- crates/hstr/src/repr/mod.rs | 1 + crates/hstr/src/repr/static_ref.rs | 1 + 2 files changed, 2 insertions(+) create mode 100644 crates/hstr/src/repr/static_ref.rs diff --git a/crates/hstr/src/repr/mod.rs b/crates/hstr/src/repr/mod.rs index 778cf4a..bf70001 100644 --- a/crates/hstr/src/repr/mod.rs +++ b/crates/hstr/src/repr/mod.rs @@ -1,5 +1,6 @@ mod heap; mod inline; mod interned; +mod static_ref; pub(crate) struct Repr {} diff --git a/crates/hstr/src/repr/static_ref.rs b/crates/hstr/src/repr/static_ref.rs new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/crates/hstr/src/repr/static_ref.rs @@ -0,0 +1 @@ + From 70f03dcae6df8fe1bf4f7da28731fc345deab278 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Thu, 11 Apr 2024 12:41:27 +0900 Subject: [PATCH 03/23] Dep --- crates/hstr/Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/hstr/Cargo.toml b/crates/hstr/Cargo.toml index 3ae635e..06780aa 100644 --- a/crates/hstr/Cargo.toml +++ b/crates/hstr/Cargo.toml @@ -28,6 +28,7 @@ phf = "0.11.2" rkyv = { version = "0.7.42", optional = true } rustc-hash = "1.1.0" serde = { version = "1.0.192", optional = true } +static_assertions = "1.1.0" triomphe = "0.1.11" [dev-dependencies] From f8067e18cd63a9f66d886e51284a6ee9d2eee5b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Thu, 11 Apr 2024 12:41:30 +0900 Subject: [PATCH 04/23] lockfile --- Cargo.lock | 1 + 1 file changed, 1 insertion(+) diff --git a/Cargo.lock b/Cargo.lock index 5982ed7..297653b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -386,6 +386,7 @@ dependencies = [ "serde", "smartstring", "smol_str", + "static_assertions", "string_cache", "triomphe", ] From 4bf063c8b9d69988204d86ea6a9be55a025305ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Thu, 11 Apr 2024 12:41:34 +0900 Subject: [PATCH 05/23] Repr --- crates/hstr/src/repr/mod.rs | 21 ++- crates/hstr/src/repr/nonmax_u8.rs | 271 ++++++++++++++++++++++++++++++ 2 files changed, 291 insertions(+), 1 deletion(-) create mode 100644 crates/hstr/src/repr/nonmax_u8.rs diff --git a/crates/hstr/src/repr/mod.rs b/crates/hstr/src/repr/mod.rs index bf70001..d3c88f4 100644 --- a/crates/hstr/src/repr/mod.rs +++ b/crates/hstr/src/repr/mod.rs @@ -1,6 +1,25 @@ +use self::nonmax_u8::NonMaxU8; + mod heap; mod inline; mod interned; +mod nonmax_u8; mod static_ref; -pub(crate) struct Repr {} +#[repr(C)] +pub struct Repr( + // We have a pointer in the repesentation to properly carry provenance + *const (), + // Then we need one `usize` (aka WORDs) of data + // ...but we breakup into multiple pieces... + #[cfg(target_pointer_width = "64")] u32, + u16, + u8, + // ...so that the last byte can be a NonMax, which allows the compiler to see a niche value + NonMaxU8, +); + +unsafe impl Send for Repr {} +unsafe impl Sync for Repr {} + +static_assertions::assert_eq_size!(Repr, [usize; 2]); diff --git a/crates/hstr/src/repr/nonmax_u8.rs b/crates/hstr/src/repr/nonmax_u8.rs new file mode 100644 index 0000000..2f95153 --- /dev/null +++ b/crates/hstr/src/repr/nonmax_u8.rs @@ -0,0 +1,271 @@ +/// [`NonMaxU8`] is an unsigned 8-bit integer data type that has a valid range +/// of `[0, 254]`. Excluding `255` allows the Rust compiler to use `255` as a +/// niche. +/// +/// Specifically the compiler can use `255` to encode the `None` variant of +/// `Option` allowing `std::mem::size_of:: == +/// std::mem::size_of::>()` +#[allow(clippy::upper_case_acronyms)] +#[allow(dead_code)] +#[allow(non_camel_case_types)] +#[derive(Copy, Clone, Debug)] +#[repr(u8)] +pub enum NonMaxU8 { + V0 = 0, + V1 = 1, + V2 = 2, + V3 = 3, + V4 = 4, + V5 = 5, + V6 = 6, + V7 = 7, + V8 = 8, + V9 = 9, + V10 = 10, + V11 = 11, + V12 = 12, + V13 = 13, + V14 = 14, + V15 = 15, + V16 = 16, + V17 = 17, + V18 = 18, + V19 = 19, + V20 = 20, + V21 = 21, + V22 = 22, + V23 = 23, + V24 = 24, + V25 = 25, + V26 = 26, + V27 = 27, + V28 = 28, + V29 = 29, + V30 = 30, + V31 = 31, + V32 = 32, + V33 = 33, + V34 = 34, + V35 = 35, + V36 = 36, + V37 = 37, + V38 = 38, + V39 = 39, + V40 = 40, + V41 = 41, + V42 = 42, + V43 = 43, + V44 = 44, + V45 = 45, + V46 = 46, + V47 = 47, + V48 = 48, + V49 = 49, + V50 = 50, + V51 = 51, + V52 = 52, + V53 = 53, + V54 = 54, + V55 = 55, + V56 = 56, + V57 = 57, + V58 = 58, + V59 = 59, + V60 = 60, + V61 = 61, + V62 = 62, + V63 = 63, + V64 = 64, + V65 = 65, + V66 = 66, + V67 = 67, + V68 = 68, + V69 = 69, + V70 = 70, + V71 = 71, + V72 = 72, + V73 = 73, + V74 = 74, + V75 = 75, + V76 = 76, + V77 = 77, + V78 = 78, + V79 = 79, + V80 = 80, + V81 = 81, + V82 = 82, + V83 = 83, + V84 = 84, + V85 = 85, + V86 = 86, + V87 = 87, + V88 = 88, + V89 = 89, + V90 = 90, + V91 = 91, + V92 = 92, + V93 = 93, + V94 = 94, + V95 = 95, + V96 = 96, + V97 = 97, + V98 = 98, + V99 = 99, + V100 = 100, + V101 = 101, + V102 = 102, + V103 = 103, + V104 = 104, + V105 = 105, + V106 = 106, + V107 = 107, + V108 = 108, + V109 = 109, + V110 = 110, + V111 = 111, + V112 = 112, + V113 = 113, + V114 = 114, + V115 = 115, + V116 = 116, + V117 = 117, + V118 = 118, + V119 = 119, + V120 = 120, + V121 = 121, + V122 = 122, + V123 = 123, + V124 = 124, + V125 = 125, + V126 = 126, + V127 = 127, + V128 = 128, + V129 = 129, + V130 = 130, + V131 = 131, + V132 = 132, + V133 = 133, + V134 = 134, + V135 = 135, + V136 = 136, + V137 = 137, + V138 = 138, + V139 = 139, + V140 = 140, + V141 = 141, + V142 = 142, + V143 = 143, + V144 = 144, + V145 = 145, + V146 = 146, + V147 = 147, + V148 = 148, + V149 = 149, + V150 = 150, + V151 = 151, + V152 = 152, + V153 = 153, + V154 = 154, + V155 = 155, + V156 = 156, + V157 = 157, + V158 = 158, + V159 = 159, + V160 = 160, + V161 = 161, + V162 = 162, + V163 = 163, + V164 = 164, + V165 = 165, + V166 = 166, + V167 = 167, + V168 = 168, + V169 = 169, + V170 = 170, + V171 = 171, + V172 = 172, + V173 = 173, + V174 = 174, + V175 = 175, + V176 = 176, + V177 = 177, + V178 = 178, + V179 = 179, + V180 = 180, + V181 = 181, + V182 = 182, + V183 = 183, + V184 = 184, + V185 = 185, + V186 = 186, + V187 = 187, + V188 = 188, + V189 = 189, + V190 = 190, + V191 = 191, + V192 = 192, + V193 = 193, + V194 = 194, + V195 = 195, + V196 = 196, + V197 = 197, + V198 = 198, + V199 = 199, + V200 = 200, + V201 = 201, + V202 = 202, + V203 = 203, + V204 = 204, + V205 = 205, + V206 = 206, + V207 = 207, + V208 = 208, + V209 = 209, + V210 = 210, + V211 = 211, + V212 = 212, + V213 = 213, + V214 = 214, + V215 = 215, + V216 = 216, + V217 = 217, + V218 = 218, + V219 = 219, + V220 = 220, + V221 = 221, + V222 = 222, + V223 = 223, + V224 = 224, + V225 = 225, + V226 = 226, + V227 = 227, + V228 = 228, + V229 = 229, + V230 = 230, + V231 = 231, + V232 = 232, + V233 = 233, + V234 = 234, + V235 = 235, + V236 = 236, + V237 = 237, + V238 = 238, + V239 = 239, + V240 = 240, + V241 = 241, + V242 = 242, + V243 = 243, + V244 = 244, + V245 = 245, + V246 = 246, + V247 = 247, + V248 = 248, + V249 = 249, + V250 = 250, + V251 = 251, + V252 = 252, + V253 = 253, + V254 = 254, +} + +static_assertions::assert_eq_size!(NonMaxU8, Option, u8); From a479f60ed84d6c3c59a0734e0959b73cea99fc73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Thu, 11 Apr 2024 12:42:48 +0900 Subject: [PATCH 06/23] size --- crates/hstr/src/repr/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/hstr/src/repr/mod.rs b/crates/hstr/src/repr/mod.rs index d3c88f4..2c24407 100644 --- a/crates/hstr/src/repr/mod.rs +++ b/crates/hstr/src/repr/mod.rs @@ -22,4 +22,4 @@ pub struct Repr( unsafe impl Send for Repr {} unsafe impl Sync for Repr {} -static_assertions::assert_eq_size!(Repr, [usize; 2]); +static_assertions::assert_eq_size!(Repr, Option, [usize; 2]); From 49e45f252c3f286dffc2c25c6e63d55fcfc844d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Thu, 11 Apr 2024 12:54:36 +0900 Subject: [PATCH 07/23] kind --- crates/hstr/src/repr/mod.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/crates/hstr/src/repr/mod.rs b/crates/hstr/src/repr/mod.rs index 2c24407..608e827 100644 --- a/crates/hstr/src/repr/mod.rs +++ b/crates/hstr/src/repr/mod.rs @@ -22,4 +22,21 @@ pub struct Repr( unsafe impl Send for Repr {} unsafe impl Sync for Repr {} +const KIND_INLINED: u8 = 0b00; +const KIND_INTERNED: u8 = 0b01; +const KIND_HEAP: u8 = 0b10; +const KIND_STATIC: u8 = 0b11; +const KIND_MASK: u8 = 0b11; + +impl Repr { + #[inline] + pub fn new_static(text: &'static str) -> Self {} + + #[inline] + pub fn new_dynamic(text: &str) -> Self {} + + #[inline] + pub fn new_interned(text: &str) -> Self {} +} + static_assertions::assert_eq_size!(Repr, Option, [usize; 2]); From 54cf57deea2ed0d535e326b151d92585cc6c748b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Thu, 11 Apr 2024 13:01:32 +0900 Subject: [PATCH 08/23] Dep --- crates/hstr/Cargo.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/hstr/Cargo.toml b/crates/hstr/Cargo.toml index 06780aa..a3fce52 100644 --- a/crates/hstr/Cargo.toml +++ b/crates/hstr/Cargo.toml @@ -21,6 +21,7 @@ atom_size_64 = [] atom_size_128 = [] [dependencies] +cfg-if = "1.0.0" hashbrown = { version = "0.14.3", default-features = false } new_debug_unreachable = "1.0.4" once_cell = "1.18.0" From bc608519661c63257b062ac4c6c3b917dd3f349f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Thu, 11 Apr 2024 13:01:35 +0900 Subject: [PATCH 09/23] lockfile --- Cargo.lock | 1 + 1 file changed, 1 insertion(+) diff --git a/Cargo.lock b/Cargo.lock index 297653b..484f768 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -370,6 +370,7 @@ checksum = "d77f7ec81a6d05a3abb01ab6eb7590f6083d08449fe5a1c8b1e620283546ccb7" name = "hstr" version = "0.2.9" dependencies = [ + "cfg-if", "compact_str", "criterion", "dudy-malloc", From 43c907389378fe9d81d7d04cbf5715bf4fa60c19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Thu, 11 Apr 2024 13:01:37 +0900 Subject: [PATCH 10/23] API --- crates/hstr/src/repr/mod.rs | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/crates/hstr/src/repr/mod.rs b/crates/hstr/src/repr/mod.rs index 608e827..a46f936 100644 --- a/crates/hstr/src/repr/mod.rs +++ b/crates/hstr/src/repr/mod.rs @@ -9,7 +9,7 @@ mod static_ref; #[repr(C)] pub struct Repr( // We have a pointer in the repesentation to properly carry provenance - *const (), + *const u8, // Then we need one `usize` (aka WORDs) of data // ...but we breakup into multiple pieces... #[cfg(target_pointer_width = "64")] u32, @@ -37,6 +37,28 @@ impl Repr { #[inline] pub fn new_interned(text: &str) -> Self {} + + fn len(&self) -> usize {} + + fn as_str(&self) -> &str {} + + #[inline] + fn kind(&self) -> u8 { + self.last_byte() & KIND_MASK + } + + fn last_byte(&self) -> u8 { + cfg_if::cfg_if! { + if #[cfg(target_pointer_width = "64")] { + let last_byte = self.4; + } else if #[cfg(target_pointer_width = "32")] { + let last_byte = self.3; + } else { + compile_error!("Unsupported target_pointer_width"); + } + }; + last_byte as u8 + } } static_assertions::assert_eq_size!(Repr, Option, [usize; 2]); From c2ab1b33ad73a558b549202dcfb7928fcc6b1985 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Thu, 11 Apr 2024 13:07:17 +0900 Subject: [PATCH 11/23] nonmax --- crates/hstr/src/repr/mod.rs | 23 +++------------ .../hstr/src/repr/{nonmax_u8.rs => nonmax.rs} | 28 +++++++++++++++++++ crates/hstr/src/repr/static_ref.rs | 5 +++- 3 files changed, 36 insertions(+), 20 deletions(-) rename crates/hstr/src/repr/{nonmax_u8.rs => nonmax.rs} (84%) diff --git a/crates/hstr/src/repr/mod.rs b/crates/hstr/src/repr/mod.rs index a46f936..d25c2f7 100644 --- a/crates/hstr/src/repr/mod.rs +++ b/crates/hstr/src/repr/mod.rs @@ -1,22 +1,16 @@ -use self::nonmax_u8::NonMaxU8; +use self::nonmax::NonMaxUsize; mod heap; mod inline; mod interned; -mod nonmax_u8; +mod nonmax; mod static_ref; #[repr(C)] pub struct Repr( // We have a pointer in the repesentation to properly carry provenance *const u8, - // Then we need one `usize` (aka WORDs) of data - // ...but we breakup into multiple pieces... - #[cfg(target_pointer_width = "64")] u32, - u16, - u8, - // ...so that the last byte can be a NonMax, which allows the compiler to see a niche value - NonMaxU8, + NonMaxUsize, ); unsafe impl Send for Repr {} @@ -48,16 +42,7 @@ impl Repr { } fn last_byte(&self) -> u8 { - cfg_if::cfg_if! { - if #[cfg(target_pointer_width = "64")] { - let last_byte = self.4; - } else if #[cfg(target_pointer_width = "32")] { - let last_byte = self.3; - } else { - compile_error!("Unsupported target_pointer_width"); - } - }; - last_byte as u8 + self.1.last_byte() } } diff --git a/crates/hstr/src/repr/nonmax_u8.rs b/crates/hstr/src/repr/nonmax.rs similarity index 84% rename from crates/hstr/src/repr/nonmax_u8.rs rename to crates/hstr/src/repr/nonmax.rs index 2f95153..f5874a0 100644 --- a/crates/hstr/src/repr/nonmax_u8.rs +++ b/crates/hstr/src/repr/nonmax.rs @@ -1,3 +1,31 @@ +#[repr(C)] +pub struct NonMaxUsize( + // Then we need one `usize` (aka WORDs) of data + // ...but we breakup into multiple pieces... + #[cfg(target_pointer_width = "64")] u32, + u16, + u8, + // ...so that the last byte can be a NonMax, which allows the compiler to see a niche value + NonMaxU8, +); + +static_assertions::assert_eq_size!(NonMaxUsize, Option, usize); + +impl NonMaxUsize { + pub const fn last_byte(self) -> u8 { + cfg_if::cfg_if! { + if #[cfg(target_pointer_width = "64")] { + let last_byte = self.3; + } else if #[cfg(target_pointer_width = "32")] { + let last_byte = self.2; + } else { + compile_error!("Unsupported target_pointer_width"); + } + }; + last_byte as u8 + } +} + /// [`NonMaxU8`] is an unsigned 8-bit integer data type that has a valid range /// of `[0, 254]`. Excluding `255` allows the Rust compiler to use `255` as a /// niche. diff --git a/crates/hstr/src/repr/static_ref.rs b/crates/hstr/src/repr/static_ref.rs index 8b13789..36abad4 100644 --- a/crates/hstr/src/repr/static_ref.rs +++ b/crates/hstr/src/repr/static_ref.rs @@ -1 +1,4 @@ - +pub(super) struct StaticStr { + ptr: *const u8, + len: u16, +} From 9f6338a16557b7dbff0507713d28f0a0478d5dd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Thu, 11 Apr 2024 13:16:19 +0900 Subject: [PATCH 12/23] StaticStr --- crates/hstr/src/repr/nonmax.rs | 12 ++++++++++++ crates/hstr/src/repr/static_ref.rs | 21 ++++++++++++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/crates/hstr/src/repr/nonmax.rs b/crates/hstr/src/repr/nonmax.rs index f5874a0..328f907 100644 --- a/crates/hstr/src/repr/nonmax.rs +++ b/crates/hstr/src/repr/nonmax.rs @@ -1,3 +1,5 @@ +use std::mem::transmute; + #[repr(C)] pub struct NonMaxUsize( // Then we need one `usize` (aka WORDs) of data @@ -12,6 +14,16 @@ pub struct NonMaxUsize( static_assertions::assert_eq_size!(NonMaxUsize, Option, usize); impl NonMaxUsize { + pub fn new(value: usize) -> Self { + debug_assert_ne!( + value, + usize::MAX, + "NonMaxUsize::new(usize::MAX) is not allowed" + ); + + unsafe { transmute(value) } + } + pub const fn last_byte(self) -> u8 { cfg_if::cfg_if! { if #[cfg(target_pointer_width = "64")] { diff --git a/crates/hstr/src/repr/static_ref.rs b/crates/hstr/src/repr/static_ref.rs index 36abad4..21d1fc5 100644 --- a/crates/hstr/src/repr/static_ref.rs +++ b/crates/hstr/src/repr/static_ref.rs @@ -1,4 +1,23 @@ +use super::{nonmax::NonMaxUsize, Repr, KIND_STATIC}; + pub(super) struct StaticStr { ptr: *const u8, - len: u16, + /// We use the last two bits to store the kind of the string. + len: NonMaxUsize, +} + +static_assertions::assert_eq_size!(Repr, StaticStr); + +impl StaticStr { + pub fn new(text: &'static str) -> Self { + // Shift length to the right by 2 bits and store the kind in the last two + // bits. + + let len = NonMaxUsize::new(text.len() << 2 | (KIND_STATIC as usize)); + + Self { + ptr: text.as_ptr(), + len, + } + } } From 7f683d3ab62fa88932faf27dbae86451146787c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Thu, 11 Apr 2024 13:17:42 +0900 Subject: [PATCH 13/23] new_static --- crates/hstr/src/repr/mod.rs | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/crates/hstr/src/repr/mod.rs b/crates/hstr/src/repr/mod.rs index d25c2f7..cfc34e5 100644 --- a/crates/hstr/src/repr/mod.rs +++ b/crates/hstr/src/repr/mod.rs @@ -1,4 +1,4 @@ -use self::nonmax::NonMaxUsize; +use self::{nonmax::NonMaxUsize, static_ref::StaticStr}; mod heap; mod inline; @@ -24,7 +24,14 @@ const KIND_MASK: u8 = 0b11; impl Repr { #[inline] - pub fn new_static(text: &'static str) -> Self {} + pub fn new_static(text: &'static str) -> Self { + let repr = StaticStr::new(text); + let repr = unsafe { std::mem::transmute::(repr) }; + + debug_assert_eq!(repr.kind(), KIND_STATIC); + + repr + } #[inline] pub fn new_dynamic(text: &str) -> Self {} From e914002003c9f07a14e6ff8bec85723ae5d6231a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Thu, 11 Apr 2024 13:19:47 +0900 Subject: [PATCH 14/23] more --- crates/hstr/src/repr/mod.rs | 15 ++++++++++++++- crates/hstr/src/repr/nonmax.rs | 4 ++++ crates/hstr/src/repr/static_ref.rs | 8 ++++++++ 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/crates/hstr/src/repr/mod.rs b/crates/hstr/src/repr/mod.rs index cfc34e5..c2869d5 100644 --- a/crates/hstr/src/repr/mod.rs +++ b/crates/hstr/src/repr/mod.rs @@ -1,3 +1,5 @@ +use debug_unreachable::debug_unreachable; + use self::{nonmax::NonMaxUsize, static_ref::StaticStr}; mod heap; @@ -39,7 +41,18 @@ impl Repr { #[inline] pub fn new_interned(text: &str) -> Self {} - fn len(&self) -> usize {} + fn len(&self) -> usize { + match self.kind() { + KIND_INLINED => {} + KIND_HEAP => {} + KIND_STATIC => { + let repr = unsafe { std::mem::transmute::(*self) }; + repr.len() + } + KIND_INTERNED => {} + _ => unsafe { debug_unreachable!("Invalid kind in Repr::len()") }, + } + } fn as_str(&self) -> &str {} diff --git a/crates/hstr/src/repr/nonmax.rs b/crates/hstr/src/repr/nonmax.rs index 328f907..a7f975f 100644 --- a/crates/hstr/src/repr/nonmax.rs +++ b/crates/hstr/src/repr/nonmax.rs @@ -24,6 +24,10 @@ impl NonMaxUsize { unsafe { transmute(value) } } + pub fn as_usize(self) -> usize { + unsafe { transmute(self) } + } + pub const fn last_byte(self) -> u8 { cfg_if::cfg_if! { if #[cfg(target_pointer_width = "64")] { diff --git a/crates/hstr/src/repr/static_ref.rs b/crates/hstr/src/repr/static_ref.rs index 21d1fc5..2e0949a 100644 --- a/crates/hstr/src/repr/static_ref.rs +++ b/crates/hstr/src/repr/static_ref.rs @@ -20,4 +20,12 @@ impl StaticStr { len, } } + + pub fn len(&self) -> usize { + self.len.as_usize() >> 2 + } + + pub fn as_str(&self) -> &str { + unsafe { std::str::from_utf8_unchecked(std::slice::from_raw_parts(self.ptr, self.len())) } + } } From d36d20a5b659d98ee6eb22c49e2faa792a88026e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Thu, 11 Apr 2024 13:24:44 +0900 Subject: [PATCH 15/23] static str --- crates/hstr/src/repr/inline.rs | 2 +- crates/hstr/src/repr/static_ref.rs | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/crates/hstr/src/repr/inline.rs b/crates/hstr/src/repr/inline.rs index 8b13789..785b179 100644 --- a/crates/hstr/src/repr/inline.rs +++ b/crates/hstr/src/repr/inline.rs @@ -1 +1 @@ - +pub struct InlineBuffer {} diff --git a/crates/hstr/src/repr/static_ref.rs b/crates/hstr/src/repr/static_ref.rs index 2e0949a..1e0d22e 100644 --- a/crates/hstr/src/repr/static_ref.rs +++ b/crates/hstr/src/repr/static_ref.rs @@ -1,5 +1,6 @@ use super::{nonmax::NonMaxUsize, Repr, KIND_STATIC}; +#[repr(C)] pub(super) struct StaticStr { ptr: *const u8, /// We use the last two bits to store the kind of the string. @@ -8,11 +9,15 @@ pub(super) struct StaticStr { static_assertions::assert_eq_size!(Repr, StaticStr); +const MAX_LEN: usize = (usize::MAX >> 2) - 1; + impl StaticStr { - pub fn new(text: &'static str) -> Self { + // Safety: `text.len()` must be less than `usize::MAX >> 2 - 1`. + pub unsafe fn new(text: &'static str) -> Self { // Shift length to the right by 2 bits and store the kind in the last two // bits. + debug_assert!(text.len() < MAX_LEN); let len = NonMaxUsize::new(text.len() << 2 | (KIND_STATIC as usize)); Self { From 8add6099d89b80dc3db575afd7a85e3d65d59b64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Thu, 11 Apr 2024 13:40:06 +0900 Subject: [PATCH 16/23] more work for dynamic --- crates/hstr/src/repr/heap.rs | 6 +++++ crates/hstr/src/repr/inline.rs | 17 +++++++++++++- crates/hstr/src/repr/mod.rs | 43 ++++++++++++++++++++++++++++++---- 3 files changed, 61 insertions(+), 5 deletions(-) diff --git a/crates/hstr/src/repr/heap.rs b/crates/hstr/src/repr/heap.rs index 8b13789..9413f9b 100644 --- a/crates/hstr/src/repr/heap.rs +++ b/crates/hstr/src/repr/heap.rs @@ -1 +1,7 @@ +pub struct HeapStr { + ptr: *const u8, +} +impl HeapStr { + pub unsafe fn new(text: &str) -> Self {} +} diff --git a/crates/hstr/src/repr/inline.rs b/crates/hstr/src/repr/inline.rs index 785b179..51e80da 100644 --- a/crates/hstr/src/repr/inline.rs +++ b/crates/hstr/src/repr/inline.rs @@ -1 +1,16 @@ -pub struct InlineBuffer {} +use super::{Repr, MAX_SIZE}; + +#[repr(transparent)] +pub struct InlineBuffer(pub [u8; MAX_SIZE]); +static_assertions::assert_eq_size!(InlineBuffer, Repr); + +impl InlineBuffer { + /// Safety: `text.len()` must be less than `MAX_SIZE`. + pub unsafe fn new(text: &str) -> Self { + let mut buffer = InlineBuffer([0; MAX_SIZE]); + let len = text.len(); + let text = text.as_bytes(); + buffer.0[..len].copy_from_slice(text); + buffer + } +} diff --git a/crates/hstr/src/repr/mod.rs b/crates/hstr/src/repr/mod.rs index c2869d5..4c157f4 100644 --- a/crates/hstr/src/repr/mod.rs +++ b/crates/hstr/src/repr/mod.rs @@ -1,6 +1,8 @@ +use std::mem::size_of; + use debug_unreachable::debug_unreachable; -use self::{nonmax::NonMaxUsize, static_ref::StaticStr}; +use self::{inline::InlineBuffer, nonmax::NonMaxUsize, static_ref::StaticStr}; mod heap; mod inline; @@ -8,6 +10,8 @@ mod interned; mod nonmax; mod static_ref; +const MAX_SIZE: usize = size_of::(); + #[repr(C)] pub struct Repr( // We have a pointer in the repesentation to properly carry provenance @@ -27,16 +31,47 @@ const KIND_MASK: u8 = 0b11; impl Repr { #[inline] pub fn new_static(text: &'static str) -> Self { - let repr = StaticStr::new(text); + let repr = unsafe { StaticStr::new(text) }; let repr = unsafe { std::mem::transmute::(repr) }; - debug_assert_eq!(repr.kind(), KIND_STATIC); + if cfg!(feature = "debug") { + assert_eq!(repr.as_str(), text); + assert_eq!(repr.kind(), KIND_STATIC); + } repr } #[inline] - pub fn new_dynamic(text: &str) -> Self {} + pub fn new_dynamic(text: &str) -> Self { + let len = text.len(); + + if len == 0 { + return Self::new_static(""); + } + + if len < MAX_SIZE { + let repr = unsafe { InlineBuffer::new(text) }; + let repr = unsafe { std::mem::transmute::(repr) }; + + if cfg!(feature = "debug") { + assert_eq!(repr.as_str(), text); + assert_eq!(repr.kind(), KIND_INLINED); + } + + repr + } else { + let repr = unsafe { heap::HeapStr::new(text) }; + let repr = unsafe { std::mem::transmute::(repr) }; + + if cfg!(feature = "debug") { + assert_eq!(repr.as_str(), text); + assert_eq!(repr.kind(), KIND_HEAP); + } + + repr + } + } #[inline] pub fn new_interned(text: &str) -> Self {} From bcf3baf92890b0ea192a890d3efbe310781a1478 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Thu, 11 Apr 2024 13:46:14 +0900 Subject: [PATCH 17/23] capacity --- crates/hstr/src/repr/capacity.rs | 169 +++++++++++++++++++++++++++++++ crates/hstr/src/repr/heap.rs | 11 +- crates/hstr/src/repr/mod.rs | 4 + 3 files changed, 183 insertions(+), 1 deletion(-) create mode 100644 crates/hstr/src/repr/capacity.rs diff --git a/crates/hstr/src/repr/capacity.rs b/crates/hstr/src/repr/capacity.rs new file mode 100644 index 0000000..3c35a51 --- /dev/null +++ b/crates/hstr/src/repr/capacity.rs @@ -0,0 +1,169 @@ +use crate::repr::HEAP_MASK; + +// how many bytes a `usize` occupies +const USIZE_SIZE: usize = core::mem::size_of::(); + +/// Used to generate [`CAPACITY_IS_ON_THE_HEAP`] +#[allow(non_snake_case)] +const fn CAP_ON_HEAP_FLAG() -> [u8; USIZE_SIZE] { + // all bytes 255, with the last being HEAP_MASK + let mut flag = [255; USIZE_SIZE]; + flag[USIZE_SIZE - 1] = HEAP_MASK; + flag +} + +/// State that describes the capacity as being stored on the heap. +/// +/// All bytes `255`, with the last being [`HEAP_MASK`], using the same amount of +/// bytes as `usize` Example (64-bit): `[255, 255, 255, 255, 255, 255, 255, +/// 254]` +const CAPACITY_IS_ON_THE_HEAP: [u8; USIZE_SIZE] = CAP_ON_HEAP_FLAG(); + +// how many bytes we can use for capacity +const SPACE_FOR_CAPACITY: usize = USIZE_SIZE - 1; +// the maximum value we're able to store, e.g. on 64-bit arch this is 2^56 - 2 +// +// note: Preferably we'd used usize.pow(..) here, but that's not a `const fn`, +// so we need to use bitshift operators, and there's a lint against using them +// in this pattern, which IMO isn't a great lint +pub const MAX_VALUE: usize = 2usize.pow(SPACE_FOR_CAPACITY as u32 * 8) - 2; + +/// An integer type that uses `core::mem::size_of::() - 1` bytes to store +/// the capacity of a heap buffer. +/// +/// Assumming a 64-bit arch, a [`super::BoxString`] uses 8 bytes for a pointer, +/// 8 bytes for a length, and then needs 1 byte for a discriminant. We need to +/// store the capacity somewhere, and we could store it on the heap, but we also +/// have 7 unused bytes. [`Capacity`] handles storing a value in these 7 bytes, +/// returning an error if it's not possible, at which point we'll store the +/// capacity on the heap. +/// +/// # Max Values +/// * __64-bit:__ `(2 ^ (7 * 8)) - 2 = 72_057_594_037_927_934 ~= 64 petabytes` +/// * __32-bit:__ `(2 ^ (3 * 8)) - 2 = 16_777_214 ~= 16 megabytes` +/// +/// Practically speaking, on a 64-bit architecture we'll never need to store the +/// capacity on the heap, because with it's impossible to create a string that +/// is 64 petabytes or larger. But for 32-bit architectures we need to be able +/// to store a capacity larger than 16 megabytes, since a string larger than 16 +/// megabytes probably isn't that uncommon. +#[derive(Copy, Clone, Debug, PartialEq, Eq)] +#[cfg_attr(target_pointer_width = "64", repr(align(8)))] +#[cfg_attr(target_pointer_width = "32", repr(align(4)))] +pub struct Capacity([u8; USIZE_SIZE]); + +static_assertions::assert_eq_size!(Capacity, usize); +static_assertions::assert_eq_align!(Capacity, usize); + +impl Capacity { + #[inline] + pub const fn new(capacity: usize) -> Self { + cfg_if::cfg_if! { + if #[cfg(target_pointer_width = "64")] { + // on 64-bit arches we can always fit the capacity inline + debug_assert!(capacity <= MAX_VALUE); + + let mut bytes = capacity.to_le_bytes(); + bytes[core::mem::size_of::() - 1] = HEAP_MASK; + Capacity(bytes) + } else if #[cfg(target_pointer_width = "32")] { + // on 32-bit arches we might need to store the capacity on the heap + if capacity > MAX_VALUE { + // if we need the last byte to encode this capacity then we need to put the capacity on + // the heap. return an Error so `BoxString` can do the right thing + Capacity(CAPACITY_IS_ON_THE_HEAP) + } else { + // otherwise, we can store this capacity inline! Set the last byte to be our `HEAP_MASK` + // for our discriminant, using the leading bytes to store the actual value + let mut bytes = capacity.to_le_bytes(); + bytes[core::mem::size_of::() - 1] = HEAP_MASK; + Capacity(bytes) + } + } else { + compile_error!("Unsupported target_pointer_width"); + } + } + } + + /// Re-interprets a [`Capacity`] as a `usize` + /// + /// # SAFETY: + /// * `self` must be less than or equal to [`MAX_VALUE`] + #[inline(always)] + pub unsafe fn as_usize(&self) -> usize { + let mut usize_buf = [0u8; USIZE_SIZE]; + // SAFETY: + // * `src` is valid for reads of `SPACE_FOR_CAPACITY` because it is less than + // `USIZE_SIZE` + // * `dst` is valid for reads of `SPACE_FOR_CAPACITY` because it is less than + // `USIZE_SIZE` + // * `src` and `dst` do not overlap because we created `usize_buf` + core::ptr::copy_nonoverlapping(self.0.as_ptr(), usize_buf.as_mut_ptr(), SPACE_FOR_CAPACITY); + usize::from_le_bytes(usize_buf) + } + + /// Returns whether or not this [`Capacity`] has a value that indicates the + /// capacity is being stored on the heap + #[inline(always)] + pub fn is_heap(&self) -> bool { + self.0 == CAPACITY_IS_ON_THE_HEAP + } +} + +#[cfg(test)] +mod tests { + use rayon::prelude::*; + + use super::Capacity; + + #[test] + fn test_zero_roundtrips() { + let og = 0; + let cap = Capacity::new(og); + let after = unsafe { cap.as_usize() }; + + assert_eq!(og, after); + } + + #[test] + fn test_max_value() { + let available_bytes = (core::mem::size_of::() - 1) as u32; + let max_value = 2usize.pow(available_bytes * 8) - 2; + + #[cfg(target_pointer_width = "64")] + assert_eq!(max_value, 72057594037927934); + #[cfg(target_pointer_width = "32")] + assert_eq!(max_value, 16777214); + + let cap = Capacity::new(max_value); + let after = unsafe { cap.as_usize() }; + + assert_eq!(max_value, after); + } + + #[cfg(target_pointer_width = "32")] + #[test] + + fn test_invalid_value() { + let invalid_val = usize::MAX; + let cap = Capacity::new(invalid_val); + let after = unsafe { cap.as_usize() }; + + // anything greater than or equal to 16777215, should "resolve" to 16777215 + assert_eq!(16777215, after); + } + + #[test] + #[cfg_attr(miri, ignore)] + fn test_all_valid_32bit_values() { + #[cfg(target_pointer_width = "32")] + assert_eq!(16_777_214, super::MAX_VALUE); + + (0..=16_777_214).into_par_iter().for_each(|i| { + let cap = Capacity::new(i); + let val = unsafe { cap.as_usize() }; + + assert_eq!(val, i, "value roundtriped to wrong value?"); + }); + } +} diff --git a/crates/hstr/src/repr/heap.rs b/crates/hstr/src/repr/heap.rs index 9413f9b..8cc08b3 100644 --- a/crates/hstr/src/repr/heap.rs +++ b/crates/hstr/src/repr/heap.rs @@ -1,7 +1,16 @@ +use super::{capacity::Capacity, Repr}; + pub struct HeapStr { ptr: *const u8, + len: Capacity, } +static_assertions::assert_eq_size!(HeapStr, Repr); + impl HeapStr { - pub unsafe fn new(text: &str) -> Self {} + pub unsafe fn new(text: &str) -> Self { + let len = Capacity::new(text.len()); + let ptr = text.as_ptr(); + Self { ptr, len } + } } diff --git a/crates/hstr/src/repr/mod.rs b/crates/hstr/src/repr/mod.rs index 4c157f4..b9b7485 100644 --- a/crates/hstr/src/repr/mod.rs +++ b/crates/hstr/src/repr/mod.rs @@ -4,6 +4,7 @@ use debug_unreachable::debug_unreachable; use self::{inline::InlineBuffer, nonmax::NonMaxUsize, static_ref::StaticStr}; +mod capacity; mod heap; mod inline; mod interned; @@ -28,6 +29,9 @@ const KIND_HEAP: u8 = 0b10; const KIND_STATIC: u8 = 0b11; const KIND_MASK: u8 = 0b11; +/// Used as a discriminant to identify different variants +const HEAP_MASK: u8 = 0b11111110; + impl Repr { #[inline] pub fn new_static(text: &'static str) -> Self { From d2bd971527691375cda6b36102447bb7e3b2dfb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Thu, 11 Apr 2024 13:47:23 +0900 Subject: [PATCH 18/23] Use capacity for static --- crates/hstr/src/repr/static_ref.rs | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/crates/hstr/src/repr/static_ref.rs b/crates/hstr/src/repr/static_ref.rs index 1e0d22e..31fa29f 100644 --- a/crates/hstr/src/repr/static_ref.rs +++ b/crates/hstr/src/repr/static_ref.rs @@ -1,24 +1,16 @@ -use super::{nonmax::NonMaxUsize, Repr, KIND_STATIC}; +use super::{capacity::Capacity, Repr}; #[repr(C)] pub(super) struct StaticStr { ptr: *const u8, - /// We use the last two bits to store the kind of the string. - len: NonMaxUsize, + len: Capacity, } static_assertions::assert_eq_size!(Repr, StaticStr); -const MAX_LEN: usize = (usize::MAX >> 2) - 1; - impl StaticStr { - // Safety: `text.len()` must be less than `usize::MAX >> 2 - 1`. pub unsafe fn new(text: &'static str) -> Self { - // Shift length to the right by 2 bits and store the kind in the last two - // bits. - - debug_assert!(text.len() < MAX_LEN); - let len = NonMaxUsize::new(text.len() << 2 | (KIND_STATIC as usize)); + let len = Capacity::new(text.len()); Self { ptr: text.as_ptr(), @@ -27,7 +19,7 @@ impl StaticStr { } pub fn len(&self) -> usize { - self.len.as_usize() >> 2 + unsafe { self.len.as_usize() } } pub fn as_str(&self) -> &str { From 8c08bd5bfef84d13e4fb83b6e2ec44205cb52bc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Thu, 11 Apr 2024 13:54:46 +0900 Subject: [PATCH 19/23] WIP --- crates/hstr/src/repr/heap.rs | 8 +++++ crates/hstr/src/repr/mod.rs | 60 ++++++++++++++++++++++++++++++------ 2 files changed, 59 insertions(+), 9 deletions(-) diff --git a/crates/hstr/src/repr/heap.rs b/crates/hstr/src/repr/heap.rs index 8cc08b3..97c2970 100644 --- a/crates/hstr/src/repr/heap.rs +++ b/crates/hstr/src/repr/heap.rs @@ -13,4 +13,12 @@ impl HeapStr { let ptr = text.as_ptr(); Self { ptr, len } } + + pub fn len(&self) -> usize { + unsafe { self.len.as_usize() } + } + + pub fn as_str(&self) -> &str { + unsafe { std::str::from_utf8_unchecked(std::slice::from_raw_parts(self.ptr, self.len())) } + } } diff --git a/crates/hstr/src/repr/mod.rs b/crates/hstr/src/repr/mod.rs index b9b7485..01883df 100644 --- a/crates/hstr/src/repr/mod.rs +++ b/crates/hstr/src/repr/mod.rs @@ -36,11 +36,16 @@ impl Repr { #[inline] pub fn new_static(text: &'static str) -> Self { let repr = unsafe { StaticStr::new(text) }; + + debug_assert_eq!(repr.len(), text.len()); + let repr = unsafe { std::mem::transmute::(repr) }; + debug_assert_eq!(repr.kind(), KIND_STATIC); + debug_assert_eq!(repr.len(), text.len()); + if cfg!(feature = "debug") { assert_eq!(repr.as_str(), text); - assert_eq!(repr.kind(), KIND_STATIC); } repr @@ -56,44 +61,81 @@ impl Repr { if len < MAX_SIZE { let repr = unsafe { InlineBuffer::new(text) }; + + debug_assert_eq!(repr.len(), text.len()); + let repr = unsafe { std::mem::transmute::(repr) }; + debug_assert_eq!(repr.kind(), KIND_INLINED); + debug_assert_eq!(repr.len(), text.len()); + if cfg!(feature = "debug") { assert_eq!(repr.as_str(), text); - assert_eq!(repr.kind(), KIND_INLINED); } repr } else { let repr = unsafe { heap::HeapStr::new(text) }; + + debug_assert_eq!(repr.len(), text.len()); + let repr = unsafe { std::mem::transmute::(repr) }; + debug_assert_eq!(repr.kind(), KIND_HEAP); + debug_assert_eq!(repr.len(), text.len()); + if cfg!(feature = "debug") { assert_eq!(repr.as_str(), text); - assert_eq!(repr.kind(), KIND_HEAP); } repr } } - #[inline] - pub fn new_interned(text: &str) -> Self {} + // #[inline] + // pub fn new_interned(text: &str) -> Self {} fn len(&self) -> usize { match self.kind() { - KIND_INLINED => {} - KIND_HEAP => {} + KIND_INLINED => { + let repr = unsafe { std::mem::transmute::(*self) }; + repr.len() + } + KIND_HEAP => { + let repr = unsafe { std::mem::transmute::(*self) }; + repr.len() + } KIND_STATIC => { let repr = unsafe { std::mem::transmute::(*self) }; repr.len() } - KIND_INTERNED => {} + KIND_INTERNED => { + todo!("Repr::len() for interned strings") + } _ => unsafe { debug_unreachable!("Invalid kind in Repr::len()") }, } } - fn as_str(&self) -> &str {} + fn as_str(&self) -> &str { + match self.kind() { + KIND_INLINED => { + let repr = unsafe { std::mem::transmute::(*self) }; + repr.as_str() + } + KIND_HEAP => { + let repr = unsafe { std::mem::transmute::(*self) }; + repr.as_str() + } + KIND_STATIC => { + let repr = unsafe { std::mem::transmute::(*self) }; + repr.as_str() + } + KIND_INTERNED => { + todo!("Repr::as_str() for interned strings") + } + _ => unsafe { debug_unreachable!("Invalid kind in Repr::as_str()") }, + } + } #[inline] fn kind(&self) -> u8 { From 1542e5307b881c731dd5ada4fef87f1d23893790 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Thu, 11 Apr 2024 13:58:23 +0900 Subject: [PATCH 20/23] Repr --- crates/hstr/src/repr/inline.rs | 17 +++++++++++++---- crates/hstr/src/repr/mod.rs | 18 +++++++++--------- crates/hstr/src/repr/nonmax.rs | 1 + 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/crates/hstr/src/repr/inline.rs b/crates/hstr/src/repr/inline.rs index 51e80da..b401f47 100644 --- a/crates/hstr/src/repr/inline.rs +++ b/crates/hstr/src/repr/inline.rs @@ -1,16 +1,25 @@ -use super::{Repr, MAX_SIZE}; +use std::mem::transmute; -#[repr(transparent)] -pub struct InlineBuffer(pub [u8; MAX_SIZE]); +use super::{nonmax::NonMaxU8, Repr, MAX_SIZE}; + +pub struct InlineBuffer(pub [u8; MAX_SIZE - 1], NonMaxU8); static_assertions::assert_eq_size!(InlineBuffer, Repr); impl InlineBuffer { /// Safety: `text.len()` must be less than `MAX_SIZE`. pub unsafe fn new(text: &str) -> Self { - let mut buffer = InlineBuffer([0; MAX_SIZE]); + let mut buffer = InlineBuffer([0; MAX_SIZE - 1], unsafe { transmute(text.len() as u8) }); let len = text.len(); let text = text.as_bytes(); buffer.0[..len].copy_from_slice(text); buffer } + + pub fn len(&self) -> usize { + unsafe { transmute::<_, u8>(self.1) as usize } + } + + pub fn as_str(&self) -> &str { + unsafe { std::str::from_utf8_unchecked(&self.0[..self.len()]) } + } } diff --git a/crates/hstr/src/repr/mod.rs b/crates/hstr/src/repr/mod.rs index 01883df..d1aec9e 100644 --- a/crates/hstr/src/repr/mod.rs +++ b/crates/hstr/src/repr/mod.rs @@ -2,7 +2,7 @@ use std::mem::size_of; use debug_unreachable::debug_unreachable; -use self::{inline::InlineBuffer, nonmax::NonMaxUsize, static_ref::StaticStr}; +use self::{heap::HeapStr, inline::InlineBuffer, nonmax::NonMaxUsize, static_ref::StaticStr}; mod capacity; mod heap; @@ -75,11 +75,11 @@ impl Repr { repr } else { - let repr = unsafe { heap::HeapStr::new(text) }; + let repr = unsafe { HeapStr::new(text) }; debug_assert_eq!(repr.len(), text.len()); - let repr = unsafe { std::mem::transmute::(repr) }; + let repr = unsafe { std::mem::transmute::(repr) }; debug_assert_eq!(repr.kind(), KIND_HEAP); debug_assert_eq!(repr.len(), text.len()); @@ -98,15 +98,15 @@ impl Repr { fn len(&self) -> usize { match self.kind() { KIND_INLINED => { - let repr = unsafe { std::mem::transmute::(*self) }; + let repr = unsafe { std::mem::transmute::<&Repr, &InlineBuffer>(self) }; repr.len() } KIND_HEAP => { - let repr = unsafe { std::mem::transmute::(*self) }; + let repr = unsafe { std::mem::transmute::<&Repr, &HeapStr>(self) }; repr.len() } KIND_STATIC => { - let repr = unsafe { std::mem::transmute::(*self) }; + let repr = unsafe { std::mem::transmute::<&Repr, &StaticStr>(self) }; repr.len() } KIND_INTERNED => { @@ -119,15 +119,15 @@ impl Repr { fn as_str(&self) -> &str { match self.kind() { KIND_INLINED => { - let repr = unsafe { std::mem::transmute::(*self) }; + let repr = unsafe { std::mem::transmute::<&Repr, &InlineBuffer>(self) }; repr.as_str() } KIND_HEAP => { - let repr = unsafe { std::mem::transmute::(*self) }; + let repr = unsafe { std::mem::transmute::<&Repr, &HeapStr>(self) }; repr.as_str() } KIND_STATIC => { - let repr = unsafe { std::mem::transmute::(*self) }; + let repr = unsafe { std::mem::transmute::<&Repr, &StaticStr>(self) }; repr.as_str() } KIND_INTERNED => { diff --git a/crates/hstr/src/repr/nonmax.rs b/crates/hstr/src/repr/nonmax.rs index a7f975f..73d4c0d 100644 --- a/crates/hstr/src/repr/nonmax.rs +++ b/crates/hstr/src/repr/nonmax.rs @@ -1,6 +1,7 @@ use std::mem::transmute; #[repr(C)] +#[derive(Copy, Clone, Debug)] pub struct NonMaxUsize( // Then we need one `usize` (aka WORDs) of data // ...but we breakup into multiple pieces... From a1fed493c6f59deb56553f742713ea707dec1aba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Thu, 11 Apr 2024 14:20:18 +0900 Subject: [PATCH 21/23] drop --- crates/hstr/src/repr/interned.rs | 8 ++++++ crates/hstr/src/repr/mod.rs | 43 ++++++++++++++++++++++++++++++-- 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/crates/hstr/src/repr/interned.rs b/crates/hstr/src/repr/interned.rs index 8b13789..3af506c 100644 --- a/crates/hstr/src/repr/interned.rs +++ b/crates/hstr/src/repr/interned.rs @@ -1 +1,9 @@ +pub struct Interned { + ptr: *const (), +} +impl Interned { + pub fn new(ptr: *const ()) -> Self { + Self { ptr } + } +} diff --git a/crates/hstr/src/repr/mod.rs b/crates/hstr/src/repr/mod.rs index d1aec9e..2c51be9 100644 --- a/crates/hstr/src/repr/mod.rs +++ b/crates/hstr/src/repr/mod.rs @@ -1,8 +1,11 @@ -use std::mem::size_of; +use std::mem::{size_of, transmute}; use debug_unreachable::debug_unreachable; -use self::{heap::HeapStr, inline::InlineBuffer, nonmax::NonMaxUsize, static_ref::StaticStr}; +use self::{ + heap::HeapStr, inline::InlineBuffer, interned::Interned, nonmax::NonMaxUsize, + static_ref::StaticStr, +}; mod capacity; mod heap; @@ -148,3 +151,39 @@ impl Repr { } static_assertions::assert_eq_size!(Repr, Option, [usize; 2]); + +impl Drop for Repr { + #[inline] + fn drop(&mut self) { + // By "outlining" the actual Drop code and only calling it if we're a heap + // variant, it allows dropping an inline variant to be as cheap as + // possible. + match self.kind() { + KIND_HEAP | KIND_INLINED => outlined_drop(self), + _ => {} + } + + #[cold] + fn outlined_drop(this: &mut Repr) { + match this.kind() { + KIND_HEAP => { + let repr = unsafe { + // SAFETY: We just checked the discriminant to make sure we're heap + // allocated + transmute::<&mut Repr, &mut HeapStr>(this) + }; + repr.dealloc(); + } + KIND_INTERNED => { + let repr = unsafe { + // SAFETY: We just checked the discriminant to make sure + // we're heap allocated + transmute::<&mut Repr, &mut Interned>(this) + }; + repr.dealloc(); + } + _ => unsafe { debug_unreachable!("Invalid kind in Repr::drop()") }, + } + } + } +} From 0a7e18c0d5c10e49c6befc793df6e8ee32d63ae0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Thu, 11 Apr 2024 14:24:19 +0900 Subject: [PATCH 22/23] dealloc --- crates/hstr/src/repr/heap.rs | 155 ++++++++++++++++++++++++++++++++++- 1 file changed, 152 insertions(+), 3 deletions(-) diff --git a/crates/hstr/src/repr/heap.rs b/crates/hstr/src/repr/heap.rs index 97c2970..6bc7199 100644 --- a/crates/hstr/src/repr/heap.rs +++ b/crates/hstr/src/repr/heap.rs @@ -1,7 +1,12 @@ +use std::{ + mem, + ptr::{self, NonNull}, +}; + use super::{capacity::Capacity, Repr}; pub struct HeapStr { - ptr: *const u8, + ptr: ptr::NonNull, len: Capacity, } @@ -10,7 +15,7 @@ static_assertions::assert_eq_size!(HeapStr, Repr); impl HeapStr { pub unsafe fn new(text: &str) -> Self { let len = Capacity::new(text.len()); - let ptr = text.as_ptr(); + let ptr = NonNull::new(text as *const str as *mut u8).unwrap(); Self { ptr, len } } @@ -19,6 +24,150 @@ impl HeapStr { } pub fn as_str(&self) -> &str { - unsafe { std::str::from_utf8_unchecked(std::slice::from_raw_parts(self.ptr, self.len())) } + unsafe { + std::str::from_utf8_unchecked(std::slice::from_raw_parts(self.ptr.as_ptr(), self.len())) + } + } + + #[inline] + pub fn dealloc(&mut self) { + deallocate_ptr(self.ptr, self.len) + } +} + +/// Deallocates a buffer on the heap, handling when the capacity is also stored +/// on the heap +#[inline] +pub fn deallocate_ptr(ptr: ptr::NonNull, cap: Capacity) { + #[cold] + fn deallocate_with_capacity_on_heap(ptr: ptr::NonNull) { + // re-adjust the pointer to include the capacity that's on the heap + let adj_ptr = ptr.as_ptr().wrapping_sub(mem::size_of::()); + // read the capacity from the heap so we know how much to deallocate + let mut buf = [0u8; mem::size_of::()]; + // SAFETY: `src` and `dst` don't overlap, and are valid for usize number of + // bytes + unsafe { + ptr::copy_nonoverlapping(adj_ptr, buf.as_mut_ptr(), mem::size_of::()); + } + let capacity = usize::from_ne_bytes(buf); + // SAFETY: We know the pointer is not null since we got it as a NonNull + let ptr = unsafe { ptr::NonNull::new_unchecked(adj_ptr) }; + // SAFETY: We checked above that our capacity is on the heap, and we readjusted + // the pointer to reference the capacity + unsafe { heap_capacity::dealloc(ptr, capacity) } + } + + if cap.is_heap() { + deallocate_with_capacity_on_heap(ptr); + } else { + // SAFETY: Our capacity is always inline on 64-bit archs + unsafe { inline_capacity::dealloc(ptr, cap.as_usize()) } + } +} + +mod heap_capacity { + use core::ptr; + use std::alloc; + + use super::HeapStr; + + #[inline] + pub fn alloc(capacity: usize) -> ptr::NonNull { + let layout = layout(capacity); + debug_assert!(layout.size() > 0); + + // SAFETY: `alloc(...)` has undefined behavior if the layout is zero-sized. We + // know the layout can't be zero-sized though because we're always at + // least allocating one `usize` + let raw_ptr = unsafe { alloc::alloc(layout) }; + + // Check to make sure our pointer is non-null, some allocators return null + // pointers instead of panicking + match ptr::NonNull::new(raw_ptr) { + Some(ptr) => ptr, + None => alloc::handle_alloc_error(layout), + } + } + + /// Deallocates a pointer which references a `HeapBuffer` whose capacity is + /// on the heap + /// + /// # Saftey + /// * `ptr` must point to the start of a `HeapBuffer` whose capacity is on + /// the heap. i.e. we must have `ptr -> [cap ; string]` + pub unsafe fn dealloc(ptr: ptr::NonNull, capacity: usize) { + let layout = layout(capacity); + alloc::dealloc(ptr.as_ptr(), layout); + } + + #[repr(C)] + struct HeapBufferInnerHeapCapacity { + capacity: usize, + buffer: HeapStr, + } + + #[inline(always)] + pub fn layout(capacity: usize) -> alloc::Layout { + let buffer_layout = alloc::Layout::array::(capacity).expect("valid capacity"); + alloc::Layout::new::() + .extend(buffer_layout) + .expect("valid layout") + .0 + .pad_to_align() + } +} + +mod inline_capacity { + use core::ptr; + use std::alloc; + + use super::HeapStr; + + /// # SAFETY: + /// * `capacity` must be > 0 + #[inline] + pub unsafe fn alloc(capacity: usize) -> ptr::NonNull { + let layout = layout(capacity); + debug_assert!(layout.size() > 0); + + // SAFETY: `alloc(...)` has undefined behavior if the layout is zero-sized. We + // specify that `capacity` must be > 0 as a constraint to uphold the + // safety of this method. If capacity is greater than 0, then our layout + // will be non-zero-sized. + let raw_ptr = alloc::alloc(layout); + + // Check to make sure our pointer is non-null, some allocators return null + // pointers instead of panicking + match ptr::NonNull::new(raw_ptr) { + Some(ptr) => ptr, + None => alloc::handle_alloc_error(layout), + } + } + + /// Deallocates a pointer which references a `HeapBuffer` whose capacity is + /// stored inline + /// + /// # Saftey + /// * `ptr` must point to the start of a `HeapBuffer` whose capacity is on + /// the inline + pub unsafe fn dealloc(ptr: ptr::NonNull, capacity: usize) { + let layout = layout(capacity); + alloc::dealloc(ptr.as_ptr(), layout); + } + + #[repr(C)] + struct HeapBufferInnerInlineCapacity { + buffer: HeapStr, + } + + #[inline(always)] + pub fn layout(capacity: usize) -> alloc::Layout { + let buffer_layout = alloc::Layout::array::(capacity).expect("valid capacity"); + alloc::Layout::new::() + .extend(buffer_layout) + .expect("valid layout") + .0 + .pad_to_align() } } From abdd30d0f22abe17f19298155bbc38ddbb1df162 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=EA=B0=95=EB=8F=99=EC=9C=A4=20=28Donny=29?= Date: Thu, 11 Apr 2024 14:26:12 +0900 Subject: [PATCH 23/23] nonnull --- crates/hstr/src/repr/heap.rs | 2 +- crates/hstr/src/repr/static_ref.rs | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/crates/hstr/src/repr/heap.rs b/crates/hstr/src/repr/heap.rs index 6bc7199..253060d 100644 --- a/crates/hstr/src/repr/heap.rs +++ b/crates/hstr/src/repr/heap.rs @@ -15,7 +15,7 @@ static_assertions::assert_eq_size!(HeapStr, Repr); impl HeapStr { pub unsafe fn new(text: &str) -> Self { let len = Capacity::new(text.len()); - let ptr = NonNull::new(text as *const str as *mut u8).unwrap(); + let ptr = NonNull::new_unchecked(text as *const str as *mut u8); Self { ptr, len } } diff --git a/crates/hstr/src/repr/static_ref.rs b/crates/hstr/src/repr/static_ref.rs index 31fa29f..11e212b 100644 --- a/crates/hstr/src/repr/static_ref.rs +++ b/crates/hstr/src/repr/static_ref.rs @@ -1,8 +1,10 @@ +use std::ptr; + use super::{capacity::Capacity, Repr}; #[repr(C)] pub(super) struct StaticStr { - ptr: *const u8, + ptr: ptr::NonNull, len: Capacity, } @@ -13,7 +15,7 @@ impl StaticStr { let len = Capacity::new(text.len()); Self { - ptr: text.as_ptr(), + ptr: ptr::NonNull::new_unchecked(text as *const str as *mut u8), len, } } @@ -23,6 +25,8 @@ impl StaticStr { } pub fn as_str(&self) -> &str { - unsafe { std::str::from_utf8_unchecked(std::slice::from_raw_parts(self.ptr, self.len())) } + unsafe { + std::str::from_utf8_unchecked(std::slice::from_raw_parts(self.ptr.as_ptr(), self.len())) + } } }