Skip to content

Commit 48a8953

Browse files
authored
refactor: bytes view calc total_bytes_len lazy. (#19056)
* refactor: bytes view calc total_bytes_len lazy. * update tests.
1 parent 361c0ab commit 48a8953

File tree

4 files changed

+68
-43
lines changed

4 files changed

+68
-43
lines changed

src/common/column/src/binview/builder.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,8 @@ impl<T: ViewType + ?Sized> From<BinaryViewColumnBuilder<T>> for BinaryViewColumn
7373
Self::new_unchecked(
7474
value.views.into(),
7575
Arc::from(value.completed_buffers),
76-
value.total_bytes_len,
77-
value.total_buffer_len,
76+
Some(value.total_bytes_len),
77+
Some(value.total_buffer_len),
7878
)
7979
}
8080
}

src/common/column/src/binview/mod.rs

Lines changed: 58 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ use std::collections::HashMap;
2222
use std::fmt::Debug;
2323
use std::marker::PhantomData;
2424
use std::sync::Arc;
25+
use std::sync::OnceLock;
2526

2627
use arrow_data::ArrayData;
2728
use arrow_data::ArrayDataBuilder;
@@ -115,20 +116,24 @@ pub struct BinaryViewColumnGeneric<T: ViewType + ?Sized> {
115116
buffers: Arc<[Buffer<u8>]>,
116117
phantom: PhantomData<T>,
117118
/// Total bytes length if we would concat them all
118-
total_bytes_len: usize,
119+
/// Initialized lazily when needed.
120+
total_bytes_len: OnceLock<usize>,
119121
/// Total bytes in the buffer (exclude remaining capacity)
120-
total_buffer_len: usize,
122+
/// Initialized lazily when needed.
123+
total_buffer_len: OnceLock<usize>,
121124
}
122125

123126
impl<T: ViewType + ?Sized> Clone for BinaryViewColumnGeneric<T> {
124127
fn clone(&self) -> Self {
128+
let total_bytes_len = self.total_bytes_len.get().copied();
129+
let total_buffer_len = self.total_buffer_len.get().copied();
125130
Self {
126131
views: self.views.clone(),
127132
buffers: self.buffers.clone(),
128133

129134
phantom: Default::default(),
130-
total_bytes_len: self.total_bytes_len,
131-
total_buffer_len: self.total_buffer_len,
135+
total_bytes_len: Self::init_cache(total_bytes_len),
136+
total_buffer_len: Self::init_cache(total_buffer_len),
132137
}
133138
}
134139
}
@@ -138,29 +143,41 @@ unsafe impl<T: ViewType + ?Sized> Send for BinaryViewColumnGeneric<T> {}
138143
unsafe impl<T: ViewType + ?Sized> Sync for BinaryViewColumnGeneric<T> {}
139144

140145
impl<T: ViewType + ?Sized> BinaryViewColumnGeneric<T> {
146+
fn init_cache(value: Option<usize>) -> OnceLock<usize> {
147+
let cache = OnceLock::new();
148+
if let Some(v) = value {
149+
let _ = cache.set(v);
150+
}
151+
cache
152+
}
153+
141154
pub fn new_unchecked(
142155
views: Buffer<View>,
143156
buffers: Arc<[Buffer<u8>]>,
144157

145-
total_bytes_len: usize,
146-
total_buffer_len: usize,
158+
total_bytes_len: Option<usize>,
159+
total_buffer_len: Option<usize>,
147160
) -> Self {
148161
#[cfg(debug_assertions)]
149162
{
150-
let total = views.iter().map(|v| v.length as usize).sum::<usize>();
151-
assert_eq!(total, total_bytes_len);
163+
if let Some(total_bytes_len) = total_bytes_len {
164+
let total = views.iter().map(|v| v.length as usize).sum::<usize>();
165+
assert_eq!(total, total_bytes_len);
166+
}
152167

153-
let total = buffers.iter().map(|v| v.len()).sum::<usize>();
154-
assert_eq!(total, total_buffer_len);
168+
if let Some(total_buffer_len) = total_buffer_len {
169+
let total = buffers.iter().map(|v| v.len()).sum::<usize>();
170+
assert_eq!(total, total_buffer_len);
171+
}
155172
}
156173

157174
Self {
158175
views,
159176
buffers,
160177

161178
phantom: Default::default(),
162-
total_bytes_len,
163-
total_buffer_len,
179+
total_bytes_len: Self::init_cache(total_bytes_len),
180+
total_buffer_len: Self::init_cache(total_buffer_len),
164181
}
165182
}
166183

@@ -173,9 +190,8 @@ impl<T: ViewType + ?Sized> BinaryViewColumnGeneric<T> {
173190
total_buffer_len: Option<usize>,
174191
) -> Self {
175192
let total_buffer_len =
176-
total_buffer_len.unwrap_or(buffers.iter().map(|v| v.len()).sum::<usize>());
177-
let total_bytes_len = views.iter().map(|v| v.length as usize).sum::<usize>();
178-
Self::new_unchecked(views, buffers, total_bytes_len, total_buffer_len)
193+
total_buffer_len.unwrap_or_else(|| buffers.iter().map(|v| v.len()).sum::<usize>());
194+
Self::new_unchecked(views, buffers, None, Some(total_buffer_len))
179195
}
180196

181197
pub fn data_buffers(&self) -> &Arc<[Buffer<u8>]> {
@@ -212,7 +228,7 @@ impl<T: ViewType + ?Sized> BinaryViewColumnGeneric<T> {
212228
/// Creates an empty [`BinaryViewColumnGeneric`], i.e. whose `.len` is zero.
213229
#[inline]
214230
pub fn new_empty() -> Self {
215-
Self::new_unchecked(Buffer::new(), Arc::from([]), 0, 0)
231+
Self::new_unchecked(Buffer::new(), Arc::from([]), Some(0), Some(0))
216232
}
217233

218234
/// Returns the element at index `i`
@@ -293,11 +309,13 @@ impl<T: ViewType + ?Sized> BinaryViewColumnGeneric<T> {
293309

294310
/// Get the total length of bytes that it would take to concatenate all binary/str values in this array.
295311
pub fn total_bytes_len(&self) -> usize {
296-
self.total_bytes_len
312+
*self
313+
.total_bytes_len
314+
.get_or_init(|| self.views.iter().map(|v| v.length as usize).sum::<usize>())
297315
}
298316

299317
pub fn memory_size(&self) -> usize {
300-
self.total_buffer_len + self.len() * 16
318+
self.total_buffer_len() + self.len() * 16
301319
}
302320

303321
fn total_unshared_buffer_len(&self) -> usize {
@@ -317,7 +335,9 @@ impl<T: ViewType + ?Sized> BinaryViewColumnGeneric<T> {
317335

318336
/// Get the length of bytes that are stored in the variadic buffers.
319337
pub fn total_buffer_len(&self) -> usize {
320-
self.total_buffer_len
338+
*self
339+
.total_buffer_len
340+
.get_or_init(|| self.buffers.iter().map(|v| v.len()).sum::<usize>())
321341
}
322342

323343
#[inline(always)]
@@ -397,15 +417,15 @@ impl<T: ViewType + ?Sized> BinaryViewColumnGeneric<T> {
397417
unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {
398418
debug_assert!(offset + length <= self.len());
399419
self.views.slice_unchecked(offset, length);
400-
self.total_bytes_len = self.views.iter().map(|v| v.length as usize).sum::<usize>();
420+
self.total_bytes_len = OnceLock::new();
401421
}
402422

403423
impl_sliced!();
404424

405425
pub fn maybe_gc(self) -> Self {
406426
const GC_MINIMUM_SAVINGS: usize = 16 * 1024; // At least 16 KiB.
407427

408-
if self.total_buffer_len <= GC_MINIMUM_SAVINGS {
428+
if self.total_buffer_len() <= GC_MINIMUM_SAVINGS {
409429
return self;
410430
}
411431

@@ -435,6 +455,8 @@ impl<T: ViewType + ?Sized> BinaryViewColumnGeneric<T> {
435455
}
436456

437457
pub fn make_mut(self) -> BinaryViewColumnBuilder<T> {
458+
let total_bytes_len = self.total_bytes_len();
459+
let total_buffer_len = self.total_buffer_len();
438460
let views = self.views.make_mut();
439461
let completed_buffers = self.buffers.to_vec();
440462

@@ -444,8 +466,8 @@ impl<T: ViewType + ?Sized> BinaryViewColumnGeneric<T> {
444466
in_progress_buffer: vec![],
445467

446468
phantom: Default::default(),
447-
total_bytes_len: self.total_bytes_len,
448-
total_buffer_len: self.total_buffer_len,
469+
total_bytes_len,
470+
total_buffer_len,
449471
}
450472
}
451473

@@ -454,26 +476,29 @@ impl<T: ViewType + ?Sized> BinaryViewColumnGeneric<T> {
454476
use Either::*;
455477
let is_unique = (Arc::strong_count(&self.buffers) + Arc::weak_count(&self.buffers)) == 1;
456478

479+
let total_bytes_len = self.total_bytes_len();
480+
let total_buffer_len = self.total_buffer_len();
481+
457482
match (self.views.into_mut(), is_unique) {
458483
(Right(views), true) => Right(BinaryViewColumnBuilder {
459484
views,
460485
completed_buffers: self.buffers.to_vec(),
461486
in_progress_buffer: vec![],
462487
phantom: Default::default(),
463-
total_bytes_len: self.total_bytes_len,
464-
total_buffer_len: self.total_buffer_len,
488+
total_bytes_len,
489+
total_buffer_len,
465490
}),
466491
(Right(views), false) => Left(Self::new_unchecked(
467492
views.into(),
468493
self.buffers,
469-
self.total_bytes_len,
470-
self.total_buffer_len,
494+
Some(total_bytes_len),
495+
Some(total_buffer_len),
471496
)),
472497
(Left(views), _) => Left(Self::new_unchecked(
473498
views,
474499
self.buffers,
475-
self.total_bytes_len,
476-
self.total_buffer_len,
500+
Some(total_bytes_len),
501+
Some(total_buffer_len),
477502
)),
478503
}
479504
}
@@ -538,8 +563,8 @@ impl BinaryViewColumn {
538563
Utf8ViewColumn::new_unchecked(
539564
self.views.clone(),
540565
self.buffers.clone(),
541-
self.total_bytes_len,
542-
self.total_buffer_len,
566+
Some(self.total_bytes_len()),
567+
Some(self.total_buffer_len()),
543568
)
544569
}
545570
}
@@ -549,8 +574,8 @@ impl Utf8ViewColumn {
549574
BinaryViewColumn::new_unchecked(
550575
self.views.clone(),
551576
self.buffers.clone(),
552-
self.total_bytes_len,
553-
self.total_buffer_len,
577+
Some(self.total_bytes_len()),
578+
Some(self.total_buffer_len()),
554579
)
555580
}
556581

src/common/column/tests/it/binview/mod.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,8 @@ fn basics_string_view() {
4343
let array2 = Utf8ViewColumn::new_unchecked(
4444
array.views().clone(),
4545
array.data_buffers().clone(),
46-
array.total_bytes_len(),
47-
array.total_buffer_len(),
46+
Some(array.total_bytes_len()),
47+
Some(array.total_buffer_len()),
4848
);
4949

5050
assert_eq!(array, array2);
@@ -83,8 +83,8 @@ fn basics_binary_view() {
8383
let array2 = BinaryViewColumn::new_unchecked(
8484
array.views().clone(),
8585
array.data_buffers().clone(),
86-
array.total_bytes_len(),
87-
array.total_buffer_len(),
86+
Some(array.total_bytes_len()),
87+
Some(array.total_buffer_len()),
8888
);
8989

9090
assert_eq!(array, array2);

src/query/expression/src/expression.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1133,14 +1133,14 @@ mod tests {
11331133

11341134
#[test]
11351135
fn test_size() {
1136-
assert_eq!(size_of::<Expr>(), 144);
1137-
assert_eq!(size_of::<Constant>(), 128);
1136+
assert_eq!(size_of::<Expr>(), 160);
1137+
assert_eq!(size_of::<Constant>(), 144);
11381138
assert_eq!(size_of::<ColumnRef>(), 72);
11391139
assert_eq!(size_of::<Cast>(), 48);
11401140
assert_eq!(size_of::<FunctionCall>(), 104);
11411141
assert_eq!(size_of::<LambdaFunctionCall>(), 120);
11421142

1143-
assert_eq!(size_of::<RawExpr>(), 128);
1144-
assert_eq!(size_of::<RemoteExpr>(), 128);
1143+
assert_eq!(size_of::<RawExpr>(), 144);
1144+
assert_eq!(size_of::<RemoteExpr>(), 144);
11451145
}
11461146
}

0 commit comments

Comments
 (0)