Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions .changeset/text-edit-utf16-quadratic.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
---
"loro-crdt": patch
"loro-crdt-map": patch
---

Fix three editing slowdowns, two of which were O(n^2).

1. Editing with UTF-16 / UTF-8 (byte) positions (the default in the JS binding)
validated each position by materializing the entire `[0, pos)` prefix string,
making every `insert`/`delete`/`splice`/`mark` O(n) and a run of edits O(n^2)
(regression since 1.12.0). The boundary check now reads the rope's prefix
caches via the cursor (O(log n)). Unicode-indexed editing was unaffected.

2. When a subscriber is attached and many edits land on the same container within
one event batch (e.g. random-position inserts, or many distinct map-key
writes), building the event cloned the growing accumulated diff on every
compose — O(n^2) in the number of fragments. The diffs are now composed in
place. This affected text, map and list events.

3. Converting a UTF-16 / UTF-8 position within a text chunk to a unicode offset
scanned the chunk char-by-char, so editing/slicing a large contiguous chunk
(a big insert, a loaded document, or a long run of typed text that merges into
one chunk) was O(chunk length) per op. Chunks that contain no astral-plane
characters (UTF-16) or are pure ASCII (UTF-8) now convert in O(1), covering
essentially all real-world text (ASCII/Latin/CJK).
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

108 changes: 93 additions & 15 deletions crates/loro-internal/src/container/richtext/richtext_state.rs
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,68 @@ mod text_chunk {
}
}

/// Reports whether this chunk contains only Basic Multilingual Plane chars.
///
/// The check compares the cached unicode length with the cached UTF-16
/// length: they are equal exactly when every char occupies a single UTF-16
/// code unit. In that case a UTF-16 offset and a unicode offset into this
/// chunk are the same number, so conversions need no scan.
#[inline]
fn is_all_bmp(&self) -> bool {
    self.unicode_len == self.utf16_len
}

/// Reports whether this chunk is pure ASCII.
///
/// The check compares the cached unicode length with the byte length of the
/// chunk: they are equal exactly when every char is encoded as one byte. In
/// that case a byte offset and a unicode offset into this chunk coincide.
#[inline]
fn is_all_ascii(&self) -> bool {
    self.unicode_len == self.bytes.len() as i32
}

/// Convert a UTF-16 offset within this chunk to a unicode offset.
///
/// Returns `Ok(unicode_offset)` when the offset can be converted. On failure
/// the `Err` payload is a unicode offset that callers tolerating a bad
/// position can fall back to.
///
/// O(1) when the chunk has no astral-plane characters (see `is_all_bmp`):
/// every char is then one UTF-16 code unit, so the offset is returned as-is.
/// Otherwise this delegates to `super::utf16_to_unicode_index`, which scans
/// the chunk text.
///
/// The fast path validates the offset against the chunk length so that an
/// offset past the end is reported as an error instead of being echoed back
/// as if it were a valid unicode offset.
pub fn utf16_offset_to_unicode(&self, utf16_offset: usize) -> Result<usize, usize> {
    if self.is_all_bmp() {
        // All-BMP means utf16_len == unicode_len, so one bound covers both.
        let len = self.unicode_len as usize;
        return if utf16_offset <= len {
            Ok(utf16_offset)
        } else {
            // NOTE(review): assumes the scanning helper also reports an
            // out-of-range offset as `Err(char count)` — confirm against
            // `utf16_to_unicode_index`.
            Err(len)
        };
    }
    super::utf16_to_unicode_index(self.as_str(), utf16_offset)
}

/// Convert a UTF-8 (byte) offset within this chunk to a unicode offset.
///
/// Returns `Ok(unicode_offset)` when the offset can be converted. On failure
/// the `Err` payload is a unicode offset that callers tolerating a bad
/// position can fall back to.
///
/// O(1) when the chunk is pure ASCII (see `is_all_ascii`): every char is
/// then one byte, so the offset is returned as-is. Otherwise this delegates
/// to `super::utf8_to_unicode_index`, which scans the chunk text.
///
/// The fast path validates the offset against the chunk length so that an
/// offset past the end is reported as an error instead of being echoed back
/// as if it were a valid unicode offset.
pub fn utf8_offset_to_unicode(&self, utf8_offset: usize) -> Result<usize, usize> {
    if self.is_all_ascii() {
        // Pure ASCII means byte length == unicode_len, so one bound covers both.
        let len = self.unicode_len as usize;
        return if utf8_offset <= len {
            Ok(utf8_offset)
        } else {
            // NOTE(review): assumes the scanning helper also reports an
            // out-of-range offset as `Err(char count)` — confirm against
            // `utf8_to_unicode_index`.
            Err(len)
        };
    }
    super::utf8_to_unicode_index(self.as_str(), utf8_offset)
}

/// Convert a unicode offset within this chunk to a UTF-16 offset.
///
/// Returns `None` when the offset cannot be converted.
///
/// O(1) when the chunk has no astral-plane characters (see `is_all_bmp`):
/// every char is then one UTF-16 code unit, so the offset is returned as-is.
/// Otherwise this delegates to `super::unicode_to_utf16_index`, which scans
/// the chunk text.
///
/// The fast path validates the offset against the chunk length so that an
/// offset past the end yields `None` rather than a UTF-16 offset that lies
/// outside the chunk.
pub fn unicode_offset_to_utf16(&self, unicode_offset: usize) -> Option<usize> {
    if self.is_all_bmp() {
        // NOTE(review): assumes the scanning helper returns `None` for an
        // out-of-range offset — confirm against `unicode_to_utf16_index`.
        return if unicode_offset <= self.unicode_len as usize {
            Some(unicode_offset)
        } else {
            None
        };
    }
    super::unicode_to_utf16_index(self.as_str(), unicode_offset)
}

/// Convert a unicode offset within this chunk to a UTF-8 (byte) offset.
///
/// Returns `None` when the offset cannot be converted.
///
/// O(1) when the chunk is pure ASCII (see `is_all_ascii`): every char is
/// then one byte, so the offset is returned as-is. Otherwise this delegates
/// to `super::unicode_to_utf8_index`, which scans the chunk text.
///
/// The fast path validates the offset against the chunk length so that an
/// offset past the end yields `None` rather than a byte offset that lies
/// outside the chunk.
pub fn unicode_offset_to_utf8(&self, unicode_offset: usize) -> Option<usize> {
    if self.is_all_ascii() {
        // NOTE(review): assumes the scanning helper returns `None` for an
        // out-of-range offset — confirm against `unicode_to_utf8_index`.
        return if unicode_offset <= self.unicode_len as usize {
            Some(unicode_offset)
        } else {
            None
        };
    }
    super::unicode_to_utf8_index(self.as_str(), unicode_offset)
}

/// Return the substring of this chunk covering unicode offsets `start..end`.
///
/// For a pure-ASCII chunk (see `is_all_ascii`) unicode offsets are byte
/// offsets, so the range is looked up directly in the text; a range that
/// `str::get` rejects produces `Err(())`. For any other chunk the work is
/// delegated to `super::unicode_slice`.
pub fn unicode_slice(&self, start: usize, end: usize) -> Result<&str, ()> {
    let text = self.as_str();
    if !self.is_all_ascii() {
        return super::unicode_slice(text, start, end);
    }
    match text.get(start..end) {
        Some(sub) => Ok(sub),
        None => Err(()),
    }
}

pub(crate) fn delete_by_entity_index(
&mut self,
unicode_offset: usize,
Expand Down Expand Up @@ -498,7 +560,7 @@ mod text_chunk {
}

// Fast path for ASCII text: unicode index == byte index, and utf16 index == unicode index.
if self.bytes.len() as i32 == self.unicode_len {
if self.is_all_ascii() {
let ans = Self {
unicode_len: range.len() as i32,
bytes: self.bytes.slice_clone(range.start..range.end),
Expand Down Expand Up @@ -1220,7 +1282,7 @@ mod query {
// Allow left to not at the correct utf16 boundary. If so fallback to the last position.
// TODO: if we remove the use of query(pos-1), we won't need this fallback behavior
// WARNING: Unable to report error!!!
let offset = utf16_to_unicode_index(s.as_str(), left).unwrap_or_else(|e| e);
let offset = s.utf16_offset_to_unicode(left).unwrap_or_else(|e| e);
(offset, true)
}
RichtextStateChunk::Style { .. } => (1, false),
Expand Down Expand Up @@ -1300,7 +1362,7 @@ mod query {
// Allow left to not at the correct utf16 boundary. If so fallback to the last position.
// TODO: if we remove the use of query(pos-1), we won't need this fallback behavior
// WARNING: Unable to report error!!!
let offset = utf8_to_unicode_index(s.as_str(), left).unwrap_or_else(|e| e);
let offset = s.utf8_offset_to_unicode(left).unwrap_or_else(|e| e);
(offset, true)
}
RichtextStateChunk::Style { .. } => (1, false),
Expand Down Expand Up @@ -1954,7 +2016,7 @@ impl RichtextState {
}

if let RichtextStateChunk::Text(s) = span.elem {
match unicode_slice(s.as_str(), start, end) {
match s.unicode_slice(start, end) {
Ok(x) => ans.push_str(x),
Err(()) => {
return Err(LoroError::UTF16InUnicodeCodePoint { pos: pos + len })
Expand Down Expand Up @@ -2094,11 +2156,13 @@ impl RichtextState {
let slice_start = span.start.unwrap_or(0) + processed_len;
let slice_end = slice_start + take_len;

let text_content = unicode_slice(t.as_str(), slice_start, slice_end)
.map_err(|_| LoroError::OutOfBound {
pos: slice_end,
len: t.unicode_len() as usize,
info: "Slice delta out of bound".into(),
let text_content =
t.unicode_slice(slice_start, slice_end).map_err(|_| {
LoroError::OutOfBound {
pos: slice_end,
len: t.unicode_len() as usize,
info: "Slice delta out of bound".into(),
}
})?;

let styles = cur_styles.as_ref().unwrap();
Expand Down Expand Up @@ -2332,6 +2396,20 @@ impl RichtextState {
self.cursor_to_unicode_index(cursor.cursor)
}

/// Translate a position given as an event index into the coordinate system
/// selected by `pos_type`.
///
/// An empty tree always maps to `0`. Otherwise the tree is queried once for
/// `event_index` and the resulting cursor is converted with
/// `get_index_from_cursor`. This is O(log n): it reads the prefix caches via
/// the cursor rather than materializing the prefix string.
///
/// # Panics
///
/// Panics if the tree query returns no cursor for `event_index`, or if the
/// cursor cannot be converted to an index of `pos_type`.
pub fn event_index_to_index(&self, event_index: usize, pos_type: PosType) -> usize {
    if self.tree.is_empty() {
        0
    } else {
        let found = self.tree.query::<EventIndexQuery>(&event_index).unwrap();
        let index = self.get_index_from_cursor(found.cursor, pos_type);
        index.unwrap()
    }
}

#[allow(unused)]
pub(crate) fn check(&self) {
if !cfg!(any(debug_assertions, test)) {
Expand Down Expand Up @@ -2744,22 +2822,22 @@ fn entity_offset_to_pos_type_offset(
) -> usize {
match pos_type {
PosType::Bytes => match elem {
RichtextStateChunk::Text(t) => unicode_to_utf8_index(t.as_str(), offset).unwrap(),
RichtextStateChunk::Text(t) => t.unicode_offset_to_utf8(offset).unwrap(),
RichtextStateChunk::Style { .. } => 0,
},
PosType::Unicode => match elem {
RichtextStateChunk::Text(_) => offset,
RichtextStateChunk::Style { .. } => 0,
},
PosType::Utf16 => match elem {
RichtextStateChunk::Text(t) => unicode_to_utf16_index(t.as_str(), offset).unwrap(),
RichtextStateChunk::Text(t) => t.unicode_offset_to_utf16(offset).unwrap(),
RichtextStateChunk::Style { .. } => 0,
},
PosType::Entity => offset,
PosType::Event => match elem {
RichtextStateChunk::Text(t) => {
if cfg!(feature = "wasm") {
unicode_to_utf16_index(t.as_str(), offset).unwrap()
t.unicode_offset_to_utf16(offset).unwrap()
} else {
offset
}
Expand All @@ -2776,7 +2854,7 @@ fn pos_type_offset_to_entity_offset(
) -> Option<usize> {
match pos_type {
PosType::Bytes => match elem {
RichtextStateChunk::Text(t) => utf8_to_unicode_index(t.as_str(), offset).ok(),
RichtextStateChunk::Text(t) => t.utf8_offset_to_unicode(offset).ok(),
RichtextStateChunk::Style { .. } => {
if offset > 0 {
None
Expand All @@ -2787,7 +2865,7 @@ fn pos_type_offset_to_entity_offset(
},
PosType::Unicode => Some(offset),
PosType::Utf16 => match elem {
RichtextStateChunk::Text(t) => utf16_to_unicode_index(t.as_str(), offset).ok(),
RichtextStateChunk::Text(t) => t.utf16_offset_to_unicode(offset).ok(),
RichtextStateChunk::Style { .. } => {
if offset > 0 {
None
Expand All @@ -2806,7 +2884,7 @@ fn pos_type_offset_to_entity_offset(
PosType::Event => match elem {
RichtextStateChunk::Text(t) => {
if cfg!(feature = "wasm") {
utf16_to_unicode_index(t.as_str(), offset).ok()
t.utf16_offset_to_unicode(offset).ok()
} else if offset < t.unicode_len() as usize {
Some(offset)
} else {
Expand Down
11 changes: 6 additions & 5 deletions crates/loro-internal/src/delta/map_delta.rs
Original file line number Diff line number Diff line change
Expand Up @@ -101,18 +101,19 @@ impl MapDelta {
}

impl ResolvedMapDelta {
pub(crate) fn compose(&self, x: ResolvedMapDelta) -> ResolvedMapDelta {
let mut updated = self.updated.clone();
pub(crate) fn compose(mut self, x: ResolvedMapDelta) -> ResolvedMapDelta {
// Compose into `self` in place; cloning `self.updated` here made
// composing N fragments into one map event O(N^2).
for (k, v) in x.updated.into_iter() {
if let Some(old) = updated.get_mut(&k) {
if let Some(old) = self.updated.get_mut(&k) {
if v.idlp > old.idlp {
*old = v;
}
} else {
updated.insert(k, v);
self.updated.insert(k, v);
}
}
ResolvedMapDelta { updated }
self
}

#[inline]
Expand Down
14 changes: 7 additions & 7 deletions crates/loro-internal/src/event.rs
Original file line number Diff line number Diff line change
Expand Up @@ -369,10 +369,9 @@ impl InternalDiff {
(InternalDiff::ListRaw(a), InternalDiff::ListRaw(b)) => {
Ok(InternalDiff::ListRaw(a.compose(b)))
}
(InternalDiff::RichtextRaw(a), InternalDiff::RichtextRaw(b)) => {
let mut ans = a.clone();
ans.compose(&b);
Ok(InternalDiff::RichtextRaw(ans))
(InternalDiff::RichtextRaw(mut a), InternalDiff::RichtextRaw(b)) => {
a.compose(&b);
Ok(InternalDiff::RichtextRaw(a))
}
(InternalDiff::Map(a), InternalDiff::Map(b)) => Ok(InternalDiff::Map(a.compose(b))),
(InternalDiff::Tree(a), InternalDiff::Tree(b)) => Ok(InternalDiff::Tree(a.compose(b))),
Expand All @@ -383,7 +382,6 @@ impl InternalDiff {

impl Diff {
pub fn compose_ref(&mut self, diff: &Diff) {
// PERF: avoid clone
match (self, diff) {
(Diff::List(a), Diff::List(b)) => {
a.compose(b);
Expand All @@ -392,10 +390,12 @@ impl Diff {
a.compose(b);
}
(Diff::Map(a), Diff::Map(b)) => {
*a = a.clone().compose(b.clone());
// Move the accumulator out instead of cloning it, so composing
// a long run of fragments stays linear rather than O(n^2).
*a = std::mem::take(a).compose(b.clone());
}
(Diff::Tree(a), Diff::Tree(b)) => {
*a = a.clone().compose(b.clone());
*a = std::mem::take(a).compose(b.clone());
}
#[cfg(feature = "counter")]
(Diff::Counter(a), Diff::Counter(b)) => *a += b,
Expand Down
39 changes: 13 additions & 26 deletions crates/loro-internal/src/handler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2783,43 +2783,30 @@ impl TextHandler {
PosType::Unicode => Some(unicode_index),
PosType::Event => Some(event_index),
PosType::Bytes | PosType::Utf16 => {
// Use the prefix text to compute target offset.
let prefix = match &self.inner {
// Map the event-index position onto the target coordinate via the
// rope's prefix caches. This is O(log n); materializing the prefix
// string would be O(n) and makes repeated edits O(n^2).
match &self.inner {
MaybeDetached::Detached(t) => {
let t = t.lock();
if event_index > t.value.len_event() {
return None;
}
t.value.get_text_slice_by_event_index(0, event_index).ok()?
Some(t.value.event_index_to_index(event_index, to))
}
MaybeDetached::Attached(a) if a.has_decoded_state() => {
let res: Result<String, ()> = a.with_state(|state| {
let state = state.as_richtext_state_mut().unwrap();
if event_index > state.len_event() {
return Err(());
}
state
.get_text_slice_by_event_index(0, event_index)
.map_err(|_| ())
});

match res {
Ok(v) => v,
Err(_) => return None,
MaybeDetached::Attached(a) if a.has_decoded_state() => a.with_state(|state| {
let state = state.as_richtext_state_mut().unwrap();
if event_index > state.len_event() {
return None;
}
}
Some(state.event_index_to_index(event_index, to))
}),
MaybeDetached::Attached(a) => {
let value = a.get_value();
let s = value.as_string().unwrap();
return unicode_to_text_pos(s, unicode_index, to);
unicode_to_text_pos(s, unicode_index, to)
}
};

Some(match to {
PosType::Bytes => prefix.len(),
PosType::Utf16 => count_utf16_len(prefix.as_bytes()),
_ => unreachable!(),
})
}
}
PosType::Entity => None,
};
Expand Down
12 changes: 7 additions & 5 deletions crates/loro-internal/src/state.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1570,11 +1570,13 @@ impl DocState {

continue;
};
// TODO: PERF avoid this clone
*last_container_diff = last_container_diff
.clone()
.compose(container_diff.diff)
.unwrap();
// Compose in place. Cloning the accumulated diff here made a
// batch of N same-container fragments O(N^2) (each compose
// cloned the growing accumulator), which is hit whenever a
// subscriber is attached and many edits land on one container
// in a single event batch.
let prev = std::mem::take(last_container_diff);
*last_container_diff = prev.compose(container_diff.diff).unwrap();
}
}
let mut diff: Vec<_> = containers
Expand Down
6 changes: 6 additions & 0 deletions crates/loro-internal/src/state/richtext_state.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1065,6 +1065,12 @@ impl RichtextState {
.get_mut()
.event_index_to_unicode_index(event_index)
}

/// Translate `event_index` into an index of `pos_type` by forwarding to the
/// inner state's `event_index_to_index`, reached through `self.state.get_mut()`.
pub(crate) fn event_index_to_index(&mut self, event_index: usize, pos_type: PosType) -> usize {
    let inner = self.state.get_mut();
    inner.event_index_to_index(event_index, pos_type)
}
}

#[derive(Debug, Default, Clone)]
Expand Down
Loading
Loading