Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions java/src/main/java/org/lance/index/scalar/InvertedIndexParams.java
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ public static final class Builder {
private Integer minNgramLength;
private Integer maxNgramLength;
private Boolean prefixOnly;
private Boolean disableCrossArrayUnnest;
private Boolean skipMerge;

/**
Expand Down Expand Up @@ -223,6 +224,22 @@ public Builder prefixOnly(boolean prefixOnly) {
return this;
}

/**
 * Controls whether flattened JSON tokenization skips cross-array unnesting.
 *
 * <p>Setting this to true indexes sibling arrays independently rather than emitting their
 * Cartesian product. Index build memory can drop for JSON records that contain multiple arrays,
 * at the cost of possibly less accurate results for queries that constrain values across those
 * arrays. The default is false.
 *
 * @param disableCrossArrayUnnest true to avoid cross-array unnesting, false to keep it
 * @return this builder
 */
public Builder disableCrossArrayUnnest(boolean disableCrossArrayUnnest) {
  // Explicit boxing into the Boolean field; equivalent to the autoboxed assignment.
  this.disableCrossArrayUnnest = Boolean.valueOf(disableCrossArrayUnnest);
  return this;
}

/**
* Configure whether to skip the partition merge stage after indexing. If true, skip the
* partition merge stage after indexing. This can be useful for distributed indexing where merge
Expand Down Expand Up @@ -280,6 +297,9 @@ public ScalarIndexParams build() {
if (prefixOnly != null) {
params.put("prefix_only", prefixOnly);
}
if (disableCrossArrayUnnest != null) {
params.put("disable_cross_array_unnest", disableCrossArrayUnnest);
}
if (skipMerge != null) {
params.put("skip_merge", skipMerge);
}
Expand Down
6 changes: 6 additions & 0 deletions protos/index_old.proto
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,10 @@ message InvertedIndexDetails {
uint32 min_ngram_length = 9;
uint32 max_ngram_length = 10;
bool prefix_only = 11;
// JSON document tokenization mode. When absent, SingleDocument JSON tokenization is assumed,
// which is how indexes written before flattened JSON sub-docs were introduced are interpreted.
optional string json_tokenizer_mode = 12;
// If true, avoid cross-array unnesting during flattened JSON tokenization.
// The default false value preserves exact Cartesian-product semantics.
bool disable_cross_array_unnest = 13;
}
6 changes: 6 additions & 0 deletions python/python/lance/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3255,6 +3255,12 @@ def create_scalar_index(
``[1, num_compute_cpus]``. If unset, Lance uses ``num_compute_cpus``
workers unless ``LANCE_FTS_NUM_SHARDS`` is set. This parameter is
only used for the current build and is not persisted with the index.
disable_cross_array_unnest: bool, default False
This is for the ``INVERTED`` index on JSON columns. If True, flattened
JSON tokenization indexes sibling arrays independently instead of
producing their Cartesian product. This reduces index build memory for
records with multiple arrays but can sacrifice result accuracy for
queries that constrain values across those arrays.
base_tokenizer: str, default "simple"
This is for the ``INVERTED`` index. The base tokenizer to use. The
value can be:
Expand Down
5 changes: 4 additions & 1 deletion python/python/tests/test_scalar_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -4727,7 +4727,10 @@ def test_json_inverted_match_query(tmp_path):
stem=True,
lower_case=True,
remove_stop_words=True,
disable_cross_array_unnest=True,
)
details = dataset.describe_indices()[0].details
assert details["disable_cross_array_unnest"] is True

# Test match query with token exceeding max_token_length
results = dataset.to_table(
Expand All @@ -4743,7 +4746,7 @@ def test_json_inverted_match_query(tmp_path):

# Test language match
results = dataset.to_table(
full_text_query=MatchQuery("Language,str,english", "json_col")
full_text_query=MatchQuery("Language[*],str,english", "json_col")
)
assert results.num_rows == 3

Expand Down
6 changes: 6 additions & 0 deletions python/src/dataset.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2292,6 +2292,12 @@ impl Dataset {
if let Some(prefix_only) = kwargs.get_item("prefix_only")? {
params = params.ngram_prefix_only(prefix_only.extract()?);
}
if let Some(disable_cross_array_unnest) =
kwargs.get_item("disable_cross_array_unnest")?
{
params = params
.disable_cross_array_unnest(disable_cross_array_unnest.extract()?);
}
if let Some(memory_limit) = kwargs.get_item("memory_limit")? {
params = params.memory_limit_mb(memory_limit.extract()?);
}
Expand Down
2 changes: 1 addition & 1 deletion rust/lance-index/src/scalar/inverted.rs
Original file line number Diff line number Diff line change
Expand Up @@ -147,11 +147,11 @@ impl InvertedIndexPlugin {
}
});

let details = pbold::InvertedIndexDetails::try_from(&params)?;
let mut inverted_index =
InvertedIndexBuilder::new_with_fragment_mask(params, fragment_mask)
.with_progress(progress);
let files = inverted_index.update(data, index_store, None).await?;
let details = pbold::InvertedIndexDetails::try_from(inverted_index.params())?;
Ok(CreatedIndex {
index_details: prost_types::Any::from_msg(&details).unwrap(),
index_version: current_fts_format_version().index_version(),
Expand Down
Loading
Loading