lance-format · wirybeaver · Jun 21, 2026 · Jun 21, 2026
diff --git a/java/src/main/java/org/lance/index/scalar/InvertedIndexParams.java b/java/src/main/java/org/lance/index/scalar/InvertedIndexParams.java
@@ -53,6 +53,7 @@ public static final class Builder {
     private Integer minNgramLength;
     private Integer maxNgramLength;
     private Boolean prefixOnly;
+    private Boolean disableCrossArrayUnnest;
     private Boolean skipMerge;
 
     /**
@@ -223,6 +224,22 @@ public Builder prefixOnly(boolean prefixOnly) {
       return this;
     }
 
+    /**
+     * Configure whether flattened JSON tokenization avoids cross-array unnesting.
+     *
+     * <p>When true, sibling arrays are indexed independently instead of producing their Cartesian
+     * product. This can reduce index build memory for JSON records with multiple arrays, but
+     * queries that constrain values across those arrays may return less accurate results. The
+     * default is false; if this is never called, {@code build()} leaves the parameter out.
+     *
+     * @param disableCrossArrayUnnest true to index sibling arrays independently
+     * @return this builder
+     */
+    public Builder disableCrossArrayUnnest(boolean disableCrossArrayUnnest) {
+      this.disableCrossArrayUnnest = disableCrossArrayUnnest;
+      return this;
+    }
+
     /**
      * Configure whether to skip the partition merge stage after indexing. If true, skip the
      * partition merge stage after indexing. This can be useful for distributed indexing where merge
@@ -280,6 +297,9 @@ public ScalarIndexParams build() {
       if (prefixOnly != null) {
         params.put("prefix_only", prefixOnly);
       }
+      if (disableCrossArrayUnnest != null) {
+        params.put("disable_cross_array_unnest", disableCrossArrayUnnest);
+      }
       if (skipMerge != null) {
         params.put("skip_merge", skipMerge);
       }

diff --git a/protos/index_old.proto b/protos/index_old.proto
@@ -39,4 +39,10 @@ message InvertedIndexDetails {
   uint32 min_ngram_length = 9;
   uint32 max_ngram_length = 10;
   bool prefix_only = 11;
+  // JSON document tokenization mode. When absent, SingleDocument JSON tokenization is
+  // assumed; this is how indexes written before flattened JSON sub-docs are interpreted.
+  optional string json_tokenizer_mode = 12;
+  // If true, avoid cross-array unnesting during flattened JSON tokenization.
+  // The default false value preserves exact Cartesian-product semantics.
+  bool disable_cross_array_unnest = 13;
 }
diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py
@@ -3255,6 +3255,12 @@ def create_scalar_index(
             ``[1, num_compute_cpus]``. If unset, Lance uses ``num_compute_cpus``
             workers unless ``LANCE_FTS_NUM_SHARDS`` is set. This parameter is
             only used for the current build and is not persisted with the index.
+        disable_cross_array_unnest: bool, default False
+            This is for the ``INVERTED`` index on JSON columns. If True, flattened
+            JSON tokenization indexes sibling arrays independently instead of
+            producing their Cartesian product. This reduces index build memory for
+            records with multiple arrays but can sacrifice result accuracy for
+            queries that constrain values across those arrays.
         base_tokenizer: str, default "simple"
             This is for the ``INVERTED`` index. The base tokenizer to use. The
             value can be:

diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py
@@ -4727,7 +4727,10 @@ def test_json_inverted_match_query(tmp_path):
         stem=True,
         lower_case=True,
         remove_stop_words=True,
+        disable_cross_array_unnest=True,
     )
+    details = dataset.describe_indices()[0].details
+    assert details["disable_cross_array_unnest"] is True
 
     # Test match query with token exceeding max_token_length
     results = dataset.to_table(
@@ -4743,7 +4746,7 @@ def test_json_inverted_match_query(tmp_path):
 
     # Test language match
     results = dataset.to_table(
-        full_text_query=MatchQuery("Language,str,english", "json_col")
+        full_text_query=MatchQuery("Language[*],str,english", "json_col")
     )
     assert results.num_rows == 3
 

diff --git a/python/src/dataset.rs b/python/src/dataset.rs
@@ -2292,6 +2292,12 @@ impl Dataset {
                     if let Some(prefix_only) = kwargs.get_item("prefix_only")? {
                         params = params.ngram_prefix_only(prefix_only.extract()?);
                     }
+                    if let Some(disable_cross_array_unnest) =
+                        kwargs.get_item("disable_cross_array_unnest")?
+                    {
+                        params = params
+                            .disable_cross_array_unnest(disable_cross_array_unnest.extract()?);
+                    }
                     if let Some(memory_limit) = kwargs.get_item("memory_limit")? {
                         params = params.memory_limit_mb(memory_limit.extract()?);
                     }

diff --git a/rust/lance-index/src/scalar/inverted.rs b/rust/lance-index/src/scalar/inverted.rs
@@ -147,11 +147,11 @@ impl InvertedIndexPlugin {
             }
         });
 
-        let details = pbold::InvertedIndexDetails::try_from(&params)?;
         let mut inverted_index =
             InvertedIndexBuilder::new_with_fragment_mask(params, fragment_mask)
                 .with_progress(progress);
         let files = inverted_index.update(data, index_store, None).await?;
+        let details = pbold::InvertedIndexDetails::try_from(inverted_index.params())?;
         Ok(CreatedIndex {
             index_details: prost_types::Any::from_msg(&details).unwrap(),
             index_version: current_fts_format_version().index_version(),