diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 1a2ec6a6a691..bacd176ea776 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -202,6 +202,8 @@ New Features * GITHUB#15468: Add support for `@SuppressAssertingFormats` annotation for fine-grained control over `AssertingCodec` formats (Prudhvi Godithi) + * GITHUB#15518: Add support for post-collection faceting to the new faceting API in the sandbox module. (Egor Potemkin) + Improvements --------------------- * GITHUB#15148: Add support uint8 distance and allow 8 bit scalar quantization (Trevor McCulloch) diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/utils/PostCollectionFaceting.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/utils/PostCollectionFaceting.java new file mode 100644 index 000000000000..e11e267148c8 --- /dev/null +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/facet/utils/PostCollectionFaceting.java @@ -0,0 +1,454 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.sandbox.facet.utils; + +import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.Callable; +import java.util.concurrent.Executor; +import org.apache.lucene.facet.FacetsCollector; +import org.apache.lucene.search.Collector; +import org.apache.lucene.search.CollectorManager; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.LeafCollector; +import org.apache.lucene.search.Scorable; +import org.apache.lucene.search.TaskExecutor; +import org.apache.lucene.util.ArrayUtil; + +/** + * Performs post-collection faceting by replaying collected documents through drill-down and + * drill-sideways collectors. This enables parallel facet computation after initial document + * collection. + * + *

Normally, users can collect facets directly during search without needing FacetsCollector to
+ * store doc IDs. However, this class implements the second step of two-step collection: iterating
+ * over doc IDs already collected in FacetsCollector to compute facet results. This approach is
+ * useful when all matches must be known before computing facets, or when reusing the same matching
+ * documents to run faceting multiple times.
+ *
+ * @param <C> drill-down collector type
+ * @param <T> drill-down result type
+ * @param <U> drill-sideways collector type
+ * @param <V> drill-sideways result type
+ */
+public final class PostCollectionFaceting<C extends Collector, T, U extends Collector, V> {
+  // TODO: is there more optimal slicing method or docs per slice limit?
+  static final int MIN_DOCS_PER_SLICE = 100;
+
+  // May be null, in which case no drill-down result is computed.
+  private final CollectorManager<C, T> drillDownCollectorManager;
+  // Only managers whose dimension also has a facets collector; indexed by dimToIndexMap values.
+  private final List<CollectorManager<U, V>> drillSidewaysCollectorManagers;
+  private final FacetsCollector drillDownFacetsCollector;
+  // Dimension name -> facets collector holding that dimension's drill-sideways matches.
+  private final Map<String, FacetsCollector> drillSidewaysFacetsCollectors;
+  private final TaskExecutor taskExecutor;
+  // Dimension name -> position of its manager in drillSidewaysCollectorManagers.
+  private final Map<String, Integer> dimToIndexMap;
+  // Highest leaf ord found in any facets collector, plus one.
+  private final int numOfIndexLeafs;
+
+  /**
+   * Creates a new PostCollectionFaceting instance.
+   *
+   * <p>A drill-sideways dimension is only processed when it is present in both {@code
+   * drillSidewaysCollectorManagers} and {@code drillSidewaysFacetsCollectors}; dimensions that
+   * appear in just one of the two maps are ignored.
+   *
+   * @param drillDownCollectorManager collector manager for drill-down results, or null to skip
+   *     the drill-down result
+   * @param drillSidewaysCollectorManagers map of dimension names to collector managers for
+   *     drill-sideways results, or null if there are none
+   * @param drillDownFacetsCollector facets collector containing drill-down matching documents
+   * @param drillSidewaysFacetsCollectors map of dimension names to facets collectors for
+   *     drill-sideways, or null if there are none
+   * @param executor executor for parallel processing, or null for sequential execution
+   */
+  public PostCollectionFaceting(
+      CollectorManager<C, T> drillDownCollectorManager,
+      Map<String, CollectorManager<U, V>> drillSidewaysCollectorManagers,
+      FacetsCollector drillDownFacetsCollector,
+      Map<String, FacetsCollector> drillSidewaysFacetsCollectors,
+      Executor executor) {
+    this.drillDownCollectorManager = drillDownCollectorManager;
+    this.drillDownFacetsCollector = drillDownFacetsCollector;
+    this.drillSidewaysFacetsCollectors =
+        drillSidewaysFacetsCollectors != null ? drillSidewaysFacetsCollectors : Map.of();
+
+    // Without an executor every task runs directly on the calling thread.
+    final Executor effectiveExecutor = executor != null ? executor : Runnable::run;
+    this.taskExecutor = new TaskExecutor(effectiveExecutor);
+
+    // Must run after both facets collector fields are assigned: it reads them.
+    this.numOfIndexLeafs = calculateNumOfIndexLeafs();
+
+    this.dimToIndexMap = new HashMap<>();
+    final List<CollectorManager<U, V>> retainedManagers;
+    if (drillSidewaysCollectorManagers == null) {
+      // No managers supplied: no dimension is registered, so no drill-sideways docs are replayed.
+      retainedManagers = List.of();
+    } else {
+      retainedManagers = new ArrayList<>(drillSidewaysCollectorManagers.size());
+      for (Map.Entry<String, CollectorManager<U, V>> entry :
+          drillSidewaysCollectorManagers.entrySet()) {
+        final String dim = entry.getKey();
+        if (!this.drillSidewaysFacetsCollectors.containsKey(dim)) {
+          // Ignore dimensions that don't exist in either one of the maps.
+          continue;
+        }
+        // The dimension's index is the position its manager is about to take in the list.
+        dimToIndexMap.put(dim, retainedManagers.size());
+        retainedManagers.add(entry.getValue());
+      }
+    }
+    this.drillSidewaysCollectorManagers = retainedManagers;
+  }
+
+  /**
+   * Creates a new PostCollectionFaceting instance without drill-sideways collectors.
+   *
+   * @param drillDownCollectorManager collector manager for drill-down results
+   * @param drillDownFacetsCollector facets collector containing drill-down matching documents
+   * @param executor executor for parallel processing, or null for sequential execution
+   */
+  public PostCollectionFaceting(
+      CollectorManager<C, T> drillDownCollectorManager,
+      FacetsCollector drillDownFacetsCollector,
+      Executor executor) {
+    this(drillDownCollectorManager, null, drillDownFacetsCollector, null, executor);
+  }
+
+  /**
+   * Returns the highest leaf ord found in the drill-down or any drill-sideways facets collector,
+   * plus one; 0 when no collector holds any matching docs.
+   */
+  private int calculateNumOfIndexLeafs() {
+    int highestOrd = highestLeafOrd(drillDownFacetsCollector, -1);
+    for (FacetsCollector sidewaysCollector : drillSidewaysFacetsCollectors.values()) {
+      highestOrd = highestLeafOrd(sidewaysCollector, highestOrd);
+    }
+    return highestOrd + 1;
+  }
+
+  /** Returns the larger of {@code floor} and the highest leaf ord among the collector's docs. */
+  private static int highestLeafOrd(FacetsCollector facetsCollector, int floor) {
+    int highest = floor;
+    for (FacetsCollector.MatchingDocs docs : facetsCollector.getMatchingDocs()) {
+      final int ord = docs.context().ord;
+      if (ord > highest) {
+        highest = ord;
+      }
+    }
+    return highest;
+  }
+
+  /**
+   * Organizes matching documents into a 2D array indexed by leaf ordinal and dimension ordinal.
+   *
+   * @return leaf ord -> dim ord -> matching docs, where dim ord 0 holds the drill-down docs and
+   *     dim ord (dimToIndexMap value + 1) holds the docs of that drill-sideways dimension
+   *

Example: For an index with 3 segments and 2 drill-sideways dimensions ("brand",
+   *     "color"):
+   *     <pre>
+   * // dimToIndexMap: {"brand" -> 0, "color" -> 1}
+   * // Result array structure:
+   * result[0][0] = drill-down docs for segment 0
+   * result[0][1] = "brand" drill-sideways docs for segment 0
+   * result[0][2] = "color" drill-sideways docs for segment 0
+   * result[1][0] = drill-down docs for segment 1
+   * result[1][1] = "brand" drill-sideways docs for segment 1
+   * ...
+   * </pre>
+   *
+   *     <p>Note: {@link FacetsCollector#getMatchingDocs()} returns one MatchingDocs per visited
+   *     segment, so the number of MatchingDocs is never greater than the number of index segments,
+   *     even if intra-segment concurrency was used to collect data.
+   */
+  private FacetsCollector.MatchingDocs[][] getPerLeafMatchingDocs() {
+    // Column 0 is drill-down; columns 1..n are the retained drill-sideways dimensions.
+    final int columnCount = drillSidewaysCollectorManagers.size() + 1;
+    final FacetsCollector.MatchingDocs[][] table =
+        new FacetsCollector.MatchingDocs[numOfIndexLeafs][columnCount];
+
+    for (FacetsCollector.MatchingDocs docs : drillDownFacetsCollector.getMatchingDocs()) {
+      table[docs.context().ord][0] = docs;
+    }
+
+    dimToIndexMap.forEach(
+        (dim, dimIndex) -> {
+          final int column = dimIndex + 1;
+          for (FacetsCollector.MatchingDocs docs :
+              drillSidewaysFacetsCollectors.get(dim).getMatchingDocs()) {
+            table[docs.context().ord][column] = docs;
+          }
+        });
+    // Cells for a leaf/dimension pair with no MatchingDocs stay null.
+    return table;
+  }
+
+  /** Rows of the per-leaf table (one row per leaf) that are replayed together as one task. */
+  private record Slice(FacetsCollector.MatchingDocs[][] leafMatchingDocs) {}
+
+  /**
+   * Partitions matching documents into slices for parallel processing.
+   *
+   *

Slicing enables parallel facet computation by distributing work across multiple threads.
+   * Each slice contains a subset of index segments with enough documents to justify the overhead of
+   * parallel execution, improving throughput for large result sets.
+   *
+   * <p>Leaves are consumed in ord order. A slice is closed as soon as the hits accumulated across
+   * all dimensions of its leaves reach {@code minDocsPerSlice}; leaves left over at the end form a
+   * final slice only if they contain at least one hit.
+   *
+   * @param minDocsPerSlice minimum number of documents per slice to balance parallelization
+   *     overhead
+   * @param perLeafMatchingDocs matching documents organized by leaf ordinal and dimension ordinal
+   * @return list of slices, each containing a subset of segments for independent processing
+   */
+  private List<Slice> getSlices(
+      int minDocsPerSlice, FacetsCollector.MatchingDocs[][] perLeafMatchingDocs) {
+    List<Slice> slices = new ArrayList<>();
+
+    // Accumulate in a long: summing int totalHits() over every dimension of a very large leaf
+    // can exceed Integer.MAX_VALUE, and a wrapped (negative) sum would never close the slice and
+    // could make the final "> 0" check drop the trailing leaves.
+    long currentSliceSize = 0;
+    int lastSliceEnd = -1;
+    for (int leafOrd = 0; leafOrd < perLeafMatchingDocs.length; leafOrd++) {
+      for (FacetsCollector.MatchingDocs matchingDocs : perLeafMatchingDocs[leafOrd]) {
+        if (matchingDocs != null) {
+          currentSliceSize += matchingDocs.totalHits();
+        }
+      }
+      if (currentSliceSize >= minDocsPerSlice) {
+        slices.add(
+            new Slice(
+                ArrayUtil.copyOfSubArray(perLeafMatchingDocs, lastSliceEnd + 1, leafOrd + 1)));
+        currentSliceSize = 0;
+        lastSliceEnd = leafOrd;
+      }
+    }
+    // add final slice
+    if (currentSliceSize > 0) {
+      slices.add(
+          new Slice(
+              ArrayUtil.copyOfSubArray(
+                  perLeafMatchingDocs, lastSliceEnd + 1, perLeafMatchingDocs.length)));
+    }
+    return slices;
+  }
+
+  /**
+   * Collects facet results by replaying documents through collectors in parallel slices.
+   *
+   * <p>One collector per manager is created for every slice, the slices are replayed through the
+   * task executor, and each manager then reduces its collectors. When there is nothing to replay,
+   * every manager reduces an empty collector list instead.
+   *
+   * @return result containing drill-down and drill-sideways facet results; the drill-down result
+   *     is null when no drill-down collector manager was supplied
+   * @throws IOException if an I/O error occurs during collection
+   */
+  public Result<T, V> collect() throws IOException {
+    final List<Slice> leafSlices = getSlices(MIN_DOCS_PER_SLICE, getPerLeafMatchingDocs());
+    if (leafSlices.isEmpty()) {
+      return getEmptyResult();
+    }
+
+    final boolean hasDrillDown = drillDownCollectorManager != null;
+    final int sliceCount = leafSlices.size();
+    final List<C> drillDownCollectors = hasDrillDown ? new ArrayList<>(sliceCount) : null;
+    // Outer list: one entry per slice. Inner list: one collector per retained dimension.
+    final List<List<U>> drillSidewaysCollectors = new ArrayList<>(sliceCount);
+    final List<Callable<Void>> listTasks = new ArrayList<>(sliceCount);
+
+    for (Slice slice : leafSlices) {
+      // drill down collector
+      final C drillDownCollector = hasDrillDown ? drillDownCollectorManager.newCollector() : null;
+      if (hasDrillDown) {
+        drillDownCollectors.add(drillDownCollector);
+      }
+
+      // drill sideways collectors
+      final List<U> sliceSidewaysCollectors =
+          new ArrayList<>(drillSidewaysCollectorManagers.size());
+      for (CollectorManager<U, V> manager : drillSidewaysCollectorManagers) {
+        sliceSidewaysCollectors.add(manager.newCollector());
+      }
+      drillSidewaysCollectors.add(sliceSidewaysCollectors);
+
+      listTasks.add(() -> collectSlice(slice, drillDownCollector, sliceSidewaysCollectors));
+    }
+    taskExecutor.invokeAll(listTasks);
+
+    // Reduce drill-sideways first, then drill-down.
+    final Map<String, V> drillSidewaysResults =
+        new HashMap<>(drillSidewaysCollectorManagers.size());
+    for (Map.Entry<String, Integer> entry : dimToIndexMap.entrySet()) {
+      final int dimIndex = entry.getValue();
+      // Pick this dimension's collector out of every slice.
+      final List<U> dimCollectors =
+          drillSidewaysCollectors.stream().map(perSlice -> perSlice.get(dimIndex)).toList();
+      final CollectorManager<U, V> dimManager = drillSidewaysCollectorManagers.get(dimIndex);
+      drillSidewaysResults.put(entry.getKey(), dimManager.reduce(dimCollectors));
+    }
+
+    final T drillDownResult =
+        hasDrillDown ? drillDownCollectorManager.reduce(drillDownCollectors) : null;
+    return new Result<>(drillDownResult, drillSidewaysResults);
+  }
+
+  /**
+   * Builds the result for the case where no slice was produced, by reducing an empty collector
+   * list with every manager.
+   */
+  private Result<T, V> getEmptyResult() throws IOException {
+    // there are no segments, nothing to offload to the executor, but we do need to call reduce to
+    // create some kind of empty result
+    final Map<String, V> emptyResults = new HashMap<>();
+    for (Map.Entry<String, Integer> entry : dimToIndexMap.entrySet()) {
+      final CollectorManager<U, V> dimManager =
+          drillSidewaysCollectorManagers.get(entry.getValue());
+      emptyResults.put(entry.getKey(), dimManager.reduce(List.of()));
+    }
+    final T drillDownResult =
+        drillDownCollectorManager == null ? null : drillDownCollectorManager.reduce(List.of());
+    return new Result<>(drillDownResult, emptyResults);
+  }
+
+  /**
+   * The result. It is very similar to DrillSideways.Result, but it uses Map instead of List for
+   * drill sideways results. See also to-do comment in DrillSidewaysFacetOrchestrator.
+   *
+   * @param <T> drill down result
+   * @param <V> drill sideways result
+   * @param drillDownResult the drill down result
+   * @param drillSidewaysResults the drill sideways results
+   */
+  public record Result<T, V>(T drillDownResult, Map<String, V> drillSidewaysResults) {}
+
+  /**
+   * Scorable backed by the scores array of a MatchingDocs; {@link #score()} returns the entry for
+   * the doc ID most recently passed to {@link #setCurrentDocId(int)}.
+   */
+  private static class MatchingDocsScorable extends Scorable {
+    private final FacetsCollector.MatchingDocs matchingDocs;
+    private int currentDocId = -1;
+
+    MatchingDocsScorable(FacetsCollector.MatchingDocs matchingDocs) {
+      this.matchingDocs = matchingDocs;
+    }
+
+    void setCurrentDocId(int docId) {
+      this.currentDocId = docId;
+    }
+
+    @Override
+    public float score() throws IOException {
+      assert currentDocId >= 0 : "setCurrentDocId() must be called before score()";
+      assert matchingDocs.scores().length > currentDocId
+          : "scores array is indexed by doc ID (see FacetsCollector.MatchingDocs), so length must be greater"
+              + " than currentDocId";
+      return matchingDocs.scores()[currentDocId];
+    }
+  }
+
+  /** Returns the collector's leaf collector for the docs' leaf, or null if either is null. */
+  private static LeafCollector getLeafCollector(
+      FacetsCollector.MatchingDocs matchingDocs, Collector collector) throws IOException {
+    if (matchingDocs == null || collector == null) {
+      return null;
+    }
+    return collector.getLeafCollector(matchingDocs.context());
+  }
+
+  /**
+   * Returns a scorable for the given docs if the collector needs scores, otherwise null.
+   *
+   * @throws IllegalStateException if the collector needs scores but none were recorded
+   */
+  private static MatchingDocsScorable createScorer(
+      FacetsCollector.MatchingDocs matchingDocs, Collector collector) {
+    if (matchingDocs == null || collector == null) {
+      return null;
+    }
+    if (collector.scoreMode().needsScores()) {
+      if (matchingDocs.scores() == null) {
+        throw new IllegalStateException(
+            "Collector requires scores, but FacetsCollector doesn't have them.");
+      } else {
+        return new MatchingDocsScorable(matchingDocs);
+      }
+    }
+    return null;
+  }
+
+  /**
+   * Replays every leaf of the slice through the drill-down collector (slot 0) and the
+   * drill-sideways collectors (slots 1..n). Returns null so it can be used as a Callable.
+   */
+  private Void collectSlice(Slice slice, C drillDownCollector, List<U> drillSidewaysCollectors)
+      throws IOException {
+    LeafCollector[] leafCollectors = new LeafCollector[drillSidewaysCollectors.size() + 1];
+    // Init lazily as it is not often needed.
+    MatchingDocsScorable[] scorables = null;
+    for (FacetsCollector.MatchingDocs[] leafMatchingDocs : slice.leafMatchingDocs()) {
+      scorables = bindLeaf(0, leafMatchingDocs[0], drillDownCollector, leafCollectors, scorables);
+      for (int i = 0; i < drillSidewaysCollectors.size(); i++) {
+        scorables =
+            bindLeaf(
+                i + 1,
+                leafMatchingDocs[i + 1],
+                drillSidewaysCollectors.get(i),
+                leafCollectors,
+                scorables);
+      }
+      collectLeaf(leafMatchingDocs, leafCollectors, scorables);
+    }
+    return null;
+  }
+
+  /**
+   * Fills {@code leafCollectors[slot]} for one leaf and, if the collector needs scores, installs a
+   * scorer on it.
+   *
+   * @return the scorables array, allocated here on first use; null if no scorer was needed so far
+   */
+  private static MatchingDocsScorable[] bindLeaf(
+      int slot,
+      FacetsCollector.MatchingDocs matchingDocs,
+      Collector collector,
+      LeafCollector[] leafCollectors,
+      MatchingDocsScorable[] scorables)
+      throws IOException {
+    leafCollectors[slot] = getLeafCollector(matchingDocs, collector);
+    MatchingDocsScorable scorer = createScorer(matchingDocs, collector);
+    MatchingDocsScorable[] result = scorables;
+    if (scorer != null) {
+      if (result == null) {
+        result = new MatchingDocsScorable[leafCollectors.length];
+      }
+      result[slot] = scorer;
+      // A non-null scorer implies non-null matchingDocs and collector, so the leaf collector
+      // obtained above was actually requested from the collector.
+      leafCollectors[slot].setScorer(scorer);
+    }
+    return result;
+  }
+
+  /**
+   * Replays one leaf: walks all non-null doc ID iterators in lockstep, in increasing doc ID order,
+   * and passes each doc to the leaf collector of every slot whose iterator is positioned on it.
+   * Finishes every non-null leaf collector afterwards.
+   *
+   * <p>NOTE(review): an exception thrown by a leaf collector (for example to terminate collection
+   * early) is not caught here and propagates to the caller; confirm this is intended.
+   *
+   * @param matchingDocs per-slot matching docs for this leaf; entries may be null
+   * @param leafCollectors per-slot leaf collectors; entries may be null
+   * @param scorables per-slot scorables, or null if no collector needs scores
+   */
+  private static void collectLeaf(
+      FacetsCollector.MatchingDocs[] matchingDocs,
+      LeafCollector[] leafCollectors,
+      MatchingDocsScorable[] scorables)
+      throws IOException {
+    assert matchingDocs.length == leafCollectors.length;
+    // init
+    int currentDocToCollect = NO_MORE_DOCS;
+    // TODO: can move iterators out of this method, pass instead of matchingDocs?
+    DocIdSetIterator[] iterators = new DocIdSetIterator[matchingDocs.length];
+    for (int i = 0; i < matchingDocs.length; i++) {
+      if (matchingDocs[i] == null || leafCollectors[i] == null) {
+        continue;
+      }
+      DocIdSetIterator iterator = matchingDocs[i].bits().iterator();
+      if (iterator == null) {
+        // DocIdSet#iterator() is allowed to return null when the set has no docs.
+        continue;
+      }
+      int firstDoc = iterator.nextDoc();
+      if (firstDoc == NO_MORE_DOCS) {
+        // Already exhausted: leave the slot null, as done below when an iterator runs out.
+        continue;
+      }
+      iterators[i] = iterator;
+      currentDocToCollect = Math.min(currentDocToCollect, firstDoc);
+    }
+    // collection
+    int nextDocToCollect;
+    while (currentDocToCollect < NO_MORE_DOCS) {
+      nextDocToCollect = NO_MORE_DOCS;
+      for (int i = 0; i < iterators.length; i++) {
+        if (iterators[i] == null) {
+          continue;
+        }
+        if (iterators[i].docID() == currentDocToCollect) {
+          assert leafCollectors[i] != null
+              : "leafCollectors[" + i + "] is null but the iterator is not null";
+          if (scorables != null && scorables[i] != null) {
+            scorables[i].setCurrentDocId(currentDocToCollect);
+          }
+          leafCollectors[i].collect(currentDocToCollect);
+          int nextDoc = iterators[i].nextDoc();
+          if (nextDoc == NO_MORE_DOCS) {
+            iterators[i] = null;
+          } else if (nextDocToCollect > nextDoc) {
+            nextDocToCollect = nextDoc;
+          }
+        } else {
+          assert iterators[i].docID() > currentDocToCollect
+              : "currentDocToCollect ("
+                  + currentDocToCollect
+                  + ") should always be less than iterators[i].docID() ("
+                  + iterators[i].docID()
+                  + ")";
+
+          if (nextDocToCollect > iterators[i].docID()) {
+            nextDocToCollect = iterators[i].docID();
+          }
+        }
+      }
+      assert nextDocToCollect > currentDocToCollect;
+      currentDocToCollect = nextDocToCollect;
+    }
+    // finish
+    for (LeafCollector leafCollector : leafCollectors) {
+      if (leafCollector != null) {
+        leafCollector.finish();
+      }
+    }
+  }
+}
diff --git a/lucene/sandbox/src/test/org/apache/lucene/sandbox/facet/utils/TestPostCollectionFaceting.java b/lucene/sandbox/src/test/org/apache/lucene/sandbox/facet/utils/TestPostCollectionFaceting.java
new file mode 100644
index 000000000000..cb290f0d8934
--- /dev/null
+++ 
b/lucene/sandbox/src/test/org/apache/lucene/sandbox/facet/utils/TestPostCollectionFaceting.java @@ -0,0 +1,593 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.sandbox.facet.utils; + +import static java.util.concurrent.Executors.newFixedThreadPool; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ExecutorService; +import org.apache.lucene.document.Document; +import org.apache.lucene.facet.FacetsCollector; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.NoMergePolicy; +import org.apache.lucene.search.CollectorManager; +import org.apache.lucene.search.Scorable; +import org.apache.lucene.search.ScoreMode; +import org.apache.lucene.search.SimpleCollector; +import org.apache.lucene.store.Directory; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.DocIdSetBuilder; +import org.apache.lucene.util.NamedThreadFactory; + 
+public class TestPostCollectionFaceting extends LuceneTestCase { + + private final List testReaders = new ArrayList<>(); + private final List testDirs = new ArrayList<>(); + + @Override + public void tearDown() throws Exception { + for (DirectoryReader reader : testReaders) { + reader.close(); + } + for (Directory dir : testDirs) { + dir.close(); + } + super.tearDown(); + } + + public void testBasic() throws IOException { + List contexts = createLeafContexts(2); + LeafReaderContext ctx0 = contexts.get(0); + LeafReaderContext ctx1 = contexts.get(1); + + TestFacetsCollector drillDownFacets = createFacetsCollector(ctx0, 2, 3); + addMatchingDocs(drillDownFacets, ctx1, 5); + + TestFacetsCollector dim1Facets = createFacetsCollector(ctx0, 1, 2, 6); + addMatchingDocs(dim1Facets, ctx1, 4, 7); + + TestFacetsCollector dim2Facets = createFacetsCollector(ctx0, 2, 3, 8); + addMatchingDocs(dim2Facets, ctx1, 5, 9); + + Map drillSidewaysFacets = new HashMap<>(); + drillSidewaysFacets.put("dim1", dim1Facets); + drillSidewaysFacets.put("dim2", dim2Facets); + + DocCountCollectorManager drillDownManager = new DocCountCollectorManager(); + Map drillSidewaysManagers = new HashMap<>(); + drillSidewaysManagers.put("dim1", new DocCountCollectorManager()); + drillSidewaysManagers.put("dim2", new DocCountCollectorManager()); + + PostCollectionFaceting + faceting = + new PostCollectionFaceting<>( + drillDownManager, + drillSidewaysManagers, + drillDownFacets, + drillSidewaysFacets, + null); + + PostCollectionFaceting.Result result = faceting.collect(); + + assertNotNull(result.drillDownResult()); + assertEquals(3, result.drillDownResult().totalDocs); + assertEquals(Set.of(2, 3, 5), result.drillDownResult().allDocIds); + + assertNotNull(result.drillSidewaysResults()); + assertEquals(2, result.drillSidewaysResults().size()); + + DocCountResult dim1Result = result.drillSidewaysResults().get("dim1"); + assertNotNull(dim1Result); + assertEquals(5, dim1Result.totalDocs); + assertEquals(Set.of(1, 2, 
6, 4, 7), dim1Result.allDocIds); + + DocCountResult dim2Result = result.drillSidewaysResults().get("dim2"); + assertNotNull(dim2Result); + assertEquals(5, dim2Result.totalDocs); + assertEquals(Set.of(2, 3, 8, 5, 9), dim2Result.allDocIds); + } + + public void testEmptySlices() throws IOException { + TestFacetsCollector drillDownFacets = new TestFacetsCollector(); + Map drillSidewaysFacets = new HashMap<>(); + drillSidewaysFacets.put("dim1", new TestFacetsCollector()); + + DocCountCollectorManager drillDownManager = new DocCountCollectorManager(); + Map drillSidewaysManagers = new HashMap<>(); + drillSidewaysManagers.put("dim1", new DocCountCollectorManager()); + + PostCollectionFaceting + faceting = + new PostCollectionFaceting<>( + drillDownManager, + drillSidewaysManagers, + drillDownFacets, + drillSidewaysFacets, + null); + + PostCollectionFaceting.Result result = faceting.collect(); + + assertNotNull(result.drillDownResult()); + assertEquals(0, result.drillDownResult().totalDocs); + assertEquals(1, result.drillSidewaysResults().size()); + assertEquals(0, result.drillSidewaysResults().get("dim1").totalDocs); + } + + public void testScoreCollector() throws IOException { + List contexts = createLeafContexts(1); + TestFacetsCollector drillDownFacets = + createFacetsCollectorWithScores( + contexts.get(0), new int[] {0, 1}, new float[] {1.0f, 2.0f}); + + Map drillSidewaysFacets = new HashMap<>(); + drillSidewaysFacets.put( + "dim1", + createFacetsCollectorWithScores(contexts.get(0), new int[] {0}, new float[] {1.5f})); + + ScoreCollectorManager drillDownManager = new ScoreCollectorManager(); + Map drillSidewaysManagers = new HashMap<>(); + drillSidewaysManagers.put("dim1", new ScoreCollectorManager()); + + PostCollectionFaceting faceting = + new PostCollectionFaceting<>( + drillDownManager, drillSidewaysManagers, drillDownFacets, drillSidewaysFacets, null); + + PostCollectionFaceting.Result result = faceting.collect(); + assertNotNull(result.drillDownResult()); + 
assertEquals(3.0f, result.drillDownResult(), 0.001f); + assertEquals(1.5f, result.drillSidewaysResults().get("dim1"), 0.001f); + } + + public void testScoreCollectorWithoutScores() throws IOException { + List contexts = createLeafContexts(1); + TestFacetsCollector drillDownFacets = createFacetsCollector(contexts.get(0), 1, 2); + + Map drillSidewaysFacets = new HashMap<>(); + drillSidewaysFacets.put("dim1", createFacetsCollector(contexts.get(0), 1)); + + ScoreCollectorManager drillDownManager = new ScoreCollectorManager(); + Map drillSidewaysManagers = new HashMap<>(); + drillSidewaysManagers.put("dim1", new ScoreCollectorManager()); + + PostCollectionFaceting faceting = + new PostCollectionFaceting<>( + drillDownManager, drillSidewaysManagers, drillDownFacets, drillSidewaysFacets, null); + + try { + faceting.collect(); + fail("Expected IllegalStateException"); + } catch (IllegalStateException e) { + assertTrue(e.getMessage().contains("Collector requires scores")); + } + } + + public void testMultipleSlices() throws IOException { + List contexts = createLeafContexts(3); + + TestFacetsCollector drillDownFacets = + createLargeFacetsCollector(contexts, PostCollectionFaceting.MIN_DOCS_PER_SLICE); + TestFacetsCollector dim1Facets = + createLargeFacetsCollector(contexts, PostCollectionFaceting.MIN_DOCS_PER_SLICE * 2); + + Map drillSidewaysFacets = new HashMap<>(); + drillSidewaysFacets.put("dim1", dim1Facets); + + DocCountCollectorManager drillDownManager = new DocCountCollectorManager(); + Map drillSidewaysManagers = new HashMap<>(); + drillSidewaysManagers.put("dim1", new DocCountCollectorManager()); + + PostCollectionFaceting + faceting = + new PostCollectionFaceting<>( + drillDownManager, + drillSidewaysManagers, + drillDownFacets, + drillSidewaysFacets, + null); + + PostCollectionFaceting.Result result = faceting.collect(); + assertNotNull(result.drillDownResult()); + assertEquals(PostCollectionFaceting.MIN_DOCS_PER_SLICE, result.drillDownResult().totalDocs); + 
assertEquals( + PostCollectionFaceting.MIN_DOCS_PER_SLICE * 2, + result.drillSidewaysResults().get("dim1").totalDocs); + } + + public void testNullDrillSideways() throws IOException { + List contexts = createLeafContexts(1); + TestFacetsCollector drillDownFacets = createFacetsCollector(contexts.get(0), 1, 2); + + DocCountCollectorManager drillDownManager = new DocCountCollectorManager(); + + PostCollectionFaceting + faceting = new PostCollectionFaceting<>(drillDownManager, drillDownFacets, null); + + PostCollectionFaceting.Result result = faceting.collect(); + assertNotNull(result.drillDownResult()); + assertEquals(2, result.drillDownResult().totalDocs); + assertEquals(Set.of(1, 2), result.drillDownResult().allDocIds); + assertNotNull(result.drillSidewaysResults()); + assertEquals(0, result.drillSidewaysResults().size()); + } + + public void testNullDrillDownManager() throws IOException { + List contexts = createLeafContexts(1); + TestFacetsCollector drillDownFacets = createFacetsCollector(contexts.get(0), 1, 2); + TestFacetsCollector dim1Facets = createFacetsCollector(contexts.get(0), 1); + + Map drillSidewaysFacets = new HashMap<>(); + drillSidewaysFacets.put("dim1", dim1Facets); + + Map drillSidewaysManagers = new HashMap<>(); + drillSidewaysManagers.put("dim1", new DocCountCollectorManager()); + + PostCollectionFaceting + faceting = + new PostCollectionFaceting<>( + null, drillSidewaysManagers, drillDownFacets, drillSidewaysFacets, null); + + PostCollectionFaceting.Result result = faceting.collect(); + assertNull(result.drillDownResult()); + assertNotNull(result.drillSidewaysResults()); + assertEquals(1, result.drillSidewaysResults().size()); + assertEquals(1, result.drillSidewaysResults().get("dim1").totalDocs); + } + + public void testNullMatchingDocs() throws IOException { + List contexts = createLeafContexts(2); + LeafReaderContext ctx0 = contexts.get(0); + LeafReaderContext ctx1 = contexts.get(1); + + // Create drill down facets with docs in both contexts 
+ TestFacetsCollector drillDownFacets = createFacetsCollector(ctx0, 1, 2); + addMatchingDocs(drillDownFacets, ctx1, 3); + + // Create drill sideways facets with docs only in ctx0, leaving ctx1 with null matching docs + TestFacetsCollector dim1Facets = createFacetsCollector(ctx0, 1); + // Intentionally not adding matching docs for ctx1 to create null scenario + + Map drillSidewaysFacets = new HashMap<>(); + drillSidewaysFacets.put("dim1", dim1Facets); + + DocCountCollectorManager drillDownManager = new DocCountCollectorManager(); + Map drillSidewaysManagers = new HashMap<>(); + drillSidewaysManagers.put("dim1", new DocCountCollectorManager()); + + PostCollectionFaceting + faceting = + new PostCollectionFaceting<>( + drillDownManager, + drillSidewaysManagers, + drillDownFacets, + drillSidewaysFacets, + null); + + PostCollectionFaceting.Result result = faceting.collect(); + assertNotNull(result.drillDownResult()); + assertEquals(3, result.drillDownResult().totalDocs); + assertEquals(1, result.drillSidewaysResults().get("dim1").totalDocs); + } + + public void testMismatchedDimensionMaps() throws IOException { + List contexts = createLeafContexts(1); + + // Scenario 1: FacetsCollector is missing for a dimension + TestFacetsCollector drillDownFacets = createFacetsCollector(contexts.get(0), 1, 2); + + Map drillSidewaysFacets = new HashMap<>(); + drillSidewaysFacets.put("dim1", createFacetsCollector(contexts.get(0), 1)); + + DocCountCollectorManager drillDownManager = new DocCountCollectorManager(); + Map drillSidewaysManagers = new HashMap<>(); + drillSidewaysManagers.put("dim1", new DocCountCollectorManager()); + drillSidewaysManagers.put("dim2", new DocCountCollectorManager()); + + PostCollectionFaceting + faceting = + new PostCollectionFaceting<>( + drillDownManager, + drillSidewaysManagers, + drillDownFacets, + drillSidewaysFacets, + null); + + PostCollectionFaceting.Result result = faceting.collect(); + assertNotNull(result.drillDownResult()); + assertEquals(2, 
result.drillDownResult().totalDocs); + assertEquals(1, result.drillSidewaysResults().size()); + assertTrue(result.drillSidewaysResults().containsKey("dim1")); + assertFalse(result.drillSidewaysResults().containsKey("dim2")); + + // Scenario 1: collector manager is missing for a dimension + drillDownFacets = createFacetsCollector(contexts.get(0), 1, 2); + + drillSidewaysFacets = new HashMap<>(); + drillSidewaysFacets.put("dim1", createFacetsCollector(contexts.get(0), 1)); + drillSidewaysFacets.put("dim2", createFacetsCollector(contexts.get(0), 1, 2)); + + drillDownManager = new DocCountCollectorManager(); + drillSidewaysManagers = new HashMap<>(); + drillSidewaysManagers.put("dim1", new DocCountCollectorManager()); + + faceting = + new PostCollectionFaceting<>( + drillDownManager, drillSidewaysManagers, drillDownFacets, drillSidewaysFacets, null); + + result = faceting.collect(); + assertNotNull(result.drillDownResult()); + assertEquals(2, result.drillDownResult().totalDocs); + assertEquals(1, result.drillSidewaysResults().size()); + assertTrue(result.drillSidewaysResults().containsKey("dim1")); + assertFalse(result.drillSidewaysResults().containsKey("dim2")); + } + + public void testRandomConcurrentExecution() throws IOException { + List contexts = createLeafContexts(4); + + // Ensure enough docs to create 3 slices (threshold is MIN_DOCS_PER_SLICE) + int docsPerLeaf = PostCollectionFaceting.MIN_DOCS_PER_SLICE * 3 / contexts.size() + 10; + + var drillDownResult = createRandomFacetsCollector(contexts, docsPerLeaf); + TestFacetsCollector drillDownFacets = drillDownResult.collector(); + Set expectedDrillDownDocs = drillDownResult.allDocIds(); + + var dim1Result = createRandomFacetsCollector(contexts, 5); + TestFacetsCollector dim1Facets = dim1Result.collector(); + Set expectedDim1Docs = dim1Result.allDocIds(); + + Map drillSidewaysFacets = new HashMap<>(); + drillSidewaysFacets.put("dim1", dim1Facets); + + DocCountCollectorManager drillDownManager = new 
DocCountCollectorManager(); + Map drillSidewaysManagers = new HashMap<>(); + drillSidewaysManagers.put("dim1", new DocCountCollectorManager()); + + // Use multi-threaded executor + ExecutorService executor = + newFixedThreadPool(3, new NamedThreadFactory("TestPostCollectionFaceting")); + try { + PostCollectionFaceting + faceting = + new PostCollectionFaceting<>( + drillDownManager, + drillSidewaysManagers, + drillDownFacets, + drillSidewaysFacets, + executor); + + PostCollectionFaceting.Result result = faceting.collect(); + + // Assert exact doc IDs match + assertEquals(expectedDrillDownDocs.size(), result.drillDownResult().totalDocs); + assertEquals(expectedDrillDownDocs, result.drillDownResult().allDocIds); + + assertEquals(expectedDim1Docs.size(), result.drillSidewaysResults().get("dim1").totalDocs); + assertEquals(expectedDim1Docs, result.drillSidewaysResults().get("dim1").allDocIds); + } finally { + executor.shutdown(); + } + } + + // Helper methods + + private record FacetsCollectorData(TestFacetsCollector collector, Set allDocIds) {} + + private FacetsCollectorData createRandomFacetsCollector( + List contexts, int docsPerLeaf) { + Set allDocIds = new HashSet<>(); + TestFacetsCollector collector = new TestFacetsCollector(); + + for (LeafReaderContext context : contexts) { + Set leafDocs = new HashSet<>(); + while (leafDocs.size() < docsPerLeaf) { + int docId; + do { + docId = random().nextInt(); + } while (docId < 0 || docId == Integer.MAX_VALUE || allDocIds.contains(docId)); + leafDocs.add(docId); + allDocIds.add(docId); + } + int[] docIds = leafDocs.stream().mapToInt(Integer::intValue).toArray(); + addMatchingDocs(collector, context, docIds); + } + + return new FacetsCollectorData(collector, allDocIds); + } + + private TestFacetsCollector createFacetsCollectorWithScores( + LeafReaderContext context, int[] docIds, float[] scores) { + TestFacetsCollector fc = new TestFacetsCollector(); + addMatchingDocs(fc, context, docIds, scores); + return fc; + } + + 
private TestFacetsCollector createLargeFacetsCollector(
+      List<LeafReaderContext> contexts, int totalDocs) {
+    // Spreads totalDocs over the given leaves as evenly as possible: every leaf gets
+    // totalDocs / contexts.size() docs and the first (totalDocs % contexts.size()) leaves get
+    // one more. Within a leaf the doc IDs are simply 0..n-1. An empty contexts list fails with
+    // ArithmeticException on the division below.
+    TestFacetsCollector fc = new TestFacetsCollector();
+    int docsPerContext = totalDocs / contexts.size();
+    int remainder = totalDocs % contexts.size();
+
+    for (int i = 0; i < contexts.size(); i++) {
+      int docsInThisContext = docsPerContext + (i < remainder ? 1 : 0);
+      int[] docIds = new int[docsInThisContext];
+      for (int j = 0; j < docsInThisContext; j++) {
+        docIds[j] = j;
+      }
+      addMatchingDocs(fc, contexts.get(i), docIds);
+    }
+    return fc;
+  }
+
+  /**
+   * Builds an index with {@code count} empty documents, committing after each one with merging
+   * disabled, and returns the leaves of a reader opened on it. The leaf count is asserted to
+   * equal {@code count}.
+   */
+  private List<LeafReaderContext> createLeafContexts(int count) throws IOException {
+    Directory dir = newDirectory();
+    // Register before doing any work so a failure below cannot leak the directory.
+    // NOTE(review): testDirs/testReaders are declared outside this view; presumably they are
+    // closed in the test's tear-down - confirm.
+    testDirs.add(dir);
+
+    // try-with-resources closes the writer even if addDocument/commit throws.
+    try (IndexWriter writer =
+        new IndexWriter(dir, newIndexWriterConfig().setMergePolicy(NoMergePolicy.INSTANCE))) {
+      for (int i = 0; i < count; i++) {
+        writer.addDocument(new Document());
+        writer.commit();
+      }
+    }
+
+    DirectoryReader reader = DirectoryReader.open(dir);
+    // Register before asserting so a failed assertion does not leak the reader.
+    testReaders.add(reader);
+
+    List<LeafReaderContext> contexts = reader.leaves();
+    assertEquals(count, contexts.size());
+    return contexts;
+  }
+
+  /** Creates a collector holding a single {@code MatchingDocs} entry for {@code context}. */
+  private TestFacetsCollector createFacetsCollector(LeafReaderContext context, int... docIds) {
+    TestFacetsCollector fc = new TestFacetsCollector();
+    addMatchingDocs(fc, context, docIds);
+    return fc;
+  }
+
+  /** Score-less variant: delegates to the four-argument overload with {@code null} scores. */
+  private void addMatchingDocs(TestFacetsCollector fc, LeafReaderContext context, int... docIds) {
+    addMatchingDocs(fc, context, docIds, null);
+  }
+
+  /**
+   * Wraps {@code docIds} in a doc ID set and appends it to {@code fc} as one {@code MatchingDocs}
+   * entry for {@code context}; {@code scores} is passed through unchanged and may be {@code
+   * null}.
+   */
+  private void addMatchingDocs(
+      TestFacetsCollector fc, LeafReaderContext context, int[] docIds, float[] scores) {
+    // The builder is sized to the largest doc ID + 1 (1 for an empty array), so callers must not
+    // pass Integer.MAX_VALUE as a doc ID or the addition overflows.
+    int maxDocId = Arrays.stream(docIds).max().orElse(0);
+    DocIdSetBuilder builder = new DocIdSetBuilder(maxDocId + 1);
+    DocIdSetBuilder.BulkAdder adder = builder.grow(docIds.length);
+    for (int docId : docIds) {
+      adder.add(docId);
+    }
+
+    FacetsCollector.MatchingDocs matchingDocs =
+        new FacetsCollector.MatchingDocs(context, builder.build(), docIds.length, scores);
+    fc.addMatchingDocs(matchingDocs);
+  }
+
+  // Test helper classes
+
+  /** Counts collected docs and remembers every doc ID passed to {@link #collect(int)}. */
+  private static final class DocCountCollector extends SimpleCollector {
+    private int docCount = 0;
+    private final Set<Integer> allDocIds = new HashSet<>();
+
+    @Override
+    public void collect(int doc) {
+      docCount++;
+      allDocIds.add(doc);
+    }
+
+    @Override
+    public ScoreMode scoreMode() {
+      return ScoreMode.COMPLETE_NO_SCORES;
+    }
+
+    public int getDocCount() {
+      return docCount;
+    }
+
+    public Set<Integer> getAllDocIds() {
+      return allDocIds;
+    }
+  }
+
+  /** Reduced result of all {@link DocCountCollector}s: summed count and union of doc IDs. */
+  private record DocCountResult(int totalDocs, Set<Integer> allDocIds) {}
+
+  /** Creates {@link DocCountCollector}s and merges them into one {@link DocCountResult}. */
+  private static final class DocCountCollectorManager
+      implements CollectorManager<DocCountCollector, DocCountResult> {
+    @Override
+    public DocCountCollector newCollector() {
+      return new DocCountCollector();
+    }
+
+    @Override
+    public DocCountResult reduce(Collection<DocCountCollector> collectors) {
+      int total = 0;
+      Set<Integer> allDocs = new HashSet<>();
+      for (DocCountCollector c : collectors) {
+        total += c.getDocCount();
+        allDocs.addAll(c.getAllDocIds());
+      }
+      return new DocCountResult(total, allDocs);
+    }
+  }
+
+  /** {@link FacetsCollector} whose matching docs are injected directly by the test. */
+  private static final class TestFacetsCollector extends FacetsCollector {
+    private final List<MatchingDocs> testMatchingDocs = new ArrayList<>();
+
+    void addMatchingDocs(MatchingDocs docs) {
+      testMatchingDocs.add(docs);
+    }
+
+    @Override
+    public List<MatchingDocs> getMatchingDocs() {
+      return testMatchingDocs;
+    }
+  }
+
+  /** Sums the score of every collected doc, as reported by the scorer set on the collector. */
+  private static final class ScoreCollector extends SimpleCollector {
+    private float totalScore = 0;
+    private Scorable scorer;
+
+    @Override
+    public void collect(int doc) throws IOException {
+      totalScore += scorer.score();
+    }
+
+    @Override
+    public ScoreMode scoreMode() {
+      return ScoreMode.COMPLETE;
+    }
+
+    @Override
+    public void setScorer(Scorable scorer) {
+      this.scorer = scorer;
+    }
+
+    public float getTotalScore() {
+      return totalScore;
+    }
+  }
+
+  /** Creates {@link ScoreCollector}s and reduces them to the sum of their total scores. */
+  private static final class ScoreCollectorManager
+      implements CollectorManager<ScoreCollector, Float> {
+    @Override
+    public ScoreCollector newCollector() {
+      return new ScoreCollector();
+    }
+
+    @Override
+    public Float reduce(Collection<ScoreCollector> collectors) {
+      float total = 0;
+      for (ScoreCollector c : collectors) {
+        total += c.getTotalScore();
+      }
+      return total;
+    }
+  }
+}