@@ -41,6 +41,8 @@ def __init__(
         # LRU embedding cache: avoids redundant embedding calls for similar queries
         self._embed_cache: dict[str, list[float]] = {}
         self._cache_max = 32
+        # Chunk→document mapping built during search for coherence boost
+        self._chunk_doc_map: dict[str, str] = {}
 
     async def search(
         self,
@@ -89,6 +91,17 @@ async def search(
             original_task, hype_task, bm25_task
         )
 
+        # Build chunk→document map from search results for coherence boost
+        self._chunk_doc_map.clear()
+        for result in original_results:
+            doc_id = (result.metadata or {}).get("document_id", "")
+            if doc_id:
+                self._chunk_doc_map[result.id] = doc_id
+        for result in bm25_results:  # type: ignore[union-attr]
+            doc_id = (getattr(result, "metadata", None) or {}).get("document_id", "")
+            if doc_id:
+                self._chunk_doc_map[result.id] = doc_id
+
         # Map HyPE results back to chunk IDs
         hype_chunk_results = self._map_hype_to_chunks(hype_results)
 
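For reference, a minimal standalone sketch of the mapping logic added above, assuming only that result objects expose an .id and an optional .metadata dict carrying document_id, as the diff suggests; the SearchResult stand-in and build_chunk_doc_map helper are illustrative, not part of the patch:

from dataclasses import dataclass

@dataclass
class SearchResult:  # hypothetical stand-in for the vector/BM25 result types
    id: str
    score: float = 0.0
    metadata: dict | None = None

def build_chunk_doc_map(*result_sets: list[SearchResult]) -> dict[str, str]:
    """Collect chunk_id -> document_id from any number of result lists."""
    chunk_doc_map: dict[str, str] = {}
    for results in result_sets:
        for result in results:
            doc_id = (getattr(result, "metadata", None) or {}).get("document_id", "")
            if doc_id:  # results without metadata are simply skipped
                chunk_doc_map[result.id] = doc_id
    return chunk_doc_map

vector_hits = [SearchResult("c1", 0.9, {"document_id": "docA"}), SearchResult("c2", 0.7)]
bm25_hits = [SearchResult("c3", 4.2, {"document_id": "docB"})]
assert build_chunk_doc_map(vector_hits, bm25_hits) == {"c1": "docA", "c3": "docB"}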
@@ -197,10 +210,25 @@ def _reciprocal_rank_fusion(
             raw_score = max(result.score, 0.0)
             scores[result.id] = scores.get(result.id, 0.0) + w_hype * raw_score / (k + rank + 1)
 
+        # Normalize BM25 scores to [0, 1] using min-max within this result set
+        # instead of capping at 1.0 (which loses signal discrimination)
+        bm25_scores = [
+            max(getattr(r, "score", 1.0), 0.0)
+            for r in bm25  # type: ignore[union-attr]
+        ]
+        bm25_max = max(bm25_scores) if bm25_scores else 1.0
+        bm25_min = min(bm25_scores) if bm25_scores else 0.0
+        bm25_range = bm25_max - bm25_min if bm25_max > bm25_min else 1.0
+
         for rank, result in enumerate(bm25):  # type: ignore[assignment]
-            # BM25 scores are not normalized to [0, 1], so cap at 1.0
-            raw_score = min(max(result.score, 0.0), 1.0) if hasattr(result, "score") else 1.0
-            scores[result.id] = scores.get(result.id, 0.0) + w_bm25 * raw_score / (k + rank + 1)
+            raw = max(getattr(result, "score", 1.0), 0.0)
+            # Min-max normalization preserves relative differences; when all
+            # scores are identical (or there is a single result), fall back to raw/max
+            if bm25_max > bm25_min:
+                normalized = (raw - bm25_min) / bm25_range
+            else:
+                normalized = min(raw / bm25_max, 1.0) if bm25_max > 0 else 1.0
+            scores[result.id] = scores.get(result.id, 0.0) + w_bm25 * normalized / (k + rank + 1)
 
         # Normalize to [0, 1]
         max_score = max(scores.values()) if scores else 1.0
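To see why min-max beats capping here, a runnable sketch in isolation (minmax_normalize and the sample scores are illustrative, not from the patch): with raw BM25 scores of 8.0, 5.0, and 2.0, the old cap-at-1.0 approach collapses all three to 1.0, while min-max keeps the ordering information that RRF then weights by rank:

def minmax_normalize(scores: list[float]) -> list[float]:
    """Min-max normalize to [0, 1]; degenerate sets fall back to raw/max."""
    if not scores:
        return []
    lo, hi = min(scores), max(scores)
    if hi > lo:
        return [(s - lo) / (hi - lo) for s in scores]
    # All scores identical (or a single result): mirror the patch's fallback
    return [min(s / hi, 1.0) if hi > 0 else 1.0 for s in scores]

assert [min(s, 1.0) for s in [8.0, 5.0, 2.0]] == [1.0, 1.0, 1.0]  # capping: signal lost
assert minmax_normalize([8.0, 5.0, 2.0]) == [1.0, 0.5, 0.0]       # min-max: preserved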
@@ -217,29 +245,52 @@ def _apply_document_coherence_boost(
     ) -> list[tuple[str, float]]:
         """Boost scores when multiple chunks from the same document appear in results.
 
-        If a document has N chunks in the top candidates, each gets a boost
-        proportional to N. This helps when the correct answer is spread across
-        multiple chunks of the same source document.
+        If a document has N chunks in the top candidates, each gets a small
+        boost proportional to N. This helps when the correct answer is spread
+        across multiple chunks of the same source document.
+
+        Resolves chunk_id → document_id through the map built during search
+        from vector and BM25 result metadata; chunks whose document_id cannot
+        be resolved are left unboosted.
         """
-        # Count chunks per document in the top candidates (look at 3x top_k)
-        candidate_pool = fused[: top_k * 3]
-        doc_counts: dict[str, int] = {}
+        if len(fused) < 2:
+            return fused
+
+        candidate_pool = fused[: top_k * 4]
         chunk_to_doc: dict[str, str] = {}
 
+        # Resolve document_id through the chunk→document map built during search
         for chunk_id, _ in candidate_pool:
-            # chunk_id contains document info: extract via document_store
-            # Use a simple heuristic: chunk IDs from same ingest share a doc prefix
-            # The actual document_id is stored in chunk metadata, but we don't have
-            # it here. Instead, use the score pattern: if multiple chunks score well,
-            # they likely share a document.
-            doc_counts[chunk_id] = doc_counts.get(chunk_id, 0)
-
-        # Without document_id in the fusion result, we can't do document-level
-        # grouping here. Instead, apply a simpler heuristic: if a chunk_id
-        # appears in multiple indexes (original + hype + bm25), it gets a
-        # natural boost from RRF already. The score-weighting above handles
-        # the precision issue. Return unchanged.
-        return fused
+            # Use the stored _chunk_doc_map if available (populated during search)
+            doc_id = self._chunk_doc_map.get(chunk_id, "")
+            if not doc_id:
+                # Chunks from the same document often share an ID prefix (e.g.
+                # the first 12 chars), but that heuristic is unreliable, so
+                # skip the boost for chunks we cannot resolve.
+                continue
+            chunk_to_doc[chunk_id] = doc_id
+
+        if not chunk_to_doc:
+            return fused
+
+        # Count chunks per document in the candidate pool
+        doc_counts: dict[str, int] = {}
+        for doc_id in chunk_to_doc.values():
+            doc_counts[doc_id] = doc_counts.get(doc_id, 0) + 1
+
+        # Apply boost: +5% per additional chunk from the same document (max +20%)
+        boosted: list[tuple[str, float]] = []
+        for chunk_id, score in fused:
+            doc_id = chunk_to_doc.get(chunk_id, "")
+            if doc_id and doc_counts.get(doc_id, 1) > 1:
+                n_extra = min(doc_counts[doc_id] - 1, 4)  # Cap at 4 extras
+                boost = 1.0 + n_extra * 0.05
+                boosted.append((chunk_id, score * boost))
+            else:
+                boosted.append((chunk_id, score))
+
+        boosted.sort(key=lambda x: x[1], reverse=True)
+        return boosted
 
 
 class ScoredChunk:
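For a concrete feel of the boost rule, a self-contained sketch of the same +5%-per-extra-chunk logic (chunk IDs, scores, and the coherence_boost name are made up for illustration): docA contributes three chunks, so each gets n_extra = 2 and a 1.10 multiplier, enough here for a2 to overtake b1 after re-sorting:

def coherence_boost(
    fused: list[tuple[str, float]],
    chunk_to_doc: dict[str, str],
) -> list[tuple[str, float]]:
    """+5% per additional same-document chunk, capped at +20%, then re-sort."""
    doc_counts: dict[str, int] = {}
    for doc_id in chunk_to_doc.values():
        doc_counts[doc_id] = doc_counts.get(doc_id, 0) + 1
    boosted = []
    for chunk_id, score in fused:
        doc_id = chunk_to_doc.get(chunk_id, "")
        if doc_id and doc_counts.get(doc_id, 1) > 1:
            score *= 1.0 + min(doc_counts[doc_id] - 1, 4) * 0.05
        boosted.append((chunk_id, score))
    boosted.sort(key=lambda x: x[1], reverse=True)
    return boosted

fused = [("a1", 0.90), ("b1", 0.85), ("a2", 0.80), ("a3", 0.50)]
mapping = {"a1": "docA", "a2": "docA", "a3": "docA", "b1": "docB"}
print(coherence_boost(fused, mapping))
# -> roughly [('a1', 0.99), ('a2', 0.88), ('b1', 0.85), ('a3', 0.55)]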