11"""Hybrid search with BM25 + semantic fusion and Cohere reranking."""
22import os
3+ import re
34from typing import List , Dict , Optional
45from dataclasses import dataclass
56
@@ -20,6 +21,19 @@ class ScoredResult:
2021 fused_score : float = 0.0
2122
2223
24+ def _split_camel_case (text : str ) -> str :
25+ """Split CamelCase into separate words for better tokenization."""
26+ # AuthenticationMiddleware -> Authentication Middleware
27+ return re .sub (r'([a-z])([A-Z])' , r'\1 \2' , text )
28+
29+
def _tokenize(text: str) -> List[str]:
    """Tokenize text into lowercase alphanumeric words.

    CamelCase identifiers are first expanded via ``_split_camel_case``; the
    result is lowercased and split on whitespace, punctuation and
    underscores, so ``get_user_name`` yields ``get``, ``user`` and ``name``.

    Args:
        text: Text to tokenize (a query string or joined document metadata).

    Returns:
        List of lowercase tokens in order of appearance; empty if ``text``
        contains no alphanumeric characters.
    """
    expanded = _split_camel_case(text)
    # `\w` also matches "_", which would keep snake_case identifiers glued
    # together; `[^\W_]` is "word character except underscore".
    return re.findall(r'[^\W_]+', expanded.lower())
35+
36+
2337class HybridSearcher :
2438 """Combines BM25 keyword search with semantic search and reranking."""
2539
@@ -57,26 +71,18 @@ async def search(
5771 3. Fuse scores using RRF
5872 4. Rerank top results with Cohere
5973 """
60- # get semantic candidates
6174 candidates = await self ._semantic_search (query , repo_id , top_k = 50 )
6275 if not candidates :
6376 return []
6477
65- # apply bm25 on candidates
6678 candidates = self ._apply_bm25 (query , candidates )
67-
68- # fuse scores
6979 candidates = self ._rrf_fusion (candidates , semantic_weight , bm25_weight )
70-
71- # sort by fused score
7280 candidates .sort (key = lambda x : x .fused_score , reverse = True )
7381
74- # rerank top results
7582 top_candidates = candidates [:top_k * 2 ]
7683 if use_reranking and self .cohere :
7784 top_candidates = await self ._rerank (query , top_candidates )
7885
79- # convert to SearchResult
8086 return [self ._to_search_result (c ) for c in top_candidates [:top_k ]]
8187
8288 async def _semantic_search (self , query : str , repo_id : str , top_k : int ) -> List [ScoredResult ]:
@@ -99,23 +105,29 @@ async def _semantic_search(self, query: str, repo_id: str, top_k: int) -> List[S
99105 ]
100106
101107 def _apply_bm25 (self , query : str , candidates : List [ScoredResult ]) -> List [ScoredResult ]:
102- """Score candidates with BM25."""
108+ """Score candidates with BM25 (with camelCase support) ."""
103109 if not candidates :
104110 return candidates
105111
106- # build corpus from candidates
107112 corpus = []
108113 for c in candidates :
109- text = f"{ c .metadata .get ('name' , '' )} { c .metadata .get ('qualified_name' , '' )} "
110- text += f"{ c .metadata .get ('signature' , '' )} { c .metadata .get ('docstring' , '' )} "
111- text += c .metadata .get ('summary' , '' )
112- corpus .append (text .lower ().split ())
114+ # build searchable text from all available metadata
115+ parts = [
116+ c .metadata .get ('name' , '' ),
117+ c .metadata .get ('qualified_name' , '' ),
118+ c .metadata .get ('signature' , '' ),
119+ c .metadata .get ('docstring' , '' ),
120+ c .metadata .get ('summary' , '' ),
121+ c .metadata .get ('type' , '' ),
122+ ]
123+ text = ' ' .join (filter (None , parts ))
124+ # tokenize with camelCase splitting
125+ corpus .append (_tokenize (text ))
113126
114127 bm25 = BM25Okapi (corpus )
115- query_tokens = query . lower (). split ( )
128+ query_tokens = _tokenize ( query )
116129 scores = bm25 .get_scores (query_tokens )
117130
118- # normalize scores
119131 max_score = max (scores ) if max (scores ) > 0 else 1
120132 for i , c in enumerate (candidates ):
121133 c .bm25_score = scores [i ] / max_score
@@ -130,28 +142,41 @@ def _rrf_fusion(
130142 k : int = 60
131143 ) -> List [ScoredResult ]:
132144 """Reciprocal Rank Fusion."""
133- # sort by semantic for ranking
134145 by_semantic = sorted (candidates , key = lambda x : x .semantic_score , reverse = True )
135146 for rank , c in enumerate (by_semantic ):
136147 c .fused_score = semantic_weight / (k + rank + 1 )
137148
138- # sort by bm25 for ranking
139149 by_bm25 = sorted (candidates , key = lambda x : x .bm25_score , reverse = True )
140150 for rank , c in enumerate (by_bm25 ):
141151 c .fused_score += bm25_weight / (k + rank + 1 )
142152
143153 return candidates
144154
145155 async def _rerank (self , query : str , candidates : List [ScoredResult ]) -> List [ScoredResult ]:
146- """Rerank with Cohere."""
156+ """Rerank with Cohere (backward compatible with V1 indexed data) ."""
147157 if not candidates :
148158 return candidates
149159
150160 docs = []
151161 for c in candidates :
152- doc = f"{ c .metadata .get ('qualified_name' , '' )} : { c .metadata .get ('summary' , '' )} "
153- if not c .metadata .get ('summary' ):
154- doc = f"{ c .metadata .get ('qualified_name' , '' )} : { c .metadata .get ('signature' , '' )} "
162+ # try V2 metadata first
163+ qn = c .metadata .get ('qualified_name' ) or c .metadata .get ('name' , '' )
164+ summary = c .metadata .get ('summary' , '' )
165+ sig = c .metadata .get ('signature' , '' )
166+
167+ if summary :
168+ doc = f"{ qn } : { summary } "
169+ elif sig :
170+ doc = f"{ qn } : { sig } "
171+ else :
172+ # fallback for V1 indexed data: use name + code snippet
173+ code = c .metadata .get ('code' , '' )[:200 ]
174+ doc = f"{ qn } : { code } " if code else qn
175+
176+ # ensure non-empty doc
177+ if not doc .strip () or doc .strip () == ':' :
178+ doc = c .metadata .get ('name' , 'unknown' )
179+
155180 docs .append (doc )
156181
157182 try :
@@ -179,7 +204,7 @@ def _to_search_result(self, scored: ScoredResult) -> SearchResult:
179204 m = scored .metadata
180205 return SearchResult (
181206 name = m .get ("name" , "" ),
182- qualified_name = m .get ("qualified_name" , "" ),
207+ qualified_name = m .get ("qualified_name" ) or m . get ( "name" , "" ),
183208 file_path = m .get ("file_path" , "" ),
184209 code = m .get ("code" , "" ),
185210 signature = m .get ("signature" , "" ),
0 commit comments