Skip to content

Commit f7ea9cd

Browse files
committed
WIP: Search V2 hybrid fixes - camelCase tokenization, backward compat, playground upgrade
1 parent f4cd7a2 commit f7ea9cd

4 files changed

Lines changed: 79 additions & 35 deletions

File tree

backend/.env.example

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,9 @@ OPENAI_API_KEY=your_openai_api_key_here
33
PINECONE_API_KEY=your_pinecone_api_key_here
44
PINECONE_INDEX_NAME=codeintel
55

6+
# Search V2 - Cohere Reranking (optional but recommended)
7+
COHERE_API_KEY=your_cohere_api_key_here
8+
69
# Supabase
710
SUPABASE_URL=https://your-project.supabase.co
811
SUPABASE_ANON_KEY=your_supabase_anon_key_here

backend/routes/playground.py

Lines changed: 21 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -429,15 +429,32 @@ async def playground_search(
429429
"limit": limit_result.limit,
430430
}
431431

432-
# Search
433-
results = await indexer.semantic_search(
432+
# Search V2: Hybrid search with BM25 + Cohere reranking
433+
v2_results = await indexer.search_v2(
434434
query=sanitized_query,
435435
repo_id=repo_id,
436-
max_results=min(request.max_results, 10),
437-
use_query_expansion=True,
436+
top_k=min(request.max_results, 10),
438437
use_reranking=True
439438
)
440439

440+
# Format results for frontend compatibility
441+
results = []
442+
for r in v2_results:
443+
results.append({
444+
"name": r.get("name", ""),
445+
"qualified_name": r.get("qualified_name", r.get("name", "")),
446+
"file_path": r.get("file_path", ""),
447+
"code": r.get("code", ""),
448+
"signature": r.get("signature", ""),
449+
"language": r.get("language", ""),
450+
"score": r.get("score", 0),
451+
"line_start": r.get("line_start", 0),
452+
"line_end": r.get("line_end", 0),
453+
"type": "function", # backward compat with V1
454+
"summary": r.get("summary"),
455+
"class_name": r.get("class_name"),
456+
})
457+
441458
# Cache results
442459
cache.set_search_results(sanitized_query, repo_id, results, ttl=3600)
443460

backend/services/indexer_optimized.py

Lines changed: 7 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -538,17 +538,16 @@ async def search_v2(
538538
metrics.increment("search_v2_requests")
539539

540540
try:
541-
searcher = HybridSearcher(
542-
pinecone_index=self.index,
543-
embedding_fn=lambda q: self._create_embeddings_batch([q]).then(lambda x: x[0]),
544-
)
545-
546-
# wrapper for async embed
547-
async def embed(q):
541+
async def embed_query(q: str) -> List[float]:
542+
"""Embed a single query string."""
548543
embs = await self._create_embeddings_batch([q])
549544
return embs[0]
550545

551-
searcher.embed = embed
546+
searcher = HybridSearcher(
547+
pinecone_index=self.index,
548+
embedding_fn=embed_query,
549+
)
550+
searcher.embed = embed_query
552551

553552
results = await searcher.search(
554553
query=query,

backend/services/search_v2/hybrid_searcher.py

Lines changed: 48 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
"""Hybrid search with BM25 + semantic fusion and Cohere reranking."""
22
import os
3+
import re
34
from typing import List, Dict, Optional
45
from dataclasses import dataclass
56

@@ -20,6 +21,19 @@ class ScoredResult:
2021
fused_score: float = 0.0
2122

2223

24+
def _split_camel_case(text: str) -> str:
25+
"""Split CamelCase into separate words for better tokenization."""
26+
# AuthenticationMiddleware -> Authentication Middleware
27+
return re.sub(r'([a-z])([A-Z])', r'\1 \2', text)
28+
29+
30+
def _tokenize(text: str) -> List[str]:
    """Tokenize text into lowercase word tokens for BM25 matching.

    Identifiers are broken into their component words: camelCase is
    expanded via ``_split_camel_case`` and snake_case is split on
    underscores, so ``getUserName`` and ``get_user_name`` both yield
    ``['get', 'user', 'name']``.

    Args:
        text: Document text or query string to tokenize.

    Returns:
        Lowercased tokens in order of appearance; an empty list when
        ``text`` contains no letters or digits.
    """
    # Expand camelCase before lowercasing; the case information is what
    # marks the word boundaries.
    expanded = _split_camel_case(text)
    # ``\w`` includes "_", which would keep snake_case identifiers as one
    # token; ``[^\W_]`` matches word characters except the underscore.
    return re.findall(r'[^\W_]+', expanded.lower())
35+
36+
2337
class HybridSearcher:
2438
"""Combines BM25 keyword search with semantic search and reranking."""
2539

@@ -57,26 +71,18 @@ async def search(
5771
3. Fuse scores using RRF
5872
4. Rerank top results with Cohere
5973
"""
60-
# get semantic candidates
6174
candidates = await self._semantic_search(query, repo_id, top_k=50)
6275
if not candidates:
6376
return []
6477

65-
# apply bm25 on candidates
6678
candidates = self._apply_bm25(query, candidates)
67-
68-
# fuse scores
6979
candidates = self._rrf_fusion(candidates, semantic_weight, bm25_weight)
70-
71-
# sort by fused score
7280
candidates.sort(key=lambda x: x.fused_score, reverse=True)
7381

74-
# rerank top results
7582
top_candidates = candidates[:top_k * 2]
7683
if use_reranking and self.cohere:
7784
top_candidates = await self._rerank(query, top_candidates)
7885

79-
# convert to SearchResult
8086
return [self._to_search_result(c) for c in top_candidates[:top_k]]
8187

8288
async def _semantic_search(self, query: str, repo_id: str, top_k: int) -> List[ScoredResult]:
@@ -99,23 +105,29 @@ async def _semantic_search(self, query: str, repo_id: str, top_k: int) -> List[S
99105
]
100106

101107
def _apply_bm25(self, query: str, candidates: List[ScoredResult]) -> List[ScoredResult]:
102-
"""Score candidates with BM25."""
108+
"""Score candidates with BM25 (with camelCase support)."""
103109
if not candidates:
104110
return candidates
105111

106-
# build corpus from candidates
107112
corpus = []
108113
for c in candidates:
109-
text = f"{c.metadata.get('name', '')} {c.metadata.get('qualified_name', '')} "
110-
text += f"{c.metadata.get('signature', '')} {c.metadata.get('docstring', '')} "
111-
text += c.metadata.get('summary', '')
112-
corpus.append(text.lower().split())
114+
# build searchable text from all available metadata
115+
parts = [
116+
c.metadata.get('name', ''),
117+
c.metadata.get('qualified_name', ''),
118+
c.metadata.get('signature', ''),
119+
c.metadata.get('docstring', ''),
120+
c.metadata.get('summary', ''),
121+
c.metadata.get('type', ''),
122+
]
123+
text = ' '.join(filter(None, parts))
124+
# tokenize with camelCase splitting
125+
corpus.append(_tokenize(text))
113126

114127
bm25 = BM25Okapi(corpus)
115-
query_tokens = query.lower().split()
128+
query_tokens = _tokenize(query)
116129
scores = bm25.get_scores(query_tokens)
117130

118-
# normalize scores
119131
max_score = max(scores) if max(scores) > 0 else 1
120132
for i, c in enumerate(candidates):
121133
c.bm25_score = scores[i] / max_score
@@ -130,28 +142,41 @@ def _rrf_fusion(
130142
k: int = 60
131143
) -> List[ScoredResult]:
132144
"""Reciprocal Rank Fusion."""
133-
# sort by semantic for ranking
134145
by_semantic = sorted(candidates, key=lambda x: x.semantic_score, reverse=True)
135146
for rank, c in enumerate(by_semantic):
136147
c.fused_score = semantic_weight / (k + rank + 1)
137148

138-
# sort by bm25 for ranking
139149
by_bm25 = sorted(candidates, key=lambda x: x.bm25_score, reverse=True)
140150
for rank, c in enumerate(by_bm25):
141151
c.fused_score += bm25_weight / (k + rank + 1)
142152

143153
return candidates
144154

145155
async def _rerank(self, query: str, candidates: List[ScoredResult]) -> List[ScoredResult]:
146-
"""Rerank with Cohere."""
156+
"""Rerank with Cohere (backward compatible with V1 indexed data)."""
147157
if not candidates:
148158
return candidates
149159

150160
docs = []
151161
for c in candidates:
152-
doc = f"{c.metadata.get('qualified_name', '')}: {c.metadata.get('summary', '')}"
153-
if not c.metadata.get('summary'):
154-
doc = f"{c.metadata.get('qualified_name', '')}: {c.metadata.get('signature', '')}"
162+
# try V2 metadata first
163+
qn = c.metadata.get('qualified_name') or c.metadata.get('name', '')
164+
summary = c.metadata.get('summary', '')
165+
sig = c.metadata.get('signature', '')
166+
167+
if summary:
168+
doc = f"{qn}: {summary}"
169+
elif sig:
170+
doc = f"{qn}: {sig}"
171+
else:
172+
# fallback for V1 indexed data: use name + code snippet
173+
code = c.metadata.get('code', '')[:200]
174+
doc = f"{qn}: {code}" if code else qn
175+
176+
# ensure non-empty doc
177+
if not doc.strip() or doc.strip() == ':':
178+
doc = c.metadata.get('name', 'unknown')
179+
155180
docs.append(doc)
156181

157182
try:
@@ -179,7 +204,7 @@ def _to_search_result(self, scored: ScoredResult) -> SearchResult:
179204
m = scored.metadata
180205
return SearchResult(
181206
name=m.get("name", ""),
182-
qualified_name=m.get("qualified_name", ""),
207+
qualified_name=m.get("qualified_name") or m.get("name", ""),
183208
file_path=m.get("file_path", ""),
184209
code=m.get("code", ""),
185210
signature=m.get("signature", ""),

0 commit comments

Comments
 (0)