Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
16 commits
Select commit Hold shift + click to select a range
f7ea9cd
WIP: Search V2 hybrid fixes - camelCase tokenization, backward compat…
DevanshuNEU Jan 26, 2026
c19fab8
feat(search): Search V3 'Project Brain' - Full Overhaul
DevanshuNEU Jan 26, 2026
b4d3d97
feat(search): integrate Cohere reranking with YAML formatting
DevanshuNEU Jan 26, 2026
331a580
feat(search): gate Cohere reranking behind pro_user flag
DevanshuNEU Jan 26, 2026
da674c0
fix(query): preserve CamelCase in code term extraction
DevanshuNEU Jan 26, 2026
6646c18
fix(search): use get_running_loop instead of deprecated get_event_loop
DevanshuNEU Jan 26, 2026
21d7a30
fix(metrics): add gauge method and use it for rerank avg_score
DevanshuNEU Jan 26, 2026
d1476e2
fix(scripts): catch Exception instead of bare except in extended_quer…
DevanshuNEU Jan 26, 2026
2655a90
fix(scripts): add load_dotenv + catch Exception in final_v3_test
DevanshuNEU Jan 26, 2026
6989b31
fix(scripts): add load_dotenv to human_query_test
DevanshuNEU Jan 26, 2026
aa86547
feat(search): pass pro_user flag through indexer search_v3 method
DevanshuNEU Jan 26, 2026
e8325e2
fix(tests): mock search_v3 instead of semantic_search in playground test
DevanshuNEU Jan 26, 2026
661706f
fix(scripts): stricter test file detection in has_test_file
DevanshuNEU Jan 26, 2026
757bd07
fix(search): filter tests in V2 fallback when include_tests=False
DevanshuNEU Jan 26, 2026
72605e6
refactor: consolidate test file detection into shared utility
DevanshuNEU Jan 26, 2026
ba3370f
fix(utils): use anchored regex patterns for test file detection
DevanshuNEU Jan 26, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions backend/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ OPENAI_API_KEY=your_openai_api_key_here
PINECONE_API_KEY=your_pinecone_api_key_here
PINECONE_INDEX_NAME=codeintel

# Search V2 - Cohere Reranking (optional but recommended)
COHERE_API_KEY=your_cohere_api_key_here

# Supabase
SUPABASE_URL=https://your-project.supabase.co
SUPABASE_ANON_KEY=your_supabase_anon_key_here
Expand All @@ -23,3 +26,7 @@ REDIS_PORT=6379
# Get DSN from https://sentry.io → Settings → Projects → Client Keys
SENTRY_DSN=
ENVIRONMENT=development

# Search V3 - Voyage AI Code Embeddings (recommended for code search)
# Get API key from https://dash.voyageai.com/
VOYAGE_API_KEY=your_voyage_api_key_here
3 changes: 3 additions & 0 deletions backend/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,6 @@ sentry-sdk[fastapi]>=2.0.0
# Search V2 - Hybrid search
rank-bm25>=0.2.2
cohere>=5.0.0

# Search V3 - Code-optimized embeddings
voyageai>=0.3.0
56 changes: 44 additions & 12 deletions backend/routes/playground.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ class PlaygroundSearchRequest(BaseModel):
demo_repo: Optional[str] = None # Keep for backward compat
repo_id: Optional[str] = None # Direct repo_id (user-indexed repos)
max_results: int = 10
# V3 options
use_v3: bool = True # Use Search V3 by default (better accuracy)
include_tests: bool = False # Include test files in results
Comment thread
coderabbitai[bot] marked this conversation as resolved.


class ValidateRepoRequest(BaseModel):
Expand Down Expand Up @@ -418,8 +421,9 @@ async def playground_search(
try:
sanitized_query = InputValidator.sanitize_string(request.query, max_length=200)

# Check cache
cached_results = cache.get_search_results(sanitized_query, repo_id)
# Check cache (include flags in key to avoid returning wrong results)
cache_key = f"{sanitized_query}:v3={request.use_v3}:tests={request.include_tests}"
cached_results = cache.get_search_results(cache_key, repo_id)
if cached_results:
return {
"results": cached_results,
Expand All @@ -429,17 +433,44 @@ async def playground_search(
"limit": limit_result.limit,
}

# Search
results = await indexer.semantic_search(
query=sanitized_query,
repo_id=repo_id,
max_results=min(request.max_results, 10),
use_query_expansion=True,
use_reranking=True
)
# Search V3 (default) or V2 (fallback)
if request.use_v3:
search_results = await indexer.search_v3(
query=sanitized_query,
repo_id=repo_id,
top_k=min(request.max_results, 10),
include_tests=request.include_tests,
use_reranking=True
)
else:
search_results = await indexer.search_v2(
query=sanitized_query,
repo_id=repo_id,
top_k=min(request.max_results, 10),
use_reranking=True
)

# Cache results
cache.set_search_results(sanitized_query, repo_id, results, ttl=3600)
# Format results for frontend compatibility
results = []
for r in search_results:
results.append({
"name": r.get("name", ""),
"qualified_name": r.get("qualified_name", r.get("name", "")),
"file_path": r.get("file_path", ""),
"code": r.get("code", ""),
"signature": r.get("signature", ""),
"language": r.get("language", ""),
"score": r.get("score", 0),
"line_start": r.get("line_start", 0),
"line_end": r.get("line_end", 0),
"type": "function", # backward compat with V1
"summary": r.get("summary"),
"class_name": r.get("class_name"),
"is_test_file": r.get("is_test_file", False), # V3 feature
})

# Cache results (using same key that includes flags)
cache.set_search_results(cache_key, repo_id, results, ttl=3600)

search_time = int((time.time() - start_time) * 1000)

Expand All @@ -450,6 +481,7 @@ async def playground_search(
"remaining_searches": limit_result.remaining,
"limit": limit_result.limit,
"search_time_ms": search_time,
"search_version": "v3" if request.use_v3 else "v2",
Comment thread
DevanshuNEU marked this conversation as resolved.
}
except HTTPException:
raise
Expand Down
247 changes: 247 additions & 0 deletions backend/scripts/benchmark_search_v3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,247 @@
#!/usr/bin/env python3
"""
Search V3 vs V2 Benchmark
Run with: python3 scripts/benchmark_search_v3.py

Compares:
- V2 (OpenAI embeddings + Cohere reranking)
- V3 (Voyage AI embeddings + Query Understanding + Code Graph + Cohere reranking)
"""
import asyncio
import os
import sys
import time
from typing import List, Dict, Tuple

# add parent to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from dotenv import load_dotenv
load_dotenv()

from services.indexer_optimized import OptimizedCodeIndexer

# Benchmark cases modelled on real developer searches. Each entry carries:
#   query             - the text sent to both search versions
#   expected_keywords - substrings looked for in the top results when scoring
#   description       - human-readable scenario printed next to the query
TEST_QUERIES = [
    {
        "query": "how to add authentication",
        "expected_keywords": ["auth", "middleware", "authenticate", "credential"],
        "description": "Developer wants to add auth to their app",
    },
    {
        "query": "handle websocket messages",
        "expected_keywords": ["websocket", "message", "send", "receive", "on_"],
        "description": "Developer working with WebSockets",
    },
    {
        "query": "return json from endpoint",
        "expected_keywords": ["json", "response", "jsonresponse", "return"],
        "description": "Developer wants to return JSON data",
    },
    {
        "query": "validate request data",
        "expected_keywords": ["valid", "request", "data", "schema"],
        "description": "Developer needs input validation",
    },
    {
        "query": "middleware that runs before request",
        "expected_keywords": ["middleware", "before", "dispatch", "call_next"],
        "description": "Developer needs pre-request processing",
    },
    {
        "query": "error handling",
        "expected_keywords": ["error", "exception", "handler", "catch"],
        "description": "Looking for error handling patterns",
    },
    {
        "query": "route decorator",
        "expected_keywords": ["route", "decorator", "path", "endpoint"],
        "description": "Developer needs routing functionality",
    },
    {
        "query": "database session",
        "expected_keywords": ["database", "session", "db", "connection"],
        "description": "Working with database sessions",
    },
]


def score_results(results: List[Dict], expected_keywords: List[str]) -> Tuple[float, int, bool]:
    """
    Score search results by how many expected keywords appear in the top 3 hits.

    Args:
        results: Ranked search hits (dicts). Only the first three are inspected.
            The "name", "qualified_name", "summary" and "file_path" keys are
            read; a missing key or an explicit None is treated as "".
        expected_keywords: Keywords matched case-insensitively as substrings of
            the combined name / qualified name / summary text.

    Returns:
        (score, matches, has_test_in_top_3):
        score is the matched fraction scaled to 0-10 (0.0 when there are no
        expected keywords), matches is the number of keywords found, and the
        flag is True when any top-3 hit has "test" in its file path or name.
    """
    if not results:
        return 0.0, 0, False

    def _lowered(hit: Dict, key: str) -> str:
        # `or ""` covers both a missing key and a key stored as None
        return (hit.get(key) or "").lower()

    # combine text from top 3 results
    top_3_text = ""
    has_test_in_top_3 = False

    for hit in results[:3]:
        name = _lowered(hit, "name")
        qualified = _lowered(hit, "qualified_name")
        summary = _lowered(hit, "summary")
        file_path = _lowered(hit, "file_path")

        top_3_text += f" {name} {qualified} {summary} "

        # coarse substring check for test files (path or symbol name)
        if "test" in file_path or "test" in name:
            has_test_in_top_3 = True

    # count keyword matches
    matches = sum(1 for kw in expected_keywords if kw.lower() in top_3_text)

    # no expected keywords means nothing to score; avoid dividing by zero
    if not expected_keywords:
        return 0.0, matches, has_test_in_top_3

    score = min(10.0, (matches / len(expected_keywords)) * 10)
    return score, matches, has_test_in_top_3


async def run_benchmark(repo_id: str):
    """
    Run every query in TEST_QUERIES through search V2 and V3 and print a comparison.

    For each query both versions are called with top_k=5 and reranking enabled
    (V3 additionally with include_tests=False), scored with score_results, and
    timed. A per-query comparison is printed, followed by a summary table, a
    verdict based on the average scores, and a note on whether Voyage
    embeddings are enabled.

    Args:
        repo_id: ID of the indexed repository to search.

    Returns:
        None. All output goes to stdout.
    """
    print("=" * 80)
    print("πŸ§ͺ SEARCH V3 vs V2 BENCHMARK")
    print("=" * 80)
    print()

    indexer = OptimizedCodeIndexer()

    v2_scores = []
    v3_scores = []
    v2_times = []
    v3_times = []
    v2_test_count = 0
    v3_test_count = 0

    for tc in TEST_QUERIES:
        query = tc["query"]
        expected = tc["expected_keywords"]
        desc = tc["description"]

        print(f"πŸ“ Query: \"{query}\"")
        print(f" Scenario: {desc}")
        print()

        # V2 Search
        # perf_counter is monotonic and high-resolution, unlike wall-clock time.time()
        start = time.perf_counter()
        try:
            v2_results = await indexer.search_v2(
                query=query,
                repo_id=repo_id,
                top_k=5,
                use_reranking=True
            )
            v2_time = (time.perf_counter() - start) * 1000
        except Exception as e:
            # Best effort: a failed query scores 0 and is recorded as 0ms
            print(f" ❌ V2 Error: {e}")
            v2_results = []
            v2_time = 0

        v2_score, v2_matches, v2_has_test = score_results(v2_results, expected)
        v2_scores.append(v2_score)
        v2_times.append(v2_time)
        if v2_has_test:
            v2_test_count += 1

        # V3 Search
        start = time.perf_counter()
        try:
            v3_results = await indexer.search_v3(
                query=query,
                repo_id=repo_id,
                top_k=5,
                include_tests=False,
                use_reranking=True
            )
            v3_time = (time.perf_counter() - start) * 1000
        except Exception as e:
            # Best effort: a failed query scores 0 and is recorded as 0ms
            print(f" ❌ V3 Error: {e}")
            v3_results = []
            v3_time = 0

        v3_score, v3_matches, v3_has_test = score_results(v3_results, expected)
        v3_scores.append(v3_score)
        v3_times.append(v3_time)
        if v3_has_test:
            v3_test_count += 1

        # Print comparison
        print(f" V2: Score {v2_score:.1f}/10 ({v2_matches}/{len(expected)} keywords) | {v2_time:.0f}ms")
        if v2_results:
            print(f" Top result: {v2_results[0].get('name', 'unknown')}")

        print(f" V3: Score {v3_score:.1f}/10 ({v3_matches}/{len(expected)} keywords) | {v3_time:.0f}ms")
        if v3_results:
            print(f" Top result: {v3_results[0].get('name', 'unknown')}")

        # Winner
        if v3_score > v2_score:
            print(f" πŸ† V3 WINS (+{v3_score - v2_score:.1f})")
        elif v2_score > v3_score:
            print(f" πŸ† V2 WINS (+{v2_score - v3_score:.1f})")
        else:
            print(" 🀝 TIE")

        print()

    # Summary
    print("=" * 80)
    print("πŸ“Š BENCHMARK RESULTS")
    print("=" * 80)

    v2_avg = sum(v2_scores) / len(v2_scores)
    v3_avg = sum(v3_scores) / len(v3_scores)
    v2_total_time = sum(v2_times)
    v3_total_time = sum(v3_times)

    v2_wins = sum(1 for v2, v3 in zip(v2_scores, v3_scores) if v2 > v3)
    v3_wins = sum(1 for v2, v3 in zip(v2_scores, v3_scores) if v3 > v2)
    ties = len(v2_scores) - v2_wins - v3_wins

    # Table rows start at column 0 because they are literal string content
    print(f"""
β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
β”‚ METRIC β”‚ V2 β”‚ V3 β”‚ β”‚
β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€
β”‚ Average Score β”‚ {v2_avg:>6.1f}/10 β”‚ {v3_avg:>6.1f}/10 β”‚ {"V3 βœ“" if v3_avg > v2_avg else "V2 βœ“" if v2_avg > v3_avg else "TIE":<5}β”‚
β”‚ Total Time β”‚ {v2_total_time:>6.0f}ms β”‚ {v3_total_time:>6.0f}ms β”‚ {"V3 βœ“" if v3_total_time < v2_total_time else "V2 βœ“":<5}β”‚
β”‚ Queries with test in top3 β”‚ {v2_test_count:>6} β”‚ {v3_test_count:>6} β”‚ {"V3 βœ“" if v3_test_count < v2_test_count else "V2 βœ“" if v2_test_count < v3_test_count else "TIE":<5}β”‚
β”‚ Wins β”‚ {v2_wins:>6} β”‚ {v3_wins:>6} β”‚ β”‚
β”‚ Ties β”‚ {ties:>6} β”‚ {ties:>6} β”‚ β”‚
β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
""")

    # Final verdict: a lead of a full point or more counts as significant
    print()
    if v3_avg >= v2_avg + 1.0:
        print("βœ… VERDICT: V3 is SIGNIFICANTLY BETTER - Ready for production!")
    elif v3_avg > v2_avg:
        print("βœ… VERDICT: V3 is BETTER - Consider shipping!")
    elif v3_avg == v2_avg:
        print("⚠️ VERDICT: V3 is EQUAL to V2 - Need more optimization")
    else:
        print("❌ VERDICT: V3 is WORSE than V2 - Needs more work")

    print()

    # Check for Voyage (imported here so a missing module only produces a warning)
    try:
        from services.search_v3.integration import get_search_v3
        v3 = get_search_v3()
        if v3.is_voyage_enabled:
            print("πŸš€ Using Voyage AI code-specific embeddings")
        else:
            print("⚠️ Voyage AI not enabled - using OpenAI embeddings")
            print(" Set VOYAGE_API_KEY for better code search accuracy!")
    except Exception as e:
        print(f"⚠️ Could not check Voyage status: {e}")


if __name__ == "__main__":
    # Benchmark target: BENCHMARK_REPO_ID when set, otherwise the default
    # repo ID below (starlette) - change as needed
    REPO_ID = os.environ.get("BENCHMARK_REPO_ID", "0323a08f-9d21-4c59-b567-e0629a9bbb24")

    # One call emits both info lines plus the trailing blank line
    print(
        f"Using repo_id: {REPO_ID}",
        "Set BENCHMARK_REPO_ID env var to use a different repo",
        "",
        sep="\n",
    )

    asyncio.run(run_benchmark(REPO_ID))
Loading