|
| 1 | +#!/usr/bin/env python3 |
| 2 | +""" |
| 3 | +Search V3 vs V2 Benchmark |
| 4 | +Run with: python3 scripts/benchmark_search_v3.py |
| 5 | +
|
| 6 | +Compares: |
| 7 | +- V2 (OpenAI embeddings + Cohere reranking) |
| 8 | +- V3 (Voyage AI embeddings + Query Understanding + Code Graph + Cohere reranking) |
| 9 | +""" |
| 10 | +import asyncio |
| 11 | +import os |
| 12 | +import sys |
| 13 | +import time |
| 14 | +from typing import List, Dict, Tuple |
| 15 | + |
| 16 | +# add parent to path |
| 17 | +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) |
| 18 | + |
| 19 | +from dotenv import load_dotenv |
| 20 | +load_dotenv() |
| 21 | + |
| 22 | +from services.indexer_optimized import OptimizedCodeIndexer |
| 23 | + |
# Benchmark cases modelled on everyday developer searches.
# Each row: (query text, keywords expected in good results, scenario description).
_QUERY_CASES = (
    (
        "how to add authentication",
        ["auth", "middleware", "authenticate", "credential"],
        "Developer wants to add auth to their app",
    ),
    (
        "handle websocket messages",
        ["websocket", "message", "send", "receive", "on_"],
        "Developer working with WebSockets",
    ),
    (
        "return json from endpoint",
        ["json", "response", "jsonresponse", "return"],
        "Developer wants to return JSON data",
    ),
    (
        "validate request data",
        ["valid", "request", "data", "schema"],
        "Developer needs input validation",
    ),
    (
        "middleware that runs before request",
        ["middleware", "before", "dispatch", "call_next"],
        "Developer needs pre-request processing",
    ),
    (
        "error handling",
        ["error", "exception", "handler", "catch"],
        "Looking for error handling patterns",
    ),
    (
        "route decorator",
        ["route", "decorator", "path", "endpoint"],
        "Developer needs routing functionality",
    ),
    (
        "database session",
        ["database", "session", "db", "connection"],
        "Working with database sessions",
    ),
)

# Expanded into the dict shape the benchmark loop reads:
# "query", "expected_keywords" and "description".
TEST_QUERIES = [
    {"query": text, "expected_keywords": keywords, "description": scenario}
    for text, keywords, scenario in _QUERY_CASES
]
| 67 | + |
| 68 | + |
def score_results(results: List[Dict], expected_keywords: List[str]) -> Tuple[float, int, bool]:
    """
    Score search results by how many expected keywords show up in the top 3 hits.

    The name, qualified name and summary of the first three results are
    lower-cased and concatenated; a keyword counts as a match when its
    lower-cased form is a substring of that text.

    Args:
        results: Search hits as dicts. The keys read are "name",
            "qualified_name", "summary" and "file_path"; a missing key or a
            None value is treated as an empty string.
        expected_keywords: Keywords that a relevant result should mention.

    Returns: (score 0-10, matches count, is_test_in_top_3)
        The score is the fraction of matched keywords scaled to 0-10. With no
        results the return is (0.0, 0, False); with no expected keywords the
        score and match count are 0 instead of raising ZeroDivisionError.
    """
    if not results:
        return 0.0, 0, False

    text_parts = []
    has_test_in_top_3 = False

    for r in results[:3]:
        # `or ""` covers both a missing key and an explicit None value
        name = (r.get("name") or "").lower()
        qualified = (r.get("qualified_name") or "").lower()
        summary = (r.get("summary") or "").lower()
        file_path = (r.get("file_path") or "").lower()

        text_parts.append(f" {name} {qualified} {summary} ")

        # substring heuristic: any "test" in the path or name flags the hit
        if "test" in file_path or "test" in name:
            has_test_in_top_3 = True

    if not expected_keywords:
        # nothing to match against; avoid dividing by zero below
        return 0.0, 0, has_test_in_top_3

    top_3_text = "".join(text_parts)

    # count keyword matches
    matches = sum(1 for kw in expected_keywords if kw.lower() in top_3_text)
    score = min(10.0, (matches / len(expected_keywords)) * 10)

    return score, matches, has_test_in_top_3
| 98 | + |
| 99 | + |
def _winner_label(v2_value: float, v3_value: float, lower_is_better: bool = False) -> str:
    """Return "V3 ✓", "V2 ✓" or "TIE" for one row of the summary table."""
    if v2_value == v3_value:
        return "TIE"
    v3_ahead = v3_value < v2_value if lower_is_better else v3_value > v2_value
    return "V3 ✓" if v3_ahead else "V2 ✓"


async def _timed_search(label: str, search, **search_kwargs) -> Tuple[List[Dict], float]:
    """Await one search call and return (results, elapsed milliseconds)."""
    # perf_counter is monotonic, so the interval cannot be skewed by clock adjustments
    start = time.perf_counter()
    try:
        results = await search(**search_kwargs)
    except Exception as e:
        # best-effort: report the failure and keep benchmarking; it scores as "no results"
        print(f" ❌ {label} Error: {e}")
        results = []
    # measured on failure too, so an erroring search is not credited with 0ms
    elapsed_ms = (time.perf_counter() - start) * 1000
    return results, elapsed_ms


async def run_benchmark(repo_id: str):
    """Run benchmark comparing V2 vs V3

    For every entry in TEST_QUERIES this awaits `search_v2` and `search_v3` on
    an OptimizedCodeIndexer (top_k=5, reranking on; V3 additionally with
    include_tests=False), scores both result lists with `score_results`, and
    prints a per-query comparison followed by a summary table and a verdict.

    A search that raises is reported, scored as an empty result list, and its
    elapsed time is still counted towards that version's total.

    Args:
        repo_id: Identifier of the indexed repository passed to both searches.
    """
    print("=" * 80)
    print("🧪 SEARCH V3 vs V2 BENCHMARK")
    print("=" * 80)
    print()

    indexer = OptimizedCodeIndexer()

    v2_scores = []
    v3_scores = []
    v2_times = []
    v3_times = []
    v2_test_count = 0
    v3_test_count = 0

    for tc in TEST_QUERIES:
        query = tc["query"]
        expected = tc["expected_keywords"]
        desc = tc["description"]

        print(f"📝 Query: \"{query}\"")
        print(f" Scenario: {desc}")
        print()

        # V2 Search
        v2_results, v2_time = await _timed_search(
            "V2",
            indexer.search_v2,
            query=query,
            repo_id=repo_id,
            top_k=5,
            use_reranking=True,
        )

        v2_score, v2_matches, v2_has_test = score_results(v2_results, expected)
        v2_scores.append(v2_score)
        v2_times.append(v2_time)
        if v2_has_test:
            v2_test_count += 1

        # V3 Search
        v3_results, v3_time = await _timed_search(
            "V3",
            indexer.search_v3,
            query=query,
            repo_id=repo_id,
            top_k=5,
            include_tests=False,
            use_reranking=True,
        )

        v3_score, v3_matches, v3_has_test = score_results(v3_results, expected)
        v3_scores.append(v3_score)
        v3_times.append(v3_time)
        if v3_has_test:
            v3_test_count += 1

        # Print comparison
        print(f" V2: Score {v2_score:.1f}/10 ({v2_matches}/{len(expected)} keywords) | {v2_time:.0f}ms")
        if v2_results:
            print(f" Top result: {v2_results[0].get('name', 'unknown')}")

        print(f" V3: Score {v3_score:.1f}/10 ({v3_matches}/{len(expected)} keywords) | {v3_time:.0f}ms")
        if v3_results:
            print(f" Top result: {v3_results[0].get('name', 'unknown')}")

        # Winner
        if v3_score > v2_score:
            print(f" 🏆 V3 WINS (+{v3_score - v2_score:.1f})")
        elif v2_score > v3_score:
            print(f" 🏆 V2 WINS (+{v2_score - v3_score:.1f})")
        else:
            print(" 🤝 TIE")

        print()

    # Summary
    print("=" * 80)
    print("📊 BENCHMARK RESULTS")
    print("=" * 80)

    # guard the averages so an empty TEST_QUERIES does not divide by zero
    v2_avg = sum(v2_scores) / len(v2_scores) if v2_scores else 0.0
    v3_avg = sum(v3_scores) / len(v3_scores) if v3_scores else 0.0
    v2_total_time = sum(v2_times)
    v3_total_time = sum(v3_times)

    v2_wins = sum(1 for v2, v3 in zip(v2_scores, v3_scores) if v2 > v3)
    v3_wins = sum(1 for v2, v3 in zip(v2_scores, v3_scores) if v3 > v2)
    ties = len(v2_scores) - v2_wins - v3_wins

    # higher is better for the score; lower is better for time and test hits
    score_winner = _winner_label(v2_avg, v3_avg)
    time_winner = _winner_label(v2_total_time, v3_total_time, lower_is_better=True)
    test_winner = _winner_label(v2_test_count, v3_test_count, lower_is_better=True)

    print(f"""
┌─────────────────────────────────────────────────────────┐
│ METRIC │ V2 │ V3 │ │
├─────────────────────────────────────────────────────────┤
│ Average Score │ {v2_avg:>6.1f}/10 │ {v3_avg:>6.1f}/10 │ {score_winner:<5}│
│ Total Time │ {v2_total_time:>6.0f}ms │ {v3_total_time:>6.0f}ms │ {time_winner:<5}│
│ Queries with test in top3 │ {v2_test_count:>6} │ {v3_test_count:>6} │ {test_winner:<5}│
│ Wins │ {v2_wins:>6} │ {v3_wins:>6} │ │
│ Ties │ {ties:>6} │ {ties:>6} │ │
└─────────────────────────────────────────────────────────┘
    """)

    # Final verdict
    print()
    if v3_avg >= v2_avg + 1.0:
        print("✅ VERDICT: V3 is SIGNIFICANTLY BETTER - Ready for production!")
    elif v3_avg > v2_avg:
        print("✅ VERDICT: V3 is BETTER - Consider shipping!")
    elif v3_avg == v2_avg:
        print("⚠️ VERDICT: V3 is EQUAL to V2 - Need more optimization")
    else:
        print("❌ VERDICT: V3 is WORSE than V2 - Needs more work")

    print()

    # Check for Voyage
    try:
        # imported lazily so a missing/broken V3 integration only skips this status line
        from services.search_v3.integration import get_search_v3
        v3 = get_search_v3()
        if v3.is_voyage_enabled:
            print("🚀 Using Voyage AI code-specific embeddings")
        else:
            print("⚠️ Voyage AI not enabled - using OpenAI embeddings")
            print(" Set VOYAGE_API_KEY for better code search accuracy!")
    except Exception as e:
        print(f"⚠️ Could not check Voyage status: {e}")
| 237 | + |
| 238 | + |
if __name__ == "__main__":
    # Repository to benchmark: taken from BENCHMARK_REPO_ID when set,
    # otherwise the built-in default ID (starlette) - change as needed.
    REPO_ID = os.environ.get("BENCHMARK_REPO_ID", "0323a08f-9d21-4c59-b567-e0629a9bbb24")

    # A single print call emitting the two info lines followed by a blank line.
    print(
        f"Using repo_id: {REPO_ID}",
        "Set BENCHMARK_REPO_ID env var to use a different repo",
        "",
        sep="\n",
    )

    asyncio.run(run_benchmark(REPO_ID))
0 commit comments