diff --git a/.env.example b/.env.example index 1fb5b44..1b9bb71 100644 --- a/.env.example +++ b/.env.example @@ -27,6 +27,13 @@ SUPABASE_SERVICE_ROLE_KEY=eyJ... # Backend API API_KEY=change-this-secret-key-for-production BACKEND_API_URL=http://backend:8000 +FRONTEND_URL=http://localhost:3000 + +# GitHub OAuth (Required for GitHub repo import) +# Create OAuth App: https://github.com/settings/developers +GITHUB_CLIENT_ID= +GITHUB_CLIENT_SECRET= +GITHUB_REDIRECT_URI=http://localhost:3000/auth/github/callback # CORS Configuration (Security) # Comma-separated list of allowed origins @@ -39,10 +46,15 @@ REDIS_HOST=redis REDIS_PORT=6379 # Sentry Error Tracking (Optional but recommended for production) -# Get DSN from: https://sentry.io → Settings → Projects → Client Keys +# Get DSN from: https://sentry.io -> Settings -> Projects -> Client Keys SENTRY_DSN= +SENTRY_SEND_PII=false +SENTRY_INCLUDE_LOCAL_VARS=false ENVIRONMENT=development # development, staging, production +# Discord Webhook (Optional - for feedback notifications) +DISCORD_FEEDBACK_WEBHOOK= + # Search V2 Configuration # Cohere API for reranking (Optional - improves search quality) # Get from: https://dashboard.cohere.com/api-keys diff --git a/.gitignore b/.gitignore index a0ef7da..2b2dd7b 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,11 @@ __pycache__/ venv/ env/ +# Test/coverage artifacts +.coverage +htmlcov/ +.pytest_cache/ + # Node node_modules/ dist/ diff --git a/backend/.env.example b/backend/.env.example deleted file mode 100644 index dc8f3b1..0000000 --- a/backend/.env.example +++ /dev/null @@ -1,96 +0,0 @@ -# ============================================================================= -# OpenCodeIntel Backend Environment Variables -# ============================================================================= - -# ----------------------------------------------------------------------------- -# AI & Search APIs -# ----------------------------------------------------------------------------- - -# OpenAI - for embeddings and AI features -OPENAI_API_KEY=your_openai_api_key_here - -# Pinecone - vector database for semantic search -PINECONE_API_KEY=your_pinecone_api_key_here -PINECONE_INDEX_NAME=codeintel - -# Cohere - reranking for search quality (optional but recommended) -COHERE_API_KEY=your_cohere_api_key_here - -# Voyage AI - code-specific embeddings (recommended for code search) -# Get API key from https://dash.voyageai.com/ -VOYAGE_API_KEY=your_voyage_api_key_here - -# ----------------------------------------------------------------------------- -# Supabase - Authentication & Database -# ----------------------------------------------------------------------------- - -SUPABASE_URL=https://your-project.supabase.co -SUPABASE_ANON_KEY=your_supabase_anon_key_here -SUPABASE_JWT_SECRET=your_jwt_secret_here - -# Service role key - required for server-side database access (e.g., storing GitHub tokens) -# Get from Supabase Dashboard → Settings → API → service_role key -SUPABASE_SERVICE_ROLE_KEY=your_service_role_key_here - -# ----------------------------------------------------------------------------- -# GitHub OAuth - One-Click Repo Import Feature -# ----------------------------------------------------------------------------- -# IMPORTANT: This is SEPARATE from Supabase GitHub login! -# -# Supabase login uses its own OAuth app (configured in Supabase Dashboard). -# This OAuth app is ONLY for importing repositories from GitHub. -# -# Setup: -# 1. Go to GitHub → Settings → Developer settings → OAuth Apps → New OAuth App -# 2. Create app with name like "YourApp Repo Import" -# 3. Set callback URL based on environment: -# - Development: http://localhost:3000/github/callback -# - Production: https://yourdomain.com/github/callback -# 4. Copy Client ID and generate Client Secret -# -# You may need separate OAuth apps for dev and production (different callback URLs) - -GITHUB_CLIENT_ID=your_github_oauth_app_client_id -GITHUB_CLIENT_SECRET=your_github_oauth_app_client_secret - -# Must match EXACTLY what you set in GitHub OAuth App settings -GITHUB_REDIRECT_URI=http://localhost:3000/github/callback - -# Frontend URL for redirects after OAuth -FRONTEND_URL=http://localhost:3000 - -# ----------------------------------------------------------------------------- -# Backend Configuration -# ----------------------------------------------------------------------------- - -BACKEND_API_URL=http://localhost:8000 -API_KEY=dev-secret-key - -# CORS - allowed frontend origins (comma-separated for multiple) -ALLOWED_ORIGINS=http://localhost:3000 - -# ----------------------------------------------------------------------------- -# Redis Cache (Optional) -# ----------------------------------------------------------------------------- - -REDIS_HOST=localhost -REDIS_PORT=6379 - -# ----------------------------------------------------------------------------- -# Monitoring & Debugging -# ----------------------------------------------------------------------------- - -# Sentry error tracking (optional) -# Get DSN from https://sentry.io → Settings → Projects → Client Keys -SENTRY_DSN= - -# Environment identifier -ENVIRONMENT=development - -# ----------------------------------------------------------------------------- -# Discord Webhooks - Feedback & Waitlist -# ----------------------------------------------------------------------------- - -# Discord webhook for receiving user feedback and waitlist signups -# Create at: Discord Server → Channel Settings → Integrations → Webhooks -DISCORD_FEEDBACK_WEBHOOK=https://discord.com/api/webhooks/your_webhook_id/your_webhook_token diff --git a/backend/config/api.py b/backend/config/api.py index 1c2a770..d231006 100644 --- a/backend/config/api.py +++ b/backend/config/api.py @@ -2,43 +2,7 @@ API Configuration - Single Source of Truth for API Versioning Change API_VERSION here to update all routes across the application. -Example: "v1" -> "v2" will change /api/v1/* to /api/v2/* """ -# API VERSION CONFIGURATION - API_VERSION = "v1" - -# DERIVED PREFIXES (auto-calculated from version) - -# Current versioned API prefix: /api/v1 API_PREFIX = f"/api/{API_VERSION}" - -# Legacy prefix for backward compatibility: /api -# Routes here will be deprecated but still functional -LEGACY_API_PREFIX = "/api" - -# DEPRECATION SETTINGS - -# When True, legacy routes (/api/*) will include deprecation warning headers -LEGACY_DEPRECATION_ENABLED = True - -# Header to add on deprecated routes -DEPRECATION_HEADER = "X-API-Deprecated" -DEPRECATION_MESSAGE = f"This endpoint is deprecated. Please use {API_PREFIX} instead." - -# HELPER FUNCTIONS - -def get_versioned_prefix() -> str: - """Get the current versioned API prefix.""" - return API_PREFIX - - -def get_legacy_prefix() -> str: - """Get the legacy (deprecated) API prefix.""" - return LEGACY_API_PREFIX - - -def is_legacy_route(path: str) -> bool: - """Check if a route path is using the legacy prefix.""" - return path.startswith(LEGACY_API_PREFIX) and not path.startswith(API_PREFIX) diff --git a/backend/routes/__init__.py b/backend/routes/__init__.py index 993ab14..fe01b3d 100644 --- a/backend/routes/__init__.py +++ b/backend/routes/__init__.py @@ -1,4 +1 @@ -"""API Routes package""" -from .auth import router as auth_router - -__all__ = ["auth_router"] +"""API Routes package.""" diff --git a/backend/scripts/benchmark_search_v3.py b/backend/scripts/benchmark_search_v3.py deleted file mode 100644 index 8da31de..0000000 --- a/backend/scripts/benchmark_search_v3.py +++ /dev/null @@ -1,247 +0,0 @@ -#!/usr/bin/env python3 -""" -Search V3 vs V2 Benchmark -Run with: python3 scripts/benchmark_search_v3.py - -Compares: -- V2 (OpenAI embeddings + Cohere reranking) -- V3 (Voyage AI embeddings + Query Understanding + Code Graph + Cohere reranking) -""" -import asyncio -import os -import sys -import time -from typing import List, Dict, Tuple - -# add parent to path -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -from dotenv import load_dotenv -load_dotenv() - -from services.indexer_optimized import OptimizedCodeIndexer - -# Test queries representing real developer scenarios -TEST_QUERIES = [ - { - "query": "how to add authentication", - "expected_keywords": ["auth", "middleware", "authenticate", "credential"], - "description": "Developer wants to add auth to their app" - }, - { - "query": "handle websocket messages", - "expected_keywords": ["websocket", "message", "send", "receive", "on_"], - "description": "Developer working with WebSockets" - }, - { - "query": "return json from endpoint", - "expected_keywords": ["json", "response", "jsonresponse", "return"], - "description": "Developer wants to return JSON data" - }, - { - "query": "validate request data", - "expected_keywords": ["valid", "request", "data", "schema"], - "description": "Developer needs input validation" - }, - { - "query": "middleware that runs before request", - "expected_keywords": ["middleware", "before", "dispatch", "call_next"], - "description": "Developer needs pre-request processing" - }, - { - "query": "error handling", - "expected_keywords": ["error", "exception", "handler", "catch"], - "description": "Looking for error handling patterns" - }, - { - "query": "route decorator", - "expected_keywords": ["route", "decorator", "path", "endpoint"], - "description": "Developer needs routing functionality" - }, - { - "query": "database session", - "expected_keywords": ["database", "session", "db", "connection"], - "description": "Working with database sessions" - }, -] - - -def score_results(results: List[Dict], expected_keywords: List[str]) -> Tuple[float, int, bool]: - """ - Score search results based on expected keywords - Returns: (score 0-10, matches count, is_test_in_top_3) - """ - if not results: - return 0.0, 0, False - - # combine text from top 3 results - top_3_text = "" - has_test_in_top_3 = False - - for r in results[:3]: - name = r.get("name", "").lower() - qualified = r.get("qualified_name", "").lower() - summary = (r.get("summary") or "").lower() - file_path = r.get("file_path", "").lower() - - top_3_text += f" {name} {qualified} {summary} " - - # check for test files - if "test" in file_path or "test" in name: - has_test_in_top_3 = True - - # count keyword matches - matches = sum(1 for kw in expected_keywords if kw.lower() in top_3_text) - score = min(10.0, (matches / len(expected_keywords)) * 10) - - return score, matches, has_test_in_top_3 - - -async def run_benchmark(repo_id: str): - """Run benchmark comparing V2 vs V3""" - print("=" * 80) - print("🧪 SEARCH V3 vs V2 BENCHMARK") - print("=" * 80) - print() - - indexer = OptimizedCodeIndexer() - - v2_scores = [] - v3_scores = [] - v2_times = [] - v3_times = [] - v2_test_count = 0 - v3_test_count = 0 - - for tc in TEST_QUERIES: - query = tc["query"] - expected = tc["expected_keywords"] - desc = tc["description"] - - print(f"📝 Query: \"{query}\"") - print(f" Scenario: {desc}") - print() - - # V2 Search - start = time.time() - try: - v2_results = await indexer.search_v2( - query=query, - repo_id=repo_id, - top_k=5, - use_reranking=True - ) - v2_time = (time.time() - start) * 1000 - except Exception as e: - print(f" ❌ V2 Error: {e}") - v2_results = [] - v2_time = 0 - - v2_score, v2_matches, v2_has_test = score_results(v2_results, expected) - v2_scores.append(v2_score) - v2_times.append(v2_time) - if v2_has_test: - v2_test_count += 1 - - # V3 Search - start = time.time() - try: - v3_results = await indexer.search_v3( - query=query, - repo_id=repo_id, - top_k=5, - include_tests=False, - use_reranking=True - ) - v3_time = (time.time() - start) * 1000 - except Exception as e: - print(f" ❌ V3 Error: {e}") - v3_results = [] - v3_time = 0 - - v3_score, v3_matches, v3_has_test = score_results(v3_results, expected) - v3_scores.append(v3_score) - v3_times.append(v3_time) - if v3_has_test: - v3_test_count += 1 - - # Print comparison - print(f" V2: Score {v2_score:.1f}/10 ({v2_matches}/{len(expected)} keywords) | {v2_time:.0f}ms") - if v2_results: - print(f" Top result: {v2_results[0].get('name', 'unknown')}") - - print(f" V3: Score {v3_score:.1f}/10 ({v3_matches}/{len(expected)} keywords) | {v3_time:.0f}ms") - if v3_results: - print(f" Top result: {v3_results[0].get('name', 'unknown')}") - - # Winner - if v3_score > v2_score: - print(f" 🏆 V3 WINS (+{v3_score - v2_score:.1f})") - elif v2_score > v3_score: - print(f" 🏆 V2 WINS (+{v2_score - v3_score:.1f})") - else: - print(f" 🤝 TIE") - - print() - - # Summary - print("=" * 80) - print("📊 BENCHMARK RESULTS") - print("=" * 80) - - v2_avg = sum(v2_scores) / len(v2_scores) - v3_avg = sum(v3_scores) / len(v3_scores) - v2_total_time = sum(v2_times) - v3_total_time = sum(v3_times) - - v2_wins = sum(1 for v2, v3 in zip(v2_scores, v3_scores) if v2 > v3) - v3_wins = sum(1 for v2, v3 in zip(v2_scores, v3_scores) if v3 > v2) - ties = len(v2_scores) - v2_wins - v3_wins - - print(f""" -┌─────────────────────────────────────────────────────────┐ -│ METRIC │ V2 │ V3 │ │ -├─────────────────────────────────────────────────────────┤ -│ Average Score │ {v2_avg:>6.1f}/10 │ {v3_avg:>6.1f}/10 │ {"V3 ✓" if v3_avg > v2_avg else "V2 ✓" if v2_avg > v3_avg else "TIE":<5}│ -│ Total Time │ {v2_total_time:>6.0f}ms │ {v3_total_time:>6.0f}ms │ {"V3 ✓" if v3_total_time < v2_total_time else "V2 ✓":<5}│ -│ Queries with test in top3 │ {v2_test_count:>6} │ {v3_test_count:>6} │ {"V3 ✓" if v3_test_count < v2_test_count else "V2 ✓" if v2_test_count < v3_test_count else "TIE":<5}│ -│ Wins │ {v2_wins:>6} │ {v3_wins:>6} │ │ -│ Ties │ {ties:>6} │ {ties:>6} │ │ -└─────────────────────────────────────────────────────────┘ - """) - - # Final verdict - print() - if v3_avg >= v2_avg + 1.0: - print("✅ VERDICT: V3 is SIGNIFICANTLY BETTER - Ready for production!") - elif v3_avg > v2_avg: - print("✅ VERDICT: V3 is BETTER - Consider shipping!") - elif v3_avg == v2_avg: - print("⚠️ VERDICT: V3 is EQUAL to V2 - Need more optimization") - else: - print("❌ VERDICT: V3 is WORSE than V2 - Needs more work") - - print() - - # Check for Voyage - try: - from services.search_v3.integration import get_search_v3 - v3 = get_search_v3() - if v3.is_voyage_enabled: - print("🚀 Using Voyage AI code-specific embeddings") - else: - print("⚠️ Voyage AI not enabled - using OpenAI embeddings") - print(" Set VOYAGE_API_KEY for better code search accuracy!") - except Exception as e: - print(f"⚠️ Could not check Voyage status: {e}") - - -if __name__ == "__main__": - # default repo ID (starlette) - change as needed - REPO_ID = os.getenv("BENCHMARK_REPO_ID", "0323a08f-9d21-4c59-b567-e0629a9bbb24") - - print(f"Using repo_id: {REPO_ID}") - print("Set BENCHMARK_REPO_ID env var to use a different repo") - print() - - asyncio.run(run_benchmark(REPO_ID)) diff --git a/backend/scripts/cross_repo_test.py b/backend/scripts/cross_repo_test.py deleted file mode 100644 index 52775ca..0000000 --- a/backend/scripts/cross_repo_test.py +++ /dev/null @@ -1,128 +0,0 @@ -#!/usr/bin/env python3 -""" -Cross-Repo Test - Test V3 on multiple repositories -""" -import asyncio -import os -import sys - -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -# Load from environment (set in .env or export manually) -if not os.environ.get("VOYAGE_API_KEY"): - print("❌ VOYAGE_API_KEY not set. Export it or add to .env file.") - sys.exit(1) - -from services.indexer_optimized import OptimizedCodeIndexer - -REPOS = [ - {"id": "b0d22b4c-9d05-426e-8d9c-7278cce0f4c7", "name": "Flask"}, - {"id": "778333ff-6532-4c05-b73a-d54d44c6917d", "name": "Jotai"}, - {"id": "409fbeac-376f-4593-99a2-882d74e2cae6", "name": "Bun"}, -] - -QUERIES = [ - {"query": "routing", "good": ["route", "router", "path", "url"]}, - {"query": "middleware", "good": ["middleware", "dispatch", "handler"]}, - {"query": "request", "good": ["request", "req"]}, - {"query": "response", "good": ["response", "res", "reply"]}, - {"query": "error handling", "good": ["error", "exception", "handler"]}, -] - - -def has_test_in_top3(results): - for r in results[:3]: - name = r.get("name", "").lower() - file_path = r.get("file_path", "").lower() - if "test" in name or "test" in file_path: - return True - return False - - -async def test_repo(indexer, repo): - print(f"\n{'='*60}") - print(f"📦 Testing: {repo['name']}") - print(f"{'='*60}") - - v2_test_count = 0 - v3_test_count = 0 - v2_wins = 0 - v3_wins = 0 - - for q in QUERIES: - query = q["query"] - - try: - v2_results = await indexer.search_v2(query, repo["id"], top_k=5) - v2_has_test = has_test_in_top3(v2_results) - v2_top = v2_results[0].get("name", "?")[:20] if v2_results else "-" - except Exception as e: - v2_has_test = False - v2_top = f"error" - v2_results = [] - - try: - v3_results = await indexer.search_v3(query, repo["id"], top_k=5, include_tests=False) - v3_has_test = has_test_in_top3(v3_results) - v3_top = v3_results[0].get("name", "?")[:20] if v3_results else "-" - except Exception as e: - v3_has_test = False - v3_top = f"error" - v3_results = [] - - if v2_has_test: - v2_test_count += 1 - if v3_has_test: - v3_test_count += 1 - - # Simple win: no test pollution = better - if not v3_has_test and v2_has_test: - v3_wins += 1 - winner = "V3" - elif not v2_has_test and v3_has_test: - v2_wins += 1 - winner = "V2" - else: - winner = "TIE" - - v2_marker = "❌" if v2_has_test else "✅" - v3_marker = "❌" if v3_has_test else "✅" - - print(f" \"{query}\"") - print(f" V2: {v2_marker} {v2_top:<20} | V3: {v3_marker} {v3_top:<20} | {winner}") - - print(f"\n Summary: V2 test pollution={v2_test_count}, V3 test pollution={v3_test_count}") - return {"v2_tests": v2_test_count, "v3_tests": v3_test_count, "v2_wins": v2_wins, "v3_wins": v3_wins} - - -async def main(): - print("🧪 CROSS-REPOSITORY TEST - V2 vs V3") - - indexer = OptimizedCodeIndexer() - - total_v2_tests = 0 - total_v3_tests = 0 - - for repo in REPOS: - try: - result = await test_repo(indexer, repo) - total_v2_tests += result["v2_tests"] - total_v3_tests += result["v3_tests"] - except Exception as e: - print(f" ⚠️ Error testing {repo['name']}: {e}") - - print(f"\n{'='*60}") - print(f"📊 CROSS-REPO SUMMARY") - print(f"{'='*60}") - print(f" Total V2 test pollution: {total_v2_tests}") - print(f" Total V3 test pollution: {total_v3_tests}") - print(f" V3 reduction: {total_v2_tests - total_v3_tests} fewer test files") - - if total_v3_tests < total_v2_tests: - print(f"\n✅ V3 WINS across multiple repos!") - else: - print(f"\n⚠️ Results mixed") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/backend/scripts/edge_case_test.py b/backend/scripts/edge_case_test.py deleted file mode 100644 index e9da277..0000000 --- a/backend/scripts/edge_case_test.py +++ /dev/null @@ -1,110 +0,0 @@ -#!/usr/bin/env python3 -""" -Edge Case Test - Weird queries, typos, edge cases -""" -import asyncio -import os -import sys - -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -# Load from environment (set in .env or export manually) -if not os.environ.get("VOYAGE_API_KEY"): - print("❌ VOYAGE_API_KEY not set. Export it or add to .env file.") - sys.exit(1) - -from services.indexer_optimized import OptimizedCodeIndexer - -repo_id = "0323a08f-9d21-4c59-b567-e0629a9bbb24" # starlette - -EDGE_CASES = [ - # Typos - {"query": "authnetication", "desc": "typo in authentication"}, - {"query": "midleware", "desc": "typo in middleware"}, - - # Very short queries - {"query": "ws", "desc": "abbreviation for websocket"}, - {"query": "req", "desc": "abbreviation for request"}, - {"query": "res", "desc": "abbreviation for response"}, - - # Very long queries - {"query": "how do i create a custom middleware that logs all requests and responses", "desc": "long natural language"}, - - # Code-like queries - {"query": "async def", "desc": "code pattern"}, - {"query": "@app.route", "desc": "decorator pattern"}, - {"query": "raise HTTPException", "desc": "exception pattern"}, - - # Empty-ish queries - {"query": "the", "desc": "common word"}, - {"query": "a function that", "desc": "vague query"}, - - # Include test keyword (should include tests) - {"query": "test authentication", "desc": "explicitly wants tests"}, -] - - -async def main(): - print("🧪 EDGE CASE TEST - V3 Robustness") - print("=" * 70) - - indexer = OptimizedCodeIndexer() - - passed = 0 - failed = 0 - - for case in EDGE_CASES: - query = case["query"] - desc = case["desc"] - - print(f"\n📝 \"{query}\" ({desc})") - - try: - # Check if query should include tests - include_tests = "test" in query.lower() - - results = await indexer.search_v3( - query, repo_id, top_k=3, - include_tests=include_tests - ) - - if results: - top = results[0] - name = top.get("name", "?")[:25] - file = top.get("file_path", "?").split("/")[-1][:20] - score = top.get("score", 0) - - has_test = "test" in file.lower() or "test" in name.lower() - - # If we asked for tests, having tests is OK - if include_tests: - status = "✅ PASS" if has_test else "✅ PASS (no tests found)" - else: - status = "✅ PASS" if not has_test else "⚠️ test leak" - - print(f" Result: {name} ({file}) | score={score:.2f}") - print(f" Status: {status}") - passed += 1 - else: - print(f" Result: No results") - print(f" Status: ⚠️ empty (may be OK for weird queries)") - passed += 1 # Empty is OK for edge cases - - except Exception as e: - print(f" ❌ ERROR: {str(e)[:50]}") - failed += 1 - - print(f"\n{'='*70}") - print(f"📊 EDGE CASE RESULTS") - print(f"{'='*70}") - print(f" Passed: {passed}/{len(EDGE_CASES)}") - print(f" Failed: {failed}/{len(EDGE_CASES)}") - - if failed == 0: - print(f"\n✅ V3 handles all edge cases!") - else: - print(f"\n⚠️ {failed} edge cases need attention") - - -if __name__ == "__main__": - asyncio.run(main()) diff --git a/backend/scripts/extended_query_test.py b/backend/scripts/extended_query_test.py deleted file mode 100644 index 6b1f533..0000000 --- a/backend/scripts/extended_query_test.py +++ /dev/null @@ -1,176 +0,0 @@ -#!/usr/bin/env python3 -""" -Extended Human Query Test - More realistic developer queries -""" -import asyncio -import os -import sys -import time - -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -# Load from environment (set in .env or export manually) -if not os.environ.get("VOYAGE_API_KEY"): - print("❌ VOYAGE_API_KEY not set. Export it or add to .env file.") - sys.exit(1) - -from services.indexer_optimized import OptimizedCodeIndexer - -# More realistic queries developers would type -EXTENDED_QUERIES = [ - # Natural language questions - {"query": "how to validate input", "wants": "validation logic", "good": ["valid", "check", "schema"], "bad": ["test_"]}, - {"query": "send response to client", "wants": "response handling", "good": ["response", "send", "return"], "bad": ["test_"]}, - {"query": "parse cookies", "wants": "cookie handling", "good": ["cookie", "parse", "get"], "bad": ["test_"]}, - {"query": "handle file uploads", "wants": "file upload logic", "good": ["file", "upload", "form", "multipart"], "bad": ["test_"]}, - {"query": "cors settings", "wants": "CORS middleware", "good": ["cors", "origin", "header"], "bad": ["test_"]}, - - # Short keyword searches - {"query": "session", "wants": "session management", "good": ["session"], "bad": ["test_session"]}, - {"query": "redirect", "wants": "redirect response", "good": ["redirect", "location"], "bad": ["test_redirect"]}, - {"query": "template", "wants": "template rendering", "good": ["template", "render", "jinja"], "bad": ["test_template"]}, - {"query": "background task", "wants": "async background tasks", "good": ["background", "task", "async"], "bad": ["test_"]}, - {"query": "lifespan", "wants": "app lifespan events", "good": ["lifespan", "startup", "shutdown"], "bad": ["test_"]}, - - # Specific patterns - {"query": "404 not found", "wants": "404 error handling", "good": ["404", "not_found", "notfound"], "bad": ["test_"]}, - {"query": "rate limit", "wants": "rate limiting", "good": ["rate", "limit", "throttle"], "bad": ["test_"]}, - {"query": "database connection", "wants": "DB connection", "good": ["database", "db", "connection", "pool"], "bad": ["test_"]}, - {"query": "form data", "wants": "form parsing", "good": ["form", "data", "parse", "multipart"], "bad": ["test_"]}, - {"query": "headers", "wants": "HTTP headers", "good": ["header", "headers"], "bad": ["test_header"]}, -] - - -def score_result(result, good_keywords, bad_keywords): - name = result.get("name", "").lower() - file_path = result.get("file_path", "").lower() - qualified = result.get("qualified_name", "").lower() - text = f"{name} {file_path} {qualified}" - - for bad in bad_keywords: - if bad in text: - return -1, True - - matches = sum(1 for good in good_keywords if good in text) - return matches, False - - -def evaluate_results(results, query_info): - if not results: - return {"score": 0, "test_count": 0, "top_3": []} - - good = query_info["good"] - bad = query_info["bad"] - - total_score = 0 - test_count = 0 - top_3 = [] - - for i, r in enumerate(results[:5]): - match_score, is_test = score_result(r, good, bad) - - if i < 3: - top_3.append({ - "name": r.get("name", "?")[:25], - "file": r.get("file_path", "?").split("/")[-1][:20], - "is_test": is_test - }) - if is_test: - test_count += 1 - - position_weight = 6 - (i + 1) - if is_test: - total_score -= position_weight - else: - total_score += match_score * position_weight - - return {"score": max(0, total_score), "test_count": test_count, "top_3": top_3} - - -async def run_extended_test(): - print("=" * 70) - print("🧪 EXTENDED HUMAN QUERY TEST - V2 vs V3") - print("=" * 70) - print() - - indexer = OptimizedCodeIndexer() - repo_id = "0323a08f-9d21-4c59-b567-e0629a9bbb24" # starlette - - v2_total, v3_total = 0, 0 - v2_tests, v3_tests = 0, 0 - v2_wins, v3_wins, ties = 0, 0, 0 - - for q in EXTENDED_QUERIES: - query = q["query"] - - # V2 - try: - v2_results = await indexer.search_v2(query, repo_id, top_k=5) - except Exception as e: - print(f" V2 error for '{query}': {e}") - v2_results = [] - v2_eval = evaluate_results(v2_results, q) - - # V3 - try: - v3_results = await indexer.search_v3(query, repo_id, top_k=5, include_tests=False) - except Exception as e: - print(f" V3 error for '{query}': {e}") - v3_results = [] - v3_eval = evaluate_results(v3_results, q) - - v2_total += v2_eval["score"] - v3_total += v3_eval["score"] - v2_tests += v2_eval["test_count"] - v3_tests += v3_eval["test_count"] - - if v3_eval["score"] > v2_eval["score"]: - winner = "V3 ✓" - v3_wins += 1 - elif v2_eval["score"] > v3_eval["score"]: - winner = "V2 ✓" - v2_wins += 1 - else: - winner = "TIE" - ties += 1 - - # Compact output - print(f"📝 \"{query}\"") - print(f" V2: {v2_eval['score']:>2} | V3: {v3_eval['score']:>2} | {winner}") - - # Show top result comparison - v2_top = v2_eval["top_3"][0] if v2_eval["top_3"] else {"name": "-", "is_test": False} - v3_top = v3_eval["top_3"][0] if v3_eval["top_3"] else {"name": "-", "is_test": False} - v2_marker = "❌" if v2_top.get("is_test") else "✅" - v3_marker = "❌" if v3_top.get("is_test") else "✅" - print(f" V2 top: {v2_marker} {v2_top['name']}") - print(f" V3 top: {v3_marker} {v3_top['name']}") - print() - - # Summary - print("=" * 70) - print("📊 EXTENDED TEST RESULTS") - print("=" * 70) - print(f""" - Metric V2 V3 Winner - ───────────────────────────────────────────────────── - Total Score {v2_total:>3} {v3_total:>3} {"V3 ✓" if v3_total > v2_total else "V2 ✓" if v2_total > v3_total else "TIE"} - Test Pollution {v2_tests:>3} {v3_tests:>3} {"V3 ✓" if v3_tests < v2_tests else "V2 ✓" if v2_tests < v3_tests else "TIE"} - Queries Won {v2_wins:>3} {v3_wins:>3} - Ties {ties:>3} {ties:>3} - """) - - improvement = ((v3_total - v2_total) / max(v2_total, 1)) * 100 - print(f" V3 improvement: {improvement:.0f}%") - print() - - if v3_total > v2_total * 1.2: - print("✅ V3 SIGNIFICANTLY BETTER!") - elif v3_total > v2_total: - print("✅ V3 is better") - else: - print("⚠️ Results inconclusive") - - -if __name__ == "__main__": - asyncio.run(run_extended_test()) diff --git a/backend/scripts/extended_v3_test.py b/backend/scripts/extended_v3_test.py deleted file mode 100644 index a5655ea..0000000 --- a/backend/scripts/extended_v3_test.py +++ /dev/null @@ -1,138 +0,0 @@ -#!/usr/bin/env python3 -""" -Extended Search V3 Testing Suite -More human-like queries across different patterns -""" -import asyncio -import os -import sys - -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -# Load from environment (set in .env or export manually) -if not os.environ.get("VOYAGE_API_KEY"): - print("❌ VOYAGE_API_KEY not set. Export it or add to .env file.") - sys.exit(1) - -from services.indexer_optimized import OptimizedCodeIndexer - -# More natural human queries - how devs ACTUALLY search -EXTENDED_QUERIES = [ - # Natural language questions - {"query": "how to send a response", "wants": "Response classes"}, - {"query": "validate input", "wants": "Input validation"}, - {"query": "cookies", "wants": "Cookie handling"}, - {"query": "session management", "wants": "Session handling"}, - {"query": "cors", "wants": "CORS middleware"}, - - # Typos and variations - {"query": "http request", "wants": "Request handling"}, - {"query": "url parameters", "wants": "Path/query params"}, - {"query": "background tasks", "wants": "BackgroundTask class"}, - - # Implementation patterns - {"query": "async function", "wants": "Async handlers"}, - {"query": "decorator", "wants": "Route decorators"}, - {"query": "exception", "wants": "Exception classes"}, - - # Specific features - {"query": "file upload", "wants": "File handling"}, - {"query": "template", "wants": "Template rendering"}, - {"query": "redirect", "wants": "Redirect responses"}, - {"query": "headers", "wants": "Header handling"}, -] - - -async def run_extended_tests(): - print("=" * 70) - print("🧪 EXTENDED V3 TESTING - More Human Queries") - print("=" * 70) - print() - - indexer = OptimizedCodeIndexer() - repo_id = "0323a08f-9d21-4c59-b567-e0629a9bbb24" # starlette - - v2_wins = 0 - v3_wins = 0 - ties = 0 - v2_test_pollution = 0 - v3_test_pollution = 0 - - for q in EXTENDED_QUERIES: - query = q["query"] - wants = q["wants"] - - # V2 - try: - v2_results = await indexer.search_v2(query, repo_id, top_k=3) - except Exception as e: - print(f" V2 error: {e}") - v2_results = [] - - # V3 - try: - v3_results = await indexer.search_v3(query, repo_id, top_k=3, include_tests=False) - except Exception as e: - print(f" V3 error: {e}") - v3_results = [] - - # Check for test files in top 3 - v2_tests = sum(1 for r in v2_results[:3] if "test" in r.get("file_path", "").lower()) - v3_tests = sum(1 for r in v3_results[:3] if "test" in r.get("file_path", "").lower()) - v2_test_pollution += v2_tests - v3_test_pollution += v3_tests - - # Simple scoring: penalize test files heavily - v2_score = len(v2_results) - (v2_tests * 2) - v3_score = len(v3_results) - (v3_tests * 2) - - if v3_score > v2_score: - v3_wins += 1 - winner = "V3 ✓" - elif v2_score > v3_score: - v2_wins += 1 - winner = "V2 ✓" - else: - ties += 1 - winner = "TIE" - - # Print results - v2_top = v2_results[0].get("name", "?")[:25] if v2_results else "none" - v3_top = v3_results[0].get("name", "?")[:25] if v3_results else "none" - v2_file = v2_results[0].get("file_path", "").split("/")[-1][:20] if v2_results else "" - v3_file = v3_results[0].get("file_path", "").split("/")[-1][:20] if v3_results else "" - - test_marker_v2 = "❌" if v2_tests > 0 else "✅" - test_marker_v3 = "❌" if v3_tests > 0 else "✅" - - print(f"🔍 \"{query}\" (wants: {wants})") - print(f" V2: {test_marker_v2} {v2_top:<25} ({v2_file})") - print(f" V3: {test_marker_v3} {v3_top:<25} ({v3_file})") - print(f" Winner: {winner}") - print() - - # Summary - print("=" * 70) - print("📊 EXTENDED TEST RESULTS") - print("=" * 70) - print(f""" - V2 Wins: {v2_wins} - V3 Wins: {v3_wins} - Ties: {ties} - - V2 Test Pollution: {v2_test_pollution} test files in results - V3 Test Pollution: {v3_test_pollution} test files in results - - V3 Win Rate: {v3_wins}/{len(EXTENDED_QUERIES)} = {v3_wins/len(EXTENDED_QUERIES)*100:.0f}% - """) - - if v3_wins > v2_wins: - print("✅ V3 WINS EXTENDED TESTING!") - elif v2_wins > v3_wins: - print("❌ V2 performed better - needs investigation") - else: - print("🤝 TIE - V3 matches V2") - - -if __name__ == "__main__": - asyncio.run(run_extended_tests()) diff --git a/backend/scripts/final_v3_test.py b/backend/scripts/final_v3_test.py deleted file mode 100644 index 9d8066a..0000000 --- a/backend/scripts/final_v3_test.py +++ /dev/null @@ -1,168 +0,0 @@ -#!/usr/bin/env python3 -""" -Final Comprehensive V3 Test - Summary Report for CEO -""" -import asyncio -import os -import sys -import time - -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -# Load .env file if present -try: - from dotenv import load_dotenv - load_dotenv() -except ImportError: - pass # dotenv not installed, rely on exported env vars - -# Load from environment (set in .env or export manually) -if not os.environ.get("VOYAGE_API_KEY"): - print("❌ VOYAGE_API_KEY not set. Export it or add to .env file.") - sys.exit(1) - -from services.indexer_optimized import OptimizedCodeIndexer -from utils.test_detection import has_test_file_in_top_n as has_test_file - -# All query types combined -ALL_QUERIES = [ - # Core functionality - "authentication", "middleware", "routing", "websocket", "session", - # Natural language - "how to return json", "handle errors", "send response", "validate input", - # Features - "static files", "file upload", "cookies", "headers", "redirect", - # Implementation - "request body", "background task", "exception handler", "form data", - # Short keywords - "cors", "template", "lifespan", -] - -REPOS = [ - {"id": "0323a08f-9d21-4c59-b567-e0629a9bbb24", "name": "Starlette"}, - {"id": "b0d22b4c-9d05-426e-8d9c-7278cce0f4c7", "name": "Flask"}, -] - - -async def run_final_test(): - print() - print("╔" + "═" * 68 + "╗") - print("║" + " 🧪 FINAL V3 COMPREHENSIVE TEST REPORT ".center(68) + "║") - print("╚" + "═" * 68 + "╝") - print() - - indexer = OptimizedCodeIndexer() - - total_v2_wins = 0 - total_v3_wins = 0 - total_ties = 0 - total_v2_test_pollution = 0 - total_v3_test_pollution = 0 - total_v2_time = 0 - total_v3_time = 0 - total_queries = 0 - - for repo in REPOS: - print(f"📦 Repository: {repo['name']}") - print("-" * 50) - - repo_v2_tests = 0 - repo_v3_tests = 0 - repo_v3_wins = 0 - - for query in ALL_QUERIES: - total_queries += 1 - - # V2 - start = time.time() - try: - v2_results = await indexer.search_v2(query, repo["id"], top_k=3) - except Exception as e: - print(f" V2 error [{repo['name']}] '{query}': {e}") - v2_results = [] - v2_time = (time.time() - start) * 1000 - total_v2_time += v2_time - - # V3 - start = time.time() - try: - v3_results = await indexer.search_v3(query, repo["id"], top_k=3, include_tests=False) - except Exception as e: - print(f" V3 error [{repo['name']}] '{query}': {e}") - v3_results = [] - v3_time = (time.time() - start) * 1000 - total_v3_time += v3_time - - v2_has_test = has_test_file(v2_results) - v3_has_test = has_test_file(v3_results) - - if v2_has_test: - total_v2_test_pollution += 1 - repo_v2_tests += 1 - if v3_has_test: - total_v3_test_pollution += 1 - repo_v3_tests += 1 - - # Win logic: V3 wins if it has no test but V2 does - if not v3_has_test and v2_has_test: - total_v3_wins += 1 - repo_v3_wins += 1 - elif v3_has_test and not v2_has_test: - total_v2_wins += 1 - else: - total_ties += 1 - - print(f" V2 test pollution: {repo_v2_tests}/{len(ALL_QUERIES)}") - print(f" V3 test pollution: {repo_v3_tests}/{len(ALL_QUERIES)}") - print(f" V3 wins: {repo_v3_wins}/{len(ALL_QUERIES)}") - print() - - # Final Summary - print("╔" + "═" * 68 + "╗") - print("║" + " 📊 FINAL RESULTS ".center(68) + "║") - print("╠" + "═" * 68 + "╣") - - print(f"║ {'Metric':<35} {'V2':>10} {'V3':>10} {'Winner':>8} ║") - print("╠" + "═" * 68 + "╣") - - # Test pollution - winner = "V3 ✓" if total_v3_test_pollution < total_v2_test_pollution else "V2" if total_v2_test_pollution < total_v3_test_pollution else "TIE" - print(f"║ {'Test Files in Top 3':<35} {total_v2_test_pollution:>10} {total_v3_test_pollution:>10} {winner:>8} ║") - - # Wins - winner = "V3 ✓" if total_v3_wins > total_v2_wins else "V2" if total_v2_wins > total_v3_wins else "TIE" - print(f"║ {'Query Wins':<35} {total_v2_wins:>10} {total_v3_wins:>10} {winner:>8} ║") - - # Avg latency - avg_v2 = total_v2_time / total_queries - avg_v3 = total_v3_time / total_queries - winner = "V3 ✓" if avg_v3 < avg_v2 else "V2" if avg_v2 < avg_v3 else "TIE" - print(f"║ {'Avg Latency (ms)':<35} {avg_v2:>10.0f} {avg_v3:>10.0f} {winner:>8} ║") - - print("╠" + "═" * 68 + "╣") - - # Improvement stats - test_reduction = total_v2_test_pollution - total_v3_test_pollution - test_reduction_pct = (test_reduction / max(total_v2_test_pollution, 1)) * 100 - - print(f"║ {'Total Queries Tested':<35} {total_queries:>21} ║") - print(f"║ {'Test Pollution Reduction':<35} {test_reduction:>10} ({test_reduction_pct:.0f}%) ║") - print(f"║ {'V3 Win Rate':<35} {total_v3_wins/total_queries*100:>20.0f}% ║") - - print("╚" + "═" * 68 + "╝") - print() - - # Final verdict - if total_v3_test_pollution < total_v2_test_pollution and total_v3_wins > total_v2_wins: - print("🎯 VERDICT: V3 'Project Brain' is READY TO SHIP! 🚀") - print() - print(" ✅ Significantly reduced test file pollution") - print(" ✅ Better relevance for human-like queries") - print(" ✅ Works across multiple repositories") - print(" ✅ Query understanding + code graph ranking working") - else: - print("⚠️ VERDICT: Results inconclusive, needs review") - - -if __name__ == "__main__": - asyncio.run(run_final_test()) diff --git a/backend/scripts/human_query_test.py b/backend/scripts/human_query_test.py deleted file mode 100644 index 7031dd9..0000000 --- a/backend/scripts/human_query_test.py +++ /dev/null @@ -1,267 +0,0 @@ -#!/usr/bin/env python3 -""" -Real-World Human Query Test - V2 vs V3 -Tests with queries that REAL developers would actually type -""" -import asyncio -import os -import sys -import time - -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -# Load .env file if present -try: - from dotenv import load_dotenv - load_dotenv() -except ImportError: - pass # dotenv not installed, rely on exported env vars - -# Load from environment (set in .env or export manually) -if not os.environ.get("VOYAGE_API_KEY"): - print("❌ VOYAGE_API_KEY not set. Export it or add to .env file.") - sys.exit(1) - -from services.indexer_optimized import OptimizedCodeIndexer - -# Real human queries - how developers ACTUALLY search -HUMAN_QUERIES = [ - { - "query": "authentication", - "what_user_wants": "Auth middleware/decorators", - "good_results": ["auth", "middleware", "authenticate", "login", "session"], - "bad_results": ["test_", "_test", "mock", "fixture"], - }, - { - "query": "how do I return json", - "what_user_wants": "JSONResponse or json return patterns", - "good_results": ["json", "response", "jsonresponse", "return"], - "bad_results": ["test_", "_test"], - }, - { - "query": "handle errors", - "what_user_wants": "Error handlers, exception handling", - "good_results": ["error", "exception", "handler", "catch"], - "bad_results": ["test_error", "mock"], - }, - { - "query": "websocket", - "what_user_wants": "WebSocket connection handling", - "good_results": ["websocket", "socket", "ws", "connect"], - "bad_results": ["test_websocket"], - }, - { - "query": "middleware", - "what_user_wants": "Middleware classes/functions", - "good_results": ["middleware", "dispatch", "call_next"], - "bad_results": ["test_middleware"], - }, - { - "query": "request body", - "what_user_wants": "How to read request body/data", - "good_results": ["request", "body", "data", "json", "form"], - "bad_results": ["test_request"], - }, - { - "query": "routing", - "what_user_wants": "Route definitions, URL patterns", - "good_results": ["route", "router", "path", "endpoint", "url"], - "bad_results": ["test_route"], - }, - { - "query": "static files", - "what_user_wants": "Serving static files", - "good_results": ["static", "file", "serve", "mount"], - "bad_results": ["test_static"], - }, -] - - -def score_result(result, good_keywords, bad_keywords): - """Score a single result""" - name = result.get("name", "").lower() - file_path = result.get("file_path", "").lower() - qualified = result.get("qualified_name", "").lower() - text = f"{name} {file_path} {qualified}" - - # Check for bad results (test files) - for bad in bad_keywords: - if bad in text: - return -1, "test_file" - - # Check for good results - matches = sum(1 for good in good_keywords if good in text) - return matches, "ok" - - -def evaluate_results(results, query_info): - """Evaluate search results quality""" - if not results: - return {"score": 0, "reason": "no_results", "top_3": []} - - good = query_info["good_results"] - bad = query_info["bad_results"] - - total_score = 0 - test_files_in_top_3 = 0 - top_3 = [] - - for i, r in enumerate(results[:5]): # Check top 5 - match_score, status = score_result(r, good, bad) - - if i < 3: # Track top 3 - top_3.append({ - "name": r.get("name", "?"), - "file": r.get("file_path", "?").split("/")[-1], - "score": r.get("score", 0), - "is_test": status == "test_file" - }) - - if status == "test_file": - test_files_in_top_3 += 1 - - # Weight by position (position 1 = 5pts, position 5 = 1pt) - position_weight = 6 - (i + 1) - - if status == "test_file": - total_score -= position_weight # Penalty for test files - else: - total_score += match_score * position_weight - - return { - "score": max(0, total_score), - "test_files_in_top_3": test_files_in_top_3, - "top_3": top_3 - } - - -async def run_comparison(): - print("=" * 80) - print("🧪 REAL HUMAN QUERY TEST: V2 vs V3 (with Voyage AI)") - print("=" * 80) - print() - - indexer = OptimizedCodeIndexer() - - # Use starlette repo - repo_id = "0323a08f-9d21-4c59-b567-e0629a9bbb24" - - v2_total = 0 - v3_total = 0 - v2_test_pollution = 0 - v3_test_pollution = 0 - - results_table = [] - - for q in HUMAN_QUERIES: - query = q["query"] - print(f"🔍 Query: \"{query}\"") - print(f" User wants: {q['what_user_wants']}") - - # V2 - start = time.time() - try: - v2_results = await indexer.search_v2(query, repo_id, top_k=5) - v2_time = (time.time() - start) * 1000 - except Exception as e: - print(f" V2 Error: {e}") - v2_results = [] - v2_time = 0 - - v2_eval = evaluate_results(v2_results, q) - - # V3 - start = time.time() - try: - v3_results = await indexer.search_v3(query, repo_id, top_k=5, include_tests=False) - v3_time = (time.time() - start) * 1000 - except Exception as e: - print(f" V3 Error: {e}") - v3_results = [] - v3_time = 0 - - v3_eval = evaluate_results(v3_results, q) - - # Compare - v2_total += v2_eval["score"] - v3_total += v3_eval["score"] - v2_test_pollution += v2_eval.get("test_files_in_top_3", 0) - v3_test_pollution += v3_eval.get("test_files_in_top_3", 0) - - # Print results - print(f"\n V2 (OpenAI): Score={v2_eval['score']:>2} | {v2_time:>4.0f}ms | Tests in top3: {v2_eval.get('test_files_in_top_3', 0)}") - for r in v2_eval["top_3"]: - marker = "❌" if r["is_test"] else "✅" - print(f" {marker} {r['name'][:30]:<30} ({r['file'][:25]})") - - print(f"\n V3 (Voyage): Score={v3_eval['score']:>2} | {v3_time:>4.0f}ms | Tests in top3: {v3_eval.get('test_files_in_top_3', 0)}") - for r in v3_eval["top_3"]: - marker = "❌" if r["is_test"] else "✅" - print(f" {marker} {r['name'][:30]:<30} ({r['file'][:25]})") - - # Winner - if v3_eval["score"] > v2_eval["score"]: - print(f"\n 🏆 V3 WINS (+{v3_eval['score'] - v2_eval['score']})") - elif v2_eval["score"] > v3_eval["score"]: - print(f"\n 🏆 V2 WINS (+{v2_eval['score'] - v3_eval['score']})") - else: - print(f"\n 🤝 TIE") - - results_table.append({ - "query": query, - "v2_score": v2_eval["score"], - "v3_score": v3_eval["score"], - "v2_tests": v2_eval.get("test_files_in_top_3", 0), - "v3_tests": v3_eval.get("test_files_in_top_3", 0), - }) - - print() - print("-" * 80) - print() - - # Final Summary - print() - print("=" * 80) - print("📊 FINAL RESULTS") - print("=" * 80) - - v2_wins = sum(1 for r in results_table if r["v2_score"] > r["v3_score"]) - v3_wins = sum(1 for r in results_table if r["v3_score"] > r["v2_score"]) - ties = len(results_table) - v2_wins - v3_wins - - print(f""" -┌────────────────────────────────────────────────────────────────┐ -│ V2 (OpenAI) V3 (Voyage) WINNER │ -├────────────────────────────────────────────────────────────────┤ -│ Total Score {v2_total:>4} {v3_total:>4} {"V3 ✓" if v3_total > v2_total else "V2 ✓" if v2_total > v3_total else "TIE":<10} │ -│ Test Files in Top 3 {v2_test_pollution:>4} {v3_test_pollution:>4} {"V3 ✓" if v3_test_pollution < v2_test_pollution else "V2 ✓" if v2_test_pollution < v3_test_pollution else "TIE":<10} │ -│ Query Wins {v2_wins:>4} {v3_wins:>4} {"V3 ✓" if v3_wins > v2_wins else "V2 ✓" if v2_wins > v3_wins else "TIE":<10} │ -│ Ties {ties:>4} {ties:>4} │ -└────────────────────────────────────────────────────────────────┘ - """) - - # Per-query breakdown - print("\nPer-Query Breakdown:") - print(f"{'Query':<20} {'V2':>6} {'V3':>6} {'Winner':>10}") - print("-" * 45) - for r in results_table: - winner = "V3" if r["v3_score"] > r["v2_score"] else "V2" if r["v2_score"] > r["v3_score"] else "TIE" - print(f"{r['query']:<20} {r['v2_score']:>6} {r['v3_score']:>6} {winner:>10}") - - # Final verdict - print() - if v3_total > v2_total * 1.2: # 20% better - print("✅ VERDICT: V3 is SIGNIFICANTLY BETTER - Ship it! 🚀") - elif v3_total > v2_total: - print("✅ VERDICT: V3 is BETTER - Ready to ship!") - elif v3_total == v2_total: - print("⚠️ VERDICT: V3 is EQUAL to V2") - else: - print("❌ VERDICT: V3 needs more work") - - if v3_test_pollution < v2_test_pollution: - print(f"✅ V3 has {v2_test_pollution - v3_test_pollution} fewer test files polluting results!") - - -if __name__ == "__main__": - asyncio.run(run_comparison()) diff --git a/backend/scripts/manual_ws_test.py b/backend/scripts/manual_ws_test.py deleted file mode 100644 index 07c7874..0000000 --- a/backend/scripts/manual_ws_test.py +++ /dev/null @@ -1,175 +0,0 @@ -#!/usr/bin/env python3 -""" -MANUAL WebSocket E2E test for playground indexing. - -NOT run in CI - requires: - - Running backend server (uvicorn main:app) - - Redis running - - aiohttp installed (pip install aiohttp) - -This script: -1. Creates an indexing job via the REST API -2. Connects to the WebSocket endpoint -3. Listens for all events until completion/error -4. Reports what we received - -Usage: - cd backend - pip install aiohttp # if not installed - python3 scripts/manual_ws_test.py -""" -import asyncio -import aiohttp -import json -import sys -from datetime import datetime - -# Config -BASE_URL = "http://localhost:8000/api/v1" -WS_URL = "ws://localhost:8000/api/v1" -TEST_REPO = "https://github.com/pmndrs/zustand" # Small, fast to index - - -def log(msg: str, level: str = "INFO"): - """Print timestamped log message.""" - ts = datetime.now().strftime("%H:%M:%S.%f")[:-3] - icon = {"INFO": "ℹ️", "OK": "✅", "ERR": "❌", "WS": "🔌", "EVENT": "📨"}.get(level, "•") - print(f"[{ts}] {icon} {msg}") - - -async def create_indexing_job(session: aiohttp.ClientSession) -> dict: - """Create a new indexing job via REST API.""" - log("Creating indexing job for zustand...") - - async with session.post( - f"{BASE_URL}/playground/index", - json={"github_url": TEST_REPO} - ) as resp: - # 202 Accepted is the expected status for async job creation - if resp.status not in (200, 202): - text = await resp.text() - log(f"Failed to create job: {resp.status} - {text}", "ERR") - return None - - data = await resp.json() - job_id = data.get("job_id") - log(f"Job created: {job_id} (status: {resp.status})", "OK") - return data - - -async def listen_websocket(job_id: str) -> list: - """Connect to WebSocket and collect all events.""" - events = [] - ws_endpoint = f"{WS_URL}/ws/playground/{job_id}" - - log(f"Connecting to WebSocket: {ws_endpoint}", "WS") - - async with aiohttp.ClientSession() as session: - try: - async with session.ws_connect(ws_endpoint, timeout=120) as ws: - log("WebSocket connected!", "OK") - - async for msg in ws: - if msg.type == aiohttp.WSMsgType.TEXT: - event = json.loads(msg.data) - events.append(event) - - event_type = event.get("type", "unknown") - - # Log based on event type - if event_type == "connected": - log(f"Server acknowledged connection", "EVENT") - elif event_type == "ping": - log("Received keepalive ping", "EVENT") - elif event_type == "cloning": - repo = event.get("repo_name", "?") - log(f"Cloning: {repo}", "EVENT") - elif event_type == "progress": - pct = event.get("percent", 0) - files = event.get("files_processed", 0) - total = event.get("files_total", 0) - current = event.get("current_file") or "" - funcs = event.get("functions_found", 0) - # Truncate long paths - if current and len(current) > 40: - current = "..." + current[-37:] - log(f"Progress: {pct}% ({files}/{total}) | {funcs} funcs | {current}", "EVENT") - elif event_type == "completed": - stats = event.get("stats", {}) - log(f"COMPLETED! Functions: {stats.get('functions_found', '?')}, Time: {stats.get('time_taken_seconds', '?')}s", "OK") - break - elif event_type == "error": - log(f"ERROR: {event.get('message', 'Unknown error')}", "ERR") - break - else: - log(f"Unknown event: {event_type}", "EVENT") - - elif msg.type == aiohttp.WSMsgType.ERROR: - log(f"WebSocket error: {ws.exception()}", "ERR") - break - elif msg.type == aiohttp.WSMsgType.CLOSED: - log("WebSocket closed by server", "WS") - break - - except asyncio.TimeoutError: - log("WebSocket connection timed out", "ERR") - except Exception as e: - log(f"WebSocket error: {e}", "ERR") - - return events - - -async def main(): - """Run the end-to-end test.""" - print("\n" + "="*60) - print(" WebSocket E2E Test - Playground Indexing") - print("="*60 + "\n") - - async with aiohttp.ClientSession() as session: - # Step 1: Create job - job_data = await create_indexing_job(session) - if not job_data: - sys.exit(1) - - job_id = job_data.get("job_id") - if not job_id: - log("No job_id in response", "ERR") - sys.exit(1) - - # Step 2: Listen to WebSocket - print() - events = await listen_websocket(job_id) - - # Step 3: Summary - print("\n" + "="*60) - print(" Test Summary") - print("="*60) - - event_types = [e.get("type") for e in events] - print(f"\nTotal events received: {len(events)}") - print(f"Event types: {' → '.join(event_types)}") - - # Check expected flow - # Note: "cloning" may be skipped if repo was recently cloned - required = ["connected", "completed"] - has_required = all(t in event_types for t in required) - has_progress = "progress" in event_types - - print() - if has_required and has_progress: - log("TEST PASSED - Full event flow received!", "OK") - print() - return 0 - elif "error" in event_types: - log("TEST COMPLETED WITH ERROR - Error event received (may be expected)", "ERR") - print() - return 1 - else: - log(f"TEST INCOMPLETE - Missing events. Got: {event_types}", "ERR") - print() - return 1 - - -if __name__ == "__main__": - exit_code = asyncio.run(main()) - sys.exit(exit_code) diff --git a/backend/scripts/validate_cohere_rerank.py b/backend/scripts/validate_cohere_rerank.py deleted file mode 100644 index 238788d..0000000 --- a/backend/scripts/validate_cohere_rerank.py +++ /dev/null @@ -1,180 +0,0 @@ -#!/usr/bin/env python3 -""" -Phase 3: Cohere Reranking Validation Test -Compare V3 with reranking ON vs OFF -""" -import asyncio -import os -import sys - -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) - -# load env -try: - with open('.env', 'r') as f: - for line in f: - if '=' in line and not line.startswith('#'): - key, val = line.strip().split('=', 1) - os.environ[key] = val -except: - pass - -from services.indexer_optimized import OptimizedCodeIndexer - -QUERIES = [ - "authentication", - "how to return json", - "handle errors", - "middleware", - "websocket connection", - "static files", - "request body", - "redirect response", -] - -repo_id = "0323a08f-9d21-4c59-b567-e0629a9bbb24" # starlette - - -def has_test_in_top3(results): - for r in results[:3]: - fp = r.get("file_path", "").lower() - if "test" in fp: - return True - return False - - -def score_results(results, query): - """Simple relevance scoring based on name/file matching query terms""" - if not results: - return 0 - - score = 0 - terms = query.lower().split() - - for i, r in enumerate(results[:5]): - name = r.get("name", "").lower() - file_path = r.get("file_path", "").lower() - - # penalize test files heavily - if "test" in file_path: - score -= (5 - i) - continue - - # reward matches - for term in terms: - if term in name: - score += (5 - i) * 2 - if term in file_path: - score += (5 - i) - - return max(0, score) - - -async def run_validation(): - print("=" * 70) - print("🧪 COHERE RERANKING VALIDATION TEST") - print("=" * 70) - print() - - indexer = OptimizedCodeIndexer() - - # check if Cohere is working - from services.search_v3.integration import get_search_v3 - v3 = get_search_v3() - v3._ensure_initialized() - has_cohere = v3._search_engine.cohere_client is not None - print(f"Cohere Status: {'✅ ENABLED' if has_cohere else '❌ DISABLED'}") - print() - - if not has_cohere: - print("⚠️ Cohere not available - cannot test reranking") - return - - # test with reranking ON vs OFF - rerank_on_score = 0 - rerank_off_score = 0 - rerank_on_tests = 0 - rerank_off_tests = 0 - - for query in QUERIES: - print(f"📝 \"{query}\"") - - # V3 with reranking OFF - try: - results_off = await indexer.search_v3( - query, repo_id, top_k=5, - include_tests=False, - use_reranking=False # disable reranking - ) - off_score = score_results(results_off, query) - off_test = has_test_in_top3(results_off) - off_top = results_off[0].get("name", "?")[:25] if results_off else "none" - except Exception as e: - print(f" ❌ OFF error: {e}") - off_score, off_test, off_top = 0, False, "error" - results_off = [] - - # V3 with reranking ON - try: - results_on = await indexer.search_v3( - query, repo_id, top_k=5, - include_tests=False, - use_reranking=True # enable reranking - ) - on_score = score_results(results_on, query) - on_test = has_test_in_top3(results_on) - on_top = results_on[0].get("name", "?")[:25] if results_on else "none" - - # show rerank scores if available - if results_on and 'rerank_score' in results_on[0]: - top_rerank = results_on[0].get('rerank_score', 0) - print(f" Cohere relevance: {top_rerank:.3f}") - except Exception as e: - print(f" ❌ ON error: {e}") - on_score, on_test, on_top = 0, False, "error" - results_on = [] - - rerank_off_score += off_score - rerank_on_score += on_score - if off_test: rerank_off_tests += 1 - if on_test: rerank_on_tests += 1 - - # determine winner - if on_score > off_score: - winner = "RERANK ✓" - elif off_score > on_score: - winner = "NO-RERANK" - else: - winner = "TIE" - - off_marker = "❌" if off_test else "✅" - on_marker = "❌" if on_test else "✅" - - print(f" OFF: {off_marker} {off_top:<25} (score={off_score})") - print(f" ON: {on_marker} {on_top:<25} (score={on_score})") - print(f" Winner: {winner}") - print() - - # Summary - print("=" * 70) - print("📊 RERANKING IMPACT SUMMARY") - print("=" * 70) - print(f""" - Metric Rerank OFF Rerank ON Better? - ────────────────────────────────────────────────────────────── - Total Score {rerank_off_score:>10} {rerank_on_score:>10} {"✅ ON" if rerank_on_score > rerank_off_score else "❌ OFF"} - Test Pollution {rerank_off_tests:>10} {rerank_on_tests:>10} {"✅ ON" if rerank_on_tests < rerank_off_tests else "TIE" if rerank_on_tests == rerank_off_tests else "❌ OFF"} - """) - - improvement = ((rerank_on_score - rerank_off_score) / max(rerank_off_score, 1)) * 100 - print(f" Reranking improvement: {improvement:+.0f}%") - print() - - if rerank_on_score >= rerank_off_score and rerank_on_tests <= rerank_off_tests: - print("✅ COHERE RERANKING IS WORKING AND IMPROVING RESULTS!") - else: - print("⚠️ Reranking needs tuning") - - -if __name__ == "__main__": - asyncio.run(run_validation())