diff --git a/backend/routes/playground.py b/backend/routes/playground.py deleted file mode 100644 index e7ccacf..0000000 --- a/backend/routes/playground.py +++ /dev/null @@ -1,1303 +0,0 @@ -""" -Playground routes - no auth required, rate limited via Redis. - -Rate limiting strategy (see #93): -- Session token (httpOnly cookie): 50 searches/day per device -- IP fallback: 100 searches/day for shared networks -- Global circuit breaker: 10k searches/hour (cost protection) -""" -import os -import re -import httpx -from typing import Optional -from fastapi import APIRouter, HTTPException, Request, Response, BackgroundTasks -from pydantic import BaseModel, field_validator -import time - -from dependencies import indexer, cache, repo_manager, redis_client -from services.input_validator import InputValidator -from services.repo_validator import RepoValidator -from services.observability import logger, capture_exception -from services.playground_limiter import PlaygroundLimiter, get_playground_limiter, IndexedRepoData -from services.anonymous_indexer import ( - AnonymousIndexingJob, - run_indexing_job, -) - -router = APIRouter(prefix="/playground", tags=["Playground"]) - -# Demo repo mapping (populated on startup) -DEMO_REPO_IDS = {} - -# Session cookie config -SESSION_COOKIE_NAME = "pg_session" -SESSION_COOKIE_MAX_AGE = 86400 # 24 hours -IS_PRODUCTION = os.getenv("ENVIRONMENT", "development").lower() == "production" - -# GitHub validation config -GITHUB_URL_PATTERN = re.compile( - r"^https?://github\.com/(?P[a-zA-Z0-9_.-]+)/(?P[a-zA-Z0-9_.-]+)/?$" -) -ANONYMOUS_FILE_LIMIT = 200 # Max files for anonymous indexing -GITHUB_API_BASE = "https://api.github.com" -GITHUB_API_TIMEOUT = 10.0 # seconds -VALIDATION_CACHE_TTL = 300 # 5 minutes - - -class PlaygroundSearchRequest(BaseModel): - query: str - demo_repo: Optional[str] = None # Keep for backward compat - repo_id: Optional[str] = None # Direct repo_id (user-indexed repos) - max_results: int = 10 - # V3 options - use_v3: bool = 
True # Use Search V3 by default (better accuracy) - include_tests: bool = False # Include test files in results - - -class ValidateRepoRequest(BaseModel): - """Request body for GitHub repo validation.""" - github_url: str - - @field_validator("github_url") - @classmethod - def validate_github_url_format(cls, v: str) -> str: - """Basic URL format validation.""" - v = v.strip() - if not v: - raise ValueError("GitHub URL is required") - if not v.startswith(("http://", "https://")): - raise ValueError("URL must start with http:// or https://") - if "github.com" not in v.lower(): - raise ValueError("URL must be a GitHub repository URL") - return v - - -class IndexRepoRequest(BaseModel): - """ - Request body for anonymous repository indexing. - - Used by POST /playground/index endpoint (#125). - """ - github_url: str - branch: Optional[str] = None # None = use repo's default branch - partial: bool = False # If True, index first 200 files of large repos - - @field_validator("github_url") - @classmethod - def validate_github_url_format(cls, v: str) -> str: - """Basic URL format validation (detailed validation in endpoint).""" - v = v.strip() - if not v: - raise ValueError("GitHub URL is required") - if not v.startswith(("http://", "https://")): - raise ValueError("URL must start with http:// or https://") - if "github.com" not in v.lower(): - raise ValueError("URL must be a GitHub repository URL") - return v - - -async def load_demo_repos(): - """Load pre-indexed demo repos. 
Called from main.py on startup.""" - # Note: We mutate DEMO_REPO_IDS dict, no need for 'global' statement - try: - repos = repo_manager.list_repos() - for repo in repos: - name_lower = repo.get("name", "").lower() - if "flask" in name_lower: - DEMO_REPO_IDS["flask"] = repo["id"] - elif "fastapi" in name_lower: - DEMO_REPO_IDS["fastapi"] = repo["id"] - elif "express" in name_lower: - DEMO_REPO_IDS["express"] = repo["id"] - elif "react" in name_lower: - DEMO_REPO_IDS["react"] = repo["id"] - logger.info("Loaded demo repos", repos=list(DEMO_REPO_IDS.keys())) - except Exception as e: - logger.warning("Could not load demo repos", error=str(e)) - - -def _get_client_ip(req: Request) -> str: - """Extract client IP from request.""" - client_ip = req.client.host if req.client else "unknown" - forwarded = req.headers.get("x-forwarded-for") - if forwarded: - client_ip = forwarded.split(",")[0].strip() - return client_ip - - -def _get_session_token(req: Request) -> Optional[str]: - """Get session token from cookie.""" - return req.cookies.get(SESSION_COOKIE_NAME) - - -def _set_session_cookie(response: Response, token: str): - """Set httpOnly session cookie.""" - response.set_cookie( - key=SESSION_COOKIE_NAME, - value=token, - max_age=SESSION_COOKIE_MAX_AGE, - httponly=True, # Can't be accessed by JavaScript - samesite="lax", # CSRF protection - secure=IS_PRODUCTION, # HTTPS only in production - ) - - -def _get_limiter() -> PlaygroundLimiter: - """Get the playground limiter instance.""" - return get_playground_limiter(redis_client) - - -def _resolve_repo_id( - request: PlaygroundSearchRequest, - limiter: PlaygroundLimiter, - limit_result, - req: Request -) -> str: - """ - Resolve which repository to search. - - Priority: repo_id > demo_repo > default "flask" - - For user-indexed repos, validates session ownership and expiry. - Demo repos are always accessible without auth. 
- - Returns: - repo_id string - - Raises: - HTTPException 403: Access denied (not owner) - HTTPException 410: Repo expired - HTTPException 404: Demo repo not found - """ - # Case 1: Direct repo_id provided - if request.repo_id: - repo_id = request.repo_id - - # Demo repos bypass auth check - if repo_id in DEMO_REPO_IDS.values(): - logger.debug("Search on demo repo via repo_id", repo_id=repo_id[:16]) - return repo_id - - # User-indexed repo - validate ownership - return _validate_user_repo_access(repo_id, limiter, limit_result, req) - - # Case 2: Fall back to demo_repo or default - demo_name = request.demo_repo or "flask" - repo_id = DEMO_REPO_IDS.get(demo_name) - - if repo_id: - logger.debug("Search on demo repo", demo_name=demo_name) - return repo_id - - # Case 3: Demo not in mapping, try first indexed repo - repos = repo_manager.list_repos() - indexed_repos = [r for r in repos if r.get("status") == "indexed"] - - if indexed_repos: - fallback_id = indexed_repos[0]["id"] - logger.debug("Using fallback indexed repo", repo_id=fallback_id[:16]) - return fallback_id - - logger.warning("No demo repo available", requested=demo_name) - raise HTTPException( - status_code=404, - detail=f"Demo repo '{demo_name}' not available" - ) - - -def _validate_user_repo_access( - repo_id: str, - limiter: PlaygroundLimiter, - limit_result, - req: Request -) -> str: - """ - Validate that the session owns the requested user-indexed repo. 
- - Returns: - repo_id if valid - - Raises: - HTTPException 403: No session or not owner - HTTPException 410: Repo expired - """ - session_token = limit_result.session_token or _get_session_token(req) - token_preview = session_token[:8] if session_token else "none" - - # No session token at all - if not session_token: - logger.warning( - "Search denied - no session token", - repo_id=repo_id[:16] - ) - raise HTTPException( - status_code=403, - detail={ - "error": "access_denied", - "message": "You don't have access to this repository" - } - ) - - # Get session data and check ownership - session_data = limiter.get_session_data(session_token) - indexed_repo = session_data.indexed_repo - session_repo_id = indexed_repo.get("repo_id") if indexed_repo else None - - if not indexed_repo or session_repo_id != repo_id: - logger.warning( - "Search denied - repo not owned by session", - requested_repo_id=repo_id[:16], - session_repo_id=session_repo_id[:16] if session_repo_id else "none", - session_token=token_preview - ) - raise HTTPException( - status_code=403, - detail={ - "error": "access_denied", - "message": "You don't have access to this repository" - } - ) - - # Check expiry - repo_data = IndexedRepoData.from_dict(indexed_repo) - if repo_data.is_expired(): - logger.warning( - "Search denied - repo expired", - repo_id=repo_id[:16], - expired_at=indexed_repo.get("expires_at"), - session_token=token_preview - ) - raise HTTPException( - status_code=410, - detail={ - "error": "repo_expired", - "message": "Repository index expired. Re-index to continue searching.", - "can_reindex": True - } - ) - - # All checks passed - logger.info( - "Search on user-indexed repo", - repo_id=repo_id[:16], - repo_name=indexed_repo.get("name"), - session_token=token_preview - ) - return repo_id - - -@router.get("/limits") -async def get_playground_limits(req: Request): - """ - Get current rate limit status for this user. 
- - Frontend should call this on page load to show accurate remaining count. - """ - session_token = _get_session_token(req) - client_ip = _get_client_ip(req) - - limiter = _get_limiter() - result = limiter.check_limit(session_token, client_ip) - - return { - "remaining": result.remaining, - "limit": result.limit, - "resets_at": result.resets_at.isoformat(), - "tier": "anonymous", - } - - -@router.get("/session") -async def get_session_info(req: Request, response: Response): - """ - Get current session state including indexed repo info. - - Returns complete session data for frontend state management. - Creates a new session if none exists. - - Response schema (see issue #127): - { - "session_id": "pg_abc123...", - "created_at": "2025-12-24T10:00:00Z", - "expires_at": "2025-12-25T10:00:00Z", - "indexed_repo": { - "repo_id": "repo_abc123", - "github_url": "https://github.com/user/repo", - "name": "repo", - "indexed_at": "2025-12-24T10:05:00Z", - "expires_at": "2025-12-25T10:05:00Z", - "file_count": 198 - }, - "searches": { - "used": 12, - "limit": 50, - "remaining": 38 - } - } - """ - session_token = _get_session_token(req) - limiter = _get_limiter() - - # Check if Redis is available - if not redis_client: - logger.error("Redis unavailable for session endpoint") - raise HTTPException( - status_code=503, - detail={ - "message": "Service temporarily unavailable", - "retry_after": 30, - } - ) - - # Get existing session data - session_data = limiter.get_session_data(session_token) - - # If no session exists, create one - if session_data.session_id is None: - new_token = limiter._generate_session_token() - - if limiter.create_session(new_token): - _set_session_cookie(response, new_token) - session_data = limiter.get_session_data(new_token) - logger.info("Created new session via /session endpoint", - session_token=new_token[:8]) - else: - # Failed to create session (Redis issue) - raise HTTPException( - status_code=503, - detail={ - "message": "Failed to create session", - 
"retry_after": 30, - } - ) - - # Return formatted response - return session_data.to_response(limit=limiter.SESSION_LIMIT_PER_DAY) - - -@router.post("/search") -async def playground_search( - request: PlaygroundSearchRequest, - req: Request, - response: Response -): - """ - Public playground search - rate limited by session/IP. - - Sets httpOnly cookie on first request to track device. - """ - session_token = _get_session_token(req) - client_ip = _get_client_ip(req) - - # Rate limit check AND record - limiter = _get_limiter() - limit_result = limiter.check_and_record(session_token, client_ip) - - if not limit_result.allowed: - raise HTTPException( - status_code=429, - detail={ - "message": limit_result.reason, - "remaining": 0, - "limit": limit_result.limit, - "resets_at": limit_result.resets_at.isoformat(), - } - ) - - # Set session cookie if new token was created - if limit_result.session_token: - _set_session_cookie(response, limit_result.session_token) - - # Validate query - valid_query, query_error = InputValidator.validate_search_query(request.query) - if not valid_query: - raise HTTPException(status_code=400, detail=f"Invalid query: {query_error}") - - # Resolve repo_id: priority is repo_id > demo_repo > default "flask" - repo_id = _resolve_repo_id(request, limiter, limit_result, req) - - start_time = time.time() - - try: - sanitized_query = InputValidator.sanitize_string(request.query, max_length=200) - - # Check cache (include flags in key to avoid returning wrong results) - cache_key = f"{sanitized_query}:v3={request.use_v3}:tests={request.include_tests}" - cached_results = cache.get_search_results(cache_key, repo_id) - if cached_results: - return { - "results": cached_results, - "count": len(cached_results), - "cached": True, - "remaining_searches": limit_result.remaining, - "limit": limit_result.limit, - } - - # Search V3 (default) or V2 (fallback) - if request.use_v3: - search_results = await indexer.search_v3( - query=sanitized_query, - 
repo_id=repo_id, - top_k=min(request.max_results, 10), - include_tests=request.include_tests, - use_reranking=True - ) - else: - search_results = await indexer.search_v2( - query=sanitized_query, - repo_id=repo_id, - top_k=min(request.max_results, 10), - use_reranking=True - ) - - # Format results for frontend compatibility - results = [] - for r in search_results: - results.append({ - "name": r.get("name", ""), - "qualified_name": r.get("qualified_name", r.get("name", "")), - "file_path": r.get("file_path", ""), - "code": r.get("code", ""), - "signature": r.get("signature", ""), - "language": r.get("language", ""), - "score": r.get("score", 0), - "line_start": r.get("line_start", 0), - "line_end": r.get("line_end", 0), - "type": "function", # backward compat with V1 - "summary": r.get("summary"), - "class_name": r.get("class_name"), - "is_test_file": r.get("is_test_file", False), # V3 feature - }) - - # Cache results (using same key that includes flags) - cache.set_search_results(cache_key, repo_id, results, ttl=3600) - - search_time = int((time.time() - start_time) * 1000) - - return { - "results": results, - "count": len(results), - "cached": False, - "remaining_searches": limit_result.remaining, - "limit": limit_result.limit, - "search_time_ms": search_time, - "search_version": "v3" if request.use_v3 else "v2", - } - except HTTPException: - raise - except Exception as e: - capture_exception(e, operation="playground_search") - logger.error("Playground search failed", error=str(e)) - raise HTTPException(status_code=500, detail="Search failed") - - -@router.get("/repos") -async def list_playground_repos(): - """List available demo repositories.""" - return { - "repos": [ - { - "id": "flask", - "name": "Flask", - "description": "Python web framework", - "available": "flask" in DEMO_REPO_IDS - }, - { - "id": "fastapi", - "name": "FastAPI", - "description": "Modern Python API", - "available": "fastapi" in DEMO_REPO_IDS - }, - { - "id": "express", - "name": "Express", 
- "description": "Node.js framework", - "available": "express" in DEMO_REPO_IDS - }, - ] - } - - -@router.get("/stats") -async def get_playground_stats(): - """ - Get playground usage stats (for monitoring/debugging). - """ - limiter = _get_limiter() - stats = limiter.get_usage_stats() - return stats - - -def _parse_github_url(url: str) -> tuple[Optional[str], Optional[str], Optional[str]]: - """ - Parse GitHub URL to extract owner and repo. - - Returns: - (owner, repo, error) - error is None if successful - """ - match = GITHUB_URL_PATTERN.match(url.strip().rstrip("/")) - if not match: - return None, None, "Invalid GitHub URL format. Expected: https://github.com/owner/repo" - return match.group("owner"), match.group("repo"), None - - -async def _fetch_repo_metadata(owner: str, repo: str) -> dict: - """ - Fetch repository metadata from GitHub API. - - Returns dict with repo info or error details. - """ - url = f"{GITHUB_API_BASE}/repos/{owner}/{repo}" - headers = { - "Accept": "application/vnd.github.v3+json", - "User-Agent": "OpenCodeIntel/1.0", - } - - # Add GitHub token if available (for higher rate limits) - github_token = os.getenv("GITHUB_TOKEN") - if github_token: - headers["Authorization"] = f"token {github_token}" - - async with httpx.AsyncClient(timeout=GITHUB_API_TIMEOUT) as client: - try: - response = await client.get(url, headers=headers) - - if response.status_code == 404: - return {"error": "not_found", "message": "Repository not found"} - if response.status_code == 403: - return { - "error": "rate_limited", - "message": "GitHub API rate limit exceeded" - } - if response.status_code != 200: - return { - "error": "api_error", - "message": f"GitHub API error: {response.status_code}" - } - - return response.json() - except httpx.TimeoutException: - return {"error": "timeout", "message": "GitHub API request timed out"} - except Exception as e: - logger.error("GitHub API request failed", error=str(e)) - return {"error": "request_failed", "message": 
"Failed to fetch repository metadata"} - - -async def _count_code_files( - owner: str, repo: str, default_branch: str -) -> tuple[int, Optional[str]]: - """ - Count code files in repository using GitHub tree API. - - Returns: - (file_count, error) - error is None if successful - """ - url = f"{GITHUB_API_BASE}/repos/{owner}/{repo}/git/trees/{default_branch}?recursive=1" - headers = { - "Accept": "application/vnd.github.v3+json", - "User-Agent": "OpenCodeIntel/1.0", - } - - github_token = os.getenv("GITHUB_TOKEN") - if github_token: - headers["Authorization"] = f"token {github_token}" - - async with httpx.AsyncClient(timeout=GITHUB_API_TIMEOUT) as client: - try: - response = await client.get(url, headers=headers) - - if response.status_code == 404: - return 0, "Could not fetch repository tree" - if response.status_code == 403: - return 0, "GitHub API rate limit exceeded" - if response.status_code != 200: - return 0, f"GitHub API error: {response.status_code}" - - data = response.json() - - # Check if tree was truncated (very large repos) - if data.get("truncated", False): - # For truncated trees, estimate from repo size - # GitHub's size is in KB, rough estimate: 1 code file per 5KB - return -1, "truncated" - - # Count files with code extensions - code_extensions = RepoValidator.CODE_EXTENSIONS - skip_dirs = RepoValidator.SKIP_DIRS - - count = 0 - for item in data.get("tree", []): - if item.get("type") != "blob": - continue - - path = item.get("path", "") - - # Skip if in excluded directory - path_parts = path.split("/") - if any(part in skip_dirs for part in path_parts): - continue - - # Check extension - ext = "." + path.rsplit(".", 1)[-1] if "." 
in path else "" - if ext.lower() in code_extensions: - count += 1 - - return count, None - except httpx.TimeoutException: - return 0, "GitHub API request timed out" - except Exception as e: - # Log detailed error server-side, but don't expose to client - logger.error("GitHub tree API failed", error=str(e)) - return 0, "error" - - -@router.post("/validate-repo") -async def validate_github_repo(request: ValidateRepoRequest, req: Request): - """ - Validate a GitHub repository URL for anonymous indexing. - - Checks: - - URL format is valid - - Repository exists and is public - - File count is within anonymous limit (200 files) - - Response varies based on validation result (see issue #124). - """ - start_time = time.time() - - # Check cache first - cache_key = f"validate:{request.github_url}" - cached = cache.get(cache_key) if cache else None - if cached: - logger.info("Returning cached validation", url=request.github_url[:50]) - return cached - - # Parse URL - owner, repo_name, parse_error = _parse_github_url(request.github_url) - if parse_error: - return { - "valid": False, - "reason": "invalid_url", - "message": parse_error, - } - - # Fetch repo metadata from GitHub - metadata = await _fetch_repo_metadata(owner, repo_name) - - if "error" in metadata: - error_type = metadata["error"] - if error_type == "not_found": - return { - "valid": False, - "reason": "not_found", - "message": "Repository not found. Check the URL or ensure it's public.", - } - elif error_type == "rate_limited": - raise HTTPException( - status_code=429, - detail={"message": "GitHub API rate limit exceeded. 
Try again later."} - ) - else: - raise HTTPException( - status_code=502, - detail={"message": metadata.get("message", "Failed to fetch repository info")} - ) - - # Check if private - is_private = metadata.get("private", False) - if is_private: - return { - "valid": True, - "repo_name": repo_name, - "owner": owner, - "is_public": False, - "can_index": False, - "reason": "private", - "message": "This repository is private. " - "Anonymous indexing only supports public repositories.", - } - - # Get file count - default_branch = metadata.get("default_branch", "main") - file_count, count_error = await _count_code_files(owner, repo_name, default_branch) - - # Handle truncated tree (very large repo) - if count_error == "truncated": - # Estimate from repo size (GitHub size is in KB) - repo_size_kb = metadata.get("size", 0) - # Rough estimate: 1 code file per 3KB for code repos - file_count = max(repo_size_kb // 3, ANONYMOUS_FILE_LIMIT + 1) - logger.info("Using estimated file count for large repo", - owner=owner, repo=repo_name, estimated=file_count) - - elif count_error: - logger.warning("Could not count files", owner=owner, repo=repo_name, error=count_error) - # Fall back to size-based estimate - repo_size_kb = metadata.get("size", 0) - file_count = max(repo_size_kb // 3, 1) - - # Build response - response_time_ms = int((time.time() - start_time) * 1000) - - if file_count > ANONYMOUS_FILE_LIMIT: - result = { - "valid": True, - "repo_name": repo_name, - "owner": owner, - "is_public": True, - "default_branch": default_branch, - "file_count": file_count, - "size_kb": metadata.get("size", 0), - "language": metadata.get("language"), - "stars": metadata.get("stargazers_count", 0), - "can_index": False, - "reason": "too_large", - "message": f"Repository has {file_count:,} code files. 
" - f"Anonymous limit is {ANONYMOUS_FILE_LIMIT}.", - "limit": ANONYMOUS_FILE_LIMIT, - "response_time_ms": response_time_ms, - } - else: - result = { - "valid": True, - "repo_name": repo_name, - "owner": owner, - "is_public": True, - "default_branch": default_branch, - "file_count": file_count, - "size_kb": metadata.get("size", 0), - "language": metadata.get("language"), - "stars": metadata.get("stargazers_count", 0), - "can_index": True, - "message": "Ready to index", - "response_time_ms": response_time_ms, - } - - # Cache successful validations - if cache: - cache.set(cache_key, result, ttl=VALIDATION_CACHE_TTL) - - logger.info("Validated GitHub repo", - owner=owner, repo=repo_name, - file_count=file_count, can_index=result["can_index"], - response_time_ms=response_time_ms) - - return result - - -# Anonymous Indexing Endpoint (#125) - -@router.post("/index", status_code=202) -async def start_anonymous_indexing( - request: IndexRepoRequest, - req: Request, - response: Response, - background_tasks: BackgroundTasks -): - """ - Start indexing a public GitHub repository for anonymous users. - - This endpoint validates the repository and queues it for indexing. - Returns a job_id that can be used to poll for status via GET /index/{job_id}. - - Constraints: - - Max 200 code files (anonymous limit) - - 1 repo per session (no concurrent indexing) - - Public repos only - - 24hr TTL on indexed data - - See issue #125 for full specification. 
- """ - start_time = time.time() - limiter = _get_limiter() - - # --- Step 1: Session validation (get existing or create new) --- - session_token = _get_session_token(req) - client_ip = _get_client_ip(req) - - if not session_token: - # Create new session - generate token first, then create session - session_token = limiter._generate_session_token() - limiter.create_session(session_token) - _set_session_cookie(response, session_token) - logger.info("Created new session for indexing", - session_token=session_token[:8], - client_ip=client_ip) - - # --- Step 2: Check if session already has an indexed repo --- - session_data = limiter.get_session_data(session_token) - - if session_data.indexed_repo: - # Check if the existing repo has expired - from datetime import datetime, timezone - - expires_at_str = session_data.indexed_repo.get("expires_at", "") - is_expired = False - - if expires_at_str: - try: - expires_at = datetime.fromisoformat(expires_at_str.replace("Z", "+00:00")) - is_expired = datetime.now(timezone.utc) > expires_at - except (ValueError, AttributeError): - is_expired = True # Treat parse errors as expired - - if not is_expired: - # Session already has a valid indexed repo - return 409 Conflict - logger.info("Session already has indexed repo", - session_token=session_token[:8], - existing_repo=session_data.indexed_repo.get("repo_id")) - - raise HTTPException( - status_code=409, - detail={ - "error": "already_indexed", - "message": "You already have an indexed repository. 
" - "Only 1 repo per session allowed.", - "indexed_repo": session_data.indexed_repo - } - ) - else: - # Existing repo expired - allow new indexing - logger.info("Existing indexed repo expired, allowing new indexing", - session_token=session_token[:8]) - - # --- Step 3: Validate GitHub URL (reuse existing logic) --- - owner, repo_name, parse_error = _parse_github_url(request.github_url) - if parse_error: - raise HTTPException( - status_code=400, - detail={ - "error": "validation_failed", - "reason": "invalid_url", - "message": parse_error - } - ) - - # Fetch repo metadata from GitHub - metadata = await _fetch_repo_metadata(owner, repo_name) - - if "error" in metadata: - error_type = metadata["error"] - if error_type == "not_found": - raise HTTPException( - status_code=400, - detail={ - "error": "validation_failed", - "reason": "not_found", - "message": "Repository not found. Check the URL or ensure it's public." - } - ) - elif error_type == "rate_limited": - raise HTTPException( - status_code=429, - detail={ - "error": "github_rate_limit", - "message": "GitHub API rate limit exceeded. Try again later." - } - ) - else: - raise HTTPException( - status_code=502, - detail={ - "error": "github_error", - "message": metadata.get("message", "Failed to fetch repository info") - } - ) - - # Check if private - if metadata.get("private", False): - raise HTTPException( - status_code=400, - detail={ - "error": "validation_failed", - "reason": "private", - "message": "This repository is private. " - "Anonymous indexing only supports public repositories." 
- } - ) - - # Determine branch - branch = request.branch or metadata.get("default_branch", "main") - - # Get file count - file_count, count_error = await _count_code_files(owner, repo_name, branch) - - # Handle truncated tree (very large repo) - if count_error == "truncated": - repo_size_kb = metadata.get("size", 0) - file_count = max(repo_size_kb // 3, ANONYMOUS_FILE_LIMIT + 1) - elif count_error: - repo_size_kb = metadata.get("size", 0) - file_count = max(repo_size_kb // 3, 1) - - # Check file limit - is_partial = False - files_to_index = file_count - - if file_count > ANONYMOUS_FILE_LIMIT: - if request.partial: - # Partial indexing - cap at limit - is_partial = True - files_to_index = ANONYMOUS_FILE_LIMIT - logger.info("Partial indexing enabled", - total_files=file_count, - indexing=files_to_index) - else: - # Reject large repos without partial flag - raise HTTPException( - status_code=400, - detail={ - "error": "validation_failed", - "reason": "too_large", - "message": f"Repository has {file_count:,} code files. " - f"Anonymous limit is {ANONYMOUS_FILE_LIMIT}. " - f"Use partial=true to index first {ANONYMOUS_FILE_LIMIT} files.", - "file_count": file_count, - "limit": ANONYMOUS_FILE_LIMIT, - "hint": "Set partial=true to index a subset of files" - } - ) - - # --- Validation passed! 
Create job and start background indexing --- - - response_time_ms = int((time.time() - start_time) * 1000) - - # Initialize job manager - job_manager = AnonymousIndexingJob(redis_client) - job_id = job_manager.generate_job_id() - - # Create job in Redis - job_manager.create_job( - job_id=job_id, - session_id=session_token, - github_url=request.github_url, - owner=owner, - repo_name=repo_name, - branch=branch, - file_count=file_count, - is_partial=is_partial, - max_files=files_to_index - ) - - # Queue background task - background_tasks.add_task( - run_indexing_job, - job_manager=job_manager, - indexer=indexer, - limiter=limiter, - job_id=job_id, - session_id=session_token, - github_url=request.github_url, - owner=owner, - repo_name=repo_name, - branch=branch, - file_count=files_to_index, # Actual files to index (may be capped) - max_files=files_to_index if is_partial else None # Limit for partial indexing - ) - - logger.info("Indexing job queued", - job_id=job_id, - owner=owner, - repo=repo_name, - branch=branch, - file_count=files_to_index, - is_partial=is_partial, - session_token=session_token[:8], - response_time_ms=response_time_ms) - - # Estimate time based on file count (~0.3s per file) - estimated_seconds = max(10, int(files_to_index * 0.3)) - - response_data = { - "job_id": job_id, - "status": "queued", - "estimated_time_seconds": estimated_seconds, - "file_count": files_to_index, - "message": f"Indexing started. Poll /playground/index/{job_id} for status." - } - - # Add partial info if applicable - if is_partial: - response_data["partial"] = True - response_data["total_files"] = file_count - response_data["message"] = ( - f"Partial indexing started ({files_to_index} of {file_count} files). " - f"Poll /playground/index/{job_id} for status." 
- ) - - return response_data - - -# GET /playground/index/{job_id} - Check indexing job status (#126) - -@router.get( - "/index/{job_id}", - summary="Check indexing job status", - description=""" -Poll this endpoint to check the status of an anonymous indexing job. - -**Status values:** -- `queued` - Job is waiting to start -- `cloning` - Repository is being cloned from GitHub -- `processing` - Files are being parsed and indexed -- `completed` - Indexing finished, `repo_id` available for search -- `failed` - Error occurred, check `error` and `error_message` fields - -**Polling recommendation:** Every 2-3 seconds until completed/failed. - -**TTL:** Job metadata expires after 1 hour. -""", - responses={ - 200: { - "description": "Job status", - "content": { - "application/json": { - "examples": { - "queued": { - "summary": "Job queued", - "value": { - "job_id": "idx_abc123", - "status": "queued", - "message": "Job is queued for processing", - "created_at": "2025-12-26T10:00:00Z", - "updated_at": "2025-12-26T10:00:00Z", - "repository": { - "owner": "pallets", - "name": "flask", - "branch": "main", - "github_url": "https://github.com/pallets/flask" - } - } - }, - "cloning": { - "summary": "Cloning repository", - "value": { - "job_id": "idx_abc123", - "status": "cloning", - "message": "Cloning repository...", - "created_at": "2025-12-26T10:00:00Z", - "updated_at": "2025-12-26T10:00:05Z", - "repository": { - "owner": "pallets", - "name": "flask", - "branch": "main", - "github_url": "https://github.com/pallets/flask" - } - } - }, - "processing": { - "summary": "Indexing in progress", - "value": { - "job_id": "idx_abc123", - "status": "processing", - "message": "Indexing files...", - "created_at": "2025-12-26T10:00:00Z", - "updated_at": "2025-12-26T10:00:30Z", - "repository": { - "owner": "pallets", - "name": "flask", - "branch": "main", - "github_url": "https://github.com/pallets/flask" - }, - "progress": { - "files_processed": 50, - "files_total": 100, - 
"functions_found": 250, - "percent_complete": 50, - "current_file": "src/flask/app.py" - } - } - }, - "processing_partial": { - "summary": "Partial indexing in progress", - "value": { - "job_id": "idx_abc123", - "status": "processing", - "message": "Indexing files...", - "partial": True, - "max_files": 200, - "created_at": "2025-12-26T10:00:00Z", - "updated_at": "2025-12-26T10:00:30Z", - "repository": { - "owner": "facebook", - "name": "react", - "branch": "main", - "github_url": "https://github.com/facebook/react" - }, - "progress": { - "files_processed": 100, - "files_total": 200, - "functions_found": 450, - "percent_complete": 50, - "current_file": "packages/react/src/React.js" - } - } - }, - "completed": { - "summary": "Indexing completed", - "value": { - "job_id": "idx_abc123", - "status": "completed", - "message": "Indexing completed successfully", - "repo_id": "anon_idx_abc123", - "created_at": "2025-12-26T10:00:00Z", - "updated_at": "2025-12-26T10:01:00Z", - "repository": { - "owner": "pallets", - "name": "flask", - "branch": "main", - "github_url": "https://github.com/pallets/flask" - }, - "stats": { - "files_indexed": 100, - "functions_found": 500, - "time_taken_seconds": 45.2 - } - } - }, - "failed": { - "summary": "Indexing failed", - "value": { - "job_id": "idx_abc123", - "status": "failed", - "message": "Repository not found or access denied", - "error": "clone_failed", - "error_message": "Repository not found or access denied", - "created_at": "2025-12-26T10:00:00Z", - "updated_at": "2025-12-26T10:00:10Z", - "repository": { - "owner": "user", - "name": "private-repo", - "branch": "main", - "github_url": "https://github.com/user/private-repo" - } - } - } - } - } - } - }, - 400: { - "description": "Invalid job ID format", - "content": { - "application/json": { - "example": { - "detail": { - "error": "invalid_job_id", - "message": "Invalid job ID format" - } - } - } - } - }, - 404: { - "description": "Job not found or expired", - "content": { - 
"application/json": { - "example": { - "detail": { - "error": "job_not_found", - "message": "Job not found or has expired. Jobs expire after 1 hour." - } - } - } - } - } - } -) -async def get_indexing_status( - job_id: str, - req: Request -): - """ - Check the status of an anonymous indexing job. - - Poll this endpoint after starting an indexing job to track progress. - Jobs expire after 1 hour. - - Status values: - - queued: Job is waiting to start - - cloning: Repository is being cloned - - processing: Files are being indexed - - completed: Indexing finished successfully - - failed: Indexing failed (check error field) - """ - # Validate job_id format - if not job_id or not job_id.startswith("idx_"): - raise HTTPException( - status_code=400, - detail={ - "error": "invalid_job_id", - "message": "Invalid job ID format" - } - ) - - # Get job from Redis - job_manager = AnonymousIndexingJob(redis_client) - job = job_manager.get_job(job_id) - - if not job: - raise HTTPException( - status_code=404, - detail={ - "error": "job_not_found", - "message": "Job not found or has expired. Jobs expire after 1 hour." - } - ) - - # Build response based on status - status = job.get("status", "unknown") - response = { - "job_id": job_id, - "status": status, - "created_at": job.get("created_at"), - "updated_at": job.get("updated_at"), - } - - # Add repo info - response["repository"] = { - "owner": job.get("owner"), - "name": job.get("repo_name"), - "branch": job.get("branch"), - "github_url": job.get("github_url"), - } - - # Add partial info if applicable - if job.get("is_partial"): - response["partial"] = True - response["max_files"] = job.get("max_files") - - # Status-specific fields - if status == "queued": - response["message"] = "Job is queued for processing" - - elif status == "cloning": - response["message"] = "Cloning repository..." - - elif status == "processing": - response["message"] = "Indexing files..." 
- if job.get("progress"): - progress = job["progress"] - files_processed = progress.get("files_processed", 0) - files_total = progress.get("files_total", 1) - percent = round((files_processed / files_total) * 100) if files_total > 0 else 0 - response["progress"] = { - "files_processed": files_processed, - "files_total": files_total, - "functions_found": progress.get("functions_found", 0), - "percent_complete": percent, - "current_file": progress.get("current_file") - } - - elif status == "completed": - response["message"] = "Indexing completed successfully" - response["repo_id"] = job.get("repo_id") - if job.get("stats"): - response["stats"] = job["stats"] - - elif status == "failed": - response["message"] = job.get("error_message", "Indexing failed") - response["error"] = job.get("error", "unknown_error") - response["error_message"] = job.get("error_message") - - return response diff --git a/backend/routes/playground/__init__.py b/backend/routes/playground/__init__.py new file mode 100644 index 0000000..a6afeff --- /dev/null +++ b/backend/routes/playground/__init__.py @@ -0,0 +1,29 @@ +""" +Playground routes package. 
+ +Split from a 1306-line monolith into focused modules: + search.py -- search endpoint, repo resolution + session.py -- session info, rate limits + validation.py -- GitHub URL validation, metadata + indexing.py -- anonymous indexing start + status + helpers.py -- shared constants and utilities +""" +from fastapi import APIRouter + +from routes.playground.helpers import load_demo_repos +from routes.playground.search import router as search_router +from routes.playground.session import router as session_router +from routes.playground.validation import router as validation_router +from routes.playground.indexing import router as indexing_router + +# Re-export for main.py: from routes.playground import router, load_demo_repos +router = APIRouter(prefix="/playground", tags=["Playground"]) +router.include_router(session_router) +router.include_router(search_router) +router.include_router(validation_router) +router.include_router(indexing_router) + +# Re-export DEMO_REPO_IDS for tests that reference it +from routes.playground.helpers import DEMO_REPO_IDS + +__all__ = ["router", "load_demo_repos", "DEMO_REPO_IDS"] diff --git a/backend/routes/playground/helpers.py b/backend/routes/playground/helpers.py new file mode 100644 index 0000000..4f04517 --- /dev/null +++ b/backend/routes/playground/helpers.py @@ -0,0 +1,80 @@ +""" +Shared helpers and constants for playground routes. + +All playground sub-modules import from here to avoid circular deps. 
+""" +import os +import re +from typing import Optional +from fastapi import Request, Response + +from dependencies import repo_manager, redis_client +from services.observability import logger +from services.playground_limiter import PlaygroundLimiter, get_playground_limiter + +# Demo repo mapping (populated on startup via load_demo_repos) +DEMO_REPO_IDS = {} + +# Session cookie config +SESSION_COOKIE_NAME = "pg_session" +SESSION_COOKIE_MAX_AGE = 86400 # 24 hours +IS_PRODUCTION = os.getenv("ENVIRONMENT", "development").lower() == "production" + +# GitHub validation config +GITHUB_URL_PATTERN = re.compile( + r"^https?://github\.com/(?P[a-zA-Z0-9_.-]+)/(?P[a-zA-Z0-9_.-]+)/?$" +) +ANONYMOUS_FILE_LIMIT = 200 +GITHUB_API_BASE = "https://api.github.com" +GITHUB_API_TIMEOUT = 10.0 +VALIDATION_CACHE_TTL = 300 # 5 minutes + + +async def load_demo_repos() -> None: + """Load pre-indexed demo repos. Called from main.py on startup.""" + try: + repos = repo_manager.list_repos() + for repo in repos: + name_lower = repo.get("name", "").lower() + if "flask" in name_lower: + DEMO_REPO_IDS["flask"] = repo["id"] + elif "fastapi" in name_lower: + DEMO_REPO_IDS["fastapi"] = repo["id"] + elif "express" in name_lower: + DEMO_REPO_IDS["express"] = repo["id"] + elif "react" in name_lower: + DEMO_REPO_IDS["react"] = repo["id"] + logger.info("Loaded demo repos", repos=list(DEMO_REPO_IDS.keys())) + except Exception as e: + logger.warning("Could not load demo repos", error=str(e)) + + +def get_client_ip(req: Request) -> str: + """Extract client IP from request.""" + client_ip = req.client.host if req.client else "unknown" + forwarded = req.headers.get("x-forwarded-for") + if forwarded: + client_ip = forwarded.split(",")[0].strip() + return client_ip + + +def get_session_token(req: Request) -> Optional[str]: + """Get session token from cookie.""" + return req.cookies.get(SESSION_COOKIE_NAME) + + +def set_session_cookie(response: Response, token: str) -> None: + """Set httpOnly session cookie.""" 
+ response.set_cookie( + key=SESSION_COOKIE_NAME, + value=token, + max_age=SESSION_COOKIE_MAX_AGE, + httponly=True, + samesite="lax", + secure=IS_PRODUCTION, + ) + + +def get_limiter() -> PlaygroundLimiter: + """Get the playground limiter instance.""" + return get_playground_limiter(redis_client) diff --git a/backend/routes/playground/indexing.py b/backend/routes/playground/indexing.py new file mode 100644 index 0000000..fb50dc5 --- /dev/null +++ b/backend/routes/playground/indexing.py @@ -0,0 +1,254 @@ +"""Anonymous indexing routes for the playground.""" +import time +from typing import Optional +from datetime import datetime, timezone +from fastapi import APIRouter, HTTPException, Request, Response, BackgroundTasks +from pydantic import BaseModel, field_validator + +from dependencies import indexer, redis_client +from services.observability import logger +from services.anonymous_indexer import AnonymousIndexingJob, run_indexing_job +from routes.playground.helpers import ( + ANONYMOUS_FILE_LIMIT, + get_client_ip, get_session_token, set_session_cookie, get_limiter, +) +from routes.playground.validation import ( + parse_github_url, fetch_repo_metadata, count_code_files, +) + +router = APIRouter() + + +class IndexRepoRequest(BaseModel): + """Request body for anonymous repository indexing.""" + github_url: str + branch: Optional[str] = None + partial: bool = False + + @field_validator("github_url") + @classmethod + def validate_github_url_format(cls, v: str) -> str: + v = v.strip() + if not v: + raise ValueError("GitHub URL is required") + if not v.startswith(("http://", "https://")): + raise ValueError("URL must start with http:// or https://") + if "github.com" not in v.lower(): + raise ValueError("URL must be a GitHub repository URL") + return v + + +@router.post("/index", status_code=202) +async def start_anonymous_indexing( + request: IndexRepoRequest, + req: Request, + response: Response, + background_tasks: BackgroundTasks, +) -> dict: + """Start indexing a 
public GitHub repository for anonymous users.""" + start_time = time.time() + limiter = get_limiter() + + # Session validation + session_token = get_session_token(req) + client_ip = get_client_ip(req) + + if not session_token: + session_token = limiter._generate_session_token() + limiter.create_session(session_token) + set_session_cookie(response, session_token) + logger.info("Created new session for indexing", + session_token=session_token[:8], client_ip=client_ip) + + # Check if session already has an indexed repo + session_data = limiter.get_session_data(session_token) + + if session_data.indexed_repo: + expires_at_str = session_data.indexed_repo.get("expires_at", "") + is_expired = False + if expires_at_str: + try: + expires_at = datetime.fromisoformat(expires_at_str.replace("Z", "+00:00")) + # Ensure timezone-aware comparison + if expires_at.tzinfo is None: + expires_at = expires_at.replace(tzinfo=timezone.utc) + is_expired = datetime.now(timezone.utc) > expires_at + except (ValueError, AttributeError, TypeError): + is_expired = True + + if not is_expired: + logger.info("Session already has indexed repo", + session_token=session_token[:8], + existing_repo=session_data.indexed_repo.get("repo_id")) + raise HTTPException( + status_code=409, + detail={ + "error": "already_indexed", + "message": "You already have an indexed repository. 
Only 1 repo per session allowed.", + "indexed_repo": session_data.indexed_repo, + } + ) + else: + logger.info("Existing indexed repo expired, allowing new indexing", + session_token=session_token[:8]) + + # Validate GitHub URL + owner, repo_name, parse_error = parse_github_url(request.github_url) + if parse_error: + raise HTTPException(status_code=400, detail={ + "error": "validation_failed", "reason": "invalid_url", "message": parse_error + }) + + metadata = await fetch_repo_metadata(owner, repo_name) + if "error" in metadata: + error_type = metadata["error"] + if error_type == "not_found": + raise HTTPException(status_code=400, detail={ + "error": "validation_failed", "reason": "not_found", + "message": "Repository not found. Check the URL or ensure it's public." + }) + elif error_type == "rate_limited": + raise HTTPException(status_code=429, detail={ + "error": "github_rate_limit", "message": "GitHub API rate limit exceeded. Try again later." + }) + else: + raise HTTPException(status_code=502, detail={ + "error": "github_error", "message": metadata.get("message", "Failed to fetch repository info") + }) + + if metadata.get("private", False): + raise HTTPException(status_code=400, detail={ + "error": "validation_failed", "reason": "private", + "message": "This repository is private. Anonymous indexing only supports public repositories." 
+ }) + + branch = request.branch or metadata.get("default_branch", "main") + file_count, count_error = await count_code_files(owner, repo_name, branch) + + if count_error == "truncated": + repo_size_kb = metadata.get("size", 0) + file_count = max(repo_size_kb // 3, ANONYMOUS_FILE_LIMIT + 1) + elif count_error: + repo_size_kb = metadata.get("size", 0) + file_count = max(repo_size_kb // 3, 1) + + is_partial = False + files_to_index = file_count + + if file_count > ANONYMOUS_FILE_LIMIT: + if request.partial: + is_partial = True + files_to_index = ANONYMOUS_FILE_LIMIT + logger.info("Partial indexing enabled", total_files=file_count, indexing=files_to_index) + else: + raise HTTPException(status_code=400, detail={ + "error": "validation_failed", "reason": "too_large", + "message": f"Repository has {file_count:,} code files. " + f"Anonymous limit is {ANONYMOUS_FILE_LIMIT}. " + f"Use partial=true to index first {ANONYMOUS_FILE_LIMIT} files.", + "file_count": file_count, "limit": ANONYMOUS_FILE_LIMIT, + "hint": "Set partial=true to index a subset of files", + }) + + # Create job and start background indexing + response_time_ms = int((time.time() - start_time) * 1000) + if not redis_client: + raise HTTPException(status_code=503, detail="Indexing service unavailable (Redis down)") + job_manager = AnonymousIndexingJob(redis_client) + job_id = job_manager.generate_job_id() + + job_manager.create_job( + job_id=job_id, session_id=session_token, github_url=request.github_url, + owner=owner, repo_name=repo_name, branch=branch, + file_count=file_count, is_partial=is_partial, max_files=files_to_index, + ) + + background_tasks.add_task( + run_indexing_job, + job_manager=job_manager, indexer=indexer, limiter=limiter, + job_id=job_id, session_id=session_token, github_url=request.github_url, + owner=owner, repo_name=repo_name, branch=branch, + file_count=files_to_index, max_files=files_to_index if is_partial else None, + ) + + logger.info("Indexing job queued", job_id=job_id, 
owner=owner, repo=repo_name, + branch=branch, file_count=files_to_index, is_partial=is_partial, + session_token=session_token[:8], response_time_ms=response_time_ms) + + estimated_seconds = max(10, int(files_to_index * 0.3)) + result = { + "job_id": job_id, "status": "queued", + "estimated_time_seconds": estimated_seconds, "file_count": files_to_index, + "message": f"Indexing started. Poll /playground/index/{job_id} for status.", + } + + if is_partial: + result["partial"] = True + result["total_files"] = file_count + result["message"] = ( + f"Partial indexing started ({files_to_index} of {file_count} files). " + f"Poll /playground/index/{job_id} for status." + ) + + return result + + +@router.get("/index/{job_id}") +async def get_indexing_status(job_id: str, req: Request) -> dict: + """Check the status of an anonymous indexing job.""" + if not job_id or not job_id.startswith("idx_"): + raise HTTPException(status_code=400, detail={ + "error": "invalid_job_id", "message": "Invalid job ID format" + }) + + if not redis_client: + raise HTTPException(status_code=503, detail="Indexing service unavailable (Redis down)") + job_manager = AnonymousIndexingJob(redis_client) + job = job_manager.get_job(job_id) + + if not job: + raise HTTPException(status_code=404, detail={ + "error": "job_not_found", "message": "Job not found or has expired. Jobs expire after 1 hour." + }) + + status = job.get("status", "unknown") + result = { + "job_id": job_id, "status": status, + "created_at": job.get("created_at"), "updated_at": job.get("updated_at"), + "repository": { + "owner": job.get("owner"), "name": job.get("repo_name"), + "branch": job.get("branch"), "github_url": job.get("github_url"), + }, + } + + if job.get("is_partial"): + result["partial"] = True + result["max_files"] = job.get("max_files") + + if status == "queued": + result["message"] = "Job is queued for processing" + elif status == "cloning": + result["message"] = "Cloning repository..." 
+ elif status == "processing": + result["message"] = "Indexing files..." + if job.get("progress"): + progress = job["progress"] + files_processed = progress.get("files_processed", 0) + files_total = progress.get("files_total", 1) + percent = round((files_processed / files_total) * 100) if files_total > 0 else 0 + result["progress"] = { + "files_processed": files_processed, "files_total": files_total, + "functions_found": progress.get("functions_found", 0), + "percent_complete": percent, "current_file": progress.get("current_file"), + } + elif status == "completed": + result["message"] = "Indexing completed successfully" + result["repo_id"] = job.get("repo_id") + if job.get("stats"): + result["stats"] = job["stats"] + elif status == "failed": + result["message"] = job.get("error_message", "Indexing failed") + result["error"] = job.get("error", "unknown_error") + result["error_message"] = job.get("error_message") + + return result diff --git a/backend/routes/playground/search.py b/backend/routes/playground/search.py new file mode 100644 index 0000000..391b71e --- /dev/null +++ b/backend/routes/playground/search.py @@ -0,0 +1,234 @@ +"""Search route for the playground -- rate-limited, no auth required.""" +import time +from typing import Optional +from fastapi import APIRouter, HTTPException, Request, Response +from pydantic import BaseModel + +from dependencies import indexer, cache, repo_manager +from services.input_validator import InputValidator +from services.observability import logger, capture_exception +from services.playground_limiter import PlaygroundLimiter, IndexedRepoData +from routes.playground.helpers import ( + DEMO_REPO_IDS, + get_client_ip, get_session_token, set_session_cookie, get_limiter, +) + +router = APIRouter() + + +class PlaygroundSearchRequest(BaseModel): + query: str + demo_repo: Optional[str] = None + repo_id: Optional[str] = None + max_results: int = 10 + use_v3: bool = True + include_tests: bool = False + + +def _resolve_repo_id( + 
request: PlaygroundSearchRequest, + limiter: PlaygroundLimiter, + limit_result, + req: Request, +) -> str: + """ + Resolve which repository to search. + Priority: repo_id > demo_repo > default "flask" + """ + if request.repo_id: + repo_id = request.repo_id + if repo_id in DEMO_REPO_IDS.values(): + logger.debug("Search on demo repo via repo_id", repo_id=repo_id[:16]) + return repo_id + return _validate_user_repo_access(repo_id, limiter, limit_result, req) + + demo_name = request.demo_repo or "flask" + repo_id = DEMO_REPO_IDS.get(demo_name) + + if repo_id: + logger.debug("Search on demo repo", demo_name=demo_name) + return repo_id + + repos = repo_manager.list_repos() + indexed_repos = [r for r in repos if r.get("status") == "indexed"] + + if indexed_repos: + fallback_id = indexed_repos[0]["id"] + logger.debug("Using fallback indexed repo", repo_id=fallback_id[:16]) + return fallback_id + + logger.warning("No demo repo available", requested=demo_name) + raise HTTPException(status_code=404, detail=f"Demo repo '{demo_name}' not available") + + +def _validate_user_repo_access( + repo_id: str, + limiter: PlaygroundLimiter, + limit_result, + req: Request, +) -> str: + """Validate that the session owns the requested user-indexed repo.""" + session_token = limit_result.session_token or get_session_token(req) + token_preview = session_token[:8] if session_token else "none" + + if not session_token: + logger.warning("Search denied - no session token", repo_id=repo_id[:16]) + raise HTTPException( + status_code=403, + detail={"error": "access_denied", "message": "You don't have access to this repository"} + ) + + session_data = limiter.get_session_data(session_token) + indexed_repo = session_data.indexed_repo + session_repo_id = indexed_repo.get("repo_id") if indexed_repo else None + + if not indexed_repo or session_repo_id != repo_id: + logger.warning("Search denied - repo not owned by session", + requested_repo_id=repo_id[:16], + session_repo_id=session_repo_id[:16] if 
session_repo_id else "none", + session_token=token_preview) + raise HTTPException( + status_code=403, + detail={"error": "access_denied", "message": "You don't have access to this repository"} + ) + + repo_data = IndexedRepoData.from_dict(indexed_repo) + if repo_data.is_expired(): + logger.warning("Search denied - repo expired", repo_id=repo_id[:16], + expired_at=indexed_repo.get("expires_at"), session_token=token_preview) + raise HTTPException( + status_code=410, + detail={ + "error": "repo_expired", + "message": "Repository index expired. Re-index to continue searching.", + "can_reindex": True, + } + ) + + logger.info("Search on user-indexed repo", repo_id=repo_id[:16], + repo_name=indexed_repo.get("name"), session_token=token_preview) + return repo_id + + +@router.post("/search") +async def playground_search( + request: PlaygroundSearchRequest, + req: Request, + response: Response, +) -> dict: + """Public playground search - rate limited by session/IP.""" + session_token = get_session_token(req) + client_ip = get_client_ip(req) + + limiter = get_limiter() + limit_result = limiter.check_and_record(session_token, client_ip) + + if not limit_result.allowed: + raise HTTPException( + status_code=429, + detail={ + "message": limit_result.reason, + "remaining": 0, + "limit": limit_result.limit, + "resets_at": limit_result.resets_at.isoformat(), + } + ) + + if limit_result.session_token: + set_session_cookie(response, limit_result.session_token) + + valid_query, query_error = InputValidator.validate_search_query(request.query) + if not valid_query: + raise HTTPException(status_code=400, detail=f"Invalid query: {query_error}") + + repo_id = _resolve_repo_id(request, limiter, limit_result, req) + start_time = time.time() + + try: + sanitized_query = InputValidator.sanitize_string(request.query, max_length=200) + cache_key = f"{sanitized_query}:v3={request.use_v3}:tests={request.include_tests}" + + cached_results = cache.get_search_results(cache_key, repo_id) + if 
cached_results is not None: + return { + "results": cached_results, "count": len(cached_results), + "cached": True, "remaining_searches": limit_result.remaining, + "limit": limit_result.limit, + } + + if request.use_v3: + search_results = await indexer.search_v3( + query=sanitized_query, repo_id=repo_id, + top_k=min(request.max_results, 10), + include_tests=request.include_tests, use_reranking=True, + ) + else: + search_results = await indexer.search_v2( + query=sanitized_query, repo_id=repo_id, + top_k=min(request.max_results, 10), use_reranking=True, + ) + + results = [] + for r in search_results: + results.append({ + "name": r.get("name", ""), + "qualified_name": r.get("qualified_name", r.get("name", "")), + "file_path": r.get("file_path", ""), + "code": r.get("code", ""), + "signature": r.get("signature", ""), + "language": r.get("language", ""), + "score": r.get("score", 0), + "line_start": r.get("line_start", 0), + "line_end": r.get("line_end", 0), + "type": "function", + "summary": r.get("summary"), + "class_name": r.get("class_name"), + "is_test_file": r.get("is_test_file", False), + }) + + cache.set_search_results(cache_key, repo_id, results, ttl=3600) + search_time = int((time.time() - start_time) * 1000) + + return { + "results": results, "count": len(results), "cached": False, + "remaining_searches": limit_result.remaining, "limit": limit_result.limit, + "search_time_ms": search_time, + "search_version": "v3" if request.use_v3 else "v2", + } + except HTTPException: + raise + except Exception as e: + capture_exception(e, operation="playground_search") + logger.error("Playground search failed", error=str(e)) + raise HTTPException(status_code=500, detail="Search failed") + + +@router.get("/repos") +async def list_playground_repos() -> dict: + """List available demo repositories.""" + return { + "repos": [ + { + "id": "flask", "name": "Flask", + "description": "Python web framework", + "available": "flask" in DEMO_REPO_IDS, + }, + { + "id": "fastapi", 
"name": "FastAPI", + "description": "Modern Python API", + "available": "fastapi" in DEMO_REPO_IDS, + }, + { + "id": "express", "name": "Express", + "description": "Node.js framework", + "available": "express" in DEMO_REPO_IDS, + }, + ] + } + + +@router.get("/stats") +async def get_playground_stats() -> dict: + """Get playground usage stats (for monitoring/debugging).""" + limiter = get_limiter() + return limiter.get_usage_stats() diff --git a/backend/routes/playground/session.py b/backend/routes/playground/session.py new file mode 100644 index 0000000..220b322 --- /dev/null +++ b/backend/routes/playground/session.py @@ -0,0 +1,68 @@ +"""Session and rate limit routes for the playground.""" +from fastapi import APIRouter, HTTPException, Request, Response + +from dependencies import redis_client +from services.observability import logger +from routes.playground.helpers import ( + get_client_ip, get_session_token, set_session_cookie, get_limiter, +) + +router = APIRouter() + + +@router.get("/limits") +async def get_playground_limits(req: Request) -> dict: + """ + Get current rate limit status for this user. + + Frontend should call this on page load to show accurate remaining count. + """ + session_token = get_session_token(req) + client_ip = get_client_ip(req) + + limiter = get_limiter() + result = limiter.check_limit(session_token, client_ip) + + return { + "remaining": result.remaining, + "limit": result.limit, + "resets_at": result.resets_at.isoformat(), + "tier": "anonymous", + } + + +@router.get("/session") +async def get_session_info(req: Request, response: Response) -> dict: + """ + Get current session state including indexed repo info. + + Creates a new session if none exists. Returns complete session data + for frontend state management. 
+ """ + session_token = get_session_token(req) + limiter = get_limiter() + + if not redis_client: + logger.error("Redis unavailable for session endpoint") + raise HTTPException( + status_code=503, + detail={"message": "Service temporarily unavailable", "retry_after": 30} + ) + + session_data = limiter.get_session_data(session_token) + + if session_data.session_id is None: + new_token = limiter._generate_session_token() + + if limiter.create_session(new_token): + set_session_cookie(response, new_token) + session_data = limiter.get_session_data(new_token) + logger.info("Created new session via /session endpoint", + session_token=new_token[:8]) + else: + raise HTTPException( + status_code=503, + detail={"message": "Failed to create session", "retry_after": 30} + ) + + return session_data.to_response(limit=limiter.SESSION_LIMIT_PER_DAY) diff --git a/backend/routes/playground/validation.py b/backend/routes/playground/validation.py new file mode 100644 index 0000000..651180f --- /dev/null +++ b/backend/routes/playground/validation.py @@ -0,0 +1,187 @@ +"""GitHub repository validation for the playground.""" +import os +import time +from typing import Optional +import httpx +from fastapi import APIRouter, HTTPException +from pydantic import BaseModel, field_validator + +from dependencies import cache +from services.observability import logger +from services.repo_validator import RepoValidator +from routes.playground.helpers import ( + GITHUB_URL_PATTERN, GITHUB_API_BASE, GITHUB_API_TIMEOUT, + ANONYMOUS_FILE_LIMIT, VALIDATION_CACHE_TTL, +) + +router = APIRouter() + + +class ValidateRepoRequest(BaseModel): + """Request body for GitHub repo validation.""" + github_url: str + + @field_validator("github_url") + @classmethod + def validate_github_url_format(cls, v: str) -> str: + v = v.strip() + if not v: + raise ValueError("GitHub URL is required") + if not v.startswith(("http://", "https://")): + raise ValueError("URL must start with http:// or https://") + if "github.com" not 
in v.lower(): + raise ValueError("URL must be a GitHub repository URL") + return v + + +def parse_github_url(url: str) -> tuple[Optional[str], Optional[str], Optional[str]]: + """Parse GitHub URL to extract owner and repo. Returns (owner, repo, error).""" + match = GITHUB_URL_PATTERN.match(url.strip().rstrip("/")) + if not match: + return None, None, "Invalid GitHub URL format. Expected: https://github.com/owner/repo" + return match.group("owner"), match.group("repo"), None + + +def _github_headers() -> dict: + """Build GitHub API request headers with optional auth token.""" + headers = {"Accept": "application/vnd.github.v3+json", "User-Agent": "OpenCodeIntel/1.0"} + github_token = os.getenv("GITHUB_TOKEN") + if github_token: + headers["Authorization"] = f"token {github_token}" + return headers + + +async def fetch_repo_metadata(owner: str, repo: str) -> dict: + """Fetch repository metadata from GitHub API.""" + url = f"{GITHUB_API_BASE}/repos/{owner}/{repo}" + + async with httpx.AsyncClient(timeout=GITHUB_API_TIMEOUT) as client: + try: + response = await client.get(url, headers=_github_headers()) + if response.status_code == 404: + return {"error": "not_found", "message": "Repository not found"} + if response.status_code == 403: + return {"error": "rate_limited", "message": "GitHub API rate limit exceeded"} + if response.status_code != 200: + return {"error": "api_error", "message": f"GitHub API error: {response.status_code}"} + return response.json() + except httpx.TimeoutException: + return {"error": "timeout", "message": "GitHub API request timed out"} + except Exception as e: + logger.error("GitHub API request failed", error=str(e)) + return {"error": "request_failed", "message": "Failed to fetch repository metadata"} + + +async def count_code_files( + owner: str, repo: str, default_branch: str +) -> tuple[int, Optional[str]]: + """Count code files using GitHub tree API. 
@router.post("/validate-repo")
async def validate_github_repo(request: ValidateRepoRequest) -> dict:
    """Validate a GitHub repository URL for anonymous indexing.

    Steps, in order:

    1. Return a previously cached verdict for this exact URL, if any.
    2. Parse the URL into ``owner`` / ``repo_name``.
    3. Fetch repository metadata; reject missing or private repositories.
    4. Count code files on the default branch (falling back to a size-based
       estimate when the tree is truncated or the count fails).
    5. Compare the count against ``ANONYMOUS_FILE_LIMIT`` and build the
       response, caching it for ``VALIDATION_CACHE_TTL`` seconds.

    Args:
        request: Request body carrying ``github_url``.

    Returns:
        A dict that always contains ``valid``. Invalid URLs and unknown
        repositories yield ``{"valid": False, "reason", "message"}``.
        Private repositories yield ``valid=True`` with ``can_index=False``
        and ``reason="private"``. Public repositories yield the full
        metadata summary (``file_count``, ``size_kb``, ``language``,
        ``stars``, ``can_index``, ``response_time_ms``, ``message``) plus
        ``reason="too_large"`` and ``limit`` when over the anonymous limit.

    Raises:
        HTTPException: 429 when the metadata lookup reports
            ``rate_limited``; 502 for any other metadata error except
            ``not_found``.
    """
    # perf_counter is monotonic, so the reported latency cannot go negative
    # or jump if the wall clock is adjusted while the request is in flight.
    started = time.perf_counter()

    cache_key = f"validate:{request.github_url}"
    cached = cache.get(cache_key) if cache else None
    if cached:
        logger.info("Returning cached validation", url=request.github_url[:50])
        return cached

    owner, repo_name, parse_error = parse_github_url(request.github_url)
    if parse_error:
        return {"valid": False, "reason": "invalid_url", "message": parse_error}

    metadata = await fetch_repo_metadata(owner, repo_name)
    if "error" in metadata:
        error_type = metadata["error"]
        if error_type == "not_found":
            return {
                "valid": False,
                "reason": "not_found",
                "message": "Repository not found. Check the URL or ensure it's public.",
            }
        if error_type == "rate_limited":
            raise HTTPException(
                status_code=429,
                detail={"message": "GitHub API rate limit exceeded. Try again later."},
            )
        # Any other upstream failure is surfaced as a bad-gateway error.
        raise HTTPException(
            status_code=502,
            detail={"message": metadata.get("message", "Failed to fetch repository info")},
        )

    if metadata.get("private", False):
        return {
            "valid": True,
            "repo_name": repo_name,
            "owner": owner,
            "is_public": False,
            "can_index": False,
            "reason": "private",
            "message": "This repository is private. Anonymous indexing only supports public repositories.",
        }

    default_branch = metadata.get("default_branch", "main")
    repo_size_kb = metadata.get("size", 0)
    file_count, count_error = await count_code_files(owner, repo_name, default_branch)

    # A transient counting failure produces only a rough guess; remember that
    # so the guess is not pinned in the cache below.
    count_failed_transiently = False
    if count_error == "truncated":
        # The tree listing was cut off, so the repo is large: use the
        # size-based estimate but never report it as within the limit.
        file_count = max(repo_size_kb // 3, ANONYMOUS_FILE_LIMIT + 1)
        logger.info(
            "Using estimated file count for large repo",
            owner=owner, repo=repo_name, estimated=file_count,
        )
    elif count_error:
        count_failed_transiently = True
        logger.warning("Could not count files", owner=owner, repo=repo_name, error=count_error)
        # Best-effort fallback: size-based estimate, at least one file.
        file_count = max(repo_size_kb // 3, 1)

    response_time_ms = int((time.perf_counter() - started) * 1000)
    can_index = file_count <= ANONYMOUS_FILE_LIMIT

    result = {
        "valid": True,
        "repo_name": repo_name,
        "owner": owner,
        "is_public": True,
        "default_branch": default_branch,
        "file_count": file_count,
        "size_kb": repo_size_kb,
        "language": metadata.get("language"),
        "stars": metadata.get("stargazers_count", 0),
        "can_index": can_index,
        "response_time_ms": response_time_ms,
    }

    if can_index:
        result["message"] = "Ready to index"
    else:
        result["reason"] = "too_large"
        result["message"] = (
            f"Repository has {file_count:,} code files. "
            f"Anonymous limit is {ANONYMOUS_FILE_LIMIT}."
        )
        result["limit"] = ANONYMOUS_FILE_LIMIT

    # Only cache verdicts based on an exact count or on the deterministic
    # "truncated" signal. Caching a fallback estimate would keep a possibly
    # wrong can_index answer alive for the whole TTL after a momentary
    # GitHub timeout or rate limit.
    if cache and not count_failed_transiently:
        cache.set(cache_key, result, ttl=VALIDATION_CACHE_TTL)

    logger.info(
        "Validated GitHub repo",
        owner=owner, repo=repo_name,
        file_count=file_count, can_index=can_index, response_time_ms=response_time_ms,
    )
    return result
-330,9 +329,9 @@ def test_too_large_repo_without_partial_returns_400( assert detail["reason"] == "too_large" assert "partial" in detail.get("hint", "").lower() - @patch('routes.playground._fetch_repo_metadata') - @patch('routes.playground._count_code_files') - @patch('routes.playground.AnonymousIndexingJob') + @patch('routes.playground.indexing.fetch_repo_metadata') + @patch('routes.playground.indexing.count_code_files') + @patch('routes.playground.indexing.AnonymousIndexingJob') def test_large_repo_with_partial_returns_202( self, mock_job_class, mock_count, mock_metadata, client ): @@ -364,9 +363,9 @@ def test_large_repo_with_partial_returns_202( assert data["partial"] is True assert data["file_count"] == ANONYMOUS_FILE_LIMIT # Capped at 200 - @patch('routes.playground._fetch_repo_metadata') - @patch('routes.playground._count_code_files') - @patch('routes.playground.AnonymousIndexingJob') + @patch('routes.playground.indexing.fetch_repo_metadata') + @patch('routes.playground.indexing.count_code_files') + @patch('routes.playground.indexing.AnonymousIndexingJob') def test_valid_request_returns_202_with_job_id( self, mock_job_class, mock_count, mock_metadata, client ): @@ -394,7 +393,7 @@ def test_valid_request_returns_202_with_job_id( assert data["status"] == "queued" assert "estimated_time_seconds" in data - @patch('routes.playground._fetch_repo_metadata') + @patch('routes.playground.indexing.fetch_repo_metadata') def test_repo_not_found_returns_400(self, mock_metadata, client): """Repository not found returns 400.""" mock_metadata.return_value = {"error": "not_found"} @@ -407,7 +406,7 @@ def test_repo_not_found_returns_400(self, mock_metadata, client): assert response.status_code == 400 assert response.json()["detail"]["reason"] == "not_found" - @patch('routes.playground._fetch_repo_metadata') + @patch('routes.playground.indexing.fetch_repo_metadata') def test_github_rate_limit_returns_429(self, mock_metadata, client): """GitHub rate limit returns 429.""" 
mock_metadata.return_value = {"error": "rate_limited"} @@ -422,6 +421,7 @@ def test_github_rate_limit_returns_429(self, mock_metadata, client): # SESSION CONFLICT TESTS +@patch('routes.playground.indexing.redis_client', MagicMock()) class TestSessionConflict: """Tests for session-already-has-repo behavior.""" @@ -431,9 +431,9 @@ def client(self): from main import app return TestClient(app) - @patch('routes.playground._fetch_repo_metadata') - @patch('routes.playground._count_code_files') - @patch('routes.playground._get_limiter') + @patch('routes.playground.indexing.fetch_repo_metadata') + @patch('routes.playground.indexing.count_code_files') + @patch('routes.playground.indexing.get_limiter') def test_session_with_existing_repo_returns_409( self, mock_get_limiter, mock_count, mock_metadata, client ): @@ -464,10 +464,10 @@ def test_session_with_existing_repo_returns_409( assert response.status_code == 409 assert response.json()["detail"]["error"] == "already_indexed" - @patch('routes.playground._fetch_repo_metadata') - @patch('routes.playground._count_code_files') - @patch('routes.playground._get_limiter') - @patch('routes.playground.AnonymousIndexingJob') + @patch('routes.playground.indexing.fetch_repo_metadata') + @patch('routes.playground.indexing.count_code_files') + @patch('routes.playground.indexing.get_limiter') + @patch('routes.playground.indexing.AnonymousIndexingJob') def test_expired_repo_allows_new_indexing( self, mock_job_class, mock_get_limiter, mock_count, mock_metadata, client ): @@ -506,6 +506,7 @@ def test_expired_repo_allows_new_indexing( # STATUS ENDPOINT TESTS (GET /playground/index/{job_id}) +@patch('routes.playground.indexing.redis_client', MagicMock(get=MagicMock(return_value=None))) class TestStatusEndpoint: """Tests for GET /playground/index/{job_id} status endpoint.""" @@ -528,7 +529,7 @@ def test_job_not_found_returns_404(self, client): assert response.status_code == 404 assert response.json()["detail"]["error"] == "job_not_found" - 
@patch('routes.playground.AnonymousIndexingJob') + @patch('routes.playground.indexing.AnonymousIndexingJob') def test_queued_job_returns_status(self, mock_job_class, client): """Queued job returns correct status.""" mock_job_manager = MagicMock() @@ -551,7 +552,7 @@ def test_queued_job_returns_status(self, mock_job_class, client): assert data["status"] == "queued" assert data["message"] == "Job is queued for processing" - @patch('routes.playground.AnonymousIndexingJob') + @patch('routes.playground.indexing.AnonymousIndexingJob') def test_processing_job_returns_progress(self, mock_job_class, client): """Processing job returns progress info.""" mock_job_manager = MagicMock() @@ -581,7 +582,7 @@ def test_processing_job_returns_progress(self, mock_job_class, client): assert data["progress"]["files_processed"] == 50 assert data["progress"]["percent_complete"] == 50 - @patch('routes.playground.AnonymousIndexingJob') + @patch('routes.playground.indexing.AnonymousIndexingJob') def test_completed_job_returns_repo_id(self, mock_job_class, client): """Completed job returns repo_id and stats.""" mock_job_manager = MagicMock() @@ -611,7 +612,7 @@ def test_completed_job_returns_repo_id(self, mock_job_class, client): assert data["repo_id"] == "anon_idx_test123456" assert data["stats"]["files_processed"] == 100 - @patch('routes.playground.AnonymousIndexingJob') + @patch('routes.playground.indexing.AnonymousIndexingJob') def test_failed_job_returns_error(self, mock_job_class, client): """Failed job returns error details.""" mock_job_manager = MagicMock() @@ -637,7 +638,7 @@ def test_failed_job_returns_error(self, mock_job_class, client): assert data["error"] == "clone_failed" assert "not found" in data["error_message"].lower() - @patch('routes.playground.AnonymousIndexingJob') + @patch('routes.playground.indexing.AnonymousIndexingJob') def test_partial_job_includes_partial_info(self, mock_job_class, client): """Partial indexing job includes partial flag.""" mock_job_manager = 
MagicMock() @@ -675,8 +676,8 @@ def test_partial_job_includes_partial_info(self, mock_job_class, client): class TestSearchUserRepos: """Tests for searching user-indexed repositories.""" - @patch('routes.playground._get_limiter') - @patch('routes.playground.indexer') + @patch('routes.playground.search.get_limiter') + @patch('routes.playground.search.indexer') def test_search_with_repo_id_user_owns(self, mock_indexer, mock_get_limiter, client): """User can search their own indexed repo via repo_id.""" mock_limiter = MagicMock() @@ -712,7 +713,7 @@ def test_search_with_repo_id_user_owns(self, mock_indexer, mock_get_limiter, cli data = response.json() assert data["count"] == 1 - @patch('routes.playground._get_limiter') + @patch('routes.playground.search.get_limiter') def test_search_repo_id_not_owned_returns_403(self, mock_get_limiter, client): """Searching repo_id user doesn't own returns 403.""" mock_limiter = MagicMock() @@ -744,7 +745,7 @@ def test_search_repo_id_not_owned_returns_403(self, mock_get_limiter, client): data = response.json() assert data["detail"]["error"] == "access_denied" - @patch('routes.playground._get_limiter') + @patch('routes.playground.search.get_limiter') def test_search_repo_id_no_session_repo_returns_403(self, mock_get_limiter, client): """Searching repo_id when session has no indexed repo returns 403.""" mock_limiter = MagicMock() @@ -765,7 +766,7 @@ def test_search_repo_id_no_session_repo_returns_403(self, mock_get_limiter, clie assert response.status_code == 403 - @patch('routes.playground._get_limiter') + @patch('routes.playground.search.get_limiter') def test_search_expired_repo_returns_410(self, mock_get_limiter, client): """Searching expired repo returns 410 with can_reindex hint.""" mock_limiter = MagicMock() @@ -798,8 +799,8 @@ def test_search_expired_repo_returns_410(self, mock_get_limiter, client): assert data["detail"]["error"] == "repo_expired" assert data["detail"]["can_reindex"] is True - 
@patch('routes.playground._get_limiter') - @patch('routes.playground.indexer') + @patch('routes.playground.search.get_limiter') + @patch('routes.playground.search.indexer') def test_search_demo_repo_via_repo_id_allowed(self, mock_indexer, mock_get_limiter, client): """Demo repos can be accessed via repo_id without ownership check.""" mock_limiter = MagicMock() @@ -813,7 +814,7 @@ def test_search_demo_repo_via_repo_id_allowed(self, mock_indexer, mock_get_limit mock_indexer.semantic_search = AsyncMock(return_value=[]) # Use the flask demo repo ID - from routes.playground import DEMO_REPO_IDS + from routes.playground.helpers import DEMO_REPO_IDS flask_repo_id = DEMO_REPO_IDS.get("flask") if flask_repo_id: @@ -823,8 +824,8 @@ def test_search_demo_repo_via_repo_id_allowed(self, mock_indexer, mock_get_limit ) assert response.status_code == 200 - @patch('routes.playground._get_limiter') - @patch('routes.playground.indexer') + @patch('routes.playground.search.get_limiter') + @patch('routes.playground.search.indexer') def test_search_backward_compat_demo_repo(self, mock_indexer, mock_get_limiter, client): """Backward compat: demo_repo parameter still works.""" mock_limiter = MagicMock() @@ -845,8 +846,8 @@ def test_search_backward_compat_demo_repo(self, mock_indexer, mock_get_limiter, # Should work (200) or 404 if flask not indexed - but not 4xx auth error assert response.status_code in [200, 404] - @patch('routes.playground._get_limiter') - @patch('routes.playground.indexer') + @patch('routes.playground.search.get_limiter') + @patch('routes.playground.search.indexer') def test_search_default_to_flask_when_no_repo_specified(self, mock_indexer, mock_get_limiter, client): """When neither repo_id nor demo_repo provided, defaults to flask.""" mock_limiter = MagicMock() diff --git a/backend/tests/test_validate_repo.py b/backend/tests/test_validate_repo.py index 59df0a9..3603e75 100644 --- a/backend/tests/test_validate_repo.py +++ b/backend/tests/test_validate_repo.py @@ -8,12 
+8,8 @@ from unittest.mock import AsyncMock, patch, MagicMock # Import directly - conftest.py handles external service mocking -from routes.playground import ( - _parse_github_url, - GITHUB_URL_PATTERN, - ANONYMOUS_FILE_LIMIT, - ValidateRepoRequest, -) +from routes.playground.validation import parse_github_url, ValidateRepoRequest +from routes.playground.helpers import GITHUB_URL_PATTERN, ANONYMOUS_FILE_LIMIT # URL PARSING TESTS @@ -22,25 +18,25 @@ class TestParseGitHubUrl: """Tests for URL parsing.""" def test_valid_https_url(self): - owner, repo, error = _parse_github_url("https://github.com/facebook/react") + owner, repo, error = parse_github_url("https://github.com/facebook/react") assert owner == "facebook" assert repo == "react" assert error is None def test_valid_http_url(self): - owner, repo, error = _parse_github_url("http://github.com/user/repo") + owner, repo, error = parse_github_url("http://github.com/user/repo") assert owner == "user" assert repo == "repo" assert error is None def test_url_with_trailing_slash(self): - owner, repo, error = _parse_github_url("https://github.com/owner/repo/") + owner, repo, error = parse_github_url("https://github.com/owner/repo/") assert owner == "owner" assert repo == "repo" assert error is None def test_url_with_dots_and_dashes(self): - owner, repo, error = _parse_github_url( + owner, repo, error = parse_github_url( "https://github.com/my-org/my.repo-name" ) assert owner == "my-org" @@ -48,25 +44,25 @@ def test_url_with_dots_and_dashes(self): assert error is None def test_invalid_url_wrong_domain(self): - owner, repo, error = _parse_github_url("https://gitlab.com/user/repo") + owner, repo, error = parse_github_url("https://gitlab.com/user/repo") assert owner is None assert repo is None assert "Invalid GitHub URL format" in error def test_invalid_url_no_repo(self): - owner, repo, error = _parse_github_url("https://github.com/justowner") + owner, repo, error = parse_github_url("https://github.com/justowner") assert 
owner is None assert error is not None def test_invalid_url_with_path(self): - owner, repo, error = _parse_github_url( + owner, repo, error = parse_github_url( "https://github.com/owner/repo/tree/main" ) assert owner is None assert error is not None def test_invalid_url_blob_path(self): - owner, repo, error = _parse_github_url( + owner, repo, error = parse_github_url( "https://github.com/owner/repo/blob/main/file.py" ) assert owner is None @@ -132,43 +128,43 @@ class TestFetchRepoMetadata: @pytest.mark.asyncio async def test_repo_not_found(self): """Test handling of 404 response.""" - from routes.playground import _fetch_repo_metadata + from routes.playground.validation import fetch_repo_metadata mock_response = MagicMock() mock_response.status_code = 404 - with patch("routes.playground.httpx.AsyncClient") as mock_client: + with patch("routes.playground.validation.httpx.AsyncClient") as mock_client: mock_instance = AsyncMock() mock_instance.get.return_value = mock_response mock_instance.__aenter__.return_value = mock_instance mock_instance.__aexit__.return_value = None mock_client.return_value = mock_instance - result = await _fetch_repo_metadata("nonexistent", "repo") + result = await fetch_repo_metadata("nonexistent", "repo") assert result["error"] == "not_found" @pytest.mark.asyncio async def test_rate_limited(self): """Test handling of 403 rate limit response.""" - from routes.playground import _fetch_repo_metadata + from routes.playground.validation import fetch_repo_metadata mock_response = MagicMock() mock_response.status_code = 403 - with patch("routes.playground.httpx.AsyncClient") as mock_client: + with patch("routes.playground.validation.httpx.AsyncClient") as mock_client: mock_instance = AsyncMock() mock_instance.get.return_value = mock_response mock_instance.__aenter__.return_value = mock_instance mock_instance.__aexit__.return_value = None mock_client.return_value = mock_instance - result = await _fetch_repo_metadata("user", "repo") + result = await 
fetch_repo_metadata("user", "repo") assert result["error"] == "rate_limited" @pytest.mark.asyncio async def test_successful_fetch(self): """Test successful metadata fetch.""" - from routes.playground import _fetch_repo_metadata + from routes.playground.validation import fetch_repo_metadata mock_response = MagicMock() mock_response.status_code = 200 @@ -182,14 +178,14 @@ async def test_successful_fetch(self): "size": 1024, } - with patch("routes.playground.httpx.AsyncClient") as mock_client: + with patch("routes.playground.validation.httpx.AsyncClient") as mock_client: mock_instance = AsyncMock() mock_instance.get.return_value = mock_response mock_instance.__aenter__.return_value = mock_instance mock_instance.__aexit__.return_value = None mock_client.return_value = mock_instance - result = await _fetch_repo_metadata("user", "repo") + result = await fetch_repo_metadata("user", "repo") assert result["name"] == "repo" assert result["private"] is False assert result["stargazers_count"] == 100 @@ -197,17 +193,17 @@ async def test_successful_fetch(self): @pytest.mark.asyncio async def test_timeout_handling(self): """Test timeout is handled gracefully.""" - from routes.playground import _fetch_repo_metadata + from routes.playground.validation import fetch_repo_metadata import httpx - with patch("routes.playground.httpx.AsyncClient") as mock_client: + with patch("routes.playground.validation.httpx.AsyncClient") as mock_client: mock_instance = AsyncMock() mock_instance.get.side_effect = httpx.TimeoutException("timeout") mock_instance.__aenter__.return_value = mock_instance mock_instance.__aexit__.return_value = None mock_client.return_value = mock_instance - result = await _fetch_repo_metadata("user", "repo") + result = await fetch_repo_metadata("user", "repo") assert result["error"] == "timeout" @@ -219,7 +215,7 @@ class TestCountCodeFiles: @pytest.mark.asyncio async def test_count_python_files(self): """Test counting Python files.""" - from routes.playground import 
_count_code_files + from routes.playground.validation import count_code_files mock_response = MagicMock() mock_response.status_code = 200 @@ -233,21 +229,21 @@ async def test_count_python_files(self): ] } - with patch("routes.playground.httpx.AsyncClient") as mock_client: + with patch("routes.playground.validation.httpx.AsyncClient") as mock_client: mock_instance = AsyncMock() mock_instance.get.return_value = mock_response mock_instance.__aenter__.return_value = mock_instance mock_instance.__aexit__.return_value = None mock_client.return_value = mock_instance - count, error = await _count_code_files("user", "repo", "main") + count, error = await count_code_files("user", "repo", "main") assert count == 2 # Only .py files assert error is None @pytest.mark.asyncio async def test_skip_node_modules(self): """Test that node_modules is skipped.""" - from routes.playground import _count_code_files + from routes.playground.validation import count_code_files mock_response = MagicMock() mock_response.status_code = 200 @@ -260,21 +256,21 @@ async def test_skip_node_modules(self): ] } - with patch("routes.playground.httpx.AsyncClient") as mock_client: + with patch("routes.playground.validation.httpx.AsyncClient") as mock_client: mock_instance = AsyncMock() mock_instance.get.return_value = mock_response mock_instance.__aenter__.return_value = mock_instance mock_instance.__aexit__.return_value = None mock_client.return_value = mock_instance - count, error = await _count_code_files("user", "repo", "main") + count, error = await count_code_files("user", "repo", "main") assert count == 2 # index.js and src/app.js, not node_modules assert error is None @pytest.mark.asyncio async def test_truncated_tree(self): """Test handling of truncated tree response.""" - from routes.playground import _count_code_files + from routes.playground.validation import count_code_files mock_response = MagicMock() mock_response.status_code = 200 @@ -283,21 +279,21 @@ async def test_truncated_tree(self): 
"tree": [] } - with patch("routes.playground.httpx.AsyncClient") as mock_client: + with patch("routes.playground.validation.httpx.AsyncClient") as mock_client: mock_instance = AsyncMock() mock_instance.get.return_value = mock_response mock_instance.__aenter__.return_value = mock_instance mock_instance.__aexit__.return_value = None mock_client.return_value = mock_instance - count, error = await _count_code_files("user", "repo", "main") + count, error = await count_code_files("user", "repo", "main") assert count == -1 assert error == "truncated" @pytest.mark.asyncio async def test_multiple_extensions(self): """Test counting multiple file types.""" - from routes.playground import _count_code_files + from routes.playground.validation import count_code_files mock_response = MagicMock() mock_response.status_code = 200 @@ -313,21 +309,21 @@ async def test_multiple_extensions(self): ] } - with patch("routes.playground.httpx.AsyncClient") as mock_client: + with patch("routes.playground.validation.httpx.AsyncClient") as mock_client: mock_instance = AsyncMock() mock_instance.get.return_value = mock_response mock_instance.__aenter__.return_value = mock_instance mock_instance.__aexit__.return_value = None mock_client.return_value = mock_instance - count, error = await _count_code_files("user", "repo", "main") + count, error = await count_code_files("user", "repo", "main") assert count == 4 # py, js, go, rs assert error is None @pytest.mark.asyncio async def test_skip_git_directory(self): """Test that .git directory is skipped.""" - from routes.playground import _count_code_files + from routes.playground.validation import count_code_files mock_response = MagicMock() mock_response.status_code = 200 @@ -339,14 +335,14 @@ async def test_skip_git_directory(self): ] } - with patch("routes.playground.httpx.AsyncClient") as mock_client: + with patch("routes.playground.validation.httpx.AsyncClient") as mock_client: mock_instance = AsyncMock() mock_instance.get.return_value = 
mock_response mock_instance.__aenter__.return_value = mock_instance mock_instance.__aexit__.return_value = None mock_client.return_value = mock_instance - count, error = await _count_code_files("user", "repo", "main") + count, error = await count_code_files("user", "repo", "main") assert count == 1 # Only app.py assert error is None