From 2f17f3f7c571d81e8aefe617a95a36dbd9f6c9bc Mon Sep 17 00:00:00 2001 From: Devanshu Rajesh Chicholikar Date: Thu, 25 Dec 2025 17:18:52 -0600 Subject: [PATCH 1/5] feat(playground): add POST /index endpoint with validation (#125) Checkpoint 1: Request validation and session management - Add IndexRepoRequest Pydantic model with github_url and branch - Add POST /playground/index endpoint (returns 202) - Session validation: get existing or create new session - Check has_indexed_repo: return 409 if session already has active repo - Handle expired repos: allow re-indexing if existing repo expired - Reuse GitHub validation logic from validate-repo endpoint - Validate: URL format, repo exists, is public, file count <= 200 Next: Add job management and background indexing task --- backend/routes/playground.py | 209 +++++++++++++++++++++++++++++++++++ 1 file changed, 209 insertions(+) diff --git a/backend/routes/playground.py b/backend/routes/playground.py index bb11e96..3283250 100644 --- a/backend/routes/playground.py +++ b/backend/routes/playground.py @@ -64,6 +64,29 @@ def validate_github_url_format(cls, v: str) -> str: return v +class IndexRepoRequest(BaseModel): + """ + Request body for anonymous repository indexing. + + Used by POST /playground/index endpoint (#125). + """ + github_url: str + branch: Optional[str] = None # None = use repo's default branch + + @field_validator("github_url") + @classmethod + def validate_github_url_format(cls, v: str) -> str: + """Basic URL format validation (detailed validation in endpoint).""" + v = v.strip() + if not v: + raise ValueError("GitHub URL is required") + if not v.startswith(("http://", "https://")): + raise ValueError("URL must start with http:// or https://") + if "github.com" not in v.lower(): + raise ValueError("URL must be a GitHub repository URL") + return v + + async def load_demo_repos(): """Load pre-indexed demo repos. Called from main.py on startup.""" # Note: We mutate DEMO_REPO_IDS dict, no need for 'global' statement @@ -591,3 +614,189 @@ async def validate_github_repo(request: ValidateRepoRequest, req: Request): response_time_ms=response_time_ms) return result + + +# ============================================================================= +# Anonymous Indexing Endpoint (#125) +# ============================================================================= + +@router.post("/index", status_code=202) +async def start_anonymous_indexing( + request: IndexRepoRequest, + req: Request, + response: Response +): + """ + Start indexing a public GitHub repository for anonymous users. + + This endpoint validates the repository and queues it for indexing. + Returns a job_id that can be used to poll for status via GET /index/{job_id}. + + Constraints: + - Max 200 code files (anonymous limit) + - 1 repo per session (no concurrent indexing) + - Public repos only + - 24hr TTL on indexed data + + See issue #125 for full specification. + """ + start_time = time.time() + limiter = _get_limiter() + + # --- Step 1: Session validation (get existing or create new) --- + session_token = _get_session_token(req) + client_ip = _get_client_ip(req) + + if not session_token: + # Create new session + session_token = limiter.create_session() + _set_session_cookie(response, session_token) + logger.info("Created new session for indexing", + session_token=session_token[:8], + client_ip=client_ip) + + # --- Step 2: Check if session already has an indexed repo --- + session_data = limiter.get_session_data(session_token) + + if session_data.indexed_repo: + # Check if the existing repo has expired + from datetime import datetime, timezone + + expires_at_str = session_data.indexed_repo.get("expires_at", "") + is_expired = False + + if expires_at_str: + try: + expires_at = datetime.fromisoformat(expires_at_str.replace("Z", "+00:00")) + is_expired = datetime.now(timezone.utc) > expires_at + except (ValueError, AttributeError): + is_expired = True # Treat parse errors as expired + + if not is_expired: + # Session already has a valid indexed repo - return 409 Conflict + logger.info("Session already has indexed repo", + session_token=session_token[:8], + existing_repo=session_data.indexed_repo.get("repo_id")) + + raise HTTPException( + status_code=409, + detail={ + "error": "already_indexed", + "message": "You already have an indexed repository. " + "Only 1 repo per session allowed.", + "indexed_repo": session_data.indexed_repo + } + ) + else: + # Existing repo expired - allow new indexing + logger.info("Existing indexed repo expired, allowing new indexing", + session_token=session_token[:8]) + + # --- Step 3: Validate GitHub URL (reuse existing logic) --- + owner, repo_name, parse_error = _parse_github_url(request.github_url) + if parse_error: + raise HTTPException( + status_code=400, + detail={ + "error": "validation_failed", + "reason": "invalid_url", + "message": parse_error + } + ) + + # Fetch repo metadata from GitHub + metadata = await _fetch_repo_metadata(owner, repo_name) + + if "error" in metadata: + error_type = metadata["error"] + if error_type == "not_found": + raise HTTPException( + status_code=400, + detail={ + "error": "validation_failed", + "reason": "not_found", + "message": "Repository not found. Check the URL or ensure it's public." + } + ) + elif error_type == "rate_limited": + raise HTTPException( + status_code=429, + detail={ + "error": "github_rate_limit", + "message": "GitHub API rate limit exceeded. Try again later." + } + ) + else: + raise HTTPException( + status_code=502, + detail={ + "error": "github_error", + "message": metadata.get("message", "Failed to fetch repository info") + } + ) + + # Check if private + if metadata.get("private", False): + raise HTTPException( + status_code=400, + detail={ + "error": "validation_failed", + "reason": "private", + "message": "This repository is private. " + "Anonymous indexing only supports public repositories." + } + ) + + # Determine branch + branch = request.branch or metadata.get("default_branch", "main") + + # Get file count + file_count, count_error = await _count_code_files(owner, repo_name, branch) + + # Handle truncated tree (very large repo) + if count_error == "truncated": + repo_size_kb = metadata.get("size", 0) + file_count = max(repo_size_kb // 3, ANONYMOUS_FILE_LIMIT + 1) + elif count_error: + repo_size_kb = metadata.get("size", 0) + file_count = max(repo_size_kb // 3, 1) + + # Check file limit + if file_count > ANONYMOUS_FILE_LIMIT: + raise HTTPException( + status_code=400, + detail={ + "error": "validation_failed", + "reason": "too_large", + "message": f"Repository has {file_count:,} code files. " + f"Anonymous limit is {ANONYMOUS_FILE_LIMIT}.", + "file_count": file_count, + "limit": ANONYMOUS_FILE_LIMIT + } + ) + + # --- Validation passed! --- + # Next checkpoint will add: job creation, background task, Redis tracking + + response_time_ms = int((time.time() - start_time) * 1000) + + # TODO: Checkpoint 2 will add actual indexing logic + # For now, return a placeholder to confirm endpoint works + logger.info("Index request validated", + owner=owner, repo=repo_name, branch=branch, + file_count=file_count, session_token=session_token[:8], + response_time_ms=response_time_ms) + + return { + "status": "checkpoint_1_complete", + "message": "Validation passed. Indexing logic will be added in next checkpoint.", + "validated": { + "owner": owner, + "repo_name": repo_name, + "branch": branch, + "file_count": file_count, + "github_url": request.github_url + }, + "session_token": session_token[:8] + "...", + "response_time_ms": response_time_ms + } From d4ec4b4eb423d38f6f11753577116520dfa4ddf9 Mon Sep 17 00:00:00 2001 From: Devanshu Rajesh Chicholikar Date: Thu, 25 Dec 2025 17:24:27 -0600 Subject: [PATCH 2/5] feat(playground): add job manager and background indexing (#125) --- backend/routes/playground.py | 73 ++++-- backend/services/anonymous_indexer.py | 358 ++++++++++++++++++++++++++ 2 files changed, 411 insertions(+), 20 deletions(-) create mode 100644 backend/services/anonymous_indexer.py diff --git a/backend/routes/playground.py b/backend/routes/playground.py index 3283250..6ae39b5 100644 --- a/backend/routes/playground.py +++ b/backend/routes/playground.py @@ -10,7 +10,7 @@ import re import httpx from typing import Optional -from fastapi import APIRouter, HTTPException, Request, Response +from fastapi import APIRouter, HTTPException, Request, Response, BackgroundTasks from pydantic import BaseModel, field_validator import time @@ -19,6 +19,10 @@ from services.repo_validator import RepoValidator from services.observability import logger from services.playground_limiter import PlaygroundLimiter, get_playground_limiter +from services.anonymous_indexer import ( + AnonymousIndexingJob, + run_indexing_job, +) router = APIRouter(prefix="/playground", tags=["Playground"]) @@ -624,7 +628,8 @@ async def validate_github_repo(request: ValidateRepoRequest, req: Request): async def start_anonymous_indexing( request: IndexRepoRequest, req: Request, - response: Response + response: Response, + background_tasks: BackgroundTasks ): """ Start indexing a public GitHub repository for anonymous users. @@ -775,28 +780,56 @@ async def start_anonymous_indexing( } ) - # --- Validation passed! --- - # Next checkpoint will add: job creation, background task, Redis tracking + # --- Validation passed! Create job and start background indexing --- response_time_ms = int((time.time() - start_time) * 1000) - # TODO: Checkpoint 2 will add actual indexing logic - # For now, return a placeholder to confirm endpoint works - logger.info("Index request validated", - owner=owner, repo=repo_name, branch=branch, - file_count=file_count, session_token=session_token[:8], + # Initialize job manager + job_manager = AnonymousIndexingJob(redis_client) + job_id = job_manager.generate_job_id() + + # Create job in Redis + job_manager.create_job( + job_id=job_id, + session_id=session_token, + github_url=request.github_url, + owner=owner, + repo_name=repo_name, + branch=branch, + file_count=file_count + ) + + # Queue background task + background_tasks.add_task( + run_indexing_job, + job_manager=job_manager, + indexer=indexer, + limiter=limiter, + job_id=job_id, + session_id=session_token, + github_url=request.github_url, + owner=owner, + repo_name=repo_name, + branch=branch, + file_count=file_count + ) + + logger.info("Indexing job queued", + job_id=job_id, + owner=owner, + repo=repo_name, + branch=branch, + file_count=file_count, + session_token=session_token[:8], response_time_ms=response_time_ms) + # Estimate time based on file count (~0.3s per file) + estimated_seconds = max(10, int(file_count * 0.3)) + return { - "status": "checkpoint_1_complete", - "message": "Validation passed. Indexing logic will be added in next checkpoint.", - "validated": { - "owner": owner, - "repo_name": repo_name, - "branch": branch, - "file_count": file_count, - "github_url": request.github_url - }, - "session_token": session_token[:8] + "...", - "response_time_ms": response_time_ms + "job_id": job_id, + "status": "queued", + "estimated_time_seconds": estimated_seconds, + "file_count": file_count, + "message": f"Indexing started. Poll /playground/index/{job_id} for status." } diff --git a/backend/services/anonymous_indexer.py b/backend/services/anonymous_indexer.py new file mode 100644 index 0000000..4fa9d5c --- /dev/null +++ b/backend/services/anonymous_indexer.py @@ -0,0 +1,358 @@ +""" +Anonymous Indexing Service (#125) + +Handles job management and background indexing for anonymous users. +Jobs are tracked in Redis with progress updates. +""" +import uuid +import json +import shutil +import asyncio +from pathlib import Path +from datetime import datetime, timezone, timedelta +from typing import Optional +from dataclasses import dataclass, asdict +from enum import Enum + +import git + +from services.observability import logger, metrics, capture_exception + + +class JobStatus(str, Enum): + """Job status values.""" + QUEUED = "queued" + CLONING = "cloning" + PROCESSING = "processing" + COMPLETED = "completed" + FAILED = "failed" + + +@dataclass +class JobProgress: + """Progress tracking for indexing job.""" + files_total: int = 0 + files_processed: int = 0 + functions_found: int = 0 + current_file: Optional[str] = None + + def to_dict(self) -> dict: + return {k: v for k, v in asdict(self).items() if v is not None} + + +@dataclass +class JobStats: + """Final stats for completed job.""" + files_indexed: int = 0 + functions_found: int = 0 + time_taken_seconds: float = 0 + + def to_dict(self) -> dict: + return asdict(self) + + +class AnonymousIndexingJob: + """ + Manages anonymous indexing jobs in Redis. + + Redis key: anon_job:{job_id} + TTL: 1 hour for job metadata + """ + + REDIS_PREFIX = "anon_job:" + JOB_TTL_SECONDS = 3600 # 1 hour for job metadata + REPO_TTL_HOURS = 24 # 24 hours for indexed data + TEMP_DIR = "/tmp/anon_repos" + CLONE_TIMEOUT_SECONDS = 120 # 2 minutes for clone + INDEX_TIMEOUT_SECONDS = 300 # 5 minutes for indexing + + def __init__(self, redis_client): + self.redis = redis_client + # Ensure temp directory exists + Path(self.TEMP_DIR).mkdir(parents=True, exist_ok=True) + + @staticmethod + def generate_job_id() -> str: + """Generate unique job ID.""" + return f"idx_{uuid.uuid4().hex[:12]}" + + @staticmethod + def generate_repo_id(job_id: str) -> str: + """Generate repo ID from job ID (for Pinecone namespace).""" + return f"anon_{job_id.replace('idx_', '')}" + + def _get_key(self, job_id: str) -> str: + """Get Redis key for job.""" + return f"{self.REDIS_PREFIX}{job_id}" + + def create_job( + self, + job_id: str, + session_id: str, + github_url: str, + owner: str, + repo_name: str, + branch: str, + file_count: int + ) -> dict: + """ + Create a new indexing job in Redis. + + Returns the initial job state. + """ + now = datetime.now(timezone.utc) + expires_at = now + timedelta(hours=self.REPO_TTL_HOURS) + + job_data = { + "job_id": job_id, + "session_id": session_id, + "github_url": github_url, + "owner": owner, + "repo_name": repo_name, + "branch": branch, + "file_count": file_count, + "status": JobStatus.QUEUED.value, + "progress": None, + "stats": None, + "repo_id": None, + "error": None, + "error_message": None, + "created_at": now.isoformat(), + "updated_at": now.isoformat(), + "expires_at": expires_at.isoformat(), + } + + if self.redis: + key = self._get_key(job_id) + self.redis.setex(key, self.JOB_TTL_SECONDS, json.dumps(job_data)) + logger.info("Created indexing job", job_id=job_id, session_id=session_id[:8]) + + return job_data + + def get_job(self, job_id: str) -> Optional[dict]: + """Get job data from Redis.""" + if not self.redis: + return None + + key = self._get_key(job_id) + data = self.redis.get(key) + + if not data: + return None + + try: + return json.loads(data) + except json.JSONDecodeError: + logger.error("Invalid job data in Redis", job_id=job_id) + return None + + def update_status( + self, + job_id: str, + status: JobStatus, + progress: Optional[JobProgress] = None, + stats: Optional[JobStats] = None, + repo_id: Optional[str] = None, + error: Optional[str] = None, + error_message: Optional[str] = None + ) -> bool: + """Update job status in Redis.""" + if not self.redis: + return False + + job = self.get_job(job_id) + if not job: + logger.warning("Job not found for update", job_id=job_id) + return False + + job["status"] = status.value + job["updated_at"] = datetime.now(timezone.utc).isoformat() + + if progress: + job["progress"] = progress.to_dict() + if stats: + job["stats"] = stats.to_dict() + if repo_id: + job["repo_id"] = repo_id + if error: + job["error"] = error + job["error_message"] = error_message + + key = self._get_key(job_id) + self.redis.setex(key, self.JOB_TTL_SECONDS, json.dumps(job)) + + return True + + def update_progress( + self, + job_id: str, + files_processed: int, + functions_found: int, + files_total: int, + current_file: Optional[str] = None + ) -> bool: + """Update job progress (called during indexing).""" + progress = JobProgress( + files_total=files_total, + files_processed=files_processed, + functions_found=functions_found, + current_file=current_file + ) + return self.update_status(job_id, JobStatus.PROCESSING, progress=progress) + + def get_temp_path(self, job_id: str) -> Path: + """Get temp directory path for job.""" + return Path(self.TEMP_DIR) / job_id + + def cleanup_temp(self, job_id: str) -> None: + """Clean up temp directory for job.""" + temp_path = self.get_temp_path(job_id) + if temp_path.exists(): + try: + shutil.rmtree(temp_path) + logger.debug("Cleaned up temp directory", job_id=job_id) + except Exception as e: + logger.warning("Failed to cleanup temp", job_id=job_id, error=str(e)) + + +async def run_indexing_job( + job_manager: AnonymousIndexingJob, + indexer, + limiter, + job_id: str, + session_id: str, + github_url: str, + owner: str, + repo_name: str, + branch: str, + file_count: int +) -> None: + """ + Background task to clone and index a repository. + + This runs asynchronously after the endpoint returns. + Updates Redis with progress and final status. + """ + import time + start_time = time.time() + temp_path = job_manager.get_temp_path(job_id) + repo_id = job_manager.generate_repo_id(job_id) + + try: + # --- Step 1: Clone repository --- + job_manager.update_status(job_id, JobStatus.CLONING) + logger.info("Cloning repository", job_id=job_id, url=github_url) + + git_url = f"https://github.com/{owner}/{repo_name}.git" + + # Clone in thread pool (git operations are blocking) + loop = asyncio.get_event_loop() + try: + await asyncio.wait_for( + loop.run_in_executor( + None, + lambda: git.Repo.clone_from( + git_url, + temp_path, + branch=branch, + depth=1, # Shallow clone + single_branch=True + ) + ), + timeout=job_manager.CLONE_TIMEOUT_SECONDS + ) + except asyncio.TimeoutError: + raise Exception("Clone timed out") + except git.GitCommandError as e: + raise Exception(f"Clone failed: {str(e)}") + + logger.info("Clone complete", job_id=job_id) + + # --- Step 2: Index repository --- + job_manager.update_status(job_id, JobStatus.PROCESSING) + + # Progress callback for real-time updates + async def progress_callback(files_processed: int, functions_found: int, total: int): + job_manager.update_progress( + job_id, + files_processed=files_processed, + functions_found=functions_found, + files_total=total + ) + + # Run indexing with timeout + try: + total_functions = await asyncio.wait_for( + indexer.index_repository_with_progress( + repo_id, + str(temp_path), + progress_callback + ), + timeout=job_manager.INDEX_TIMEOUT_SECONDS + ) + except asyncio.TimeoutError: + raise Exception("Indexing timed out") + + # --- Step 3: Mark complete --- + elapsed = time.time() - start_time + stats = JobStats( + files_indexed=file_count, + functions_found=total_functions, + time_taken_seconds=round(elapsed, 2) + ) + + job_manager.update_status( + job_id, + JobStatus.COMPLETED, + stats=stats, + repo_id=repo_id + ) + + # Store in session for search access + job = job_manager.get_job(job_id) + if job and limiter: + limiter.set_indexed_repo(session_id, { + "repo_id": repo_id, + "github_url": github_url, + "name": repo_name, + "file_count": file_count, + "indexed_at": datetime.now(timezone.utc).isoformat(), + "expires_at": job.get("expires_at"), + }) + + metrics.increment("anon_indexing_success") + logger.info("Indexing complete", + job_id=job_id, + repo_id=repo_id, + functions=total_functions, + elapsed=f"{elapsed:.2f}s") + + except Exception as e: + # --- Handle failure --- + error_msg = str(e) + error_type = "indexing_failed" + + if "timed out" in error_msg.lower(): + error_type = "timeout" + elif "clone" in error_msg.lower(): + error_type = "clone_failed" + elif "rate limit" in error_msg.lower(): + error_type = "github_rate_limit" + + job_manager.update_status( + job_id, + JobStatus.FAILED, + error=error_type, + error_message=error_msg + ) + + metrics.increment("anon_indexing_failed") + logger.error("Indexing failed", + job_id=job_id, + error_type=error_type, + error=error_msg) + capture_exception(e, operation="anonymous_indexing", job_id=job_id) + + finally: + # --- Always cleanup --- + job_manager.cleanup_temp(job_id) From 1e1ae2571b96d22d18e488ac398d19942b89f0d5 Mon Sep 17 00:00:00 2001 From: Devanshu Rajesh Chicholikar Date: Thu, 25 Dec 2025 22:43:00 -0600 Subject: [PATCH 3/5] fix(playground): add partial indexing support and fix bugs (#125) - Add partial=true parameter to index first 200 files of large repos - Fix create_session() call - generate token before creating session - Fix JSON serialization in validation error handler - Add missing capture_http_exception function to sentry module - Add comprehensive tests for anonymous indexing (30 tests) All 169 tests passing. --- backend/main.py | 13 +- backend/routes/playground.py | 69 ++- backend/services/anonymous_indexer.py | 15 +- backend/services/indexer_optimized.py | 23 +- backend/services/sentry.py | 62 ++- backend/tests/test_anonymous_indexing.py | 514 +++++++++++++++++++++++ 6 files changed, 645 insertions(+), 51 deletions(-) create mode 100644 backend/tests/test_anonymous_indexing.py diff --git a/backend/main.py b/backend/main.py index 6eccc59..83f4b19 100644 --- a/backend/main.py +++ b/backend/main.py @@ -98,11 +98,22 @@ async def dispatch(self, request: Request, call_next): @app.exception_handler(RequestValidationError) async def validation_exception_handler(request: Request, exc: RequestValidationError): """Handle validation errors with clear messages.""" + # Convert errors to JSON-serializable format + errors = [] + for err in exc.errors(): + error_dict = { + "type": err.get("type"), + "loc": err.get("loc"), + "msg": err.get("msg"), + "input": str(err.get("input")) if err.get("input") is not None else None, + } + errors.append(error_dict) + return JSONResponse( status_code=422, content={ "detail": "Validation error", - "errors": exc.errors() + "errors": errors } ) diff --git a/backend/routes/playground.py b/backend/routes/playground.py index 6ae39b5..778036d 100644 --- a/backend/routes/playground.py +++ b/backend/routes/playground.py @@ -76,6 +76,7 @@ class IndexRepoRequest(BaseModel): """ github_url: str branch: Optional[str] = None # None = use repo's default branch + partial: bool = False # If True, index first 200 files of large repos @field_validator("github_url") @classmethod @@ -653,8 +654,9 @@ async def start_anonymous_indexing( client_ip = _get_client_ip(req) if not session_token: - # Create new session - session_token = limiter.create_session() + # Create new session - generate token first, then create session + session_token = limiter._generate_session_token() + limiter.create_session(session_token) _set_session_cookie(response, session_token) logger.info("Created new session for indexing", session_token=session_token[:8], @@ -767,18 +769,32 @@ async def start_anonymous_indexing( file_count = max(repo_size_kb // 3, 1) # Check file limit + is_partial = False + files_to_index = file_count + if file_count > ANONYMOUS_FILE_LIMIT: - raise HTTPException( - status_code=400, - detail={ - "error": "validation_failed", - "reason": "too_large", - "message": f"Repository has {file_count:,} code files. " - f"Anonymous limit is {ANONYMOUS_FILE_LIMIT}.", - "file_count": file_count, - "limit": ANONYMOUS_FILE_LIMIT - } - ) + if request.partial: + # Partial indexing - cap at limit + is_partial = True + files_to_index = ANONYMOUS_FILE_LIMIT + logger.info("Partial indexing enabled", + total_files=file_count, + indexing=files_to_index) + else: + # Reject large repos without partial flag + raise HTTPException( + status_code=400, + detail={ + "error": "validation_failed", + "reason": "too_large", + "message": f"Repository has {file_count:,} code files. " + f"Anonymous limit is {ANONYMOUS_FILE_LIMIT}. " + f"Use partial=true to index first {ANONYMOUS_FILE_LIMIT} files.", + "file_count": file_count, + "limit": ANONYMOUS_FILE_LIMIT, + "hint": "Set partial=true to index a subset of files" + } + ) # --- Validation passed! Create job and start background indexing --- @@ -796,7 +812,9 @@ async def start_anonymous_indexing( owner=owner, repo_name=repo_name, branch=branch, - file_count=file_count + file_count=file_count, + is_partial=is_partial, + max_files=files_to_index ) # Queue background task @@ -811,7 +829,8 @@ async def start_anonymous_indexing( owner=owner, repo_name=repo_name, branch=branch, - file_count=file_count + file_count=files_to_index, # Actual files to index (may be capped) + max_files=files_to_index if is_partial else None # Limit for partial indexing ) logger.info("Indexing job queued", @@ -819,17 +838,29 @@ async def start_anonymous_indexing( owner=owner, repo=repo_name, branch=branch, - file_count=file_count, + file_count=files_to_index, + is_partial=is_partial, session_token=session_token[:8], response_time_ms=response_time_ms) # Estimate time based on file count (~0.3s per file) - estimated_seconds = max(10, int(file_count * 0.3)) + estimated_seconds = max(10, int(files_to_index * 0.3)) - return { + response_data = { "job_id": job_id, "status": "queued", "estimated_time_seconds": estimated_seconds, - "file_count": file_count, + "file_count": files_to_index, "message": f"Indexing started. Poll /playground/index/{job_id} for status." } + + # Add partial info if applicable + if is_partial: + response_data["partial"] = True + response_data["total_files"] = file_count + response_data["message"] = ( + f"Partial indexing started ({files_to_index} of {file_count} files). " + f"Poll /playground/index/{job_id} for status." + ) + + return response_data diff --git a/backend/services/anonymous_indexer.py b/backend/services/anonymous_indexer.py index 4fa9d5c..b8048c6 100644 --- a/backend/services/anonymous_indexer.py +++ b/backend/services/anonymous_indexer.py @@ -93,7 +93,9 @@ def create_job( owner: str, repo_name: str, branch: str, - file_count: int + file_count: int, + is_partial: bool = False, + max_files: Optional[int] = None ) -> dict: """ Create a new indexing job in Redis. @@ -111,6 +113,8 @@ def create_job( "repo_name": repo_name, "branch": branch, "file_count": file_count, + "is_partial": is_partial, + "max_files": max_files, "status": JobStatus.QUEUED.value, "progress": None, "stats": None, @@ -225,13 +229,17 @@ async def run_indexing_job( owner: str, repo_name: str, branch: str, - file_count: int + file_count: int, + max_files: Optional[int] = None ) -> None: """ Background task to clone and index a repository. This runs asynchronously after the endpoint returns. Updates Redis with progress and final status. + + Args: + max_files: If set, limit indexing to first N files (for partial indexing) """ import time start_time = time.time() @@ -286,7 +294,8 @@ async def progress_callback(files_processed: int, functions_found: int, total: i indexer.index_repository_with_progress( repo_id, str(temp_path), - progress_callback + progress_callback, + max_files=max_files ), timeout=job_manager.INDEX_TIMEOUT_SECONDS ) diff --git a/backend/services/indexer_optimized.py b/backend/services/indexer_optimized.py index df579ee..4879065 100644 --- a/backend/services/indexer_optimized.py +++ b/backend/services/indexer_optimized.py @@ -464,17 +464,30 @@ async def explain_code( return f"Error: {str(e)}" async def index_repository_with_progress( - self, - repo_id: str, + self, + repo_id: str, repo_path: str, - progress_callback + progress_callback, + max_files: int = None ): - """Index repository with real-time progress updates""" + """Index repository with real-time progress updates + + Args: + max_files: If set, limit indexing to first N files (for partial indexing) + """ start_time = time.time() logger.info("Starting optimized indexing with progress", repo_id=repo_id) - + # Discover code files code_files = self._discover_code_files(repo_path) + + # Apply file limit if specified (partial indexing) + if max_files and len(code_files) > max_files: + logger.info("Limiting files for partial indexing", + total_discovered=len(code_files), + max_files=max_files) + code_files = code_files[:max_files] + total_files = len(code_files) logger.info("Found code files", repo_id=repo_id, total_files=total_files) diff --git a/backend/services/sentry.py b/backend/services/sentry.py index 12508b2..4f7dbdc 100644 --- a/backend/services/sentry.py +++ b/backend/services/sentry.py @@ -12,58 +12,58 @@ def init_sentry() -> bool: """ Initialize Sentry SDK if SENTRY_DSN is configured. - + Returns: bool: True if Sentry was initialized, False otherwise """ sentry_dsn = os.getenv("SENTRY_DSN") - + if not sentry_dsn: print("ℹ️ Sentry DSN not configured - error tracking disabled") return False - + try: import sentry_sdk from sentry_sdk.integrations.fastapi import FastApiIntegration from sentry_sdk.integrations.starlette import StarletteIntegration - + environment = os.getenv("ENVIRONMENT", "development") - + sentry_sdk.init( dsn=sentry_dsn, environment=environment, - + # Performance monitoring - sample rate based on environment traces_sample_rate=0.1 if environment == "production" else 1.0, - + # Profile sampled transactions profiles_sample_rate=0.1 if environment == "production" else 1.0, - + # Send PII for debugging (user IDs, emails) send_default_pii=True, - + # Integrations integrations=[ FastApiIntegration(transaction_style="endpoint"), StarletteIntegration(transaction_style="endpoint"), ], - + # Filter noisy events before_send=_filter_events, - + # Debug mode for development debug=environment == "development", - + # Attach stack traces to messages attach_stacktrace=True, - + # Include local variables in stack traces include_local_variables=True, ) - + print(f"✅ Sentry initialized (environment: {environment})") return True - + except ImportError: print("⚠️ sentry-sdk not installed - error tracking disabled") return False @@ -74,12 +74,12 @@ def init_sentry() -> bool: def _filter_events(event, hint): """Filter out noisy events before sending to Sentry.""" - + # Don't send health check errors request_url = event.get("request", {}).get("url", "") if "/health" in request_url: return None - + # Don't send 404s for common bot paths exception_values = event.get("exception", {}).get("values", []) if exception_values: @@ -87,13 +87,13 @@ def _filter_events(event, hint): bot_paths = ["/wp-admin", "/wp-login", "/.env", "/config", "/admin", "/phpmyadmin", "/.git"] if any(path in exception_value for path in bot_paths): return None - + # Don't send validation errors (they're expected) if exception_values: exception_type = exception_values[0].get("type", "") if exception_type in ("RequestValidationError", "ValidationError"): return None - + return event @@ -104,7 +104,7 @@ def _filter_events(event, hint): def set_user_context(user_id: Optional[str] = None, email: Optional[str] = None): """ Set user context for error tracking. - + DEPRECATED: Use from services.observability import set_user_context """ try: @@ -117,7 +117,7 @@ def set_user_context(user_id: Optional[str] = None, email: Optional[str] = None) def capture_exception(error: Exception, **extra_context): """ Manually capture an exception with additional context. - + DEPRECATED: Use from services.observability import capture_exception """ try: @@ -133,7 +133,7 @@ def capture_exception(error: Exception, **extra_context): def capture_message(message: str, level: str = "info", **extra_context): """ Capture a message (not an exception) for tracking. - + DEPRECATED: Use from services.observability import get_logger """ try: @@ -149,7 +149,7 @@ def capture_message(message: str, level: str = "info", **extra_context): def set_operation_context(operation: str, **tags): """ Set operation context for the current scope. - + DEPRECATED: Use from services.observability import trace_operation """ try: @@ -159,3 +159,19 @@ def set_operation_context(operation: str, **tags): sentry_sdk.set_tag(key, str(value)) except ImportError: pass + + +def capture_http_exception(request, exc: Exception, status_code: int): + """ + Capture HTTP exception with request context for error tracking. + """ + try: + import sentry_sdk + with sentry_sdk.push_scope() as scope: + scope.set_extra("status_code", status_code) + scope.set_extra("path", str(request.url.path)) + scope.set_extra("method", request.method) + sentry_sdk.capture_exception(exc) + except ImportError: + pass + pass diff --git a/backend/tests/test_anonymous_indexing.py b/backend/tests/test_anonymous_indexing.py new file mode 100644 index 0000000..c2d31a1 --- /dev/null +++ b/backend/tests/test_anonymous_indexing.py @@ -0,0 +1,514 @@ +""" +Tests for anonymous indexing endpoint (Issue #125). +Tests the POST /playground/index endpoint and related functionality. + +Note: These tests rely on conftest.py for Pinecone/OpenAI/Redis mocking. +""" +import pytest +from unittest.mock import AsyncMock, patch, MagicMock +from datetime import datetime, timezone, timedelta +import json + +# Import directly - conftest.py handles external service mocking +from routes.playground import ( + IndexRepoRequest, + ANONYMOUS_FILE_LIMIT, +) +from services.anonymous_indexer import ( + AnonymousIndexingJob, + JobStatus, + JobProgress, + JobStats, +) + + +# ============================================================================= +# REQUEST MODEL TESTS +# ============================================================================= + +class TestIndexRepoRequest: + """Tests for IndexRepoRequest validation.""" + + def test_valid_request(self): + """Valid GitHub URL should pass.""" + req = IndexRepoRequest(github_url="https://github.com/facebook/react") + assert req.github_url == "https://github.com/facebook/react" + assert req.branch is None + assert req.partial is False + + def test_valid_request_with_branch(self): + """Request with branch specified.""" + req = IndexRepoRequest( + github_url="https://github.com/user/repo", + branch="develop" + ) + assert req.branch == "develop" + + def test_valid_request_with_partial(self): + """Request with partial=True.""" + req = IndexRepoRequest( + github_url="https://github.com/user/repo", + partial=True + ) + assert req.partial is True + + def test_invalid_empty_url(self): + """Empty URL should fail.""" + with pytest.raises(ValueError) as exc_info: + IndexRepoRequest(github_url="") + assert "required" in str(exc_info.value).lower() + + def test_invalid_url_no_scheme(self): + """URL without http(s) should fail.""" + with pytest.raises(ValueError) as exc_info: + IndexRepoRequest(github_url="github.com/user/repo") + assert "http" in str(exc_info.value).lower() + + def test_invalid_url_wrong_domain(self): + """Non-GitHub URL should fail.""" + with pytest.raises(ValueError) as exc_info: + IndexRepoRequest(github_url="https://gitlab.com/user/repo") + assert "github" in str(exc_info.value).lower() + + def test_url_whitespace_trimmed(self): + """Whitespace should be trimmed.""" + req = IndexRepoRequest(github_url=" https://github.com/user/repo ") + assert req.github_url == "https://github.com/user/repo" + + +# ============================================================================= +# JOB MANAGER TESTS +# ============================================================================= + +class TestAnonymousIndexingJob: + """Tests for AnonymousIndexingJob service.""" + + @pytest.fixture + def mock_redis(self): + """Create a mock Redis client.""" + redis = MagicMock() + redis.get.return_value = None + redis.setex.return_value = True + return redis + + @pytest.fixture + def job_manager(self, mock_redis): + """Create job manager with mock Redis.""" + return AnonymousIndexingJob(mock_redis) + + def test_generate_job_id_format(self, job_manager): + """Job ID should have correct format.""" + job_id = job_manager.generate_job_id() + assert job_id.startswith("idx_") + assert len(job_id) == 16 # idx_ + 12 hex chars + + def test_generate_job_id_unique(self, job_manager): + """Each job ID should be unique.""" + ids = [job_manager.generate_job_id() for _ in range(100)] + assert len(set(ids)) == 100 + + def test_generate_repo_id(self, job_manager): + """Repo ID derived from job ID.""" + repo_id = job_manager.generate_repo_id("idx_abc123def456") + assert repo_id == "anon_abc123def456" + + def test_create_job(self, job_manager, mock_redis): + """Create job stores data in Redis.""" + job_data = job_manager.create_job( + job_id="idx_test123456", + session_id="session_abc", + github_url="https://github.com/user/repo", + owner="user", + repo_name="repo", + branch="main", + file_count=50 + ) + + # Check return data + assert job_data["job_id"] == "idx_test123456" + assert job_data["session_id"] == "session_abc" + assert job_data["status"] == "queued" + assert job_data["file_count"] == 50 + + # Check Redis was called + mock_redis.setex.assert_called_once() + call_args = mock_redis.setex.call_args + assert "anon_job:idx_test123456" in call_args[0] + + def test_get_job_exists(self, job_manager, mock_redis): + """Get existing job from Redis.""" + mock_redis.get.return_value = json.dumps({ + "job_id": "idx_test123456", + "status": "processing" + }) + + job = job_manager.get_job("idx_test123456") + assert job is not None + assert job["status"] == "processing" + + def test_get_job_not_found(self, job_manager, mock_redis): + """Get non-existent job returns None.""" + mock_redis.get.return_value = None + job = job_manager.get_job("idx_nonexistent") + assert job is None + + def test_update_status(self, job_manager, mock_redis): + """Update job status in Redis.""" + # Setup existing job + mock_redis.get.return_value = json.dumps({ + "job_id": "idx_test123456", + "status": "queued", + "updated_at": "2025-01-01T00:00:00Z" + }) + + result = job_manager.update_status( + "idx_test123456", + JobStatus.PROCESSING + ) + + assert result is True + # Check Redis setex was called to update + assert mock_redis.setex.called + + def test_update_status_with_progress(self, job_manager, mock_redis): + """Update status with progress data.""" + mock_redis.get.return_value = json.dumps({ + "job_id": "idx_test123456", + "status": "cloning" + }) + + progress = JobProgress( + files_total=100, + files_processed=50, + functions_found=200 + ) + + result = job_manager.update_status( + "idx_test123456", + JobStatus.PROCESSING, + progress=progress + ) + + assert result is True + + def test_update_status_completed_with_stats(self, job_manager, mock_redis): + """Update status to completed with stats.""" + mock_redis.get.return_value = json.dumps({ + "job_id": "idx_test123456", + "status": "processing" + }) + + stats = JobStats( + files_indexed=100, + functions_found=500, + time_taken_seconds=45.5 + ) + + result = job_manager.update_status( + "idx_test123456", + JobStatus.COMPLETED, + stats=stats, + repo_id="anon_test123456" + ) + + assert result is True + + def test_update_status_failed_with_error(self, job_manager, mock_redis): + """Update status to failed with error.""" + mock_redis.get.return_value = json.dumps({ + "job_id": "idx_test123456", + "status": "cloning" + }) + + result = job_manager.update_status( + "idx_test123456", + JobStatus.FAILED, + error="clone_failed", + error_message="Repository not found" + ) + + assert result is True + + +# ============================================================================= +# JOB DATACLASS TESTS +# ============================================================================= + +class TestJobDataclasses: + """Tests for JobProgress and JobStats.""" + + def test_job_progress_to_dict(self): + """JobProgress converts to dict correctly.""" + progress = JobProgress( + files_total=100, + files_processed=50, + functions_found=200, + current_file="src/index.ts" + ) + d = progress.to_dict() + assert d["files_total"] == 100 + assert d["files_processed"] == 50 + assert d["current_file"] == "src/index.ts" + + def test_job_progress_none_excluded(self): + """JobProgress excludes None values.""" + progress = JobProgress(files_total=100) + d = progress.to_dict() + assert "current_file" not in d + + def test_job_stats_to_dict(self): + """JobStats converts to dict correctly.""" + stats = JobStats( + files_indexed=100, + functions_found=500, + time_taken_seconds=45.5 + ) + d = stats.to_dict() + assert d["files_indexed"] == 100 + assert d["time_taken_seconds"] == 45.5 + + +# ============================================================================= +# ENDPOINT TESTS (Integration) +# ============================================================================= + +class TestIndexEndpoint: + """Integration tests for POST /playground/index.""" + + @pytest.fixture + def client(self): + """Create test client.""" + from fastapi.testclient import TestClient + from main import app + return TestClient(app) + + def test_invalid_url_returns_400(self, client): + """Invalid GitHub URL returns 400.""" + response = client.post( + "/api/v1/playground/index", + json={"github_url": "not-a-valid-url"} + ) + assert response.status_code == 422 # Pydantic validation + + def test_missing_url_returns_422(self, client): + """Missing github_url returns 422.""" + response = client.post( + "/api/v1/playground/index", + json={} + ) + assert response.status_code == 422 + + @patch('routes.playground._fetch_repo_metadata') + @patch('routes.playground._count_code_files') + def test_private_repo_returns_400( + self, mock_count, mock_metadata, client + ): + """Private repository returns 400.""" + mock_metadata.return_value = {"private": True, "name": "repo"} + mock_count.return_value = (50, None) + + response = client.post( + "/api/v1/playground/index", + json={"github_url": "https://github.com/user/private-repo"} + ) + + assert response.status_code == 400 + assert "private" in response.json()["detail"]["reason"] + + @patch('routes.playground._fetch_repo_metadata') + @patch('routes.playground._count_code_files') + def test_too_large_repo_without_partial_returns_400( + self, mock_count, mock_metadata, client + ): + """Large repo without partial=true returns 400 with hint.""" + mock_metadata.return_value = { + "private": False, + "name": "large-repo", + "default_branch": "main" + } + mock_count.return_value = (500, None) # Over 200 limit + + response = client.post( + "/api/v1/playground/index", + json={"github_url": "https://github.com/user/large-repo"} + ) + + assert response.status_code == 400 + detail = response.json()["detail"] + assert detail["reason"] == "too_large" + assert "partial" in detail.get("hint", "").lower() + + @patch('routes.playground._fetch_repo_metadata') + @patch('routes.playground._count_code_files') + @patch('routes.playground.AnonymousIndexingJob') + def test_large_repo_with_partial_returns_202( + self, mock_job_class, mock_count, mock_metadata, client + ): + """Large repo with partial=true returns 202.""" + mock_metadata.return_value = { + "private": False, + "name": "large-repo", + "default_branch": "main" + } + mock_count.return_value = (500, None) + + # Mock job manager + mock_job_manager = MagicMock() + mock_job_manager.generate_job_id.return_value = "idx_test123456" + mock_job_manager.create_job.return_value = {"job_id": "idx_test123456"} + mock_job_class.return_value = mock_job_manager + + response = client.post( + "/api/v1/playground/index", + json={ + "github_url": "https://github.com/user/large-repo", + "partial": True + } + ) + + assert response.status_code == 202 + data = response.json() + assert data["job_id"] == "idx_test123456" + assert data["partial"] is True + assert data["file_count"] == ANONYMOUS_FILE_LIMIT # Capped at 200 + + @patch('routes.playground._fetch_repo_metadata') + @patch('routes.playground._count_code_files') + @patch('routes.playground.AnonymousIndexingJob') + def test_valid_request_returns_202_with_job_id( + self, mock_job_class, mock_count, mock_metadata, client + ): + """Valid request returns 202 with job_id.""" + mock_metadata.return_value = { + "private": False, + "name": "repo", + "default_branch": "main" + } + mock_count.return_value = (50, None) + + mock_job_manager = MagicMock() + mock_job_manager.generate_job_id.return_value = "idx_abc123def456" + mock_job_manager.create_job.return_value = {"job_id": "idx_abc123def456"} + mock_job_class.return_value = mock_job_manager + + response = client.post( + "/api/v1/playground/index", + json={"github_url": "https://github.com/user/repo"} + ) + + assert response.status_code == 202 + data = response.json() + assert data["job_id"] == "idx_abc123def456" + assert data["status"] == "queued" + assert "estimated_time_seconds" in data + + @patch('routes.playground._fetch_repo_metadata') + def test_repo_not_found_returns_400(self, mock_metadata, client): + """Repository not found returns 400.""" + mock_metadata.return_value = {"error": "not_found"} + + response = client.post( + "/api/v1/playground/index", + json={"github_url": "https://github.com/user/nonexistent"} + ) + + assert response.status_code == 400 + assert response.json()["detail"]["reason"] == "not_found" + + @patch('routes.playground._fetch_repo_metadata') + def test_github_rate_limit_returns_429(self, mock_metadata, client): + """GitHub rate limit returns 429.""" + mock_metadata.return_value = {"error": "rate_limited"} + + response = client.post( + "/api/v1/playground/index", + json={"github_url": "https://github.com/user/repo"} + ) + + assert response.status_code == 429 + + +# ============================================================================= +# SESSION CONFLICT TESTS +# ============================================================================= + +class TestSessionConflict: + """Tests for session-already-has-repo behavior.""" + + @pytest.fixture + def client(self): + from fastapi.testclient import TestClient + from main import app + return TestClient(app) + + @patch('routes.playground._fetch_repo_metadata') + @patch('routes.playground._count_code_files') + @patch('routes.playground._get_limiter') + def test_session_with_existing_repo_returns_409( + self, mock_get_limiter, mock_count, mock_metadata, client + ): + """Session with existing indexed repo returns 409.""" + mock_metadata.return_value = { + "private": False, + "name": "repo", + "default_branch": "main" + } + mock_count.return_value = (50, None) + + # Mock limiter with existing indexed repo + mock_limiter = MagicMock() + mock_session_data = MagicMock() + mock_session_data.indexed_repo = { + "repo_id": "existing_repo", + "expires_at": (datetime.now(timezone.utc) + timedelta(hours=12)).isoformat() + } + mock_limiter.get_session_data.return_value = mock_session_data + mock_limiter.create_session.return_value = "test_session" + mock_get_limiter.return_value = mock_limiter + + response = client.post( + "/api/v1/playground/index", + json={"github_url": "https://github.com/user/repo"} + ) + + assert response.status_code == 409 + assert response.json()["detail"]["error"] == "already_indexed" + + @patch('routes.playground._fetch_repo_metadata') + @patch('routes.playground._count_code_files') + @patch('routes.playground._get_limiter') + @patch('routes.playground.AnonymousIndexingJob') + def test_expired_repo_allows_new_indexing( + self, mock_job_class, mock_get_limiter, mock_count, mock_metadata, client + ): + """Expired indexed repo allows new indexing.""" + mock_metadata.return_value = { + "private": False, + "name": "repo", + "default_branch": "main" + } + mock_count.return_value = (50, None) + + # Mock limiter with expired indexed repo + mock_limiter = MagicMock() + mock_session_data = MagicMock() + mock_session_data.indexed_repo = { + "repo_id": "old_repo", + "expires_at": (datetime.now(timezone.utc) - timedelta(hours=1)).isoformat() + } + mock_limiter.get_session_data.return_value = mock_session_data + mock_limiter.create_session.return_value = "test_session" + mock_get_limiter.return_value = mock_limiter + + mock_job_manager = MagicMock() + mock_job_manager.generate_job_id.return_value = "idx_new123456" + mock_job_manager.create_job.return_value = {"job_id": "idx_new123456"} + mock_job_class.return_value = mock_job_manager + + response = client.post( + "/api/v1/playground/index", + json={"github_url": "https://github.com/user/repo"} + ) + + assert response.status_code == 202 + assert response.json()["job_id"] == "idx_new123456" From 5dd044c211d290b1963a99df421a57a2d9aa4460 Mon Sep 17 00:00:00 2001 From: Devanshu Rajesh Chicholikar Date: Thu, 25 Dec 2025 23:52:29 -0600 Subject: [PATCH 4/5] feat(playground): add GET /index/{job_id} status endpoint (#125) - Poll endpoint for job status (queued/cloning/processing/completed/failed) - Returns progress with percent_complete during processing - Returns repo_id on completion for search access - Returns error details on failure - Handles partial indexing info - 7 new tests for status endpoint Checkpoint 3 complete. 176 tests passing. --- backend/routes/playground.py | 158 +++++++++++++++++++++ backend/tests/test_anonymous_indexing.py | 169 ++++++++++++++++++++++- 2 files changed, 326 insertions(+), 1 deletion(-) diff --git a/backend/routes/playground.py b/backend/routes/playground.py index 778036d..7f306c4 100644 --- a/backend/routes/playground.py +++ b/backend/routes/playground.py @@ -864,3 +864,161 @@ async def start_anonymous_indexing( ) return response_data + + +# ============================================================================= +# GET /playground/index/{job_id} - Check indexing job status (#126) +# ============================================================================= + +@router.get( + "/index/{job_id}", + summary="Check indexing job status", + description="Poll this endpoint to check the status of an indexing job.", + responses={ + 200: { + "description": "Job status", + "content": { + "application/json": { + "examples": { + "queued": { + "value": { + "job_id": "idx_abc123", + "status": "queued", + "message": "Job is queued for processing" + } + }, + "processing": { + "value": { + "job_id": "idx_abc123", + "status": "processing", + "progress": { + "files_processed": 50, + "files_total": 100, + "functions_found": 250, + "percent_complete": 50 + } + } + }, + "completed": { + "value": { + "job_id": "idx_abc123", + "status": "completed", + "repo_id": "anon_idx_abc123", + "stats": { + "files_indexed": 100, + "functions_found": 500, + "time_taken_seconds": 45.2 + } + } + }, + "failed": { + "value": { + "job_id": "idx_abc123", + "status": "failed", + "error": "clone_failed", + "error_message": "Repository not found" + } + } + } + } + } + }, + 404: {"description": "Job not found or expired"} + } +) +async def get_indexing_status( + job_id: str, + req: Request +): + """ + Check the status of an anonymous indexing job. + + Poll this endpoint after starting an indexing job to track progress. + Jobs expire after 1 hour. + + Status values: + - queued: Job is waiting to start + - cloning: Repository is being cloned + - processing: Files are being indexed + - completed: Indexing finished successfully + - failed: Indexing failed (check error field) + """ + # Validate job_id format + if not job_id or not job_id.startswith("idx_"): + raise HTTPException( + status_code=400, + detail={ + "error": "invalid_job_id", + "message": "Invalid job ID format" + } + ) + + # Get job from Redis + job_manager = AnonymousIndexingJob(redis_client) + job = job_manager.get_job(job_id) + + if not job: + raise HTTPException( + status_code=404, + detail={ + "error": "job_not_found", + "message": "Job not found or has expired. Jobs expire after 1 hour." + } + ) + + # Build response based on status + status = job.get("status", "unknown") + response = { + "job_id": job_id, + "status": status, + "created_at": job.get("created_at"), + "updated_at": job.get("updated_at"), + } + + # Add repo info + response["repository"] = { + "owner": job.get("owner"), + "name": job.get("repo_name"), + "branch": job.get("branch"), + "github_url": job.get("github_url"), + } + + # Add partial info if applicable + if job.get("is_partial"): + response["partial"] = True + response["max_files"] = job.get("max_files") + + # Status-specific fields + if status == "queued": + response["message"] = "Job is queued for processing" + + elif status == "cloning": + response["message"] = "Cloning repository..." + + elif status == "processing": + response["message"] = "Indexing files..." + if job.get("progress"): + progress = job["progress"] + files_processed = progress.get("files_processed", 0) + files_total = progress.get("files_total", 1) + percent = round((files_processed / files_total) * 100) if files_total > 0 else 0 + response["progress"] = { + "files_processed": files_processed, + "files_total": files_total, + "functions_found": progress.get("functions_found", 0), + "percent_complete": percent, + "current_file": progress.get("current_file") + } + + elif status == "completed": + response["message"] = "Indexing completed successfully" + response["repo_id"] = job.get("repo_id") + if job.get("stats"): + response["stats"] = job["stats"] + + elif status == "failed": + response["message"] = job.get("error_message", "Indexing failed") + response["error"] = job.get("error", "unknown_error") + response["error_message"] = job.get("error_message") + + return response diff --git a/backend/tests/test_anonymous_indexing.py b/backend/tests/test_anonymous_indexing.py index c2d31a1..ea36007 100644 --- a/backend/tests/test_anonymous_indexing.py +++ b/backend/tests/test_anonymous_indexing.py @@ -5,7 +5,7 @@ Note: These tests rely on conftest.py for Pinecone/OpenAI/Redis mocking. """ import pytest -from unittest.mock import AsyncMock, patch, MagicMock +from unittest.mock import patch, MagicMock from datetime import datetime, timezone, timedelta import json @@ -512,3 +512,170 @@ def test_expired_repo_allows_new_indexing( assert response.status_code == 202 assert response.json()["job_id"] == "idx_new123456" + + +# ============================================================================= +# STATUS ENDPOINT TESTS (GET /playground/index/{job_id}) +# ============================================================================= + +class TestStatusEndpoint: + """Tests for GET /playground/index/{job_id} status endpoint.""" + + @pytest.fixture + def client(self): + """Create test client.""" + from fastapi.testclient import TestClient + from main import app + return TestClient(app) + + def test_invalid_job_id_format_returns_400(self, client): + """Invalid job ID format returns 400.""" + response = client.get("/api/v1/playground/index/invalid_format") + assert response.status_code == 400 + assert response.json()["detail"]["error"] == "invalid_job_id" + + def test_job_not_found_returns_404(self, client): + """Non-existent job returns 404.""" + response = client.get("/api/v1/playground/index/idx_nonexistent123") + assert response.status_code == 404 + assert response.json()["detail"]["error"] == "job_not_found" + + @patch('routes.playground.AnonymousIndexingJob') + def test_queued_job_returns_status(self, mock_job_class, client): + """Queued job returns correct status.""" + mock_job_manager = MagicMock() + mock_job_manager.get_job.return_value = { + "job_id": "idx_test123456", + "status": "queued", + "owner": "user", + "repo_name": "repo", + "branch": "main", + "github_url": "https://github.com/user/repo", + "created_at": "2024-01-01T00:00:00Z", + "updated_at": "2024-01-01T00:00:00Z", + } + mock_job_class.return_value = mock_job_manager + + response = client.get("/api/v1/playground/index/idx_test123456") + + assert response.status_code == 200 + data = response.json() + assert data["status"] == "queued" + assert data["message"] == "Job is queued for processing" + + @patch('routes.playground.AnonymousIndexingJob') + def test_processing_job_returns_progress(self, mock_job_class, client): + """Processing job returns progress info.""" + mock_job_manager = MagicMock() + mock_job_manager.get_job.return_value = { + "job_id": "idx_test123456", + "status": "processing", + "owner": "user", + "repo_name": "repo", + "branch": "main", + "github_url": "https://github.com/user/repo", + "created_at": "2024-01-01T00:00:00Z", + "updated_at": "2024-01-01T00:00:01Z", + "progress": { + "files_processed": 50, + "files_total": 100, + "functions_found": 250, + "current_file": "src/index.ts" + } + } + mock_job_class.return_value = mock_job_manager + + response = client.get("/api/v1/playground/index/idx_test123456") + + assert response.status_code == 200 + data = response.json() + assert data["status"] == "processing" + assert data["progress"]["files_processed"] == 50 + assert data["progress"]["percent_complete"] == 50 + + @patch('routes.playground.AnonymousIndexingJob') + def test_completed_job_returns_repo_id(self, mock_job_class, client): + """Completed job returns repo_id and stats.""" + mock_job_manager = MagicMock() + mock_job_manager.get_job.return_value = { + "job_id": "idx_test123456", + "status": "completed", + "owner": "user", + "repo_name": "repo", + "branch": "main", + "github_url": "https://github.com/user/repo", + "repo_id": "anon_idx_test123456", + "created_at": "2024-01-01T00:00:00Z", + "updated_at": "2024-01-01T00:01:00Z", + "stats": { + "files_indexed": 100, + "functions_found": 500, + "time_taken_seconds": 45.2 + } + } + mock_job_class.return_value = mock_job_manager + + response = client.get("/api/v1/playground/index/idx_test123456") + + assert response.status_code == 200 + data = response.json() + assert data["status"] == "completed" + assert data["repo_id"] == "anon_idx_test123456" + assert data["stats"]["files_indexed"] == 100 + + @patch('routes.playground.AnonymousIndexingJob') + def test_failed_job_returns_error(self, mock_job_class, client): + """Failed job returns error details.""" + mock_job_manager = MagicMock() + mock_job_manager.get_job.return_value = { + "job_id": "idx_test123456", + "status": "failed", + "owner": "user", + "repo_name": "repo", + "branch": "main", + "github_url": "https://github.com/user/repo", + "error": "clone_failed", + "error_message": "Repository not found or access denied", + "created_at": "2024-01-01T00:00:00Z", + "updated_at": "2024-01-01T00:00:30Z", + } + mock_job_class.return_value = mock_job_manager + + response = client.get("/api/v1/playground/index/idx_test123456") + + assert response.status_code == 200 + data = response.json() + assert data["status"] == "failed" + assert data["error"] == "clone_failed" + assert "not found" in data["error_message"].lower() + + @patch('routes.playground.AnonymousIndexingJob') + def test_partial_job_includes_partial_info(self, mock_job_class, client): + """Partial indexing job includes partial flag.""" + mock_job_manager = MagicMock() + mock_job_manager.get_job.return_value = { + "job_id": "idx_test123456", + "status": "processing", + "owner": "user", + "repo_name": "large-repo", + "branch": "main", + "github_url": "https://github.com/user/large-repo", + "is_partial": True, + "max_files": 200, + "file_count": 500, + "created_at": "2024-01-01T00:00:00Z", + "updated_at": "2024-01-01T00:00:10Z", + "progress": { + "files_processed": 100, + "files_total": 200, + "functions_found": 400 + } + } + mock_job_class.return_value = mock_job_manager + + response = client.get("/api/v1/playground/index/idx_test123456") + + assert response.status_code == 200 + data = response.json() + assert data["partial"] is True + assert data["max_files"] == 200 From ae605cc62c305661d635e8c62120a0d61d7a825c Mon Sep 17 00:00:00 2001 From: Devanshu Rajesh Chicholikar Date: Fri, 26 Dec 2025 00:10:36 -0600 Subject: [PATCH 5/5] fix(security): don't expose exception details to client CodeQL flagged information exposure through exception. Return generic 'error' string instead of str(e) to avoid leaking internal details. Detailed errors still logged server-side. --- backend/routes/playground.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backend/routes/playground.py b/backend/routes/playground.py index 7f306c4..f12556f 100644 --- a/backend/routes/playground.py +++ b/backend/routes/playground.py @@ -483,8 +483,9 @@ async def _count_code_files( except httpx.TimeoutException: return 0, "GitHub API request timed out" except Exception as e: + # Log detailed error server-side, but don't expose to client logger.error("GitHub tree API failed", error=str(e)) - return 0, str(e) + return 0, "error" @router.post("/validate-repo")