|
| 1 | +"""Anonymous indexing routes for the playground.""" |
| 2 | +import time |
| 3 | +from typing import Optional |
| 4 | +from datetime import datetime, timezone |
| 5 | +from fastapi import APIRouter, HTTPException, Request, Response, BackgroundTasks |
| 6 | +from pydantic import BaseModel, field_validator |
| 7 | + |
| 8 | +from dependencies import indexer, redis_client |
| 9 | +from services.observability import logger |
| 10 | +from services.anonymous_indexer import AnonymousIndexingJob, run_indexing_job |
| 11 | +from routes.playground.helpers import ( |
| 12 | + ANONYMOUS_FILE_LIMIT, |
| 13 | + get_client_ip, get_session_token, set_session_cookie, get_limiter, |
| 14 | +) |
| 15 | +from routes.playground.validation import ( |
| 16 | + parse_github_url, fetch_repo_metadata, count_code_files, |
| 17 | +) |
| 18 | + |
| 19 | +router = APIRouter() |
| 20 | + |
| 21 | + |
class IndexRepoRequest(BaseModel):
    """Request body for anonymous repository indexing."""

    # URL of the GitHub repository to index; checked by validate_github_url_format.
    github_url: str
    # Branch to index; the route falls back to the repo's default branch when omitted.
    branch: Optional[str] = None
    # When True, a repo over the anonymous file limit is indexed up to the
    # limit instead of being rejected.
    partial: bool = False

    @field_validator("github_url")
    @classmethod
    def validate_github_url_format(cls, v: str) -> str:
        """Strip the URL and check that it is an http(s) URL on a GitHub host.

        Returns:
            The URL with surrounding whitespace removed.

        Raises:
            ValueError: if the URL is empty, lacks an http:// or https://
                prefix, or its host is not github.com (or a subdomain of it).
        """
        # Local import keeps this block self-contained without touching the
        # file's import section.
        from urllib.parse import urlsplit

        v = v.strip()
        if not v:
            raise ValueError("GitHub URL is required")
        if not v.startswith(("http://", "https://")):
            raise ValueError("URL must start with http:// or https://")

        # Compare the parsed hostname rather than searching for a substring:
        # a substring test also accepts "https://evil.example/github.com" and
        # "https://github.com.evil.example/owner/repo".
        try:
            host = (urlsplit(v).hostname or "").lower().rstrip(".")
        except ValueError:
            # urlsplit rejects malformed netlocs (e.g. a broken IPv6 literal).
            host = ""
        if host != "github.com" and not host.endswith(".github.com"):
            raise ValueError("URL must be a GitHub repository URL")
        return v
| 39 | + |
| 40 | + |
def _indexed_repo_is_expired(indexed_repo: dict) -> bool:
    """Return True when the session's indexed-repo record is past its expiry."""
    expires_at_str = indexed_repo.get("expires_at", "")
    if not expires_at_str:
        # No expiry recorded: the record is treated as still active.
        return False
    try:
        expires_at = datetime.fromisoformat(expires_at_str.replace("Z", "+00:00"))
    except (ValueError, AttributeError, TypeError):
        # An unparseable expiry is treated as expired so the user is not
        # locked out by a corrupt record.
        return True
    # Ensure a timezone-aware comparison; naive timestamps are taken as UTC.
    if expires_at.tzinfo is None:
        expires_at = expires_at.replace(tzinfo=timezone.utc)
    return datetime.now(timezone.utc) > expires_at


def _raise_for_metadata_error(metadata: dict) -> None:
    """Translate an error payload from fetch_repo_metadata into an HTTPException."""
    error_type = metadata["error"]
    if error_type == "not_found":
        raise HTTPException(status_code=400, detail={
            "error": "validation_failed", "reason": "not_found",
            "message": "Repository not found. Check the URL or ensure it's public."
        })
    if error_type == "rate_limited":
        raise HTTPException(status_code=429, detail={
            "error": "github_rate_limit", "message": "GitHub API rate limit exceeded. Try again later."
        })
    raise HTTPException(status_code=502, detail={
        "error": "github_error", "message": metadata.get("message", "Failed to fetch repository info")
    })


def _resolve_file_count(file_count: Optional[int], count_error: Optional[str], metadata: dict) -> int:
    """Return the counted files, or a size-based estimate when counting failed."""
    if not count_error:
        return file_count
    # Heuristic: estimate one code file per 3 units of the repository's
    # reported "size" value.
    estimate = (metadata.get("size") or 0) // 3
    # A truncated listing means the repo is large, so force the estimate
    # above the anonymous limit; any other failure only guarantees >= 1.
    floor = ANONYMOUS_FILE_LIMIT + 1 if count_error == "truncated" else 1
    return max(estimate, floor)


@router.post("/index", status_code=202)
async def start_anonymous_indexing(
    request: IndexRepoRequest,
    req: Request,
    response: Response,
    background_tasks: BackgroundTasks,
) -> dict:
    """Start indexing a public GitHub repository for anonymous users.

    Validates the session (one non-expired indexed repo per session), the
    repository (must exist and be public) and its size, then records a job
    and schedules ``run_indexing_job`` as a background task.

    Returns:
        A dict with ``job_id``, ``status``, ``estimated_time_seconds``,
        ``file_count`` and ``message``; partial runs also include
        ``partial`` and ``total_files``.

    Raises:
        HTTPException: 503 when Redis is unavailable; 409 when the session
            already has a non-expired indexed repo; 400 for an invalid,
            missing, private or too-large repository; 429 when GitHub
            rate-limits the metadata lookup; 502 for other GitHub errors.
    """
    start_time = time.time()

    # Fail fast: without Redis no job can be recorded, so there is no point
    # creating a session or spending GitHub API calls first.
    if not redis_client:
        raise HTTPException(status_code=503, detail="Indexing service unavailable (Redis down)")

    limiter = get_limiter()

    # Session validation
    session_token = get_session_token(req)
    client_ip = get_client_ip(req)

    if not session_token:
        session_token = limiter._generate_session_token()
        limiter.create_session(session_token)
        set_session_cookie(response, session_token)
        logger.info("Created new session for indexing",
                    session_token=session_token[:8], client_ip=client_ip)

    # Check if session already has an indexed repo
    session_data = limiter.get_session_data(session_token)

    if session_data.indexed_repo:
        if not _indexed_repo_is_expired(session_data.indexed_repo):
            logger.info("Session already has indexed repo",
                        session_token=session_token[:8],
                        existing_repo=session_data.indexed_repo.get("repo_id"))
            raise HTTPException(
                status_code=409,
                detail={
                    "error": "already_indexed",
                    "message": "You already have an indexed repository. Only 1 repo per session allowed.",
                    "indexed_repo": session_data.indexed_repo,
                }
            )
        logger.info("Existing indexed repo expired, allowing new indexing",
                    session_token=session_token[:8])

    # Validate GitHub URL
    owner, repo_name, parse_error = parse_github_url(request.github_url)
    if parse_error:
        raise HTTPException(status_code=400, detail={
            "error": "validation_failed", "reason": "invalid_url", "message": parse_error
        })

    metadata = await fetch_repo_metadata(owner, repo_name)
    if "error" in metadata:
        _raise_for_metadata_error(metadata)

    if metadata.get("private", False):
        raise HTTPException(status_code=400, detail={
            "error": "validation_failed", "reason": "private",
            "message": "This repository is private. Anonymous indexing only supports public repositories."
        })

    # `or "main"` also covers a default_branch that is present but empty/None.
    branch = request.branch or metadata.get("default_branch") or "main"
    counted, count_error = await count_code_files(owner, repo_name, branch)
    file_count = _resolve_file_count(counted, count_error, metadata)

    is_partial = False
    files_to_index = file_count

    if file_count > ANONYMOUS_FILE_LIMIT:
        if not request.partial:
            raise HTTPException(status_code=400, detail={
                "error": "validation_failed", "reason": "too_large",
                "message": f"Repository has {file_count:,} code files. "
                           f"Anonymous limit is {ANONYMOUS_FILE_LIMIT}. "
                           f"Use partial=true to index first {ANONYMOUS_FILE_LIMIT} files.",
                "file_count": file_count, "limit": ANONYMOUS_FILE_LIMIT,
                "hint": "Set partial=true to index a subset of files",
            })
        is_partial = True
        files_to_index = ANONYMOUS_FILE_LIMIT
        logger.info("Partial indexing enabled", total_files=file_count, indexing=files_to_index)

    # Create job and start background indexing.
    # Measured before job creation, so it covers session handling and
    # GitHub validation only.
    response_time_ms = int((time.time() - start_time) * 1000)
    job_manager = AnonymousIndexingJob(redis_client)
    job_id = job_manager.generate_job_id()

    job_manager.create_job(
        job_id=job_id, session_id=session_token, github_url=request.github_url,
        owner=owner, repo_name=repo_name, branch=branch,
        file_count=file_count, is_partial=is_partial, max_files=files_to_index,
    )

    background_tasks.add_task(
        run_indexing_job,
        job_manager=job_manager, indexer=indexer, limiter=limiter,
        job_id=job_id, session_id=session_token, github_url=request.github_url,
        owner=owner, repo_name=repo_name, branch=branch,
        file_count=files_to_index, max_files=files_to_index if is_partial else None,
    )

    logger.info("Indexing job queued", job_id=job_id, owner=owner, repo=repo_name,
                branch=branch, file_count=files_to_index, is_partial=is_partial,
                session_token=session_token[:8], response_time_ms=response_time_ms)

    # Rough estimate: 0.3 s per file, never less than 10 s.
    estimated_seconds = max(10, int(files_to_index * 0.3))
    result = {
        "job_id": job_id, "status": "queued",
        "estimated_time_seconds": estimated_seconds, "file_count": files_to_index,
        "message": f"Indexing started. Poll /playground/index/{job_id} for status.",
    }

    if is_partial:
        result["partial"] = True
        result["total_files"] = file_count
        result["message"] = (
            f"Partial indexing started ({files_to_index} of {file_count} files). "
            f"Poll /playground/index/{job_id} for status."
        )

    return result
| 194 | + |
| 195 | + |
@router.get("/index/{job_id}")
async def get_indexing_status(job_id: str, req: Request) -> dict:
    """Report the current state of an anonymous indexing job.

    Returns:
        A dict with the job id, status, timestamps and repository info,
        plus status-specific fields: ``progress`` while processing,
        ``repo_id``/``stats`` once completed, ``error``/``error_message``
        on failure.

    Raises:
        HTTPException: 400 when ``job_id`` lacks the ``idx_`` prefix, 503
            when Redis is unavailable, 404 when the job is not found.
    """
    has_valid_prefix = bool(job_id) and job_id.startswith("idx_")
    if not has_valid_prefix:
        raise HTTPException(status_code=400, detail={
            "error": "invalid_job_id", "message": "Invalid job ID format"
        })

    if not redis_client:
        raise HTTPException(status_code=503, detail="Indexing service unavailable (Redis down)")

    job = AnonymousIndexingJob(redis_client).get_job(job_id)
    if not job:
        raise HTTPException(status_code=404, detail={
            "error": "job_not_found", "message": "Job not found or has expired. Jobs expire after 1 hour."
        })

    status = job.get("status", "unknown")

    # Build the common part of the payload; key order mirrors the response shape.
    payload: dict = {"job_id": job_id, "status": status}
    payload["created_at"] = job.get("created_at")
    payload["updated_at"] = job.get("updated_at")
    payload["repository"] = {
        out_key: job.get(job_key)
        for out_key, job_key in (
            ("owner", "owner"),
            ("name", "repo_name"),
            ("branch", "branch"),
            ("github_url", "github_url"),
        )
    }

    if job.get("is_partial"):
        payload.update(partial=True, max_files=job.get("max_files"))

    # Statuses whose message is a fixed string.
    fixed_messages = {
        "queued": "Job is queued for processing",
        "cloning": "Cloning repository...",
        "processing": "Indexing files...",
        "completed": "Indexing completed successfully",
    }
    fixed = fixed_messages.get(status) if isinstance(status, str) else None
    if fixed is not None:
        payload["message"] = fixed

    if status == "processing":
        progress = job.get("progress")
        if progress:
            done = progress.get("files_processed", 0)
            total = progress.get("files_total", 1)
            # Guard against a zero/negative total before dividing.
            if total > 0:
                percent = round((done / total) * 100)
            else:
                percent = 0
            payload["progress"] = {
                "files_processed": done, "files_total": total,
                "functions_found": progress.get("functions_found", 0),
                "percent_complete": percent, "current_file": progress.get("current_file"),
            }
    elif status == "completed":
        payload["repo_id"] = job.get("repo_id")
        stats = job.get("stats")
        if stats:
            payload["stats"] = stats
    elif status == "failed":
        payload["message"] = job.get("error_message", "Indexing failed")
        payload["error"] = job.get("error", "unknown_error")
        payload["error_message"] = job.get("error_message")

    return payload
0 commit comments