|
7 | 7 | - Global circuit breaker: 10k searches/hour (cost protection) |
8 | 8 | """ |
9 | 9 | import os |
| 10 | +import re |
| 11 | +import httpx |
10 | 12 | from typing import Optional |
11 | 13 | from fastapi import APIRouter, HTTPException, Request, Response |
12 | | -from pydantic import BaseModel |
| 14 | +from pydantic import BaseModel, field_validator |
13 | 15 | import time |
14 | 16 |
|
15 | 17 | from dependencies import indexer, cache, repo_manager, redis_client |
16 | 18 | from services.input_validator import InputValidator |
| 19 | +from services.repo_validator import RepoValidator |
17 | 20 | from services.observability import logger |
18 | 21 | from services.playground_limiter import PlaygroundLimiter, get_playground_limiter |
19 | 22 |
|
|
# Session cookie lifetime in seconds (24 hours).
SESSION_COOKIE_MAX_AGE = 24 * 60 * 60
# True only when the ENVIRONMENT variable (default "development") equals
# "production", compared case-insensitively.
IS_PRODUCTION = os.environ.get("ENVIRONMENT", "development").lower() == "production"
29 | 32 |
|
# GitHub validation config
#
# GITHUB_URL_PATTERN matches repository root URLs of the form
# https://github.com/<owner>/<repo> and exposes the named groups "owner"
# and "repo".
#   - The host is matched case-insensitively (hostnames are not
#     case-sensitive) and an optional "www." prefix is accepted.
#   - A trailing ".git" (clone-URL form) is matched but kept out of the
#     "repo" group, so the captured name is the bare repository name.
#   - Deeper paths (e.g. /tree/main) and look-alike hosts do not match.
GITHUB_URL_PATTERN = re.compile(
    r"^https?://(?i:(?:www\.)?github\.com)/"
    r"(?P<owner>[a-zA-Z0-9_.-]+)/"
    r"(?P<repo>[a-zA-Z0-9_.-]+?)(?:\.git)?/?$"
)
ANONYMOUS_FILE_LIMIT = 200  # Max files for anonymous indexing
GITHUB_API_BASE = "https://api.github.com"
GITHUB_API_TIMEOUT = 10.0  # seconds
VALIDATION_CACHE_TTL = 300  # 5 minutes
| 41 | + |
30 | 42 |
|
# Pydantic model with the fields of a playground search request.
# NOTE(review): presumably the JSON body of the playground search endpoint;
# the endpoint itself is outside this view - confirm against the route.
class PlaygroundSearchRequest(BaseModel):
    # Search text; required (no default).
    query: str
    # Which demo repository to search; defaults to "flask".
    demo_repo: str = "flask"
    # Requested number of results; defaults to 10. No bounds are declared
    # here, so any clamping must happen in the consumer.
    max_results: int = 10
35 | 47 |
|
36 | 48 |
|
class ValidateRepoRequest(BaseModel):
    """Request body for GitHub repo validation."""

    github_url: str

    @field_validator("github_url")
    @classmethod
    def validate_github_url_format(cls, v: str) -> str:
        """
        Run cheap sanity checks on the submitted URL.

        This is only a coarse filter: it trims surrounding whitespace,
        requires an http(s) scheme, and requires the text "github.com" to
        appear somewhere in the URL.

        Returns:
            The URL with surrounding whitespace removed.

        Raises:
            ValueError: if the URL is empty, has no http(s) scheme, or does
                not mention github.com.
        """
        trimmed = v.strip()

        if trimmed == "":
            raise ValueError("GitHub URL is required")

        has_http_scheme = trimmed.startswith("http://") or trimmed.startswith("https://")
        if not has_http_scheme:
            raise ValueError("URL must start with http:// or https://")

        # Case-insensitive substring test, not a hostname check.
        if trimmed.lower().find("github.com") == -1:
            raise ValueError("URL must be a GitHub repository URL")

        return trimmed
| 65 | + |
| 66 | + |
37 | 67 | async def load_demo_repos(): |
38 | 68 | """Load pre-indexed demo repos. Called from main.py on startup.""" |
39 | 69 | # Note: We mutate DEMO_REPO_IDS dict, no need for 'global' statement |
@@ -305,3 +335,259 @@ async def get_playground_stats(): |
305 | 335 | limiter = _get_limiter() |
306 | 336 | stats = limiter.get_usage_stats() |
307 | 337 | return stats |
| 338 | + |
| 339 | + |
def _parse_github_url(url: str) -> tuple[Optional[str], Optional[str], Optional[str]]:
    """
    Parse GitHub URL to extract owner and repo.

    Surrounding whitespace and trailing slashes are ignored before the URL
    is matched against GITHUB_URL_PATTERN.

    Args:
        url: Candidate repository URL, e.g. https://github.com/owner/repo

    Returns:
        (owner, repo, error) - error is None if successful; on failure
        owner and repo are None and error is a user-facing message.
    """
    format_error = "Invalid GitHub URL format. Expected: https://github.com/owner/repo"

    match = GITHUB_URL_PATTERN.match(url.strip().rstrip("/"))
    if not match:
        return None, None, format_error

    owner = match.group("owner")
    repo = match.group("repo")

    # "." and ".." satisfy the pattern's character class but are path
    # dot-segments, not names. The values returned here get interpolated
    # into GitHub API URL paths, where a dot-segment could change which
    # endpoint is requested, so reject them as malformed input.
    if owner in (".", "..") or repo in (".", ".."):
        return None, None, format_error

    return owner, repo, None
| 351 | + |
| 352 | + |
async def _fetch_repo_metadata(owner: str, repo: str) -> dict:
    """
    Fetch repository metadata from GitHub API.

    Issues GET {GITHUB_API_BASE}/repos/{owner}/{repo}. If the GITHUB_TOKEN
    environment variable is set it is sent as an Authorization header.

    Args:
        owner: Repository owner (user or organisation) as parsed from the URL.
        repo: Repository name as parsed from the URL.

    Returns:
        On HTTP 200, the decoded JSON body of the response.
        Otherwise a dict with "error" and "message" keys, where "error" is
        one of: "not_found", "rate_limited", "api_error", "timeout",
        "request_failed". This function does not raise; failures are
        reported through the returned dict.
    """
    url = f"{GITHUB_API_BASE}/repos/{owner}/{repo}"
    headers = {
        "Accept": "application/vnd.github.v3+json",
        "User-Agent": "OpenCodeIntel/1.0",
    }

    # Add GitHub token if available (for higher rate limits)
    github_token = os.getenv("GITHUB_TOKEN")
    if github_token:
        headers["Authorization"] = f"token {github_token}"

    # follow_redirects: GitHub answers requests for renamed or transferred
    # repositories with a redirect, and httpx only follows redirects when
    # asked to. Without it such repos surface as "api_error: 301".
    async with httpx.AsyncClient(
        timeout=GITHUB_API_TIMEOUT, follow_redirects=True
    ) as client:
        try:
            response = await client.get(url, headers=headers)

            if response.status_code == 404:
                return {"error": "not_found", "message": "Repository not found"}
            # GitHub signals rate limiting with either 403 or 429.
            if response.status_code in (403, 429):
                return {
                    "error": "rate_limited",
                    "message": "GitHub API rate limit exceeded"
                }
            if response.status_code != 200:
                return {
                    "error": "api_error",
                    "message": f"GitHub API error: {response.status_code}"
                }

            return response.json()
        except httpx.TimeoutException:
            return {"error": "timeout", "message": "GitHub API request timed out"}
        except Exception as e:
            # Deliberately broad: this helper is best-effort and reports
            # every failure (transport errors, undecodable JSON, ...) to the
            # caller as data instead of raising.
            logger.error("GitHub API request failed", error=str(e))
            return {"error": "request_failed", "message": str(e)}
| 393 | + |
| 394 | + |
async def _count_code_files(
    owner: str, repo: str, default_branch: str
) -> tuple[int, Optional[str]]:
    """
    Count code files in repository using GitHub tree API.

    Fetches the recursive git tree for `default_branch` and counts blobs
    whose lower-cased extension is in RepoValidator.CODE_EXTENSIONS and
    whose path has no component listed in RepoValidator.SKIP_DIRS.

    Args:
        owner: Repository owner.
        repo: Repository name.
        default_branch: Branch whose tree is inspected.

    Returns:
        (file_count, error) - error is None if successful.
        (-1, "truncated") is a sentinel meaning GitHub truncated the tree
        listing; the caller is expected to estimate the count itself.
        Any other failure yields (0, <message>). This function does not
        raise; failures are reported through the returned tuple.
    """
    # Function-scope import so this block does not depend on the file's
    # import section.
    from urllib.parse import quote

    # The branch name comes from a remote response; percent-encode it so
    # characters such as "#" or "?" cannot cut the URL path short. "/" is
    # left as-is because branch names may legitimately contain it.
    tree_ref = quote(str(default_branch), safe="/")
    url = f"{GITHUB_API_BASE}/repos/{owner}/{repo}/git/trees/{tree_ref}?recursive=1"
    headers = {
        "Accept": "application/vnd.github.v3+json",
        "User-Agent": "OpenCodeIntel/1.0",
    }

    github_token = os.getenv("GITHUB_TOKEN")
    if github_token:
        headers["Authorization"] = f"token {github_token}"

    # follow_redirects: renamed or transferred repositories are answered
    # with a redirect, which httpx only follows when asked to.
    async with httpx.AsyncClient(
        timeout=GITHUB_API_TIMEOUT, follow_redirects=True
    ) as client:
        try:
            response = await client.get(url, headers=headers)

            if response.status_code == 404:
                return 0, "Could not fetch repository tree"
            # GitHub signals rate limiting with either 403 or 429.
            if response.status_code in (403, 429):
                return 0, "GitHub API rate limit exceeded"
            if response.status_code != 200:
                return 0, f"GitHub API error: {response.status_code}"

            data = response.json()

            # Check if tree was truncated (very large repos). No estimate is
            # made here; the sentinel tells the caller to derive one from
            # the repository size it already holds.
            if data.get("truncated", False):
                return -1, "truncated"

            # Count files with code extensions
            code_extensions = RepoValidator.CODE_EXTENSIONS
            skip_dirs = RepoValidator.SKIP_DIRS

            count = 0
            for item in data.get("tree", []):
                if item.get("type") != "blob":
                    continue

                path = item.get("path", "")

                # Skip if any path component is an excluded name
                path_parts = path.split("/")
                if any(part in skip_dirs for part in path_parts):
                    continue

                # Take the extension from the file name only, so a dot in a
                # directory name (e.g. "pkg.v2/Makefile") is never mistaken
                # for an extension separator.
                _, dot, suffix = path_parts[-1].rpartition(".")
                ext = "." + suffix if dot else ""
                if ext.lower() in code_extensions:
                    count += 1

            return count, None
        except httpx.TimeoutException:
            return 0, "GitHub API request timed out"
        except Exception as e:
            # Deliberately broad: counting is best-effort and the caller
            # falls back to an estimate when an error is returned.
            logger.error("GitHub tree API failed", error=str(e))
            return 0, str(e)
| 460 | + |
| 461 | + |
@router.post("/validate-repo")
async def validate_github_repo(request: ValidateRepoRequest, req: Request):
    """
    Validate a GitHub repository URL for anonymous indexing.

    Checks:
    - URL format is valid
    - Repository exists and is public
    - File count is within anonymous limit (200 files)

    Response varies based on validation result (see issue #124):
    - invalid URL / repository not found: {"valid": False, "reason", "message"}
    - private repository: valid, but can_index is False with reason "private"
    - public repository: repo details plus can_index; when the file count
      exceeds the limit, reason is "too_large" and "limit" is included

    Raises:
        HTTPException(429): GitHub reported a rate limit on the metadata call.
        HTTPException(502): any other failure fetching repository metadata.
    """
    start_time = time.time()

    # Parse URL first: it is cheap and lets the cache be keyed on the
    # repository rather than on the exact spelling of the URL.
    owner, repo_name, parse_error = _parse_github_url(request.github_url)
    if parse_error:
        return {
            "valid": False,
            "reason": "invalid_url",
            "message": parse_error,
        }

    # Check cache. GitHub owner/repo names are case-insensitive, so the key
    # is lower-cased; URL variants of one repo (trailing slash, letter case)
    # then share a single entry instead of each costing GitHub API calls.
    cache_key = f"validate:{owner.lower()}/{repo_name.lower()}"
    cached = cache.get(cache_key) if cache else None
    if cached:
        logger.info("Returning cached validation", url=request.github_url[:50])
        return cached

    # Fetch repo metadata from GitHub
    metadata = await _fetch_repo_metadata(owner, repo_name)

    if "error" in metadata:
        error_type = metadata["error"]
        if error_type == "not_found":
            return {
                "valid": False,
                "reason": "not_found",
                "message": "Repository not found. Check the URL or ensure it's public.",
            }
        if error_type == "rate_limited":
            raise HTTPException(
                status_code=429,
                detail={"message": "GitHub API rate limit exceeded. Try again later."}
            )
        raise HTTPException(
            status_code=502,
            detail={"message": metadata.get("message", "Failed to fetch repository info")}
        )

    # Check if private
    if metadata.get("private", False):
        return {
            "valid": True,
            "repo_name": repo_name,
            "owner": owner,
            "is_public": False,
            "can_index": False,
            "reason": "private",
            "message": "This repository is private. "
                       "Anonymous indexing only supports public repositories.",
        }

    # Get file count
    default_branch = metadata.get("default_branch", "main")
    file_count, count_error = await _count_code_files(owner, repo_name, default_branch)

    # GitHub reports repository size in KB
    repo_size_kb = metadata.get("size", 0)
    # False when file_count is only a guess made after the tree API failed.
    count_is_cacheable = True

    if count_error == "truncated":
        # Tree listing was truncated (very large repo).
        # Rough estimate: 1 code file per 3KB, and never below the limit,
        # so a truncated repo is always reported as too large.
        file_count = max(repo_size_kb // 3, ANONYMOUS_FILE_LIMIT + 1)
        logger.info("Using estimated file count for large repo",
                    owner=owner, repo=repo_name, estimated=file_count)
    elif count_error:
        logger.warning("Could not count files", owner=owner, repo=repo_name, error=count_error)
        # Fall back to size-based estimate
        file_count = max(repo_size_kb // 3, 1)
        # The failure may be transient (timeout, rate limit); do not pin
        # this guess in the cache so the next request retries a real count.
        count_is_cacheable = False

    # Build response
    response_time_ms = int((time.time() - start_time) * 1000)
    can_index = file_count <= ANONYMOUS_FILE_LIMIT

    result = {
        "valid": True,
        "repo_name": repo_name,
        "owner": owner,
        "is_public": True,
        "default_branch": default_branch,
        "file_count": file_count,
        "size_kb": repo_size_kb,
        "language": metadata.get("language"),
        "stars": metadata.get("stargazers_count", 0),
        "can_index": can_index,
    }
    if can_index:
        result["message"] = "Ready to index"
    else:
        result["reason"] = "too_large"
        result["message"] = (
            f"Repository has {file_count:,} code files. "
            f"Anonymous limit is {ANONYMOUS_FILE_LIMIT}."
        )
        result["limit"] = ANONYMOUS_FILE_LIMIT
    result["response_time_ms"] = response_time_ms

    # Cache successful validations
    if cache and count_is_cacheable:
        cache.set(cache_key, result, ttl=VALIDATION_CACHE_TTL)

    logger.info("Validated GitHub repo",
                owner=owner, repo=repo_name,
                file_count=file_count, can_index=result["can_index"],
                response_time_ms=response_time_ms)

    return result
0 commit comments