Skip to content

Commit 0bdafa3

Browse files
committed
feat(playground): add GitHub URL validation endpoint (#124)
- Add POST /api/v1/playground/validate-repo endpoint
- Parse and validate GitHub URLs (owner/repo extraction)
- Fetch repo metadata from GitHub API (public/private, stars, language)
- Count code files using tree API (filters by CODE_EXTENSIONS)
- Handle edge cases: 404, rate limits, truncated trees, timeouts
- Cache validation results (5 min TTL)
- Enforce 200-file limit for anonymous indexing
- Add 25 comprehensive tests

Closes #124
1 parent 8a057bf commit 0bdafa3

2 files changed

Lines changed: 651 additions & 1 deletion

File tree

backend/routes/playground.py

Lines changed: 287 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,16 @@
77
- Global circuit breaker: 10k searches/hour (cost protection)
88
"""
99
import os
10+
import re
11+
import httpx
1012
from typing import Optional
1113
from fastapi import APIRouter, HTTPException, Request, Response
12-
from pydantic import BaseModel
14+
from pydantic import BaseModel, field_validator
1315
import time
1416

1517
from dependencies import indexer, cache, repo_manager, redis_client
1618
from services.input_validator import InputValidator
19+
from services.repo_validator import RepoValidator
1720
from services.observability import logger
1821
from services.playground_limiter import PlaygroundLimiter, get_playground_limiter
1922

@@ -27,13 +30,40 @@
2730
SESSION_COOKIE_MAX_AGE = 86400 # 24 hours
2831
IS_PRODUCTION = os.getenv("ENVIRONMENT", "development").lower() == "production"
2932

33+
# --- GitHub repository validation settings ---

# Matches http(s)://github.com/<owner>/<repo> with an optional trailing slash.
# The two path segments are exposed as the named groups "owner" and "repo".
GITHUB_URL_PATTERN = re.compile(
    r"^https?://github\.com/(?P<owner>[a-zA-Z0-9_.-]+)/(?P<repo>[a-zA-Z0-9_.-]+)/?$"
)

GITHUB_API_BASE = "https://api.github.com"
GITHUB_API_TIMEOUT = 10.0  # seconds; passed as the httpx client timeout
ANONYMOUS_FILE_LIMIT = 200  # Max files for anonymous indexing
VALIDATION_CACHE_TTL = 300  # seconds (5 minutes)
41+
3042

3143
class PlaygroundSearchRequest(BaseModel):
    """Pydantic model describing a playground search request."""

    # Search text; the only field without a default, so it is required.
    query: str
    # Name of the demo repository to search; defaults to "flask".
    demo_repo: str = "flask"
    # Requested number of results; defaults to 10.
    max_results: int = 10
3547

3648

49+
class ValidateRepoRequest(BaseModel):
    """Payload accepted by the GitHub repository validation endpoint."""

    github_url: str

    @field_validator("github_url")
    @classmethod
    def validate_github_url_format(cls, v: str) -> str:
        """
        Run cheap sanity checks on the submitted URL.

        Surrounding whitespace is removed first. The value must then be
        non-empty, begin with an http(s) scheme, and mention "github.com"
        (compared case-insensitively). This is only a coarse pre-filter,
        not a full structural validation of the URL.

        Returns:
            The whitespace-stripped URL.

        Raises:
            ValueError: if any of the checks above fails.
        """
        url = v.strip()

        if url == "":
            raise ValueError("GitHub URL is required")

        has_http_scheme = url.startswith("http://") or url.startswith("https://")
        if not has_http_scheme:
            raise ValueError("URL must start with http:// or https://")

        mentions_github = "github.com" in url.lower()
        if not mentions_github:
            raise ValueError("URL must be a GitHub repository URL")

        return url
65+
66+
3767
async def load_demo_repos():
3868
"""Load pre-indexed demo repos. Called from main.py on startup."""
3969
# Note: We mutate DEMO_REPO_IDS dict, no need for 'global' statement
@@ -305,3 +335,259 @@ async def get_playground_stats():
305335
limiter = _get_limiter()
306336
stats = limiter.get_usage_stats()
307337
return stats
338+
339+
340+
def _parse_github_url(url: str) -> tuple[Optional[str], Optional[str], Optional[str]]:
    """
    Parse GitHub URL to extract owner and repo.

    Surrounding whitespace and trailing slashes are removed before the URL
    is matched against GITHUB_URL_PATTERN.

    Args:
        url: Candidate repository URL, e.g. https://github.com/owner/repo

    Returns:
        (owner, repo, error) - error is None if successful; owner and repo
        are None when error is set.
    """
    invalid_message = "Invalid GitHub URL format. Expected: https://github.com/owner/repo"

    match = GITHUB_URL_PATTERN.match(url.strip().rstrip("/"))
    if not match:
        return None, None, invalid_message

    owner = match.group("owner")
    repo = match.group("repo")

    # "." and ".." satisfy the pattern's character class but are URL path
    # dot-segments (RFC 3986 section 5.2.4). Callers interpolate owner/repo
    # into GitHub API paths, where an HTTP client may collapse such segments
    # and hit a different endpoint than intended, so reject them here.
    dot_segments = (".", "..")
    if owner in dot_segments or repo in dot_segments:
        return None, None, invalid_message

    return owner, repo, None
351+
352+
353+
async def _fetch_repo_metadata(owner: str, repo: str) -> dict:
    """
    Fetch repository metadata from GitHub API.

    Args:
        owner: Repository owner (user or organization) path segment.
        repo: Repository name path segment.

    Returns:
        The decoded JSON body when GitHub answers 200. Otherwise a dict with
        "error" and "message" keys, where "error" is one of "not_found",
        "rate_limited", "api_error", "timeout" or "request_failed".
        This function does not raise; failures are reported in the dict.
    """
    url = f"{GITHUB_API_BASE}/repos/{owner}/{repo}"
    headers = {
        "Accept": "application/vnd.github.v3+json",
        "User-Agent": "OpenCodeIntel/1.0",
    }

    # Add GitHub token if available (for higher rate limits)
    github_token = os.getenv("GITHUB_TOKEN")
    if github_token:
        headers["Authorization"] = f"token {github_token}"

    # GitHub answers renamed/transferred repositories with a redirect, and
    # httpx does not follow redirects unless asked to.
    async with httpx.AsyncClient(
        timeout=GITHUB_API_TIMEOUT, follow_redirects=True
    ) as client:
        try:
            response = await client.get(url, headers=headers)
            status = response.status_code

            if status == 404:
                return {"error": "not_found", "message": "Repository not found"}
            # GitHub documents both 403 and 429 for exceeded rate limits.
            if status in (403, 429):
                return {
                    "error": "rate_limited",
                    "message": "GitHub API rate limit exceeded"
                }
            if status != 200:
                return {
                    "error": "api_error",
                    "message": f"GitHub API error: {status}"
                }

            return response.json()
        except httpx.TimeoutException:
            return {"error": "timeout", "message": "GitHub API request timed out"}
        except Exception as e:
            # Boundary handler: any other transport/decoding failure is
            # logged and reported to the caller as an error dict.
            logger.error("GitHub API request failed", error=str(e))
            return {"error": "request_failed", "message": str(e)}
393+
394+
395+
def _is_indexable_code_path(path: str, code_extensions, skip_dirs) -> bool:
    """Return True if *path* is outside every skipped directory and has a code extension."""
    # Skip if any path component is an excluded directory name
    if any(part in skip_dirs for part in path.split("/")):
        return False

    # Extension is the text after the last "." in the path, lower-cased
    ext = "." + path.rsplit(".", 1)[-1] if "." in path else ""
    return ext.lower() in code_extensions


async def _count_code_files(
    owner: str, repo: str, default_branch: str
) -> tuple[int, Optional[str]]:
    """
    Count code files in repository using GitHub tree API.

    Only "blob" entries are counted, filtered by RepoValidator.CODE_EXTENSIONS
    and RepoValidator.SKIP_DIRS.

    Args:
        owner: Repository owner path segment.
        repo: Repository name path segment.
        default_branch: Branch whose recursive tree is listed.

    Returns:
        (file_count, error) - error is None if successful.
        (-1, "truncated") is returned when GitHub reports a truncated tree;
        the caller is expected to estimate the count itself in that case.
        Any other failure yields (0, <message>). This function does not raise.
    """
    url = f"{GITHUB_API_BASE}/repos/{owner}/{repo}/git/trees/{default_branch}?recursive=1"
    headers = {
        "Accept": "application/vnd.github.v3+json",
        "User-Agent": "OpenCodeIntel/1.0",
    }

    github_token = os.getenv("GITHUB_TOKEN")
    if github_token:
        headers["Authorization"] = f"token {github_token}"

    # GitHub answers renamed/transferred repositories with a redirect, and
    # httpx does not follow redirects unless asked to.
    async with httpx.AsyncClient(
        timeout=GITHUB_API_TIMEOUT, follow_redirects=True
    ) as client:
        try:
            response = await client.get(url, headers=headers)
            status = response.status_code

            if status == 404:
                return 0, "Could not fetch repository tree"
            # GitHub documents both 403 and 429 for exceeded rate limits.
            if status in (403, 429):
                return 0, "GitHub API rate limit exceeded"
            if status != 200:
                return 0, f"GitHub API error: {status}"

            data = response.json()

            # Tree was truncated (very large repo): the listing is incomplete,
            # so signal the caller with a sentinel instead of a partial count.
            if data.get("truncated", False):
                return -1, "truncated"

            code_extensions = RepoValidator.CODE_EXTENSIONS
            skip_dirs = RepoValidator.SKIP_DIRS

            count = sum(
                1
                for item in data.get("tree", [])
                if item.get("type") == "blob"
                and _is_indexable_code_path(
                    item.get("path", ""), code_extensions, skip_dirs
                )
            )
            return count, None
        except httpx.TimeoutException:
            return 0, "GitHub API request timed out"
        except Exception as e:
            # Boundary handler: any other transport/decoding failure is
            # logged and reported to the caller as an error string.
            logger.error("GitHub tree API failed", error=str(e))
            return 0, str(e)
460+
461+
462+
@router.post("/validate-repo")
async def validate_github_repo(request: ValidateRepoRequest, req: Request):
    """
    Validate a GitHub repository URL for anonymous indexing.

    Checks:
    - URL format is valid
    - Repository exists and is public
    - File count is within anonymous limit (200 files)

    Response varies based on validation result (see issue #124).

    Returns:
        A dict. Invalid URLs and missing repositories yield
        {"valid": False, "reason", "message"}. Private repositories yield
        valid=True with can_index=False. Public repositories yield the full
        metadata dict with can_index set according to ANONYMOUS_FILE_LIMIT.

    Raises:
        HTTPException: 429 when GitHub reports rate limiting on the metadata
            request; 502 for any other metadata fetch failure.
    """
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # wall-clock adjustments.
    start_time = time.perf_counter()

    # Check cache first
    cache_key = f"validate:{request.github_url}"
    cached = cache.get(cache_key) if cache else None
    if cached:
        logger.info("Returning cached validation", url=request.github_url[:50])
        return cached

    # Parse URL
    owner, repo_name, parse_error = _parse_github_url(request.github_url)
    if parse_error:
        return {
            "valid": False,
            "reason": "invalid_url",
            "message": parse_error,
        }

    # Fetch repo metadata from GitHub
    metadata = await _fetch_repo_metadata(owner, repo_name)

    if "error" in metadata:
        error_type = metadata["error"]
        if error_type == "not_found":
            return {
                "valid": False,
                "reason": "not_found",
                "message": "Repository not found. Check the URL or ensure it's public.",
            }
        if error_type == "rate_limited":
            raise HTTPException(
                status_code=429,
                detail={"message": "GitHub API rate limit exceeded. Try again later."}
            )
        raise HTTPException(
            status_code=502,
            detail={"message": metadata.get("message", "Failed to fetch repository info")}
        )

    # Check if private
    if metadata.get("private", False):
        return {
            "valid": True,
            "repo_name": repo_name,
            "owner": owner,
            "is_public": False,
            "can_index": False,
            "reason": "private",
            "message": "This repository is private. "
                       "Anonymous indexing only supports public repositories.",
        }

    # Get file count
    default_branch = metadata.get("default_branch", "main")
    repo_size_kb = metadata.get("size", 0)  # GitHub size is in KB
    file_count, count_error = await _count_code_files(owner, repo_name, default_branch)

    # True when the tree request failed and file_count is only a guess.
    count_is_fallback = False

    if count_error == "truncated":
        # Truncated tree (very large repo): rough estimate of 1 code file per
        # 3KB, floored just above the limit so such repos are never indexable.
        file_count = max(repo_size_kb // 3, ANONYMOUS_FILE_LIMIT + 1)
        logger.info("Using estimated file count for large repo",
                    owner=owner, repo=repo_name, estimated=file_count)
    elif count_error:
        logger.warning("Could not count files", owner=owner, repo=repo_name, error=count_error)
        # Fall back to size-based estimate
        file_count = max(repo_size_kb // 3, 1)
        count_is_fallback = True

    # Build response
    response_time_ms = int((time.perf_counter() - start_time) * 1000)

    # Fields shared by both outcomes; outcome-specific keys are appended in
    # the same order the response has always used.
    result = {
        "valid": True,
        "repo_name": repo_name,
        "owner": owner,
        "is_public": True,
        "default_branch": default_branch,
        "file_count": file_count,
        "size_kb": repo_size_kb,
        "language": metadata.get("language"),
        "stars": metadata.get("stargazers_count", 0),
    }
    if file_count > ANONYMOUS_FILE_LIMIT:
        result["can_index"] = False
        result["reason"] = "too_large"
        result["message"] = (
            f"Repository has {file_count:,} code files. "
            f"Anonymous limit is {ANONYMOUS_FILE_LIMIT}."
        )
        result["limit"] = ANONYMOUS_FILE_LIMIT
    else:
        result["can_index"] = True
        result["message"] = "Ready to index"
    result["response_time_ms"] = response_time_ms

    # Cache successful validations only. A fallback estimate comes from a
    # failed tree request (timeout, rate limit, ...), so caching it would
    # serve a guess as authoritative for the whole TTL.
    if cache and not count_is_fallback:
        cache.set(cache_key, result, ttl=VALIDATION_CACHE_TTL)

    logger.info("Validated GitHub repo",
                owner=owner, repo=repo_name,
                file_count=file_count, can_index=result["can_index"],
                response_time_ms=response_time_ms)

    return result

0 commit comments

Comments
 (0)