diff --git a/backend/routes/repos.py b/backend/routes/repos.py
index f3f99a4..a630aaf 100644
--- a/backend/routes/repos.py
+++ b/backend/routes/repos.py
@@ -3,10 +3,14 @@
 from pydantic import BaseModel, field_validator
 from typing import List, Optional
 from pathlib import Path
+from urllib.parse import quote
 import hashlib
+import os
+import re
 import time
 import asyncio
 import git
+import httpx
 
 from dependencies import (
     indexer, repo_manager, metrics, redis_client,
@@ -149,6 +153,259 @@ async def add_repository(
         raise HTTPException(status_code=500, detail="Failed to add repository")
 
 
+# -- GitHub API helpers for pre-clone analysis --------------------------------
+
+_GITHUB_API_BASE = "https://api.github.com"
+_GITHUB_URL_RE = re.compile(
+    r"^https?://github\.com/(?P<owner>[a-zA-Z0-9_.\-]+)/(?P<repo>[a-zA-Z0-9_.\-]+)/?$"
+)
+
+
+def _github_headers() -> dict:
+    """Build GitHub API request headers with optional auth token."""
+    headers = {"Accept": "application/vnd.github.v3+json", "User-Agent": "OpenCodeIntel/1.0"}
+    token = os.getenv("GITHUB_TOKEN")
+    if token:
+        headers["Authorization"] = f"token {token}"
+    return headers
+
+
+async def _fetch_directory_tree(
+    owner: str, repo: str, branch: str,
+    client: Optional[httpx.AsyncClient] = None,
+) -> dict:
+    """Fetch directory structure from GitHub Tree API.
+
+    Returns a dict with directories (name, path, file_count) grouped
+    at the most useful level -- top-level for flat repos, package-level
+    for monorepos with a packages/ directory.
+
+    Args:
+        client: Reuse an existing httpx client to avoid opening a second
+            connection. If None, creates and closes its own.
+    """
+    from services.repo_validator import RepoValidator
+
+    # Encode branch for URL safety -- "feature/foo" -> "feature%2Ffoo"
+    encoded_branch = quote(branch, safe="")
+    url = f"{_GITHUB_API_BASE}/repos/{owner}/{repo}/git/trees/{encoded_branch}?recursive=1"
+
+    async def _get(c: httpx.AsyncClient) -> httpx.Response:
+        return await c.get(url, headers=_github_headers())
+
+    try:
+        if client:
+            response = await _get(client)
+        else:
+            async with httpx.AsyncClient(timeout=15.0) as c:
+                response = await _get(c)
+    except httpx.TimeoutException:
+        raise HTTPException(status_code=504, detail="GitHub API request timed out")
+    except httpx.RequestError as e:
+        logger.error("GitHub tree API network error", error=str(e))
+        raise HTTPException(status_code=502, detail="Failed to connect to GitHub API")
+
+    if response.status_code == 404:
+        raise HTTPException(status_code=404, detail="Repository or branch not found")
+    if response.status_code == 403:
+        raise HTTPException(status_code=429, detail="GitHub API rate limit exceeded")
+    if response.status_code != 200:
+        raise HTTPException(status_code=502, detail=f"GitHub API error: {response.status_code}")
+
+    try:
+        data = response.json()
+    except ValueError:
+        raise HTTPException(status_code=502, detail="Invalid response from GitHub API")
+    truncated = data.get("truncated", False)
+
+    code_extensions = RepoValidator.CODE_EXTENSIONS
+    skip_dirs = RepoValidator.SKIP_DIRS
+
+    # Count code files per top-level directory
+    dir_counts: dict[str, int] = {}
+    total_files = 0
+
+    for item in data.get("tree", []):
+        if item.get("type") != "blob":
+            continue
+        path = item.get("path", "")
+        parts = path.split("/")
+        if any(part in skip_dirs for part in parts):
+            continue
+        ext = "." + path.rsplit(".", 1)[-1] if "." in path else ""
+        if ext.lower() not in code_extensions:
+            continue
+
+        total_files += 1
+
+        # Group by top-level dir, or "(root)" for root-level files
+        if len(parts) == 1:
+            dir_counts["(root)"] = dir_counts.get("(root)", 0) + 1
+        else:
+            top = parts[0]
+            # For monorepos: if top is packages/libs/apps, group one level deeper
+            if top in ("packages", "libs", "apps", "modules", "crates") and len(parts) >= 3:
+                key = f"{parts[0]}/{parts[1]}"
+            else:
+                key = top
+            dir_counts[key] = dir_counts.get(key, 0) + 1
+
+    # Indexing is function-level, not file-level. Estimate function counts
+    # using the same multiplier the tier system uses for limit checks.
+    avg_fn = RepoValidator.AVG_FUNCTIONS_PER_FILE  # 25
+
+    # Build sorted directory list with estimated function counts
+    directories = sorted(
+        [
+            {
+                "name": d, "path": d,
+                "file_count": c,
+                "estimated_functions": c * avg_fn,
+            }
+            for d, c in dir_counts.items() if d != "(root)"
+        ],
+        key=lambda x: -x["file_count"],
+    )
+
+    root_files = dir_counts.get("(root)", 0)
+    if root_files > 0:
+        directories.append({
+            "name": "(root files)", "path": ".",
+            "file_count": root_files,
+            "estimated_functions": root_files * avg_fn,
+        })
+
+    total_estimated = total_files * avg_fn
+
+    # Suggest directory picker for large repos
+    suggestion = None
+    if total_files > 500 or len(directories) > 10:
+        suggestion = "large_repo"
+
+    return {
+        "directories": directories,
+        "total_files": total_files,
+        "total_estimated_functions": total_estimated,
+        "total_directories": len(directories),
+        "truncated": truncated,
+        "suggestion": suggestion,
+    }
+
+
+class AnalyzeRepoRequest(BaseModel):
+    """Request body for pre-clone repo analysis."""
+    github_url: str
+
+    @field_validator("github_url")
+    @classmethod
+    def validate_url(cls, v: str) -> str:
+        v = v.strip().rstrip("/")
+        if not v:
+            raise ValueError("GitHub URL is required")
+        if not _GITHUB_URL_RE.match(v):
+            raise ValueError(
+                "Invalid GitHub URL. Expected: https://github.com/owner/repo"
+            )
+        return v
+
+
+_ANALYZE_CACHE_TTL = 86400  # 24 hours -- directory structure rarely changes
+
+
+@router.post("/analyze")
+async def analyze_repository(request: AnalyzeRepoRequest) -> dict:
+    """Analyze a GitHub repo's directory structure WITHOUT cloning.
+
+    Returns directory tree with file counts so the user can select
+    which directories to index (monorepo subset selection).
+
+    Results are cached for 24 hours (see _ANALYZE_CACHE_TTL) since
+    directory structure rarely changes.
+    """
+    match = _GITHUB_URL_RE.match(request.github_url)
+    if not match:
+        raise HTTPException(
+            status_code=400,
+            detail="Invalid GitHub URL. Expected: https://github.com/owner/repo",
+        )
+
+    owner = match.group("owner")
+    repo_name = match.group("repo").removesuffix(".git")
+
+    # Check cache first (same pattern as validate-repo)
+    from dependencies import cache
+    cache_key = f"analyze:{owner}/{repo_name}"
+    cached = cache.get(cache_key) if cache else None
+    if cached:
+        logger.info("Returning cached analysis", owner=owner, repo=repo_name)
+        return cached
+
+    # Single httpx client for both GitHub API calls
+    try:
+        async with httpx.AsyncClient(timeout=15.0) as client:
+            # 1. Fetch repo metadata for default branch and size
+            meta_resp = await client.get(
+                f"{_GITHUB_API_BASE}/repos/{owner}/{repo_name}",
+                headers=_github_headers(),
+            )
+
+            if meta_resp.status_code == 404:
+                raise HTTPException(status_code=404, detail="Repository not found")
+            if meta_resp.status_code == 403:
+                raise HTTPException(status_code=429, detail="GitHub API rate limit exceeded")
+            if meta_resp.status_code != 200:
+                raise HTTPException(status_code=502, detail="Failed to fetch repository metadata")
+
+            try:
+                metadata = meta_resp.json()
+            except ValueError:
+                raise HTTPException(status_code=502, detail="Invalid response from GitHub API")
+
+            # Block private repos -- server GITHUB_TOKEN could access them,
+            # but we must not leak private repo structure to unauthenticated callers
+            if metadata.get("private", False):
+                raise HTTPException(
+                    status_code=403,
+                    detail="Private repositories are not supported. Use authenticated indexing instead.",
+                )
+
+            default_branch = metadata.get("default_branch", "main")
+
+            # 2. Fetch directory tree (reuse same client)
+            tree_data = await _fetch_directory_tree(owner, repo_name, default_branch, client=client)
+    except HTTPException:
+        raise
+    except httpx.TimeoutException:
+        raise HTTPException(status_code=504, detail="GitHub API request timed out")
+    except httpx.RequestError as e:
+        logger.error("GitHub API network error", error=str(e))
+        raise HTTPException(status_code=502, detail="Failed to connect to GitHub API")
+
+    logger.info(
+        "Analyzed repo structure",
+        owner=owner, repo=repo_name,
+        total_files=tree_data["total_files"],
+        dirs=tree_data["total_directories"],
+        suggestion=tree_data.get("suggestion"),
+    )
+
+    result = {
+        "owner": owner,
+        "repo": repo_name,
+        "default_branch": default_branch,
+        "size_kb": metadata.get("size", 0),
+        "stars": metadata.get("stargazers_count", 0),
+        "language": metadata.get("language"),
+        **tree_data,
+    }
+
+    # Cache result
+    if cache:
+        cache.set(cache_key, result, ttl=_ANALYZE_CACHE_TTL)
+
+    return result
+
+
 @router.delete("/{repo_id}")
 async def delete_repository(
     repo_id: str,
diff --git a/backend/tests/test_analyze_repo.py b/backend/tests/test_analyze_repo.py
new file mode 100644
index 0000000..0f21003
--- /dev/null
+++ b/backend/tests/test_analyze_repo.py
@@ -0,0 +1,237 @@
+"""Tests for POST /repos/analyze -- pre-clone directory analysis (OPE-109)."""
+import pytest
+from unittest.mock import AsyncMock, patch, MagicMock
+from pydantic import ValidationError
+
+# Import after conftest patches external services
+from routes.repos import (
+    _fetch_directory_tree,
+    _GITHUB_URL_RE,
+    AnalyzeRepoRequest,
+    IndexConfig,
+)
+
+
+# -- URL regex tests ----------------------------------------------------------
+
+class TestGitHubUrlRegex:
+    def test_standard_url(self):
+        match = _GITHUB_URL_RE.match("https://github.com/Effect-TS/effect")
+        assert match is not None
+        assert match.group("owner") == "Effect-TS"
+        assert match.group("repo") == "effect"
+
+    def test_url_with_trailing_slash(self):
+        match = _GITHUB_URL_RE.match("https://github.com/owner/repo/")
+        assert match is not None
+
+    def test_http_url(self):
+        match = _GITHUB_URL_RE.match("http://github.com/owner/repo")
+        assert match is not None
+
+    def test_rejects_non_github(self):
+        assert _GITHUB_URL_RE.match("https://gitlab.com/owner/repo") is None
+
+    def test_rejects_subpath(self):
+        assert _GITHUB_URL_RE.match("https://github.com/owner/repo/tree/main") is None
+
+    def test_rejects_no_repo(self):
+        assert _GITHUB_URL_RE.match("https://github.com/justowner") is None
+
+
+# -- AnalyzeRepoRequest validation --------------------------------------------
+
+class TestAnalyzeRepoRequest:
+    def test_valid_url(self):
+        req = AnalyzeRepoRequest(github_url="https://github.com/owner/repo")
+        assert req.github_url == "https://github.com/owner/repo"
+
+    def test_strips_whitespace_and_slash(self):
+        req = AnalyzeRepoRequest(github_url=" https://github.com/owner/repo/ ")
+        assert req.github_url == "https://github.com/owner/repo"
+
+    def test_rejects_empty(self):
+        with pytest.raises(ValidationError):
+            AnalyzeRepoRequest(github_url="")
+
+    def test_rejects_non_github(self):
+        with pytest.raises(ValidationError):
+            AnalyzeRepoRequest(github_url="https://gitlab.com/owner/repo")
+
+    def test_rejects_malformed_github_domain(self):
+        with pytest.raises(ValidationError):
+            AnalyzeRepoRequest(github_url="https://fakegithub.com/owner/repo")
+
+
+# -- IndexConfig validation (from PR #266) ------------------------------------
+
+class TestIndexConfig:
+    def test_valid_paths(self):
+        cfg = IndexConfig(include_paths=["packages/effect", "packages/schema"])
+        assert cfg.include_paths == ["packages/effect", "packages/schema"]
+
+    def test_normalizes_slashes(self):
+        cfg = IndexConfig(include_paths=["/packages/effect/", " src "])
+        assert cfg.include_paths == ["packages/effect", "src"]
+
+    def test_normalizes_backslashes(self):
+        cfg = IndexConfig(include_paths=["packages\\effect"])
+        assert cfg.include_paths == ["packages/effect"]
+
+    def test_rejects_empty_string(self):
+        with pytest.raises(ValidationError):
+            IndexConfig(include_paths=["packages/effect", " "])
+
+    def test_rejects_path_traversal(self):
+        with pytest.raises(ValidationError):
+            IndexConfig(include_paths=["../etc/passwd"])
+
+    def test_rejects_nested_traversal(self):
+        with pytest.raises(ValidationError):
+            IndexConfig(include_paths=["packages/../../etc"])
+
+    def test_rejects_non_string(self):
+        with pytest.raises(ValidationError):
+            IndexConfig(include_paths=[123])
+
+    def test_none_is_valid(self):
+        cfg = IndexConfig()
+        assert cfg.include_paths is None
+
+
+# -- _fetch_directory_tree grouping logic -------------------------------------
+
+# Build a fake GitHub tree response for testing grouping
+def _make_tree(paths: list[str]) -> dict:
+    """Build a mock GitHub Tree API response from file paths."""
+    return {
+        "truncated": False,
+        "tree": [{"path": p, "type": "blob"} for p in paths],
+    }
+
+
+class TestFetchDirectoryTree:
+    """Test directory grouping logic with mocked GitHub API."""
+
+    @pytest.mark.asyncio
+    async def test_flat_repo_groups_by_top_dir(self):
+        tree = _make_tree([
+            "src/main.py",
+            "src/utils.py",
+            "tests/test_main.py",
+            "README.md",
+        ])
+        with patch("routes.repos.httpx.AsyncClient") as mock_client:
+            mock_resp = MagicMock()
+            mock_resp.status_code = 200
+            mock_resp.json.return_value = tree
+            mock_client.return_value.__aenter__ = AsyncMock(return_value=MagicMock(
+                get=AsyncMock(return_value=mock_resp)
+            ))
+
+            result = await _fetch_directory_tree("owner", "repo", "main")
+
+        dir_names = [d["name"] for d in result["directories"]]
+        assert "src" in dir_names
+        assert result["total_files"] == 3  # README.md has no code ext
+
+    @pytest.mark.asyncio
+    async def test_includes_estimated_functions(self):
+        tree = _make_tree([
+            "src/main.py",
+            "src/utils.py",
+            "lib/helpers.ts",
+        ])
+        with patch("routes.repos.httpx.AsyncClient") as mock_client:
+            mock_resp = MagicMock()
+            mock_resp.status_code = 200
+            mock_resp.json.return_value = tree
+            mock_client.return_value.__aenter__ = AsyncMock(return_value=MagicMock(
+                get=AsyncMock(return_value=mock_resp)
+            ))
+
+            result = await _fetch_directory_tree("owner", "repo", "main")
+
+        # 3 files * 25 avg functions per file = 75
+        assert result["total_estimated_functions"] == 75
+        src_dir = next(d for d in result["directories"] if d["name"] == "src")
+        assert src_dir["estimated_functions"] == 50  # 2 files * 25
+        lib_dir = next(d for d in result["directories"] if d["name"] == "lib")
+        assert lib_dir["estimated_functions"] == 25  # 1 file * 25
+
+    @pytest.mark.asyncio
+    async def test_monorepo_groups_at_package_level(self):
+        tree = _make_tree([
+            "packages/core/src/index.ts",
+            "packages/core/src/utils.ts",
+            "packages/cli/src/main.ts",
+            "scripts/build.ts",
+        ])
+        with patch("routes.repos.httpx.AsyncClient") as mock_client:
+            mock_resp = MagicMock()
+            mock_resp.status_code = 200
+            mock_resp.json.return_value = tree
+            mock_client.return_value.__aenter__ = AsyncMock(return_value=MagicMock(
+                get=AsyncMock(return_value=mock_resp)
+            ))
+
+            result = await _fetch_directory_tree("owner", "repo", "main")
+
+        dir_names = [d["name"] for d in result["directories"]]
+        # Should group at packages/core level, not just packages
+        assert "packages/core" in dir_names
+        assert "packages/cli" in dir_names
+        assert "scripts" in dir_names
+
+    @pytest.mark.asyncio
+    async def test_skips_node_modules(self):
+        tree = _make_tree([
+            "src/index.ts",
+            "node_modules/lodash/index.js",
+        ])
+        with patch("routes.repos.httpx.AsyncClient") as mock_client:
+            mock_resp = MagicMock()
+            mock_resp.status_code = 200
+            mock_resp.json.return_value = tree
+            mock_client.return_value.__aenter__ = AsyncMock(return_value=MagicMock(
+                get=AsyncMock(return_value=mock_resp)
+            ))
+
+            result = await _fetch_directory_tree("owner", "repo", "main")
+
+        assert result["total_files"] == 1
+
+    @pytest.mark.asyncio
+    async def test_large_repo_suggestion(self):
+        # 600 TypeScript files across 15 packages
+        paths = [f"packages/pkg{i}/src/file{j}.ts" for i in range(15) for j in range(40)]
+        tree = _make_tree(paths)
+
+        with patch("routes.repos.httpx.AsyncClient") as mock_client:
+            mock_resp = MagicMock()
+            mock_resp.status_code = 200
+            mock_resp.json.return_value = tree
+            mock_client.return_value.__aenter__ = AsyncMock(return_value=MagicMock(
+                get=AsyncMock(return_value=mock_resp)
+            ))
+
+            result = await _fetch_directory_tree("owner", "repo", "main")
+
+        assert result["suggestion"] == "large_repo"
+        assert result["total_files"] == 600
+
+    @pytest.mark.asyncio
+    async def test_small_repo_no_suggestion(self):
+        tree = _make_tree(["src/main.py", "src/utils.py"])
+
+        with patch("routes.repos.httpx.AsyncClient") as mock_client:
+            mock_resp = MagicMock()
+            mock_resp.status_code = 200
+            mock_resp.json.return_value = tree
+            mock_client.return_value.__aenter__ = AsyncMock(return_value=MagicMock(
+                get=AsyncMock(return_value=mock_resp)
+            ))
+
+            result = await _fetch_directory_tree("owner", "repo", "main")
+
+        assert result["suggestion"] is None