Skip to content

Commit 7a7bdd5

Browse files
committed
feat: POST /repos/analyze -- pre-clone directory analysis via GitHub API (OPE-109)
New endpoint that returns a repo's directory structure WITHOUT cloning. Uses the GitHub Tree API to fetch the full file tree instantly (~200ms), then groups by directory with code file counts.

Implementation:
- _github_headers(): builds auth headers with optional GITHUB_TOKEN
- _fetch_directory_tree(): calls the GitHub Tree API, groups files by directory, smart monorepo detection (packages/, libs/, apps/ grouped one level deeper)
- AnalyzeRepoRequest: Pydantic model with URL validation
- POST /repos/analyze: orchestrates metadata + tree fetch, returns directory list with suggestion flag for large repos (>500 files or >10 dirs)

Key design decisions:
- Route placed before /{repo_id} to avoid path conflict
- Monorepo-aware grouping: 'packages/effect', not just 'packages'
- No auth required (same as viewing a public GitHub repo)
- Reuses RepoValidator.CODE_EXTENSIONS and SKIP_DIRS for consistency
- Returns 'suggestion: large_repo' to trigger the directory picker in the frontend
1 parent d3ffafa commit 7a7bdd5

1 file changed

Lines changed: 173 additions & 0 deletions

File tree

backend/routes/repos.py

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,12 @@
44
from typing import List, Optional
55
from pathlib import Path
66
import hashlib
7+
import os
8+
import re
79
import time
810
import asyncio
911
import git
12+
import httpx
1013

1114
from dependencies import (
1215
indexer, repo_manager, metrics, redis_client,
@@ -149,6 +152,176 @@ async def add_repository(
149152
raise HTTPException(status_code=500, detail="Failed to add repository")
150153

151154

155+
# -- GitHub API helpers for pre-clone analysis --------------------------------
156+
157+
_GITHUB_API_BASE = "https://api.github.com"
158+
_GITHUB_URL_RE = re.compile(
159+
r"^https?://github\.com/(?P<owner>[a-zA-Z0-9_.\-]+)/(?P<repo>[a-zA-Z0-9_.\-]+)/?$"
160+
)
161+
162+
163+
def _github_headers() -> dict:
164+
"""Build GitHub API request headers with optional auth token."""
165+
headers = {"Accept": "application/vnd.github.v3+json", "User-Agent": "OpenCodeIntel/1.0"}
166+
token = os.getenv("GITHUB_TOKEN")
167+
if token:
168+
headers["Authorization"] = f"token {token}"
169+
return headers
170+
171+
172+
# Top-level directories whose children are treated as separate packages.
_MONOREPO_ROOTS = frozenset({"packages", "libs", "apps", "modules", "crates"})


def _count_code_files_by_directory(tree, code_extensions, skip_dirs) -> tuple:
    """Group the code files of a GitHub tree listing by directory.

    Args:
        tree: Iterable of tree entries (dicts with "type" and "path" keys) as
            found under the "tree" key of the Tree API response.
        code_extensions: Container of lower-case extensions (with leading
            dot) that count as code; only membership tests are performed.
        skip_dirs: Container of path segment names; any file whose path
            contains one of them is ignored.

    Returns:
        ``(dir_counts, root_files, total_files)`` where ``dir_counts`` maps a
        directory key to its code file count, ``root_files`` is the number of
        code files directly in the repo root, and ``total_files`` is the sum
        of everything counted.
    """
    dir_counts: dict[str, int] = {}
    root_files = 0
    total_files = 0

    for item in tree:
        if item.get("type") != "blob":
            continue
        parts = item.get("path", "").split("/")
        if any(part in skip_dirs for part in parts):
            continue

        # Derive the extension from the file name only, so a dot in a parent
        # directory name is never mistaken for an extension separator.
        filename = parts[-1]
        ext = "." + filename.rsplit(".", 1)[-1] if "." in filename else ""
        if ext.lower() not in code_extensions:
            continue

        total_files += 1

        # Root-level files are tallied separately rather than under a
        # sentinel key, so a real directory can never collide with them.
        if len(parts) == 1:
            root_files += 1
            continue

        # For monorepos: group one level deeper ("packages/effect"), but only
        # for files that actually live inside a sub-package directory.
        if parts[0] in _MONOREPO_ROOTS and len(parts) >= 3:
            key = f"{parts[0]}/{parts[1]}"
        else:
            key = parts[0]
        dir_counts[key] = dir_counts.get(key, 0) + 1

    return dir_counts, root_files, total_files


async def _fetch_directory_tree(
    owner: str, repo: str, branch: str,
) -> dict:
    """Fetch directory structure from GitHub Tree API.

    Requests the recursive tree for ``branch`` and counts code files per
    directory -- top-level for flat repos, package-level for directories
    under a monorepo root such as ``packages/``.

    Args:
        owner: Repository owner (user or organisation) login.
        repo: Repository name.
        branch: Branch (or other tree-ish) to list.

    Returns:
        A dict with keys ``directories`` (list of ``{"name", "path",
        "file_count"}`` sorted by descending file count, with a trailing
        ``"(root files)"`` entry when the root holds code files),
        ``total_files``, ``total_directories`` (length of ``directories``,
        including the root entry), ``truncated`` (flag passed through from
        the GitHub response) and ``suggestion`` (``"large_repo"`` or ``None``).

    Raises:
        HTTPException: 404 when the repo/branch is unknown, 429 when GitHub
            reports rate limiting, 403 for other forbidden responses, 504 on
            timeout, and 502 for transport failures or any other unexpected
            GitHub response.
    """
    from urllib.parse import quote

    from services.repo_validator import RepoValidator

    # Branch names may contain characters such as "#" or "?" that would
    # otherwise be parsed as URL syntax; "/" stays literal for nested refs.
    ref = quote(branch, safe="/")
    url = f"{_GITHUB_API_BASE}/repos/{owner}/{repo}/git/trees/{ref}?recursive=1"

    try:
        async with httpx.AsyncClient(timeout=15.0) as client:
            response = await client.get(url, headers=_github_headers())
    except httpx.TimeoutException as exc:
        raise HTTPException(status_code=504, detail="GitHub API request timed out") from exc
    except httpx.RequestError as exc:
        raise HTTPException(status_code=502, detail="Failed to reach GitHub API") from exc

    status = response.status_code
    if status == 404:
        raise HTTPException(status_code=404, detail="Repository or branch not found")
    # GitHub signals rate limiting with 429, or with 403 accompanied by an
    # exhausted quota header or a retry-after header. A bare 403 is a real
    # permission problem and must not be reported as a rate limit.
    rate_limited = status == 429 or (
        status == 403
        and (
            response.headers.get("x-ratelimit-remaining") == "0"
            or "retry-after" in response.headers
        )
    )
    if rate_limited:
        raise HTTPException(status_code=429, detail="GitHub API rate limit exceeded")
    if status == 403:
        raise HTTPException(status_code=403, detail="Access to repository is forbidden")
    if status != 200:
        raise HTTPException(status_code=502, detail=f"GitHub API error: {status}")

    try:
        data = response.json()
    except ValueError as exc:
        raise HTTPException(status_code=502, detail="GitHub API returned an invalid response") from exc

    truncated = data.get("truncated", False)

    dir_counts, root_files, total_files = _count_code_files_by_directory(
        data.get("tree", []),
        RepoValidator.CODE_EXTENSIONS,
        RepoValidator.SKIP_DIRS,
    )

    # Build sorted directory list (largest first; ties keep tree order)
    directories = sorted(
        [{"name": d, "path": d, "file_count": c} for d, c in dir_counts.items()],
        key=lambda entry: -entry["file_count"],
    )
    if root_files > 0:
        directories.append({"name": "(root files)", "path": ".", "file_count": root_files})

    # Suggest directory picker for large repos
    suggestion = "large_repo" if total_files > 500 or len(directories) > 10 else None

    return {
        "directories": directories,
        "total_files": total_files,
        "total_directories": len(directories),
        "truncated": truncated,
        "suggestion": suggestion,
    }
252+
253+
254+
class AnalyzeRepoRequest(BaseModel):
    """Payload accepted by the pre-clone analysis endpoint."""

    # URL of the repository to inspect; normalised by validate_url below.
    github_url: str

    @field_validator("github_url")
    @classmethod
    def validate_url(cls, v: str) -> str:
        """Trim the URL and reject blank or non-GitHub values.

        Surrounding whitespace is removed first, then any trailing slashes.
        The cleaned value is what ends up stored on the model.
        """
        cleaned = v.strip().rstrip("/")
        if cleaned == "":
            raise ValueError("GitHub URL is required")
        mentions_github = "github.com" in cleaned.lower()
        if not mentions_github:
            raise ValueError("Only GitHub URLs are supported")
        return cleaned
267+
268+
269+
@router.post("/analyze")
async def analyze_repository(request: AnalyzeRepoRequest) -> dict:
    """Analyze a GitHub repo's directory structure WITHOUT cloning.

    Fetches the repository metadata to learn the default branch, then lists
    that branch's tree and returns per-directory code file counts so the user
    can select which directories to index (monorepo subset selection).

    Args:
        request: Body carrying the ``github_url`` of the repository.

    Returns:
        A dict with ``owner``, ``repo``, ``default_branch``, ``size_kb``,
        ``stars`` and ``language`` taken from the repository metadata, merged
        with every key returned by ``_fetch_directory_tree``.

    Raises:
        HTTPException: 400 for a URL that is not ``github.com/owner/repo``,
            404 when the repository is unknown, 429 when GitHub reports rate
            limiting, 403 for other forbidden responses, 504 on timeout, and
            502 for transport failures or unexpected GitHub responses.
    """
    match = _GITHUB_URL_RE.match(request.github_url)
    owner = match.group("owner") if match else ""
    # A repo segment of exactly ".git" becomes empty once the suffix is
    # stripped; treat that the same as a URL that did not match at all.
    repo_name = match.group("repo").removesuffix(".git") if match else ""
    if not owner or not repo_name:
        raise HTTPException(
            status_code=400,
            detail="Invalid GitHub URL. Expected: https://github.com/owner/repo",
        )

    # Fetch repo metadata for default branch and size
    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            meta_resp = await client.get(
                f"{_GITHUB_API_BASE}/repos/{owner}/{repo_name}",
                headers=_github_headers(),
            )
    except httpx.TimeoutException as exc:
        raise HTTPException(status_code=504, detail="GitHub API request timed out") from exc
    except httpx.RequestError as exc:
        raise HTTPException(status_code=502, detail="Failed to reach GitHub API") from exc

    status = meta_resp.status_code
    if status == 404:
        raise HTTPException(status_code=404, detail="Repository not found")
    # GitHub signals rate limiting with 429, or with 403 plus an exhausted
    # quota header or a retry-after header; any other 403 is a genuine
    # permission problem and is reported as such.
    rate_limited = status == 429 or (
        status == 403
        and (
            meta_resp.headers.get("x-ratelimit-remaining") == "0"
            or "retry-after" in meta_resp.headers
        )
    )
    if rate_limited:
        raise HTTPException(status_code=429, detail="GitHub API rate limit exceeded")
    if status == 403:
        raise HTTPException(status_code=403, detail="Access to repository is forbidden")
    if status != 200:
        raise HTTPException(status_code=502, detail="Failed to fetch repository metadata")

    try:
        metadata = meta_resp.json()
    except ValueError as exc:
        raise HTTPException(status_code=502, detail="Failed to fetch repository metadata") from exc

    # Fall back to "main" when the field is absent or null.
    default_branch = metadata.get("default_branch") or "main"

    # Fetch directory tree
    tree_data = await _fetch_directory_tree(owner, repo_name, default_branch)

    logger.info(
        "Analyzed repo structure",
        owner=owner, repo=repo_name,
        total_files=tree_data["total_files"],
        dirs=tree_data["total_directories"],
        suggestion=tree_data.get("suggestion"),
    )

    return {
        "owner": owner,
        "repo": repo_name,
        "default_branch": default_branch,
        "size_kb": metadata.get("size", 0),
        "stars": metadata.get("stargazers_count", 0),
        "language": metadata.get("language"),
        **tree_data,
    }
323+
324+
152325
@router.delete("/{repo_id}")
153326
async def delete_repository(
154327
repo_id: str,

0 commit comments

Comments
 (0)