Skip to content

Commit 6379d8f

Browse files
authored
Merge pull request #267 from DevanshuNEU/feat/pre-clone-analysis
feat: POST /repos/analyze -- pre-clone directory analysis via GitHub API (OPE-109)
2 parents d3ffafa + 1bd52b9 commit 6379d8f

2 files changed

Lines changed: 494 additions & 0 deletions

File tree

backend/routes/repos.py

Lines changed: 257 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,14 @@
33
from pydantic import BaseModel, field_validator
44
from typing import List, Optional
55
from pathlib import Path
6+
from urllib.parse import quote
67
import hashlib
8+
import os
9+
import re
710
import time
811
import asyncio
912
import git
13+
import httpx
1014

1115
from dependencies import (
1216
indexer, repo_manager, metrics, redis_client,
@@ -149,6 +153,259 @@ async def add_repository(
149153
raise HTTPException(status_code=500, detail="Failed to add repository")
150154

151155

156+
# -- GitHub API helpers for pre-clone analysis --------------------------------
157+
158+
# Root of the GitHub REST API; endpoint paths are appended to this.
_GITHUB_API_BASE = "https://api.github.com"

# Matches a repository home URL (http or https, optional trailing slash) and
# captures its two path segments as the named groups "owner" and "repo".
# The adjacent raw literals are concatenated into a single pattern.
_GITHUB_URL_RE = re.compile(
    r"^https?://github\.com/"
    r"(?P<owner>[a-zA-Z0-9_.\-]+)/"
    r"(?P<repo>[a-zA-Z0-9_.\-]+)/?$"
)
162+
163+
164+
def _github_headers() -> dict:
    """Build GitHub API request headers with an optional auth token.

    The GITHUB_TOKEN environment variable is read on every call. Surrounding
    whitespace is stripped, and a blank value is treated the same as an
    unset one.

    Returns:
        A new dict containing the Accept and User-Agent headers, plus an
        Authorization header when a non-blank token is configured.
    """
    headers = {
        "Accept": "application/vnd.github.v3+json",
        "User-Agent": "OpenCodeIntel/1.0",
    }
    # Tokens loaded from files or secret stores often carry a trailing
    # newline; sent verbatim it would produce a malformed header value.
    token = (os.getenv("GITHUB_TOKEN") or "").strip()
    if token:
        headers["Authorization"] = f"token {token}"
    return headers
171+
172+
173+
def _summarize_tree(
    entries: list,
    code_extensions,
    skip_dirs,
    avg_fn: int,
    truncated: bool,
) -> dict:
    """Count code files per directory group and estimate function totals.

    Args:
        entries: Items of the "tree" array from the GitHub tree response.
        code_extensions: Collection of lower-case extensions (with leading
            dot) that count as code files.
        skip_dirs: Collection of path segment names whose files are ignored.
        avg_fn: Multiplier used to turn a file count into a function estimate.
        truncated: Value of the response's "truncated" flag, passed through.

    Returns:
        The analysis dict returned by _fetch_directory_tree.
    """
    monorepo_roots = ("packages", "libs", "apps", "modules", "crates")
    dir_counts: dict[str, int] = {}
    # Root-level files are tracked separately so they can never collide with
    # a real directory name.
    root_files = 0

    for entry in entries:
        if not isinstance(entry, dict) or entry.get("type") != "blob":
            continue
        path = entry.get("path") or ""
        parts = path.split("/")
        if any(part in skip_dirs for part in parts):
            continue
        filename = parts[-1]
        ext = "." + filename.rsplit(".", 1)[-1] if "." in filename else ""
        if ext.lower() not in code_extensions:
            continue

        if len(parts) == 1:
            root_files += 1
            continue

        # For monorepos: if top is packages/libs/apps, group one level deeper
        if parts[0] in monorepo_roots and len(parts) >= 3:
            key = f"{parts[0]}/{parts[1]}"
        else:
            key = parts[0]
        dir_counts[key] = dir_counts.get(key, 0) + 1

    # Largest groups first; sorted() is stable, so ties keep discovery order.
    directories = sorted(
        (
            {
                "name": name, "path": name,
                "file_count": count,
                "estimated_functions": count * avg_fn,
            }
            for name, count in dir_counts.items()
        ),
        key=lambda d: -d["file_count"],
    )

    # Root-level files always go last, regardless of their count.
    if root_files > 0:
        directories.append({
            "name": "(root files)", "path": ".",
            "file_count": root_files,
            "estimated_functions": root_files * avg_fn,
        })

    total_files = root_files + sum(dir_counts.values())

    # Suggest directory picker for large repos
    suggestion = None
    if total_files > 500 or len(directories) > 10:
        suggestion = "large_repo"

    return {
        "directories": directories,
        "total_files": total_files,
        "total_estimated_functions": total_files * avg_fn,
        "total_directories": len(directories),
        "truncated": truncated,
        "suggestion": suggestion,
    }


async def _fetch_directory_tree(
    owner: str, repo: str, branch: str,
    client: Optional[httpx.AsyncClient] = None,
) -> dict:
    """Fetch directory structure from GitHub Tree API.

    Returns a dict with directories (name, path, file_count,
    estimated_functions) grouped at the most useful level -- top-level for
    flat repos, one level deeper under packages/, libs/, apps/, modules/
    or crates/ for monorepos -- together with total_files,
    total_estimated_functions, total_directories, truncated and suggestion.

    Args:
        owner: Repository owner as it appears in the GitHub URL.
        repo: Repository name.
        branch: Branch (or other tree reference) to list; URL-encoded here.
        client: Reuse an existing httpx client to avoid opening a second
            connection. If None, creates and closes its own.

    Raises:
        HTTPException: 404 when the repository or branch is not found,
            429 when GitHub answers 403 or 429, 504 on timeout, and 502 on
            network errors, other non-200 statuses or an unusable body.
    """
    # Imported at call time rather than at module import.
    from services.repo_validator import RepoValidator

    # Encode branch for URL safety -- "feature/foo" -> "feature%2Ffoo"
    encoded_branch = quote(branch, safe="")
    url = f"{_GITHUB_API_BASE}/repos/{owner}/{repo}/git/trees/{encoded_branch}?recursive=1"

    try:
        # Compare against None explicitly instead of relying on the
        # truthiness of a client object.
        if client is not None:
            response = await client.get(url, headers=_github_headers())
        else:
            async with httpx.AsyncClient(timeout=15.0) as own_client:
                response = await own_client.get(url, headers=_github_headers())
    except httpx.TimeoutException as e:
        raise HTTPException(status_code=504, detail="GitHub API request timed out") from e
    except httpx.RequestError as e:
        logger.error("GitHub tree API network error", error=str(e))
        raise HTTPException(status_code=502, detail="Failed to connect to GitHub API") from e

    status = response.status_code
    if status == 404:
        raise HTTPException(status_code=404, detail="Repository or branch not found")
    # GitHub signals rate limiting with either 403 or 429.
    if status in (403, 429):
        raise HTTPException(status_code=429, detail="GitHub API rate limit exceeded")
    if status != 200:
        raise HTTPException(status_code=502, detail=f"GitHub API error: {status}")

    try:
        data = response.json()
    except ValueError as e:
        raise HTTPException(status_code=502, detail="Invalid response from GitHub API") from e
    # Valid JSON that is not an object cannot be a tree response.
    if not isinstance(data, dict):
        raise HTTPException(status_code=502, detail="Invalid response from GitHub API")

    # Indexing is function-level, not file-level. Estimate function counts
    # using the same multiplier the tier system uses for limit checks.
    return _summarize_tree(
        data.get("tree") or [],
        RepoValidator.CODE_EXTENSIONS,
        RepoValidator.SKIP_DIRS,
        RepoValidator.AVG_FUNCTIONS_PER_FILE,
        data.get("truncated", False),
    )
293+
294+
295+
class AnalyzeRepoRequest(BaseModel):
    """Request body for pre-clone repo analysis."""

    # Repository home URL in the form https://github.com/owner/repo
    github_url: str

    @field_validator("github_url")
    @classmethod
    def validate_url(cls, v: str) -> str:
        """Normalize and validate the submitted GitHub URL.

        Surrounding whitespace and trailing slashes are removed before the
        URL is checked against the expected github.com/owner/repo shape.

        Returns:
            The normalized URL.

        Raises:
            ValueError: If the URL is empty, does not match the expected
                shape, or names an owner/repo that is a dot segment or
                becomes empty once a ".git" suffix is removed.
        """
        v = v.strip().rstrip("/")
        if not v:
            raise ValueError("GitHub URL is required")

        match = _GITHUB_URL_RE.match(v)
        if not match:
            raise ValueError(
                "Invalid GitHub URL. Expected: https://github.com/owner/repo"
            )

        # The character class also admits "." and ".."; those would act as
        # path segments when interpolated into an API URL, and a bare ".git"
        # leaves no repository name at all.
        owner = match.group("owner")
        repo = match.group("repo").removesuffix(".git")
        if owner in (".", "..") or repo in ("", ".", ".."):
            raise ValueError(
                "Invalid GitHub URL. Expected: https://github.com/owner/repo"
            )
        return v
310+
311+
312+
# Analysis results stay cached for 24 hours -- directory structure rarely changes.
_ANALYZE_CACHE_TTL = 24 * 60 * 60  # seconds
313+
314+
315+
@router.post("/analyze")
async def analyze_repository(request: AnalyzeRepoRequest) -> dict:
    """Analyze a GitHub repo's directory structure WITHOUT cloning.

    Returns directory tree with file counts so the user can select
    which directories to index (monorepo subset selection).

    Results are cached for 24 hours (see _ANALYZE_CACHE_TTL) since
    directory structure rarely changes.

    Returns:
        A dict with owner, repo, default_branch, size_kb, stars and
        language, merged with the keys produced by _fetch_directory_tree.

    Raises:
        HTTPException: 400 for an unusable URL, 403 for private
            repositories, 404 when the repository is not found, 429 when
            GitHub answers 403 or 429, 504 on timeout, and 502 on network
            errors or unexpected GitHub responses.
    """
    match = _GITHUB_URL_RE.match(request.github_url)
    owner = match.group("owner") if match else ""
    repo_name = match.group("repo").removesuffix(".git") if match else ""

    # Besides a non-matching URL, reject names that would change the shape of
    # the API path built below: dot segments, or a bare ".git" that leaves
    # the repository name empty.
    if not match or owner in (".", "..") or repo_name in ("", ".", ".."):
        raise HTTPException(
            status_code=400,
            detail="Invalid GitHub URL. Expected: https://github.com/owner/repo",
        )

    # Check cache first (same pattern as validate-repo)
    from dependencies import cache
    cache_key = f"analyze:{owner}/{repo_name}"
    cached = cache.get(cache_key) if cache else None
    if cached:
        logger.info("Returning cached analysis", owner=owner, repo=repo_name)
        return cached

    # Single httpx client for both GitHub API calls
    try:
        async with httpx.AsyncClient(timeout=15.0) as client:
            # 1. Fetch repo metadata for default branch and size
            meta_resp = await client.get(
                f"{_GITHUB_API_BASE}/repos/{owner}/{repo_name}",
                headers=_github_headers(),
            )

            if meta_resp.status_code == 404:
                raise HTTPException(status_code=404, detail="Repository not found")
            # GitHub signals rate limiting with either 403 or 429.
            if meta_resp.status_code in (403, 429):
                raise HTTPException(status_code=429, detail="GitHub API rate limit exceeded")
            if meta_resp.status_code != 200:
                raise HTTPException(status_code=502, detail="Failed to fetch repository metadata")

            try:
                metadata = meta_resp.json()
            except ValueError as e:
                raise HTTPException(
                    status_code=502, detail="Invalid response from GitHub API"
                ) from e
            # Valid JSON that is not an object cannot be repository metadata.
            if not isinstance(metadata, dict):
                raise HTTPException(status_code=502, detail="Invalid response from GitHub API")

            # Block private repos -- server GITHUB_TOKEN could access them,
            # but we must not leak private repo structure to unauthenticated callers
            if metadata.get("private", False):
                raise HTTPException(
                    status_code=403,
                    detail="Private repositories are not supported. Use authenticated indexing instead.",
                )

            # "or" also covers an explicit null, which would otherwise reach
            # URL quoting as None.
            default_branch = metadata.get("default_branch") or "main"

            # 2. Fetch directory tree (reuse same client)
            tree_data = await _fetch_directory_tree(
                owner, repo_name, default_branch, client=client
            )
    except HTTPException:
        # Already a well-formed API error; pass it through untouched.
        raise
    except httpx.TimeoutException as e:
        raise HTTPException(status_code=504, detail="GitHub API request timed out") from e
    except httpx.RequestError as e:
        logger.error("GitHub API network error", error=str(e))
        raise HTTPException(status_code=502, detail="Failed to connect to GitHub API") from e

    logger.info(
        "Analyzed repo structure",
        owner=owner, repo=repo_name,
        total_files=tree_data["total_files"],
        dirs=tree_data["total_directories"],
        suggestion=tree_data.get("suggestion"),
    )

    result = {
        "owner": owner,
        "repo": repo_name,
        "default_branch": default_branch,
        "size_kb": metadata.get("size", 0),
        "stars": metadata.get("stargazers_count", 0),
        "language": metadata.get("language"),
        **tree_data,
    }

    # Cache result
    if cache:
        cache.set(cache_key, result, ttl=_ANALYZE_CACHE_TTL)

    return result
407+
408+
152409
@router.delete("/{repo_id}")
153410
async def delete_repository(
154411
repo_id: str,

0 commit comments

Comments
 (0)