Skip to content

Commit 2535cd8

Browse files
committed
fix: address PR review -- error handling, URL encoding, precise validation
4 findings verified and fixed: 1. Wrap GitHub API calls in try/except for httpx.TimeoutException, httpx.RequestError, and ValueError (JSON parse). Returns proper 504/502 instead of unhandled crash. 2. URL-encode branch name with urllib.parse.quote(safe='') so branches like 'feature/foo' produce 'feature%2Ffoo' in the GitHub Tree API URL instead of breaking the path. 3. Tests now use pytest.raises(ValidationError) instead of broad Exception -- catches only Pydantic validation errors as intended. Added test_rejects_malformed_github_domain for 'fakegithub.com'. 4. AnalyzeRepoRequest validator uses _GITHUB_URL_RE regex instead of substring check -- rejects 'fakegithub.com' and 'notgithub.com.evil'. 25 tests pass.
1 parent 745c288 commit 2535cd8

2 files changed

Lines changed: 61 additions & 31 deletions

File tree

backend/routes/repos.py

Lines changed: 50 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from pydantic import BaseModel, field_validator
44
from typing import List, Optional
55
from pathlib import Path
6+
from urllib.parse import quote
67
import hashlib
78
import os
89
import re
@@ -185,16 +186,24 @@ async def _fetch_directory_tree(
185186
"""
186187
from services.repo_validator import RepoValidator
187188

188-
url = f"{_GITHUB_API_BASE}/repos/{owner}/{repo}/git/trees/{branch}?recursive=1"
189+
# Encode branch for URL safety -- "feature/foo" -> "feature%2Ffoo"
190+
encoded_branch = quote(branch, safe="")
191+
url = f"{_GITHUB_API_BASE}/repos/{owner}/{repo}/git/trees/{encoded_branch}?recursive=1"
189192

190193
async def _get(c: httpx.AsyncClient) -> httpx.Response:
191194
return await c.get(url, headers=_github_headers())
192195

193-
if client:
194-
response = await _get(client)
195-
else:
196-
async with httpx.AsyncClient(timeout=15.0) as c:
197-
response = await _get(c)
196+
try:
197+
if client:
198+
response = await _get(client)
199+
else:
200+
async with httpx.AsyncClient(timeout=15.0) as c:
201+
response = await _get(c)
202+
except httpx.TimeoutException:
203+
raise HTTPException(status_code=504, detail="GitHub API request timed out")
204+
except httpx.RequestError as e:
205+
logger.error("GitHub tree API network error", error=str(e))
206+
raise HTTPException(status_code=502, detail="Failed to connect to GitHub API")
198207

199208
if response.status_code == 404:
200209
raise HTTPException(status_code=404, detail="Repository or branch not found")
@@ -203,7 +212,10 @@ async def _get(c: httpx.AsyncClient) -> httpx.Response:
203212
if response.status_code != 200:
204213
raise HTTPException(status_code=502, detail=f"GitHub API error: {response.status_code}")
205214

206-
data = response.json()
215+
try:
216+
data = response.json()
217+
except ValueError:
218+
raise HTTPException(status_code=502, detail="Invalid response from GitHub API")
207219
truncated = data.get("truncated", False)
208220

209221
code_extensions = RepoValidator.CODE_EXTENSIONS
@@ -290,8 +302,10 @@ def validate_url(cls, v: str) -> str:
290302
v = v.strip().rstrip("/")
291303
if not v:
292304
raise ValueError("GitHub URL is required")
293-
if "github.com" not in v.lower():
294-
raise ValueError("Only GitHub URLs are supported")
305+
if not _GITHUB_URL_RE.match(v):
306+
raise ValueError(
307+
"Invalid GitHub URL. Expected: https://github.com/owner/repo"
308+
)
295309
return v
296310

297311

@@ -326,25 +340,36 @@ async def analyze_repository(request: AnalyzeRepoRequest) -> dict:
326340
return cached
327341

328342
# Single httpx client for both GitHub API calls
329-
async with httpx.AsyncClient(timeout=15.0) as client:
330-
# 1. Fetch repo metadata for default branch and size
331-
meta_resp = await client.get(
332-
f"{_GITHUB_API_BASE}/repos/{owner}/{repo_name}",
333-
headers=_github_headers(),
334-
)
343+
try:
344+
async with httpx.AsyncClient(timeout=15.0) as client:
345+
# 1. Fetch repo metadata for default branch and size
346+
meta_resp = await client.get(
347+
f"{_GITHUB_API_BASE}/repos/{owner}/{repo_name}",
348+
headers=_github_headers(),
349+
)
335350

336-
if meta_resp.status_code == 404:
337-
raise HTTPException(status_code=404, detail="Repository not found")
338-
if meta_resp.status_code == 403:
339-
raise HTTPException(status_code=429, detail="GitHub API rate limit exceeded")
340-
if meta_resp.status_code != 200:
341-
raise HTTPException(status_code=502, detail="Failed to fetch repository metadata")
351+
if meta_resp.status_code == 404:
352+
raise HTTPException(status_code=404, detail="Repository not found")
353+
if meta_resp.status_code == 403:
354+
raise HTTPException(status_code=429, detail="GitHub API rate limit exceeded")
355+
if meta_resp.status_code != 200:
356+
raise HTTPException(status_code=502, detail="Failed to fetch repository metadata")
342357

343-
metadata = meta_resp.json()
344-
default_branch = metadata.get("default_branch", "main")
358+
try:
359+
metadata = meta_resp.json()
360+
except ValueError:
361+
raise HTTPException(status_code=502, detail="Invalid response from GitHub API")
362+
default_branch = metadata.get("default_branch", "main")
345363

346-
# 2. Fetch directory tree (reuse same client)
347-
tree_data = await _fetch_directory_tree(owner, repo_name, default_branch, client=client)
364+
# 2. Fetch directory tree (reuse same client)
365+
tree_data = await _fetch_directory_tree(owner, repo_name, default_branch, client=client)
366+
except HTTPException:
367+
raise
368+
except httpx.TimeoutException:
369+
raise HTTPException(status_code=504, detail="GitHub API request timed out")
370+
except httpx.RequestError as e:
371+
logger.error("GitHub API network error", error=str(e))
372+
raise HTTPException(status_code=502, detail="Failed to connect to GitHub API")
348373

349374
logger.info(
350375
"Analyzed repo structure",

backend/tests/test_analyze_repo.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Tests for POST /repos/analyze -- pre-clone directory analysis (OPE-109)."""
22
import pytest
33
from unittest.mock import AsyncMock, patch, MagicMock
4+
from pydantic import ValidationError
45

56
# Import after conftest patches external services
67
from routes.repos import (
@@ -50,13 +51,17 @@ def test_strips_whitespace_and_slash(self):
5051
assert req.github_url == "https://github.com/owner/repo"
5152

5253
def test_rejects_empty(self):
53-
with pytest.raises(Exception):
54+
with pytest.raises(ValidationError):
5455
AnalyzeRepoRequest(github_url="")
5556

5657
def test_rejects_non_github(self):
57-
with pytest.raises(Exception):
58+
with pytest.raises(ValidationError):
5859
AnalyzeRepoRequest(github_url="https://gitlab.com/owner/repo")
5960

61+
def test_rejects_malformed_github_domain(self):
62+
with pytest.raises(ValidationError):
63+
AnalyzeRepoRequest(github_url="https://fakegithub.com/owner/repo")
64+
6065

6166
# -- IndexConfig validation (from PR #266) ------------------------------------
6267

@@ -74,19 +79,19 @@ def test_normalizes_backslashes(self):
7479
assert cfg.include_paths == ["packages/effect"]
7580

7681
def test_rejects_empty_string(self):
77-
with pytest.raises(Exception):
82+
with pytest.raises(ValidationError):
7883
IndexConfig(include_paths=["packages/effect", " "])
7984

8085
def test_rejects_path_traversal(self):
81-
with pytest.raises(Exception):
86+
with pytest.raises(ValidationError):
8287
IndexConfig(include_paths=["../etc/passwd"])
8388

8489
def test_rejects_nested_traversal(self):
85-
with pytest.raises(Exception):
90+
with pytest.raises(ValidationError):
8691
IndexConfig(include_paths=["packages/../../etc"])
8792

8893
def test_rejects_non_string(self):
89-
with pytest.raises(Exception):
94+
with pytest.raises(ValidationError):
9095
IndexConfig(include_paths=[123])
9196

9297
def test_none_is_valid(self):

0 commit comments

Comments
 (0)