Skip to content

Commit 1ae5fae

Browse files
committed
feat(dna): add robustness improvements
- Add file content cache to avoid re-reading files
- Add MAX_FILE_SIZE (1MB) and MAX_FILES (5000) limits
- Add _safe_read_file() with encoding fallbacks (utf-8, latin-1, cp1252)
- Add binary file detection (null bytes check)
- Add symlink handling in _discover_files()
- Add path validation in extract_dna()
- Add performance stats logging (files_read, skipped, errors, duration)
- Add .venv and site-packages to SKIP_DIRS
1 parent 94eb7ee commit 1ae5fae

1 file changed

Lines changed: 84 additions & 7 deletions

File tree

backend/services/dna_extractor.py

Lines changed: 84 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -253,7 +253,9 @@ def to_markdown(self) -> str:
253253
class DNAExtractor:
254254
"""Extracts architectural DNA from a codebase"""
255255

256-
SKIP_DIRS = {'node_modules', '.git', '__pycache__', 'venv', 'env', 'dist', 'build', '.next', 'coverage'}
256+
SKIP_DIRS = {'node_modules', '.git', '__pycache__', 'venv', 'env', 'dist', 'build', '.next', 'coverage', '.venv', 'site-packages'}
257+
MAX_FILE_SIZE = 1024 * 1024 # 1MB
258+
MAX_FILES = 5000
257259

258260
def __init__(self):
259261
self.parsers = {
@@ -262,6 +264,8 @@ def __init__(self):
262264
'typescript': Parser(Language(tsjavascript.language())),
263265
}
264266
self._supabase = None
267+
self._file_cache: Dict[Path, str] = {}
268+
self._stats = {'files_read': 0, 'files_skipped': 0, 'read_errors': 0}
265269
logger.info("DNAExtractor initialized")
266270

267271
@property
@@ -270,6 +274,49 @@ def supabase(self):
270274
self._supabase = get_supabase_service()
271275
return self._supabase
272276

277+
def _reset_cache(self):
278+
"""Clear file cache between extractions"""
279+
self._file_cache.clear()
280+
self._stats = {'files_read': 0, 'files_skipped': 0, 'read_errors': 0}
281+
282+
def _safe_read_file(self, file_path: Path) -> Optional[str]:
283+
"""Safely read file with caching, size limits, and error handling"""
284+
if file_path in self._file_cache:
285+
return self._file_cache[file_path]
286+
287+
try:
288+
# size check
289+
if file_path.stat().st_size > self.MAX_FILE_SIZE:
290+
self._stats['files_skipped'] += 1
291+
return None
292+
293+
# read with fallback encodings
294+
content = None
295+
for encoding in ['utf-8', 'latin-1', 'cp1252']:
296+
try:
297+
content = file_path.read_text(encoding=encoding)
298+
break
299+
except UnicodeDecodeError:
300+
continue
301+
302+
if content is None:
303+
self._stats['read_errors'] += 1
304+
return None
305+
306+
# check for binary content (null bytes)
307+
if '\x00' in content[:1024]:
308+
self._stats['files_skipped'] += 1
309+
return None
310+
311+
self._file_cache[file_path] = content
312+
self._stats['files_read'] += 1
313+
return content
314+
315+
except Exception as e:
316+
logger.debug(f"Error reading {file_path}: {e}")
317+
self._stats['read_errors'] += 1
318+
return None
319+
273320
def _detect_language(self, file_path: str) -> str:
274321
ext = Path(file_path).suffix.lower()
275322
return {
@@ -281,14 +328,22 @@ def _detect_language(self, file_path: str) -> str:
281328
}.get(ext, 'unknown')
282329

283330
def _discover_files(self, repo_path: Path) -> List[Path]:
284-
"""Find all code files, skipping irrelevant directories"""
331+
"""Find all code files, skipping irrelevant directories and symlinks"""
285332
files = []
286333
extensions = {'.py', '.js', '.jsx', '.ts', '.tsx', '.sql'}
287334

288-
for item in repo_path.rglob('*'):
289-
if item.is_file() and item.suffix in extensions:
290-
if not any(skip in item.parts for skip in self.SKIP_DIRS):
291-
files.append(item)
335+
try:
336+
for item in repo_path.rglob('*'):
337+
if item.is_symlink():
338+
continue
339+
if item.is_file() and item.suffix in extensions:
340+
if not any(skip in item.parts for skip in self.SKIP_DIRS):
341+
files.append(item)
342+
if len(files) >= self.MAX_FILES:
343+
logger.warning(f"Hit max file limit ({self.MAX_FILES})")
344+
break
345+
except Exception as e:
346+
logger.error(f"Error discovering files: {e}")
292347

293348
return files
294349

@@ -824,8 +879,22 @@ def _extract_config_patterns(self, files: List[Path], repo_path: Path) -> Config
824879

825880
def extract_dna(self, repo_path: str, repo_id: str) -> CodebaseDNA:
826881
"""Extract complete DNA profile from a codebase"""
882+
import time
883+
start_time = time.time()
884+
827885
repo_path = Path(repo_path)
828886

887+
# validate path
888+
if not repo_path.exists():
889+
logger.error(f"Repo path does not exist: {repo_path}")
890+
raise ValueError(f"Repository path does not exist: {repo_path}")
891+
if not repo_path.is_dir():
892+
logger.error(f"Repo path is not a directory: {repo_path}")
893+
raise ValueError(f"Repository path is not a directory: {repo_path}")
894+
895+
# reset cache for fresh extraction
896+
self._reset_cache()
897+
829898
logger.info("Extracting codebase DNA", repo_id=repo_id, path=str(repo_path))
830899

831900
# Discover files
@@ -875,7 +944,15 @@ def extract_dna(self, repo_path: str, repo_id: str) -> CodebaseDNA:
875944
router_pattern=router_pattern,
876945
)
877946

878-
logger.info("DNA extraction complete", repo_id=repo_id)
947+
elapsed = time.time() - start_time
948+
logger.info(
949+
"DNA extraction complete",
950+
repo_id=repo_id,
951+
duration_sec=round(elapsed, 2),
952+
files_read=self._stats['files_read'],
953+
files_skipped=self._stats['files_skipped'],
954+
read_errors=self._stats['read_errors']
955+
)
879956
return dna
880957

881958
def save_to_cache(self, repo_id: str, dna: CodebaseDNA) -> bool:

0 commit comments

Comments
 (0)