@@ -253,7 +253,9 @@ def to_markdown(self) -> str:
253253class DNAExtractor :
254254 """Extracts architectural DNA from a codebase"""
255255
256- SKIP_DIRS = {'node_modules' , '.git' , '__pycache__' , 'venv' , 'env' , 'dist' , 'build' , '.next' , 'coverage' }
256+ SKIP_DIRS = {'node_modules' , '.git' , '__pycache__' , 'venv' , 'env' , 'dist' , 'build' , '.next' , 'coverage' , '.venv' , 'site-packages' }
257+ MAX_FILE_SIZE = 1024 * 1024 # 1MB
258+ MAX_FILES = 5000
257259
258260 def __init__ (self ):
259261 self .parsers = {
@@ -262,6 +264,8 @@ def __init__(self):
262264 'typescript' : Parser (Language (tsjavascript .language ())),
263265 }
264266 self ._supabase = None
267+ self ._file_cache : Dict [Path , str ] = {}
268+ self ._stats = {'files_read' : 0 , 'files_skipped' : 0 , 'read_errors' : 0 }
265269 logger .info ("DNAExtractor initialized" )
266270
267271 @property
@@ -270,6 +274,49 @@ def supabase(self):
270274 self ._supabase = get_supabase_service ()
271275 return self ._supabase
272276
277+ def _reset_cache (self ):
278+ """Clear file cache between extractions"""
279+ self ._file_cache .clear ()
280+ self ._stats = {'files_read' : 0 , 'files_skipped' : 0 , 'read_errors' : 0 }
281+
282+ def _safe_read_file (self , file_path : Path ) -> Optional [str ]:
283+ """Safely read file with caching, size limits, and error handling"""
284+ if file_path in self ._file_cache :
285+ return self ._file_cache [file_path ]
286+
287+ try :
288+ # size check
289+ if file_path .stat ().st_size > self .MAX_FILE_SIZE :
290+ self ._stats ['files_skipped' ] += 1
291+ return None
292+
293+ # read with fallback encodings
294+ content = None
295+ for encoding in ['utf-8' , 'latin-1' , 'cp1252' ]:
296+ try :
297+ content = file_path .read_text (encoding = encoding )
298+ break
299+ except UnicodeDecodeError :
300+ continue
301+
302+ if content is None :
303+ self ._stats ['read_errors' ] += 1
304+ return None
305+
306+ # check for binary content (null bytes)
307+ if '\x00 ' in content [:1024 ]:
308+ self ._stats ['files_skipped' ] += 1
309+ return None
310+
311+ self ._file_cache [file_path ] = content
312+ self ._stats ['files_read' ] += 1
313+ return content
314+
315+ except Exception as e :
316+ logger .debug (f"Error reading { file_path } : { e } " )
317+ self ._stats ['read_errors' ] += 1
318+ return None
319+
273320 def _detect_language (self , file_path : str ) -> str :
274321 ext = Path (file_path ).suffix .lower ()
275322 return {
@@ -281,14 +328,22 @@ def _detect_language(self, file_path: str) -> str:
281328 }.get (ext , 'unknown' )
282329
283330 def _discover_files (self , repo_path : Path ) -> List [Path ]:
284- """Find all code files, skipping irrelevant directories"""
331+ """Find all code files, skipping irrelevant directories and symlinks """
285332 files = []
286333 extensions = {'.py' , '.js' , '.jsx' , '.ts' , '.tsx' , '.sql' }
287334
288- for item in repo_path .rglob ('*' ):
289- if item .is_file () and item .suffix in extensions :
290- if not any (skip in item .parts for skip in self .SKIP_DIRS ):
291- files .append (item )
335+ try :
336+ for item in repo_path .rglob ('*' ):
337+ if item .is_symlink ():
338+ continue
339+ if item .is_file () and item .suffix in extensions :
340+ if not any (skip in item .parts for skip in self .SKIP_DIRS ):
341+ files .append (item )
342+ if len (files ) >= self .MAX_FILES :
343+ logger .warning (f"Hit max file limit ({ self .MAX_FILES } )" )
344+ break
345+ except Exception as e :
346+ logger .error (f"Error discovering files: { e } " )
292347
293348 return files
294349
@@ -824,8 +879,22 @@ def _extract_config_patterns(self, files: List[Path], repo_path: Path) -> Config
824879
825880 def extract_dna (self , repo_path : str , repo_id : str ) -> CodebaseDNA :
826881 """Extract complete DNA profile from a codebase"""
882+ import time
883+ start_time = time .time ()
884+
827885 repo_path = Path (repo_path )
828886
887+ # validate path
888+ if not repo_path .exists ():
889+ logger .error (f"Repo path does not exist: { repo_path } " )
890+ raise ValueError (f"Repository path does not exist: { repo_path } " )
891+ if not repo_path .is_dir ():
892+ logger .error (f"Repo path is not a directory: { repo_path } " )
893+ raise ValueError (f"Repository path is not a directory: { repo_path } " )
894+
895+ # reset cache for fresh extraction
896+ self ._reset_cache ()
897+
829898 logger .info ("Extracting codebase DNA" , repo_id = repo_id , path = str (repo_path ))
830899
831900 # Discover files
@@ -875,7 +944,15 @@ def extract_dna(self, repo_path: str, repo_id: str) -> CodebaseDNA:
875944 router_pattern = router_pattern ,
876945 )
877946
878- logger .info ("DNA extraction complete" , repo_id = repo_id )
947+ elapsed = time .time () - start_time
948+ logger .info (
949+ "DNA extraction complete" ,
950+ repo_id = repo_id ,
951+ duration_sec = round (elapsed , 2 ),
952+ files_read = self ._stats ['files_read' ],
953+ files_skipped = self ._stats ['files_skipped' ],
954+ read_errors = self ._stats ['read_errors' ]
955+ )
879956 return dna
880957
881958 def save_to_cache (self , repo_id : str , dna : CodebaseDNA ) -> bool :
0 commit comments