@@ -361,21 +361,33 @@ def _detect_language(self, file_path: str) -> str:
361361 '.tsx' : 'typescript' ,
362362 }.get (ext , 'unknown' )
363363
364- def _discover_files (self , repo_path : Path ) -> List [Path ]:
365- """Find all code files, skipping irrelevant directories and symlinks"""
364+ def _discover_files (self , repo_path : Path , include_paths : Optional [List [str ]] = None ) -> List [Path ]:
365+ """Find all code files, skipping irrelevant directories and symlinks.
366+
367+ If include_paths is set, only files within those directories are returned.
368+ """
366369 files = []
367370 extensions = {'.py' , '.js' , '.jsx' , '.ts' , '.tsx' , '.sql' }
368371
369372 try :
370373 for item in repo_path .rglob ('*' ):
371374 if item .is_symlink ():
372375 continue
373- if item .is_file () and item .suffix in extensions :
374- if not any (skip in item .parts for skip in self .SKIP_DIRS ):
375- files .append (item )
376- if len (files ) >= self .MAX_FILES :
377- logger .warning (f"Hit max file limit ({ self .MAX_FILES } )" )
378- break
376+ if not item .is_file () or item .suffix not in extensions :
377+ continue
378+ if any (skip in item .parts for skip in self .SKIP_DIRS ):
379+ continue
380+ if include_paths :
381+ rel_parts = item .relative_to (repo_path ).parts
382+ if not any (
383+ rel_parts [:len (Path (p ).parts )] == Path (p ).parts
384+ for p in include_paths
385+ ):
386+ continue
387+ files .append (item )
388+ if len (files ) >= self .MAX_FILES :
389+ logger .warning (f"Hit max file limit ({ self .MAX_FILES } )" )
390+ break
379391 except Exception as e :
380392 logger .error (f"Error discovering files: { e } " )
381393
@@ -914,8 +926,11 @@ def _extract_config_patterns(self, files: List[Path], repo_path: Path) -> Config
914926
915927 return pattern
916928
917- def extract_dna (self , repo_path : str , repo_id : str ) -> CodebaseDNA :
918- """Extract complete DNA profile from a codebase"""
929+ def extract_dna (self , repo_path : str , repo_id : str , include_paths : Optional [List [str ]] = None ) -> CodebaseDNA :
930+ """Extract complete DNA profile from a codebase.
931+
932+ If include_paths is set, only files within those directories are analyzed.
933+ """
919934 import time
920935 start_time = time .time ()
921936
@@ -929,13 +944,25 @@ def extract_dna(self, repo_path: str, repo_id: str) -> CodebaseDNA:
929944 logger .error (f"Repo path is not a directory: { repo_path } " )
930945 raise ValueError (f"Repository path is not a directory: { repo_path } " )
931946
947+ # Sanitize include_paths (could be corrupt jsonb from DB)
948+ if include_paths :
949+ cleaned = []
950+ for p in include_paths :
951+ if not isinstance (p , str ):
952+ continue
953+ p = p .replace ('\\ ' , '/' ).strip ().strip ('/' )
954+ if not p or '..' in p .split ('/' ):
955+ continue
956+ cleaned .append (p )
957+ include_paths = cleaned or None
958+
932959 # reset cache for fresh extraction
933960 self ._reset_cache ()
934961
935- logger .info ("Extracting codebase DNA" , repo_id = repo_id , path = str (repo_path ))
962+ logger .info ("Extracting codebase DNA" , repo_id = repo_id , path = str (repo_path ), include_paths = include_paths )
936963
937964 # Discover files
938- files = self ._discover_files (repo_path )
965+ files = self ._discover_files (repo_path , include_paths = include_paths )
939966 logger .info (f"Found { len (files )} code files" )
940967
941968 # Detect framework first
0 commit comments