|
| 1 | +"""Filter out low-quality functions from search index.""" |
| 2 | +from typing import List, Set |
| 3 | +from services.search_v2.types import ExtractedFunction |
| 4 | +from services.observability import logger |
| 5 | + |
# Name prefixes that mark a function as junk. The tuple is passed directly to
# ``str.startswith`` against the lower-cased function name.
JUNK_PREFIXES = (
    # test doubles and generated data
    "test_", "time_", "rand_", "mock_", "fake_", "stub_",
    # test scaffolding
    "setup_", "teardown_", "fixture_", "check_",
    # underscore-prefixed variants of the above
    "_test_", "_time_", "_rand_", "_mock_",
    # assertion-style helpers
    "assert_", "verify_", "validate_test",
)
| 13 | + |
# Fragments that mark a function as junk when they occur anywhere in the
# lower-cased function name (substring containment, not prefix matching).
JUNK_PATTERNS = (
    # fixtures and test helpers
    "_fixture", "_setup", "_teardown", "_helper_test",
    # benchmarking / performance code
    "benchmark", "_bench", "_perf_",
    # synthetic data builders
    "_random_data", "_test_data", "_sample_data",
    "from_int_dict", "from_test", "_for_test",
    "_for_split", "create_data_for",
    # miscellaneous one-off names
    "doesnt_use_", "check_main",
)
| 23 | + |
# Path fragments that mark a file as junk. Each entry is looked up as a plain
# substring of the lower-cased file path.
JUNK_PATHS = (
    # test directories
    "tests/", "test/", "testing/", "/tests/", "/test/", "/testing/",
    # benchmarks
    "benchmarks/", "asv_bench/", "bench/",
    # examples and documentation
    "examples/", "docs/", "doc/",
    # testing utilities and pytest configuration
    "_testing/", "conftest",
    # fixtures
    "fixtures/", "_fixtures/",
    # test doubles
    "mock/", "mocks/", "stubs/",
)
| 33 | + |
# Allowlist of function names that are kept even when a junk rule would
# otherwise drop them. All entries are lower-case.
PUBLIC_API: Set[str] = {
    # tabular readers / writers
    "read_csv", "read_excel", "read_json", "read_parquet", "read_sql",
    "to_csv", "to_excel", "to_json", "to_parquet", "to_sql",
    # reshaping and combining
    "merge", "concat", "groupby", "pivot", "melt",
    # missing-data handling
    "fillna", "dropna", "isna", "notna",
    # element-wise and aggregate operations
    "apply", "map", "transform", "agg", "aggregate",
    # ordering and indexing
    "sort_values", "sort_index", "reset_index", "set_index",
    # authentication
    "authenticate", "authorize", "login", "logout",
    # validation and (de)serialization
    "validate", "serialize", "deserialize",
    # CRUD
    "create", "read", "update", "delete",
    # accessors and HTTP-style verbs
    "get", "set", "post", "put", "patch",
    # connections and messaging
    "connect", "disconnect", "send", "receive",
    # parsing and conversion
    "parse", "format", "convert",
    # persistence
    "load", "save", "export", "import_",
    # lifecycle
    "init", "setup", "configure", "initialize",
}
| 51 | + |
| 52 | + |
class FunctionFilter:
    """Filter functions to keep only high-quality, searchable ones.

    A function is dropped when its file path or name matches one of the
    module-level junk tables (``JUNK_PATHS``, ``JUNK_PREFIXES``,
    ``JUNK_PATTERNS``), when its name is longer than ``max_name_length``,
    or when it is private / dunder and the matching ``include_*`` flag is
    off. A function whose lower-cased name is exactly one of the entries in
    ``PUBLIC_API`` is always kept.

    Args:
        include_private: Keep names that start with an underscore and are
            not dunders.
        include_dunders: Keep names that both start and end with ``__``.
        max_name_length: Names longer than this many characters are dropped.
    """

    def __init__(
        self,
        include_private: bool = False,
        include_dunders: bool = True,
        max_name_length: int = 50,
    ):
        self.include_private = include_private
        self.include_dunders = include_dunders
        self.max_name_length = max_name_length

    def filter_functions(self, functions: List[ExtractedFunction]) -> List[ExtractedFunction]:
        """Return the functions accepted by ``_keep``, in their original order.

        Emits one debug log entry when at least one function was removed.
        """
        filtered = [f for f in functions if self._keep(f)]

        removed = len(functions) - len(filtered)
        if removed > 0:
            logger.debug("Filtered functions", kept=len(filtered), removed=removed)

        return filtered

    def _keep(self, func: ExtractedFunction) -> bool:
        """Return True when ``func`` should stay in the index.

        Reads ``func.name`` and ``func.file_path``. The rules run in order
        and the first one that matches decides the outcome.
        """
        raw_name = func.name
        name = raw_name.lower()
        path = func.file_path.lower()

        # Always keep public API. This must be an exact-name match: a
        # substring test would let short entries such as 'get', 'set' or
        # 'init' match almost every name (e.g. 'test_get_user', '__init__')
        # and bypass every rule below.
        if name in PUBLIC_API:
            return True

        # Skip junk paths.
        # NOTE(review): this is a substring test, so 'test/' also matches a
        # directory such as 'latest/'. Left as-is; confirm whether
        # segment-aware matching is wanted.
        if any(fragment in path for fragment in JUNK_PATHS):
            return False

        # Skip junk prefixes (str.startswith accepts a tuple of options).
        if name.startswith(JUNK_PREFIXES):
            return False

        # Skip junk patterns anywhere in the name.
        if any(fragment in name for fragment in JUNK_PATTERNS):
            return False

        # Skip overly long names.
        if len(name) > self.max_name_length:
            return False

        # Dunders are checked before the generic underscore rule so that
        # '__call__' is governed by include_dunders, not include_private.
        if raw_name.startswith('__') and raw_name.endswith('__'):
            return self.include_dunders

        # Any other leading underscore is private. This includes
        # double-underscore names that are not dunders (e.g. '__helper'),
        # which previously fell through and were treated as public.
        if raw_name.startswith('_'):
            return self.include_private

        # Skip test data generators.
        if name.startswith('make_') and ('test' in path or 'random' in name):
            return False

        return True

    def get_stats(self, functions: List[ExtractedFunction]) -> dict:
        """Return ``total`` / ``kept`` / ``removed`` counts for ``functions``.

        Applies the same ``_keep`` rules as ``filter_functions`` but does
        not log and does not return the filtered list.
        """
        total = len(functions)
        kept = sum(1 for f in functions if self._keep(f))
        return {
            "total": total,
            "kept": kept,
            "removed": total - kept,
        }
| 120 | + |
| 121 | + |
# Shared module-level filter built with the constructor's default arguments.
default_filter = FunctionFilter()


def filter_functions(functions: List[ExtractedFunction]) -> List[ExtractedFunction]:
    """Run ``functions`` through the shared ``default_filter``.

    Returns whatever ``default_filter.filter_functions`` returns for the
    given list.
    """
    kept = default_filter.filter_functions(functions)
    return kept
0 commit comments