diff --git a/codebase_rag/parsers/call_processor.py b/codebase_rag/parsers/call_processor.py index 4895ac717..0ae949329 100644 --- a/codebase_rag/parsers/call_processor.py +++ b/codebase_rag/parsers/call_processor.py @@ -501,9 +501,19 @@ def _process_calls_in_functions( call_name_cache=call_name_cache, ) continue - if func_qn := self._build_nested_qualified_name( - func_node, module_qn, func_name, lang_config - ): + # (H) A C++ free function inside a namespace is bound by the definition + # (H) pass via build_qualified_name (qn `module.ns.fn`); _build_nested... + # (H) ignores namespace_definition ancestors and would drop the namespace + # (H) (`module.fn`), dangling the CALLS source. Use the same builder so + # (H) caller and node qns agree. + func_qn = ( + cpp_utils.build_qualified_name(func_node, module_qn, func_name) + if language == cs.SupportedLanguage.CPP + else self._build_nested_qualified_name( + func_node, module_qn, func_name, lang_config + ) + ) + if func_qn: filtered = ( self._filter_calls_in_node(all_call_nodes, call_starts, func_node) if all_call_nodes is not None and call_starts is not None @@ -744,7 +754,15 @@ def _process_calls_in_classes( class_name = self._get_class_name_for_node(class_node, language) if not class_name: continue - class_qn = f"{module_qn}{cs.SEPARATOR_DOT}{class_name}" + # (H) A C++ class inside a namespace is bound by the definition pass via + # (H) build_qualified_name (qn `module.ns.Class`); the bare join would drop + # (H) the namespace, dangling every inline method's CALLS source. Use the + # (H) same builder so the class qn (and thus method caller qns) agree. 
+ class_qn = ( + cpp_utils.build_qualified_name(class_node, module_qn, class_name) + if language == cs.SupportedLanguage.CPP + else f"{module_qn}{cs.SEPARATOR_DOT}{class_name}" + ) if body_node := class_node.child_by_field_name(cs.FIELD_BODY): self._process_methods_in_class( body_node, diff --git a/codebase_rag/tests/test_cpp_namespace_call_caller_qn.py b/codebase_rag/tests/test_cpp_namespace_call_caller_qn.py new file mode 100644 index 000000000..2eda813e5 --- /dev/null +++ b/codebase_rag/tests/test_cpp_namespace_call_caller_qn.py @@ -0,0 +1,77 @@ +from __future__ import annotations + +from pathlib import Path +from unittest.mock import MagicMock + +from codebase_rag.tests.conftest import ( + get_nodes, + get_qualified_names, + get_relationships, + run_updater, +) + +# (H) A free function and an inline class method, both inside a namespace, each +# (H) calling a namespaced free function. The definition pass binds their nodes +# (H) WITH the enclosing namespace (qn `...ns.free_caller`, `...ns.K.method`), but +# (H) the call pass built the caller qn WITHOUT the namespace (`...free_caller`, +# (H) `...K.method`), so every such CALLS edge's source dangled (matched no node) +# (H) and the call was lost. On real namespaced C++ (e.g. all of leveldb, in +# (H) `namespace leveldb`) this silently dropped the bulk of cross-file calls. The +# (H) caller qn must include the enclosing namespace, matching the node. 
+CPP_SOURCE = """ +namespace acme { + +int callee(int x) { return x + 1; } + +int free_caller(int a) { return callee(a); } + +class K { +public: + int method(int b) { return callee(b); } +}; + +} // namespace acme +""" + + +def test_namespaced_callers_attribute_calls_to_namespaced_qn( + temp_repo: Path, + mock_ingestor: MagicMock, +) -> None: + project = temp_repo / "cpp_ns_calls" + project.mkdir() + (project / "sample.cpp").write_text(CPP_SOURCE, encoding="utf-8") + + run_updater(project, mock_ingestor) + + free_qn = next( + ( + q + for q in get_qualified_names(get_nodes(mock_ingestor, "Function")) + if q.endswith(".acme.free_caller") + ), + None, + ) + method_qn = next( + ( + q + for q in get_qualified_names(get_nodes(mock_ingestor, "Method")) + if q.endswith(".acme.K.method") + ), + None, + ) + assert free_qn is not None, "no ns.free_caller Function node" + assert method_qn is not None, "no ns.K.method Method node" + + calls = get_relationships(mock_ingestor, "CALLS") + # (H) ensure_relationship_batch(from_spec, rel_type, to_spec): from_spec[2] is + # (H) the caller qn, to_spec[2] the callee qn. 
+ callers_of_callee = { + c.args[0][2] for c in calls if str(c.args[2][2]).endswith(".callee") + } + assert free_qn in callers_of_callee, ( + f"expected CALLS from {free_qn} to callee; got {sorted(callers_of_callee)}" + ) + assert method_qn in callers_of_callee, ( + f"expected CALLS from {method_qn} to callee; got {sorted(callers_of_callee)}" + ) diff --git a/codebase_rag/tests/test_cpp_retrieval_eval.py b/codebase_rag/tests/test_cpp_retrieval_eval.py new file mode 100644 index 000000000..7f8eb5607 --- /dev/null +++ b/codebase_rag/tests/test_cpp_retrieval_eval.py @@ -0,0 +1,75 @@ +from pathlib import Path + +import pytest + +from evals import constants as ec +from evals.cpp_retrieval import ( + cgr_cpp_call_edges, + oracle_cpp_call_edges, + score_cpp_retrieval, +) +from evals.oracles import cpp_available + +needs_clang = pytest.mark.skipif( + not cpp_available(), reason="libclang (clang.cindex) not importable" +) + + +def _make_project(root: Path) -> None: + root.mkdir(parents=True, exist_ok=True) + # (H) No #includes: the fixture parses cleanly regardless of whether an SDK + # (H) libc++ is discoverable, so coverage is deterministic in any CI. All decls + # (H) live inside a namespace, exercising the namespaced caller-qn path (free + # (H) functions and an inline method) that the libclang oracle grades cgr against. 
+ (root / "lib.cc").write_text( + "namespace demo {\n" + "int add(int a, int b) { return a + b; }\n" + "int mul(int a, int b) { return a * b; }\n" + "int orphan(int a) { return a; }\n" + "}\n", + encoding="utf-8", + ) + (root / "main.cc").write_text( + "namespace demo {\n" + "int add(int a, int b);\n" + "int mul(int a, int b);\n" + "int compute(int x) { return add(x, x) + mul(x, x); }\n" + "class Runner {\n" + " public:\n" + " int run(int x) { return compute(x); }\n" + "};\n" + "}\n", + encoding="utf-8", + ) + + +@needs_clang +def test_oracle_captures_first_party_cpp_calls(tmp_path: Path) -> None: + _make_project(tmp_path) + edges, declared, covered = oracle_cpp_call_edges(tmp_path) + + # (H) add(), mul() (in compute), compute() (in Runner::run) are first-party. + assert ("main.cc", "add") in edges + assert ("main.cc", "mul") in edges + assert ("main.cc", "compute") in edges + # (H) orphan is defined but never called -> never a call edge. + assert ("lib.cc", "orphan") not in edges + assert {"add", "mul", "compute", "run", "orphan"} <= declared + # (H) Both header-free sources parse cleanly, so both are graded. + assert {"main.cc", "lib.cc"} <= covered + + +@needs_clang +def test_cgr_matches_oracle_on_clean_cpp_project(tmp_path: Path) -> None: + _make_project(tmp_path) + oracle, declared, covered = oracle_cpp_call_edges(tmp_path) + cgr = cgr_cpp_call_edges(tmp_path, tmp_path.name, declared, covered) + assert cgr == oracle + + +def test_score_cpp_retrieval_prf() -> None: + result = score_cpp_retrieval( + {("a.cc", "f"), ("a.cc", "g")}, {("a.cc", "f"), ("b.cc", "h")} + ) + row = next(r for r in result.rows if r["label"] == ec.CPP_RETRIEVAL_LABEL) + assert (row["tp"], row["fp"], row["fn"]) == (1, 1, 1) diff --git a/evals/README.md b/evals/README.md index 0fd3ad646..b1d262629 100644 --- a/evals/README.md +++ b/evals/README.md @@ -630,6 +630,76 @@ parse and drops the following declaration, present through the latest 0.24.2); tracked in issue #555 for an upstream report. 
It is rooted in the grammar, not in cgr's resolution logic, so it is reported here, not hidden. +## Multi-language retrieval (C++) — C++ CALLS vs `libclang` + +The same harness applied to C++: for each first-party C++ function or member +function, which files call it. cgr's C++ `CALLS` edges, reduced to +`(caller_file, callee_simple_name)`, are graded against call sites extracted by +`libclang`, over the same first-party name universe (free functions, function +templates, and member functions; constructors/destructors are excluded because +cgr models object creation as `INSTANTIATES`). cgr parses C++ with tree-sitter by +default (`CPP_FRONTEND=libclang` is off), so `libclang` is an independent oracle. +Overloads collapse under the `(file, simple-name)` metric, so they need no +disambiguation. + +```bash +uv run python -m evals.cpp_retrieval --target path/to/leveldb --define LEVELDB_PLATFORM_POSIX=1 +``` + +Requires `libclang`. C++ standard headers must be parsed by a `libclang` whose +clang version matches the active SDK's `libc++`; the bundled pip wheel's older +clang cannot, so the oracle prefers a system `libclang` +(`/Library/Developer/CommandLineTools/usr/lib/libclang.dylib` on macOS) and pins +it before the first parse. No `compile_commands.json` is needed: each source is +parsed directly with the SDK sysroot, the SDK's `libc++` headers (which must +precede clang's builtin resource headers), and every first-party header directory +added as an include path. A build normally supplies platform macros (e.g. +`LEVELDB_PLATFORM_POSIX`); pass them with `--define`. A translation unit that still +emits an error diagnostic **abstains** (left out of the covered set; the cgr side +is held to the same files, the graded count logged). 
To avoid crediting or +penalizing calls whose simple name merely collides with a first-party symbol, the +oracle grades a call only when `libclang` resolves its callee to a **first-party +declaration** (`child.referenced`), so a `std::string::size()` call is never +counted as a first-party `size` edge. Pinned by +`codebase_rag/tests/test_cpp_retrieval_eval.py`, where cgr's C++ call graph matches +the `libclang` oracle on a header-free namespaced fixture. + +Running it on a real project (`leveldb`, 40 of 42 core sources parsed cleanly; the +other two are Windows-only or need gmock) gives precision **0.96**, recall +**0.82**, F1 0.88 — recall up from **0.54** before the fix below. + +**The dominant gap was a real cgr bug: the call pass dropped the namespace from +the caller qn.** The definition pass binds a C++ free function or class inside a +`namespace` to a namespaced qualified name (`module.ns.fn`, `module.ns.Class`), +but the call pass built the enclosing caller's qn without the namespace +(`module.fn`, `module.Class.method`). Every such `CALLS` edge's source therefore +pointed at a node that does not exist (904 of 1227 C++ call sources dangled on +`leveldb`, all of it in `namespace leveldb`), so the call never attached. The fix +routes both the free-function and class qns through the same +`cpp_utils.build_qualified_name` the definition pass uses, so caller and node qns +always agree (RED test `test_cpp_namespace_call_caller_qn.py`). Dangling sources +fell to 251 and recall rose 0.54 → 0.82. + +The remaining tail is documented, not scoped away: + +- **Operator overloads** (`operator=` ×25, `operator[]`, `operator==`/`!=`): + `libclang` records `a = b` and `a[i]` as calls to the overloaded operator + methods, while cgr models them as `builtin.cpp.*` operator calls — a metric + difference, not a misresolution. 
+- **Trie-fallback misresolution of external calls** (the ~30 false positives: + `size`, `data`, `empty`, `clear`, `begin`, `end`): when a call's simple name + collides with a first-party method, cgr's name-only trie fallback binds the + external `std::` call to the same-named first-party method. The oracle correctly + treats it as external, so it surfaces as a cgr false positive. +- **Receiver-type method dispatch and out-of-line static methods** (`DB::Open`): + resolving `obj->method()` to the right class needs C++ receiver type inference + (C++ is not yet in the typed-language set that builds a local-variable type + map), the same deeper gap as the Go/Java/Rust tails. + +The last two share one root cause: cgr has no C++ receiver type inference, so it +resolves member calls by name alone. The eval keeps surfacing it; it is a +follow-on, not hidden. + ## Semantic search — query to function relevance cgr's semantic search embeds each function's source and retrieves by cosine diff --git a/evals/constants.py b/evals/constants.py index a0076d9c4..5fa054b01 100644 --- a/evals/constants.py +++ b/evals/constants.py @@ -199,6 +199,29 @@ class Category(StrEnum): C_RETRIEVAL_TITLE = "cgr multi-language retrieval: C CALLS vs libclang oracle" C_CALL_EDGE_REPR = "{file} -> {name}" +CPP_SOURCE_GLOBS: tuple[str, ...] = ("*.cc", "*.cpp", "*.cxx") +CPP_HEADER_GLOBS: tuple[str, ...] = ("*.h", "*.hpp", "*.hh", "*.hxx") +CPP_SUFFIXES: tuple[str, ...] = (".cc", ".cpp", ".cxx", ".h", ".hpp", ".hh", ".hxx") +CLANG_CPP_STD = "-std=c++17" +CLANG_CPP_LANG_FLAG = "-x" +CLANG_CPP_LANG = "c++" +CLANG_DEFINE_FLAG = "-D" +# (H) Apple ships a libclang whose version matches the active macOS SDK's libc++, +# (H) which the pip `libclang` wheel does not; C++ standard headers need that match +# (H) to parse. Probed in order; first existing path wins, else the bundled default. +LIBCLANG_CANDIDATES: tuple[str, ...] 
= ( + "/Library/Developer/CommandLineTools/usr/lib/libclang.dylib", +) +# (H) libc++ headers live under /usr/include/c++/v1 and MUST precede the clang +# (H) builtin resource headers, else libc++'s <cstddef> finds the C <stddef.h> first. +CLANG_LIBCXX_SUBPATH = "usr/include/c++/v1" +CPP_RETRIEVAL_SCORES_FILENAME = "cpp_retrieval_scores.csv" +CPP_RETRIEVAL_DIFF_FILENAME = "cpp_retrieval_diff.json" +CPP_RETRIEVAL_DIFF_PREFIX = "cpp-retrieval:" +CPP_RETRIEVAL_LABEL = "graph" +CPP_RETRIEVAL_TITLE = "cgr multi-language retrieval: C++ CALLS vs libclang oracle" +CPP_CALL_EDGE_REPR = "{file} -> {name}" + # (H) Semantic-search relevance eval: does cgr's embedding ranking retrieve the # (H) right function for a natural-language query? Uses cgr's own embedder over # (H) function source extracted from the captured graph; graded as recall@k on diff --git a/evals/cpp_retrieval.py b/evals/cpp_retrieval.py new file mode 100644 index 000000000..3aab7b472 --- /dev/null +++ b/evals/cpp_retrieval.py @@ -0,0 +1,123 @@ +# (H) Multi-language retrieval (C++). Extends the file-level call-localization +# (H) benchmark to C++: for each first-party C++ function/method, which files call +# (H) it. cgr's C++ CALLS edges (reduced to (caller_file, callee_simple_name)) are +# (H) graded against call sites extracted by libclang, over the same first-party +# (H) name universe. libclang resolves the true translation-unit call graph, +# (H) independent of cgr's tree-sitter C++ frontend (cgr parses C++ with tree-sitter +# (H) by default; CPP_FRONTEND=libclang is off), so this measures cgr's cross-file +# (H) C++ call resolution against ground truth (mirrors evals/c_retrieval.py). +from pathlib import Path +from typing import Annotated + +import typer +from loguru import logger + +from codebase_rag import constants as cs + +from . import constants as ec +from . 
import logs as ls +from .cgr_graph import _capture +from .oracles import cpp_available, run_cpp_call_oracle +from .score import _prf +from .structure_report import render, write_outputs +from .types_defs import DiffBucket, LocationStats, ScoreResult, ScoreRow + +console_target = Path(ec.CPP_DEFAULT_TARGET) + +_CALLS = cs.RelationshipType.CALLS.value +_EMPTY_LOCATION = LocationStats(0, 0, 0, 0.0, 0) + +CallEdge = tuple[str, str] + + +def oracle_cpp_call_edges( + target: Path, extra_defines: tuple[str, ...] = () +) -> tuple[set[CallEdge], frozenset[str], frozenset[str]]: + return run_cpp_call_oracle(target, extra_defines) + + +def cgr_cpp_call_edges( + target: Path, project: str, declared: frozenset[str], covered: frozenset[str] +) -> set[CallEdge]: + ingestor = _capture(target, project) + caller_path: dict[tuple[str, str], str] = { + (str(label), str(uid)): str(props[cs.KEY_PATH]) + for (label, uid), props in ingestor.nodes.items() + if props.get(cs.KEY_PATH) and str(props[cs.KEY_PATH]).endswith(ec.CPP_SUFFIXES) + } + edges: set[CallEdge] = set() + for from_label, from_val, rel_type, _to_label, to_val in ingestor.rels: + if rel_type != _CALLS: + continue + path = caller_path.get((str(from_label), str(from_val))) + # (H) Grade only files the oracle parsed cleanly (its authoritative set). 
+ if path is None or path not in covered: + continue + name = str(to_val).split(cs.SEPARATOR_DOT)[-1] + if name in declared: + edges.add((path, name)) + return edges + + +def _edge_repr(edge: CallEdge) -> str: + return ec.CPP_CALL_EDGE_REPR.format(file=edge[0], name=edge[1]) + + +def score_cpp_retrieval(cgr: set[CallEdge], oracle: set[CallEdge]) -> ScoreResult: + rows: list[ScoreRow] = [] + diff: dict[str, DiffBucket] = {} + row = _prf(ec.Category.RETRIEVAL.value, ec.CPP_RETRIEVAL_LABEL, cgr, oracle) + if row is not None: + rows.append(row) + diff[ec.CPP_RETRIEVAL_DIFF_PREFIX + ec.CPP_RETRIEVAL_LABEL] = DiffBucket( + missing=[_edge_repr(e) for e in sorted(oracle - cgr)], + extra=[_edge_repr(e) for e in sorted(cgr - oracle)], + ) + return ScoreResult(rows=rows, location=_EMPTY_LOCATION, diff=diff) + + +def main( + target: Annotated[ + Path, + typer.Option(help="Directory of C++ sources to evaluate call retrieval."), + ] = console_target, + project_name: Annotated[ + str, typer.Option(help="cgr project name; defaults to target dir name.") + ] = "", + define: Annotated[ + list[str], + typer.Option(help="Preprocessor macro the build would supply, e.g. 
NAME=1."), + ] = [], + out_dir: Annotated[ + Path, + typer.Option(help="Directory for cpp_retrieval_scores.csv and diff json."), + ] = Path(ec.DEFAULT_OUT_DIR), +) -> None: + if not cpp_available(): + logger.error(ls.CPP_RETRIEVAL_ORACLE_MISSING) + raise typer.Exit(code=1) + + target = target.resolve() + project = project_name or target.name + + logger.info(ls.CPP_RETRIEVAL_ORACLE.format(target=target)) + oracle, declared, covered = oracle_cpp_call_edges(target, tuple(define)) + logger.success(ls.CPP_RETRIEVAL_ORACLE_DONE.format(count=len(oracle))) + logger.info(ls.CPP_RETRIEVAL_COVERED.format(count=len(covered))) + + logger.info(ls.CPP_RETRIEVAL_CGR.format(target=target, project=project)) + cgr = cgr_cpp_call_edges(target, project, declared, covered) + logger.success(ls.CPP_RETRIEVAL_CGR_DONE.format(count=len(cgr))) + + result = score_cpp_retrieval(cgr, oracle) + write_outputs( + result, + out_dir, + ec.CPP_RETRIEVAL_SCORES_FILENAME, + ec.CPP_RETRIEVAL_DIFF_FILENAME, + ) + render(result, ec.CPP_RETRIEVAL_TITLE) + + +if __name__ == "__main__": + typer.run(main) diff --git a/evals/logs.py b/evals/logs.py index 3f7e366ec..094421529 100644 --- a/evals/logs.py +++ b/evals/logs.py @@ -114,6 +114,14 @@ C_RETRIEVAL_COVERED = "cleanly-parsed C source files graded: {count}" C_RETRIEVAL_CGR = "Building cgr C CALLS edges for {target} (project={project})" C_RETRIEVAL_CGR_DONE = "cgr C call edges (first-party): {count}" +CPP_RETRIEVAL_ORACLE_MISSING = ( + "libclang (clang.cindex) not importable; cannot run the C++ oracle" +) +CPP_RETRIEVAL_ORACLE = "Running libclang C++ call oracle over {target}" +CPP_RETRIEVAL_ORACLE_DONE = "libclang first-party C++ call edges: {count}" +CPP_RETRIEVAL_COVERED = "cleanly-parsed C++ source files graded: {count}" +CPP_RETRIEVAL_CGR = "Building cgr C++ CALLS edges for {target} (project={project})" +CPP_RETRIEVAL_CGR_DONE = "cgr C++ call edges (first-party): {count}" SEMANTIC_MISSING = "semantic dependencies not installed; cannot run semantic eval" 
SEMANTIC_TARGET = "Semantic-search eval over {target} (project={project})" SEMANTIC_DONE = "recall@{k}: {hits}/{total} queries retrieved the expected function" diff --git a/evals/oracles/__init__.py b/evals/oracles/__init__.py index 35fd3e2b2..63e71c2cc 100644 --- a/evals/oracles/__init__.py +++ b/evals/oracles/__init__.py @@ -1,4 +1,9 @@ -from .cpp_oracle import cpp_available, run_c_call_oracle, run_cpp_oracle +from .cpp_oracle import ( + cpp_available, + run_c_call_oracle, + run_cpp_call_oracle, + run_cpp_oracle, +) from .go_oracle import go_available, run_go_call_oracle, run_go_oracle from .java_oracle import java_available, run_java_call_oracle, run_java_oracle from .lua_oracle import lua_oracle_available, run_lua_call_oracle, run_lua_oracle @@ -14,6 +19,7 @@ __all__ = [ "cpp_available", "run_c_call_oracle", + "run_cpp_call_oracle", "run_cpp_oracle", "go_available", "run_go_call_oracle", diff --git a/evals/oracles/cpp_oracle.py b/evals/oracles/cpp_oracle.py index 3b4ca26c6..2700d68de 100644 --- a/evals/oracles/cpp_oracle.py +++ b/evals/oracles/cpp_oracle.py @@ -55,7 +55,43 @@ } +_libclang_pinned = False + + +def _ensure_libclang() -> None: + # (H) Pin the libclang shared library BEFORE the first Index.create (libclang is + # (H) a global one-shot). Prefer a system libclang whose clang version matches the + # (H) active SDK's libc++ — required to parse C++ standard headers, which the + # (H) bundled pip wheel's older clang cannot. C parsing is unaffected by the + # (H) choice, so both the C and C++ oracles share one consistent toolchain. + global _libclang_pinned + if _libclang_pinned: + return + _libclang_pinned = True + # (H) clang is an optional dependency: if the bindings are absent this import + # (H) raises ModuleNotFoundError, so swallow it here and let cpp_available's own + # (H) try/except report the oracle as unavailable (returning False), rather than + # (H) letting the exception escape and break test collection / the CLI path. 
+ try: + from clang.cindex import Config + except Exception: + return + + for candidate in ec.LIBCLANG_CANDIDATES: + if Path(candidate).exists(): + try: + Config.set_library_file(candidate) + return + except Exception: + # (H) libclang loading raises a wide, unpredictable range of errors + # (H) (arch mismatch, format errors, an already-loaded library); on + # (H) any, fall through to the next candidate, else the bundled + # (H) default the bindings load on their own. + continue + + def cpp_available() -> bool: + _ensure_libclang() try: import clang.cindex as ci @@ -73,6 +109,7 @@ def _rel(path: str, root: Path) -> str | None: def run_cpp_oracle(target: Path) -> GraphData: + _ensure_libclang() import clang.cindex as ci root = target.resolve() @@ -169,7 +206,14 @@ def _emit( _FUNCTION_DECL = "FUNCTION_DECL" +_FUNCTION_TEMPLATE = "FUNCTION_TEMPLATE" +_CXX_METHOD = "CXX_METHOD" _CALL_EXPR = "CALL_EXPR" +# (H) C: only free functions are first-party callees. C++: free functions (incl. +# (H) templates) plus member functions; constructors/destructors are excluded +# (H) because cgr models object creation as INSTANTIATES, not CALLS. +_C_DECL_KINDS = frozenset({_FUNCTION_DECL}) +_CPP_DECL_KINDS = frozenset({_FUNCTION_DECL, _FUNCTION_TEMPLATE, _CXX_METHOD}) def _capture_path(command: tuple[str, ...]) -> str | None: @@ -211,11 +255,27 @@ def _c_include_args(root: Path) -> list[str]: return args -def _collect_c_decls_and_calls( +def _callee_is_first_party(call: Cursor, root: Path) -> bool: + # (H) libclang resolves a call to its callee declaration; grade the call only + # (H) when that declaration is itself first-party. Without this, a call whose + # (H) simple name collides with a first-party symbol (e.g. `std::string::size` + # (H) vs a project `size()`) would be counted as a first-party edge, understating + # (H) cgr recall against calls it correctly resolves as external/builtin. C++'s + # (H) large STL surface (size/data/empty/clear/...) makes this collision common. 
+ ref = call.referenced + if ref is None or ref.location.file is None: + return False + cref = _rel(ref.location.file.name, root) + return cref is not None and not is_ignored(cref) + + +def _collect_decls_and_calls( cursor: Cursor, root: Path, declared: set[str], raw_calls: list[tuple[str, str]] | None, + decl_kinds: frozenset[str], + strict_callee: bool = False, ) -> None: # (H) raw_calls is None for an unclean translation unit: its AST may be # (H) truncated by a missing header, so its call sites are not authoritative @@ -227,11 +287,18 @@ def _collect_c_decls_and_calls( # (H) never graded and walking them is the dominant cost. if rel is None or is_ignored(rel): continue - if child.kind.name == _FUNCTION_DECL and child.is_definition(): + if child.kind.name in decl_kinds and child.is_definition(): declared.add(child.spelling) - elif raw_calls is not None and child.kind.name == _CALL_EXPR and child.spelling: + elif ( + raw_calls is not None + and child.kind.name == _CALL_EXPR + and child.spelling + and (not strict_callee or _callee_is_first_party(child, root)) + ): raw_calls.append((rel, child.spelling)) - _collect_c_decls_and_calls(child, root, declared, raw_calls) + _collect_decls_and_calls( + child, root, declared, raw_calls, decl_kinds, strict_callee + ) def run_c_call_oracle( @@ -245,6 +312,7 @@ def run_c_call_oracle( # (H) simple name is unambiguous. A file whose TU emits an error diagnostic # (H) (a missing build-generated header) is not authoritative, so it is left # (H) out of the covered set and the cgr side is held to the same files. 
+ _ensure_libclang() import clang.cindex as ci root = target.resolve() @@ -264,8 +332,8 @@ def run_c_call_oracle( clean = not any( diag.severity >= ec.CLANG_SEVERITY_ERROR for diag in tu.diagnostics ) - _collect_c_decls_and_calls( - tu.cursor, root, declared, raw_calls if clean else None + _collect_decls_and_calls( + tu.cursor, root, declared, raw_calls if clean else None, _C_DECL_KINDS ) if clean: covered.add(rel) @@ -279,6 +347,96 @@ def run_c_call_oracle( return edges, declared_names, covered_files +def _cpp_system_args() -> list[str]: + # (H) Like _clang_system_args but for C++: the SDK's libc++ headers must precede + # (H) the clang builtin resource headers, else libc++'s <cstddef> resolves the C + # (H) <stddef.h> first and the parse fails. isysroot supplies the platform C + # (H) library; the resource dir supplies clang builtins (stdarg.h, stddef.h). + args: list[str] = [] + if sdk := _capture_path(ec.XCRUN_SDK_PATH_CMD): + args.extend((ec.CLANG_ISYSROOT_FLAG, sdk)) + args.extend((ec.CLANG_ISYSTEM_FLAG, str(Path(sdk) / ec.CLANG_LIBCXX_SUBPATH))) + if resource := _capture_path(ec.CLANG_RESOURCE_DIR_CMD): + args.extend((ec.CLANG_ISYSTEM_FLAG, str(Path(resource) / ec.CLANG_INCLUDE_DIR))) + return args + + +def _cpp_include_args(root: Path) -> list[str]: + # (H) Root and a conventional include/ root plus every dir holding a C++ header + # (H) become -I paths so first-party #includes resolve without a compile database. + dirs = {root, root / ec.CLANG_INCLUDE_DIR} + for glob in ec.CPP_HEADER_GLOBS: + for header in root.rglob(glob): + rel = _rel(str(header), root) + if rel is not None and not is_ignored(rel): + dirs.add(header.parent) + args: list[str] = [] + for directory in sorted(dirs): + if directory.exists(): + args.extend((ec.CLANG_INCLUDE_FLAG, str(directory))) + return args + + +def run_cpp_call_oracle( + target: Path, + extra_defines: tuple[str, ...] 
= (), +) -> tuple[set[tuple[str, str]], frozenset[str], frozenset[str]]: + # (H) File-level C++ call sites restricted to first-party callees (free functions + # (H) and member functions), the declared name universe, and the cleanly-parsed + # (H) source files. libclang resolves the true translation-unit call graph + # (H) (independent of cgr's tree-sitter C++ frontend). Overloads collapse under + # (H) the (file, simple-name) metric, so they need no disambiguation. extra_defines + # (H) carries corpus-specific platform macros (e.g. LEVELDB_PLATFORM_POSIX) that a + # (H) build system would normally supply; a TU that still errors abstains. + _ensure_libclang() + import clang.cindex as ci + + root = target.resolve() + index = ci.Index.create() + defines = [ec.CLANG_DEFINE_FLAG + d for d in extra_defines] + base_args = [ + ec.CLANG_CPP_LANG_FLAG, + ec.CLANG_CPP_LANG, + ec.CLANG_CPP_STD, + *defines, + *_cpp_system_args(), + *_cpp_include_args(root), + ] + declared: set[str] = set() + raw_calls: list[tuple[str, str]] = [] + covered: set[str] = set() + for glob in ec.CPP_SOURCE_GLOBS: + for source in sorted(root.rglob(glob)): + rel = _rel(str(source), root) + if rel is None or is_ignored(rel): + continue + try: + tu = index.parse(str(source), args=base_args) + except ci.TranslationUnitLoadError: + continue + clean = not any( + diag.severity >= ec.CLANG_SEVERITY_ERROR for diag in tu.diagnostics + ) + _collect_decls_and_calls( + tu.cursor, + root, + declared, + raw_calls if clean else None, + _CPP_DECL_KINDS, + strict_callee=True, + ) + if clean: + covered.add(rel) + declared_names = frozenset(declared) + covered_files = frozenset(covered) + edges = { + (file, name) + for file, name in raw_calls + if name in declared_names and file in covered_files + } + return edges, declared_names, covered_files + + def _add_edge( edges: dict[_EdgeId, OracleEdge], rel: str,