diff --git a/src/sentry/integrations/github/multi_platform_detection.py b/src/sentry/integrations/github/multi_platform_detection.py index f728bd31e108..5564aebdd46c 100644 --- a/src/sentry/integrations/github/multi_platform_detection.py +++ b/src/sentry/integrations/github/multi_platform_detection.py @@ -3,6 +3,7 @@ import re import time from collections import defaultdict +from concurrent.futures import as_completed from typing import TYPE_CHECKING, Any, TypedDict import sentry_sdk @@ -43,6 +44,7 @@ from sentry.integrations.github.platform_registry import ( _PackageManifest as _PackageManifest, ) +from sentry.utils.concurrent import ContextPropagatingThreadPoolExecutor if TYPE_CHECKING: from sentry.integrations.github.client import GitHubBaseClient @@ -191,9 +193,9 @@ def _select_active_platforms( ) -def _path_is_ignored(path: str) -> bool: - """Return True if any segment of the path is in the ignore-list.""" - return any(segment in _IGNORED_TREE_SEGMENTS for segment in path.split("/")) +def _segments_are_ignored(segments: list[str]) -> bool: + """Return True if any path segment is in the ignore-list.""" + return any(segment in _IGNORED_TREE_SEGMENTS for segment in segments) def _get_tree( @@ -243,6 +245,9 @@ def _build_tree_index(entries: list[dict[str, Any]]) -> _TreeIndex: Any entry whose path passes through an ignored segment is skipped, so ``node_modules/some-lib/package.json`` never contributes a false signal. ``full_repo_size_bytes`` is the sum of ``size`` across all blobs. + + Each path is split once; the resulting segments are reused for both the + ignore check and the basename extraction (single-pass, no redundant splits). """ files_full_paths_by_basename: dict[str, set[str]] = defaultdict(set) dirs_full_paths_by_basename: dict[str, set[str]] = defaultdict(set) @@ -250,16 +255,21 @@ def _build_tree_index(entries: list[dict[str, Any]]) -> _TreeIndex: for entry in entries: path = entry.get("path", "") + entry_type = entry.get("type") size = entry.get("size") or 0 - if entry.get("type") == "blob": + if entry_type == "blob": full_repo_size_bytes += size - if not path or _path_is_ignored(path): + if not path: continue - entry_type = entry.get("type") - basename = path.rsplit("/", 1)[-1] + # Split once; reuse segments for ignore check and basename. + segments = path.split("/") + if _segments_are_ignored(segments): + continue + + basename = segments[-1] if entry_type == "blob": files_full_paths_by_basename[basename].add(path) @@ -480,11 +490,15 @@ def detect_platforms_multi( """ start_time = time.monotonic() - languages: dict[str, int] = client.get_languages(repo) - active_platforms = _select_active_platforms(languages) - + # Run get_languages and _get_tree concurrently — they are independent + # requests. active_platforms only needs languages, so it is computed on + # the main thread while the tree fetch is in flight. tree_start = time.monotonic() - entries, is_truncated = _get_tree(client, repo, ref) + with ContextPropagatingThreadPoolExecutor(max_workers=1) as ex: + tree_future = ex.submit(_get_tree, client, repo, ref) + languages: dict[str, int] = client.get_languages(repo) + active_platforms = _select_active_platforms(languages) + entries, is_truncated = tree_future.result() tree_duration_ms = (time.monotonic() - tree_start) * 1000 index = _build_tree_index(entries) @@ -526,12 +540,20 @@ def detect_platforms_multi( # are always within the cap before subdirectory files from monorepo workspaces. capped_paths = sorted(needed_paths, key=lambda p: (p.count("/"), p))[:MAX_CONTENT_READS] + # Fan out all content reads in parallel — each _get_repo_file_content call + # is an independent REST round-trip and already returns None on failure, so + # a single slow or missing file cannot block the others. content_reads_start = time.monotonic() content_by_path: dict[str, str] = {} - for path in capped_paths: - content = _get_repo_file_content(client, repo, path, ref) - if content is not None: - content_by_path[path] = content + if capped_paths: + with ContextPropagatingThreadPoolExecutor(max_workers=len(capped_paths)) as ex: + future_to_path = { + ex.submit(_get_repo_file_content, client, repo, p, ref): p for p in capped_paths + } + for future in as_completed(future_to_path): + content = future.result() + if content is not None: + content_by_path[future_to_path[future]] = content content_reads_duration_ms = (time.monotonic() - content_reads_start) * 1000 manifests_by_path: dict[str, _PackageManifest] = {} @@ -628,11 +650,15 @@ def detect_platforms_multi( f"{_MULTI_METRICS_PREFIX}.k_reads_realized", k_reads_realized, ) + # tree.duration: wall time of the concurrent (languages + tree) block — + # effectively the tree's wall time since it is the long pole. sentry_sdk.metrics.distribution( f"{_MULTI_METRICS_PREFIX}.tree.duration", tree_duration_ms, unit="millisecond", ) + # content_reads.duration: parallel wall time (max of individual reads), + # not the former serial sum — expect this metric to be significantly lower. sentry_sdk.metrics.distribution( f"{_MULTI_METRICS_PREFIX}.content_reads.duration", content_reads_duration_ms, diff --git a/tests/sentry/integrations/github/test_multi_platform_detection.py b/tests/sentry/integrations/github/test_multi_platform_detection.py index a90bbc416cde..0bfcc7978b2a 100644 --- a/tests/sentry/integrations/github/test_multi_platform_detection.py +++ b/tests/sentry/integrations/github/test_multi_platform_detection.py @@ -12,8 +12,8 @@ _collect_needed_paths, _framework_matches_scoped, _get_tree, - _path_is_ignored, _rule_parent_dirs, + _segments_are_ignored, _select_active_platforms, detect_platforms_multi, ) @@ -23,7 +23,7 @@ FrameworkDef, _PackageManifest, ) -from sentry.shared_integrations.exceptions import ApiError +from sentry.shared_integrations.exceptions import ApiConflictError, ApiError from sentry.utils import json @@ -623,26 +623,84 @@ def test_byte_count_descending_ordering(self) -> None: assert first_platform == "python" -class TestPathIsIgnored: +class TestSegmentsAreIgnored: def test_node_modules_segment_ignored(self) -> None: - assert _path_is_ignored("node_modules/react/index.js") is True + assert _segments_are_ignored(["node_modules", "react", "index.js"]) is True def test_nested_ignored_segment(self) -> None: - assert _path_is_ignored("a/b/vendor/c/util.py") is True + assert _segments_are_ignored(["a", "b", "vendor", "c", "util.py"]) is True def test_build_gradle_file_not_ignored(self) -> None: - # "build" is an ignored *directory* segment, but "build.gradle" is a filename, - # not the bare segment "build", so it must NOT be ignored. - assert _path_is_ignored("build.gradle") is False + # "build" is an ignored *directory* segment, but "build.gradle" as a + # single segment is not the bare string "build", so must NOT be ignored. + assert _segments_are_ignored(["build.gradle"]) is False def test_clean_path_not_ignored(self) -> None: - assert _path_is_ignored("src/app/main.py") is False + assert _segments_are_ignored(["src", "app", "main.py"]) is False def test_root_level_file_not_ignored(self) -> None: - assert _path_is_ignored("manage.py") is False + assert _segments_are_ignored(["manage.py"]) is False def test_dist_dir_ignored(self) -> None: - assert _path_is_ignored("dist/bundle.js") is True + assert _segments_are_ignored(["dist", "bundle.js"]) is True + + +class TestDetectPlatformsMultiConcurrency: + def test_isolated_content_read_failure_does_not_abort_detection(self) -> None: + """If one content read raises ApiError the other reads must still + complete and the detection must return a valid result.""" + from base64 import b64encode + + # tree: requirements.txt (django content) + a second file that will 404 + tree = [ + _make_tree_entry("requirements.txt"), + _make_tree_entry("broken_file.txt"), # will raise ApiError + ] + client = mock.MagicMock() + client.get_languages.return_value = {"Python": 80_000} + + def get_side_effect(path: str, params: dict | None = None) -> Any: + if "/git/trees/" in path: + return {"tree": tree, "truncated": False} + rel = path.split("/contents/", 1)[1] + if rel == "requirements.txt": + return {"content": b64encode(b"Django==4.2\n").decode()} + raise ApiError("Not Found", code=404) + + client.get.side_effect = get_side_effect + # Must not raise; ApiError on a single file is swallowed by + # _get_repo_file_content and should not abort the pool. + result = detect_platforms_multi(client, "owner/repo") + platform_ids = {p["platform"] for p in result["platforms"]} + assert "python-django" in platform_ids + + def test_api_conflict_error_propagates_from_tree_future(self) -> None: + client = mock.MagicMock() + client.get_languages.return_value = {} + client.get.side_effect = ApiConflictError("empty repo") + import pytest + + with pytest.raises(ApiConflictError): + detect_platforms_multi(client, "owner/repo") + + def test_languages_computed_before_tree_result_is_joined(self) -> None: + """active_platforms is derived from languages independently of the + tree. Verify detection still produces correct output even when the + tree responds very slowly (simulated via call ordering introspection).""" + # Simple fixture: Go repo, go.mod at root (existence rule → high match). + tree = [_make_tree_entry("go.mod")] + client = _make_client( + languages={"Go": 60_000}, + tree=tree, + contents={}, + ) + result = detect_platforms_multi(client, "owner/repo") + platform_ids = {p["platform"] for p in result["platforms"]} + assert "go" in platform_ids + # get_languages must have been called exactly once, tree call once. + assert client.get_languages.call_count == 1 + tree_calls = [c for c in client.get.call_args_list if "/git/trees/" in c.args[0]] + assert len(tree_calls) == 1 class TestGetTree: