Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 41 additions & 15 deletions src/sentry/integrations/github/multi_platform_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import re
import time
from collections import defaultdict
from concurrent.futures import as_completed
from typing import TYPE_CHECKING, Any, TypedDict

import sentry_sdk
Expand Down Expand Up @@ -43,6 +44,7 @@
from sentry.integrations.github.platform_registry import (
_PackageManifest as _PackageManifest,
)
from sentry.utils.concurrent import ContextPropagatingThreadPoolExecutor

if TYPE_CHECKING:
from sentry.integrations.github.client import GitHubBaseClient
Expand Down Expand Up @@ -191,9 +193,9 @@ def _select_active_platforms(
)


def _path_is_ignored(path: str) -> bool:
"""Return True if any segment of the path is in the ignore-list."""
return any(segment in _IGNORED_TREE_SEGMENTS for segment in path.split("/"))
def _segments_are_ignored(segments: list[str]) -> bool:
"""Return True if any path segment is in the ignore-list."""
return any(segment in _IGNORED_TREE_SEGMENTS for segment in segments)


def _get_tree(
Expand Down Expand Up @@ -243,23 +245,31 @@ def _build_tree_index(entries: list[dict[str, Any]]) -> _TreeIndex:
Any entry whose path passes through an ignored segment is skipped, so
``node_modules/some-lib/package.json`` never contributes a false signal.
``full_repo_size_bytes`` is the sum of ``size`` across all blobs.

Each path is split once; the resulting segments are reused for both the
ignore check and the basename extraction (single-pass, no redundant splits).
"""
files_full_paths_by_basename: dict[str, set[str]] = defaultdict(set)
dirs_full_paths_by_basename: dict[str, set[str]] = defaultdict(set)
full_repo_size_bytes = 0

for entry in entries:
path = entry.get("path", "")
entry_type = entry.get("type")
size = entry.get("size") or 0

if entry.get("type") == "blob":
if entry_type == "blob":
full_repo_size_bytes += size

if not path or _path_is_ignored(path):
if not path:
continue

entry_type = entry.get("type")
basename = path.rsplit("/", 1)[-1]
# Split once; reuse segments for ignore check and basename.
segments = path.split("/")
if _segments_are_ignored(segments):
continue

basename = segments[-1]

if entry_type == "blob":
files_full_paths_by_basename[basename].add(path)
Expand Down Expand Up @@ -480,11 +490,15 @@ def detect_platforms_multi(
"""
start_time = time.monotonic()

languages: dict[str, int] = client.get_languages(repo)
active_platforms = _select_active_platforms(languages)

# Run get_languages and _get_tree concurrently — they are independent
# requests. active_platforms only needs languages, so it is computed on
# the main thread while the tree fetch is in flight.
tree_start = time.monotonic()
entries, is_truncated = _get_tree(client, repo, ref)
with ContextPropagatingThreadPoolExecutor(max_workers=1) as ex:
tree_future = ex.submit(_get_tree, client, repo, ref)
languages: dict[str, int] = client.get_languages(repo)
active_platforms = _select_active_platforms(languages)
entries, is_truncated = tree_future.result()
tree_duration_ms = (time.monotonic() - tree_start) * 1000
index = _build_tree_index(entries)

Expand Down Expand Up @@ -526,12 +540,20 @@ def detect_platforms_multi(
# are always within the cap before subdirectory files from monorepo workspaces.
capped_paths = sorted(needed_paths, key=lambda p: (p.count("/"), p))[:MAX_CONTENT_READS]

# Fan out all content reads in parallel — each _get_repo_file_content call
# is an independent REST round-trip and already returns None on failure, so
# a single slow or missing file cannot block the others.
content_reads_start = time.monotonic()
content_by_path: dict[str, str] = {}
for path in capped_paths:
content = _get_repo_file_content(client, repo, path, ref)
if content is not None:
content_by_path[path] = content
if capped_paths:
with ContextPropagatingThreadPoolExecutor(max_workers=len(capped_paths)) as ex:
future_to_path = {
ex.submit(_get_repo_file_content, client, repo, p, ref): p for p in capped_paths
}
for future in as_completed(future_to_path):
content = future.result()
if content is not None:
content_by_path[future_to_path[future]] = content
content_reads_duration_ms = (time.monotonic() - content_reads_start) * 1000

manifests_by_path: dict[str, _PackageManifest] = {}
Expand Down Expand Up @@ -628,11 +650,15 @@ def detect_platforms_multi(
f"{_MULTI_METRICS_PREFIX}.k_reads_realized",
k_reads_realized,
)
# tree.duration: wall time of the concurrent (languages + tree) block —
# effectively the tree's wall time since it is the long pole.
sentry_sdk.metrics.distribution(
f"{_MULTI_METRICS_PREFIX}.tree.duration",
tree_duration_ms,
unit="millisecond",
)
# content_reads.duration: parallel wall time (max of individual reads),
# not the former serial sum — expect this metric to be significantly lower.
sentry_sdk.metrics.distribution(
f"{_MULTI_METRICS_PREFIX}.content_reads.duration",
content_reads_duration_ms,
Expand Down
80 changes: 69 additions & 11 deletions tests/sentry/integrations/github/test_multi_platform_detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
_collect_needed_paths,
_framework_matches_scoped,
_get_tree,
_path_is_ignored,
_rule_parent_dirs,
_segments_are_ignored,
_select_active_platforms,
detect_platforms_multi,
)
Expand All @@ -23,7 +23,7 @@
FrameworkDef,
_PackageManifest,
)
from sentry.shared_integrations.exceptions import ApiError
from sentry.shared_integrations.exceptions import ApiConflictError, ApiError
from sentry.utils import json


Expand Down Expand Up @@ -623,26 +623,84 @@ def test_byte_count_descending_ordering(self) -> None:
assert first_platform == "python"


class TestPathIsIgnored:
class TestSegmentsAreIgnored:
def test_node_modules_segment_ignored(self) -> None:
assert _path_is_ignored("node_modules/react/index.js") is True
assert _segments_are_ignored(["node_modules", "react", "index.js"]) is True

def test_nested_ignored_segment(self) -> None:
assert _path_is_ignored("a/b/vendor/c/util.py") is True
assert _segments_are_ignored(["a", "b", "vendor", "c", "util.py"]) is True

def test_build_gradle_file_not_ignored(self) -> None:
# "build" is an ignored *directory* segment, but "build.gradle" is a filename,
# not the bare segment "build", so it must NOT be ignored.
assert _path_is_ignored("build.gradle") is False
# "build" is an ignored *directory* segment, but "build.gradle" as a
# single segment is not the bare string "build", so must NOT be ignored.
assert _segments_are_ignored(["build.gradle"]) is False

def test_clean_path_not_ignored(self) -> None:
assert _path_is_ignored("src/app/main.py") is False
assert _segments_are_ignored(["src", "app", "main.py"]) is False

def test_root_level_file_not_ignored(self) -> None:
assert _path_is_ignored("manage.py") is False
assert _segments_are_ignored(["manage.py"]) is False

def test_dist_dir_ignored(self) -> None:
assert _path_is_ignored("dist/bundle.js") is True
assert _segments_are_ignored(["dist", "bundle.js"]) is True


class TestDetectPlatformsMultiConcurrency:
def test_isolated_content_read_failure_does_not_abort_detection(self) -> None:
"""If one content read raises ApiError the other reads must still
complete and the detection must return a valid result."""
from base64 import b64encode

# tree: requirements.txt (django content) + a second file that will 404
tree = [
_make_tree_entry("requirements.txt"),
_make_tree_entry("broken_file.txt"), # will raise ApiError
]
client = mock.MagicMock()
client.get_languages.return_value = {"Python": 80_000}

def get_side_effect(path: str, params: dict | None = None) -> Any:
if "/git/trees/" in path:
return {"tree": tree, "truncated": False}
rel = path.split("/contents/", 1)[1]
if rel == "requirements.txt":
return {"content": b64encode(b"Django==4.2\n").decode()}
raise ApiError("Not Found", code=404)

client.get.side_effect = get_side_effect
# Must not raise; ApiError on a single file is swallowed by
# _get_repo_file_content and should not abort the pool.
result = detect_platforms_multi(client, "owner/repo")
platform_ids = {p["platform"] for p in result["platforms"]}
assert "python-django" in platform_ids

def test_api_conflict_error_propagates_from_tree_future(self) -> None:
client = mock.MagicMock()
client.get_languages.return_value = {}
client.get.side_effect = ApiConflictError("empty repo")
import pytest

with pytest.raises(ApiConflictError):
detect_platforms_multi(client, "owner/repo")

def test_languages_computed_before_tree_result_is_joined(self) -> None:
"""active_platforms is derived from languages independently of the
tree. Verify detection still produces correct output even when the
tree responds very slowly (simulated via call ordering introspection)."""
# Simple fixture: Go repo, go.mod at root (existence rule → high match).
tree = [_make_tree_entry("go.mod")]
client = _make_client(
languages={"Go": 60_000},
tree=tree,
contents={},
)
result = detect_platforms_multi(client, "owner/repo")
platform_ids = {p["platform"] for p in result["platforms"]}
assert "go" in platform_ids
# get_languages must have been called exactly once, tree call once.
assert client.get_languages.call_count == 1
tree_calls = [c for c in client.get.call_args_list if "/git/trees/" in c.args[0]]
assert len(tree_calls) == 1


class TestGetTree:
Expand Down
Loading