Skip to content

Commit b5991d7

Browse files
gavinmak authored and LUCI committed
sync: Add heuristic warning for bloated shallow repositories
For clone-depth="1" repositories that are dirty or have local commits, add a check at the end of sync to detect excessive git object accumulation. This prevents silent performance degradation and disk exhaustion in large prebuilts repos, where automatic GC is typically disabled as of https://gerrit.googlesource.com/git-repo/+/7f87c54043ce9a35a5bb60a09ee846f9d7070352.

Bug: 379111283
Change-Id: I376f38e1555cc6e906d852f6e63dc1c8f6331b4f
Reviewed-on: https://gerrit-review.googlesource.com/c/git-repo/+/534701
Commit-Queue: Gavin Mak <gavinmak@google.com>
Reviewed-by: Mike Frysinger <vapier@google.com>
Tested-by: Gavin Mak <gavinmak@google.com>
1 parent 7f87c54 commit b5991d7

File tree

1 file changed

+104
-0
lines changed

1 file changed

+104
-0
lines changed

subcmds/sync.py

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,10 @@ def _rlimit_nofile():
8787

8888
_REPO_ALLOW_SHALLOW = os.environ.get("REPO_ALLOW_SHALLOW")
8989

# Thresholds for the shallow-repository bloat heuristic. Sizes are compared
# against the KiB figures printed by `git count-objects -v`.
#
# A project counts as fragmented only when BOTH the pack count and the total
# pack size exceed their thresholds; excessive garbage is flagged on its own.
_BLOAT_PACK_COUNT_THRESHOLD = 10
_BLOAT_SIZE_PACK_THRESHOLD_KB = 10 * 1024 * 1024  # 10 GiB in KiB
_BLOAT_SIZE_GARBAGE_THRESHOLD_KB = 1 * 1024 * 1024  # 1 GiB in KiB
93+
9094
logger = RepoLogger(__file__)
9195

9296

@@ -1371,6 +1375,104 @@ def tidy_up(run_gc, bare_git):
13711375
t.join()
13721376
pm.end()
13731377

@classmethod
def _CheckOneBloatedProject(cls, project_index: int) -> Optional[str]:
    """Checks if a single project is bloated.

    A project is only inspected when it exists, has a worktree, and is
    either dirty (untracked files included) or has HEAD at a different
    revision than the manifest revision. Such projects can't be freshly
    re-cloned, so they are the ones that accumulate garbage.

    Args:
        project_index: The index of the project in the parallel context.

    Returns:
        The name of the project if it is bloated, else None. None is also
        returned when any of the git queries fails (best-effort check).
    """
    project = cls.get_parallel_context()["projects"][project_index]

    if not project.Exists or not project.worktree:
        return None

    try:
        # Only resolve revisions when the project is not already known to
        # be dirty: this avoids extra lookups and keeps a failing revision
        # lookup from hiding a project that is dirty.
        has_local_state = project.IsDirty(consider_untracked=True)
        if not has_local_state:
            manifest_rev = project.GetRevisionId(project.bare_ref.all)
            head_rev = project.work_git.rev_parse(HEAD)
            has_local_state = manifest_rev != head_rev

        if not has_local_state:
            return None

        output = project.bare_git.count_objects("-v")
    except Exception:
        # Deliberately best-effort: this is an advisory heuristic, so a
        # failing query must never break the sync itself.
        return None

    stats = cls._ParseCountObjects(output)
    pack_count = stats.get("packs", 0)
    size_pack_kb = stats.get("size-pack", 0)
    size_garbage_kb = stats.get("size-garbage", 0)

    # Many packs alone is not enough; they must also be large in total.
    is_fragmented = (
        pack_count > _BLOAT_PACK_COUNT_THRESHOLD
        and size_pack_kb > _BLOAT_SIZE_PACK_THRESHOLD_KB
    )
    has_excessive_garbage = (
        size_garbage_kb > _BLOAT_SIZE_GARBAGE_THRESHOLD_KB
    )

    if is_fragmented or has_excessive_garbage:
        return project.name
    return None

@staticmethod
def _ParseCountObjects(output: str) -> dict:
    """Parses "key: value" lines into a dict of integer stats.

    Args:
        output: Text as printed by `git count-objects -v`.

    Returns:
        A dict mapping each stripped key to its integer value. Lines
        without a ": " separator or with a non-integer value are skipped.
    """
    stats = {}
    for line in output.splitlines():
        key, sep, value = line.partition(": ")
        if not sep:
            continue
        try:
            stats[key.strip()] = int(value.strip())
        except ValueError:
            # Not a numeric stat line; ignore it.
            continue
    return stats
1432+
def _CheckForBloatedProjects(self, projects, opt):
    """Warn about shallow projects that are accumulating unoptimized data.

    Projects with a clone depth set are checked in parallel via
    _CheckOneBloatedProject, which inspects those that are dirty or have
    local commits using 'git count-objects -v'. A warning is logged (and
    recorded in the git event log) for every project reported as
    accumulating excessive pack files or garbage.

    Args:
        projects: Candidate projects; those without a clone depth are
            ignored.
        opt: Parsed command options; `quiet` and `jobs` are read.
    """
    shallow_projects = [p for p in projects if p.clone_depth]
    if not shallow_projects:
        return

    bloated_projects = []
    pm = Progress(
        "Checking for bloat",
        len(shallow_projects),
        delay=False,
        quiet=opt.quiet,
    )

    def _ProcessResults(pool, pm, results):
        for result in results:
            if result:
                bloated_projects.append(result)
            pm.update(msg="")

    try:
        with self.ParallelContext():
            # Workers receive an index and look the project up here.
            self.get_parallel_context()["projects"] = shallow_projects
            self.ExecuteInParallel(
                opt.jobs,
                self._CheckOneBloatedProject,
                range(len(shallow_projects)),
                callback=_ProcessResults,
                output=pm,
                chunksize=1,
            )
    finally:
        # Always close the progress display, even if the check is
        # interrupted or raises.
        pm.end()

    for project_name in bloated_projects:
        warn_msg = (
            f'warning: Project "{project_name}" is accumulating '
            'unoptimized data. Please run "repo sync --auto-gc" or '
            '"repo gc --repack" to clean up.'
        )
        self.git_event_log.ErrorEvent(warn_msg)
        logger.warning(warn_msg)
1475+
13741476
def _UpdateRepoProject(self, opt, manifest, errors):
13751477
"""Fetch the repo project and check for updates."""
13761478
if opt.local_only:
@@ -2002,6 +2104,8 @@ def _ExecuteHelper(self, opt, args, errors):
20022104
"experience, sync the entire tree."
20032105
)
20042106

2107+
self._CheckForBloatedProjects(all_projects, opt)
2108+
20052109
if not opt.quiet:
20062110
print("repo sync has finished successfully.")
20072111

0 commit comments

Comments
 (0)