sync: Add heuristic warning for bloated shallow repositories

For clone-depth="1" repositories that are dirty or have local commits,
add a check at the end of sync to detect excessive git object
accumulation.

This prevents silent performance degradation and disk exhaustion in
large prebuilts repos, where automatic GC is typically disabled (see
https://gerrit.googlesource.com/git-repo/+/7f87c54043ce9a35a5bb60a09ee846f9d7070352).

Bug: 379111283
Change-Id: I376f38e1555cc6e906d852f6e63dc1c8f6331b4f
Reviewed-on: https://gerrit-review.googlesource.com/c/git-repo/+/534701
Commit-Queue: Gavin Mak <gavinmak@google.com>
Reviewed-by: Mike Frysinger <vapier@google.com>
Tested-by: Gavin Mak <gavinmak@google.com>
This commit is contained in:
Gavin Mak
2025-12-09 22:29:43 +00:00
committed by LUCI
parent 7f87c54043
commit b5991d7128

View File

@@ -87,6 +87,10 @@ _ONE_DAY_S = 24 * 60 * 60
_REPO_ALLOW_SHALLOW = os.environ.get("REPO_ALLOW_SHALLOW")
# Heuristic thresholds for the shallow-repo bloat warning.  A project is
# flagged as "fragmented" only when it exceeds BOTH the pack-count and the
# pack-size thresholds; excessive loose garbage alone also triggers the
# warning.  Sizes are in KiB because `git count-objects -v` reports
# size-pack / size-garbage in KiB.
_BLOAT_PACK_COUNT_THRESHOLD = 10
_BLOAT_SIZE_PACK_THRESHOLD_KB = 10 * 1024 * 1024 # 10 GiB in KiB
_BLOAT_SIZE_GARBAGE_THRESHOLD_KB = 1 * 1024 * 1024 # 1 GiB in KiB
logger = RepoLogger(__file__)
@@ -1371,6 +1375,104 @@ later is required to fix a server side protocol bug.
t.join()
pm.end()
@classmethod
def _CheckOneBloatedProject(cls, project_index: int) -> Optional[str]:
    """Checks if a single project is bloated.

    Args:
        project_index: The index of the project in the parallel context.

    Returns:
        The name of the project if it is bloated, else None.
    """
    project = cls.get_parallel_context()["projects"][project_index]
    if not (project.Exists and project.worktree):
        return None

    # Only check dirty or locally modified projects. These can't be
    # freshly cloned and will accumulate garbage.
    try:
        dirty = project.IsDirty(consider_untracked=True)
        manifest_rev = project.GetRevisionId(project.bare_ref.all)
        local_commits = manifest_rev != project.work_git.rev_parse(HEAD)
        if not dirty and not local_commits:
            return None
        output = project.bare_git.count_objects("-v")
    except Exception:
        # Best effort: any git failure simply skips this project.
        return None

    # Parse the "key: value" lines emitted by `git count-objects -v`,
    # ignoring anything that isn't an integer-valued stat.
    stats = {}
    for line in output.splitlines():
        key, _, value = line.partition(": ")
        try:
            stats[key.strip()] = int(value.strip())
        except ValueError:
            continue

    packs = stats.get("packs", 0)
    pack_kb = stats.get("size-pack", 0)
    garbage_kb = stats.get("size-garbage", 0)

    # Fragmentation requires both many packs AND a large total pack size;
    # excessive garbage alone is also enough to flag the project.
    fragmented = (
        packs > _BLOAT_PACK_COUNT_THRESHOLD
        and pack_kb > _BLOAT_SIZE_PACK_THRESHOLD_KB
    )
    if fragmented or garbage_kb > _BLOAT_SIZE_GARBAGE_THRESHOLD_KB:
        return project.name
    return None
def _CheckForBloatedProjects(self, projects, opt):
    """Check for shallow projects that are accumulating unoptimized data.

    For projects with clone-depth="1" that are dirty (have local changes),
    run 'git count-objects -v' and warn if the repository is accumulating
    excessive pack files or garbage.

    NOTE(review): the filter below matches any truthy clone_depth, not
    only clone-depth="1" — confirm that is intended.
    """
    shallow = [p for p in projects if p.clone_depth]
    if not shallow:
        return

    bloated = []
    pm = Progress(
        "Checking for bloat", len(shallow), delay=False, quiet=opt.quiet
    )

    def _Collect(pool, pm, results):
        # Accumulate names of bloated projects; None results are skipped.
        for name in results:
            if name:
                bloated.append(name)
            pm.update(msg="")

    with self.ParallelContext():
        self.get_parallel_context()["projects"] = shallow
        self.ExecuteInParallel(
            opt.jobs,
            self._CheckOneBloatedProject,
            range(len(shallow)),
            callback=_Collect,
            output=pm,
            chunksize=1,
        )
    pm.end()

    # Surface each bloated project both in the event log and on stderr.
    for name in bloated:
        msg = (
            f'warning: Project "{name}" is accumulating '
            'unoptimized data. Please run "repo sync --auto-gc" or '
            '"repo gc --repack" to clean up.'
        )
        self.git_event_log.ErrorEvent(msg)
        logger.warning(msg)
def _UpdateRepoProject(self, opt, manifest, errors):
"""Fetch the repo project and check for updates."""
if opt.local_only:
@@ -2002,6 +2104,8 @@ later is required to fix a server side protocol bug.
"experience, sync the entire tree."
)
self._CheckForBloatedProjects(all_projects, opt)
if not opt.quiet:
print("repo sync has finished successfully.")