From f0523e1e061d6adb4532b7b9e1b9fd61e382c972 Mon Sep 17 00:00:00 2001 From: David Holtz Date: Fri, 27 Feb 2026 13:47:04 -0500 Subject: [PATCH 1/3] feat: prefer upload_large_folder when many files --- kernels/src/kernels/cli/upload.py | 36 ++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/kernels/src/kernels/cli/upload.py b/kernels/src/kernels/cli/upload.py index 52f37ff..47a99f9 100644 --- a/kernels/src/kernels/cli/upload.py +++ b/kernels/src/kernels/cli/upload.py @@ -70,13 +70,33 @@ def upload_kernels_dir( allow_patterns=["benchmark*.py"], ) - api.upload_folder( - repo_id=repo_id, - folder_path=build_dir, - revision=branch, - path_in_repo="build", - delete_patterns=list(delete_patterns), - commit_message="Build uploaded using `kernels`.", - allow_patterns=["torch*"], + file_count = sum( + 1 + for p in build_dir.rglob("*") + if p.is_file() and p.relative_to(build_dir).as_posix().startswith("torch") ) + + if file_count > 200: + print( + f"⚠️ Found {file_count} files to upload, which exceeds the 200 file limit for a single commit. Deleting old build files and re-uploading the whole build folder to avoid hitting file limits." + ) + kernel_root_dir = build_dir.parent + api.upload_large_folder( + repo_id=repo_id, + folder_path=kernel_root_dir, + revision=branch, + repo_type="model", + allow_patterns=["build/torch*"], + ) + else: + api.upload_folder( + repo_id=repo_id, + folder_path=build_dir, + revision=branch, + path_in_repo="build", + delete_patterns=list(delete_patterns), + commit_message="Build uploaded using `kernels`.", + allow_patterns=["torch*"], + ) + print(f"✅ Kernel upload successful. Find the kernel in: https://hf.co/{repo_id}") From 2d0b9d6d309e2890a3a917e44cfd53c6217d1aa1 Mon Sep 17 00:00:00 2001 From: drbh Date: Mon, 2 Mar 2026 12:34:51 -0500 Subject: [PATCH 2/3] fix: add test and improve logic --- kernels/src/kernels/cli/upload.py | 17 ++++++++++------- kernels/tests/test_kernel_upload.py | 26 ++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/kernels/src/kernels/cli/upload.py b/kernels/src/kernels/cli/upload.py index 47a99f9..781bd4e 100644 --- a/kernels/src/kernels/cli/upload.py +++ b/kernels/src/kernels/cli/upload.py @@ -5,6 +5,13 @@ from kernels.variants import BUILD_VARIANT_REGEX +def get_file_count_in_build(build_dir: Path) -> int: + return sum( + 1 + for p in build_dir.rglob("*") + if p.is_file() and p.relative_to(build_dir).as_posix().startswith("torch") + ) + def upload_kernels_dir( kernel_dir: Path, *, @@ -70,15 +77,11 @@ def upload_kernels_dir( allow_patterns=["benchmark*.py"], ) - file_count = sum( - 1 - for p in build_dir.rglob("*") - if p.is_file() and p.relative_to(build_dir).as_posix().startswith("torch") - ) + file_count = get_file_count_in_build(build_dir) - if file_count > 200: + if file_count > 1_000: print( - f"⚠️ Found {file_count} files to upload, which exceeds the 200 file limit for a single commit. Deleting old build files and re-uploading the whole build folder to avoid hitting file limits." + f"⚠️ Found {file_count} files to upload, which exceeds the 1,000 file limit for a single commit." ) kernel_root_dir = build_dir.parent api.upload_large_folder( diff --git a/kernels/tests/test_kernel_upload.py b/kernels/tests/test_kernel_upload.py index 11928bf..6d6daea 100644 --- a/kernels/tests/test_kernel_upload.py +++ b/kernels/tests/test_kernel_upload.py @@ -4,6 +4,8 @@ import tempfile from dataclasses import dataclass from pathlib import Path +from types import SimpleNamespace +from unittest.mock import Mock import pytest @@ -120,3 +122,27 @@ def test_kernel_upload_deletes_as_expected(): str(filename_to_change) in k for k in repo_filenames ), f"{repo_filenames=}" _get_hf_api().delete_repo(repo_id=REPO_ID) + + +def test_large_kernel_upload_uses_kernel_root_path(monkeypatch, tmp_path): + kernel_root = tmp_path / "kernel" + build_variant = kernel_root / "build" / "torch-cpu" + build_variant.mkdir(parents=True, exist_ok=True) + (build_variant / "metadata.json").write_text("{}") + for i in range(1001): + (build_variant / f"file_{i}.py").touch() + + api = Mock() + api.create_repo.return_value = SimpleNamespace(repo_id=REPO_ID) + monkeypatch.setattr("kernels.cli.upload._get_hf_api", lambda: api) + + upload_kernels(UploadArgs(kernel_root, REPO_ID, False, "main")) + + api.upload_large_folder.assert_called_once() + kwargs = api.upload_large_folder.call_args.kwargs + assert kwargs["repo_id"] == REPO_ID + assert kwargs["folder_path"] == kernel_root.resolve() + assert kwargs["revision"] == "main" + assert kwargs["repo_type"] == "model" + assert kwargs["allow_patterns"] == ["build/torch*"] + api.upload_folder.assert_not_called() From c7e74eed7617f6ba0facc3b2131e32bee803d36c Mon Sep 17 00:00:00 2001 From: drbh Date: Mon, 9 Mar 2026 13:30:04 -0400 Subject: [PATCH 3/3] feat: prefer internal api --- kernels/src/kernels/cli/upload.py | 99 +++++++++++++++++++---------- kernels/tests/test_kernel_upload.py | 57 ++++++++++++++--- 2 files changed, 112 insertions(+), 44 deletions(-) diff --git a/kernels/src/kernels/cli/upload.py b/kernels/src/kernels/cli/upload.py index 781bd4e..c1aacf5 100644 --- a/kernels/src/kernels/cli/upload.py +++ b/kernels/src/kernels/cli/upload.py @@ -1,17 +1,67 @@ from pathlib import Path +from huggingface_hub import CommitOperationAdd, CommitOperationDelete +from huggingface_hub.utils import chunk_iterable + from kernels.metadata import Metadata from kernels.utils import _get_hf_api from kernels.variants import BUILD_VARIANT_REGEX +BUILD_COMMIT_BATCH_SIZE = 1_000 + + +def _upload_build_dir( + api, + *, + repo_id: str, + revision: str | None, + build_dir: Path, + variants: list[Path], +): + repo_paths = {} + for variant in variants: + for path in sorted(variant.rglob("*")): + if path.is_file(): + repo_paths[f"build/{path.relative_to(build_dir).as_posix()}"] = path -def get_file_count_in_build(build_dir: Path) -> int: - return sum( - 1 - for p in build_dir.rglob("*") - if p.is_file() and p.relative_to(build_dir).as_posix().startswith("torch") + variant_prefixes = tuple( + f"build/{variant.relative_to(build_dir).as_posix()}/" for variant in variants + ) + operations = [ + CommitOperationDelete(path_in_repo=repo_file) + for repo_file in sorted( + api.list_repo_files(repo_id=repo_id, revision=revision, repo_type="model") + ) + if repo_file.startswith(variant_prefixes) and repo_file not in repo_paths + ] + operations.extend( + CommitOperationAdd(path_in_repo=repo_path, path_or_fileobj=str(local_path)) + for repo_path, local_path in sorted(repo_paths.items()) ) + batch_count = (len(operations) + BUILD_COMMIT_BATCH_SIZE - 1) // BUILD_COMMIT_BATCH_SIZE + if batch_count > 1: + print( + f"⚠️ Found {len(operations)} build operations, uploading in {batch_count} commits." + ) + + for batch_index, chunk in enumerate( + chunk_iterable(operations, chunk_size=BUILD_COMMIT_BATCH_SIZE), start=1 + ): + commit_message = "Build uploaded using `kernels`." + if batch_count > 1: + commit_message = ( + f"Build uploaded using `kernels` (batch {batch_index}/{batch_count})." + ) + api.create_commit( + repo_id=repo_id, + operations=list(chunk), + revision=revision, + repo_type="model", + commit_message=commit_message, + ) + + def upload_kernels_dir( kernel_dir: Path, *, @@ -60,12 +110,7 @@ def upload_kernels_dir( if branch is not None: api.create_branch(repo_id=repo_id, branch=branch, exist_ok=True) - delete_patterns: set[str] = set() - for build_variant in build_dir.iterdir(): - if build_variant.is_dir(): - delete_patterns.add(f"{build_variant.name}/**") - - # in the case we have variants, upload to the same as the kernel_dir + # In the case we have benchmarks, upload to the same repo as the kernel_dir. if (kernel_dir / "benchmarks").is_dir(): api.upload_folder( repo_id=repo_id, @@ -77,29 +122,13 @@ def upload_kernels_dir( allow_patterns=["benchmark*.py"], ) - file_count = get_file_count_in_build(build_dir) - - if file_count > 1_000: - print( - f"⚠️ Found {file_count} files to upload, which exceeds the 1,000 file limit for a single commit." - ) - kernel_root_dir = build_dir.parent - api.upload_large_folder( - repo_id=repo_id, - folder_path=kernel_root_dir, - revision=branch, - repo_type="model", - allow_patterns=["build/torch*"], - ) - else: - api.upload_folder( - repo_id=repo_id, - folder_path=build_dir, - revision=branch, - path_in_repo="build", - delete_patterns=list(delete_patterns), - commit_message="Build uploaded using `kernels`.", - allow_patterns=["torch*"], - ) + assert variants is not None + _upload_build_dir( + api, + repo_id=repo_id, + revision=branch, + build_dir=build_dir, + variants=variants, + ) print(f"✅ Kernel upload successful. Find the kernel in: https://hf.co/{repo_id}") diff --git a/kernels/tests/test_kernel_upload.py b/kernels/tests/test_kernel_upload.py index 6d6daea..001b819 100644 --- a/kernels/tests/test_kernel_upload.py +++ b/kernels/tests/test_kernel_upload.py @@ -8,8 +8,10 @@ from unittest.mock import Mock import pytest +from huggingface_hub import CommitOperationAdd, CommitOperationDelete from kernels.cli import upload_kernels +from kernels.cli.upload import BUILD_COMMIT_BATCH_SIZE from kernels.utils import _get_hf_api REPO_ID = "valid_org/kernels-upload-test" @@ -124,25 +126,62 @@ def test_kernel_upload_deletes_as_expected(): _get_hf_api().delete_repo(repo_id=REPO_ID) -def test_large_kernel_upload_uses_kernel_root_path(monkeypatch, tmp_path): +def test_large_kernel_upload_uses_create_commit_batches(monkeypatch, tmp_path): kernel_root = tmp_path / "kernel" build_variant = kernel_root / "build" / "torch-cpu" build_variant.mkdir(parents=True, exist_ok=True) (build_variant / "metadata.json").write_text("{}") - for i in range(1001): + file_count = BUILD_COMMIT_BATCH_SIZE * 2 + for i in range(file_count): (build_variant / f"file_{i}.py").touch() api = Mock() api.create_repo.return_value = SimpleNamespace(repo_id=REPO_ID) + api.list_repo_files.return_value = [ + "README.md", + "build/torch-cpu/file_0.py", + "build/torch-cpu/stale.py", + "build/torch-cuda/keep.py", + ] monkeypatch.setattr("kernels.cli.upload._get_hf_api", lambda: api) upload_kernels(UploadArgs(kernel_root, REPO_ID, False, "main")) - api.upload_large_folder.assert_called_once() - kwargs = api.upload_large_folder.call_args.kwargs - assert kwargs["repo_id"] == REPO_ID - assert kwargs["folder_path"] == kernel_root.resolve() - assert kwargs["revision"] == "main" - assert kwargs["repo_type"] == "model" - assert kwargs["allow_patterns"] == ["build/torch*"] + # 2 full batches of adds, plus metadata and 1 stale-file delete. + assert api.create_commit.call_count == 3 + batch_sizes = [ + len(call.kwargs["operations"]) for call in api.create_commit.call_args_list + ] + assert batch_sizes == [ + BUILD_COMMIT_BATCH_SIZE, + BUILD_COMMIT_BATCH_SIZE, + 2, + ] + commit_messages = [ + call.kwargs["commit_message"] for call in api.create_commit.call_args_list + ] + assert commit_messages == [ + "Build uploaded using `kernels` (batch 1/3).", + "Build uploaded using `kernels` (batch 2/3).", + "Build uploaded using `kernels` (batch 3/3).", + ] + + # Stale repo files should be deleted. + operations = [ + operation + for call in api.create_commit.call_args_list + for operation in call.kwargs["operations"] + ] + delete_paths = { + op.path_in_repo for op in operations if isinstance(op, CommitOperationDelete) + } + assert delete_paths == {"build/torch-cpu/stale.py"} + + add_paths = { + op.path_in_repo for op in operations if isinstance(op, CommitOperationAdd) + } + assert len(add_paths) == file_count + 1 + assert "build/torch-cpu/metadata.json" in add_paths + assert "build/torch-cpu/file_0.py" in add_paths + assert "build/torch-cpu/file_399.py" in add_paths api.upload_folder.assert_not_called()