From 1259663aebf6946da9b7dc388a2148b3f3c4519d Mon Sep 17 00:00:00 2001 From: Anson Qian Date: Wed, 27 Aug 2025 11:49:15 -0400 Subject: [PATCH 01/87] use v6 VM SKU --- .pipelines/.vsts-vhd-builder-release.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pipelines/.vsts-vhd-builder-release.yaml b/.pipelines/.vsts-vhd-builder-release.yaml index bb6490d7dbd..7d022a5c1e1 100644 --- a/.pipelines/.vsts-vhd-builder-release.yaml +++ b/.pipelines/.vsts-vhd-builder-release.yaml @@ -975,7 +975,7 @@ stages: echo '##vso[task.setvariable variable=IMG_SKU]server-arm64' echo '##vso[task.setvariable variable=IMG_VERSION]latest' echo '##vso[task.setvariable variable=HYPERV_GENERATION]V2' - echo '##vso[task.setvariable variable=AZURE_VM_SIZE]Standard_D16pds_v5' + echo '##vso[task.setvariable variable=AZURE_VM_SIZE]Standard_D32pds_v6' echo '##vso[task.setvariable variable=FEATURE_FLAGS]GB200' echo '##vso[task.setvariable variable=ARCHITECTURE]ARM64' echo '##vso[task.setvariable variable=ENABLE_FIPS]False' From 863166158363deabb45fdeeff6857d9e6d7672ec Mon Sep 17 00:00:00 2001 From: Anson Qian Date: Wed, 27 Aug 2025 11:50:41 -0400 Subject: [PATCH 02/87] increase os disk --- e2e/vmss.go | 2 +- vhdbuilder/packer/vhd-image-builder-arm64-gen2.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/e2e/vmss.go b/e2e/vmss.go index 4abf14f8931..d85c825c4b4 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -1153,7 +1153,7 @@ func getBaseVMSSModel(s *Scenario, customData, cseCmd string) armcompute.Virtual StorageProfile: &armcompute.VirtualMachineScaleSetStorageProfile{ OSDisk: &armcompute.VirtualMachineScaleSetOSDisk{ CreateOption: to.Ptr(armcompute.DiskCreateOptionTypesFromImage), - DiskSizeGB: to.Ptr(int32(50)), + DiskSizeGB: to.Ptr(int32(60)), OSType: to.Ptr(armcompute.OperatingSystemTypesLinux), Caching: to.Ptr(armcompute.CachingTypesReadOnly), DiffDiskSettings: &armcompute.DiffDiskSettings{ diff --git a/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json 
b/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json index 90b9ccb0a0a..9934c1a393d 100644 --- a/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json +++ b/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json @@ -39,7 +39,7 @@ "virtual_network_subnet_name": "{{user `subnet_name`}}", "ssh_read_write_timeout": "5m", "os_type": "Linux", - "os_disk_size_gb": 30, + "os_disk_size_gb": 60, "image_publisher": "{{user `img_publisher`}}", "image_offer": "{{user `img_offer`}}", "image_sku": "{{user `img_sku`}}", From e3c002643c266d984ccf9c2f960cec4bb3d85f10 Mon Sep 17 00:00:00 2001 From: Anson Qian Date: Tue, 26 Aug 2025 12:01:05 -0400 Subject: [PATCH 03/87] Add Doca/Mofed driver to GB200 image --- .../artifacts/ubuntu/mellanox_mlnx_ofed.list | 9 +++ .../artifacts/ubuntu/mellanox_mlnx_ofed.pub | 81 +++++++++++++++++++ vhdbuilder/packer/install-dependencies.sh | 19 +++++ vhdbuilder/packer/packer_source.sh | 13 +++ .../packer/vhd-image-builder-arm64-gen2.json | 10 +++ 5 files changed, 132 insertions(+) create mode 100644 parts/linux/cloud-init/artifacts/ubuntu/mellanox_mlnx_ofed.list create mode 100644 parts/linux/cloud-init/artifacts/ubuntu/mellanox_mlnx_ofed.pub diff --git a/parts/linux/cloud-init/artifacts/ubuntu/mellanox_mlnx_ofed.list b/parts/linux/cloud-init/artifacts/ubuntu/mellanox_mlnx_ofed.list new file mode 100644 index 00000000000..44b427cab18 --- /dev/null +++ b/parts/linux/cloud-init/artifacts/ubuntu/mellanox_mlnx_ofed.list @@ -0,0 +1,9 @@ +# +# Mellanox Technologies Ltd. public repository configuration file. 
+# For more information, refer to http://linux.mellanox.com +# +# For future reference: +# https://network.nvidia.com/support/mlnx-ofed-public-repository/ +# https://www.mellanox.com/downloads/ofed/RPM-GPG-KEY-Mellanox +# [mlnx_ofed_24.10-1.1.4.0_base] +deb [signed-by=/etc/apt/keyrings/mellanox_mlnx_ofed.pub] http://linux.mellanox.com/public/repo/mlnx_ofed/24.10-1.1.4.0/ubuntu24.04/arm64 ./ \ No newline at end of file diff --git a/parts/linux/cloud-init/artifacts/ubuntu/mellanox_mlnx_ofed.pub b/parts/linux/cloud-init/artifacts/ubuntu/mellanox_mlnx_ofed.pub new file mode 100644 index 00000000000..45107ba4315 --- /dev/null +++ b/parts/linux/cloud-init/artifacts/ubuntu/mellanox_mlnx_ofed.pub @@ -0,0 +1,81 @@ +-----BEGIN PGP PUBLIC KEY BLOCK----- +Version: GnuPG v2.0.14 (GNU/Linux) + +mQGiBFMEmE0RBACsz1qcFsYOs0LHy/pBR2ip0gnHYbZgLy00R2i7cELxmqGcESzp +6IfzIdwOX9oVsPI6NT/yvftp+BxALuD8UC52MLjdMJZ+1sXBZM4J5xnDmQMhIp0G +wCse8usM8Zad1WTKq+P0ip8Gd17WEpfwMQPKXg3npcF69zaz/ceeDavqjwCgofU0 +rb8ui7cZs+c+7U+5mrXxmcMD/R/tV8tEykQFW7PKuZ9NvvRX2XFuQD9LZRW7v+Rg +ebC0GAM1ZSqgI7uNUL3ZLAMgxaURLZViqKPgiw8373uoayfrnccttoZ2prHdtB5O +ZPo9vp8wJYUd+Wug2c1nuzXQtTrs/wfeJDn/PfvlEIGlXYPphsBXGQd7MbMLtW7g +u6h/A/9lmSP1fFQflTRlO5j3jXrlFkW05lMlWVZD3H75obQxHlM7eGCgnUPABBMt +aoZDZDf5P9I3xinu9qhDi7Vbz7QOkWOGr2dHLUOMqIgoKz7zRcFtbAl65AcOuEKu +KpLE/R3mRjZ7vrCPud6euEKGpvMbdevDF7GeMG3fcvVlK1ivy7RVTWVsbGFub3gg +VGVjaG5vbG9naWVzIChNZWxsYW5veCBUZWNobm9sb2dpZXMgLSBTaWduaW5nIEtl +eSB2MikgPHN1cHBvcnRAbWVsbGFub3guY29tPohiBBMRAgAiBQJTBJhNAhsDBgsJ +CAcDAgYVCAIJCgsEFgIDAQIeAQIXgAAKCRDF7YPiYiTAUFcAAJ49FBA3hy0P0gsZ +q/ZkAMrgXZaG9wCcDjMtZZETG5NEaIVg3GYqJcvI4AW5AQ0EUwSYTRAEANmBQ0WP +O3VsOrDH0VX+fa1nuKpTqyPFmrROtiI0Ux1dEsU/hpFJnFHtv+CW8ppUlMmjhw6U +olS3dqvO+fWxe1FMLVpp1BQLI6udM5j/P1IEDH7TmZD5trYFp4PxXagKO2nBeqjj +NydQckgREntGCOGPqheBRdopmlJSPlTptQavAAMFA/9BVSpmStx3BsS0z5NPSI/V +wJFeQiXFq8zDKbEVHFMjYWGqbhGWDPaLJWxxNLF1hdpbZSQCAeaESNLYG0iqXwb6 +6O79BHpGeN0AWyy2J6FJpt0zwlCDfx7fgpFKMGzIxXWiTDNmKon241ojgM1iYC2o 
+arjropoA0dtG6noS2KJBYIhJBBgRAgAJBQJTBJhNAhsMAAoJEMXtg+JiJMBQzxUA +oJ+aJ2l6vt1S1tIKCLVtDMH8liOBAJ45EQ867jkf6f2Anihx9XJ0LLKZvw== +=QMd9 +-----END PGP PUBLIC KEY BLOCK----- +-----BEGIN PGP PUBLIC KEY BLOCK----- +Version: GnuPG v2.0.14 (GNU/Linux) + +mQGiBFIHkboRBADGcZ0FQvQl8frNzEZIep6D+KSZY/ps70+k3ZJ+wj2mvtGZSV9t +zeEUbte7ft5HzrIniB87j1Swp+mSJIomLTkOcQunoqCCHQkuPOEMi1urUmdjpyc7 +nJjsQ63GLvH0DfmknGga4rCj3Kepn9mhJ9mqfS+/aXrz1ZP4Dk+alpi/RwCgplxo +94IruAMKoQCdJ3SmfqvszYcEAMUJ3qmCpYax4s/0XyX36emLiMioHZehq/QXdFmj +VmqqxL5QFmq9Yof8SwGBwpS8FS0VX8BTs7xAs5W7ZC7iGGo9uxuXZzeZ8vcwh6VX +OVmbtqLgXyPKqzHIDwJ8Q5Df0JQpRnCmQQaHbEcoOstSTP/3NHLFBIllPq7gqIpZ +9HQoBACvlwzvtabC9q1OAikXY5YKKbAtkmZYBa5I2qvfHV1bIRYPPHWW2shilX0N +Kz2pTR1ZlwEcz+CUhPtJgoWhkMu/Vl7NMeB0YzGmjQorHRj2mAvSbv/wvjeIMgbw +qRXIksGYiUSpTLtQYTfpJlNe0ZKzn6kHbqGUYZ92Jx2ki3gQqbQsTWVsbGFub3gg +VGVjaG5vbG9naWVzIDxzdXBwb3J0QG1lbGxhbm94LmNvbT6IYgQTEQIAIgUCUgeR +ugIbAwYLCQgHAwIGFQgCCQoLBBYCAwECHgECF4AACgkQAQSPp6nktkPsXwCfW2Rn +pgmC4zLTMBRo/hKsIvag2ToAnAtlzxpMAZGUQHBODfpGqx7MyHmUuQENBFIHkboQ +BADd2OqEdSDCB6KkgZ2BjURxpiDbZxEAEsTJOUBFMPSqdJN0GcqUon5Hc3yADDOF +ztdWf5XCKSp/loYvjTYM21Qq20g5EB2SU9FU6Eoq5vyU/HS3/c1wjiYv2rjMll62 +kc4oqRkM/fp9crrjArssfqMQcQRVYBS3dYdmoVdpHEH68wADBQP/XPW9r3wwGvUr +7hlFskYrSC/8s3r7vB4/mcF6UMkM4xEaP3jq8HH0SLkLbcPTa1+C/5evhmLbT12f +dub/V0/JVT9YsxS3anmvefT6EXjUntYXDLPhhRJqUCnxYjf95FX5zxudB5gMEwLh +9pmRMgqMCDsIANVv7V77DagfaWNkhqSISQQYEQIACQUCUgeRugIbDAAKCRABBI+n +qeS2Q71kAJ45i6YdS9bZGR8tDI0NfneMiU32CwCfdje+fgX5gUtag5SshjxyMrgt +DgY= +=z9pR +-----END PGP PUBLIC KEY BLOCK----- +-----BEGIN PGP PUBLIC KEY BLOCK----- +Version: GnuPG v1 + +mQENBFpbc0cBCADDST+ekKD1YJje77oDX94gRolmUlh0df4n6/xvE700M1vPAiTT +kU3WJcvwnuTZpyMGSsAQCXXQRJuQObnkPEvjVAPgh8fvghCXgVElcr6dqXu3EVze +iCkdYm08t/+FF3kg/P6VYPjgEM/GIFnKTz37LrQlUM4ArG0ENIYM9xjurnKWuV9r +JuckJcUsmZUS/D9QMM2fuurYOEWHrE8t+n2EcO4aoY2x0ogYce0vON539rJiskjz +OPhIB9G7ZFQabQnyxzEKiUUDyJsbe38XDT4eyjUR2mlHGgTY/WzGdDEtIKRBWsd3 
+TV3wXt42nF9YA3oieeaTbIluyywNnOj1vyT1ABEBAAG0VU1lbGxhbm94IFRlY2hu +b2xvZ2llcyAoTWVsbGFub3ggVGVjaG5vbG9naWVzIC0gU2lnbmluZyBLZXkgdjMp +IDxzdXBwb3J0QG1lbGxhbm94LmNvbT6JATcEEwEIACEFAlpbc0cCGwMFCwkIBwMF +FQoJCAsFFgIDAQACHgECF4AACgkQoCT28ObWooFXYwgAunwBFELGlwKonnmnbi4/ +avUa8e0wRpww//DJjI0HQWjMk7oPLDbS50CVps1Mu0SxBAPYGtsFeSH6UMC6A0K4 +yoxXICVl409vYkycNu/vq6eLTbM2Y0PFvBDzRAf3rJXL0ApLuUb57ARZvc7Np7LA +v8K53PdOJUEFns8Ipp+2puEVx5dfezm7LwRca6ohoLUEdI/PobmGUeNvO5dvfiix +LvSVw2A2awihB7dcs5cpo57VxBWPs7+sYBZ0+EUJbtQEiHAyPvKs29nMeaCIwPTd +88A5RrhsEJx+QWXuG6NA4rfehy5e9j1PW3XnC2fMl6w7gNLY5I8Vq6c2MJ73NZ6y +wLkBDQRaW3NHAQgAynkQ+mf4f5cdM4/bJuRWlPxxuN3CUxN9Q6B5B1/13p6tkydP +C7S4ro8H8sSlO5FbbxihfZLPTbFNrBkd///OQYMJW/slbtT6D9dYmCIeuHObMEMb +V+Bn1bWQId2vZgr0+m0Xe3K+KqhsylsrmC1ebShMnny/V+MlOQQt+L089BNiyCB4 +70mhgM1NiJFv9EOQlXWWaMqWTxZGYkdOuFW0q8NnSGOqI5xjrAUxaHZ/1U3yPy0k +eAjX1AKJngaj86SvIzEefxq4oA2gZ8UFVO/qFH5OhfoovrEwudJEuIgGb76XOb9m +AoZlAqQLJniC97ld515ivBdSi4SZkaFbypnX4QARAQABiQEfBBgBCAAJBQJaW3NH +AhsMAAoJEKAk9vDm1qKBHhMIAJuGbb6S3nb2xAD3GjB8F2xNcZxWQ+Qz70DY5vV/ +WhrJl7cknXMxsbWvQupuYk6LujZraG9YoD4csZ5o+k3s3BGKVUXdZdhjaHpcAa5F +X12ADLHca5mlmdCaaORYXQ+xHYRlOKas4I6LPpZ79BauVomEnPcv/bL0kGFzDvLr +K3RdQ1n/pbcWcxxSY3InphAnslLUg0PTAME6Yay5F7WrJsnZnXApUjOlZvlPIl2c +iplivN8o85eBKQXvYRg/c5iyc0koTmkM6OXNvUy0hV9z8WhhK9O+ApXwMUMf43DS +KOIg9RxhZFQoPXptaQZDLz89sWmZaiXsyBPJyjlmaTjwHGM= +=Iy5R +-----END PGP PUBLIC KEY BLOCK----- \ No newline at end of file diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 5bbce11fd4a..019ebc47e29 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -720,6 +720,25 @@ if [ $OS = $UBUNTU_OS_NAME ] && [ "$(isARM64)" -ne 1 ]; then # No ARM64 SKU wit EOF fi +if grep -q "GB200" <<< "$FEATURE_FLAGS"; then + # The GB200 feature flag should only be set for arm64 and Ubuntu 24.04, but validate + if [ ${UBUNTU_RELEASE} = "24.04" ] && [ ${CPU_ARCH} = "arm64" ]; then + # The 
open series driver is required for the GB200 platform. Dmesg output + # will appear directing the reader away from the proprietary driver. The GPUs + # are also not visible in nvidia-smi output with the proprietary drivers + apt install -y \ + mlnx-ofed-kernel-dkms \ + mlnx-ofed-kernel-utils \ + mlnx-ofed-basic \ + rdma-core \ + ibverbs-utils \ + ibverbs-providers + + systemctl restart openibd + ofed_info -s + fi +fi + if [ -d "/opt/gpu" ] && [ "$(ls -A /opt/gpu)" ]; then ls -ltr /opt/gpu/* >> ${VHD_LOGS_FILEPATH} fi diff --git a/vhdbuilder/packer/packer_source.sh b/vhdbuilder/packer/packer_source.sh index 40e922672f4..54684b54acc 100644 --- a/vhdbuilder/packer/packer_source.sh +++ b/vhdbuilder/packer/packer_source.sh @@ -503,6 +503,19 @@ copyPackerFiles() { cpAndMode $NOTICE_SRC $NOTICE_DEST 444 fi + if grep -q "GB200" <<< "$FEATURE_FLAGS"; then + # Only applicable to Ubuntu 24.04 and ARM64 + if [ ${UBUNTU_RELEASE} = "24.04" ] && [ ${CPU_ARCH} = "arm64" ]; then + MELLANOX_LIST_SRC=/home/packer/mellanox_mlnx_ofed.list + MELLANOX_LIST_DEST=/etc/apt/sources.list.d/mellanox_mlnx_ofed.list + cpAndMode $MELLANOX_LIST_SRC $MELLANOX_LIST_DEST 644 + + MELLANOX_ASC_SRC=/home/packer/mellanox_mlnx_ofed.pub + MELLANOX_ASC_DEST=/etc/apt/keyrings/mellanox_mlnx_ofed.pub + cpAndMode $MELLANOX_ASC_SRC $MELLANOX_ASC_DEST 644 + fi + fi + # Always copy the VHD cleanup script responsible for prepping the instance for first boot # to disk so we can run it again if needed in subsequent builds/releases (prefetch during SIG release) cpAndMode $VHD_CLEANUP_SCRIPT_SRC $VHD_CLEANUP_SCRIPT_DEST 644 diff --git a/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json b/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json index 9934c1a393d..a1101f5024c 100644 --- a/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json +++ b/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json @@ -752,6 +752,16 @@ "source": "parts/linux/cloud-init/artifacts/51-azure-nvidia.cfg", "destination": 
"/home/packer/51-azure-nvidia.cfg" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ubuntu/mellanox_mlnx_ofed.list", + "destination": "/home/packer/mellanox_mlnx_ofed.list" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ubuntu/mellanox_mlnx_ofed.pub", + "destination": "/home/packer/mellanox_mlnx_ofed.pub" + }, { "type": "shell", "inline": [ From 16f319d5185168e2387b0999e322a8dec008ad65 Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Wed, 13 Aug 2025 13:59:23 -0500 Subject: [PATCH 04/87] Add option to install DCGM, dcgm-exporter and the NVIDIA drivers for the GB200 platform --- .../artifacts/ubuntu/nvidia-2404.list | 2 ++ .../cloud-init/artifacts/ubuntu/nvidia.pub | 29 +++++++++++++++++++ vhdbuilder/packer/install-dependencies.sh | 7 +++++ vhdbuilder/packer/packer_source.sh | 8 +++++ vhdbuilder/packer/pre-install-dependencies.sh | 2 +- 5 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 parts/linux/cloud-init/artifacts/ubuntu/nvidia-2404.list create mode 100644 parts/linux/cloud-init/artifacts/ubuntu/nvidia.pub diff --git a/parts/linux/cloud-init/artifacts/ubuntu/nvidia-2404.list b/parts/linux/cloud-init/artifacts/ubuntu/nvidia-2404.list new file mode 100644 index 00000000000..774f88654cc --- /dev/null +++ b/parts/linux/cloud-init/artifacts/ubuntu/nvidia-2404.list @@ -0,0 +1,2 @@ +deb [arch=amd64 signed-by=/usr/share/keyrings/nvidia.asc] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64 +deb [arch=arm64 signed-by=/usr/share/keyrings/nvidia.asc] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/sbsa \ No newline at end of file diff --git a/parts/linux/cloud-init/artifacts/ubuntu/nvidia.pub b/parts/linux/cloud-init/artifacts/ubuntu/nvidia.pub new file mode 100644 index 00000000000..8aabe4805e5 --- /dev/null +++ b/parts/linux/cloud-init/artifacts/ubuntu/nvidia.pub @@ -0,0 +1,29 @@ +-----BEGIN PGP PUBLIC KEY BLOCK----- +Version: GnuPG v2.0.22 (GNU/Linux) + 
+mQINBGJYmlEBEAC6nJmeqByeReM+MSy4palACCnfOg4pOxffrrkldxz4jrDOZNK4 +q8KG+ZbXrkdP0e9qTFRvZzN+A6Jw3ySfoiKXRBw5l2Zp81AYkghV641OpWNjZOyL +syKEtST9LR1ttHv1ZI71pj8NVG/EnpimZPOblEJ1OpibJJCXLrbn+qcJ8JNuGTSK +6v2aLBmhR8VR/aSJpmkg7fFjcGklweTI8+Ibj72HuY9JRD/+dtUoSh7z037mWo56 +ee02lPFRD0pHOEAlLSXxFO/SDqRVMhcgHk0a8roCF+9h5Ni7ZUyxlGK/uHkqN7ED +/U/ATpGKgvk4t23eTpdRC8FXAlBZQyf/xnhQXsyF/z7+RV5CL0o1zk1LKgo+5K32 +5ka5uZb6JSIrEPUaCPEMXu6EEY8zSFnCrRS/Vjkfvc9ViYZWzJ387WTjAhMdS7wd +PmdDWw2ASGUP4FrfCireSZiFX+ZAOspKpZdh0P5iR5XSx14XDt3jNK2EQQboaJAD +uqksItatOEYNu4JsCbc24roJvJtGhpjTnq1/dyoy6K433afU0DS2ZPLthLpGqeyK +MKNY7a2WjxhRmCSu5Zok/fGKcO62XF8a3eSj4NzCRv8LM6mG1Oekz6Zz+tdxHg19 +ufHO0et7AKE5q+5VjE438Xpl4UWbM/Voj6VPJ9uzywDcnZXpeOqeTQh2pQARAQAB +tCBjdWRhdG9vbHMgPGN1ZGF0b29sc0BudmlkaWEuY29tPokCOQQTAQIAIwUCYlia +UQIbAwcLCQgHAwIBBhUIAgkKCwQWAgMBAh4BAheAAAoJEKS0aZY7+GPM1y4QALKh +BqSozrYbe341Qu7SyxHQgjRCGi4YhI3bHCMj5F6vEOHnwiFH6YmFkxCYtqcGjca6 +iw7cCYMow/hgKLAPwkwSJ84EYpGLWx62+20rMM4OuZwauSUcY/kE2WgnQ74zbh3+ +MHs56zntJFfJ9G+NYidvwDWeZn5HIzR4CtxaxRgpiykg0s3ps6X0U+vuVcLnutBF +7r81astvlVQERFbce/6KqHK+yj843Qrhb3JEolUoOETK06nD25bVtnAxe0QEyA90 +9MpRNLfR6BdjPpxqhphDcMOhJfyubAroQUxG/7S+Yw+mtEqHrL/dz9iEYqodYiSo +zfi0b+HFI59sRkTfOBDBwb3kcARExwnvLJmqijiVqWkoJ3H67oA0XJN2nelucw+A +Hb+Jt9BWjyzKWlLFDnVHdGicyRJ0I8yqi32w8hGeXmu3tU58VWJrkXEXadBftmci +pemb6oZ/r5SCkW6kxr2PsNWcJoebUdynyOQGbVwpMtJAnjOYp0ObKOANbcIg+tsi +kyCIO5TiY3ADbBDPCeZK8xdcugXoW5WFwACGC0z+Cn0mtw8z3VGIPAMSCYmLusgW +t2+EpikwrP2inNp5Pc+YdczRAsa4s30Jpyv/UHEG5P9GKnvofaxJgnU56lJIRPzF +iCUGy6cVI0Fq777X/ME1K6A/bzZ4vRYNx8rUmVE5 +=DO7z +-----END PGP PUBLIC KEY BLOCK----- diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 019ebc47e29..0739f1260ee 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -736,6 +736,13 @@ if grep -q "GB200" <<< "$FEATURE_FLAGS"; then systemctl restart openibd ofed_info -s + + # Install the NVIDIA driver + apt install -y 
nvidia-drivers_570.172 + # Install DCGM exporter + apt install -y datacenter-gpu-manager-exporter-4.1.3 datacenter-gpu-manager-core-4.3.1 datacenter-gpu-manager-proprietary-4-3.1 + systemctl enable nvidia-dcgm + systemctl enable fi fi diff --git a/vhdbuilder/packer/packer_source.sh b/vhdbuilder/packer/packer_source.sh index 54684b54acc..5dbb69440d6 100644 --- a/vhdbuilder/packer/packer_source.sh +++ b/vhdbuilder/packer/packer_source.sh @@ -513,6 +513,14 @@ copyPackerFiles() { MELLANOX_ASC_SRC=/home/packer/mellanox_mlnx_ofed.pub MELLANOX_ASC_DEST=/etc/apt/keyrings/mellanox_mlnx_ofed.pub cpAndMode $MELLANOX_ASC_SRC $MELLANOX_ASC_DEST 644 + + NVIDIA_LIST_SRC=/home/packer/nvidia-2404.list + NVIDIA_LIST_DEST=/etc/apt/sources.list.d/nvidia.list + cpAndMode $NVIDIA_LIST_SRC $NVIDIA_LIST_DEST 644 + + NVIDIA_ASC_SRC=/home/packer/nvidia.pub + NVIDIA_ASC_DEST=/etc/apt/keyrings/nvidia.pub + cpAndMode $NVIDIA_ASC_SRC $NVIDIA_ASC_DEST 644 fi fi diff --git a/vhdbuilder/packer/pre-install-dependencies.sh b/vhdbuilder/packer/pre-install-dependencies.sh index f5a5fd816fe..5facc01f52e 100644 --- a/vhdbuilder/packer/pre-install-dependencies.sh +++ b/vhdbuilder/packer/pre-install-dependencies.sh @@ -198,7 +198,7 @@ if [[ ${UBUNTU_RELEASE//./} -ge 2204 && "${ENABLE_FIPS,,}" != "true" ]]; then fi NVIDIA_KERNEL_PACKAGE="linux-azure-nvidia" if [[ "${CPU_ARCH}" == "arm64" && "${UBUNTU_RELEASE}" = "24.04" ]]; then - # This is the ubuntu 2404arm64gen2containerd image. + # This is the ubuntu 2404arm64gen2containerd image or the build2404arm64gb200gen2containerd image # Uncomment if we have trouble finding the kernel package. 
# sudo add-apt-repository ppa:canonical-kernel-team/ppa sudo apt update From 8adb303fb7dc40a0ace603248d1d1b9ba0e11bc8 Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Wed, 13 Aug 2025 14:39:22 -0500 Subject: [PATCH 05/87] Fix nvidia list file --- parts/linux/cloud-init/artifacts/ubuntu/nvidia-2404.list | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/ubuntu/nvidia-2404.list b/parts/linux/cloud-init/artifacts/ubuntu/nvidia-2404.list index 774f88654cc..0c90e150d6d 100644 --- a/parts/linux/cloud-init/artifacts/ubuntu/nvidia-2404.list +++ b/parts/linux/cloud-init/artifacts/ubuntu/nvidia-2404.list @@ -1,2 +1,2 @@ -deb [arch=amd64 signed-by=/usr/share/keyrings/nvidia.asc] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64 -deb [arch=arm64 signed-by=/usr/share/keyrings/nvidia.asc] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/sbsa \ No newline at end of file +deb [arch=amd64 signed-by=/etc/apt/keyrings/nvidia.pub] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64 / +deb [arch=arm64 signed-by=/etc/apt/keyrings/nvidia.pub] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/sbsa / \ No newline at end of file From e775baf1ce1975396410b51a15e3f663286e08ac Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Wed, 13 Aug 2025 15:50:57 -0500 Subject: [PATCH 06/87] Fix nvidia-driver package name and version --- vhdbuilder/packer/install-dependencies.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 0739f1260ee..480e52fb0c3 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -738,7 +738,7 @@ if grep -q "GB200" <<< "$FEATURE_FLAGS"; then ofed_info -s # Install the NVIDIA driver - apt install -y nvidia-drivers_570.172 + apt install -y nvidia-driver-575 # Install DCGM exporter apt install 
-y datacenter-gpu-manager-exporter-4.1.3 datacenter-gpu-manager-core-4.3.1 datacenter-gpu-manager-proprietary-4-3.1 systemctl enable nvidia-dcgm From cd5a832e4df771cf42ca88d814478fa2bf59a8f3 Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Wed, 13 Aug 2025 15:54:30 -0500 Subject: [PATCH 07/87] Fix incomplete systemd command --- vhdbuilder/packer/install-dependencies.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 480e52fb0c3..ccb44731987 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -742,7 +742,7 @@ if grep -q "GB200" <<< "$FEATURE_FLAGS"; then # Install DCGM exporter apt install -y datacenter-gpu-manager-exporter-4.1.3 datacenter-gpu-manager-core-4.3.1 datacenter-gpu-manager-proprietary-4-3.1 systemctl enable nvidia-dcgm - systemctl enable + systemctl enable nvidia-dcgm-exporter fi fi From f9f4acee820b1aeb5d074224e4f57b268d9402f7 Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Wed, 13 Aug 2025 19:04:52 -0500 Subject: [PATCH 08/87] Pin to specific versions of NVIDIA packages correctly --- vhdbuilder/packer/install-dependencies.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index ccb44731987..2b97b583242 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -738,9 +738,9 @@ if grep -q "GB200" <<< "$FEATURE_FLAGS"; then ofed_info -s # Install the NVIDIA driver - apt install -y nvidia-driver-575 + apt install -y nvidia-driver-575=575.57.08-0ubuntu1 # Install DCGM exporter - apt install -y datacenter-gpu-manager-exporter-4.1.3 datacenter-gpu-manager-core-4.3.1 datacenter-gpu-manager-proprietary-4-3.1 + apt install -y datacenter-gpu-manager-exporter=4.1.3-1 datacenter-gpu-manager-core=1:4.3.1-1 datacenter-gpu-manager-proprietary=1:4.3.1-1 
libcap2-bin systemctl enable nvidia-dcgm systemctl enable nvidia-dcgm-exporter fi From 8eb34e7a2301184eb8106591e06fcce4d9f03fdf Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Wed, 13 Aug 2025 22:44:36 -0500 Subject: [PATCH 09/87] Remove specific versions due to package installation failures --- vhdbuilder/packer/install-dependencies.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 2b97b583242..5d4485b2815 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -738,9 +738,9 @@ if grep -q "GB200" <<< "$FEATURE_FLAGS"; then ofed_info -s # Install the NVIDIA driver - apt install -y nvidia-driver-575=575.57.08-0ubuntu1 + apt install -y nvidia-driver-575 # Install DCGM exporter - apt install -y datacenter-gpu-manager-exporter=4.1.3-1 datacenter-gpu-manager-core=1:4.3.1-1 datacenter-gpu-manager-proprietary=1:4.3.1-1 libcap2-bin + apt install -y datacenter-gpu-manager-exporter datacenter-gpu-manager-core datacenter-gpu-manager-proprietary libcap2-bin systemctl enable nvidia-dcgm systemctl enable nvidia-dcgm-exporter fi From 3013a9509919f4e2c389082319efa3386ec704c4 Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Thu, 14 Aug 2025 07:09:47 -0500 Subject: [PATCH 10/87] Fix NVIDIA package names --- vhdbuilder/packer/install-dependencies.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 5d4485b2815..756b5f403f6 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -740,7 +740,7 @@ if grep -q "GB200" <<< "$FEATURE_FLAGS"; then # Install the NVIDIA driver apt install -y nvidia-driver-575 # Install DCGM exporter - apt install -y datacenter-gpu-manager-exporter datacenter-gpu-manager-core datacenter-gpu-manager-proprietary libcap2-bin + apt install -y 
datacenter-gpu-manager-exporter datacenter-gpu-manager-4-core datacenter-gpu-manager-4-proprietary libcap2-bin systemctl enable nvidia-dcgm systemctl enable nvidia-dcgm-exporter fi From e5480dc3576180707ad85b1f93cbd2d21167879d Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Mon, 18 Aug 2025 12:24:34 -0500 Subject: [PATCH 11/87] Increase systemd log level to debug to troubleshoot boot errors --- parts/linux/cloud-init/artifacts/10_azure_nvidia | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parts/linux/cloud-init/artifacts/10_azure_nvidia b/parts/linux/cloud-init/artifacts/10_azure_nvidia index 47d5a2fab15..739df76f097 100755 --- a/parts/linux/cloud-init/artifacts/10_azure_nvidia +++ b/parts/linux/cloud-init/artifacts/10_azure_nvidia @@ -33,7 +33,7 @@ smbios --type 4 --get-string 7 --set cpu_manufacturer if [ x\$cpu_manufacturer = xNVIDIA ]; then set default="gnulinux-${NVIDIA}-advanced-${boot_device_id}" - set nvidia_args="iommu.passthrough=1 irqchip.gicv3_nolpi=y arm_smmu_v3.disable_msipolling=1" + set nvidia_args="iommu.passthrough=1 irqchip.gicv3_nolpi=y arm_smmu_v3.disable_msipolling=1 systemd.log_level=debug" else set default="gnulinux-${OTHER}-advanced-${boot_device_id}" fi From b3f64ada60d69b1e600ab004ce853644fd9b9f36 Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Tue, 19 Aug 2025 09:48:53 -0500 Subject: [PATCH 12/87] Revert systemd debugging kernel parameter --- parts/linux/cloud-init/artifacts/10_azure_nvidia | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parts/linux/cloud-init/artifacts/10_azure_nvidia b/parts/linux/cloud-init/artifacts/10_azure_nvidia index 739df76f097..47d5a2fab15 100755 --- a/parts/linux/cloud-init/artifacts/10_azure_nvidia +++ b/parts/linux/cloud-init/artifacts/10_azure_nvidia @@ -33,7 +33,7 @@ smbios --type 4 --get-string 7 --set cpu_manufacturer if [ x\$cpu_manufacturer = xNVIDIA ]; then set default="gnulinux-${NVIDIA}-advanced-${boot_device_id}" - set nvidia_args="iommu.passthrough=1 
irqchip.gicv3_nolpi=y arm_smmu_v3.disable_msipolling=1 systemd.log_level=debug" + set nvidia_args="iommu.passthrough=1 irqchip.gicv3_nolpi=y arm_smmu_v3.disable_msipolling=1" else set default="gnulinux-${OTHER}-advanced-${boot_device_id}" fi From c3c4c3f6efda400f8e0775a7ad473f7a3ae46874 Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Wed, 20 Aug 2025 11:47:55 -0500 Subject: [PATCH 13/87] Add label limit check to prevent kubelet from crashing --- parts/linux/cloud-init/artifacts/cse_helpers.sh | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/parts/linux/cloud-init/artifacts/cse_helpers.sh b/parts/linux/cloud-init/artifacts/cse_helpers.sh index 9a78cddf2fc..9ea4936ec06 100755 --- a/parts/linux/cloud-init/artifacts/cse_helpers.sh +++ b/parts/linux/cloud-init/artifacts/cse_helpers.sh @@ -1103,6 +1103,20 @@ addKubeletNodeLabel() { return 0 fi + # Check if the label value exceeds 63 characters (Kubernetes label value limit) + # Extract the value part after the '=' sign + # Without this limit check, kubelet will crash vaguely when first joining a cluster + local label_key="${LABEL_STRING%%=*}" + local label_value="${LABEL_STRING#*=}" + + if [ "${#label_value}" -gt 63 ]; then + echo "Warning: Label value for '${label_key}' exceeds 63 characters (${#label_value} chars). Truncating to last 63 characters." + # Take only the last 63 characters of the value + label_value="${label_value: -63}" + LABEL_STRING="${label_key}=${label_value}" + echo "Truncated label: ${LABEL_STRING}" + fi + echo "adding label $LABEL_STRING to kubelet node labels..." 
if [ -n "$KUBELET_NODE_LABELS" ]; then KUBELET_NODE_LABELS="${KUBELET_NODE_LABELS},${LABEL_STRING}" From 076e3c1047aca47629dddbaafbfa58f96b778859 Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Wed, 20 Aug 2025 12:44:36 -0500 Subject: [PATCH 14/87] Add additional NVIDIA dependencies to the GB200 image --- vhdbuilder/packer/install-dependencies.sh | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 756b5f403f6..288e1090c56 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -736,11 +736,15 @@ if grep -q "GB200" <<< "$FEATURE_FLAGS"; then systemctl restart openibd ofed_info -s + apt install -y \ + nvidia-driver-580-open \ + cuda-12-toolkit \ + nvidia-container-toolkit \ + datacenter-gpu-manager-exporter \ + datacenter-gpu-manager-4-core \ + datacenter-gpu-manager-4-proprietary \ + libcap2-bin - # Install the NVIDIA driver - apt install -y nvidia-driver-575 - # Install DCGM exporter - apt install -y datacenter-gpu-manager-exporter datacenter-gpu-manager-4-core datacenter-gpu-manager-4-proprietary libcap2-bin systemctl enable nvidia-dcgm systemctl enable nvidia-dcgm-exporter fi From 8013bf18d6e9d271848593941c1bb79694f4d233 Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Wed, 20 Aug 2025 13:19:32 -0500 Subject: [PATCH 15/87] Add nvidia-container-toolkit configuration to containerd --- .../artifacts/ubuntu/containerd-nvidia.toml | 39 +++++++++++++++++++ vhdbuilder/packer/packer_source.sh | 4 ++ .../packer/vhd-image-builder-arm64-gen2.json | 15 +++++++ 3 files changed, 58 insertions(+) create mode 100644 parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia.toml diff --git a/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia.toml b/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia.toml new file mode 100644 index 00000000000..2366562134d --- /dev/null +++ 
b/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia.toml @@ -0,0 +1,39 @@ +oom_score = -999 +version = 2 + +[metrics] + address = "0.0.0.0:10257" + +[plugins] + +[plugins."io.containerd.grpc.v1.cri"] + sandbox_image = "mcr.microsoft.com/oss/kubernetes/pause:3.6" + +[plugins."io.containerd.grpc.v1.cri".containerd] + default_runtime_name = "nvidia" + + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes] + + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia] + runtime_type = "io.containerd.runc.v2" + + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options] + BinaryName = "/usr/bin/nvidia-container-runtime" + + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc] + runtime_type = "io.containerd.runc.v2" + + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options] + BinaryName = "/usr/bin/runc" + + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.untrusted] + runtime_type = "io.containerd.runc.v2" + + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.untrusted.options] + BinaryName = "/usr/bin/runc" + +[plugins."io.containerd.grpc.v1.cri".registry] + config_path = "/etc/containerd/certs.d" + +[plugins."io.containerd.grpc.v1.cri".registry.headers] + X-Meta-Source-Client = ["azure/aks"] \ No newline at end of file diff --git a/vhdbuilder/packer/packer_source.sh b/vhdbuilder/packer/packer_source.sh index 5dbb69440d6..86e86411013 100644 --- a/vhdbuilder/packer/packer_source.sh +++ b/vhdbuilder/packer/packer_source.sh @@ -521,6 +521,10 @@ copyPackerFiles() { NVIDIA_ASC_SRC=/home/packer/nvidia.pub NVIDIA_ASC_DEST=/etc/apt/keyrings/nvidia.pub cpAndMode $NVIDIA_ASC_SRC $NVIDIA_ASC_DEST 644 + + CONTAINERD_NVIDIA_TOML_SRC=/home/packer/containerd-nvidia.toml + CONTAINERD_NVIDIA_TOML_DEST=/etc/containerd/config.toml + cpAndMode $CONTAINERD_NVIDIA_TOML_SRC $CONTAINERD_NVIDIA_TOML_DEST 644 fi fi diff --git a/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json 
b/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json index a1101f5024c..f04acee4b44 100644 --- a/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json +++ b/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json @@ -762,6 +762,21 @@ "source": "parts/linux/cloud-init/artifacts/ubuntu/mellanox_mlnx_ofed.pub", "destination": "/home/packer/mellanox_mlnx_ofed.pub" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ubuntu/nvidia-2404.list", + "destination": "/home/packer/nvidia-2404.list" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ubuntu/nvidia.pub", + "destination": "/home/packer/nvidia.pub" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia.toml", + "destination": "/home/packer/containerd-nvidia.toml" + }, { "type": "shell", "inline": [ From 140767905307302c3ff3e620995f5f935bdc1a05 Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Wed, 20 Aug 2025 16:04:32 -0500 Subject: [PATCH 16/87] Fix CUDA package name --- vhdbuilder/packer/install-dependencies.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 288e1090c56..64eb77f5ca0 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -738,7 +738,7 @@ if grep -q "GB200" <<< "$FEATURE_FLAGS"; then ofed_info -s apt install -y \ nvidia-driver-580-open \ - cuda-12-toolkit \ + cuda-toolkit-12 \ nvidia-container-toolkit \ datacenter-gpu-manager-exporter \ datacenter-gpu-manager-4-core \ From d5761cc8d9903e081d0c4e894254215255bf71de Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Thu, 21 Aug 2025 16:52:50 -0500 Subject: [PATCH 17/87] Move label value truncation logic to types.go --- .../linux/cloud-init/artifacts/cse_helpers.sh | 20 +++++++++---------- pkg/agent/datamodel/types.go | 16 ++++++++++++++- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git 
a/parts/linux/cloud-init/artifacts/cse_helpers.sh b/parts/linux/cloud-init/artifacts/cse_helpers.sh index 9ea4936ec06..057e162e478 100755 --- a/parts/linux/cloud-init/artifacts/cse_helpers.sh +++ b/parts/linux/cloud-init/artifacts/cse_helpers.sh @@ -1106,16 +1106,16 @@ addKubeletNodeLabel() { # Check if the label value exceeds 63 characters (Kubernetes label value limit) # Extract the value part after the '=' sign # Without this limit check, kubelet will crash vaguely when first joining a cluster - local label_key="${LABEL_STRING%%=*}" - local label_value="${LABEL_STRING#*=}" - - if [ "${#label_value}" -gt 63 ]; then - echo "Warning: Label value for '${label_key}' exceeds 63 characters (${#label_value} chars). Truncating to last 63 characters." - # Take only the last 63 characters of the value - label_value="${label_value: -63}" - LABEL_STRING="${label_key}=${label_value}" - echo "Truncated label: ${LABEL_STRING}" - fi + #local label_key="${LABEL_STRING%%=*}" + #local label_value="${LABEL_STRING#*=}" + + #if [ "${#label_value}" -gt 63 ]; then + # echo "Warning: Label value for '${label_key}' exceeds 63 characters (${#label_value} chars). Truncating to last 63 characters." + # # Take only the last 63 characters of the value + # label_value="${label_value: -63}" + # LABEL_STRING="${label_key}=${label_value}" + # echo "Truncated label: ${LABEL_STRING}" + #fi echo "adding label $LABEL_STRING to kubelet node labels..." 
if [ -n "$KUBELET_NODE_LABELS" ]; then diff --git a/pkg/agent/datamodel/types.go b/pkg/agent/datamodel/types.go index 61ef460608b..4dea4b3ef29 100644 --- a/pkg/agent/datamodel/types.go +++ b/pkg/agent/datamodel/types.go @@ -290,6 +290,8 @@ const ( Componentkubelet CustomConfigurationComponent = "kubelet" ) +const MAXIMUM_VALUE_LENGTH = 63 + func (d Distro) IsVHDDistro() bool { for _, distro := range AKSDistrosAvailableOnVHD { if d == distro { @@ -1262,7 +1264,19 @@ func (a *AgentPoolProfile) GetKubernetesLabels() string { } sort.Strings(keys) for _, key := range keys { - buf.WriteString(fmt.Sprintf(",%s=%s", key, a.CustomNodeLabels[key])) + value := a.CustomNodeLabels[key] + /* + The maximum length of a value is 63 characters. If Kubelet is started with a key + that has a value longer than 63 characters, it will crash on startup. Truncate + the front of the string since the end of the string is more likely to contain + relevant information like version numbers. + */ + if len(value) > MAXIMUM_VALUE_LENGTH { + truncatedValue := value[len(value)-MAXIMUM_VALUE_LENGTH:] + buf.WriteString(fmt.Sprintf(",%s=%s", key, truncatedValue)) + } else { + buf.WriteString(fmt.Sprintf(",%s=%s", key, a.CustomNodeLabels[key])) + } } return buf.String() } From d881f97fdff3358e1b53f52099b64a17954e24e8 Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Thu, 21 Aug 2025 20:30:28 -0500 Subject: [PATCH 18/87] Revert validation logic in Go code, add validation before KUBELET_NODE_LABELS are written to a /etc/default/kubelet --- .../linux/cloud-init/artifacts/cse_config.sh | 38 +++++++++++++++++++ .../linux/cloud-init/artifacts/cse_helpers.sh | 14 ------- pkg/agent/datamodel/types.go | 16 +------- 3 files changed, 39 insertions(+), 29 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/cse_config.sh b/parts/linux/cloud-init/artifacts/cse_config.sh index 1bccaf924c7..7b742772668 100755 --- a/parts/linux/cloud-init/artifacts/cse_config.sh +++ b/parts/linux/cloud-init/artifacts/cse_config.sh 
@@ -634,6 +634,43 @@ ensurePodInfraContainerImage() { rm -f ${POD_INFRA_CONTAINER_IMAGE_TAR} } +validateKubeletNodeLabels() { + local labels="$1" + local validated_labels="" + local delimiter="" + + # Return empty if no labels provided + if [ -z "$labels" ]; then + echo "No labels found in KUBELET_NODE_LABELS" + return 0 + fi + + # Split labels by comma and process each + IFS=',' read -ra LABEL_ARRAY <<< "$labels" + for label in "${LABEL_ARRAY[@]}"; do + # Split each label into key and value + if [[ "$label" == *"="* ]]; then + key="${label%%=*}" + value="${label#*=}" + + # Check if key length exceeds 63 characters + if [ ${#key} -gt 63 ]; then + echo "Warning: Label key '$key' exceeds 63 characters, truncating to 63 characters" >&2 + key="${key:0:63}" + fi + + # Rebuild the label with potentially truncated key + validated_labels="${validated_labels}${delimiter}${key}=${value}" + fi + + # Set delimiter for subsequent labels + delimiter="," + done + + # Update the global variable with validated labels + KUBELET_NODE_LABELS="$validated_labels" +} + ensureKubelet() { KUBELET_DEFAULT_FILE=/etc/default/kubelet mkdir -p /etc/default @@ -662,6 +699,7 @@ EOF echo "KUBELET_REGISTER_SCHEDULABLE=true" >> "${KUBELET_DEFAULT_FILE}" echo "NETWORK_POLICY=${NETWORK_POLICY}" >> "${KUBELET_DEFAULT_FILE}" echo "KUBELET_IMAGE=${KUBELET_IMAGE}" >> "${KUBELET_DEFAULT_FILE}" + validateKubeletNodeLabels "${KUBELET_NODE_LABELS}" echo "KUBELET_NODE_LABELS=${KUBELET_NODE_LABELS}" >> "${KUBELET_DEFAULT_FILE}" if [ -n "${AZURE_ENVIRONMENT_FILEPATH}" ]; then echo "AZURE_ENVIRONMENT_FILEPATH=${AZURE_ENVIRONMENT_FILEPATH}" >> "${KUBELET_DEFAULT_FILE}" diff --git a/parts/linux/cloud-init/artifacts/cse_helpers.sh b/parts/linux/cloud-init/artifacts/cse_helpers.sh index 057e162e478..9a78cddf2fc 100755 --- a/parts/linux/cloud-init/artifacts/cse_helpers.sh +++ b/parts/linux/cloud-init/artifacts/cse_helpers.sh @@ -1103,20 +1103,6 @@ addKubeletNodeLabel() { return 0 fi - # Check if the label value 
exceeds 63 characters (Kubernetes label value limit) - # Extract the value part after the '=' sign - # Without this limit check, kubelet will crash vaguely when first joining a cluster - #local label_key="${LABEL_STRING%%=*}" - #local label_value="${LABEL_STRING#*=}" - - #if [ "${#label_value}" -gt 63 ]; then - # echo "Warning: Label value for '${label_key}' exceeds 63 characters (${#label_value} chars). Truncating to last 63 characters." - # # Take only the last 63 characters of the value - # label_value="${label_value: -63}" - # LABEL_STRING="${label_key}=${label_value}" - # echo "Truncated label: ${LABEL_STRING}" - #fi - echo "adding label $LABEL_STRING to kubelet node labels..." if [ -n "$KUBELET_NODE_LABELS" ]; then KUBELET_NODE_LABELS="${KUBELET_NODE_LABELS},${LABEL_STRING}" diff --git a/pkg/agent/datamodel/types.go b/pkg/agent/datamodel/types.go index 4dea4b3ef29..61ef460608b 100644 --- a/pkg/agent/datamodel/types.go +++ b/pkg/agent/datamodel/types.go @@ -290,8 +290,6 @@ const ( Componentkubelet CustomConfigurationComponent = "kubelet" ) -const MAXIMUM_VALUE_LENGTH = 63 - func (d Distro) IsVHDDistro() bool { for _, distro := range AKSDistrosAvailableOnVHD { if d == distro { @@ -1264,19 +1262,7 @@ func (a *AgentPoolProfile) GetKubernetesLabels() string { } sort.Strings(keys) for _, key := range keys { - value := a.CustomNodeLabels[key] - /* - The maximum length of a value is 63 characters. If Kubelet is started with a key - that has a value longer than 63 characters, it will crash on startup. Truncate - the front of the string since the end of the string is more likely to contain - relevant information like version numbers. 
- */ - if len(value) > MAXIMUM_VALUE_LENGTH { - truncatedValue := value[len(value)-MAXIMUM_VALUE_LENGTH:] - buf.WriteString(fmt.Sprintf(",%s=%s", key, truncatedValue)) - } else { - buf.WriteString(fmt.Sprintf(",%s=%s", key, a.CustomNodeLabels[key])) - } + buf.WriteString(fmt.Sprintf(",%s=%s", key, a.CustomNodeLabels[key])) } return buf.String() } From 4d70ddcd1b2ead85b043676139e0b9c89df8281f Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Fri, 22 Aug 2025 08:24:00 -0500 Subject: [PATCH 19/87] Revert validation, shorten artifact name --- .pipelines/.vsts-vhd-builder-release.yaml | 10 ++++++++-- parts/linux/cloud-init/artifacts/cse_config.sh | 1 - vhdbuilder/packer/pre-install-dependencies.sh | 2 +- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/.pipelines/.vsts-vhd-builder-release.yaml b/.pipelines/.vsts-vhd-builder-release.yaml index 7d022a5c1e1..23558f173de 100644 --- a/.pipelines/.vsts-vhd-builder-release.yaml +++ b/.pipelines/.vsts-vhd-builder-release.yaml @@ -81,7 +81,7 @@ parameters: displayName: Build 2404 ARM64 Gen2 containerd type: boolean default: true - - name: build2404arm64gb200gen2containerd + - name: build2404arm64gb200 displayName: Build 2404 ARM64 GB200 Gen2 Containerd type: boolean default: false @@ -966,6 +966,12 @@ stages: condition: eq('${{ parameters.build2404arm64gb200gen2containerd }}', true) dependsOn: [ ] timeoutInMinutes: 360 + - stage: build_vhd_2404_arm64_gb200 + condition: eq('${{ parameters.build2404arm64gb200 }}', true) + dependsOn: [] + jobs: + - job: build2404arm64gb200 + timeoutInMinutes: 180 steps: - bash: | echo '##vso[task.setvariable variable=OS_SKU]Ubuntu' @@ -985,7 +991,7 @@ stages: parameters: useOverrides: ${{ parameters.useOverrides }} overrideBranch: ${{ parameters.overrideBranch }} - artifactName: 2404-arm64-gb200-gen2-containerd + artifactName: 2404-arm64-gb200 - stage: e2e condition: and(succeeded(), ne(variables.SKIP_E2E_TESTS, 'true')) variables: diff --git 
a/parts/linux/cloud-init/artifacts/cse_config.sh b/parts/linux/cloud-init/artifacts/cse_config.sh index 7b742772668..49552f8bfc4 100755 --- a/parts/linux/cloud-init/artifacts/cse_config.sh +++ b/parts/linux/cloud-init/artifacts/cse_config.sh @@ -699,7 +699,6 @@ EOF echo "KUBELET_REGISTER_SCHEDULABLE=true" >> "${KUBELET_DEFAULT_FILE}" echo "NETWORK_POLICY=${NETWORK_POLICY}" >> "${KUBELET_DEFAULT_FILE}" echo "KUBELET_IMAGE=${KUBELET_IMAGE}" >> "${KUBELET_DEFAULT_FILE}" - validateKubeletNodeLabels "${KUBELET_NODE_LABELS}" echo "KUBELET_NODE_LABELS=${KUBELET_NODE_LABELS}" >> "${KUBELET_DEFAULT_FILE}" if [ -n "${AZURE_ENVIRONMENT_FILEPATH}" ]; then echo "AZURE_ENVIRONMENT_FILEPATH=${AZURE_ENVIRONMENT_FILEPATH}" >> "${KUBELET_DEFAULT_FILE}" diff --git a/vhdbuilder/packer/pre-install-dependencies.sh b/vhdbuilder/packer/pre-install-dependencies.sh index 5facc01f52e..c6ec20492f9 100644 --- a/vhdbuilder/packer/pre-install-dependencies.sh +++ b/vhdbuilder/packer/pre-install-dependencies.sh @@ -198,7 +198,7 @@ if [[ ${UBUNTU_RELEASE//./} -ge 2204 && "${ENABLE_FIPS,,}" != "true" ]]; then fi NVIDIA_KERNEL_PACKAGE="linux-azure-nvidia" if [[ "${CPU_ARCH}" == "arm64" && "${UBUNTU_RELEASE}" = "24.04" ]]; then - # This is the ubuntu 2404arm64gen2containerd image or the build2404arm64gb200gen2containerd image + # This is the ubuntu 2404arm64gen2containerd image or the 2404arm64gb200 image # Uncomment if we have trouble finding the kernel package. 
# sudo add-apt-repository ppa:canonical-kernel-team/ppa sudo apt update From b50159aea84457a36f3f5be426a98921f2194182 Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Fri, 22 Aug 2025 09:13:06 -0500 Subject: [PATCH 20/87] Revert changes, add truncation logic to cse_config.sh before node labels are written to the /etc/default/kubelet file --- .pipelines/.vsts-vhd-builder-release.yaml | 8 ++-- .../linux/cloud-init/artifacts/cse_config.sh | 44 ++++++++++++++++++- 2 files changed, 48 insertions(+), 4 deletions(-) diff --git a/.pipelines/.vsts-vhd-builder-release.yaml b/.pipelines/.vsts-vhd-builder-release.yaml index 23558f173de..a3ed0525086 100644 --- a/.pipelines/.vsts-vhd-builder-release.yaml +++ b/.pipelines/.vsts-vhd-builder-release.yaml @@ -81,7 +81,7 @@ parameters: displayName: Build 2404 ARM64 Gen2 containerd type: boolean default: true - - name: build2404arm64gb200 + - name: build2404arm64gb200gen2containerd displayName: Build 2404 ARM64 GB200 Gen2 Containerd type: boolean default: false @@ -968,9 +968,11 @@ stages: timeoutInMinutes: 360 - stage: build_vhd_2404_arm64_gb200 condition: eq('${{ parameters.build2404arm64gb200 }}', true) + - stage: build_vhd_2404_arm64_gb200_gen2_containerd + condition: eq('${{ parameters.build2404arm64gb200gen2containerd }}', true) dependsOn: [] jobs: - - job: build2404arm64gb200 + - job: build2404arm64gb200gen2containerd timeoutInMinutes: 180 steps: - bash: | @@ -991,7 +993,7 @@ stages: parameters: useOverrides: ${{ parameters.useOverrides }} overrideBranch: ${{ parameters.overrideBranch }} - artifactName: 2404-arm64-gb200 + artifactName: 2404-arm64-gb200-gen2-containerd - stage: e2e condition: and(succeeded(), ne(variables.SKIP_E2E_TESTS, 'true')) variables: diff --git a/parts/linux/cloud-init/artifacts/cse_config.sh b/parts/linux/cloud-init/artifacts/cse_config.sh index 49552f8bfc4..c307c832d3d 100755 --- a/parts/linux/cloud-init/artifacts/cse_config.sh +++ b/parts/linux/cloud-init/artifacts/cse_config.sh @@ -675,6 +675,46 @@ 
ensureKubelet() { KUBELET_DEFAULT_FILE=/etc/default/kubelet mkdir -p /etc/default + # Function to validate and truncate kubelet node label values to 63 characters + validateKubeletNodeLabels() { + local labels="$1" + local validated_labels="" + local delimiter="" + + # Return empty if no labels provided + if [ -z "$labels" ]; then + echo "" + return 0 + fi + + # Split labels by comma and process each + IFS=',' read -ra LABEL_ARRAY <<< "$labels" + for label in "${LABEL_ARRAY[@]}"; do + # Split each label into key and value + if [[ "$label" == *"="* ]]; then + key="${label%%=*}" + value="${label#*=}" + + # Check if value length exceeds 63 characters + if [ ${#value} -gt 63 ]; then + echo "Warning: Label value for key '$key' exceeds 63 characters, truncating from ${#value} to 63 characters" >&2 + value="${value:0:63}" + fi + + # Rebuild the label with potentially truncated value + validated_labels="${validated_labels}${delimiter}${key}=${value}" + else + # Handle labels without values (though this shouldn't happen in practice) + validated_labels="${validated_labels}${delimiter}${label}" + fi + + # Set delimiter for subsequent labels + delimiter="," + done + + echo "$validated_labels" + } + # In k8s >= 1.29 kubelet no longer sets node internalIP when using external cloud provider # https://github.com/kubernetes/kubernetes/pull/121028 # This regresses node startup performance in Azure CNI Overlay and Podsubnet clusters, which require the node to be @@ -699,7 +739,9 @@ EOF echo "KUBELET_REGISTER_SCHEDULABLE=true" >> "${KUBELET_DEFAULT_FILE}" echo "NETWORK_POLICY=${NETWORK_POLICY}" >> "${KUBELET_DEFAULT_FILE}" echo "KUBELET_IMAGE=${KUBELET_IMAGE}" >> "${KUBELET_DEFAULT_FILE}" - echo "KUBELET_NODE_LABELS=${KUBELET_NODE_LABELS}" >> "${KUBELET_DEFAULT_FILE}" + # Validate and truncate label values to 63 characters before writing to kubelet config + VALIDATED_KUBELET_NODE_LABELS=$(validateKubeletNodeLabels "${KUBELET_NODE_LABELS}") + echo 
"KUBELET_NODE_LABELS=${VALIDATED_KUBELET_NODE_LABELS}" >> "${KUBELET_DEFAULT_FILE}" if [ -n "${AZURE_ENVIRONMENT_FILEPATH}" ]; then echo "AZURE_ENVIRONMENT_FILEPATH=${AZURE_ENVIRONMENT_FILEPATH}" >> "${KUBELET_DEFAULT_FILE}" fi From f5cba9aac6ef9c4933050d527ce39cde45be7217 Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Fri, 22 Aug 2025 11:55:06 -0500 Subject: [PATCH 21/87] Add ExecStartPre script to trim KUBELET_NODE_LABELS, revert previous changes --- .../linux/cloud-init/artifacts/cse_config.sh | 44 +-------- .../artifacts/update-kubelet-node-labels.sh | 93 +++++++++++++++++++ vhdbuilder/packer/packer_source.sh | 3 + .../packer/vhd-image-builder-arm64-gen2.json | 5 + 4 files changed, 102 insertions(+), 43 deletions(-) create mode 100644 parts/linux/cloud-init/artifacts/update-kubelet-node-labels.sh diff --git a/parts/linux/cloud-init/artifacts/cse_config.sh b/parts/linux/cloud-init/artifacts/cse_config.sh index c307c832d3d..49552f8bfc4 100755 --- a/parts/linux/cloud-init/artifacts/cse_config.sh +++ b/parts/linux/cloud-init/artifacts/cse_config.sh @@ -675,46 +675,6 @@ ensureKubelet() { KUBELET_DEFAULT_FILE=/etc/default/kubelet mkdir -p /etc/default - # Function to validate and truncate kubelet node label values to 63 characters - validateKubeletNodeLabels() { - local labels="$1" - local validated_labels="" - local delimiter="" - - # Return empty if no labels provided - if [ -z "$labels" ]; then - echo "" - return 0 - fi - - # Split labels by comma and process each - IFS=',' read -ra LABEL_ARRAY <<< "$labels" - for label in "${LABEL_ARRAY[@]}"; do - # Split each label into key and value - if [[ "$label" == *"="* ]]; then - key="${label%%=*}" - value="${label#*=}" - - # Check if value length exceeds 63 characters - if [ ${#value} -gt 63 ]; then - echo "Warning: Label value for key '$key' exceeds 63 characters, truncating from ${#value} to 63 characters" >&2 - value="${value:0:63}" - fi - - # Rebuild the label with potentially truncated value - 
validated_labels="${validated_labels}${delimiter}${key}=${value}" - else - # Handle labels without values (though this shouldn't happen in practice) - validated_labels="${validated_labels}${delimiter}${label}" - fi - - # Set delimiter for subsequent labels - delimiter="," - done - - echo "$validated_labels" - } - # In k8s >= 1.29 kubelet no longer sets node internalIP when using external cloud provider # https://github.com/kubernetes/kubernetes/pull/121028 # This regresses node startup performance in Azure CNI Overlay and Podsubnet clusters, which require the node to be @@ -739,9 +699,7 @@ EOF echo "KUBELET_REGISTER_SCHEDULABLE=true" >> "${KUBELET_DEFAULT_FILE}" echo "NETWORK_POLICY=${NETWORK_POLICY}" >> "${KUBELET_DEFAULT_FILE}" echo "KUBELET_IMAGE=${KUBELET_IMAGE}" >> "${KUBELET_DEFAULT_FILE}" - # Validate and truncate label values to 63 characters before writing to kubelet config - VALIDATED_KUBELET_NODE_LABELS=$(validateKubeletNodeLabels "${KUBELET_NODE_LABELS}") - echo "KUBELET_NODE_LABELS=${VALIDATED_KUBELET_NODE_LABELS}" >> "${KUBELET_DEFAULT_FILE}" + echo "KUBELET_NODE_LABELS=${KUBELET_NODE_LABELS}" >> "${KUBELET_DEFAULT_FILE}" if [ -n "${AZURE_ENVIRONMENT_FILEPATH}" ]; then echo "AZURE_ENVIRONMENT_FILEPATH=${AZURE_ENVIRONMENT_FILEPATH}" >> "${KUBELET_DEFAULT_FILE}" fi diff --git a/parts/linux/cloud-init/artifacts/update-kubelet-node-labels.sh b/parts/linux/cloud-init/artifacts/update-kubelet-node-labels.sh new file mode 100644 index 00000000000..335407de476 --- /dev/null +++ b/parts/linux/cloud-init/artifacts/update-kubelet-node-labels.sh @@ -0,0 +1,93 @@ +#!/bin/bash + +# Script to update KUBELET_NODE_LABELS in /etc/default/kubelet +# This script modifies the kubelet configuration file in-place to correct node labels +# that may have been overwritten by custom data processes. 
+ +set -euo pipefail + +KUBELET_DEFAULT_FILE="/etc/default/kubelet" +BACKUP_FILE="/etc/default/kubelet.backup" + +# Function to log messages +log() { + echo "$(date '+%Y-%m-%d %H:%M:%S') [update-kubelet-node-labels] $*" >&2 +} + +# Check if the kubelet default file exists +if [ ! -f "$KUBELET_DEFAULT_FILE" ]; then + log "ERROR: $KUBELET_DEFAULT_FILE does not exist" + exit 1 +fi + +# Create a backup of the original file if it doesn't exist +if [ ! -f "$BACKUP_FILE" ]; then + cp "$KUBELET_DEFAULT_FILE" "$BACKUP_FILE" + log "Created backup at $BACKUP_FILE" +fi + +# Extract KUBELET_NODE_LABELS from the file +KUBELET_NODE_LABELS=$(grep "^KUBELET_NODE_LABELS=" "$KUBELET_DEFAULT_FILE" | cut -d'"' -f2) + +log "Current KUBELET_NODE_LABELS: $KUBELET_NODE_LABELS" + +# Process the labels if they exist +if [ -n "$KUBELET_NODE_LABELS" ]; then + NEW_LABELS="" + + # Split by comma and process each label + OLD_IFS="$IFS" + IFS=',' + for label in $KUBELET_NODE_LABELS; do + IFS="$OLD_IFS" + + # Skip empty labels + if [ -n "$label" ]; then + # Split by equal sign to get key and value + key="${label%%=*}" + value="${label#*=}" + + # Truncate value if longer than 63 characters + if [ ${#value} -gt 63 ]; then + value="${value:0:63}" + log "WARNING: Truncated value for key '$key' to 63 characters" + fi + + # Reassemble the label + processed_label="${key}=${value}" + + # Add to new labels list + if [ -n "$NEW_LABELS" ]; then + NEW_LABELS="${NEW_LABELS},${processed_label}" + else + NEW_LABELS="$processed_label" + fi + fi + IFS=',' + done + IFS="$OLD_IFS" + + KUBELET_NODE_LABELS="$NEW_LABELS" +fi + +log "Final KUBELET_NODE_LABELS: $KUBELET_NODE_LABELS" + +TEMP_FILE=$(mktemp) +trap 'rm -f "$TEMP_FILE"' EXIT + +while IFS= read -r line; do + case "$line" in + KUBELET_NODE_LABELS=*) + echo "KUBELET_NODE_LABELS=\"$KUBELET_NODE_LABELS\"" + ;; + *) + echo "$line" + ;; + esac +done < "$KUBELET_DEFAULT_FILE" > "$TEMP_FILE" + +mv "$TEMP_FILE" "$KUBELET_DEFAULT_FILE" + +chmod 644 
"$KUBELET_DEFAULT_FILE" + +log "Successfully updated $KUBELET_DEFAULT_FILE" diff --git a/vhdbuilder/packer/packer_source.sh b/vhdbuilder/packer/packer_source.sh index 86e86411013..003622de4e4 100644 --- a/vhdbuilder/packer/packer_source.sh +++ b/vhdbuilder/packer/packer_source.sh @@ -125,6 +125,8 @@ copyPackerFiles() { MEASURE_TLS_BOOTSTRAPPING_LATENCY_SERVICE_DEST=/etc/systemd/system/measure-tls-bootstrapping-latency.service VALIDATE_KUBELET_CREDENTIALS_SCRIPT_SRC=/home/packer/validate-kubelet-credentials.sh VALIDATE_KUBELET_CREDENTIALS_SCRIPT_DEST=/opt/azure/containers/validate-kubelet-credentials.sh + UPDATE_KUBELET_NODE_LABELS_SRC=/home/packer/update-kubelet-node-labels.sh + UPDATE_KUBELET_NODE_LABELS_DEST=/opt/azure/containers/update-kubelet-node-labels.sh RECONCILE_PRIVATE_HOSTS_SRC=/home/packer/reconcile-private-hosts.sh RECONCILE_PRIVATE_HOSTS_DEST=/opt/azure/containers/reconcilePrivateHosts.sh KUBELET_SERVICE_SRC=/home/packer/kubelet.service @@ -416,6 +418,7 @@ copyPackerFiles() { cpAndMode $MEASURE_TLS_BOOTSTRAPPING_LATENCY_SCRIPT_SRC $MEASURE_TLS_BOOTSTRAPPING_LATENCY_SCRIPT_DEST 755 cpAndMode $MEASURE_TLS_BOOTSTRAPPING_LATENCY_SERVICE_SRC $MEASURE_TLS_BOOTSTRAPPING_LATENCY_SERVICE_DEST 644 cpAndMode $VALIDATE_KUBELET_CREDENTIALS_SCRIPT_SRC $VALIDATE_KUBELET_CREDENTIALS_SCRIPT_DEST 755 + cpAndMode $UPDATE_KUBELET_NODE_LABELS_SRC $UPDATE_KUBELET_NODE_LABELS_DEST 755 cpAndMode $RECONCILE_PRIVATE_HOSTS_SRC $RECONCILE_PRIVATE_HOSTS_DEST 744 cpAndMode $SYSCTL_CONFIG_SRC $SYSCTL_CONFIG_DEST 644 cpAndMode $RSYSLOG_CONFIG_SRC $RSYSLOG_CONFIG_DEST 644 diff --git a/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json b/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json index f04acee4b44..6ad379eae25 100644 --- a/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json +++ b/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json @@ -222,6 +222,11 @@ "source": "parts/linux/cloud-init/artifacts/validate-kubelet-credentials.sh", "destination": 
"/home/packer/validate-kubelet-credentials.sh" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/update-kubelet-node-labels.sh", + "destination": "/home/packer/update-kubelet-node-labels.sh" + }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/cse_redact_cloud_config.py", From c27135d9e4125d62f53af077810fd1e9c7764daf Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Fri, 22 Aug 2025 13:52:14 -0500 Subject: [PATCH 22/87] Remove changes related to truncating node label, specifying version number instead --- .../artifacts/update-kubelet-node-labels.sh | 93 ------------------- 1 file changed, 93 deletions(-) delete mode 100644 parts/linux/cloud-init/artifacts/update-kubelet-node-labels.sh diff --git a/parts/linux/cloud-init/artifacts/update-kubelet-node-labels.sh b/parts/linux/cloud-init/artifacts/update-kubelet-node-labels.sh deleted file mode 100644 index 335407de476..00000000000 --- a/parts/linux/cloud-init/artifacts/update-kubelet-node-labels.sh +++ /dev/null @@ -1,93 +0,0 @@ -#!/bin/bash - -# Script to update KUBELET_NODE_LABELS in /etc/default/kubelet -# This script modifies the kubelet configuration file in-place to correct node labels -# that may have been overwritten by custom data processes. - -set -euo pipefail - -KUBELET_DEFAULT_FILE="/etc/default/kubelet" -BACKUP_FILE="/etc/default/kubelet.backup" - -# Function to log messages -log() { - echo "$(date '+%Y-%m-%d %H:%M:%S') [update-kubelet-node-labels] $*" >&2 -} - -# Check if the kubelet default file exists -if [ ! -f "$KUBELET_DEFAULT_FILE" ]; then - log "ERROR: $KUBELET_DEFAULT_FILE does not exist" - exit 1 -fi - -# Create a backup of the original file if it doesn't exist -if [ ! 
-f "$BACKUP_FILE" ]; then - cp "$KUBELET_DEFAULT_FILE" "$BACKUP_FILE" - log "Created backup at $BACKUP_FILE" -fi - -# Extract KUBELET_NODE_LABELS from the file -KUBELET_NODE_LABELS=$(grep "^KUBELET_NODE_LABELS=" "$KUBELET_DEFAULT_FILE" | cut -d'"' -f2) - -log "Current KUBELET_NODE_LABELS: $KUBELET_NODE_LABELS" - -# Process the labels if they exist -if [ -n "$KUBELET_NODE_LABELS" ]; then - NEW_LABELS="" - - # Split by comma and process each label - OLD_IFS="$IFS" - IFS=',' - for label in $KUBELET_NODE_LABELS; do - IFS="$OLD_IFS" - - # Skip empty labels - if [ -n "$label" ]; then - # Split by equal sign to get key and value - key="${label%%=*}" - value="${label#*=}" - - # Truncate value if longer than 63 characters - if [ ${#value} -gt 63 ]; then - value="${value:0:63}" - log "WARNING: Truncated value for key '$key' to 63 characters" - fi - - # Reassemble the label - processed_label="${key}=${value}" - - # Add to new labels list - if [ -n "$NEW_LABELS" ]; then - NEW_LABELS="${NEW_LABELS},${processed_label}" - else - NEW_LABELS="$processed_label" - fi - fi - IFS=',' - done - IFS="$OLD_IFS" - - KUBELET_NODE_LABELS="$NEW_LABELS" -fi - -log "Final KUBELET_NODE_LABELS: $KUBELET_NODE_LABELS" - -TEMP_FILE=$(mktemp) -trap 'rm -f "$TEMP_FILE"' EXIT - -while IFS= read -r line; do - case "$line" in - KUBELET_NODE_LABELS=*) - echo "KUBELET_NODE_LABELS=\"$KUBELET_NODE_LABELS\"" - ;; - *) - echo "$line" - ;; - esac -done < "$KUBELET_DEFAULT_FILE" > "$TEMP_FILE" - -mv "$TEMP_FILE" "$KUBELET_DEFAULT_FILE" - -chmod 644 "$KUBELET_DEFAULT_FILE" - -log "Successfully updated $KUBELET_DEFAULT_FILE" From 61f8534284035cd3df32b734fcba630c4980f25b Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Fri, 22 Aug 2025 13:54:53 -0500 Subject: [PATCH 23/87] Revert changes made for truncating node label --- vhdbuilder/packer/packer_source.sh | 3 --- vhdbuilder/packer/vhd-image-builder-arm64-gen2.json | 5 ----- 2 files changed, 8 deletions(-) diff --git a/vhdbuilder/packer/packer_source.sh 
b/vhdbuilder/packer/packer_source.sh index 003622de4e4..86e86411013 100644 --- a/vhdbuilder/packer/packer_source.sh +++ b/vhdbuilder/packer/packer_source.sh @@ -125,8 +125,6 @@ copyPackerFiles() { MEASURE_TLS_BOOTSTRAPPING_LATENCY_SERVICE_DEST=/etc/systemd/system/measure-tls-bootstrapping-latency.service VALIDATE_KUBELET_CREDENTIALS_SCRIPT_SRC=/home/packer/validate-kubelet-credentials.sh VALIDATE_KUBELET_CREDENTIALS_SCRIPT_DEST=/opt/azure/containers/validate-kubelet-credentials.sh - UPDATE_KUBELET_NODE_LABELS_SRC=/home/packer/update-kubelet-node-labels.sh - UPDATE_KUBELET_NODE_LABELS_DEST=/opt/azure/containers/update-kubelet-node-labels.sh RECONCILE_PRIVATE_HOSTS_SRC=/home/packer/reconcile-private-hosts.sh RECONCILE_PRIVATE_HOSTS_DEST=/opt/azure/containers/reconcilePrivateHosts.sh KUBELET_SERVICE_SRC=/home/packer/kubelet.service @@ -418,7 +416,6 @@ copyPackerFiles() { cpAndMode $MEASURE_TLS_BOOTSTRAPPING_LATENCY_SCRIPT_SRC $MEASURE_TLS_BOOTSTRAPPING_LATENCY_SCRIPT_DEST 755 cpAndMode $MEASURE_TLS_BOOTSTRAPPING_LATENCY_SERVICE_SRC $MEASURE_TLS_BOOTSTRAPPING_LATENCY_SERVICE_DEST 644 cpAndMode $VALIDATE_KUBELET_CREDENTIALS_SCRIPT_SRC $VALIDATE_KUBELET_CREDENTIALS_SCRIPT_DEST 755 - cpAndMode $UPDATE_KUBELET_NODE_LABELS_SRC $UPDATE_KUBELET_NODE_LABELS_DEST 755 cpAndMode $RECONCILE_PRIVATE_HOSTS_SRC $RECONCILE_PRIVATE_HOSTS_DEST 744 cpAndMode $SYSCTL_CONFIG_SRC $SYSCTL_CONFIG_DEST 644 cpAndMode $RSYSLOG_CONFIG_SRC $RSYSLOG_CONFIG_DEST 644 diff --git a/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json b/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json index 6ad379eae25..f04acee4b44 100644 --- a/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json +++ b/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json @@ -222,11 +222,6 @@ "source": "parts/linux/cloud-init/artifacts/validate-kubelet-credentials.sh", "destination": "/home/packer/validate-kubelet-credentials.sh" }, - { - "type": "file", - "source": "parts/linux/cloud-init/artifacts/update-kubelet-node-labels.sh", - 
"destination": "/home/packer/update-kubelet-node-labels.sh" - }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/cse_redact_cloud_config.py", From 1fc3f68d94ad0a18aad60fec2bb1fef6262586f5 Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Wed, 27 Aug 2025 13:08:33 -0500 Subject: [PATCH 24/87] Add k8s-device-plugin installation and enable with systemd --- vhdbuilder/packer/install-dependencies.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 64eb77f5ca0..7b4ae1172bc 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -743,10 +743,12 @@ if grep -q "GB200" <<< "$FEATURE_FLAGS"; then datacenter-gpu-manager-exporter \ datacenter-gpu-manager-4-core \ datacenter-gpu-manager-4-proprietary \ - libcap2-bin + libcap2-bin \ + k8s-device-plugin systemctl enable nvidia-dcgm systemctl enable nvidia-dcgm-exporter + systemctl enable nvidia-device-plugin fi fi From 20ff30bb3ee43ce46fbe1ed1c1ab081f9cb663b9 Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Thu, 28 Aug 2025 11:15:46 -0500 Subject: [PATCH 25/87] Add oneshot systemd service to write nvidia-specific containerd file --- .../ubuntu/containerd-nvidia-config.service | 12 ++++++++++++ .../artifacts/ubuntu/gb200-containerd-config.sh | 14 ++++++++++++++ vhdbuilder/packer/install-dependencies.sh | 1 + vhdbuilder/packer/packer_source.sh | 17 ++++++++++++++++- .../packer/vhd-image-builder-arm64-gen2.json | 10 ++++++++++ 5 files changed, 53 insertions(+), 1 deletion(-) create mode 100644 parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service create mode 100644 parts/linux/cloud-init/artifacts/ubuntu/gb200-containerd-config.sh diff --git a/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service b/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service new file mode 100644 index 00000000000..07723817e7b --- 
/dev/null +++ b/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service @@ -0,0 +1,12 @@ +[Unit] +Description=A oneshot service that overwrites the containerd configuration on GB200 SKUs +After=cloud-final.service +Before=containerd.service + +[Service] +Type=oneshot +ExecStart=/opt/azure/gb200-containerd-config.sh +RemainAfterExit=yes + +[Install] +WantedBy=multi-user.target \ No newline at end of file diff --git a/parts/linux/cloud-init/artifacts/ubuntu/gb200-containerd-config.sh b/parts/linux/cloud-init/artifacts/ubuntu/gb200-containerd-config.sh new file mode 100644 index 00000000000..2bdc6c77b37 --- /dev/null +++ b/parts/linux/cloud-init/artifacts/ubuntu/gb200-containerd-config.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +echo "Waiting for cloud-init to finish..." +cloud-init status --wait + +echo "Confirmed cloud-init finished, writing nvidia-specific containerd configuration..." +cp /opt/azure/containerd-nvidia.toml /etc/containerd/config.toml + +if [ $? -ne 0 ]; then + echo "Failed to write /etc/containerd/config.toml" + exit 1 +else + echo "Wrote /etc/containerd/config.toml" +fi \ No newline at end of file diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 7b4ae1172bc..e1d1b17bada 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -749,6 +749,7 @@ if grep -q "GB200" <<< "$FEATURE_FLAGS"; then systemctl enable nvidia-dcgm systemctl enable nvidia-dcgm-exporter systemctl enable nvidia-device-plugin + systemctl enable containerd-nvidia-config fi fi diff --git a/vhdbuilder/packer/packer_source.sh b/vhdbuilder/packer/packer_source.sh index 86e86411013..3e6a51bba17 100644 --- a/vhdbuilder/packer/packer_source.sh +++ b/vhdbuilder/packer/packer_source.sh @@ -522,9 +522,24 @@ copyPackerFiles() { NVIDIA_ASC_DEST=/etc/apt/keyrings/nvidia.pub cpAndMode $NVIDIA_ASC_SRC $NVIDIA_ASC_DEST 644 + # The following three files are required because RP does not 
currently + # recognize the GB200 SKU as a GPU VM. The containerd configuration requires + # modification when running GPU workloads. Since RP doesn't recognize the GB200 + # SKU correctly, and RP will overwrite /etc/containerd/config.toml, we need to + # create a oneshot systemd service that runs after cloud-init that will + # put the proper configuration into place. CONTAINERD_NVIDIA_TOML_SRC=/home/packer/containerd-nvidia.toml - CONTAINERD_NVIDIA_TOML_DEST=/etc/containerd/config.toml + CONTAINERD_NVIDIA_TOML_DEST=/opt/azure/containerd-nvidia.toml cpAndMode $CONTAINERD_NVIDIA_TOML_SRC $CONTAINERD_NVIDIA_TOML_DEST 644 + + CONTAINERD_GB200_CONFIG_SCRIPT_SRC=/home/packer/gb200-containerd-config.sh + CONTAINERD_GB200_CONFIG_SCRIPT_DEST=/opt/azure/gb200-containerd-config.sh + cpAndMode $CONTAINERD_GB200_CONFIG_SCRIPT_SRC $CONTAINERD_GB200_CONFIG_SCRIPT_DEST 755 + + SYSTEMD_CONTAINERD_CONFIG_SERVICE=/home/packer/containerd-nvidia-config.service + SYSTEMD_CONTAINERD_CONFIG_DEST=/etc/systemd/system/containerd-nvidia-config.service + cpAndMode $SYSTEMD_CONTAINERD_CONFIG_SERVICE $SYSTEMD_CONTAINERD_CONFIG_DEST 644 + fi fi diff --git a/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json b/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json index f04acee4b44..e0639fbd8a4 100644 --- a/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json +++ b/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json @@ -777,6 +777,16 @@ "source": "parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia.toml", "destination": "/home/packer/containerd-nvidia.toml" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service", + "destination": "/home/packer/containerd-nvidia-config.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ubuntu/gb200-containerd-config.sh", + "destination": "/home/packer/gb200-containerd-config.sh" + }, { "type": "shell", "inline": [ From f4d9f950dace8b2becfc350229510331a037b165 Mon Sep 17 
00:00:00 2001 From: Keith Pimm Date: Thu, 28 Aug 2025 15:07:16 -0500 Subject: [PATCH 26/87] Modify script and service file to fix dependency problem --- .../artifacts/ubuntu/containerd-nvidia-config.service | 4 ++-- .../cloud-init/artifacts/ubuntu/gb200-containerd-config.sh | 7 ++++++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service b/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service index 07723817e7b..2db740fd666 100644 --- a/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service +++ b/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service @@ -1,7 +1,7 @@ [Unit] Description=A oneshot service that overwrites the containerd configuration on GB200 SKUs After=cloud-final.service -Before=containerd.service +DefaultDependencies=no [Service] Type=oneshot @@ -9,4 +9,4 @@ ExecStart=/opt/azure/gb200-containerd-config.sh RemainAfterExit=yes [Install] -WantedBy=multi-user.target \ No newline at end of file +WantedBy=default.target \ No newline at end of file diff --git a/parts/linux/cloud-init/artifacts/ubuntu/gb200-containerd-config.sh b/parts/linux/cloud-init/artifacts/ubuntu/gb200-containerd-config.sh index 2bdc6c77b37..f01fe50a106 100644 --- a/parts/linux/cloud-init/artifacts/ubuntu/gb200-containerd-config.sh +++ b/parts/linux/cloud-init/artifacts/ubuntu/gb200-containerd-config.sh @@ -11,4 +11,9 @@ if [ $? 
-ne 0 ]; then exit 1 else echo "Wrote /etc/containerd/config.toml" -fi \ No newline at end of file +fi + +echo "Restarting containerd to apply new configuration" +systemctl restart containerd +systemctl is-active --quiet containerd +echo "Restarted containerd" \ No newline at end of file From b71d82b48e602fcf317b2acc492da4c9f3d007ca Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Thu, 28 Aug 2025 16:49:49 -0500 Subject: [PATCH 27/87] Require containerd service to run before replacing file --- .../cloud-init/artifacts/ubuntu/containerd-nvidia-config.service | 1 + 1 file changed, 1 insertion(+) diff --git a/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service b/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service index 2db740fd666..f09d6a7a5d3 100644 --- a/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service +++ b/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service @@ -1,6 +1,7 @@ [Unit] Description=A oneshot service that overwrites the containerd configuration on GB200 SKUs After=cloud-final.service +After=containerd.service DefaultDependencies=no [Service] From af4fed4710a47e211c4cc54e0797b70a464d53cb Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Fri, 29 Aug 2025 10:47:31 -0500 Subject: [PATCH 28/87] Change dependency for containerd-nvidia-config service --- .../artifacts/ubuntu/containerd-nvidia-config.service | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service b/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service index f09d6a7a5d3..9d8fa765e03 100644 --- a/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service +++ b/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service @@ -1,7 +1,9 @@ [Unit] Description=A oneshot service that overwrites the containerd configuration on GB200 SKUs -After=cloud-final.service -After=containerd.service +# 
The aks-log-collector service writes the containerd configuration before containerd starts +# The /etc/containerd directory doesn't appear to exist before service runs +After=aks-log-collector.service +Requires=aks-log-collector.service DefaultDependencies=no [Service] From 1cdc94136d5aba99450ceae08b6a4c5dd5dc6808 Mon Sep 17 00:00:00 2001 From: Alex Benn <62816975+abenn135@users.noreply.github.com> Date: Fri, 29 Aug 2025 13:24:35 -0400 Subject: [PATCH 29/87] feat: add blacklist nouveau drivers, udev char rules, some comments (#6895) --- vhdbuilder/packer/install-dependencies.sh | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index e1d1b17bada..5cb6e714bf1 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -723,6 +723,22 @@ fi if grep -q "GB200" <<< "$FEATURE_FLAGS"; then # The GB200 feature flag should only be set for arm64 and Ubuntu 24.04, but validate if [ ${UBUNTU_RELEASE} = "24.04" ] && [ ${CPU_ARCH} = "arm64" ]; then + # Need to replicate all functionality from github.com/azure/aks-gpu/install.sh. + # aks-gpu is designed to run at node boot/join time, whereas the GB200 VHD is set up + # to have all drivers installed at VHD build time. + # + # TODO(abenn135): move all GPU installation logic back into the AgentBaker repo, and + # invoke it where we need it, either at VHD build time or at node boot time (for example + # if we do not know at VHD build time whether we will want GPU drivers installed or not). + + # 1. Blacklist nouveau driver + cat << EOF >> /etc/modprobe.d/blacklist-nouveau.conf +blacklist nouveau +options nouveau modeset=0 +EOF + update-initramfs -u + + # 2. install GPU drivers # The open series driver is required for the GB200 platform. Dmesg output # will appear directing the reader away from the proprietary driver. 
The GPUs # are also not visible in nvidia-smi output with the proprietary drivers @@ -746,6 +762,13 @@ if grep -q "GB200" <<< "$FEATURE_FLAGS"; then libcap2-bin \ k8s-device-plugin + # 3. Add char device symlinks for NVIDIA devices + mkdir -p "$(dirname /lib/udev/rules.d/71-nvidia-dev-char.rules)" + cat << EOF >> /lib/udev/rules.d/71-nvidia-dev-char.rules +ACTION=="add", DEVPATH=="/bus/pci/drivers/nvidia", RUN+="/usr/bin/nvidia-ctk system create-dev-char-symlinks --create-all" +EOF + + # Now we are off-piste: enable DCGM, DCGM exporter, container device plugin, and the NVIDIA containerd config. systemctl enable nvidia-dcgm systemctl enable nvidia-dcgm-exporter systemctl enable nvidia-device-plugin From e97d0762d0a074b8cde97ce057f666b0b18f2b39 Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Fri, 29 Aug 2025 14:44:04 -0500 Subject: [PATCH 30/87] Move oneshot script to later in the boot process --- .../artifacts/ubuntu/containerd-nvidia-config.service | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service b/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service index 9d8fa765e03..8d1b443c44a 100644 --- a/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service +++ b/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service @@ -1,9 +1,6 @@ [Unit] Description=A oneshot service that overwrites the containerd configuration on GB200 SKUs -# The aks-log-collector service writes the containerd configuration before containerd starts -# The /etc/containerd directory doesn't appear to exist before service runs -After=aks-log-collector.service -Requires=aks-log-collector.service +After=multi-user.target DefaultDependencies=no [Service] @@ -12,4 +9,4 @@ ExecStart=/opt/azure/gb200-containerd-config.sh RemainAfterExit=yes [Install] -WantedBy=default.target \ No newline at end of file +WantedBy=graphical.target \ No newline at end of file From 
43f9c135852c60520510b7051b37b0e56b5d9c3a Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Tue, 2 Sep 2025 09:22:51 -0500 Subject: [PATCH 31/87] Remove DefaultDependencies=no, restore After=aks-log-collector.service --- .../artifacts/ubuntu/containerd-nvidia-config.service | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service b/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service index 8d1b443c44a..8dddabcb204 100644 --- a/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service +++ b/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service @@ -1,7 +1,6 @@ [Unit] Description=A oneshot service that overwrites the containerd configuration on GB200 SKUs -After=multi-user.target -DefaultDependencies=no +After=aks-log-collector.service [Service] Type=oneshot @@ -9,4 +8,4 @@ ExecStart=/opt/azure/gb200-containerd-config.sh RemainAfterExit=yes [Install] -WantedBy=graphical.target \ No newline at end of file +WantedBy=multi-user.target \ No newline at end of file From 17ffe0dc0baeed969b54a57bed4112de5b8db4e9 Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Tue, 2 Sep 2025 12:56:09 -0500 Subject: [PATCH 32/87] Make sure the nvidia config is written before the device plugin starts --- .../artifacts/ubuntu/containerd-nvidia-config.service | 2 ++ 1 file changed, 2 insertions(+) diff --git a/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service b/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service index 8dddabcb204..6de968eaae7 100644 --- a/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service +++ b/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service @@ -1,6 +1,8 @@ [Unit] Description=A oneshot service that overwrites the containerd configuration on GB200 SKUs +DefaultDependencies=no After=aks-log-collector.service +Before=nvidia-device-plugin.service [Service] 
Type=oneshot From 7c726938b376e59c3c5e88d82f16a1b17c6243c5 Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Wed, 3 Sep 2025 14:05:35 -0500 Subject: [PATCH 33/87] Attempt to write containerd configuration directly after making modifications to RP --- .../ubuntu/containerd-nvidia-config.service | 1 - vhdbuilder/packer/install-dependencies.sh | 2 +- vhdbuilder/packer/packer_source.sh | 14 +++++++------- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service b/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service index 6de968eaae7..2d9d14543e0 100644 --- a/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service +++ b/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service @@ -1,7 +1,6 @@ [Unit] Description=A oneshot service that overwrites the containerd configuration on GB200 SKUs DefaultDependencies=no -After=aks-log-collector.service Before=nvidia-device-plugin.service [Service] diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 5cb6e714bf1..61989a8cfd2 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -772,7 +772,7 @@ EOF systemctl enable nvidia-dcgm systemctl enable nvidia-dcgm-exporter systemctl enable nvidia-device-plugin - systemctl enable containerd-nvidia-config + #systemctl enable containerd-nvidia-config fi fi diff --git a/vhdbuilder/packer/packer_source.sh b/vhdbuilder/packer/packer_source.sh index 3e6a51bba17..c603ae53508 100644 --- a/vhdbuilder/packer/packer_source.sh +++ b/vhdbuilder/packer/packer_source.sh @@ -529,16 +529,16 @@ copyPackerFiles() { # create a oneshot systemd service that runs after cloud-init that will # put the proper configuration into place. 
CONTAINERD_NVIDIA_TOML_SRC=/home/packer/containerd-nvidia.toml - CONTAINERD_NVIDIA_TOML_DEST=/opt/azure/containerd-nvidia.toml + CONTAINERD_NVIDIA_TOML_DEST=/etc/containerd/config.toml cpAndMode $CONTAINERD_NVIDIA_TOML_SRC $CONTAINERD_NVIDIA_TOML_DEST 644 - CONTAINERD_GB200_CONFIG_SCRIPT_SRC=/home/packer/gb200-containerd-config.sh - CONTAINERD_GB200_CONFIG_SCRIPT_DEST=/opt/azure/gb200-containerd-config.sh - cpAndMode $CONTAINERD_GB200_CONFIG_SCRIPT_SRC $CONTAINERD_GB200_CONFIG_SCRIPT_DEST 755 + #CONTAINERD_GB200_CONFIG_SCRIPT_SRC=/home/packer/gb200-containerd-config.sh + #CONTAINERD_GB200_CONFIG_SCRIPT_DEST=/opt/azure/gb200-containerd-config.sh + #cpAndMode $CONTAINERD_GB200_CONFIG_SCRIPT_SRC $CONTAINERD_GB200_CONFIG_SCRIPT_DEST 755 - SYSTEMD_CONTAINERD_CONFIG_SERVICE=/home/packer/containerd-nvidia-config.service - SYSTEMD_CONTAINERD_CONFIG_DEST=/etc/systemd/system/containerd-nvidia-config.service - cpAndMode $SYSTEMD_CONTAINERD_CONFIG_SERVICE $SYSTEMD_CONTAINERD_CONFIG_DEST 644 + #SYSTEMD_CONTAINERD_CONFIG_SERVICE=/home/packer/containerd-nvidia-config.service + #SYSTEMD_CONTAINERD_CONFIG_DEST=/etc/systemd/system/containerd-nvidia-config.service + #cpAndMode $SYSTEMD_CONTAINERD_CONFIG_SERVICE $SYSTEMD_CONTAINERD_CONFIG_DEST 644 fi fi From 47c10010f2d4e9188946b2442c0a4fa1eef11892 Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Wed, 3 Sep 2025 17:25:01 -0500 Subject: [PATCH 34/87] Modify nvidia-device-plugin service to run after kubelet --- .../ubuntu/containerd-nvidia-config.service | 12 ------------ .../ubuntu/gb200-containerd-config.sh | 19 ------------------- vhdbuilder/packer/install-dependencies.sh | 8 +++++++- vhdbuilder/packer/packer_source.sh | 19 ++++--------------- 4 files changed, 11 insertions(+), 47 deletions(-) delete mode 100644 parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service delete mode 100644 parts/linux/cloud-init/artifacts/ubuntu/gb200-containerd-config.sh diff --git 
a/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service b/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service deleted file mode 100644 index 2d9d14543e0..00000000000 --- a/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service +++ /dev/null @@ -1,12 +0,0 @@ -[Unit] -Description=A oneshot service that overwrites the containerd configuration on GB200 SKUs -DefaultDependencies=no -Before=nvidia-device-plugin.service - -[Service] -Type=oneshot -ExecStart=/opt/azure/gb200-containerd-config.sh -RemainAfterExit=yes - -[Install] -WantedBy=multi-user.target \ No newline at end of file diff --git a/parts/linux/cloud-init/artifacts/ubuntu/gb200-containerd-config.sh b/parts/linux/cloud-init/artifacts/ubuntu/gb200-containerd-config.sh deleted file mode 100644 index f01fe50a106..00000000000 --- a/parts/linux/cloud-init/artifacts/ubuntu/gb200-containerd-config.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -echo "Waiting for cloud-init to finish..." -cloud-init status --wait - -echo "Confirmed cloud-init finished, writing nvidia-specific containerd configuration..." -cp /opt/azure/containerd-nvidia.toml /etc/containerd/config.toml - -if [ $? 
-ne 0 ]; then - echo "Failed to write /etc/containerd/config.toml" - exit 1 -else - echo "Wrote /etc/containerd/config.toml" -fi - -echo "Restarting containerd to apply new configuration" -systemctl restart containerd -systemctl is-active --quiet containerd -echo "Restarted containerd" \ No newline at end of file diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 61989a8cfd2..65b3f3f8874 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -766,13 +766,19 @@ EOF mkdir -p "$(dirname /lib/udev/rules.d/71-nvidia-dev-char.rules)" cat << EOF >> /lib/udev/rules.d/71-nvidia-dev-char.rules ACTION=="add", DEVPATH=="/bus/pci/drivers/nvidia", RUN+="/usr/bin/nvidia-ctk system create-dev-char-symlinks --create-all" +EOF + + # Create systemd drop-in to override nvidia-device-plugin dependencies + mkdir -p /etc/systemd/system/nvidia-device-plugin.service.d + cat << EOF > /etc/systemd/system/nvidia-device-plugin.service.d/override.conf +[Unit] +After=kubelet.service EOF # Now we are off-piste: enable DCGM, DCGM exporter, container device plugin, and the NVIDIA containerd config. systemctl enable nvidia-dcgm systemctl enable nvidia-dcgm-exporter systemctl enable nvidia-device-plugin - #systemctl enable containerd-nvidia-config fi fi diff --git a/vhdbuilder/packer/packer_source.sh b/vhdbuilder/packer/packer_source.sh index c603ae53508..bdfd234cf26 100644 --- a/vhdbuilder/packer/packer_source.sh +++ b/vhdbuilder/packer/packer_source.sh @@ -522,24 +522,13 @@ copyPackerFiles() { NVIDIA_ASC_DEST=/etc/apt/keyrings/nvidia.pub cpAndMode $NVIDIA_ASC_SRC $NVIDIA_ASC_DEST 644 - # The following three files are required because RP does not currently - # recognize the GB200 SKU as a GPU VM. The containerd configuration requires - # modification when running GPU workloads. 
Since RP doesn't recognize the GB200 - # SKU correctly, and RP will overwrite /etc/containerd/config.toml, we need to - # create a oneshot systemd service that runs after cloud-init that will - # put the proper configuration into place. + # This will only currently work if changes are applied to the subscription + # the node runs in. Otherwise, until the GB200 is recognized as a GPU SKU, + # it'll be overwritten by a containerd configuration that doesn't support + # running GPU workloads. CONTAINERD_NVIDIA_TOML_SRC=/home/packer/containerd-nvidia.toml CONTAINERD_NVIDIA_TOML_DEST=/etc/containerd/config.toml cpAndMode $CONTAINERD_NVIDIA_TOML_SRC $CONTAINERD_NVIDIA_TOML_DEST 644 - - #CONTAINERD_GB200_CONFIG_SCRIPT_SRC=/home/packer/gb200-containerd-config.sh - #CONTAINERD_GB200_CONFIG_SCRIPT_DEST=/opt/azure/gb200-containerd-config.sh - #cpAndMode $CONTAINERD_GB200_CONFIG_SCRIPT_SRC $CONTAINERD_GB200_CONFIG_SCRIPT_DEST 755 - - #SYSTEMD_CONTAINERD_CONFIG_SERVICE=/home/packer/containerd-nvidia-config.service - #SYSTEMD_CONTAINERD_CONFIG_DEST=/etc/systemd/system/containerd-nvidia-config.service - #cpAndMode $SYSTEMD_CONTAINERD_CONFIG_SERVICE $SYSTEMD_CONTAINERD_CONFIG_DEST 644 - fi fi From 25321070951f149e4d895b92ea96ca83814efd0c Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Wed, 3 Sep 2025 17:36:23 -0500 Subject: [PATCH 35/87] Remove reference to deleted files --- vhdbuilder/packer/vhd-image-builder-arm64-gen2.json | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json b/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json index e0639fbd8a4..f04acee4b44 100644 --- a/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json +++ b/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json @@ -777,16 +777,6 @@ "source": "parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia.toml", "destination": "/home/packer/containerd-nvidia.toml" }, - { - "type": "file", - "source": 
"parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia-config.service", - "destination": "/home/packer/containerd-nvidia-config.service" - }, - { - "type": "file", - "source": "parts/linux/cloud-init/artifacts/ubuntu/gb200-containerd-config.sh", - "destination": "/home/packer/gb200-containerd-config.sh" - }, { "type": "shell", "inline": [ From 8bc843fd38f4eef511fc6e22143e01acba9daf61 Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Wed, 3 Sep 2025 19:00:54 -0500 Subject: [PATCH 36/87] Add ExecStartPre to make sure the device-plugins directory exists --- vhdbuilder/packer/install-dependencies.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 65b3f3f8874..8eb63ab6b46 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -773,6 +773,9 @@ EOF cat << EOF > /etc/systemd/system/nvidia-device-plugin.service.d/override.conf [Unit] After=kubelet.service + +[Service] +ExecStartPre=-/usr/bin/mkdir -p /var/lib/kubelet/device-plugins EOF # Now we are off-piste: enable DCGM, DCGM exporter, container device plugin, and the NVIDIA containerd config. From 79f1f54bd8eef29657c11464b5f49d364493fac3 Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Thu, 4 Sep 2025 11:17:52 -0500 Subject: [PATCH 37/87] Remove OFED driver installation to troubleshoot VHD build failure around the NVIDIA driver --- vhdbuilder/packer/install-dependencies.sh | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 8eb63ab6b46..cde230d636d 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -743,16 +743,12 @@ EOF # will appear directing the reader away from the proprietary driver. 
The GPUs # are also not visible in nvidia-smi output with the proprietary drivers apt install -y \ - mlnx-ofed-kernel-dkms \ - mlnx-ofed-kernel-utils \ - mlnx-ofed-basic \ - rdma-core \ - ibverbs-utils \ - ibverbs-providers - - systemctl restart openibd - ofed_info -s - apt install -y \ + #mlnx-ofed-kernel-dkms \ + #mlnx-ofed-kernel-utils \ + #mlnx-ofed-basic \ + #rdma-core \ + #ibverbs-utils \ + #ibverbs-providers \ nvidia-driver-580-open \ cuda-toolkit-12 \ nvidia-container-toolkit \ @@ -782,6 +778,8 @@ EOF systemctl enable nvidia-dcgm systemctl enable nvidia-dcgm-exporter systemctl enable nvidia-device-plugin + #systemctl enable openibd + #ofed_info -s fi fi From d4f6af8e8e34e895bb1f6a63adfbe75346fdd502 Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Thu, 4 Sep 2025 12:05:47 -0500 Subject: [PATCH 38/87] Fix formatting for package installation --- vhdbuilder/packer/install-dependencies.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index cde230d636d..62e9153c67a 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -743,12 +743,6 @@ EOF # will appear directing the reader away from the proprietary driver. The GPUs # are also not visible in nvidia-smi output with the proprietary drivers apt install -y \ - #mlnx-ofed-kernel-dkms \ - #mlnx-ofed-kernel-utils \ - #mlnx-ofed-basic \ - #rdma-core \ - #ibverbs-utils \ - #ibverbs-providers \ nvidia-driver-580-open \ cuda-toolkit-12 \ nvidia-container-toolkit \ @@ -758,6 +752,12 @@ EOF libcap2-bin \ k8s-device-plugin + #mlnx-ofed-kernel-dkms \ + #mlnx-ofed-kernel-utils \ + #mlnx-ofed-basic \ + #rdma-core \ + #ibverbs-utils \ + #ibverbs-providers \ # 3. 
Add char device symlinks for NVIDIA devices mkdir -p "$(dirname /lib/udev/rules.d/71-nvidia-dev-char.rules)" cat << EOF >> /lib/udev/rules.d/71-nvidia-dev-char.rules From f7086946b0b2a2e24ee6a187935552d8bd12f32a Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Thu, 4 Sep 2025 12:31:18 -0500 Subject: [PATCH 39/87] Break up apt commands to install kernel-related packages separately --- vhdbuilder/packer/install-dependencies.sh | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 62e9153c67a..53b2c4115a0 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -743,7 +743,9 @@ EOF # will appear directing the reader away from the proprietary driver. The GPUs # are also not visible in nvidia-smi output with the proprietary drivers apt install -y \ - nvidia-driver-580-open \ + nvidia-driver-580-open + + apt install -y \ cuda-toolkit-12 \ nvidia-container-toolkit \ datacenter-gpu-manager-exporter \ @@ -752,12 +754,16 @@ EOF libcap2-bin \ k8s-device-plugin - #mlnx-ofed-kernel-dkms \ - #mlnx-ofed-kernel-utils \ - #mlnx-ofed-basic \ - #rdma-core \ - #ibverbs-utils \ - #ibverbs-providers \ + apt install -y \ + mlnx-ofed-kernel-dkms + + apt install -y \ + mlnx-ofed-kernel-utils \ + mlnx-ofed-basic \ + rdma-core \ + ibverbs-utils \ + ibverbs-providers + # 3. 
Add char device symlinks for NVIDIA devices mkdir -p "$(dirname /lib/udev/rules.d/71-nvidia-dev-char.rules)" cat << EOF >> /lib/udev/rules.d/71-nvidia-dev-char.rules From e8b0048196460f2fada2d4e91fec0219b4c4ad37 Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Thu, 4 Sep 2025 12:39:24 -0500 Subject: [PATCH 40/87] Enable openidb service --- vhdbuilder/packer/install-dependencies.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 53b2c4115a0..a361fbbae53 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -784,8 +784,8 @@ EOF systemctl enable nvidia-dcgm systemctl enable nvidia-dcgm-exporter systemctl enable nvidia-device-plugin - #systemctl enable openibd - #ofed_info -s + systemctl enable openibd + ofed_info -s fi fi From 952d0d8a042e354cb87e9a6e79967da6557ec8a4 Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Thu, 4 Sep 2025 15:20:13 -0500 Subject: [PATCH 41/87] Replace MLNX_OFED with DOCA-OFED --- parts/linux/cloud-init/artifacts/ubuntu/doca.list | 1 + .../ubuntu/{mellanox_mlnx_ofed.pub => doca.pub} | 0 .../artifacts/ubuntu/mellanox_mlnx_ofed.list | 9 --------- vhdbuilder/packer/install-dependencies.sh | 10 +--------- vhdbuilder/packer/packer_source.sh | 8 ++++++++ vhdbuilder/packer/vhd-image-builder-arm64-gen2.json | 8 ++++---- 6 files changed, 14 insertions(+), 22 deletions(-) create mode 100644 parts/linux/cloud-init/artifacts/ubuntu/doca.list rename parts/linux/cloud-init/artifacts/ubuntu/{mellanox_mlnx_ofed.pub => doca.pub} (100%) delete mode 100644 parts/linux/cloud-init/artifacts/ubuntu/mellanox_mlnx_ofed.list diff --git a/parts/linux/cloud-init/artifacts/ubuntu/doca.list b/parts/linux/cloud-init/artifacts/ubuntu/doca.list new file mode 100644 index 00000000000..6c491e76df4 --- /dev/null +++ b/parts/linux/cloud-init/artifacts/ubuntu/doca.list @@ -0,0 +1 @@ +deb [arch=arm64 
signed-by=/etc/apt/keyrings/doca.pub] https://linux.mellanox.com/public/repo/doca/3.1.0/ubuntu24.04/arm64-sbsa/ ./ \ No newline at end of file diff --git a/parts/linux/cloud-init/artifacts/ubuntu/mellanox_mlnx_ofed.pub b/parts/linux/cloud-init/artifacts/ubuntu/doca.pub similarity index 100% rename from parts/linux/cloud-init/artifacts/ubuntu/mellanox_mlnx_ofed.pub rename to parts/linux/cloud-init/artifacts/ubuntu/doca.pub diff --git a/parts/linux/cloud-init/artifacts/ubuntu/mellanox_mlnx_ofed.list b/parts/linux/cloud-init/artifacts/ubuntu/mellanox_mlnx_ofed.list deleted file mode 100644 index 44b427cab18..00000000000 --- a/parts/linux/cloud-init/artifacts/ubuntu/mellanox_mlnx_ofed.list +++ /dev/null @@ -1,9 +0,0 @@ -# -# Mellanox Technologies Ltd. public repository configuration file. -# For more information, refer to http://linux.mellanox.com -# -# For future reference: -# https://network.nvidia.com/support/mlnx-ofed-public-repository/ -# https://www.mellanox.com/downloads/ofed/RPM-GPG-KEY-Mellanox -# [mlnx_ofed_24.10-1.1.4.0_base] -deb [signed-by=/etc/apt/keyrings/mellanox_mlnx_ofed.pub] http://linux.mellanox.com/public/repo/mlnx_ofed/24.10-1.1.4.0/ubuntu24.04/arm64 ./ \ No newline at end of file diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index a361fbbae53..27b1f9aba8b 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -755,14 +755,7 @@ EOF k8s-device-plugin apt install -y \ - mlnx-ofed-kernel-dkms - - apt install -y \ - mlnx-ofed-kernel-utils \ - mlnx-ofed-basic \ - rdma-core \ - ibverbs-utils \ - ibverbs-providers + doca-ofed # 3. 
Add char device symlinks for NVIDIA devices mkdir -p "$(dirname /lib/udev/rules.d/71-nvidia-dev-char.rules)" @@ -785,7 +778,6 @@ EOF systemctl enable nvidia-dcgm-exporter systemctl enable nvidia-device-plugin systemctl enable openibd - ofed_info -s fi fi diff --git a/vhdbuilder/packer/packer_source.sh b/vhdbuilder/packer/packer_source.sh index bdfd234cf26..9579c096a7d 100644 --- a/vhdbuilder/packer/packer_source.sh +++ b/vhdbuilder/packer/packer_source.sh @@ -529,6 +529,14 @@ copyPackerFiles() { CONTAINERD_NVIDIA_TOML_SRC=/home/packer/containerd-nvidia.toml CONTAINERD_NVIDIA_TOML_DEST=/etc/containerd/config.toml cpAndMode $CONTAINERD_NVIDIA_TOML_SRC $CONTAINERD_NVIDIA_TOML_DEST 644 + + DOCA_LIST_SRC=/home/packer/doca.list + DOCA_LIST_DEST=/etc/apt/sources.list.d/doca.list + cpAndMode $DOCA_LIST_SRC $DOCA_LIST_DEST 644 + + DOCA_PUB_SRC=/home/packer/doca.pub + DOCA_PUB_DEST=/etc/apt/keyrings/doca.pub + cpAndMode $DOCA_PUB_SRC $DOCA_PUB_DEST 644 fi fi diff --git a/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json b/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json index f04acee4b44..ff9de8a99c9 100644 --- a/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json +++ b/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json @@ -754,13 +754,13 @@ }, { "type": "file", - "source": "parts/linux/cloud-init/artifacts/ubuntu/mellanox_mlnx_ofed.list", - "destination": "/home/packer/mellanox_mlnx_ofed.list" + "source": "parts/linux/cloud-init/artifacts/ubuntu/doca.list", + "destination": "/home/packer/doca.list" }, { "type": "file", - "source": "parts/linux/cloud-init/artifacts/ubuntu/mellanox_mlnx_ofed.pub", - "destination": "/home/packer/mellanox_mlnx_ofed.pub" + "source": "parts/linux/cloud-init/artifacts/ubuntu/doca.pub", + "destination": "/home/packer/doca.pub" }, { "type": "file", From 084256b833956eefc7b6ab79d30556aa48ab613f Mon Sep 17 00:00:00 2001 From: Keith <153014933+keith-ms@users.noreply.github.com> Date: Mon, 22 Sep 2025 11:51:32 -0500 Subject: [PATCH 42/87] Keith 
ms/add nvidia module parameters (#7071) Co-authored-by: Cameron Meissner --- .../linux/cloud-init/artifacts/ubuntu/containerd-nvidia.toml | 2 +- .../artifacts/ubuntu/modprobe-nvidia-parameters.conf | 4 ++++ vhdbuilder/packer/install-dependencies.sh | 5 ++++- vhdbuilder/packer/packer_source.sh | 5 +++++ vhdbuilder/packer/vhd-image-builder-arm64-gen2.json | 5 +++++ 5 files changed, 19 insertions(+), 2 deletions(-) create mode 100644 parts/linux/cloud-init/artifacts/ubuntu/modprobe-nvidia-parameters.conf diff --git a/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia.toml b/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia.toml index 2366562134d..4c1cf207cbe 100644 --- a/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia.toml +++ b/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia.toml @@ -10,7 +10,7 @@ version = 2 sandbox_image = "mcr.microsoft.com/oss/kubernetes/pause:3.6" [plugins."io.containerd.grpc.v1.cri".containerd] - default_runtime_name = "nvidia" + default_runtime_name = "runc" [plugins."io.containerd.grpc.v1.cri".containerd.runtimes] diff --git a/parts/linux/cloud-init/artifacts/ubuntu/modprobe-nvidia-parameters.conf b/parts/linux/cloud-init/artifacts/ubuntu/modprobe-nvidia-parameters.conf new file mode 100644 index 00000000000..0b513ace710 --- /dev/null +++ b/parts/linux/cloud-init/artifacts/ubuntu/modprobe-nvidia-parameters.conf @@ -0,0 +1,4 @@ +options nvidia NVreg_RestrictProfilingToAdminUsers=0 +options nvidia NVreg_CreateImexChannel0=1 +options nvidia NVreg_CoherentGPUMemoryMode=driver +options nvidia NVreg_RegistryDwords="RMBug5172204War=4" \ No newline at end of file diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 27b1f9aba8b..2590fdcff69 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -743,7 +743,7 @@ EOF # will appear directing the reader away from the proprietary driver. 
The GPUs # are also not visible in nvidia-smi output with the proprietary drivers apt install -y \ - nvidia-driver-580-open + nvidia-driver-580-open=580.82.07-0ubuntu1 apt install -y \ cuda-toolkit-12 \ @@ -754,6 +754,9 @@ EOF libcap2-bin \ k8s-device-plugin + apt install -y \ + nvidia-imex=580.82.07-1 + apt install -y \ doca-ofed diff --git a/vhdbuilder/packer/packer_source.sh b/vhdbuilder/packer/packer_source.sh index 9579c096a7d..ec6375c9497 100644 --- a/vhdbuilder/packer/packer_source.sh +++ b/vhdbuilder/packer/packer_source.sh @@ -537,6 +537,11 @@ copyPackerFiles() { DOCA_PUB_SRC=/home/packer/doca.pub DOCA_PUB_DEST=/etc/apt/keyrings/doca.pub cpAndMode $DOCA_PUB_SRC $DOCA_PUB_DEST 644 + + NVIDIA_MODPROBE_PARAMETERS_SRC=/home/packer/modprobe-nvidia-parameters.conf + NVIDIA_MODPROBE_PARAMETERS_DEST=/etc/modprobe.d/nvidia.conf + cpAndMode $NVIDIA_MODPROBE_PARAMETERS_SRC $NVIDIA_MODPROBE_PARAMETERS_DEST 644 + fi fi diff --git a/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json b/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json index ff9de8a99c9..62558f3891f 100644 --- a/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json +++ b/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json @@ -777,6 +777,11 @@ "source": "parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia.toml", "destination": "/home/packer/containerd-nvidia.toml" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ubuntu/modprobe-nvidia-parameters.conf", + "destination": "/home/packer/modprobe-nvidia-parameters.conf" + }, { "type": "shell", "inline": [ From ec33d3d4557cdb20f42ccdf18a72c82763f3680a Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Tue, 7 Oct 2025 13:30:55 -0500 Subject: [PATCH 43/87] Remove reference to ARM64 check in GB200 image, only set 60GB volume size when the GB200 feature flag is present --- vhdbuilder/packer/install-dependencies.sh | 2 +- vhdbuilder/packer/packer_source.sh | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git 
a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 2590fdcff69..c3de381d7a3 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -722,7 +722,7 @@ fi if grep -q "GB200" <<< "$FEATURE_FLAGS"; then # The GB200 feature flag should only be set for arm64 and Ubuntu 24.04, but validate - if [ ${UBUNTU_RELEASE} = "24.04" ] && [ ${CPU_ARCH} = "arm64" ]; then + if [ ${UBUNTU_RELEASE} = "24.04" ]; then # Need to replicate all functionality from github.com/azure/aks-gpu/install.sh. # aks-gpu is designed to run at node boot/join time, whereas the GB200 VHD is set up # to have all drivers installed at VHD build time. diff --git a/vhdbuilder/packer/packer_source.sh b/vhdbuilder/packer/packer_source.sh index ec6375c9497..2e3597a3a4f 100644 --- a/vhdbuilder/packer/packer_source.sh +++ b/vhdbuilder/packer/packer_source.sh @@ -504,8 +504,7 @@ copyPackerFiles() { fi if grep -q "GB200" <<< "$FEATURE_FLAGS"; then - # Only applicable to Ubuntu 24.04 and ARM64 - if [ ${UBUNTU_RELEASE} = "24.04" ] && [ ${CPU_ARCH} = "arm64" ]; then + if [ ${UBUNTU_RELEASE} = "24.04" ]; then MELLANOX_LIST_SRC=/home/packer/mellanox_mlnx_ofed.list MELLANOX_LIST_DEST=/etc/apt/sources.list.d/mellanox_mlnx_ofed.list cpAndMode $MELLANOX_LIST_SRC $MELLANOX_LIST_DEST 644 From 3da96414e84c5ea37f320b8b0806be52eb98ee29 Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Tue, 7 Oct 2025 16:27:03 -0500 Subject: [PATCH 44/87] Split out GB200 build into new packer file --- e2e/vmss.go | 2 +- packer.mk | 5 + .../packer/vhd-image-builder-arm64-gb200.json | 755 ++++++++++++++++++ .../packer/vhd-image-builder-arm64-gen2.json | 32 +- 4 files changed, 762 insertions(+), 32 deletions(-) create mode 100644 vhdbuilder/packer/vhd-image-builder-arm64-gb200.json diff --git a/e2e/vmss.go b/e2e/vmss.go index d85c825c4b4..4abf14f8931 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -1153,7 +1153,7 @@ func getBaseVMSSModel(s *Scenario, customData, 
cseCmd string) armcompute.Virtual StorageProfile: &armcompute.VirtualMachineScaleSetStorageProfile{ OSDisk: &armcompute.VirtualMachineScaleSetOSDisk{ CreateOption: to.Ptr(armcompute.DiskCreateOptionTypesFromImage), - DiskSizeGB: to.Ptr(int32(60)), + DiskSizeGB: to.Ptr(int32(50)), OSType: to.Ptr(armcompute.OperatingSystemTypesLinux), Caching: to.Ptr(armcompute.CachingTypesReadOnly), DiffDiskSettings: &armcompute.DiffDiskSettings{ diff --git a/packer.mk b/packer.mk index 8b61ade6561..a783a20a1f5 100755 --- a/packer.mk +++ b/packer.mk @@ -12,8 +12,13 @@ build-packer: setup-golang generate-prefetch-scripts build-image-fetcher build-a ifeq (${ARCHITECTURE},ARM64) @echo "${MODE}: Building with Hyper-v generation 2 ARM64 VM" ifeq (${OS_SKU},Ubuntu) +ifeq ($(findstring GB200,$(FEATURE_FLAGS)),GB200) + @echo "Using packer template file vhd-image-builder-arm64-gb200.json" + @packer build -timestamp-ui -var-file=vhdbuilder/packer/settings.json vhdbuilder/packer/vhd-image-builder-arm64-gb200.json +else @echo "Using packer template file vhd-image-builder-arm64-gen2.json" @packer build -timestamp-ui -var-file=vhdbuilder/packer/settings.json vhdbuilder/packer/vhd-image-builder-arm64-gen2.json +endif else ifeq (${OS_SKU},CBLMariner) @echo "Using packer template file vhd-image-builder-mariner-arm64.json" @packer build -timestamp-ui -var-file=vhdbuilder/packer/settings.json vhdbuilder/packer/vhd-image-builder-mariner-arm64.json diff --git a/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json b/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json new file mode 100644 index 00000000000..94e8cbce0de --- /dev/null +++ b/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json @@ -0,0 +1,755 @@ +{ + "variables": { + "subscription_id": "{{env `AZURE_SUBSCRIPTION_ID`}}", + "gallery_subscription_id": "{{user `gallery_subscription_id`}}", + "location": "{{env `PACKER_BUILD_LOCATION`}}", + "vm_size": "{{env `AZURE_VM_SIZE`}}", + "build_definition_name": "{{env `BUILD_DEFINITION_NAME`}}", + 
"build_number": "{{env `BUILD_NUMBER`}}", + "build_id": "{{env `BUILD_ID`}}", + "commit": "{{env `GIT_VERSION`}}", + "feature_flags": "{{env `FEATURE_FLAGS`}}", + "image_version": "{{env `IMAGE_VERSION`}}", + "os_version": "{{env `OS_VERSION`}}", + "sku_name": "{{env `SKU_NAME`}}", + "hyperv_generation": "{{env `HYPERV_GENERATION`}}", + "sig_gallery_name": "{{env `SIG_GALLERY_NAME`}}", + "sig_image_name": "{{env `SIG_IMAGE_NAME`}}", + "sig_image_version": "{{env `SIG_IMAGE_VERSION`}}", + "container_runtime": "{{env `CONTAINER_RUNTIME`}}", + "teleportd_plugin_download_url": "{{env `TELEPORTD_PLUGIN_DOWNLOAD_URL`}}", + "captured_sig_version": "{{env `CAPTURED_SIG_VERSION`}}", + "enable_fips": "{{env `ENABLE_FIPS`}}", + "img_publisher": "{{env `IMG_PUBLISHER`}}", + "img_offer": "{{env `IMG_OFFER`}}", + "img_sku": "{{env `IMG_SKU`}}", + "img_version": "{{env `IMG_VERSION`}}", + "vnet_resource_group_name": "{{env `VNET_RESOURCE_GROUP_NAME`}}", + "vnet_name": "{{env `VNET_NAME`}}", + "subnet_name": "{{env `SUBNET_NAME`}}", + "private_packages_url": "{{env `PRIVATE_PACKAGES_URL`}}", + "branch": "{{env `BRANCH`}}", + "vhd_build_timestamp": "{{user `VHD_BUILD_TIMESTAMP`}}" + }, + "builders": [ + { + "type": "azure-arm", + "subscription_id": "{{user `subscription_id`}}", + "virtual_network_resource_group_name": "{{user `vnet_resource_group_name`}}", + "virtual_network_name": "{{user `vnet_name`}}", + "virtual_network_subnet_name": "{{user `subnet_name`}}", + "ssh_read_write_timeout": "5m", + "os_type": "Linux", + "os_disk_size_gb": 60, + "image_publisher": "{{user `img_publisher`}}", + "image_offer": "{{user `img_offer`}}", + "image_sku": "{{user `img_sku`}}", + "image_version": "{{user `img_version`}}", + "azure_tags": { + "buildDefinitionName": "{{user `build_definition_name`}}", + "buildNumber": "{{user `build_number`}}", + "buildId": "{{user `build_id`}}", + "SkipLinuxAzSecPack": "true", + "os": "Linux", + "now": "{{user `create_time`}}", + "createdBy":
"aks-vhd-pipeline", + "image_sku": "{{user `img_sku`}}", + "branch": "{{user `branch`}}" + }, + "location": "{{user `location`}}", + "vm_size": "{{user `vm_size`}}", + "polling_duration_timeout": "1h", + "managed_image_storage_account_type": "Premium_LRS", + "shared_image_gallery_destination": { + "subscription": "{{user `gallery_subscription_id`}}", + "resource_group": "{{user `resource_group_name`}}", + "gallery_name": "{{user `sig_gallery_name`}}", + "image_name": "{{user `sig_image_name`}}", + "image_version": "{{user `captured_sig_version`}}", + "replication_regions": [ + "{{user `location`}}" + ] + }, + "user_assigned_managed_identities": "{{user `msi_resource_strings`}}" + } + ], + "provisioners": [ + { + "type": "shell", + "inline": [ + "sudo mkdir -p /opt/azure/containers", + "sudo mkdir -p /opt/scripts", + "sudo mkdir -p /opt/certs" + ] + }, + { + "type": "file", + "source": "vhdbuilder/lister/bin/lister", + "destination": "/home/packer/lister" + }, + { + "type": "file", + "source": "aks-node-controller/bin/aks-node-controller-linux-arm64", + "destination": "/home/packer/aks-node-controller" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-node-controller.service", + "destination": "/home/packer/aks-node-controller.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/cloud-init-status-check.sh", + "destination": "/home/packer/cloud-init-status-check.sh" + }, + { + "type": "file", + "source": "vhdbuilder/packer/prefetch.sh", + "destination": "/home/packer/prefetch.sh" + }, + { + "type": "file", + "source": "vhdbuilder/packer/cleanup-vhd.sh", + "destination": "/home/packer/cleanup-vhd.sh" + }, + { + "type": "file", + "source": "vhdbuilder/packer/packer_source.sh", + "destination": "/home/packer/packer_source.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/cse_install.sh", + "destination": "/home/packer/provision_installs.sh" + }, + { + "type": "file", + "source": 
"parts/linux/cloud-init/artifacts/ubuntu/cse_install_ubuntu.sh", + "destination": "/home/packer/provision_installs_distro.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/cse_helpers.sh", + "destination": "/home/packer/provision_source.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/cse_benchmark_functions.sh", + "destination": "/home/packer/provision_source_benchmarks.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ubuntu/cse_helpers_ubuntu.sh", + "destination": "/home/packer/provision_source_distro.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/cse_config.sh", + "destination": "/home/packer/provision_configs.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/cse_main.sh", + "destination": "/home/packer/provision.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/cse_start.sh", + "destination": "/home/packer/provision_start.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/containerd_exec_start.conf", + "destination": "/home/packer/containerd_exec_start.conf" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/kubelet.service", + "destination": "/home/packer/kubelet.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/secure-tls-bootstrap.service", + "destination": "/home/packer/secure-tls-bootstrap.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/reconcile-private-hosts.sh", + "destination": "/home/packer/reconcile-private-hosts.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/block_wireserver.sh", + "destination": "/home/packer/block_wireserver.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ensure_imds_restriction.sh", + "destination": "/home/packer/ensure_imds_restriction.sh" + }, + { + "type": "file", + "source": 
"parts/linux/cloud-init/artifacts/measure-tls-bootstrapping-latency.sh", + "destination": "/home/packer/measure-tls-bootstrapping-latency.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/measure-tls-bootstrapping-latency.service", + "destination": "/home/packer/measure-tls-bootstrapping-latency.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/validate-kubelet-credentials.sh", + "destination": "/home/packer/validate-kubelet-credentials.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/cse_redact_cloud_config.py", + "destination": "/home/packer/cse_redact_cloud_config.py" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/cse_send_logs.py", + "destination": "/home/packer/cse_send_logs.py" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh", + "destination": "/home/packer/init-aks-custom-cloud.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/reconcile-private-hosts.service", + "destination": "/home/packer/reconcile-private-hosts.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/mig-partition.service", + "destination": "/home/packer/mig-partition.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/bind-mount.sh", + "destination": "/home/packer/bind-mount.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/bind-mount.service", + "destination": "/home/packer/bind-mount.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/enable-dhcpv6.sh", + "destination": "/home/packer/enable-dhcpv6.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/dhcpv6.service", + "destination": "/home/packer/dhcpv6.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/sync-container-logs.sh", + "destination": "/home/packer/sync-container-logs.sh" + 
}, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/sync-container-logs.service", + "destination": "/home/packer/sync-container-logs.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/crictl.yaml", + "destination": "/home/packer/crictl.yaml" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ensure-no-dup.sh", + "destination": "/home/packer/ensure-no-dup.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ensure-no-dup.service", + "destination": "/home/packer/ensure-no-dup.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/teleportd.service", + "destination": "/home/packer/teleportd.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/setup-custom-search-domains.sh", + "destination": "/home/packer/setup-custom-search-domains.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ubuntu/ubuntu-snapshot-update.sh", + "destination": "/home/packer/ubuntu-snapshot-update.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ubuntu/snapshot-update.service", + "destination": "/home/packer/snapshot-update.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ubuntu/snapshot-update.timer", + "destination": "/home/packer/snapshot-update.timer" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/cis.sh", + "destination": "/home/packer/cis.sh" + }, + { + "type": "file", + "source": "vhdbuilder/scripts/linux/tool_installs.sh", + "destination": "/home/packer/tool_installs.sh" + }, + { + "type": "file", + "source": "vhdbuilder/scripts/linux/ubuntu/tool_installs_ubuntu.sh", + "destination": "/home/packer/tool_installs_distro.sh" + }, + { + "type": "file", + "source": "vhdbuilder/packer/pre-install-dependencies.sh", + "destination": "/home/packer/pre-install-dependencies.sh" + }, + { + "type": "file", + "source": 
"vhdbuilder/packer/install-dependencies.sh", + "destination": "/home/packer/install-dependencies.sh" + }, + { + "type": "file", + "source": "vhdbuilder/packer/post-install-dependencies.sh", + "destination": "/home/packer/post-install-dependencies.sh" + }, + { + "type": "file", + "source": "parts/common/components.json", + "destination": "/home/packer/components.json" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/manifest.json", + "destination": "/home/packer/manifest.json" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/sysctl-d-60-CIS.conf", + "destination": "/home/packer/sysctl-d-60-CIS.conf" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/sshd_config", + "destination": "/home/packer/sshd_config" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/sshd_config_1604", + "destination": "/home/packer/sshd_config_1604" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/rsyslog-d-60-CIS.conf", + "destination": "/home/packer/rsyslog-d-60-CIS.conf" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/logrotate-d-rsyslog-CIS.conf", + "destination": "/home/packer/logrotate-d-rsyslog-CIS.conf" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/etc-issue", + "destination": "/home/packer/etc-issue" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/etc-issue.net", + "destination": "/home/packer/etc-issue.net" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/modprobe-CIS.conf", + "destination": "/home/packer/modprobe-CIS.conf" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/pwquality-CIS.conf", + "destination": "/home/packer/pwquality-CIS.conf" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/pam-d-su", + "destination": "/home/packer/pam-d-su" + }, + { + "type": "file", + "source": 
"parts/linux/cloud-init/artifacts/pam-d-common-auth", + "destination": "/home/packer/pam-d-common-auth" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/pam-d-common-auth-2204", + "destination": "/home/packer/pam-d-common-auth-2204" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/pam-d-common-password", + "destination": "/home/packer/pam-d-common-password" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/profile-d-cis.sh", + "destination": "/home/packer/profile-d-cis.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/disk_queue.service", + "destination": "/home/packer/disk_queue.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/cgroup-memory-telemetry.sh", + "destination": "/home/packer/cgroup-memory-telemetry.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/cgroup-memory-telemetry.service", + "destination": "/home/packer/cgroup-memory-telemetry.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/cgroup-memory-telemetry.timer", + "destination": "/home/packer/cgroup-memory-telemetry.timer" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/cgroup-pressure-telemetry.sh", + "destination": "/home/packer/cgroup-pressure-telemetry.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/cgroup-pressure-telemetry.service", + "destination": "/home/packer/cgroup-pressure-telemetry.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/cgroup-pressure-telemetry.timer", + "destination": "/home/packer/cgroup-pressure-telemetry.timer" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/update_certs.service", + "destination": "/home/packer/update_certs.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/update_certs.path", + "destination": "/home/packer/update_certs.path" + 
}, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/update_certs.sh", + "destination": "/home/packer/update_certs.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ci-syslog-watcher.path", + "destination": "/home/packer/ci-syslog-watcher.path" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ci-syslog-watcher.service", + "destination": "/home/packer/ci-syslog-watcher.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ci-syslog-watcher.sh", + "destination": "/home/packer/ci-syslog-watcher.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-diagnostic.py", + "destination": "/home/packer/aks-diagnostic.py" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-log-collector.sh", + "destination": "/home/packer/aks-log-collector.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-log-collector-send.py", + "destination": "/home/packer/aks-log-collector-send.py" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-log-collector.service", + "destination": "/home/packer/aks-log-collector.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-log-collector.slice", + "destination": "/home/packer/aks-log-collector.slice" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-log-collector.timer", + "destination": "/home/packer/aks-log-collector.timer" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-check-network.sh", + "destination": "/home/packer/aks-check-network.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-check-network.service", + "destination": "/home/packer/aks-check-network.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-logrotate.sh", + "destination": "/home/packer/logrotate.sh" + }, + { + "type": "file", + 
"source": "parts/linux/cloud-init/artifacts/aks-logrotate.service", + "destination": "/home/packer/logrotate.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-logrotate.timer", + "destination": "/home/packer/logrotate.timer" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-logrotate-override.conf", + "destination": "/home/packer/override.conf" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-rsyslog", + "destination": "/home/packer/rsyslog" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ipv6_nftables", + "destination": "/home/packer/ipv6_nftables" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ipv6_nftables.service", + "destination": "/home/packer/ipv6_nftables.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ipv6_nftables.sh", + "destination": "/home/packer/ipv6_nftables.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/apt-preferences", + "destination": "/home/packer/apt-preferences" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/kms.service", + "destination": "/home/packer/kms.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/mig-partition.sh", + "destination": "/home/packer/mig-partition.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/docker_clear_mount_propagation_flags.conf", + "destination": "/home/packer/docker_clear_mount_propagation_flags.conf" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/nvidia-modprobe.service", + "destination": "/home/packer/nvidia-modprobe.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/nvidia-docker-daemon.json", + "destination": "/home/packer/nvidia-docker-daemon.json" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/pam-d-common-auth", + "destination": 
"/home/packer/pam-d-common-auth" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/pam-d-common-password", + "destination": "/home/packer/pam-d-common-password" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/pam-d-su", + "destination": "/home/packer/pam-d-su" + }, + { + "type": "file", + "source": "vhdbuilder/notice.txt", + "destination": "/home/packer/NOTICE.txt" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/localdns.sh", + "destination": "/home/packer/localdns.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/localdns.service", + "destination": "/home/packer/localdns.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/localdns-delegate.conf", + "destination": "/home/packer/localdns-delegate.conf" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/10_azure_nvidia", + "destination": "/home/packer/10_azure_nvidia" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/51-azure-nvidia.cfg", + "destination": "/home/packer/51-azure-nvidia.cfg" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ubuntu/doca.list", + "destination": "/home/packer/doca.list" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ubuntu/doca.pub", + "destination": "/home/packer/doca.pub" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ubuntu/nvidia-2404.list", + "destination": "/home/packer/nvidia-2404.list" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ubuntu/nvidia.pub", + "destination": "/home/packer/nvidia.pub" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia.toml", + "destination": "/home/packer/containerd-nvidia.toml" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ubuntu/modprobe-nvidia-parameters.conf", + "destination": 
"/home/packer/modprobe-nvidia-parameters.conf" + }, + { + "type": "shell", + "inline": [ + "sudo FEATURE_FLAGS={{user `feature_flags`}} BUILD_NUMBER={{user `build_number`}} BUILD_ID={{user `build_id`}} COMMIT={{user `commit`}} HYPERV_GENERATION={{user `hyperv_generation`}} CONTAINER_RUNTIME={{user `container_runtime`}} TELEPORTD_PLUGIN_DOWNLOAD_URL={{user `teleportd_plugin_download_url`}} ENABLE_FIPS={{user `enable_fips`}} IMG_SKU={{user `img_sku`}} VHD_BUILD_TIMESTAMP={{user `vhd_build_timestamp`}} /bin/bash -ux /home/packer/pre-install-dependencies.sh" + ] + }, + { + "type": "shell", + "inline": "sudo reboot", + "expect_disconnect": true, + "skip_clean": true, + "pause_after": "60s" + }, + { + "type": "shell", + "inline": [ + "sudo FEATURE_FLAGS={{user `feature_flags`}} BUILD_NUMBER={{user `build_number`}} BUILD_ID={{user `build_id`}} COMMIT={{user `commit`}} HYPERV_GENERATION={{user `hyperv_generation`}} CONTAINER_RUNTIME={{user `container_runtime`}} TELEPORTD_PLUGIN_DOWNLOAD_URL={{user `teleportd_plugin_download_url`}} ENABLE_FIPS={{user `enable_fips`}} IMG_SKU={{user `img_sku`}} PRIVATE_PACKAGES_URL={{user `private_packages_url`}} VHD_BUILD_TIMESTAMP={{user `vhd_build_timestamp`}} /bin/bash -ux /home/packer/install-dependencies.sh" + ] + }, + { + "type": "file", + "direction": "download", + "source": "/var/log/bcc_installation.log", + "destination": "bcc-tools-installation.log" + }, + { + "type": "shell", + "inline": [ + "sudo rm /var/log/bcc_installation.log" + ] + }, + { + "type": "shell", + "inline": "sudo reboot", + "expect_disconnect": true, + "skip_clean": true, + "pause_after": "60s" + }, + { + "type": "shell", + "inline": [ + "sudo FEATURE_FLAGS={{user `feature_flags`}} BUILD_NUMBER={{user `build_number`}} BUILD_ID={{user `build_id`}} COMMIT={{user `commit`}} HYPERV_GENERATION={{user `hyperv_generation`}} CONTAINER_RUNTIME={{user `container_runtime`}} TELEPORTD_PLUGIN_DOWNLOAD_URL={{user `teleportd_plugin_download_url`}} ENABLE_FIPS={{user 
`enable_fips`}} IMG_SKU={{user `img_sku`}} /bin/bash -ux /home/packer/post-install-dependencies.sh" + ] + }, + { + "type": "file", + "source": "vhdbuilder/packer/list-images.sh", + "destination": "/home/packer/list-images.sh" + }, + { + "type": "shell", + "inline": [ + "sudo SKU_NAME={{user `sku_name`}} IMAGE_VERSION={{user `image_version`}} CONTAINER_RUNTIME={{user `container_runtime`}} /bin/bash -ux /home/packer/list-images.sh" + ] + }, + { + "type": "file", + "direction": "download", + "source": "/opt/azure/containers/image-bom.json", + "destination": "image-bom.json" + }, + { + "type": "file", + "direction": "download", + "source": "/opt/azure/vhd-install.complete", + "destination": "release-notes.txt" + }, + { + "type": "file", + "direction": "download", + "source": "/opt/azure/vhd-build-performance-data.json", + "destination": "vhd-build-performance-data.json" + }, + { + "type": "file", + "direction": "download", + "source": "/opt/azure/vhd-grid-compatibility-data.json", + "destination": "vhd-grid-compatibility-data.json" + }, + { + "type": "shell", + "inline": [ + "sudo rm /opt/azure/vhd-build-performance-data.json", + "sudo rm /opt/azure/vhd-grid-compatibility-data.json" + ] + }, + { + "type": "shell", + "inline": [ + "sudo /bin/bash -eux /home/packer/cis.sh", + "sudo /bin/bash -eux /opt/azure/containers/cleanup-vhd.sh", + "sudo /usr/sbin/waagent -force -deprovision+user && export HISTSIZE=0 && sync || exit 125" + ] + } + ] +} \ No newline at end of file diff --git a/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json b/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json index 62558f3891f..90b9ccb0a0a 100644 --- a/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json +++ b/vhdbuilder/packer/vhd-image-builder-arm64-gen2.json @@ -39,7 +39,7 @@ "virtual_network_subnet_name": "{{user `subnet_name`}}", "ssh_read_write_timeout": "5m", "os_type": "Linux", - "os_disk_size_gb": 60, + "os_disk_size_gb": 30, "image_publisher": "{{user `img_publisher`}}", "image_offer": 
"{{user `img_offer`}}", "image_sku": "{{user `img_sku`}}", @@ -752,36 +752,6 @@ "source": "parts/linux/cloud-init/artifacts/51-azure-nvidia.cfg", "destination": "/home/packer/51-azure-nvidia.cfg" }, - { - "type": "file", - "source": "parts/linux/cloud-init/artifacts/ubuntu/doca.list", - "destination": "/home/packer/doca.list" - }, - { - "type": "file", - "source": "parts/linux/cloud-init/artifacts/ubuntu/doca.pub", - "destination": "/home/packer/doca.pub" - }, - { - "type": "file", - "source": "parts/linux/cloud-init/artifacts/ubuntu/nvidia-2404.list", - "destination": "/home/packer/nvidia-2404.list" - }, - { - "type": "file", - "source": "parts/linux/cloud-init/artifacts/ubuntu/nvidia.pub", - "destination": "/home/packer/nvidia.pub" - }, - { - "type": "file", - "source": "parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia.toml", - "destination": "/home/packer/containerd-nvidia.toml" - }, - { - "type": "file", - "source": "parts/linux/cloud-init/artifacts/ubuntu/modprobe-nvidia-parameters.conf", - "destination": "/home/packer/modprobe-nvidia-parameters.conf" - }, { "type": "shell", "inline": [ From 781dc5ac943638a57cc287c1fe2ae37e0030b001 Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Wed, 8 Oct 2025 09:54:21 -0500 Subject: [PATCH 45/87] Remove specific package version specification for the driver, bump to CUDA toolkit 13 --- vhdbuilder/packer/install-dependencies.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index c3de381d7a3..035ff046da3 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -743,10 +743,10 @@ EOF # will appear directing the reader away from the proprietary driver. 
The GPUs # are also not visible in nvidia-smi output with the proprietary drivers apt install -y \ - nvidia-driver-580-open=580.82.07-0ubuntu1 + nvidia-driver-580-open apt install -y \ - cuda-toolkit-12 \ + cuda-toolkit-13 \ nvidia-container-toolkit \ datacenter-gpu-manager-exporter \ datacenter-gpu-manager-4-core \ @@ -755,7 +755,7 @@ EOF k8s-device-plugin apt install -y \ - nvidia-imex=580.82.07-1 + nvidia-imex apt install -y \ doca-ofed From 1a910a555791ff9aa3389f780c871d12f8c86917 Mon Sep 17 00:00:00 2001 From: Keith <153014933+keith-ms@users.noreply.github.com> Date: Thu, 16 Oct 2025 11:28:02 -0500 Subject: [PATCH 46/87] Add the ability to specify a custom local repository to download (#7186) Co-authored-by: {your_GitHub_username} <{your_GitHub_email}> --- .../templates/.builder-release-template.yaml | 2 ++ .../artifacts/ubuntu/containerd-nvidia.toml | 2 +- .../cloud-init/artifacts/ubuntu/doca.list | 2 +- vhdbuilder/packer/install-dependencies.sh | 36 +++++++++++++++++++ vhdbuilder/packer/packer_source.sh | 5 +-- .../packer/vhd-image-builder-arm64-gb200.json | 8 +++-- 6 files changed, 48 insertions(+), 7 deletions(-) diff --git a/.pipelines/templates/.builder-release-template.yaml b/.pipelines/templates/.builder-release-template.yaml index f46cd7e2000..a269e629c4c 100644 --- a/.pipelines/templates/.builder-release-template.yaml +++ b/.pipelines/templates/.builder-release-template.yaml @@ -106,6 +106,8 @@ steps: BUILD_ID: $(Build.BuildId) BUILD_DEFINITION_NAME: $(Build.DefinitionName) UA_TOKEN: $(ua-token) + LOCAL_DOCA_REPO_URL: $(LOCAL_DOCA_REPO_URL) + CONTINUE_ON_LOCAL_REPO_DOWNLOAD_ERROR: $(CONTINUE_ON_LOCAL_REPO_DOWNLOAD_ERROR) - task: AzureCLI@2 inputs: diff --git a/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia.toml b/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia.toml index 4c1cf207cbe..2366562134d 100644 --- a/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia.toml +++ 
b/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia.toml @@ -10,7 +10,7 @@ version = 2 sandbox_image = "mcr.microsoft.com/oss/kubernetes/pause:3.6" [plugins."io.containerd.grpc.v1.cri".containerd] - default_runtime_name = "runc" + default_runtime_name = "nvidia" [plugins."io.containerd.grpc.v1.cri".containerd.runtimes] diff --git a/parts/linux/cloud-init/artifacts/ubuntu/doca.list b/parts/linux/cloud-init/artifacts/ubuntu/doca.list index 6c491e76df4..f4afa06d3bb 100644 --- a/parts/linux/cloud-init/artifacts/ubuntu/doca.list +++ b/parts/linux/cloud-init/artifacts/ubuntu/doca.list @@ -1 +1 @@ -deb [arch=arm64 signed-by=/etc/apt/keyrings/doca.pub] https://linux.mellanox.com/public/repo/doca/3.1.0/ubuntu24.04/arm64-sbsa/ ./ \ No newline at end of file +deb [arch=arm64 signed-by=/etc/apt/keyrings/doca-net.pub] https://linux.mellanox.com/public/repo/doca/3.1.0/ubuntu24.04/arm64-sbsa/ ./ diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 035ff046da3..e32c1308b9c 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -742,6 +742,42 @@ EOF # The open series driver is required for the GB200 platform. Dmesg output # will appear directing the reader away from the proprietary driver. The GPUs # are also not visible in nvidia-smi output with the proprietary drivers + + # Install a local repository if a LOCAL_DOCA_REPO_URL is provided + if [ -n "${LOCAL_DOCA_REPO_URL}" ]; then + # Extract filename from URL path, removing query parameters + LOCAL_REPO_FILENAME=$(basename "${LOCAL_DOCA_REPO_URL%%\?*}") + + # Store files downloaded before curl command + BEFORE_FILES=$(ls /tmp/*.deb 2>/dev/null || echo "") + + curl --output-dir /tmp -O "${LOCAL_DOCA_REPO_URL}" + if [ $? -ne 0 ]; then + if [ "${CONTINUE_ON_LOCAL_REPO_DOWNLOAD_ERROR}" = "True" ]; then + echo "WARNING: Continuing despite error downloading package from ${LOCAL_DOCA_REPO_URL}." 
+ else + echo "ERROR: Failed to download package from ${LOCAL_DOCA_REPO_URL}." + exit 1 + fi + else + # Find the newly downloaded file + AFTER_FILES=$(ls /tmp/*.deb 2>/dev/null || echo "") + DOWNLOADED_FILE=$(comm -13 <(echo "$BEFORE_FILES" | sort) <(echo "$AFTER_FILES" | sort) | head -1) + + # Use the detected file or fall back to the extracted filename + if [ -n "${DOWNLOADED_FILE}" ]; then + dpkg -i "${DOWNLOADED_FILE}" + else + dpkg -i "/tmp/${LOCAL_REPO_FILENAME}" + fi + + # Disable the online repository + mv /etc/apt/sources.list.d/doca-net.list /etc/apt/sources.list.d/doca-net.list.disabled + + apt update + fi + fi + apt install -y \ nvidia-driver-580-open diff --git a/vhdbuilder/packer/packer_source.sh b/vhdbuilder/packer/packer_source.sh index 2e3597a3a4f..3f22ef66650 100644 --- a/vhdbuilder/packer/packer_source.sh +++ b/vhdbuilder/packer/packer_source.sh @@ -529,12 +529,13 @@ copyPackerFiles() { CONTAINERD_NVIDIA_TOML_DEST=/etc/containerd/config.toml cpAndMode $CONTAINERD_NVIDIA_TOML_SRC $CONTAINERD_NVIDIA_TOML_DEST 644 + DOCA_LIST_SRC=/home/packer/doca.list - DOCA_LIST_DEST=/etc/apt/sources.list.d/doca.list + DOCA_LIST_DEST=/etc/apt/sources.list.d/doca-net.list cpAndMode $DOCA_LIST_SRC $DOCA_LIST_DEST 644 DOCA_PUB_SRC=/home/packer/doca.pub - DOCA_PUB_DEST=/etc/apt/keyrings/doca.pub + DOCA_PUB_DEST=/etc/apt/keyrings/doca-net.pub cpAndMode $DOCA_PUB_SRC $DOCA_PUB_DEST 644 NVIDIA_MODPROBE_PARAMETERS_SRC=/home/packer/modprobe-nvidia-parameters.conf diff --git a/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json b/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json index 94e8cbce0de..759f7f73aee 100644 --- a/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json +++ b/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json @@ -29,7 +29,9 @@ "subnet_name": "{{env `SUBNET_NAME`}}", "private_packages_url": "{{env `PRIVATE_PACKAGES_URL`}}", "branch": "{{env `BRANCH`}}", - "vhd_build_timestamp": "{{user `VHD_BUILD_TIMESTAMP`}}" + "vhd_build_timestamp": "{{user 
`VHD_BUILD_TIMESTAMP`}}", + "local_doca_repo_url": "{{env `LOCAL_DOCA_REPO_URL`}}", + "continue_on_local_repo_download_error": "{{env `CONTINUE_ON_LOCAL_REPO_DOWNLOAD_ERROR`}}" }, "builders": [ { @@ -673,7 +675,7 @@ { "type": "shell", "inline": [ - "sudo FEATURE_FLAGS={{user `feature_flags`}} BUILD_NUMBER={{user `build_number`}} BUILD_ID={{user `build_id`}} COMMIT={{user `commit`}} HYPERV_GENERATION={{user `hyperv_generation`}} CONTAINER_RUNTIME={{user `container_runtime`}} TELEPORTD_PLUGIN_DOWNLOAD_URL={{user `teleportd_plugin_download_url`}} ENABLE_FIPS={{user `enable_fips`}} IMG_SKU={{user `img_sku`}} PRIVATE_PACKAGES_URL={{user `private_packages_url`}} VHD_BUILD_TIMESTAMP={{user `vhd_build_timestamp`}} /bin/bash -ux /home/packer/install-dependencies.sh" + "sudo CONTINUE_ON_LOCAL_REPO_DOWNLOAD_ERROR={{user `continue_on_local_repo_download_error`}} LOCAL_DOCA_REPO_URL=\"{{user `local_doca_repo_url`}}\" FEATURE_FLAGS={{user `feature_flags`}} BUILD_NUMBER={{user `build_number`}} BUILD_ID={{user `build_id`}} COMMIT={{user `commit`}} HYPERV_GENERATION={{user `hyperv_generation`}} CONTAINER_RUNTIME={{user `container_runtime`}} TELEPORTD_PLUGIN_DOWNLOAD_URL={{user `teleportd_plugin_download_url`}} ENABLE_FIPS={{user `enable_fips`}} IMG_SKU={{user `img_sku`}} PRIVATE_PACKAGES_URL={{user `private_packages_url`}} VHD_BUILD_TIMESTAMP={{user `vhd_build_timestamp`}} /bin/bash -ux /home/packer/install-dependencies.sh" ] }, { @@ -752,4 +754,4 @@ ] } ] -} \ No newline at end of file +} From 0e4851f1587dcfe31406a3c5ecf573a13602db78 Mon Sep 17 00:00:00 2001 From: Alex Benn Date: Fri, 24 Oct 2025 10:49:20 -0400 Subject: [PATCH 47/87] tidy up messy merge -- remove bad mellanox deb repo reference --- vhdbuilder/packer/packer_source.sh | 8 -------- 1 file changed, 8 deletions(-) diff --git a/vhdbuilder/packer/packer_source.sh b/vhdbuilder/packer/packer_source.sh index 3f22ef66650..1c921197b21 100644 --- a/vhdbuilder/packer/packer_source.sh +++ b/vhdbuilder/packer/packer_source.sh @@ 
-505,14 +505,6 @@ copyPackerFiles() { if grep -q "GB200" <<< "$FEATURE_FLAGS"; then if [ ${UBUNTU_RELEASE} = "24.04" ]; then - MELLANOX_LIST_SRC=/home/packer/mellanox_mlnx_ofed.list - MELLANOX_LIST_DEST=/etc/apt/sources.list.d/mellanox_mlnx_ofed.list - cpAndMode $MELLANOX_LIST_SRC $MELLANOX_LIST_DEST 644 - - MELLANOX_ASC_SRC=/home/packer/mellanox_mlnx_ofed.pub - MELLANOX_ASC_DEST=/etc/apt/keyrings/mellanox_mlnx_ofed.pub - cpAndMode $MELLANOX_ASC_SRC $MELLANOX_ASC_DEST 644 - NVIDIA_LIST_SRC=/home/packer/nvidia-2404.list NVIDIA_LIST_DEST=/etc/apt/sources.list.d/nvidia.list cpAndMode $NVIDIA_LIST_SRC $NVIDIA_LIST_DEST 644 From 44faf9b2fd6cc67ad7f457b9ef0e9ddfb353e3e0 Mon Sep 17 00:00:00 2001 From: Alex Benn Date: Fri, 24 Oct 2025 11:07:26 -0400 Subject: [PATCH 48/87] fix shell test failure. Glob matching is tidier in bash --- parts/linux/cloud-init/artifacts/cse_config.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/parts/linux/cloud-init/artifacts/cse_config.sh b/parts/linux/cloud-init/artifacts/cse_config.sh index 49552f8bfc4..c3d1a5cf939 100755 --- a/parts/linux/cloud-init/artifacts/cse_config.sh +++ b/parts/linux/cloud-init/artifacts/cse_config.sh @@ -649,6 +649,7 @@ validateKubeletNodeLabels() { IFS=',' read -ra LABEL_ARRAY <<< "$labels" for label in "${LABEL_ARRAY[@]}"; do # Split each label into key and value + # shellcheck disable=SC3010 if [[ "$label" == *"="* ]]; then key="${label%%=*}" value="${label#*=}" From a6fb228c2e304e574e657800c4dee760fdb787c6 Mon Sep 17 00:00:00 2001 From: Alex Benn Date: Mon, 27 Oct 2025 15:40:59 -0400 Subject: [PATCH 49/87] add recent additions to vhd/packer/vhd-image-builder-arm64-gen2.json --- .../packer/vhd-image-builder-arm64-gb200.json | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json b/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json index 759f7f73aee..5af82e75014 100644 --- a/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json 
+++ b/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json @@ -68,9 +68,7 @@ "gallery_name": "{{user `sig_gallery_name`}}", "image_name": "{{user `sig_image_name`}}", "image_version": "{{user `captured_sig_version`}}", - "replication_regions": [ - "{{user `location`}}" - ] + "replication_regions": ["{{user `location`}}"] }, "user_assigned_managed_identities": "{{user `msi_resource_strings`}}" } @@ -379,6 +377,11 @@ "source": "parts/linux/cloud-init/artifacts/modprobe-CIS.conf", "destination": "/home/packer/modprobe-CIS.conf" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/faillock-CIS.conf", + "destination": "/home/packer/faillock-CIS.conf" + }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/pwquality-CIS.conf", @@ -389,6 +392,11 @@ "source": "parts/linux/cloud-init/artifacts/pam-d-su", "destination": "/home/packer/pam-d-su" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/pam-d-common-account", + "destination": "/home/packer/pam-d-common-account" + }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/pam-d-common-auth", @@ -686,9 +694,7 @@ }, { "type": "shell", - "inline": [ - "sudo rm /var/log/bcc_installation.log" - ] + "inline": ["sudo rm /var/log/bcc_installation.log"] }, { "type": "shell", From d9cf9e47def651d83deec9b8bd7ec29f29c3fcb1 Mon Sep 17 00:00:00 2001 From: Alex Benn Date: Tue, 28 Oct 2025 12:06:50 -0400 Subject: [PATCH 50/87] Address some nits: re-enable CIS reports, follow doca latest, add newline to end of containerd-nvidia.toml --- .../templates/.builder-release-template.yaml | 2 +- .../artifacts/ubuntu/containerd-nvidia.toml | 14 +++++++------- parts/linux/cloud-init/artifacts/ubuntu/doca.list | 3 ++- vhdbuilder/packer/packer_source.sh | 1 - 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.pipelines/templates/.builder-release-template.yaml b/.pipelines/templates/.builder-release-template.yaml index a269e629c4c..5fb7511ab14 100644 --- 
a/.pipelines/templates/.builder-release-template.yaml +++ b/.pipelines/templates/.builder-release-template.yaml @@ -352,7 +352,7 @@ steps: TargetFolder: '$(Build.ArtifactStagingDirectory)' - task: CopyFiles@2 - condition: and(eq(variables.OS_SKU, 'Ubuntu'), in(variables.OS_VERSION, '22.04', '24.04'), in(variables.FEATURE_FLAGS, 'None', 'cvm')) + condition: and(eq(variables.OS_SKU, 'Ubuntu'), in(variables.OS_VERSION, '22.04', '24.04'), in(variables.FEATURE_FLAGS, 'None', 'cvm', 'GB200')) displayName: Copy CIS Reports inputs: SourceFolder: '$(System.DefaultWorkingDirectory)' diff --git a/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia.toml b/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia.toml index 2366562134d..88aa0fa0222 100644 --- a/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia.toml +++ b/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia.toml @@ -13,22 +13,22 @@ version = 2 default_runtime_name = "nvidia" [plugins."io.containerd.grpc.v1.cri".containerd.runtimes] - + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia] runtime_type = "io.containerd.runc.v2" - + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options] BinaryName = "/usr/bin/nvidia-container-runtime" - + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc] runtime_type = "io.containerd.runc.v2" - + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options] BinaryName = "/usr/bin/runc" - + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.untrusted] runtime_type = "io.containerd.runc.v2" - + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.untrusted.options] BinaryName = "/usr/bin/runc" @@ -36,4 +36,4 @@ version = 2 config_path = "/etc/containerd/certs.d" [plugins."io.containerd.grpc.v1.cri".registry.headers] - X-Meta-Source-Client = ["azure/aks"] \ No newline at end of file + X-Meta-Source-Client = ["azure/aks"] diff --git a/parts/linux/cloud-init/artifacts/ubuntu/doca.list 
b/parts/linux/cloud-init/artifacts/ubuntu/doca.list index f4afa06d3bb..20300428613 100644 --- a/parts/linux/cloud-init/artifacts/ubuntu/doca.list +++ b/parts/linux/cloud-init/artifacts/ubuntu/doca.list @@ -1 +1,2 @@ -deb [arch=arm64 signed-by=/etc/apt/keyrings/doca-net.pub] https://linux.mellanox.com/public/repo/doca/3.1.0/ubuntu24.04/arm64-sbsa/ ./ +deb [arch=arm64 signed-by=/etc/apt/keyrings/doca-net.pub] https://linux.mellanox.com/public/repo/doca/latest/ubuntu24.04/arm64-sbsa/ ./ +deb [arch=amd64 signed-by=/etc/apt/keyrings/doca-net.pub] https://linux.mellanox.com/public/repo/doca/latest/ubuntu24.04/x86_64/ ./ diff --git a/vhdbuilder/packer/packer_source.sh b/vhdbuilder/packer/packer_source.sh index 1c921197b21..771f5d760d7 100644 --- a/vhdbuilder/packer/packer_source.sh +++ b/vhdbuilder/packer/packer_source.sh @@ -521,7 +521,6 @@ copyPackerFiles() { CONTAINERD_NVIDIA_TOML_DEST=/etc/containerd/config.toml cpAndMode $CONTAINERD_NVIDIA_TOML_SRC $CONTAINERD_NVIDIA_TOML_DEST 644 - DOCA_LIST_SRC=/home/packer/doca.list DOCA_LIST_DEST=/etc/apt/sources.list.d/doca-net.list cpAndMode $DOCA_LIST_SRC $DOCA_LIST_DEST 644 From 720979acae1a9f3f8c917c82d8a77b96ac9675f5 Mon Sep 17 00:00:00 2001 From: Alex Benn <62816975+abenn135@users.noreply.github.com> Date: Fri, 31 Oct 2025 14:18:49 -0400 Subject: [PATCH 51/87] bug: add DCGM CUDA13-compatible packages to GB200 image. Also add multinode DCGM package. 
(#7277) --- .../cloud-init/artifacts/ubuntu/cse_install_ubuntu.sh | 6 +++--- vhdbuilder/packer/install-dependencies.sh | 7 +++++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/ubuntu/cse_install_ubuntu.sh b/parts/linux/cloud-init/artifacts/ubuntu/cse_install_ubuntu.sh index 8c58d398191..dd3ab601394 100755 --- a/parts/linux/cloud-init/artifacts/ubuntu/cse_install_ubuntu.sh +++ b/parts/linux/cloud-init/artifacts/ubuntu/cse_install_ubuntu.sh @@ -42,7 +42,7 @@ blobfuseFallbackPackages() { installDeps() { wait_for_apt_locks - retrycmd_silent 120 5 25 curl -fsSL https://packages.microsoft.com/config/ubuntu/${UBUNTU_RELEASE}/packages-microsoft-prod.deb > /tmp/packages-microsoft-prod.deb || exit $ERR_MS_PROD_DEB_DOWNLOAD_TIMEOUT + retrycmd_silent 120 5 90 curl -fsSL https://packages.microsoft.com/config/ubuntu/${UBUNTU_RELEASE}/packages-microsoft-prod.deb > /tmp/packages-microsoft-prod.deb || exit $ERR_MS_PROD_DEB_DOWNLOAD_TIMEOUT retrycmd_if_failure 60 5 10 dpkg -i /tmp/packages-microsoft-prod.deb || exit $ERR_MS_PROD_DEB_PKG_ADD_FAIL holdWALinuxAgent hold @@ -93,12 +93,12 @@ installDeps() { } updateAptWithMicrosoftPkg() { - retrycmd_silent 120 5 25 curl https://packages.microsoft.com/config/ubuntu/${UBUNTU_RELEASE}/prod.list > /tmp/microsoft-prod.list || exit $ERR_MOBY_APT_LIST_TIMEOUT + retrycmd_silent 120 5 90 curl https://packages.microsoft.com/config/ubuntu/${UBUNTU_RELEASE}/prod.list > /tmp/microsoft-prod.list || exit $ERR_MOBY_APT_LIST_TIMEOUT retrycmd_if_failure 10 5 10 cp /tmp/microsoft-prod.list /etc/apt/sources.list.d/ || exit $ERR_MOBY_APT_LIST_TIMEOUT echo "deb [arch=amd64,arm64,armhf] https://packages.microsoft.com/ubuntu/${UBUNTU_RELEASE}/prod testing main" > /etc/apt/sources.list.d/microsoft-prod-testing.list - retrycmd_silent 120 5 25 curl https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor > /tmp/microsoft.gpg || exit $ERR_MS_GPG_KEY_DOWNLOAD_TIMEOUT + retrycmd_silent 120 5 90 curl 
https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor > /tmp/microsoft.gpg || exit $ERR_MS_GPG_KEY_DOWNLOAD_TIMEOUT retrycmd_if_failure 10 5 10 cp /tmp/microsoft.gpg /etc/apt/trusted.gpg.d/ || exit $ERR_MS_GPG_KEY_DOWNLOAD_TIMEOUT apt_get_update || exit $ERR_APT_UPDATE_TIMEOUT } diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index e32c1308b9c..a87aba28043 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -68,6 +68,10 @@ APT::Periodic::Update-Package-Lists "0"; APT::Periodic::Download-Upgradeable-Packages "0"; APT::Periodic::AutocleanInterval "0"; APT::Periodic::Unattended-Upgrade "0"; +EOF + # Make apt more patient connecting to repositories: set a timeout of 5 min. + tee /etc/apt/apt.conf.d/99patience > /dev/null < Date: Fri, 7 Nov 2025 15:51:31 -0800 Subject: [PATCH 52/87] chore: revert bad aks-node-controller changes (#7336) Co-authored-by: Lily Pan --- aks-node-controller/parser/helper.go | 12 +++++++-- .../artifacts/secure-tls-bootstrap.service | 1 + .../cloud-init/artifacts/cse_install_spec.sh | 26 +++++++++---------- 3 files changed, 24 insertions(+), 15 deletions(-) diff --git a/aks-node-controller/parser/helper.go b/aks-node-controller/parser/helper.go index 99c69c7aa4f..1d5450810d9 100644 --- a/aks-node-controller/parser/helper.go +++ b/aks-node-controller/parser/helper.go @@ -203,11 +203,19 @@ func getCustomCACertsStatus(customCACerts []string) bool { return len(customCACerts) > 0 } -func getEnableSecureTLSBootstrapping(bootstrapConfig *aksnodeconfigv1.BootstrappingConfig) bool { - // TODO: Change logic to default to true once Secure TLS Bootstrapping is complete +func getEnableSecureTLSBootstrap(bootstrapConfig *aksnodeconfigv1.BootstrappingConfig) bool { + // TODO: Change logic to default to false once Secure TLS Bootstrapping is complete return bootstrapConfig.GetBootstrappingAuthMethod() == 
aksnodeconfigv1.BootstrappingAuthMethod_BOOTSTRAPPING_AUTH_METHOD_SECURE_TLS_BOOTSTRAPPING } +func getTLSBootstrapToken(bootstrapConfig *aksnodeconfigv1.BootstrappingConfig) string { + return bootstrapConfig.GetTlsBootstrappingToken() +} + +func getCustomSecureTLSBootstrapAADServerAppID(bootstrapConfig *aksnodeconfigv1.BootstrappingConfig) string { + return bootstrapConfig.GetCustomAadResource() +} + func getEnsureNoDupePromiscuousBridge(nc *aksnodeconfigv1.NetworkConfig) bool { return nc.GetNetworkPlugin() == aksnodeconfigv1.NetworkPlugin_NETWORK_PLUGIN_KUBENET && nc.GetNetworkPolicy() != aksnodeconfigv1.NetworkPolicy_NETWORK_POLICY_CALICO } diff --git a/parts/linux/cloud-init/artifacts/secure-tls-bootstrap.service b/parts/linux/cloud-init/artifacts/secure-tls-bootstrap.service index 7f2c880629d..dabf26dfa15 100644 --- a/parts/linux/cloud-init/artifacts/secure-tls-bootstrap.service +++ b/parts/linux/cloud-init/artifacts/secure-tls-bootstrap.service @@ -16,6 +16,7 @@ ExecStart=/opt/bin/aks-secure-tls-bootstrap-client \ --cert-dir=/var/lib/kubelet/pki \ --cluster-ca-file=/etc/kubernetes/certs/ca.crt \ --log-file=/var/log/azure/aks/secure-tls-bootstrap.log \ + --deadline=120s \ $BOOTSTRAP_FLAGS [Install] diff --git a/spec/parts/linux/cloud-init/artifacts/cse_install_spec.sh b/spec/parts/linux/cloud-init/artifacts/cse_install_spec.sh index 9fa8b4b94bc..868a30a128f 100644 --- a/spec/parts/linux/cloud-init/artifacts/cse_install_spec.sh +++ b/spec/parts/linux/cloud-init/artifacts/cse_install_spec.sh @@ -21,7 +21,7 @@ Describe 'cse_install.sh' It 'returns expected output for successful installation of fake containerd in UBUNTU 20.04' UBUNTU_RELEASE="20.04" containerdPackage=$(readPackage "containerd") - When call installContainerRuntime + When call installContainerRuntime The variable containerdMajorMinorPatchVersion should equal "1.2.3" The variable containerdHotFixVersion should equal "" The output line 3 should equal "mock logs to events calling with 
AKS.CSE.installContainerRuntime.installStandaloneContainerd" @@ -31,7 +31,7 @@ Describe 'cse_install.sh' UBUNTU_RELEASE="" # mocking Mariner doesn't have command `lsb_release -cs` OS="MARINER" containerdPackage=$(readPackage "containerd") - When call installContainerRuntime + When call installContainerRuntime The variable containerdMajorMinorPatchVersion should equal "1.2.3" The variable containerdHotFixVersion should equal "5.fake" The output line 3 should equal "mock logs to events calling with AKS.CSE.installContainerRuntime.installStandaloneContainerd" @@ -44,8 +44,8 @@ Describe 'cse_install.sh' containerdPackage=$(readPackage "containerd") IS_KATA="true" When call installContainerRuntime - The output line 3 should equal "INFO: containerd package versions array is either empty or the first element is . Skipping containerd installation." - End + The output line 3 should equal "INFO: containerd package versions array is either empty or the first element is . Skipping containerd installation." + End # TODO(mheberling): In a ~month this will probably be removed when we use the standard containerd. It 'skips the containerd installation for AzureLinux with Kata' UBUNTU_RELEASE="" # mocking Mariner doesn't have command `lsb_release -cs` @@ -53,8 +53,8 @@ Describe 'cse_install.sh' containerdPackage=$(readPackage "containerd") IS_KATA="true" When call installContainerRuntime - The output line 3 should equal "INFO: containerd package versions array is either empty or the first element is . Skipping containerd installation." - End + The output line 3 should equal "INFO: containerd package versions array is either empty or the first element is . Skipping containerd installation." 
+ End It 'returns expected output for successful installation of containerd in AzureLinux' UBUNTU_RELEASE="" # mocking AzureLinux doesn't have command `lsb_release -cs` OS="AZURELINUX" @@ -70,7 +70,7 @@ Describe 'cse_install.sh' installContainerdWithManifestJson() { echo "mock installContainerdWithManifestJson calling" } - When call installContainerRuntime + When call installContainerRuntime The output line 2 should equal "Package \"containerd\" does not exist in $COMPONENTS_FILEPATH." The output line 3 should equal "mock installContainerdWithManifestJson calling" End @@ -131,7 +131,7 @@ Describe 'cse_install.sh' End Describe 'extractKubeBinaries' - k8s_version="1.31.5" + k8s_version="1.31.5" is_private_url="false" k8s_downloads_dir="/opt/kubernetes/downloads" ORAS_REGISTRY_CONFIG_FILE=/etc/oras/config.yaml @@ -175,7 +175,7 @@ Describe 'cse_install.sh' export -f logs_to_events AfterEach 'cleanup' - It 'should use retrycmd_get_tarball_from_registry_with_oras to download kube binaries' + It 'should use retrycmd_get_tarball_from_registry_with_oras to download kube binaries' kube_binary_url="mcr.microsoft.com/oss/binaries/kubernetes/kubernetes-node:FakeTag" When call extractKubeBinaries $k8s_version $kube_binary_url $is_private_url $k8s_downloads_dir The status should be success @@ -203,7 +203,7 @@ Describe 'cse_install.sh' Describe 'installSecureTLSBootstrapClient' SECURE_TLS_BOOTSTRAP_CLIENT_BIN_DIR="bin" SECURE_TLS_BOOTSTRAP_CLIENT_DOWNLOAD_DIR="downloads" - CUSTOM_SECURE_TLS_BOOTSTRAPPING_CLIENT_DOWNLOAD_URL="https://packages/custom-client-binary-url.tar.gz" + CUSTOM_SECURE_TLS_BOOTSTRAP_CLIENT_URL="https://packages/custom-client-binary-url.tar.gz" sudo() { echo "sudo $@" @@ -225,11 +225,11 @@ Describe 'cse_install.sh' The status should be success End - It 'should return with a no-op if CUSTOM_SECURE_TLS_BOOTSTRAPPING_CLIENT_DOWNLOAD_URL is not set' + It 'should return with a no-op if CUSTOM_SECURE_TLS_BOOTSTRAP_CLIENT_URL is not set' 
ENABLE_SECURE_TLS_BOOTSTRAPPING="true" - CUSTOM_SECURE_TLS_BOOTSTRAPPING_CLIENT_DOWNLOAD_URL="" + CUSTOM_SECURE_TLS_BOOTSTRAP_CLIENT_URL="" When call installSecureTLSBootstrapClient - The output line 1 should equal "secure TLS bootstrapping is enabled but no custom client download URL was provided, nothing to download" + The output line 1 should equal "secure TLS bootstrapping is enabled but no custom client URL was provided, nothing to download" The status should be success End From dce944d0f856da4036775aee03b37c24a75c1b0d Mon Sep 17 00:00:00 2001 From: Cameron Meissner Date: Fri, 7 Nov 2025 16:12:59 -0800 Subject: [PATCH 53/87] Revert "chore: revert bad aks-node-controller changes (#7336)" (#7337) --- aks-node-controller/parser/helper.go | 12 ++------- .../artifacts/secure-tls-bootstrap.service | 1 - .../cloud-init/artifacts/cse_install_spec.sh | 26 +++++++++---------- 3 files changed, 15 insertions(+), 24 deletions(-) diff --git a/aks-node-controller/parser/helper.go b/aks-node-controller/parser/helper.go index 1d5450810d9..99c69c7aa4f 100644 --- a/aks-node-controller/parser/helper.go +++ b/aks-node-controller/parser/helper.go @@ -203,19 +203,11 @@ func getCustomCACertsStatus(customCACerts []string) bool { return len(customCACerts) > 0 } -func getEnableSecureTLSBootstrap(bootstrapConfig *aksnodeconfigv1.BootstrappingConfig) bool { - // TODO: Change logic to default to false once Secure TLS Bootstrapping is complete +func getEnableSecureTLSBootstrapping(bootstrapConfig *aksnodeconfigv1.BootstrappingConfig) bool { + // TODO: Change logic to default to true once Secure TLS Bootstrapping is complete return bootstrapConfig.GetBootstrappingAuthMethod() == aksnodeconfigv1.BootstrappingAuthMethod_BOOTSTRAPPING_AUTH_METHOD_SECURE_TLS_BOOTSTRAPPING } -func getTLSBootstrapToken(bootstrapConfig *aksnodeconfigv1.BootstrappingConfig) string { - return bootstrapConfig.GetTlsBootstrappingToken() -} - -func getCustomSecureTLSBootstrapAADServerAppID(bootstrapConfig 
*aksnodeconfigv1.BootstrappingConfig) string { - return bootstrapConfig.GetCustomAadResource() -} - func getEnsureNoDupePromiscuousBridge(nc *aksnodeconfigv1.NetworkConfig) bool { return nc.GetNetworkPlugin() == aksnodeconfigv1.NetworkPlugin_NETWORK_PLUGIN_KUBENET && nc.GetNetworkPolicy() != aksnodeconfigv1.NetworkPolicy_NETWORK_POLICY_CALICO } diff --git a/parts/linux/cloud-init/artifacts/secure-tls-bootstrap.service b/parts/linux/cloud-init/artifacts/secure-tls-bootstrap.service index dabf26dfa15..7f2c880629d 100644 --- a/parts/linux/cloud-init/artifacts/secure-tls-bootstrap.service +++ b/parts/linux/cloud-init/artifacts/secure-tls-bootstrap.service @@ -16,7 +16,6 @@ ExecStart=/opt/bin/aks-secure-tls-bootstrap-client \ --cert-dir=/var/lib/kubelet/pki \ --cluster-ca-file=/etc/kubernetes/certs/ca.crt \ --log-file=/var/log/azure/aks/secure-tls-bootstrap.log \ - --deadline=120s \ $BOOTSTRAP_FLAGS [Install] diff --git a/spec/parts/linux/cloud-init/artifacts/cse_install_spec.sh b/spec/parts/linux/cloud-init/artifacts/cse_install_spec.sh index 868a30a128f..9fa8b4b94bc 100644 --- a/spec/parts/linux/cloud-init/artifacts/cse_install_spec.sh +++ b/spec/parts/linux/cloud-init/artifacts/cse_install_spec.sh @@ -21,7 +21,7 @@ Describe 'cse_install.sh' It 'returns expected output for successful installation of fake containerd in UBUNTU 20.04' UBUNTU_RELEASE="20.04" containerdPackage=$(readPackage "containerd") - When call installContainerRuntime + When call installContainerRuntime The variable containerdMajorMinorPatchVersion should equal "1.2.3" The variable containerdHotFixVersion should equal "" The output line 3 should equal "mock logs to events calling with AKS.CSE.installContainerRuntime.installStandaloneContainerd" @@ -31,7 +31,7 @@ Describe 'cse_install.sh' UBUNTU_RELEASE="" # mocking Mariner doesn't have command `lsb_release -cs` OS="MARINER" containerdPackage=$(readPackage "containerd") - When call installContainerRuntime + When call installContainerRuntime The 
variable containerdMajorMinorPatchVersion should equal "1.2.3" The variable containerdHotFixVersion should equal "5.fake" The output line 3 should equal "mock logs to events calling with AKS.CSE.installContainerRuntime.installStandaloneContainerd" @@ -44,8 +44,8 @@ Describe 'cse_install.sh' containerdPackage=$(readPackage "containerd") IS_KATA="true" When call installContainerRuntime - The output line 3 should equal "INFO: containerd package versions array is either empty or the first element is . Skipping containerd installation." - End + The output line 3 should equal "INFO: containerd package versions array is either empty or the first element is . Skipping containerd installation." + End # TODO(mheberling): In a ~month this will probably be removed when we use the standard containerd. It 'skips the containerd installation for AzureLinux with Kata' UBUNTU_RELEASE="" # mocking Mariner doesn't have command `lsb_release -cs` @@ -53,8 +53,8 @@ Describe 'cse_install.sh' containerdPackage=$(readPackage "containerd") IS_KATA="true" When call installContainerRuntime - The output line 3 should equal "INFO: containerd package versions array is either empty or the first element is . Skipping containerd installation." - End + The output line 3 should equal "INFO: containerd package versions array is either empty or the first element is . Skipping containerd installation." + End It 'returns expected output for successful installation of containerd in AzureLinux' UBUNTU_RELEASE="" # mocking AzureLinux doesn't have command `lsb_release -cs` OS="AZURELINUX" @@ -70,7 +70,7 @@ Describe 'cse_install.sh' installContainerdWithManifestJson() { echo "mock installContainerdWithManifestJson calling" } - When call installContainerRuntime + When call installContainerRuntime The output line 2 should equal "Package \"containerd\" does not exist in $COMPONENTS_FILEPATH." 
The output line 3 should equal "mock installContainerdWithManifestJson calling" End @@ -131,7 +131,7 @@ Describe 'cse_install.sh' End Describe 'extractKubeBinaries' - k8s_version="1.31.5" + k8s_version="1.31.5" is_private_url="false" k8s_downloads_dir="/opt/kubernetes/downloads" ORAS_REGISTRY_CONFIG_FILE=/etc/oras/config.yaml @@ -175,7 +175,7 @@ Describe 'cse_install.sh' export -f logs_to_events AfterEach 'cleanup' - It 'should use retrycmd_get_tarball_from_registry_with_oras to download kube binaries' + It 'should use retrycmd_get_tarball_from_registry_with_oras to download kube binaries' kube_binary_url="mcr.microsoft.com/oss/binaries/kubernetes/kubernetes-node:FakeTag" When call extractKubeBinaries $k8s_version $kube_binary_url $is_private_url $k8s_downloads_dir The status should be success @@ -203,7 +203,7 @@ Describe 'cse_install.sh' Describe 'installSecureTLSBootstrapClient' SECURE_TLS_BOOTSTRAP_CLIENT_BIN_DIR="bin" SECURE_TLS_BOOTSTRAP_CLIENT_DOWNLOAD_DIR="downloads" - CUSTOM_SECURE_TLS_BOOTSTRAP_CLIENT_URL="https://packages/custom-client-binary-url.tar.gz" + CUSTOM_SECURE_TLS_BOOTSTRAPPING_CLIENT_DOWNLOAD_URL="https://packages/custom-client-binary-url.tar.gz" sudo() { echo "sudo $@" @@ -225,11 +225,11 @@ Describe 'cse_install.sh' The status should be success End - It 'should return with a no-op if CUSTOM_SECURE_TLS_BOOTSTRAP_CLIENT_URL is not set' + It 'should return with a no-op if CUSTOM_SECURE_TLS_BOOTSTRAPPING_CLIENT_DOWNLOAD_URL is not set' ENABLE_SECURE_TLS_BOOTSTRAPPING="true" - CUSTOM_SECURE_TLS_BOOTSTRAP_CLIENT_URL="" + CUSTOM_SECURE_TLS_BOOTSTRAPPING_CLIENT_DOWNLOAD_URL="" When call installSecureTLSBootstrapClient - The output line 1 should equal "secure TLS bootstrapping is enabled but no custom client URL was provided, nothing to download" + The output line 1 should equal "secure TLS bootstrapping is enabled but no custom client download URL was provided, nothing to download" The status should be success End From 
c39f4f81009c3a42f657f1a8a6447c9cf7a6bb92 Mon Sep 17 00:00:00 2001 From: Cameron Meissner Date: Fri, 7 Nov 2025 16:20:31 -0800 Subject: [PATCH 54/87] chore: final fix for gb200 release branch (#7338) --- aks-node-controller/parser/helper.go | 12 +++++++-- .../artifacts/secure-tls-bootstrap.service | 1 + .../cloud-init/artifacts/cse_install_spec.sh | 26 +++++++++---------- 3 files changed, 24 insertions(+), 15 deletions(-) diff --git a/aks-node-controller/parser/helper.go b/aks-node-controller/parser/helper.go index 99c69c7aa4f..1d5450810d9 100644 --- a/aks-node-controller/parser/helper.go +++ b/aks-node-controller/parser/helper.go @@ -203,11 +203,19 @@ func getCustomCACertsStatus(customCACerts []string) bool { return len(customCACerts) > 0 } -func getEnableSecureTLSBootstrapping(bootstrapConfig *aksnodeconfigv1.BootstrappingConfig) bool { - // TODO: Change logic to default to true once Secure TLS Bootstrapping is complete +func getEnableSecureTLSBootstrap(bootstrapConfig *aksnodeconfigv1.BootstrappingConfig) bool { + // TODO: Change logic to default to false once Secure TLS Bootstrapping is complete return bootstrapConfig.GetBootstrappingAuthMethod() == aksnodeconfigv1.BootstrappingAuthMethod_BOOTSTRAPPING_AUTH_METHOD_SECURE_TLS_BOOTSTRAPPING } +func getTLSBootstrapToken(bootstrapConfig *aksnodeconfigv1.BootstrappingConfig) string { + return bootstrapConfig.GetTlsBootstrappingToken() +} + +func getCustomSecureTLSBootstrapAADServerAppID(bootstrapConfig *aksnodeconfigv1.BootstrappingConfig) string { + return bootstrapConfig.GetCustomAadResource() +} + func getEnsureNoDupePromiscuousBridge(nc *aksnodeconfigv1.NetworkConfig) bool { return nc.GetNetworkPlugin() == aksnodeconfigv1.NetworkPlugin_NETWORK_PLUGIN_KUBENET && nc.GetNetworkPolicy() != aksnodeconfigv1.NetworkPolicy_NETWORK_POLICY_CALICO } diff --git a/parts/linux/cloud-init/artifacts/secure-tls-bootstrap.service b/parts/linux/cloud-init/artifacts/secure-tls-bootstrap.service index 7f2c880629d..dabf26dfa15 100644 
--- a/parts/linux/cloud-init/artifacts/secure-tls-bootstrap.service +++ b/parts/linux/cloud-init/artifacts/secure-tls-bootstrap.service @@ -16,6 +16,7 @@ ExecStart=/opt/bin/aks-secure-tls-bootstrap-client \ --cert-dir=/var/lib/kubelet/pki \ --cluster-ca-file=/etc/kubernetes/certs/ca.crt \ --log-file=/var/log/azure/aks/secure-tls-bootstrap.log \ + --deadline=120s \ $BOOTSTRAP_FLAGS [Install] diff --git a/spec/parts/linux/cloud-init/artifacts/cse_install_spec.sh b/spec/parts/linux/cloud-init/artifacts/cse_install_spec.sh index 9fa8b4b94bc..868a30a128f 100644 --- a/spec/parts/linux/cloud-init/artifacts/cse_install_spec.sh +++ b/spec/parts/linux/cloud-init/artifacts/cse_install_spec.sh @@ -21,7 +21,7 @@ Describe 'cse_install.sh' It 'returns expected output for successful installation of fake containerd in UBUNTU 20.04' UBUNTU_RELEASE="20.04" containerdPackage=$(readPackage "containerd") - When call installContainerRuntime + When call installContainerRuntime The variable containerdMajorMinorPatchVersion should equal "1.2.3" The variable containerdHotFixVersion should equal "" The output line 3 should equal "mock logs to events calling with AKS.CSE.installContainerRuntime.installStandaloneContainerd" @@ -31,7 +31,7 @@ Describe 'cse_install.sh' UBUNTU_RELEASE="" # mocking Mariner doesn't have command `lsb_release -cs` OS="MARINER" containerdPackage=$(readPackage "containerd") - When call installContainerRuntime + When call installContainerRuntime The variable containerdMajorMinorPatchVersion should equal "1.2.3" The variable containerdHotFixVersion should equal "5.fake" The output line 3 should equal "mock logs to events calling with AKS.CSE.installContainerRuntime.installStandaloneContainerd" @@ -44,8 +44,8 @@ Describe 'cse_install.sh' containerdPackage=$(readPackage "containerd") IS_KATA="true" When call installContainerRuntime - The output line 3 should equal "INFO: containerd package versions array is either empty or the first element is . 
Skipping containerd installation." - End + The output line 3 should equal "INFO: containerd package versions array is either empty or the first element is . Skipping containerd installation." + End # TODO(mheberling): In a ~month this will probably be removed when we use the standard containerd. It 'skips the containerd installation for AzureLinux with Kata' UBUNTU_RELEASE="" # mocking Mariner doesn't have command `lsb_release -cs` @@ -53,8 +53,8 @@ Describe 'cse_install.sh' containerdPackage=$(readPackage "containerd") IS_KATA="true" When call installContainerRuntime - The output line 3 should equal "INFO: containerd package versions array is either empty or the first element is . Skipping containerd installation." - End + The output line 3 should equal "INFO: containerd package versions array is either empty or the first element is . Skipping containerd installation." + End It 'returns expected output for successful installation of containerd in AzureLinux' UBUNTU_RELEASE="" # mocking AzureLinux doesn't have command `lsb_release -cs` OS="AZURELINUX" @@ -70,7 +70,7 @@ Describe 'cse_install.sh' installContainerdWithManifestJson() { echo "mock installContainerdWithManifestJson calling" } - When call installContainerRuntime + When call installContainerRuntime The output line 2 should equal "Package \"containerd\" does not exist in $COMPONENTS_FILEPATH." 
The output line 3 should equal "mock installContainerdWithManifestJson calling" End @@ -131,7 +131,7 @@ Describe 'cse_install.sh' End Describe 'extractKubeBinaries' - k8s_version="1.31.5" + k8s_version="1.31.5" is_private_url="false" k8s_downloads_dir="/opt/kubernetes/downloads" ORAS_REGISTRY_CONFIG_FILE=/etc/oras/config.yaml @@ -175,7 +175,7 @@ Describe 'cse_install.sh' export -f logs_to_events AfterEach 'cleanup' - It 'should use retrycmd_get_tarball_from_registry_with_oras to download kube binaries' + It 'should use retrycmd_get_tarball_from_registry_with_oras to download kube binaries' kube_binary_url="mcr.microsoft.com/oss/binaries/kubernetes/kubernetes-node:FakeTag" When call extractKubeBinaries $k8s_version $kube_binary_url $is_private_url $k8s_downloads_dir The status should be success @@ -203,7 +203,7 @@ Describe 'cse_install.sh' Describe 'installSecureTLSBootstrapClient' SECURE_TLS_BOOTSTRAP_CLIENT_BIN_DIR="bin" SECURE_TLS_BOOTSTRAP_CLIENT_DOWNLOAD_DIR="downloads" - CUSTOM_SECURE_TLS_BOOTSTRAPPING_CLIENT_DOWNLOAD_URL="https://packages/custom-client-binary-url.tar.gz" + CUSTOM_SECURE_TLS_BOOTSTRAP_CLIENT_URL="https://packages/custom-client-binary-url.tar.gz" sudo() { echo "sudo $@" @@ -225,11 +225,11 @@ Describe 'cse_install.sh' The status should be success End - It 'should return with a no-op if CUSTOM_SECURE_TLS_BOOTSTRAPPING_CLIENT_DOWNLOAD_URL is not set' + It 'should return with a no-op if CUSTOM_SECURE_TLS_BOOTSTRAP_CLIENT_URL is not set' ENABLE_SECURE_TLS_BOOTSTRAPPING="true" - CUSTOM_SECURE_TLS_BOOTSTRAPPING_CLIENT_DOWNLOAD_URL="" + CUSTOM_SECURE_TLS_BOOTSTRAP_CLIENT_URL="" When call installSecureTLSBootstrapClient - The output line 1 should equal "secure TLS bootstrapping is enabled but no custom client download URL was provided, nothing to download" + The output line 1 should equal "secure TLS bootstrapping is enabled but no custom client URL was provided, nothing to download" The status should be success End From 
38f7c172ce2141e9dfc15ddc77664101695df779 Mon Sep 17 00:00:00 2001 From: Alex Benn Date: Mon, 17 Nov 2025 14:37:26 -0500 Subject: [PATCH 55/87] Revert "chore: final fix for gb200 release branch (#7338)" This reverts commit 87f25335050b2e93503e494bd7547ee55cd8d7c7. --- aks-node-controller/parser/helper.go | 12 ++------- .../artifacts/secure-tls-bootstrap.service | 1 - .../cloud-init/artifacts/cse_install_spec.sh | 26 +++++++++---------- 3 files changed, 15 insertions(+), 24 deletions(-) diff --git a/aks-node-controller/parser/helper.go b/aks-node-controller/parser/helper.go index 1d5450810d9..99c69c7aa4f 100644 --- a/aks-node-controller/parser/helper.go +++ b/aks-node-controller/parser/helper.go @@ -203,19 +203,11 @@ func getCustomCACertsStatus(customCACerts []string) bool { return len(customCACerts) > 0 } -func getEnableSecureTLSBootstrap(bootstrapConfig *aksnodeconfigv1.BootstrappingConfig) bool { - // TODO: Change logic to default to false once Secure TLS Bootstrapping is complete +func getEnableSecureTLSBootstrapping(bootstrapConfig *aksnodeconfigv1.BootstrappingConfig) bool { + // TODO: Change logic to default to true once Secure TLS Bootstrapping is complete return bootstrapConfig.GetBootstrappingAuthMethod() == aksnodeconfigv1.BootstrappingAuthMethod_BOOTSTRAPPING_AUTH_METHOD_SECURE_TLS_BOOTSTRAPPING } -func getTLSBootstrapToken(bootstrapConfig *aksnodeconfigv1.BootstrappingConfig) string { - return bootstrapConfig.GetTlsBootstrappingToken() -} - -func getCustomSecureTLSBootstrapAADServerAppID(bootstrapConfig *aksnodeconfigv1.BootstrappingConfig) string { - return bootstrapConfig.GetCustomAadResource() -} - func getEnsureNoDupePromiscuousBridge(nc *aksnodeconfigv1.NetworkConfig) bool { return nc.GetNetworkPlugin() == aksnodeconfigv1.NetworkPlugin_NETWORK_PLUGIN_KUBENET && nc.GetNetworkPolicy() != aksnodeconfigv1.NetworkPolicy_NETWORK_POLICY_CALICO } diff --git a/parts/linux/cloud-init/artifacts/secure-tls-bootstrap.service 
b/parts/linux/cloud-init/artifacts/secure-tls-bootstrap.service index dabf26dfa15..7f2c880629d 100644 --- a/parts/linux/cloud-init/artifacts/secure-tls-bootstrap.service +++ b/parts/linux/cloud-init/artifacts/secure-tls-bootstrap.service @@ -16,7 +16,6 @@ ExecStart=/opt/bin/aks-secure-tls-bootstrap-client \ --cert-dir=/var/lib/kubelet/pki \ --cluster-ca-file=/etc/kubernetes/certs/ca.crt \ --log-file=/var/log/azure/aks/secure-tls-bootstrap.log \ - --deadline=120s \ $BOOTSTRAP_FLAGS [Install] diff --git a/spec/parts/linux/cloud-init/artifacts/cse_install_spec.sh b/spec/parts/linux/cloud-init/artifacts/cse_install_spec.sh index 868a30a128f..9fa8b4b94bc 100644 --- a/spec/parts/linux/cloud-init/artifacts/cse_install_spec.sh +++ b/spec/parts/linux/cloud-init/artifacts/cse_install_spec.sh @@ -21,7 +21,7 @@ Describe 'cse_install.sh' It 'returns expected output for successful installation of fake containerd in UBUNTU 20.04' UBUNTU_RELEASE="20.04" containerdPackage=$(readPackage "containerd") - When call installContainerRuntime + When call installContainerRuntime The variable containerdMajorMinorPatchVersion should equal "1.2.3" The variable containerdHotFixVersion should equal "" The output line 3 should equal "mock logs to events calling with AKS.CSE.installContainerRuntime.installStandaloneContainerd" @@ -31,7 +31,7 @@ Describe 'cse_install.sh' UBUNTU_RELEASE="" # mocking Mariner doesn't have command `lsb_release -cs` OS="MARINER" containerdPackage=$(readPackage "containerd") - When call installContainerRuntime + When call installContainerRuntime The variable containerdMajorMinorPatchVersion should equal "1.2.3" The variable containerdHotFixVersion should equal "5.fake" The output line 3 should equal "mock logs to events calling with AKS.CSE.installContainerRuntime.installStandaloneContainerd" @@ -44,8 +44,8 @@ Describe 'cse_install.sh' containerdPackage=$(readPackage "containerd") IS_KATA="true" When call installContainerRuntime - The output line 3 should equal "INFO: 
containerd package versions array is either empty or the first element is . Skipping containerd installation." - End + The output line 3 should equal "INFO: containerd package versions array is either empty or the first element is . Skipping containerd installation." + End # TODO(mheberling): In a ~month this will probably be removed when we use the standard containerd. It 'skips the containerd installation for AzureLinux with Kata' UBUNTU_RELEASE="" # mocking Mariner doesn't have command `lsb_release -cs` @@ -53,8 +53,8 @@ Describe 'cse_install.sh' containerdPackage=$(readPackage "containerd") IS_KATA="true" When call installContainerRuntime - The output line 3 should equal "INFO: containerd package versions array is either empty or the first element is . Skipping containerd installation." - End + The output line 3 should equal "INFO: containerd package versions array is either empty or the first element is . Skipping containerd installation." + End It 'returns expected output for successful installation of containerd in AzureLinux' UBUNTU_RELEASE="" # mocking AzureLinux doesn't have command `lsb_release -cs` OS="AZURELINUX" @@ -70,7 +70,7 @@ Describe 'cse_install.sh' installContainerdWithManifestJson() { echo "mock installContainerdWithManifestJson calling" } - When call installContainerRuntime + When call installContainerRuntime The output line 2 should equal "Package \"containerd\" does not exist in $COMPONENTS_FILEPATH." 
The output line 3 should equal "mock installContainerdWithManifestJson calling" End @@ -131,7 +131,7 @@ Describe 'cse_install.sh' End Describe 'extractKubeBinaries' - k8s_version="1.31.5" + k8s_version="1.31.5" is_private_url="false" k8s_downloads_dir="/opt/kubernetes/downloads" ORAS_REGISTRY_CONFIG_FILE=/etc/oras/config.yaml @@ -175,7 +175,7 @@ Describe 'cse_install.sh' export -f logs_to_events AfterEach 'cleanup' - It 'should use retrycmd_get_tarball_from_registry_with_oras to download kube binaries' + It 'should use retrycmd_get_tarball_from_registry_with_oras to download kube binaries' kube_binary_url="mcr.microsoft.com/oss/binaries/kubernetes/kubernetes-node:FakeTag" When call extractKubeBinaries $k8s_version $kube_binary_url $is_private_url $k8s_downloads_dir The status should be success @@ -203,7 +203,7 @@ Describe 'cse_install.sh' Describe 'installSecureTLSBootstrapClient' SECURE_TLS_BOOTSTRAP_CLIENT_BIN_DIR="bin" SECURE_TLS_BOOTSTRAP_CLIENT_DOWNLOAD_DIR="downloads" - CUSTOM_SECURE_TLS_BOOTSTRAP_CLIENT_URL="https://packages/custom-client-binary-url.tar.gz" + CUSTOM_SECURE_TLS_BOOTSTRAPPING_CLIENT_DOWNLOAD_URL="https://packages/custom-client-binary-url.tar.gz" sudo() { echo "sudo $@" @@ -225,11 +225,11 @@ Describe 'cse_install.sh' The status should be success End - It 'should return with a no-op if CUSTOM_SECURE_TLS_BOOTSTRAP_CLIENT_URL is not set' + It 'should return with a no-op if CUSTOM_SECURE_TLS_BOOTSTRAPPING_CLIENT_DOWNLOAD_URL is not set' ENABLE_SECURE_TLS_BOOTSTRAPPING="true" - CUSTOM_SECURE_TLS_BOOTSTRAP_CLIENT_URL="" + CUSTOM_SECURE_TLS_BOOTSTRAPPING_CLIENT_DOWNLOAD_URL="" When call installSecureTLSBootstrapClient - The output line 1 should equal "secure TLS bootstrapping is enabled but no custom client URL was provided, nothing to download" + The output line 1 should equal "secure TLS bootstrapping is enabled but no custom client download URL was provided, nothing to download" The status should be success End From 
6cf8d8474f0a4d13642c18afd1a91f2634d13506 Mon Sep 17 00:00:00 2001 From: Alex Benn Date: Mon, 17 Nov 2025 16:23:21 -0500 Subject: [PATCH 56/87] Remove leftover Ubuntu 1604 logic carried over into gb200 packer template --- vhdbuilder/packer/vhd-image-builder-arm64-gb200.json | 5 ----- 1 file changed, 5 deletions(-) diff --git a/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json b/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json index 5af82e75014..232446594a5 100644 --- a/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json +++ b/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json @@ -347,11 +347,6 @@ "source": "parts/linux/cloud-init/artifacts/sshd_config", "destination": "/home/packer/sshd_config" }, - { - "type": "file", - "source": "parts/linux/cloud-init/artifacts/sshd_config_1604", - "destination": "/home/packer/sshd_config_1604" - }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/rsyslog-d-60-CIS.conf", From 34f7697d282dbed95a1334c763ac8ef0763ba4a2 Mon Sep 17 00:00:00 2001 From: Alex Benn <62816975+abenn135@users.noreply.github.com> Date: Wed, 19 Nov 2025 11:25:18 -0500 Subject: [PATCH 57/87] feat: exact BOM versions (#7393) --- vhdbuilder/packer/gb200-mai-bom.json | 39 ++++++++++ vhdbuilder/packer/install-dependencies.sh | 71 ++++--------------- vhdbuilder/packer/packer_source.sh | 3 + .../packer/post-install-dependencies.sh | 7 +- .../packer/vhd-image-builder-arm64-gb200.json | 5 ++ 5 files changed, 66 insertions(+), 59 deletions(-) create mode 100644 vhdbuilder/packer/gb200-mai-bom.json diff --git a/vhdbuilder/packer/gb200-mai-bom.json b/vhdbuilder/packer/gb200-mai-bom.json new file mode 100644 index 00000000000..52f4aa37a7a --- /dev/null +++ b/vhdbuilder/packer/gb200-mai-bom.json @@ -0,0 +1,39 @@ +{ + "versions-wave1": { + "libxnvctrl0": "580.95.05-0ubuntu1", + "libnvidia-common-580": "580.95.05-0ubuntu1", + "libnvidia-cfg1-580": "580.95.05-0ubuntu1", + "libnvidia-gpucomp-580": "580.95.05-0ubuntu1", + "libnvidia-gl-580": 
"580.95.05-0ubuntu1", + "nvidia-firmware-580": "580.95.05-0ubuntu1", + "nvidia-dkms-580-open": "580.95.05-0ubuntu1", + "nvidia-kernel-common-580": "580.95.05-0ubuntu1", + "nvidia-kernel-source-580-open": "580.95.05-0ubuntu1", + "libnvidia-compute-580": "580.95.05-0ubuntu1", + "libnvidia-extra-580": "580.95.05-0ubuntu1", + "libnvidia-decode-580": "580.95.05-0ubuntu1", + "libnvidia-encode-580": "580.95.05-0ubuntu1", + "xserver-xorg-video-nvidia-580": "580.95.05-0ubuntu1", + "libnvidia-fbc1-580": "580.95.05-0ubuntu1", + "nvidia-driver-580-open": "580.95.05-0ubuntu1" + }, + "versions-wave2": { + "cuda-toolkit-13": "13.0.2-1", + "nvidia-container-toolkit": "1.18.0-1", + "datacenter-gpu-manager-exporter": "4.6.0-1", + "datacenter-gpu-manager-4-core": "1:4.4.1-1", + "datacenter-gpu-manager-4-proprietary": "1:4.4.1-1", + "datacenter-gpu-manager-4-cuda13": "1:4.4.1-1", + "datacenter-gpu-manager-4-proprietary-cuda13": "1:4.4.1-1", + "datacenter-gpu-manager-4-multinode": "1:4.4.1-1", + "datacenter-gpu-manager-4-multinode-cuda13": "1:4.4.1-1", + "libcap2-bin": "1:2.66-5ubuntu2.2", + "k8s-device-plugin": "0.17.3-ubuntu24.04u5", + "nvidia-imex": "580.95.05-1", + "doca-ofed": "3.1.0-091513" + }, + "doca-custom-repo": "https://linux.mellanox.com/public/repo/doca/3.1.0-091513/ubuntu24.04/arm64-sbsa/", + "kernel-versions": { + "linux-azure-nvidia": "6.14.0-1007.7" + } +} diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index a87aba28043..46357189ad3 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -742,66 +742,21 @@ options nouveau modeset=0 EOF update-initramfs -u - # 2. install GPU drivers - # The open series driver is required for the GB200 platform. Dmesg output - # will appear directing the reader away from the proprietary driver. 
The GPUs - # are also not visible in nvidia-smi output with the proprietary drivers - - # Install a local repository if a LOCAL_DOCA_REPO_URL is provided - if [ -n "${LOCAL_DOCA_REPO_URL}" ]; then - # Extract filename from URL path, removing query parameters - LOCAL_REPO_FILENAME=$(basename "${LOCAL_DOCA_REPO_URL%%\?*}") - - # Store files downloaded before curl command - BEFORE_FILES=$(ls /tmp/*.deb 2>/dev/null || echo "") - - curl --output-dir /tmp -O "${LOCAL_DOCA_REPO_URL}" - if [ $? -ne 0 ]; then - if [ "${CONTINUE_ON_LOCAL_REPO_DOWNLOAD_ERROR}" = "True" ]; then - echo "WARNING: Continuing despite error downloading package from ${LOCAL_DOCA_REPO_URL}." - else - echo "ERROR: Failed to download package from ${LOCAL_DOCA_REPO_URL}." - exit 1 - fi - else - # Find the newly downloaded file - AFTER_FILES=$(ls /tmp/*.deb 2>/dev/null || echo "") - DOWNLOADED_FILE=$(comm -13 <(echo "$BEFORE_FILES" | sort) <(echo "$AFTER_FILES" | sort) | head -1) - - # Use the detected file or fall back to the extracted filename - if [ -n "${DOWNLOADED_FILE}" ]; then - dpkg -i "${DOWNLOADED_FILE}" - else - dpkg -i "/tmp/${LOCAL_REPO_FILENAME}" - fi - - # Disable the online repository - mv /etc/apt/sources.list.d/doca-net.list /etc/apt/sources.list.d/doca-net.list.disabled - - apt update - fi + # 2. 
install drivers + BOM_PATH="gb200-mai-bom.json" + + # Install a custom repository if a doca-custom-repo is specified + DOCA_CUSTOM_REPO=$(jq -r '.["doca-custom-repo"]' $BOM_PATH) + if [ -n "$DOCA_CUSTOM_REPO" ]; then + mv /etc/apt/sources.list.d/doca-net.list /etc/apt/sources.list.d/doca-net.list.backup + echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/doca-net.pub] $DOCA_CUSTOM_REPO ./" > /etc/apt/sources.list.d/doca-net.list + apt-get update fi - apt install -y \ - nvidia-driver-580-open - - apt install -y \ - cuda-toolkit-13 \ - nvidia-container-toolkit \ - datacenter-gpu-manager-exporter \ - datacenter-gpu-manager-4-core \ - datacenter-gpu-manager-4-proprietary \ - datacenter-gpu-manager-4-cuda13 \ - datacenter-gpu-manager-4-proprietary-cuda13 \ - datacenter-gpu-manager-4-multinode-cuda13 \ - libcap2-bin \ - k8s-device-plugin - - apt install -y \ - nvidia-imex - - apt install -y \ - doca-ofed + # Farcically, nvidia-dkms-580-open cannot be installed together with the CUDA toolkit. Something about that package changes the build environment in an incompatible way. I've seen people mention CUDA including an old version of gcc that somehow makes its way onto the PATH... + # Therefore we install the GPU driver and its dependencies first, then install all downstream reverse-dependencies (CUDA, DCGM, and so forth) second. + sudo apt-get install -y $(jq -r '.["versions-wave1"] | to_entries[] | "\(.key)=\(.value)"' $BOM_PATH) + sudo apt-get install -y $(jq -r '.["versions-wave2"] | to_entries[] | "\(.key)=\(.value)"' $BOM_PATH) # 3. 
Add char device symlinks for NVIDIA devices mkdir -p "$(dirname /lib/udev/rules.d/71-nvidia-dev-char.rules)" diff --git a/vhdbuilder/packer/packer_source.sh b/vhdbuilder/packer/packer_source.sh index 771f5d760d7..4b0af8e5679 100644 --- a/vhdbuilder/packer/packer_source.sh +++ b/vhdbuilder/packer/packer_source.sh @@ -533,6 +533,9 @@ copyPackerFiles() { NVIDIA_MODPROBE_PARAMETERS_DEST=/etc/modprobe.d/nvidia.conf cpAndMode $NVIDIA_MODPROBE_PARAMETERS_SRC $NVIDIA_MODPROBE_PARAMETERS_DEST 644 + BOM_SRC=/home/packer/gb200-mai-bom.json + BOM_DEST=/opt/azure/containers/gb200-mai-bom.json + cpAndMode $BOM_SRC $BOM_DEST 644 fi fi diff --git a/vhdbuilder/packer/post-install-dependencies.sh b/vhdbuilder/packer/post-install-dependencies.sh index 67cccc26da3..3113d49569a 100644 --- a/vhdbuilder/packer/post-install-dependencies.sh +++ b/vhdbuilder/packer/post-install-dependencies.sh @@ -29,7 +29,7 @@ fi capture_benchmark "${SCRIPT_NAME}_source_packer_files_and_declare_variables" if [ $OS = $UBUNTU_OS_NAME ]; then - # We do not purge extra kernels from the Ubuntu 24.04 ARM image, since that image must dual-boot for GB200. + # We do not purge extra kernels from the Ubuntu 24.04 ARM images, since those images must dual-boot for GB200. if [ $CPU_ARCH != "arm64" ] || [ $UBUNTU_RELEASE != "24.04" ]; then # shellcheck disable=SC2021 current_kernel="$(uname -r | cut -d- -f-2)" @@ -39,6 +39,11 @@ if [ $OS = $UBUNTU_OS_NAME ]; then else dpkg --get-selections | grep -e "linux-\(headers\|modules\|image\)" | grep -v "linux-\(headers\|modules\|image\)-azure" | grep -v "$current_kernel" | tr -s '[[:space:]]' | tr '\t' ' ' | cut -d' ' -f1 | xargs -I{} apt-get --purge remove -yq {} fi + else + # However, for the 24.04 ARM images, we MUST have both -azure and -azure-nvidia kernels, so that we can run on either vanilla ARM64 hardware or GB200. 
+ if [ $(dpkg --get-selections | grep -c "linux-image") -lt 2 ]; then + echo "ERROR: Ubuntu 24.04 ARM image is missing either the -azure or -azure-nvidia kernel, cannot continue!" && exit 1 + fi fi # remove apport diff --git a/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json b/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json index 232446594a5..167d18b9c87 100644 --- a/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json +++ b/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json @@ -662,6 +662,11 @@ "source": "parts/linux/cloud-init/artifacts/ubuntu/modprobe-nvidia-parameters.conf", "destination": "/home/packer/modprobe-nvidia-parameters.conf" }, + { + "type": "file", + "source": "vhdbuilder/packer/gb200-mai-bom.json", + "destination": "/home/packer/gb200-mai-bom.json" + }, { "type": "shell", "inline": [ From 6b78d98fac5f5c57826906e23a655e576ad41be6 Mon Sep 17 00:00:00 2001 From: Alex Benn Date: Wed, 3 Dec 2025 14:16:45 -0500 Subject: [PATCH 58/87] Add more explicit deps to the GB200 BOM. 
--- vhdbuilder/packer/gb200-mai-bom.json | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vhdbuilder/packer/gb200-mai-bom.json b/vhdbuilder/packer/gb200-mai-bom.json index 52f4aa37a7a..bc5c60381e6 100644 --- a/vhdbuilder/packer/gb200-mai-bom.json +++ b/vhdbuilder/packer/gb200-mai-bom.json @@ -19,6 +19,8 @@ }, "versions-wave2": { "cuda-toolkit-13": "13.0.2-1", + "nvidia-container-toolkit-base": "1.18.0-1", + "libnvidia-container-tools": "1.18.0-1", "nvidia-container-toolkit": "1.18.0-1", "datacenter-gpu-manager-exporter": "4.6.0-1", "datacenter-gpu-manager-4-core": "1:4.4.1-1", @@ -30,6 +32,10 @@ "libcap2-bin": "1:2.66-5ubuntu2.2", "k8s-device-plugin": "0.17.3-ubuntu24.04u5", "nvidia-imex": "580.95.05-1", + "librdmacm-dev": "2507mlnx58-1.2507097.0214", + "libibverbs-dev": "2507mlnx58-1.2507097.0214", + "libibverbs1": "2507mlnx58-1.2507097.0214", + "ibverbs-providers": "2507mlnx58-1.2507097.0214", "doca-ofed": "3.1.0-091513" }, "doca-custom-repo": "https://linux.mellanox.com/public/repo/doca/3.1.0-091513/ubuntu24.04/arm64-sbsa/", From c1a88981e92c9d7c41bb8abaf689e4d54e289487 Mon Sep 17 00:00:00 2001 From: Alex Benn Date: Wed, 3 Dec 2025 15:11:46 -0500 Subject: [PATCH 59/87] fix: add libnvidia-container1 to bom --- vhdbuilder/packer/gb200-mai-bom.json | 1 + 1 file changed, 1 insertion(+) diff --git a/vhdbuilder/packer/gb200-mai-bom.json b/vhdbuilder/packer/gb200-mai-bom.json index bc5c60381e6..b4361983273 100644 --- a/vhdbuilder/packer/gb200-mai-bom.json +++ b/vhdbuilder/packer/gb200-mai-bom.json @@ -20,6 +20,7 @@ "versions-wave2": { "cuda-toolkit-13": "13.0.2-1", "nvidia-container-toolkit-base": "1.18.0-1", + "libnvidia-container1": "1.18.0-1", "libnvidia-container-tools": "1.18.0-1", "nvidia-container-toolkit": "1.18.0-1", "datacenter-gpu-manager-exporter": "4.6.0-1", From 416b2f4295f8a822b1cd4f8851da180e61405d99 Mon Sep 17 00:00:00 2001 From: Alex Benn Date: Wed, 3 Dec 2025 16:31:23 -0500 Subject: [PATCH 60/87] fix: allow downgrades. 
I don't know why we're installing ib packages earlier in the process but we are, and they are too new. --- vhdbuilder/packer/install-dependencies.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 46357189ad3..764d0e2c8bd 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -755,8 +755,8 @@ EOF # Farcically, nvidia-dkms-580-open cannot be installed together with the CUDA toolkit. Something about that package changes the build environment in an incompatible way. I've seen people mention CUDA including an old version of gcc that somehow makes its way onto the PATH... # Therefore we install the GPU driver and its dependencies first, then install all downstream reverse-dependencies (CUDA, DCGM, and so forth) second. - sudo apt-get install -y $(jq -r '.["versions-wave1"] | to_entries[] | "\(.key)=\(.value)"' $BOM_PATH) - sudo apt-get install -y $(jq -r '.["versions-wave2"] | to_entries[] | "\(.key)=\(.value)"' $BOM_PATH) + sudo apt-get install -y --allow-downgrades $(jq -r '.["versions-wave1"] | to_entries[] | "\(.key)=\(.value)"' $BOM_PATH) + sudo apt-get install -y --allow-downgrades $(jq -r '.["versions-wave2"] | to_entries[] | "\(.key)=\(.value)"' $BOM_PATH) # 3. 
Add char device symlinks for NVIDIA devices mkdir -p "$(dirname /lib/udev/rules.d/71-nvidia-dev-char.rules)" From 06c7c53adfca10a6436f3b761703d0366fcd5efe Mon Sep 17 00:00:00 2001 From: Keith <153014933+keith-ms@users.noreply.github.com> Date: Wed, 10 Dec 2025 12:31:14 -0600 Subject: [PATCH 61/87] Merge changes from `main` into `release-gb200` (#7491) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ervin Rácz Signed-off-by: dependabot[bot] Co-authored-by: Tim Wright Co-authored-by: Cameron Meissner Co-authored-by: Devinwong Co-authored-by: Ervin Racz <39372002+ervcz@users.noreply.github.com> Co-authored-by: James Le Cuirot Co-authored-by: aks-node-assistant[bot] <190555641+aks-node-assistant[bot]@users.noreply.github.com> Co-authored-by: Zachary <123345317+zachary-bailey@users.noreply.github.com> Co-authored-by: Alex Benn <62816975+abenn135@users.noreply.github.com> Co-authored-by: Sri Harsha Co-authored-by: Jeremi Piotrowski Co-authored-by: Ganeshkumar Ashokavardhanan <35557827+ganeshkumarashok@users.noreply.github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> Co-authored-by: lilypan26 Co-authored-by: Nishchay Co-authored-by: Xinhe Li Co-authored-by: r2k1 Co-authored-by: Qi Ni Co-authored-by: Andrew Beltrano <2082148+abeltrano@users.noreply.github.com> --- .pipelines/.vsts-vhd-builder-release.yaml | 1 + e2e/vmss.go | 3 +++ vhdbuilder/packer/vhd-image-builder-arm64-gb200.json | 9 +++++++-- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/.pipelines/.vsts-vhd-builder-release.yaml b/.pipelines/.vsts-vhd-builder-release.yaml index a3ed0525086..549866ab71e 100644 --- a/.pipelines/.vsts-vhd-builder-release.yaml +++ b/.pipelines/.vsts-vhd-builder-release.yaml @@ -973,6 +973,7 @@ stages: dependsOn: [] jobs: - job: build2404arm64gb200gen2containerd + condition: eq('${{ 
parameters.build2404arm64gb200gen2containerd }}', true) timeoutInMinutes: 180 steps: - bash: | diff --git a/e2e/vmss.go b/e2e/vmss.go index 4abf14f8931..3b758c4294e 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -789,6 +789,9 @@ func extractLogsFromVMLinux(ctx context.Context, s *Scenario, vm *ScenarioVM) er if s.SecureTLSBootstrappingEnabled() { commandList["secure-tls-bootstrap.log"] = "sudo cat /var/log/azure/aks/secure-tls-bootstrap.log" } + if s.SecureTLSBootstrappingEnabled() { + commandList["secure-tls-bootstrap.log"] = "sudo cat /var/log/azure/aks/secure-tls-bootstrap.log" + } isAzureCNI, err := s.Runtime.Cluster.IsAzureCNI() if err == nil && isAzureCNI { diff --git a/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json b/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json index 167d18b9c87..2910e64ff73 100644 --- a/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json +++ b/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json @@ -60,6 +60,7 @@ }, "location": "{{user `location`}}", "vm_size": "{{user `vm_size`}}", + "use_azure_cli_auth": "true", "polling_duration_timeout": "1h", "managed_image_storage_account_type": "Premium_LRS", "shared_image_gallery_destination": { @@ -68,7 +69,9 @@ "gallery_name": "{{user `sig_gallery_name`}}", "image_name": "{{user `sig_image_name`}}", "image_version": "{{user `captured_sig_version`}}", - "replication_regions": ["{{user `location`}}"] + "replication_regions": [ + "{{user `location`}}" + ] }, "user_assigned_managed_identities": "{{user `msi_resource_strings`}}" } @@ -694,7 +697,9 @@ }, { "type": "shell", - "inline": ["sudo rm /var/log/bcc_installation.log"] + "inline": [ + "sudo rm /var/log/bcc_installation.log" + ] }, { "type": "shell", From 7c1dc5284c648af77e568f274768dcc901c06a44 Mon Sep 17 00:00:00 2001 From: Alex Benn <62816975+abenn135@users.noreply.github.com> Date: Wed, 28 Jan 2026 12:47:58 -0500 Subject: [PATCH 62/87] Merge main changes into release-gb200 branch rolling up through Jan 27 (#7746) MIME-Version: 
1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ervin Rácz Signed-off-by: dependabot[bot] Signed-off-by: Liunardy <54991798+Liunardy@users.noreply.github.com> Signed-off-by: Billy Zha Signed-off-by: James Le Cuirot Signed-off-by: Jeremi Piotrowski Co-authored-by: Tim Wright Co-authored-by: Cameron Meissner Co-authored-by: Devinwong Co-authored-by: Ervin Racz <39372002+ervcz@users.noreply.github.com> Co-authored-by: James Le Cuirot Co-authored-by: aks-node-assistant[bot] <190555641+aks-node-assistant[bot]@users.noreply.github.com> Co-authored-by: Zachary <123345317+zachary-bailey@users.noreply.github.com> Co-authored-by: Sri Harsha Co-authored-by: Jeremi Piotrowski Co-authored-by: Ganeshkumar Ashokavardhanan <35557827+ganeshkumarashok@users.noreply.github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> Co-authored-by: lilypan26 Co-authored-by: Nishchay Co-authored-by: Xinhe Li Co-authored-by: r2k1 Co-authored-by: Qi Ni Co-authored-by: Andrew Beltrano <2082148+abeltrano@users.noreply.github.com> Co-authored-by: Mitch Zhu Co-authored-by: smiezah-msft Co-authored-by: Sylvain Boily <4981802+djsly@users.noreply.github.com> Co-authored-by: Martin Heberling Co-authored-by: Andy Zhang Co-authored-by: Henry Li <69694695+henryli001@users.noreply.github.com> Co-authored-by: Liunardy <54991798+Liunardy@users.noreply.github.com> Co-authored-by: Mark Ibrahim Co-authored-by: Calvin Shum Co-authored-by: C.YAO Co-authored-by: OBULAPURAM VENKATA ASHOK KUMAR <87068742+ashokobulapuram7@users.noreply.github.com> Co-authored-by: Billy Zha Co-authored-by: Ben Brady <86134744+benjamin-brady@users.noreply.github.com> Co-authored-by: jiashun0011 Co-authored-by: Henry Beberman Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Ganeshkumar Ashokavardhanan Co-authored-by: Copilot 
<198982749+Copilot@users.noreply.github.com> Co-authored-by: r2k1 <2599261+r2k1@users.noreply.github.com> --- .github/copilot-instructions.md | 298 ------------------ .../.builder-release-template-windows.yaml | 5 + e2e/scenario_win_test.go | 58 ++++ .../artifacts/ubuntu/cse_helpers_ubuntu.sh | 2 + 4 files changed, 65 insertions(+), 298 deletions(-) delete mode 100644 .github/copilot-instructions.md diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md deleted file mode 100644 index f3b28183f3c..00000000000 --- a/.github/copilot-instructions.md +++ /dev/null @@ -1,298 +0,0 @@ -# Overview - -AgentBaker repo has 2 main services discussed below: - -- VHD Builder -- AgentBaker Service - -## VHD Builder - -It builds VHDs using Packer for base OS: Windows, Azure Linux/Mariner and Ubuntu. For each OS there are multiple supported versions (windows 2019, 2022, ubuntu 2004, 2204 etc). The VHDs are base images for a node in an aks cluster. - -VHDs are built using [Packer](https://developer.hashicorp.com/packer/docs) in [vhdbuilder](./vhdbuilder/). - -Windows VHD is configured through [VHD](./vhdbuilder/packer/windows/windows-vhd-configuration.ps1) - -## AgentBaker Service - -[apiserver](./apiserver/) is `go` based webserver. It receives request from external client and generates CSE and CustomData to be used on the VHD when a new node is created / provisioned. - -windows generates its CSE package using [script](./parts/windows/kuberneteswindowssetup.ps1). - -The webserver is also used to determine the latest version of Linux VHDs available for provisioning within AKS clusters. - -## Code Structure - -[parts](./parts/) serves both AgentBaker Service and VHD build. AgentBaker service and VHDs are coupled because of this shared component. When building VHD, packer maps and renames scripts from [parts](./parts/) depending on the OS / versions. The mappings can be found at [packer](./vhdbuilder/packer/). 
- -Windows uses a different folder [cse](./staging/cse/windows/) for almost the same purpose. There are subtle differences as windows CSEs can be downloaded as a zip file during provisioning time due to restrictions on the file size on Windows system, while for linux based systems the cse/custom data are dropped in during provisioning time. - -## Deployment and Release - -The VHD build is triggered by Azure Devops [pipelines](.pipelines/). For release, the pipelines following the same templates for different OS versions: - -- [linux/ubuntu](./.pipelines/templates/.builder-release-template.yaml) -- [windows](./.pipelines/templates/.builder-release-template-windows.yaml) - -you can reason the steps by following the steps defined in the pipeline. - -Tags of AgentBaker and corresponding Linux VHDs are released every week. Linux VHDs are built with a particular image version in the YYYYMM.DD.PATCH format. All Linux VHD versions correspond to a particular tag of the AgentBaker go module. AgentBaker go module tags follow the format v0.YYYYMMDD.PATCH. The mapping between AgentBaker tag and Linux VHD version is defined within [linux_sig_version.json](./pkg/agent/datamodel/linux_sig_version.json). - -Windows VHD are released separately, following windows patch tuesday schedule. - -## Guidelines - -### SRE Guidelines - -The operational goals of this project are: - -- achieve consistency across different OS as much as possible -- avoid functional regression when introducing new features (component updates, new drivers, new binaries), ensure that all supported OS / versions are tested -- avoid VHD build performance regressions when making any changes -- avoid node provisioning performance regression when making any changes - -When making changes, reason whether the file is used in VHD building stage, or provision stage, or both. Make sure the changes are valid in its life stage. 
as an example, [windows-vhd-configuration.ps1](./vhdbuilder/packer/windows/windows-vhd-configuration.ps1) defines container images to be cached in VHD, while [configure-windows-vhd.ps1](./vhdbuilder/packer/windows/configure-windows-vhd.ps1) executes commands at provision time. - -One way to debug / explore / just for fun is to run [e2e](./e2e/) tests. To run locally, follow the readme file under that folder. - -The SRE guidelines ground other coding guidelines and practices. - -### Golang Guidelines - -- Follow Go best practice -- Use vanilla go test framework - -### PowerShell Guidelines - -- follow PowerShell best practices - -### ShellScripts Guidelines - -- use shellcheck for sanity checking -- use ShellSpec for testing -- the shell scripts are used on both azure linux/mariner and ubuntu and cross platform portability is critical. -- when using functions defined in other files, ensure it is sourced properly. -- use local variables rather than constants when their scoping allows for it. -- avoid using variables declared inside another function, even they are visible. It is hard to reason and might introduce subtle bugs. - -## Pull Request Review Guidelines - -When reviewing pull requests, perform breaking change analysis to prevent regressions. VHDs remain in production for 6 months, so backward compatibility is critical. - -**Review Approach**: Focus on high-level architecture, security vulnerabilities, and logic bugs. Apply deep reasoning similar to advanced models (e.g., Claude Opus) - don't just pattern match, but truly understand the code's intent, dependencies, and potential failure modes. - -### Breaking Change Detection - -Analyze PRs for these compatibility scenarios: - -**1. 
Linux Provisioning Script Changes** -- **Context**: Scripts in `parts/linux/cloud-init/artifacts/` run during critical VM bootstrap and are used in both: - - VHD build (uploaded via packer configs in `vhdbuilder/packer/*.json`) - - VM provisioning (CSE - embedded in Go service via `pkg/agent/const.go`) - - Versions synchronized via `pkg/agent/datamodel/linux_sig_version.json` -- **What to check**: Changes that could break VM provisioning in production -- **Breaking signals**: - - **Script logic errors**: Syntax errors, wrong commands, incorrect flags, broken pipes - - **Dependency issues**: - - Calling functions before they're sourced - - Using variables declared in other functions - - Removing `source` statements that break dependency chains - - **Cross-distro compatibility**: - - Commands that don't work on both Ubuntu and Azure Linux/Mariner (check distro-specific variants: `ubuntu/`, `mariner/`) - - Package manager assumptions (apt vs dnf/tdnf) - - Missing OS-specific conditional logic - - **External dependency violations**: - - NEW: Downloading from internet URLs not in `parts/common/components.json` or allowed sources (packages.aks.azure.com) - - All external dependencies MUST be referenced in `parts/common/components.json` for Renovate updates - - Only allowed runtime downloads: packages.aks.azure.com or other explicitly allowed sources in CSE - - **Function signature changes**: Parameters, return values, exit codes that break callers - - **Missing test coverage**: Changes to provisioning logic without corresponding e2e tests - - **Forward and backward compatibility**: Keep compatibility across the 6-month VHD support window in both directions. - - **Backward**: Newer VHDs must still work with older CSE scripts delivered via CRP custom data. - - Example: PR #7866 restored `cni-plugins` dependency + install logic after a removal caused provisioning failures (exit 206) when old scripts ran on newer VHDs. 
- - **Forward**: Newer CSE script changes must not require components/features that exist only on newer VHDs, unless the logic to detect and handle missing features is implemented. - - -**2. Windows Bidirectional Compatibility** -- **Context**: Windows VHD and CSE scripts release on different cadences with no guaranteed order -- **What to check**: Changes to `staging/cse/windows/` (CSE scripts) or `vhdbuilder/packer/windows/` (VHD scripts). -- **Breaking signals**: - - New CSE scripts using features in the VHD Scripts that aren't present before this PR. - - New VHD scripts expecting features in the CSE scripts that aren't present before this PR. - - Changes to shared state (registry keys, files, environment variables) that break coordination - - Removing PowerShell functions or cmdlets that the other component might call - - Incompatibilities between newer versions of the CSE scripts and older versions of the VHD scripts are critical to detect as they can cause production outages. - -**3. aks-node-controller Migration (Dual-Mode Support)** -- **Context**: Transitioning from uploading scripts during both VHD build and CSE to only uploading aks-node-controller during VHD build -- **What to check**: Any changes must work in BOTH deployment modes -- **Breaking signals**: - - Assumptions that scripts are always uploaded during CSE (new mode won't do this) - - Assumptions that aks-node-controller is always present (old VHDs won't have it) - - Missing feature detection to determine which mode is running - - Hardcoded paths that differ between deployment modes - -**4. Cross-OS Compatibility** -- **What to check**: Changes work on Ubuntu, Azure Linux/Mariner, and Windows -- **Breaking signals**: - - Linux commands that don't work on both Ubuntu and Azure Linux/Mariner - - Missing conditional logic for OS-specific behaviors - - Package manager assumptions (apt vs dnf/tdnf) - - Systemd differences between distributions - -**5. 
Package/Dependency Update PRs (Renovate)** -- **Context**: Renovate bot automatically creates PRs to update component versions in `parts/common/components.json`. These components are cached on VHDs during build and directly affect node stability, GPU workloads, networking, and security. Updated packages are downloaded from `packages.aks.azure.com` or upstream registries during VHD build. -- **What to check**: Every version bump—even patch versions—can introduce regressions that affect production nodes. -- **`renovate.json` syntax guardrails**: - 1. Keep the file valid JSON (double quotes only, no comments, no trailing commas). - 2. When editing arrays like `assignees`, `reviewers`, `matchPackageNames`, and `matchUpdateTypes`, preserve comma placement and avoid duplicate entries. - 3. Keep schema-compatible key casing and value types (for example `enabled` as boolean, `prHourlyLimit` as number, `labels` as string array). - 4. In `packageRules`, preserve specific-to-generic ordering so narrow matchers are not shadowed by broader rules. - 5. For regex fields (`versioning`, `extractVersion`, `matchCurrentVersion`, custom manager `matchStrings`), escape backslashes correctly for JSON strings. - 6. For custom manager templates, keep Renovate template tokens intact (`{{{newValue}}}`, `{{#if ...}}`, `{{/if}}`) and avoid converting them to normal JSON interpolation. - 7. If modifying identity lists in `assignees` or `reviewers`, update all related grouped rules consistently to avoid ownership drift. - 8. On GitHub, both `assignees` and `reviewers` may include team handles using the `team:` format; if used, ensure the team exists in the AKS org and has at least read permission to the AgentBaker repo. - 9. Keep `minor` updates disabled by default; only allow minor updates through explicit, narrow `packageRules` (avoid broad datasource/wildcard exceptions). 
Example context: https://github.com/Azure/AgentBaker/pull/7898 (broad matching led to unintended cross-stream minor jumps). - 10. Never combine `matchUpdateTypes` and `allowedVersions` in the same `packageRules` entry — Renovate rejects this. Put `allowedVersions` in a separate rule; Renovate merges all matching rules. Context: #8420. -- **Analysis steps for every package update PR**: - 1. **Identify the component and version change**: Parse the diff in `parts/common/components.json` to extract exact old → new versions for each OS/release entry. - 2. **Determine the update type**: Classify as major, minor, or patch using semver. Major and minor updates carry higher risk than patch updates. - 3. **Research upstream changelog**: Look up the project's release notes, changelog, or GitHub releases to understand what changed between the old and new versions. Summarize: - - New features introduced - - Bug fixes included - - Breaking changes or deprecations - - Security fixes (CVEs patched) - 4. **Assess OS coverage**: Check if the update covers all OS variants where the component is used (Ubuntu 22.04, 24.04, Azure Linux 3.0, etc.). Flag if some OS entries are updated but others are not — partial updates can cause inconsistency across node pools. - 5. **Evaluate VHD size impact**: For components downloaded as binaries or packages, consider whether the new version significantly increases VHD size. Large size increases can affect VHD build time and storage costs. - 6. **Check for configuration or API changes**: If the component exposes configuration files, CLI flags, systemd units, or APIs consumed by CSE scripts, verify that the update doesn't change defaults or remove options that provisioning scripts depend on. - 7. **Verify download URL validity**: Confirm that the `downloadLocation` and `downloadURIs` structure in components.json remains valid for the new version. New versions sometimes change the artifact naming convention or repository layout. 
- -- **Risk assessment for package updates**: - - 🔴 **High Risk**: Major version bumps, components critical to node boot (kubelet, containerd, runc), GPU drivers (nvidia-driver, dcgm-exporter), or networking (azure-cni, cilium). Also high risk if upstream changelog mentions breaking changes or behavioral changes. - - 🟡 **Medium Risk**: Minor version bumps of non-critical components, updates that only affect specific OS variants, or updates where upstream changelog shows feature additions that could subtly change behavior. - - 🟢 **Low Risk**: Patch version bumps with only bug fixes or security patches, no breaking changes in upstream changelog, and full OS coverage. - -- **Review output for package update PRs must include a detailed version diff analysis**: - - **Header:** - ``` - ## Package Update Analysis: - **Version change**: X.Y.Z → A.B.C ( update) - **OS variants affected**: Ubuntu 22.04, Ubuntu 24.04, Azure Linux 3.0 (list all) - **OS variants NOT updated**: - ``` - - **Detailed changelog between versions:** - Use web search, GitHub releases, or upstream project documentation to find the exact differences between the old and new version. Present each change as a line item with its own risk tag: - - ``` - ### Changes between X.Y.Z and A.B.C - - | Change | Description | Risk | - |--------|-------------|------| - | Feature | | 🟢 Low / 🟡 Medium / 🔴 High | - | Bug fix | | 🟢 Low / 🟡 Medium / 🔴 High | - | Breaking | | 🔴 High | - | Security | CVE-YYYY-XXXXX: | 🟢 Low / 🟡 Medium / 🔴 High | - | Deprecation | | 🟡 Medium / 🔴 High | - | Config change | | 🟡 Medium / 🔴 High | - | Performance | | 🟢 Low / 🟡 Medium | - ``` - - For each individual change, assess risk by considering: - - Does it alter runtime behavior on AKS nodes? - - Does it change CLI flags, config file formats, or systemd unit behavior that CSE scripts depend on? - - Does it affect GPU workloads, networking, container runtime, or kubelet interaction? 
- - Could it increase binary size significantly (VHD bloat)? - - Does it introduce new system dependencies or kernel requirements? - - **If upstream changelog is unavailable**, explicitly state: _"Upstream changelog not found for this version range. Manual testing recommended before merge."_ - - **Overall risk assessment:** - ``` - ### Overall Risk: 🟢 Low / 🟡 Medium / 🔴 High - **Justification**: <1-2 sentence summary of why this risk level was chosen> - **Recommendation**: Approve / Request more info / Flag for manual testing - ``` - - **Example** (for a PR like dcgm-exporter 4.7.1 → 4.8.0): - ``` - ## Package Update Analysis: dcgm-exporter - **Version change**: 4.7.1 → 4.8.0 (minor update) - **OS variants affected**: Ubuntu 22.04, Ubuntu 24.04 - **OS variants NOT updated**: Azure Linux 3.0 (still on 4.7.1-1.azl3) — flag for follow-up - - ### Changes between 4.7.1 and 4.8.0 - | Change | Description | Risk | - |--------|-------------|------| - | Feature | Added support for new DCGM field IDs for Blackwell GPUs | 🟢 Low | - | Feature | New metrics endpoint configuration options | 🟡 Medium | - | Bug fix | Fixed memory leak in long-running metric collection | 🟢 Low | - | Deprecation | Removed legacy CSV export format | 🟡 Medium | - - ### Overall Risk: 🟡 Medium - **Justification**: Minor version bump of GPU monitoring component. No breaking changes to core metrics pipeline, but Azure Linux 3.0 is not updated which creates version skew across OS variants. - **Recommendation**: Approve, but file follow-up issue for Azure Linux 3.0 alignment. - ``` - -### Analysis Approach - -**Dynamic Dependency Tracing**: -1. For each changed file, identify what depends on it -2. Follow `source` statements in bash scripts to trace dependency chains -3. Check for function calls, variable references across files -4. Look for hardcoded paths in VHD build scripts (`vhdbuilder/packer/`) that reference changed files -5. Trace through as many levels as needed within the codebase -6. 
**Check external dependencies**: - - Search for new URLs being downloaded (curl, wget, etc.) - - Verify all external dependencies are in `parts/common/components.json` for Renovate updates - - Flag downloads from unauthorized sources (only packages.aks.azure.com and sources in components.json allowed) - -**Historical Context**: -- Look for related changes that previously caused issues -- Identify patterns of fragile areas that break frequently - -**Test Coverage Assessment**: -- Note if changed code has e2e test coverage -- Flag changes to untested areas as higher risk -- Mention if new behavior lacks corresponding test additions - -### Review Output Format - -Provide targeted inline comments on specific lines where you detect issues: - -**For each breaking change or risk:** -- Comment directly on the problematic line or code block -- Explain why this is risky (e.g., "This removes function X which may be called by VHDs built in the last 6 months") -- Suggest specific mitigations or alternatives -- Include actionable next steps (e.g., "Verify this function is not used by checking references in `vhdbuilder/packer/`") - -**Risk indicators to include:** - -- **Severity** (pick one): - - 🔴 **High Risk** — Could break production VM provisioning, cause node failures, or introduce security vulnerabilities - - 🟡 **Medium Risk** — Could cause issues in specific configurations, edge cases, or degrade performance - - 🟢 **Low Risk** — Unlikely to cause issues but worth noting for awareness - -- **Category** (pick one): - - 🔧 **Script Logic** — Syntax errors, incorrect commands, broken control flow, wrong exit codes - - 🖥️ **Cross-OS** — Incompatibility between Ubuntu, Azure Linux/Mariner, or Windows - - 🌐 **External Dependency** — Unauthorized downloads, missing components.json entries, broken URLs - - 🧪 **Test Coverage** — Missing or insufficient test coverage for changed behavior - - 📦 **Package Update** — Component version changes, upstream regressions, VHD size impact - - 🔄 
**Backward Compatibility** — Breaking changes affecting VHDs in production (6-month window) - - 🔒 **Security** — Credential exposure, privilege escalation, insecure defaults - - ⚡ **Performance** — VHD build time regression, node provisioning latency increase - - 🏗️ **Architecture** — Structural changes affecting multiple components or deployment modes - -**Only comment when you have substantive findings** - avoid noise on trivial or obviously safe changes. - -### Review Philosophy - -Think like an experienced reviewer who "eyeballs" PRs for subtle risks. Look beyond pattern matching: -- Understand the architecture and how components interact -- Consider timing of releases and deployment sequences -- Reason about implicit dependencies and assumptions -- Flag changes that "feel risky" even without obvious red flags -- Balance thoroughness with actionable feedback -- Focus on high-impact issues that could break production VM provisioning diff --git a/.pipelines/templates/.builder-release-template-windows.yaml b/.pipelines/templates/.builder-release-template-windows.yaml index 5da7c54e949..4b351dbddd6 100644 --- a/.pipelines/templates/.builder-release-template-windows.yaml +++ b/.pipelines/templates/.builder-release-template-windows.yaml @@ -65,6 +65,11 @@ steps: echo "##vso[task.setvariable variable=MODE]windowsVhdMode" displayName: Set DRY_RUN and MODE variables + - bash: | + echo "##vso[task.setvariable variable=DRY_RUN]${{ parameters.dryrun }}" + echo "##vso[task.setvariable variable=MODE]windowsVhdMode" + displayName: Set DRY_RUN and MODE variables + - template: ./.template-override-components-json.yaml parameters: overrideBranch: ${{ parameters.overrideBranch }} diff --git a/e2e/scenario_win_test.go b/e2e/scenario_win_test.go index 2ece49b1c7b..2646405ee0d 100644 --- a/e2e/scenario_win_test.go +++ b/e2e/scenario_win_test.go @@ -651,3 +651,61 @@ func Test_NetworkIsolatedCluster_Windows_OrasDownload(t *testing.T) { }, }) } + +func 
Test_Windows2022_McrChinaCloud_Windows(t *testing.T) { + RunScenario(t, &Scenario{ + Tags: Tags{ + MockAzureChinaCloud: true, + }, + Description: "Windows Server 2022 Azure Network Containerd - v1 to test Azure China Cloud MCR host", + Config: Config{ + Cluster: ClusterAzureNetwork, + VHD: config.VHDWindows2022Containerd, + VMConfigMutator: EmptyVMConfigMutator, + BootstrapConfigMutator: EmptyBootstrapConfigMutator, + Validator: func(ctx context.Context, s *Scenario) { + ValidateFileExists(ctx, s, `C:\ProgramData\containerd\certs.d\docker.io\hosts.toml`) + ValidateFileExists(ctx, s, `C:\ProgramData\containerd\certs.d\mcr.azk8s.cn\hosts.toml`) + ValidateFileHasContent(ctx, s, + `C:\ProgramData\containerd\certs.d\docker.io\hosts.toml`, + `https://docker.io`) + ValidateFileHasContent(ctx, s, + `C:\ProgramData\containerd\certs.d\mcr.azk8s.cn\hosts.toml`, + `https://mcr.azk8s.cn`) + }, + }, + }) +} + +func Test_Windows2025Gen2_McrChinaCloud_Windows(t *testing.T) { + RunScenario(t, &Scenario{ + Tags: Tags{ + MockAzureChinaCloud: true, + }, + Description: "Windows Server 2025 with Containerd - hyperv gen 2 to test Azure China Cloud MCR host", + Config: Config{ + Cluster: ClusterAzureNetwork, + VHD: config.VHDWindows2025Gen2, + VMConfigMutator: EmptyVMConfigMutator, + BootstrapConfigMutator: func(configuration *datamodel.NodeBootstrappingConfiguration) { + Windows2025BootstrapConfigMutator(t, configuration) + }, + Validator: func(ctx context.Context, s *Scenario) { + ValidateWindowsVersionFromWindowsSettings(ctx, s, "2025-gen2") + ValidateWindowsProductName(ctx, s, "Windows Server 2025 Datacenter") + ValidateWindowsDisplayVersion(ctx, s, "24H2") + ValidateFileHasContent(ctx, s, "/k/kubeletstart.ps1", "--container-runtime=remote") + ValidateWindowsProcessHasCliArguments(ctx, s, "kubelet.exe", []string{"--rotate-certificates=true", "--client-ca-file=c:\\k\\ca.crt"}) + ValidateCiliumIsNotRunningWindows(ctx, s) + ValidateFileExists(ctx, s, 
`C:\ProgramData\containerd\certs.d\docker.io\hosts.toml`) + ValidateFileExists(ctx, s, `C:\ProgramData\containerd\certs.d\mcr.azk8s.cn\hosts.toml`) + ValidateFileHasContent(ctx, s, + `C:\ProgramData\containerd\certs.d\docker.io\hosts.toml`, + `https://docker.io`) + ValidateFileHasContent(ctx, s, + `C:\ProgramData\containerd\certs.d\mcr.azk8s.cn\hosts.toml`, + `https://mcr.azk8s.cn`) + }, + }, + }) +} diff --git a/parts/linux/cloud-init/artifacts/ubuntu/cse_helpers_ubuntu.sh b/parts/linux/cloud-init/artifacts/ubuntu/cse_helpers_ubuntu.sh index d65b5874bcd..72bb85c2caa 100755 --- a/parts/linux/cloud-init/artifacts/ubuntu/cse_helpers_ubuntu.sh +++ b/parts/linux/cloud-init/artifacts/ubuntu/cse_helpers_ubuntu.sh @@ -110,6 +110,8 @@ _apt_get_install() { wait_for_apt_locks DEBIAN_FRONTEND=noninteractive apt-get clean wait_for_apt_locks + DEBIAN_FRONTEND=noninteractive apt-get clean + wait_for_apt_locks return 0 fi From 61731d04cc63ddeabdcbdd77da7de18c337ffc35 Mon Sep 17 00:00:00 2001 From: Alex Benn <62816975+abenn135@users.noreply.github.com> Date: Wed, 28 Jan 2026 14:20:42 -0500 Subject: [PATCH 63/87] Abenn135/gb300 bom (#7748) --- vhdbuilder/packer/gb200-mai-bom.json | 36 +++++++++---------- vhdbuilder/packer/pre-install-dependencies.sh | 12 +++++-- 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/vhdbuilder/packer/gb200-mai-bom.json b/vhdbuilder/packer/gb200-mai-bom.json index b4361983273..9ec176f4799 100644 --- a/vhdbuilder/packer/gb200-mai-bom.json +++ b/vhdbuilder/packer/gb200-mai-bom.json @@ -1,21 +1,21 @@ { "versions-wave1": { - "libxnvctrl0": "580.95.05-0ubuntu1", - "libnvidia-common-580": "580.95.05-0ubuntu1", - "libnvidia-cfg1-580": "580.95.05-0ubuntu1", - "libnvidia-gpucomp-580": "580.95.05-0ubuntu1", - "libnvidia-gl-580": "580.95.05-0ubuntu1", - "nvidia-firmware-580": "580.95.05-0ubuntu1", - "nvidia-dkms-580-open": "580.95.05-0ubuntu1", - "nvidia-kernel-common-580": "580.95.05-0ubuntu1", - "nvidia-kernel-source-580-open": "580.95.05-0ubuntu1", 
- "libnvidia-compute-580": "580.95.05-0ubuntu1", - "libnvidia-extra-580": "580.95.05-0ubuntu1", - "libnvidia-decode-580": "580.95.05-0ubuntu1", - "libnvidia-encode-580": "580.95.05-0ubuntu1", - "xserver-xorg-video-nvidia-580": "580.95.05-0ubuntu1", - "libnvidia-fbc1-580": "580.95.05-0ubuntu1", - "nvidia-driver-580-open": "580.95.05-0ubuntu1" + "libxnvctrl0": "580.105.08-0ubuntu1", + "libnvidia-common-580": "580.105.08-0ubuntu1", + "libnvidia-cfg1-580": "580.105.08-0ubuntu1", + "libnvidia-gpucomp-580": "580.105.08-0ubuntu1", + "libnvidia-gl-580": "580.105.08-0ubuntu1", + "nvidia-firmware-580": "580.105.08-0ubuntu1", + "nvidia-dkms-580-open": "580.105.08-0ubuntu1", + "nvidia-kernel-common-580": "580.105.08-0ubuntu1", + "nvidia-kernel-source-580-open": "580.105.08-0ubuntu1", + "libnvidia-compute-580": "580.105.08-0ubuntu1", + "libnvidia-extra-580": "580.105.08-0ubuntu1", + "libnvidia-decode-580": "580.105.08-0ubuntu1", + "libnvidia-encode-580": "580.105.08-0ubuntu1", + "xserver-xorg-video-nvidia-580": "580.105.08-0ubuntu1", + "libnvidia-fbc1-580": "580.105.08-0ubuntu1", + "nvidia-driver-580-open": "580.105.08-0ubuntu1" }, "versions-wave2": { "cuda-toolkit-13": "13.0.2-1", @@ -32,7 +32,7 @@ "datacenter-gpu-manager-4-multinode-cuda13": "1:4.4.1-1", "libcap2-bin": "1:2.66-5ubuntu2.2", "k8s-device-plugin": "0.17.3-ubuntu24.04u5", - "nvidia-imex": "580.95.05-1", + "nvidia-imex": "580.105.08-1", "librdmacm-dev": "2507mlnx58-1.2507097.0214", "libibverbs-dev": "2507mlnx58-1.2507097.0214", "libibverbs1": "2507mlnx58-1.2507097.0214", @@ -41,6 +41,6 @@ }, "doca-custom-repo": "https://linux.mellanox.com/public/repo/doca/3.1.0-091513/ubuntu24.04/arm64-sbsa/", "kernel-versions": { - "linux-azure-nvidia": "6.14.0-1007.7" + "linux-azure-nvidia": "6.14.0-1009.9" } } diff --git a/vhdbuilder/packer/pre-install-dependencies.sh b/vhdbuilder/packer/pre-install-dependencies.sh index c6ec20492f9..9fadd1c457c 100644 --- a/vhdbuilder/packer/pre-install-dependencies.sh +++ 
b/vhdbuilder/packer/pre-install-dependencies.sh @@ -200,8 +200,15 @@ if [[ ${UBUNTU_RELEASE//./} -ge 2204 && "${ENABLE_FIPS,,}" != "true" ]]; then if [[ "${CPU_ARCH}" == "arm64" && "${UBUNTU_RELEASE}" = "24.04" ]]; then # This is the ubuntu 2404arm64gen2containerd image or the 2404arm64gb200 image # Uncomment if we have trouble finding the kernel package. - # sudo add-apt-repository ppa:canonical-kernel-team/ppa - sudo apt update + # The Ubuntu PPA has early access to new kernels, such as the one in the GB300 CRD. + if grep -q "GB200" <<< "$FEATURE_FLAGS"; then + add-apt-repository ppa:canonical-kernel-team/ppa + BOM_PATH="gb200-mai-bom.json" + if [ -n "$(jq -r '.["kernel-versions"] | keys[]' $BOM_PATH)" ]; then + NVIDIA_KERNEL_PACKAGE=$(jq -r '.["kernel-versions"] | to_entries[] | "\(.key)=\(.value)"' $BOM_PATH) + fi + fi + apt-get update if apt-cache show "${NVIDIA_KERNEL_PACKAGE}" &> /dev/null; then echo "ARM64 image. Installing NVIDIA kernel and its packages alongside LTS kernel" wait_for_apt_locks @@ -211,6 +218,7 @@ if [[ ${UBUNTU_RELEASE//./} -ge 2204 && "${ENABLE_FIPS,,}" != "true" ]]; then else echo "ARM64 image. NVIDIA kernel not available, skipping installation." 
fi + add-apt-repository --remove ppa:canonical-kernel-team/ppa fi wait_for_apt_locks if grep -q "cvm" <<< "$FEATURE_FLAGS"; then From 321c28e331492f2bf6a6bf276cc518770e0b5982 Mon Sep 17 00:00:00 2001 From: Alex Benn Date: Wed, 28 Jan 2026 15:23:06 -0500 Subject: [PATCH 64/87] fix: copy over image builder files and configuration from main arm64 file, vhd-image-builder-arm64-gen2.json --- .../packer/vhd-image-builder-arm64-gb200.json | 44 ++++++++++++++++--- 1 file changed, 37 insertions(+), 7 deletions(-) diff --git a/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json b/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json index 2910e64ff73..1594b04d9b2 100644 --- a/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json +++ b/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json @@ -69,9 +69,7 @@ "gallery_name": "{{user `sig_gallery_name`}}", "image_name": "{{user `sig_image_name`}}", "image_version": "{{user `captured_sig_version`}}", - "replication_regions": [ - "{{user `location`}}" - ] + "replication_regions": ["{{user `location`}}"] }, "user_assigned_managed_identities": "{{user `msi_resource_strings`}}" } @@ -100,6 +98,11 @@ "source": "parts/linux/cloud-init/artifacts/aks-node-controller.service", "destination": "/home/packer/aks-node-controller.service" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-node-controller-wrapper.sh", + "destination": "/home/packer/aks-node-controller-wrapper.sh" + }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/cloud-init-status-check.sh", @@ -325,6 +328,11 @@ "source": "vhdbuilder/packer/install-dependencies.sh", "destination": "/home/packer/install-dependencies.sh" }, + { + "type": "file", + "source": "vhdbuilder/packer/generate-disk-usage.sh", + "destination": "/home/packer/generate-disk-usage.sh" + }, { "type": "file", "source": "vhdbuilder/packer/post-install-dependencies.sh", @@ -415,6 +423,16 @@ "source": "parts/linux/cloud-init/artifacts/profile-d-cis.sh", "destination": 
"/home/packer/profile-d-cis.sh" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/profile-d-path.sh", + "destination": "/home/packer/profile-d-path.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/disk_queue.sh", + "destination": "/home/packer/disk_queue.sh" + }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/disk_queue.service", @@ -697,9 +715,17 @@ }, { "type": "shell", - "inline": [ - "sudo rm /var/log/bcc_installation.log" - ] + "inline": ["sudo rm /var/log/bcc_installation.log"] + }, + { + "type": "shell", + "inline": ["sudo /bin/bash /home/packer/generate-disk-usage.sh"] + }, + { + "type": "file", + "direction": "download", + "source": "/opt/azure/disk-usage.txt", + "destination": "disk-usage.txt" }, { "type": "shell", @@ -764,5 +790,9 @@ "sudo /usr/sbin/waagent -force -deprovision+user && export HISTSIZE=0 && sync || exit 125" ] } - ] + ], + "error-cleanup-provisioner": { + "type": "shell", + "inline": ["sudo /bin/bash /home/packer/generate-disk-usage.sh"] + } } From 4679e08bc4c6df2aedf1ff155b2597aa631a439a Mon Sep 17 00:00:00 2001 From: Alex Benn <62816975+abenn135@users.noreply.github.com> Date: Wed, 4 Feb 2026 10:07:48 -0500 Subject: [PATCH 65/87] Downgrade GBx00 kernel to 6.14.0-1003. 
(#7769) --- vhdbuilder/packer/gb200-mai-bom.json | 2 +- vhdbuilder/packer/pre-install-dependencies.sh | 26 +++++++++++++++++-- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/vhdbuilder/packer/gb200-mai-bom.json b/vhdbuilder/packer/gb200-mai-bom.json index 9ec176f4799..621bc8e9fba 100644 --- a/vhdbuilder/packer/gb200-mai-bom.json +++ b/vhdbuilder/packer/gb200-mai-bom.json @@ -41,6 +41,6 @@ }, "doca-custom-repo": "https://linux.mellanox.com/public/repo/doca/3.1.0-091513/ubuntu24.04/arm64-sbsa/", "kernel-versions": { - "linux-azure-nvidia": "6.14.0-1009.9" + "linux-azure-nvidia": "6.14.0-1003.3" } } diff --git a/vhdbuilder/packer/pre-install-dependencies.sh b/vhdbuilder/packer/pre-install-dependencies.sh index 9fadd1c457c..6df13bdbbf1 100644 --- a/vhdbuilder/packer/pre-install-dependencies.sh +++ b/vhdbuilder/packer/pre-install-dependencies.sh @@ -209,14 +209,36 @@ if [[ ${UBUNTU_RELEASE//./} -ge 2204 && "${ENABLE_FIPS,,}" != "true" ]]; then fi fi apt-get update - if apt-cache show "${NVIDIA_KERNEL_PACKAGE}" &> /dev/null; then + if apt-get install -s "${NVIDIA_KERNEL_PACKAGE}" &> /dev/null; then echo "ARM64 image. Installing NVIDIA kernel and its packages alongside LTS kernel" wait_for_apt_locks sudo apt install --no-install-recommends -y "${NVIDIA_KERNEL_PACKAGE}" echo "after installation:" dpkg -l | grep "linux-.*-azure-nvidia" || true else - echo "ARM64 image. NVIDIA kernel not available, skipping installation." + echo "ARM64 image. 
NVIDIA kernel not available from repo, fetching and installing dpkgs by hand" + curl -fsSL https://ports.ubuntu.com/pool/main/l/linux-azure-nvidia-6.14/linux-modules-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb > /tmp/linux-modules-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb + curl -fsSL https://ports.ubuntu.com/pool/main/l/linux-azure-nvidia-6.14/linux-azure-nvidia-6.14-cloud-tools-6.14.0-1003_6.14.0-1003.3_arm64.deb > /tmp/linux-azure-nvidia-6.14-cloud-tools-6.14.0-1003_6.14.0-1003.3_arm64.deb + curl -fsSL https://ports.ubuntu.com/pool/main/l/linux-azure-nvidia-6.14/linux-azure-nvidia-6.14-cloud-tools-common_6.14.0-1003.3_all.deb > /tmp/linux-azure-nvidia-6.14-cloud-tools-common_6.14.0-1003.3_all.deb + curl -fsSL https://ports.ubuntu.com/pool/main/l/linux-azure-nvidia-6.14/linux-azure-nvidia-6.14-headers-6.14.0-1003_6.14.0-1003.3_all.deb > /tmp/linux-azure-nvidia-6.14-headers-6.14.0-1003_6.14.0-1003.3_all.deb + curl -fsSL https://ports.ubuntu.com/pool/main/l/linux-azure-nvidia-6.14/linux-azure-nvidia-6.14-tools-6.14.0-1003_6.14.0-1003.3_arm64.deb > /tmp/linux-azure-nvidia-6.14-tools-6.14.0-1003_6.14.0-1003.3_arm64.deb + curl -fsSL https://ports.ubuntu.com/pool/main/l/linux-azure-nvidia-6.14/linux-cloud-tools-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb > /tmp/linux-cloud-tools-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb + curl -fsSL https://ports.ubuntu.com/pool/main/l/linux-azure-nvidia-6.14/linux-headers-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb > /tmp/linux-headers-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb + curl -fsSL https://ports.ubuntu.com/pool/main/l/linux-azure-nvidia-6.14/linux-tools-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb > /tmp/linux-tools-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb + + curl -fsSL https://ports.ubuntu.com/pool/main/l/linux-azure-nvidia-6.14/linux-image-unsigned-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb > /tmp/linux-image-unsigned-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb + + 
dpkg -i /tmp/linux-modules-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb + dpkg -i /tmp/linux-azure-nvidia-6.14-cloud-tools-6.14.0-1003_6.14.0-1003.3_arm64.deb + dpkg -i /tmp/linux-azure-nvidia-6.14-cloud-tools-common_6.14.0-1003.3_all.deb + dpkg -i /tmp/linux-azure-nvidia-6.14-headers-6.14.0-1003_6.14.0-1003.3_all.deb + dpkg -i /tmp/linux-azure-nvidia-6.14-tools-6.14.0-1003_6.14.0-1003.3_arm64.deb + dpkg -i /tmp/linux-cloud-tools-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb + dpkg -i /tmp/linux-headers-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb + dpkg -i /tmp/linux-tools-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb + dpkg -i /tmp/linux-image-unsigned-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb + + rm /tmp/*.deb fi add-apt-repository --remove ppa:canonical-kernel-team/ppa fi From 5322115ad22a4dd9b2d58dbb6ac1280d28df544c Mon Sep 17 00:00:00 2001 From: Alex Benn <62816975+abenn135@users.noreply.github.com> Date: Tue, 10 Feb 2026 16:35:55 -0500 Subject: [PATCH 66/87] feat: disable NPD for MAI VHD. (#7843) --- vhdbuilder/packer/install-dependencies.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 764d0e2c8bd..6ef68c5d110 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -779,6 +779,9 @@ EOF systemctl enable nvidia-dcgm-exporter systemctl enable nvidia-device-plugin systemctl enable openibd + + # One additional request from MAI: Disable the AKS node problem detector. When this file is present, the Azure AKS VM Extension assumes the NPD has been installed on the VHD and skips installing it at provision time. 
+ touch /etc/node-problem-detector.d/skip_vhd_npd fi fi From 047b0e63d8785d2791d7920f9d1c665057976270 Mon Sep 17 00:00:00 2001 From: Alex Benn <62816975+abenn135@users.noreply.github.com> Date: Wed, 11 Feb 2026 12:51:26 -0500 Subject: [PATCH 67/87] fix: make node-problem-detector.d directory before trying to touch the status file skip_vhd_npd in it (#7849) Tested working. No NPD conditions advertised on running node. --- vhdbuilder/packer/install-dependencies.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 6ef68c5d110..4a559f3becc 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -781,6 +781,7 @@ EOF systemctl enable openibd # One additional request from MAI: Disable the AKS node problem detector. When this file is present, the Azure AKS VM Extension assumes the NPD has been installed on the VHD and skips installing it at provision time. + mkdir -p /etc/node-problem-detector.d/ touch /etc/node-problem-detector.d/skip_vhd_npd fi fi From 80e86ff3eb47c1e308f5511add2a7f10e5c61494 Mon Sep 17 00:00:00 2001 From: Alex Benn <62816975+abenn135@users.noreply.github.com> Date: Wed, 18 Mar 2026 14:34:48 -0400 Subject: [PATCH 68/87] Bond local NVMe drives together via software RAID. Run kubelet root there. 
(#7929) Co-authored-by: Keith Pimm Co-authored-by: chmill-zz <17792380+chmill-zz@users.noreply.github.com> --- .../ubuntu/format-mount-kubelet.conf | 3 ++ .../ubuntu/format-mount-nvme-root.service | 13 +++++++ .../ubuntu/format-mount-nvme-root.sh | 35 +++++++++++++++++++ vhdbuilder/packer/packer_source.sh | 12 +++++++ .../packer/vhd-image-builder-arm64-gb200.json | 15 ++++++++ 5 files changed, 78 insertions(+) create mode 100644 parts/linux/cloud-init/artifacts/ubuntu/format-mount-kubelet.conf create mode 100644 parts/linux/cloud-init/artifacts/ubuntu/format-mount-nvme-root.service create mode 100644 parts/linux/cloud-init/artifacts/ubuntu/format-mount-nvme-root.sh diff --git a/parts/linux/cloud-init/artifacts/ubuntu/format-mount-kubelet.conf b/parts/linux/cloud-init/artifacts/ubuntu/format-mount-kubelet.conf new file mode 100644 index 00000000000..5f7333e3a7b --- /dev/null +++ b/parts/linux/cloud-init/artifacts/ubuntu/format-mount-kubelet.conf @@ -0,0 +1,3 @@ +[Unit] +Requires=format-mount-nvme-root.service +After=format-mount-nvme-root.service diff --git a/parts/linux/cloud-init/artifacts/ubuntu/format-mount-nvme-root.service b/parts/linux/cloud-init/artifacts/ubuntu/format-mount-nvme-root.service new file mode 100644 index 00000000000..3748421f1cd --- /dev/null +++ b/parts/linux/cloud-init/artifacts/ubuntu/format-mount-nvme-root.service @@ -0,0 +1,13 @@ +[Unit] +Description=Format NVMe local disk and mount Kubelet there +Requires=mnt.mount +After=mnt.mount + +[Service] +Restart=on-failure +RemainAfterExit=yes +Type=oneshot +ExecStart=/bin/bash /opt/azure/containers/format-mount-nvme-root.sh + +[Install] +WantedBy=multi-user.target diff --git a/parts/linux/cloud-init/artifacts/ubuntu/format-mount-nvme-root.sh b/parts/linux/cloud-init/artifacts/ubuntu/format-mount-nvme-root.sh new file mode 100644 index 00000000000..885e62ae90f --- /dev/null +++ b/parts/linux/cloud-init/artifacts/ubuntu/format-mount-nvme-root.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash +set -o 
errexit +set -o nounset +set -o pipefail +set -x + +# Bind mount kubelet to local NVMe storage specifically on startup. +MOUNT_POINT="/mnt/aks" + + +KUBELET_MOUNT_POINT="${MOUNT_POINT}/kubelet" +KUBELET_DIR="/var/lib/kubelet" + +mkdir -p "${MOUNT_POINT}" + +SENTINEL_FILE="/opt/azure/containers/bind-sentinel" +if [ ! -e "${SENTINEL_FILE}" ]; then + # Bond (via software RAID) and format the NVMe disks if that's not already done. + if [ -e /dev/disk/azure/local/by-index/1 ] && [ ! -e /dev/md0 ]; then + mdadm --create --verbose /dev/md0 --level=0 --raid-devices=4 /dev/disk/azure/local/by-index/1 /dev/disk/azure/local/by-index/2 /dev/disk/azure/local/by-index/3 /dev/disk/azure/local/by-index/4 + mkfs.ext4 -F /dev/md0 + fi + mount /dev/md0 "${MOUNT_POINT}" + mv "${KUBELET_DIR}" "${KUBELET_MOUNT_POINT}" + touch "${SENTINEL_FILE}" +else + # On subsequent boots, the disk should already be partitioned and formatted, so just mount it. + mount /dev/md0 "${MOUNT_POINT}" +fi + +# on every boot, bind mount the kubelet directory back to the expected +# location before kubelet itself may start. 
+mkdir -p "${KUBELET_DIR}" +mount --bind "${KUBELET_MOUNT_POINT}" "${KUBELET_DIR}" +chmod a+w "${KUBELET_DIR}" diff --git a/vhdbuilder/packer/packer_source.sh b/vhdbuilder/packer/packer_source.sh index 4b0af8e5679..9cdbbfc30c7 100644 --- a/vhdbuilder/packer/packer_source.sh +++ b/vhdbuilder/packer/packer_source.sh @@ -282,6 +282,18 @@ copyPackerFiles() { CLOUD_INIT_STATUS_CHECK_DEST=/opt/azure/containers/cloud-init-status-check.sh cpAndMode $CLOUD_INIT_STATUS_CHECK_SRC $CLOUD_INIT_STATUS_CHECK_DEST 0744 + if grep -q "GB200" <<< "$FEATURE_FLAGS"; then + FMT_SH_SRC=/home/packer/format-mount-nvme-root.sh + FMT_SH_DEST=/opt/azure/containers/format-mount-nvme-root.sh + cpAndMode $FMT_SH_SRC $FMT_SH_DEST 0544 + FMT_SVC_SRC=/home/packer/format-mount-nvme-root.service + FMT_SVC_DEST=/etc/systemd/system/format-mount-nvme-root.service + cpAndMode $FMT_SVC_SRC $FMT_SVC_DEST 600 + FMT_SVC_SRC=/home/packer/format-mount-kubelet.conf + FMT_SVC_DEST=/etc/systemd/system/kubelet.service.d/11-fmtmount.conf + cpAndMode $FMT_SVC_SRC $FMT_SVC_DEST 600 + fi + NOTICE_SRC=/home/packer/NOTICE.txt NOTICE_DEST=/NOTICE.txt diff --git a/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json b/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json index 1594b04d9b2..59d3e576c15 100644 --- a/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json +++ b/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json @@ -108,6 +108,21 @@ "source": "parts/linux/cloud-init/artifacts/cloud-init-status-check.sh", "destination": "/home/packer/cloud-init-status-check.sh" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ubuntu/format-mount-nvme-root.sh", + "destination": "/home/packer/format-mount-nvme-root.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ubuntu/format-mount-nvme-root.service", + "destination": "/home/packer/format-mount-nvme-root.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ubuntu/format-mount-kubelet.conf", + 
"destination": "/home/packer/format-mount-kubelet.conf" + }, { "type": "file", "source": "vhdbuilder/packer/prefetch.sh", From 27e79dc98be3449610104a5be3e417e8ce258865 Mon Sep 17 00:00:00 2001 From: Keith <153014933+keith-ms@users.noreply.github.com> Date: Fri, 20 Mar 2026 12:34:15 -0500 Subject: [PATCH 69/87] Reassemble /dev/md0 correctly (#8130) --- .../artifacts/ubuntu/format-mount-nvme-root.sh | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/parts/linux/cloud-init/artifacts/ubuntu/format-mount-nvme-root.sh b/parts/linux/cloud-init/artifacts/ubuntu/format-mount-nvme-root.sh index 885e62ae90f..9d336206b4c 100644 --- a/parts/linux/cloud-init/artifacts/ubuntu/format-mount-nvme-root.sh +++ b/parts/linux/cloud-init/artifacts/ubuntu/format-mount-nvme-root.sh @@ -19,12 +19,19 @@ if [ ! -e "${SENTINEL_FILE}" ]; then if [ -e /dev/disk/azure/local/by-index/1 ] && [ ! -e /dev/md0 ]; then mdadm --create --verbose /dev/md0 --level=0 --raid-devices=4 /dev/disk/azure/local/by-index/1 /dev/disk/azure/local/by-index/2 /dev/disk/azure/local/by-index/3 /dev/disk/azure/local/by-index/4 mkfs.ext4 -F /dev/md0 + # Save the RAID config so mdadm --assemble --scan works on subsequent boots. + mdadm --detail --scan >> /etc/mdadm/mdadm.conf fi mount /dev/md0 "${MOUNT_POINT}" mv "${KUBELET_DIR}" "${KUBELET_MOUNT_POINT}" touch "${SENTINEL_FILE}" else - # On subsequent boots, the disk should already be partitioned and formatted, so just mount it. + # On subsequent boots, reassemble the RAID array from superblocks. + # Cannot use /dev/disk/azure/local/by-index/ paths here as the waagent + # udev rules that create those symlinks may not have run yet. + if [ ! 
-e /dev/md0 ]; then + mdadm --assemble --scan + fi mount /dev/md0 "${MOUNT_POINT}" fi From 9b1256031b1c0a05f5d397e5aeddc24e5a296ebb Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Thu, 14 May 2026 10:54:43 -0500 Subject: [PATCH 70/87] Restore incorrectly deleted file during cherry-pick --- .github/copilot-instructions.md | 298 ++++++++++++++++++++++++++++++++ 1 file changed, 298 insertions(+) create mode 100644 .github/copilot-instructions.md diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 00000000000..f3b28183f3c --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1,298 @@ +# Overview + +AgentBaker repo has 2 main services discussed below: + +- VHD Builder +- AgentBaker Service + +## VHD Builder + +It builds VHDs using Packer for base OS: Windows, Azure Linux/Mariner and Ubuntu. For each OS there are multiple supported versions (windows 2019, 2022, ubuntu 2004, 2204 etc). The VHDs are base images for a node in an aks cluster. + +VHDs are built using [Packer](https://developer.hashicorp.com/packer/docs) in [vhdbuilder](./vhdbuilder/). + +Windows VHD is configured through [VHD](./vhdbuilder/packer/windows/windows-vhd-configuration.ps1) + +## AgentBaker Service + +[apiserver](./apiserver/) is `go` based webserver. It receives request from external client and generates CSE and CustomData to be used on the VHD when a new node is created / provisioned. + +windows generates its CSE package using [script](./parts/windows/kuberneteswindowssetup.ps1). + +The webserver is also used to determine the latest version of Linux VHDs available for provisioning within AKS clusters. + +## Code Structure + +[parts](./parts/) serves both AgentBaker Service and VHD build. AgentBaker service and VHDs are coupled because of this shared component. When building VHD, packer maps and renames scripts from [parts](./parts/) depending on the OS / versions. The mappings can be found at [packer](./vhdbuilder/packer/). 
+
+Windows uses a different folder [cse](./staging/cse/windows/) for almost the same purpose. There are subtle differences as windows CSEs can be downloaded as a zip file during provisioning time due to restrictions on the file size on Windows systems, while for linux based systems the cse/custom data are dropped in during provisioning time.
+
+## Deployment and Release
+
+The VHD build is triggered by Azure Devops [pipelines](.pipelines/). For release, the pipelines follow the same templates for different OS versions:
+
+- [linux/ubuntu](./.pipelines/templates/.builder-release-template.yaml)
+- [windows](./.pipelines/templates/.builder-release-template-windows.yaml)
+
+You can reason about the steps by following the steps defined in the pipeline.
+
+Tags of AgentBaker and corresponding Linux VHDs are released every week. Linux VHDs are built with a particular image version in the YYYYMM.DD.PATCH format. All Linux VHD versions correspond to a particular tag of the AgentBaker go module. AgentBaker go module tags follow the format v0.YYYYMMDD.PATCH. The mapping between AgentBaker tag and Linux VHD version is defined within [linux_sig_version.json](./pkg/agent/datamodel/linux_sig_version.json).
+
+Windows VHDs are released separately, following the Windows Patch Tuesday schedule.
+
+## Guidelines
+
+### SRE Guidelines
+
+The operational goals of this project are:
+
+- achieve consistency across different OS as much as possible
+- avoid functional regression when introducing new features (component updates, new drivers, new binaries), ensure that all supported OS / versions are tested
+- avoid VHD build performance regressions when making any changes
+- avoid node provisioning performance regression when making any changes
+
+When making changes, reason whether the file is used in VHD building stage, or provision stage, or both. Make sure the changes are valid in its life stage.
as an example, [windows-vhd-configuration.ps1](./vhdbuilder/packer/windows/windows-vhd-configuration.ps1) defines container images to be cached in VHD, while [configure-windows-vhd.ps1](./vhdbuilder/packer/windows/configure-windows-vhd.ps1) executes commands at provision time. + +One way to debug / explore / just for fun is to run [e2e](./e2e/) tests. To run locally, follow the readme file under that folder. + +The SRE guidelines ground other coding guidelines and practices. + +### Golang Guidelines + +- Follow Go best practice +- Use vanilla go test framework + +### PowerShell Guidelines + +- follow PowerShell best practices + +### ShellScripts Guidelines + +- use shellcheck for sanity checking +- use ShellSpec for testing +- the shell scripts are used on both azure linux/mariner and ubuntu and cross platform portability is critical. +- when using functions defined in other files, ensure it is sourced properly. +- use local variables rather than constants when their scoping allows for it. +- avoid using variables declared inside another function, even they are visible. It is hard to reason and might introduce subtle bugs. + +## Pull Request Review Guidelines + +When reviewing pull requests, perform breaking change analysis to prevent regressions. VHDs remain in production for 6 months, so backward compatibility is critical. + +**Review Approach**: Focus on high-level architecture, security vulnerabilities, and logic bugs. Apply deep reasoning similar to advanced models (e.g., Claude Opus) - don't just pattern match, but truly understand the code's intent, dependencies, and potential failure modes. + +### Breaking Change Detection + +Analyze PRs for these compatibility scenarios: + +**1. 
Linux Provisioning Script Changes** +- **Context**: Scripts in `parts/linux/cloud-init/artifacts/` run during critical VM bootstrap and are used in both: + - VHD build (uploaded via packer configs in `vhdbuilder/packer/*.json`) + - VM provisioning (CSE - embedded in Go service via `pkg/agent/const.go`) + - Versions synchronized via `pkg/agent/datamodel/linux_sig_version.json` +- **What to check**: Changes that could break VM provisioning in production +- **Breaking signals**: + - **Script logic errors**: Syntax errors, wrong commands, incorrect flags, broken pipes + - **Dependency issues**: + - Calling functions before they're sourced + - Using variables declared in other functions + - Removing `source` statements that break dependency chains + - **Cross-distro compatibility**: + - Commands that don't work on both Ubuntu and Azure Linux/Mariner (check distro-specific variants: `ubuntu/`, `mariner/`) + - Package manager assumptions (apt vs dnf/tdnf) + - Missing OS-specific conditional logic + - **External dependency violations**: + - NEW: Downloading from internet URLs not in `parts/common/components.json` or allowed sources (packages.aks.azure.com) + - All external dependencies MUST be referenced in `parts/common/components.json` for Renovate updates + - Only allowed runtime downloads: packages.aks.azure.com or other explicitly allowed sources in CSE + - **Function signature changes**: Parameters, return values, exit codes that break callers + - **Missing test coverage**: Changes to provisioning logic without corresponding e2e tests + - **Forward and backward compatibility**: Keep compatibility across the 6-month VHD support window in both directions. + - **Backward**: Newer VHDs must still work with older CSE scripts delivered via CRP custom data. + - Example: PR #7866 restored `cni-plugins` dependency + install logic after a removal caused provisioning failures (exit 206) when old scripts ran on newer VHDs. 
+ - **Forward**: Newer CSE script changes must not require components/features that exist only on newer VHDs, unless the logic to detect and handle missing features is implemented. + + +**2. Windows Bidirectional Compatibility** +- **Context**: Windows VHD and CSE scripts release on different cadences with no guaranteed order +- **What to check**: Changes to `staging/cse/windows/` (CSE scripts) or `vhdbuilder/packer/windows/` (VHD scripts). +- **Breaking signals**: + - New CSE scripts using features in the VHD Scripts that aren't present before this PR. + - New VHD scripts expecting features in the CSE scripts that aren't present before this PR. + - Changes to shared state (registry keys, files, environment variables) that break coordination + - Removing PowerShell functions or cmdlets that the other component might call + - Incompatibilities between newer versions of the CSE scripts and older versions of the VHD scripts are critical to detect as they can cause production outages. + +**3. aks-node-controller Migration (Dual-Mode Support)** +- **Context**: Transitioning from uploading scripts during both VHD build and CSE to only uploading aks-node-controller during VHD build +- **What to check**: Any changes must work in BOTH deployment modes +- **Breaking signals**: + - Assumptions that scripts are always uploaded during CSE (new mode won't do this) + - Assumptions that aks-node-controller is always present (old VHDs won't have it) + - Missing feature detection to determine which mode is running + - Hardcoded paths that differ between deployment modes + +**4. Cross-OS Compatibility** +- **What to check**: Changes work on Ubuntu, Azure Linux/Mariner, and Windows +- **Breaking signals**: + - Linux commands that don't work on both Ubuntu and Azure Linux/Mariner + - Missing conditional logic for OS-specific behaviors + - Package manager assumptions (apt vs dnf/tdnf) + - Systemd differences between distributions + +**5. 
Package/Dependency Update PRs (Renovate)** +- **Context**: Renovate bot automatically creates PRs to update component versions in `parts/common/components.json`. These components are cached on VHDs during build and directly affect node stability, GPU workloads, networking, and security. Updated packages are downloaded from `packages.aks.azure.com` or upstream registries during VHD build. +- **What to check**: Every version bump—even patch versions—can introduce regressions that affect production nodes. +- **`renovate.json` syntax guardrails**: + 1. Keep the file valid JSON (double quotes only, no comments, no trailing commas). + 2. When editing arrays like `assignees`, `reviewers`, `matchPackageNames`, and `matchUpdateTypes`, preserve comma placement and avoid duplicate entries. + 3. Keep schema-compatible key casing and value types (for example `enabled` as boolean, `prHourlyLimit` as number, `labels` as string array). + 4. In `packageRules`, preserve specific-to-generic ordering so narrow matchers are not shadowed by broader rules. + 5. For regex fields (`versioning`, `extractVersion`, `matchCurrentVersion`, custom manager `matchStrings`), escape backslashes correctly for JSON strings. + 6. For custom manager templates, keep Renovate template tokens intact (`{{{newValue}}}`, `{{#if ...}}`, `{{/if}}`) and avoid converting them to normal JSON interpolation. + 7. If modifying identity lists in `assignees` or `reviewers`, update all related grouped rules consistently to avoid ownership drift. + 8. On GitHub, both `assignees` and `reviewers` may include team handles using the `team:` format; if used, ensure the team exists in the AKS org and has at least read permission to the AgentBaker repo. + 9. Keep `minor` updates disabled by default; only allow minor updates through explicit, narrow `packageRules` (avoid broad datasource/wildcard exceptions). 
Example context: https://github.com/Azure/AgentBaker/pull/7898 (broad matching led to unintended cross-stream minor jumps). + 10. Never combine `matchUpdateTypes` and `allowedVersions` in the same `packageRules` entry — Renovate rejects this. Put `allowedVersions` in a separate rule; Renovate merges all matching rules. Context: #8420. +- **Analysis steps for every package update PR**: + 1. **Identify the component and version change**: Parse the diff in `parts/common/components.json` to extract exact old → new versions for each OS/release entry. + 2. **Determine the update type**: Classify as major, minor, or patch using semver. Major and minor updates carry higher risk than patch updates. + 3. **Research upstream changelog**: Look up the project's release notes, changelog, or GitHub releases to understand what changed between the old and new versions. Summarize: + - New features introduced + - Bug fixes included + - Breaking changes or deprecations + - Security fixes (CVEs patched) + 4. **Assess OS coverage**: Check if the update covers all OS variants where the component is used (Ubuntu 22.04, 24.04, Azure Linux 3.0, etc.). Flag if some OS entries are updated but others are not — partial updates can cause inconsistency across node pools. + 5. **Evaluate VHD size impact**: For components downloaded as binaries or packages, consider whether the new version significantly increases VHD size. Large size increases can affect VHD build time and storage costs. + 6. **Check for configuration or API changes**: If the component exposes configuration files, CLI flags, systemd units, or APIs consumed by CSE scripts, verify that the update doesn't change defaults or remove options that provisioning scripts depend on. + 7. **Verify download URL validity**: Confirm that the `downloadLocation` and `downloadURIs` structure in components.json remains valid for the new version. New versions sometimes change the artifact naming convention or repository layout. 
+ +- **Risk assessment for package updates**: + - 🔴 **High Risk**: Major version bumps, components critical to node boot (kubelet, containerd, runc), GPU drivers (nvidia-driver, dcgm-exporter), or networking (azure-cni, cilium). Also high risk if upstream changelog mentions breaking changes or behavioral changes. + - 🟡 **Medium Risk**: Minor version bumps of non-critical components, updates that only affect specific OS variants, or updates where upstream changelog shows feature additions that could subtly change behavior. + - 🟢 **Low Risk**: Patch version bumps with only bug fixes or security patches, no breaking changes in upstream changelog, and full OS coverage. + +- **Review output for package update PRs must include a detailed version diff analysis**: + + **Header:** + ``` + ## Package Update Analysis: + **Version change**: X.Y.Z → A.B.C ( update) + **OS variants affected**: Ubuntu 22.04, Ubuntu 24.04, Azure Linux 3.0 (list all) + **OS variants NOT updated**: + ``` + + **Detailed changelog between versions:** + Use web search, GitHub releases, or upstream project documentation to find the exact differences between the old and new version. Present each change as a line item with its own risk tag: + + ``` + ### Changes between X.Y.Z and A.B.C + + | Change | Description | Risk | + |--------|-------------|------| + | Feature | | 🟢 Low / 🟡 Medium / 🔴 High | + | Bug fix | | 🟢 Low / 🟡 Medium / 🔴 High | + | Breaking | | 🔴 High | + | Security | CVE-YYYY-XXXXX: | 🟢 Low / 🟡 Medium / 🔴 High | + | Deprecation | | 🟡 Medium / 🔴 High | + | Config change | | 🟡 Medium / 🔴 High | + | Performance | | 🟢 Low / 🟡 Medium | + ``` + + For each individual change, assess risk by considering: + - Does it alter runtime behavior on AKS nodes? + - Does it change CLI flags, config file formats, or systemd unit behavior that CSE scripts depend on? + - Does it affect GPU workloads, networking, container runtime, or kubelet interaction? 
+ - Could it increase binary size significantly (VHD bloat)? + - Does it introduce new system dependencies or kernel requirements? + + **If upstream changelog is unavailable**, explicitly state: _"Upstream changelog not found for this version range. Manual testing recommended before merge."_ + + **Overall risk assessment:** + ``` + ### Overall Risk: 🟢 Low / 🟡 Medium / 🔴 High + **Justification**: <1-2 sentence summary of why this risk level was chosen> + **Recommendation**: Approve / Request more info / Flag for manual testing + ``` + + **Example** (for a PR like dcgm-exporter 4.7.1 → 4.8.0): + ``` + ## Package Update Analysis: dcgm-exporter + **Version change**: 4.7.1 → 4.8.0 (minor update) + **OS variants affected**: Ubuntu 22.04, Ubuntu 24.04 + **OS variants NOT updated**: Azure Linux 3.0 (still on 4.7.1-1.azl3) — flag for follow-up + + ### Changes between 4.7.1 and 4.8.0 + | Change | Description | Risk | + |--------|-------------|------| + | Feature | Added support for new DCGM field IDs for Blackwell GPUs | 🟢 Low | + | Feature | New metrics endpoint configuration options | 🟡 Medium | + | Bug fix | Fixed memory leak in long-running metric collection | 🟢 Low | + | Deprecation | Removed legacy CSV export format | 🟡 Medium | + + ### Overall Risk: 🟡 Medium + **Justification**: Minor version bump of GPU monitoring component. No breaking changes to core metrics pipeline, but Azure Linux 3.0 is not updated which creates version skew across OS variants. + **Recommendation**: Approve, but file follow-up issue for Azure Linux 3.0 alignment. + ``` + +### Analysis Approach + +**Dynamic Dependency Tracing**: +1. For each changed file, identify what depends on it +2. Follow `source` statements in bash scripts to trace dependency chains +3. Check for function calls, variable references across files +4. Look for hardcoded paths in VHD build scripts (`vhdbuilder/packer/`) that reference changed files +5. Trace through as many levels as needed within the codebase +6. 
**Check external dependencies**: + - Search for new URLs being downloaded (curl, wget, etc.) + - Verify all external dependencies are in `parts/common/components.json` for Renovate updates + - Flag downloads from unauthorized sources (only packages.aks.azure.com and sources in components.json allowed) + +**Historical Context**: +- Look for related changes that previously caused issues +- Identify patterns of fragile areas that break frequently + +**Test Coverage Assessment**: +- Note if changed code has e2e test coverage +- Flag changes to untested areas as higher risk +- Mention if new behavior lacks corresponding test additions + +### Review Output Format + +Provide targeted inline comments on specific lines where you detect issues: + +**For each breaking change or risk:** +- Comment directly on the problematic line or code block +- Explain why this is risky (e.g., "This removes function X which may be called by VHDs built in the last 6 months") +- Suggest specific mitigations or alternatives +- Include actionable next steps (e.g., "Verify this function is not used by checking references in `vhdbuilder/packer/`") + +**Risk indicators to include:** + +- **Severity** (pick one): + - 🔴 **High Risk** — Could break production VM provisioning, cause node failures, or introduce security vulnerabilities + - 🟡 **Medium Risk** — Could cause issues in specific configurations, edge cases, or degrade performance + - 🟢 **Low Risk** — Unlikely to cause issues but worth noting for awareness + +- **Category** (pick one): + - 🔧 **Script Logic** — Syntax errors, incorrect commands, broken control flow, wrong exit codes + - 🖥️ **Cross-OS** — Incompatibility between Ubuntu, Azure Linux/Mariner, or Windows + - 🌐 **External Dependency** — Unauthorized downloads, missing components.json entries, broken URLs + - 🧪 **Test Coverage** — Missing or insufficient test coverage for changed behavior + - 📦 **Package Update** — Component version changes, upstream regressions, VHD size impact + - 🔄 
**Backward Compatibility** — Breaking changes affecting VHDs in production (6-month window) + - 🔒 **Security** — Credential exposure, privilege escalation, insecure defaults + - ⚡ **Performance** — VHD build time regression, node provisioning latency increase + - 🏗️ **Architecture** — Structural changes affecting multiple components or deployment modes + +**Only comment when you have substantive findings** - avoid noise on trivial or obviously safe changes. + +### Review Philosophy + +Think like an experienced reviewer who "eyeballs" PRs for subtle risks. Look beyond pattern matching: +- Understand the architecture and how components interact +- Consider timing of releases and deployment sequences +- Reason about implicit dependencies and assumptions +- Flag changes that "feel risky" even without obvious red flags +- Balance thoroughness with actionable feedback +- Focus on high-impact issues that could break production VM provisioning From 3936e6e729be493a8444d7348b17827c9c1fa156 Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Thu, 14 May 2026 11:07:06 -0500 Subject: [PATCH 71/87] Remove poorly merged stage and Windows configuration --- .pipelines/.vsts-vhd-builder-release.yaml | 2 -- .pipelines/templates/.builder-release-template-windows.yaml | 5 ----- 2 files changed, 7 deletions(-) diff --git a/.pipelines/.vsts-vhd-builder-release.yaml b/.pipelines/.vsts-vhd-builder-release.yaml index 549866ab71e..10970aeb967 100644 --- a/.pipelines/.vsts-vhd-builder-release.yaml +++ b/.pipelines/.vsts-vhd-builder-release.yaml @@ -966,8 +966,6 @@ stages: condition: eq('${{ parameters.build2404arm64gb200gen2containerd }}', true) dependsOn: [ ] timeoutInMinutes: 360 - - stage: build_vhd_2404_arm64_gb200 - condition: eq('${{ parameters.build2404arm64gb200 }}', true) - stage: build_vhd_2404_arm64_gb200_gen2_containerd condition: eq('${{ parameters.build2404arm64gb200gen2containerd }}', true) dependsOn: [] diff --git a/.pipelines/templates/.builder-release-template-windows.yaml 
b/.pipelines/templates/.builder-release-template-windows.yaml index 4b351dbddd6..5da7c54e949 100644 --- a/.pipelines/templates/.builder-release-template-windows.yaml +++ b/.pipelines/templates/.builder-release-template-windows.yaml @@ -65,11 +65,6 @@ steps: echo "##vso[task.setvariable variable=MODE]windowsVhdMode" displayName: Set DRY_RUN and MODE variables - - bash: | - echo "##vso[task.setvariable variable=DRY_RUN]${{ parameters.dryrun }}" - echo "##vso[task.setvariable variable=MODE]windowsVhdMode" - displayName: Set DRY_RUN and MODE variables - - template: ./.template-override-components-json.yaml parameters: overrideBranch: ${{ parameters.overrideBranch }} From 28cab478e2635a607fabaa9c3d95608e35bfe788 Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Thu, 14 May 2026 11:11:44 -0500 Subject: [PATCH 72/87] Remove malformed pipeline definition --- .pipelines/.vsts-vhd-builder-release.yaml | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/.pipelines/.vsts-vhd-builder-release.yaml b/.pipelines/.vsts-vhd-builder-release.yaml index 10970aeb967..0e49420696f 100644 --- a/.pipelines/.vsts-vhd-builder-release.yaml +++ b/.pipelines/.vsts-vhd-builder-release.yaml @@ -966,13 +966,6 @@ stages: condition: eq('${{ parameters.build2404arm64gb200gen2containerd }}', true) dependsOn: [ ] timeoutInMinutes: 360 - - stage: build_vhd_2404_arm64_gb200_gen2_containerd - condition: eq('${{ parameters.build2404arm64gb200gen2containerd }}', true) - dependsOn: [] - jobs: - - job: build2404arm64gb200gen2containerd - condition: eq('${{ parameters.build2404arm64gb200gen2containerd }}', true) - timeoutInMinutes: 180 steps: - bash: | echo '##vso[task.setvariable variable=OS_SKU]Ubuntu' @@ -982,7 +975,7 @@ stages: echo '##vso[task.setvariable variable=IMG_SKU]server-arm64' echo '##vso[task.setvariable variable=IMG_VERSION]latest' echo '##vso[task.setvariable variable=HYPERV_GENERATION]V2' - echo '##vso[task.setvariable variable=AZURE_VM_SIZE]Standard_D32pds_v6' + echo 
'##vso[task.setvariable variable=AZURE_VM_SIZE]Standard_D32pds_v5' echo '##vso[task.setvariable variable=FEATURE_FLAGS]GB200' echo '##vso[task.setvariable variable=ARCHITECTURE]ARM64' echo '##vso[task.setvariable variable=ENABLE_FIPS]False' From 7cdea46812f84c3b0c3cd650c504e9abd7f8fcf1 Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Thu, 14 May 2026 11:15:15 -0500 Subject: [PATCH 73/87] Fix wrongly merged e2e file --- e2e/scenario_win_test.go | 58 ---------------------------------------- e2e/vmss.go | 3 --- 2 files changed, 61 deletions(-) diff --git a/e2e/scenario_win_test.go b/e2e/scenario_win_test.go index 2646405ee0d..2ece49b1c7b 100644 --- a/e2e/scenario_win_test.go +++ b/e2e/scenario_win_test.go @@ -651,61 +651,3 @@ func Test_NetworkIsolatedCluster_Windows_OrasDownload(t *testing.T) { }, }) } - -func Test_Windows2022_McrChinaCloud_Windows(t *testing.T) { - RunScenario(t, &Scenario{ - Tags: Tags{ - MockAzureChinaCloud: true, - }, - Description: "Windows Server 2022 Azure Network Containerd - v1 to test Azure China Cloud MCR host", - Config: Config{ - Cluster: ClusterAzureNetwork, - VHD: config.VHDWindows2022Containerd, - VMConfigMutator: EmptyVMConfigMutator, - BootstrapConfigMutator: EmptyBootstrapConfigMutator, - Validator: func(ctx context.Context, s *Scenario) { - ValidateFileExists(ctx, s, `C:\ProgramData\containerd\certs.d\docker.io\hosts.toml`) - ValidateFileExists(ctx, s, `C:\ProgramData\containerd\certs.d\mcr.azk8s.cn\hosts.toml`) - ValidateFileHasContent(ctx, s, - `C:\ProgramData\containerd\certs.d\docker.io\hosts.toml`, - `https://docker.io`) - ValidateFileHasContent(ctx, s, - `C:\ProgramData\containerd\certs.d\mcr.azk8s.cn\hosts.toml`, - `https://mcr.azk8s.cn`) - }, - }, - }) -} - -func Test_Windows2025Gen2_McrChinaCloud_Windows(t *testing.T) { - RunScenario(t, &Scenario{ - Tags: Tags{ - MockAzureChinaCloud: true, - }, - Description: "Windows Server 2025 with Containerd - hyperv gen 2 to test Azure China Cloud MCR host", - Config: Config{ - 
Cluster: ClusterAzureNetwork, - VHD: config.VHDWindows2025Gen2, - VMConfigMutator: EmptyVMConfigMutator, - BootstrapConfigMutator: func(configuration *datamodel.NodeBootstrappingConfiguration) { - Windows2025BootstrapConfigMutator(t, configuration) - }, - Validator: func(ctx context.Context, s *Scenario) { - ValidateWindowsVersionFromWindowsSettings(ctx, s, "2025-gen2") - ValidateWindowsProductName(ctx, s, "Windows Server 2025 Datacenter") - ValidateWindowsDisplayVersion(ctx, s, "24H2") - ValidateFileHasContent(ctx, s, "/k/kubeletstart.ps1", "--container-runtime=remote") - ValidateWindowsProcessHasCliArguments(ctx, s, "kubelet.exe", []string{"--rotate-certificates=true", "--client-ca-file=c:\\k\\ca.crt"}) - ValidateCiliumIsNotRunningWindows(ctx, s) - ValidateFileExists(ctx, s, `C:\ProgramData\containerd\certs.d\docker.io\hosts.toml`) - ValidateFileExists(ctx, s, `C:\ProgramData\containerd\certs.d\mcr.azk8s.cn\hosts.toml`) - ValidateFileHasContent(ctx, s, - `C:\ProgramData\containerd\certs.d\docker.io\hosts.toml`, - `https://docker.io`) - ValidateFileHasContent(ctx, s, - `C:\ProgramData\containerd\certs.d\mcr.azk8s.cn\hosts.toml`, - `https://mcr.azk8s.cn`) - }, - }, - }) -} diff --git a/e2e/vmss.go b/e2e/vmss.go index 3b758c4294e..4abf14f8931 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -789,9 +789,6 @@ func extractLogsFromVMLinux(ctx context.Context, s *Scenario, vm *ScenarioVM) er if s.SecureTLSBootstrappingEnabled() { commandList["secure-tls-bootstrap.log"] = "sudo cat /var/log/azure/aks/secure-tls-bootstrap.log" } - if s.SecureTLSBootstrappingEnabled() { - commandList["secure-tls-bootstrap.log"] = "sudo cat /var/log/azure/aks/secure-tls-bootstrap.log" - } isAzureCNI, err := s.Runtime.Cluster.IsAzureCNI() if err == nil && isAzureCNI { From 286f5322edf6d731a0ecea4fb29efaf932e4e375 Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Thu, 14 May 2026 11:16:59 -0500 Subject: [PATCH 74/87] Remove extra apt-clean --- 
parts/linux/cloud-init/artifacts/ubuntu/cse_helpers_ubuntu.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/ubuntu/cse_helpers_ubuntu.sh b/parts/linux/cloud-init/artifacts/ubuntu/cse_helpers_ubuntu.sh index 72bb85c2caa..d65b5874bcd 100755 --- a/parts/linux/cloud-init/artifacts/ubuntu/cse_helpers_ubuntu.sh +++ b/parts/linux/cloud-init/artifacts/ubuntu/cse_helpers_ubuntu.sh @@ -110,8 +110,6 @@ _apt_get_install() { wait_for_apt_locks DEBIAN_FRONTEND=noninteractive apt-get clean wait_for_apt_locks - DEBIAN_FRONTEND=noninteractive apt-get clean - wait_for_apt_locks return 0 fi From db3c66f67e6b597c4ea4608d82ee17746d1d36eb Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Thu, 14 May 2026 11:51:18 -0500 Subject: [PATCH 75/87] Restore incorrectly merged file --- .../linux/cloud-init/artifacts/ubuntu/cse_install_ubuntu.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/ubuntu/cse_install_ubuntu.sh b/parts/linux/cloud-init/artifacts/ubuntu/cse_install_ubuntu.sh index dd3ab601394..8c58d398191 100755 --- a/parts/linux/cloud-init/artifacts/ubuntu/cse_install_ubuntu.sh +++ b/parts/linux/cloud-init/artifacts/ubuntu/cse_install_ubuntu.sh @@ -42,7 +42,7 @@ blobfuseFallbackPackages() { installDeps() { wait_for_apt_locks - retrycmd_silent 120 5 90 curl -fsSL https://packages.microsoft.com/config/ubuntu/${UBUNTU_RELEASE}/packages-microsoft-prod.deb > /tmp/packages-microsoft-prod.deb || exit $ERR_MS_PROD_DEB_DOWNLOAD_TIMEOUT + retrycmd_silent 120 5 25 curl -fsSL https://packages.microsoft.com/config/ubuntu/${UBUNTU_RELEASE}/packages-microsoft-prod.deb > /tmp/packages-microsoft-prod.deb || exit $ERR_MS_PROD_DEB_DOWNLOAD_TIMEOUT retrycmd_if_failure 60 5 10 dpkg -i /tmp/packages-microsoft-prod.deb || exit $ERR_MS_PROD_DEB_PKG_ADD_FAIL holdWALinuxAgent hold @@ -93,12 +93,12 @@ installDeps() { } updateAptWithMicrosoftPkg() { - retrycmd_silent 120 5 90 curl 
https://packages.microsoft.com/config/ubuntu/${UBUNTU_RELEASE}/prod.list > /tmp/microsoft-prod.list || exit $ERR_MOBY_APT_LIST_TIMEOUT + retrycmd_silent 120 5 25 curl https://packages.microsoft.com/config/ubuntu/${UBUNTU_RELEASE}/prod.list > /tmp/microsoft-prod.list || exit $ERR_MOBY_APT_LIST_TIMEOUT retrycmd_if_failure 10 5 10 cp /tmp/microsoft-prod.list /etc/apt/sources.list.d/ || exit $ERR_MOBY_APT_LIST_TIMEOUT echo "deb [arch=amd64,arm64,armhf] https://packages.microsoft.com/ubuntu/${UBUNTU_RELEASE}/prod testing main" > /etc/apt/sources.list.d/microsoft-prod-testing.list - retrycmd_silent 120 5 90 curl https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor > /tmp/microsoft.gpg || exit $ERR_MS_GPG_KEY_DOWNLOAD_TIMEOUT + retrycmd_silent 120 5 25 curl https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor > /tmp/microsoft.gpg || exit $ERR_MS_GPG_KEY_DOWNLOAD_TIMEOUT retrycmd_if_failure 10 5 10 cp /tmp/microsoft.gpg /etc/apt/trusted.gpg.d/ || exit $ERR_MS_GPG_KEY_DOWNLOAD_TIMEOUT apt_get_update || exit $ERR_APT_UPDATE_TIMEOUT } From b0de98d136309432d96a4091dc9f3c70fdeab962 Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Thu, 14 May 2026 12:12:11 -0500 Subject: [PATCH 76/87] Remove global APT setting --- vhdbuilder/packer/install-dependencies.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 4a559f3becc..0f3753c3c34 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -68,10 +68,6 @@ APT::Periodic::Update-Package-Lists "0"; APT::Periodic::Download-Upgradeable-Packages "0"; APT::Periodic::AutocleanInterval "0"; APT::Periodic::Unattended-Upgrade "0"; -EOF - # Make apt more patient connecting to repositories: set a timeout of 5 min. 
- tee /etc/apt/apt.conf.d/99patience > /dev/null < Date: Thu, 14 May 2026 12:15:03 -0500 Subject: [PATCH 77/87] Reorder file movement in packer_source file for GB200 feature flag --- vhdbuilder/packer/packer_source.sh | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/vhdbuilder/packer/packer_source.sh b/vhdbuilder/packer/packer_source.sh index 9cdbbfc30c7..6a3567a4f9e 100644 --- a/vhdbuilder/packer/packer_source.sh +++ b/vhdbuilder/packer/packer_source.sh @@ -282,18 +282,6 @@ copyPackerFiles() { CLOUD_INIT_STATUS_CHECK_DEST=/opt/azure/containers/cloud-init-status-check.sh cpAndMode $CLOUD_INIT_STATUS_CHECK_SRC $CLOUD_INIT_STATUS_CHECK_DEST 0744 - if grep -q "GB200" <<< "$FEATURE_FLAGS"; then - FMT_SH_SRC=/home/packer/format-mount-nvme-root.sh - FMT_SH_DEST=/opt/azure/containers/format-mount-nvme-root.sh - cpAndMode $FMT_SH_SRC $FMT_SH_DEST 0544 - FMT_SVC_SRC=/home/packer/format-mount-nvme-root.service - FMT_SVC_DEST=/etc/systemd/system/format-mount-nvme-root.service - cpAndMode $FMT_SVC_SRC $FMT_SVC_DEST 600 - FMT_SVC_SRC=/home/packer/format-mount-kubelet.conf - FMT_SVC_DEST=/etc/systemd/system/kubelet.service.d/11-fmtmount.conf - cpAndMode $FMT_SVC_SRC $FMT_SVC_DEST 600 - fi - NOTICE_SRC=/home/packer/NOTICE.txt NOTICE_DEST=/NOTICE.txt @@ -516,6 +504,16 @@ copyPackerFiles() { fi if grep -q "GB200" <<< "$FEATURE_FLAGS"; then + FMT_SH_SRC=/home/packer/format-mount-nvme-root.sh + FMT_SH_DEST=/opt/azure/containers/format-mount-nvme-root.sh + cpAndMode $FMT_SH_SRC $FMT_SH_DEST 0544 + FMT_SVC_SRC=/home/packer/format-mount-nvme-root.service + FMT_SVC_DEST=/etc/systemd/system/format-mount-nvme-root.service + cpAndMode $FMT_SVC_SRC $FMT_SVC_DEST 600 + FMT_SVC_SRC=/home/packer/format-mount-kubelet.conf + FMT_SVC_DEST=/etc/systemd/system/kubelet.service.d/11-fmtmount.conf + cpAndMode $FMT_SVC_SRC $FMT_SVC_DEST 600 + if [ ${UBUNTU_RELEASE} = "24.04" ]; then NVIDIA_LIST_SRC=/home/packer/nvidia-2404.list 
NVIDIA_LIST_DEST=/etc/apt/sources.list.d/nvidia.list From f28b9e42fc71e54141c6bfd315260b36b8cc696f Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Thu, 14 May 2026 12:39:41 -0500 Subject: [PATCH 78/87] Fix kernel installation logic for GB* platform --- vhdbuilder/packer/pre-install-dependencies.sh | 78 +++++++++++-------- 1 file changed, 45 insertions(+), 33 deletions(-) diff --git a/vhdbuilder/packer/pre-install-dependencies.sh b/vhdbuilder/packer/pre-install-dependencies.sh index 6df13bdbbf1..d46b2312150 100644 --- a/vhdbuilder/packer/pre-install-dependencies.sh +++ b/vhdbuilder/packer/pre-install-dependencies.sh @@ -199,48 +199,60 @@ if [[ ${UBUNTU_RELEASE//./} -ge 2204 && "${ENABLE_FIPS,,}" != "true" ]]; then NVIDIA_KERNEL_PACKAGE="linux-azure-nvidia" if [[ "${CPU_ARCH}" == "arm64" && "${UBUNTU_RELEASE}" = "24.04" ]]; then # This is the ubuntu 2404arm64gen2containerd image or the 2404arm64gb200 image - # Uncomment if we have trouble finding the kernel package. # The Ubuntu PPA has early access to new kernels, such as the one in the GB300 CRD. + # Uncomment if we have trouble finding the kernel package. + # add-apt-repository ppa:canonical-kernel-team/ppa if grep -q "GB200" <<< "$FEATURE_FLAGS"; then add-apt-repository ppa:canonical-kernel-team/ppa + apt-get update BOM_PATH="gb200-mai-bom.json" if [ -n "$(jq -r '.["kernel-versions"] | keys[]' $BOM_PATH)" ]; then NVIDIA_KERNEL_PACKAGE=$(jq -r '.["kernel-versions"] | to_entries[] | "\(.key)=\(.value)"' $BOM_PATH) fi - fi - apt-get update - if apt-get install -s "${NVIDIA_KERNEL_PACKAGE}" &> /dev/null; then - echo "ARM64 image. Installing NVIDIA kernel and its packages alongside LTS kernel" - wait_for_apt_locks - sudo apt install --no-install-recommends -y "${NVIDIA_KERNEL_PACKAGE}" - echo "after installation:" - dpkg -l | grep "linux-.*-azure-nvidia" || true + if apt-get install -s "${NVIDIA_KERNEL_PACKAGE}" &> /dev/null; then + echo "ARM64 image. 
Installing NVIDIA kernel and its packages alongside LTS kernel" + wait_for_apt_locks + apt install --no-install-recommends -y "${NVIDIA_KERNEL_PACKAGE}" + echo "after installation:" + dpkg -l | grep "linux-.*-azure-nvidia" || true + else + echo "ARM64 image. NVIDIA kernel not available from repo, fetching and installing dpkgs by hand" + curl -fsSL https://ports.ubuntu.com/pool/main/l/linux-azure-nvidia-6.14/linux-modules-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb > /tmp/linux-modules-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb + curl -fsSL https://ports.ubuntu.com/pool/main/l/linux-azure-nvidia-6.14/linux-azure-nvidia-6.14-cloud-tools-6.14.0-1003_6.14.0-1003.3_arm64.deb > /tmp/linux-azure-nvidia-6.14-cloud-tools-6.14.0-1003_6.14.0-1003.3_arm64.deb + curl -fsSL https://ports.ubuntu.com/pool/main/l/linux-azure-nvidia-6.14/linux-azure-nvidia-6.14-cloud-tools-common_6.14.0-1003.3_all.deb > /tmp/linux-azure-nvidia-6.14-cloud-tools-common_6.14.0-1003.3_all.deb + curl -fsSL https://ports.ubuntu.com/pool/main/l/linux-azure-nvidia-6.14/linux-azure-nvidia-6.14-headers-6.14.0-1003_6.14.0-1003.3_all.deb > /tmp/linux-azure-nvidia-6.14-headers-6.14.0-1003_6.14.0-1003.3_all.deb + curl -fsSL https://ports.ubuntu.com/pool/main/l/linux-azure-nvidia-6.14/linux-azure-nvidia-6.14-tools-6.14.0-1003_6.14.0-1003.3_arm64.deb > /tmp/linux-azure-nvidia-6.14-tools-6.14.0-1003_6.14.0-1003.3_arm64.deb + curl -fsSL https://ports.ubuntu.com/pool/main/l/linux-azure-nvidia-6.14/linux-cloud-tools-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb > /tmp/linux-cloud-tools-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb + curl -fsSL https://ports.ubuntu.com/pool/main/l/linux-azure-nvidia-6.14/linux-headers-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb > /tmp/linux-headers-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb + curl -fsSL https://ports.ubuntu.com/pool/main/l/linux-azure-nvidia-6.14/linux-tools-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb > 
/tmp/linux-tools-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb + + curl -fsSL https://ports.ubuntu.com/pool/main/l/linux-azure-nvidia-6.14/linux-image-unsigned-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb > /tmp/linux-image-unsigned-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb + + dpkg -i /tmp/linux-modules-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb + dpkg -i /tmp/linux-azure-nvidia-6.14-cloud-tools-6.14.0-1003_6.14.0-1003.3_arm64.deb + dpkg -i /tmp/linux-azure-nvidia-6.14-cloud-tools-common_6.14.0-1003.3_all.deb + dpkg -i /tmp/linux-azure-nvidia-6.14-headers-6.14.0-1003_6.14.0-1003.3_all.deb + dpkg -i /tmp/linux-azure-nvidia-6.14-tools-6.14.0-1003_6.14.0-1003.3_arm64.deb + dpkg -i /tmp/linux-cloud-tools-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb + dpkg -i /tmp/linux-headers-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb + dpkg -i /tmp/linux-tools-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb + dpkg -i /tmp/linux-image-unsigned-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb + + rm /tmp/*.deb + fi + add-apt-repository --remove ppa:canonical-kernel-team/ppa else - echo "ARM64 image. 
NVIDIA kernel not available from repo, fetching and installing dpkgs by hand" - curl -fsSL https://ports.ubuntu.com/pool/main/l/linux-azure-nvidia-6.14/linux-modules-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb > /tmp/linux-modules-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb - curl -fsSL https://ports.ubuntu.com/pool/main/l/linux-azure-nvidia-6.14/linux-azure-nvidia-6.14-cloud-tools-6.14.0-1003_6.14.0-1003.3_arm64.deb > /tmp/linux-azure-nvidia-6.14-cloud-tools-6.14.0-1003_6.14.0-1003.3_arm64.deb - curl -fsSL https://ports.ubuntu.com/pool/main/l/linux-azure-nvidia-6.14/linux-azure-nvidia-6.14-cloud-tools-common_6.14.0-1003.3_all.deb > /tmp/linux-azure-nvidia-6.14-cloud-tools-common_6.14.0-1003.3_all.deb - curl -fsSL https://ports.ubuntu.com/pool/main/l/linux-azure-nvidia-6.14/linux-azure-nvidia-6.14-headers-6.14.0-1003_6.14.0-1003.3_all.deb > /tmp/linux-azure-nvidia-6.14-headers-6.14.0-1003_6.14.0-1003.3_all.deb - curl -fsSL https://ports.ubuntu.com/pool/main/l/linux-azure-nvidia-6.14/linux-azure-nvidia-6.14-tools-6.14.0-1003_6.14.0-1003.3_arm64.deb > /tmp/linux-azure-nvidia-6.14-tools-6.14.0-1003_6.14.0-1003.3_arm64.deb - curl -fsSL https://ports.ubuntu.com/pool/main/l/linux-azure-nvidia-6.14/linux-cloud-tools-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb > /tmp/linux-cloud-tools-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb - curl -fsSL https://ports.ubuntu.com/pool/main/l/linux-azure-nvidia-6.14/linux-headers-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb > /tmp/linux-headers-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb - curl -fsSL https://ports.ubuntu.com/pool/main/l/linux-azure-nvidia-6.14/linux-tools-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb > /tmp/linux-tools-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb - - curl -fsSL https://ports.ubuntu.com/pool/main/l/linux-azure-nvidia-6.14/linux-image-unsigned-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb > /tmp/linux-image-unsigned-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb - - 
dpkg -i /tmp/linux-modules-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb - dpkg -i /tmp/linux-azure-nvidia-6.14-cloud-tools-6.14.0-1003_6.14.0-1003.3_arm64.deb - dpkg -i /tmp/linux-azure-nvidia-6.14-cloud-tools-common_6.14.0-1003.3_all.deb - dpkg -i /tmp/linux-azure-nvidia-6.14-headers-6.14.0-1003_6.14.0-1003.3_all.deb - dpkg -i /tmp/linux-azure-nvidia-6.14-tools-6.14.0-1003_6.14.0-1003.3_arm64.deb - dpkg -i /tmp/linux-cloud-tools-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb - dpkg -i /tmp/linux-headers-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb - dpkg -i /tmp/linux-tools-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb - dpkg -i /tmp/linux-image-unsigned-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb - - rm /tmp/*.deb + apt-get update + if apt-cache show "${NVIDIA_KERNEL_PACKAGE}" &> /dev/null; then + echo "ARM64 image. Installing NVIDIA kernel and its packages alongside LTS kernel" + wait_for_apt_locks + sudo apt install --no-install-recommends -y "${NVIDIA_KERNEL_PACKAGE}" + echo "after installation:" + dpkg -l | grep "linux-.*-azure-nvidia" || true + else + echo "ARM64 image. NVIDIA kernel not available, skipping installation." 
+ fi fi - add-apt-repository --remove ppa:canonical-kernel-team/ppa fi wait_for_apt_locks if grep -q "cvm" <<< "$FEATURE_FLAGS"; then From 021b8b0c6827f859d74824b95129856905edbab5 Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Thu, 14 May 2026 13:05:06 -0500 Subject: [PATCH 79/87] Remove reference to teleportd service file --- vhdbuilder/packer/vhd-image-builder-arm64-gb200.json | 6 ------ 1 file changed, 6 deletions(-) diff --git a/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json b/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json index 59d3e576c15..4b8372b15b6 100644 --- a/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json +++ b/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json @@ -17,7 +17,6 @@ "sig_image_name": "{{env `SIG_IMAGE_NAME`}}", "sig_image_version": "{{env `SIG_IMAGE_VERSION`}}", "container_runtime": "{{env `CONTAINER_RUNTIME`}}", - "teleportd_plugin_download_url": "{{env `TELEPORTD_PLUGIN_DOWNLOAD_URL`}}", "captured_sig_version": "{{env `${CAPTURED_SIG_VERSION`}}", "enable_fips": "{{env `ENABLE_FIPS`}}", "img_publisher": "{{env `IMG_PUBLISHER`}}", @@ -293,11 +292,6 @@ "source": "parts/linux/cloud-init/artifacts/ensure-no-dup.service", "destination": "/home/packer/ensure-no-dup.service" }, - { - "type": "file", - "source": "parts/linux/cloud-init/artifacts/teleportd.service", - "destination": "/home/packer/teleportd.service" - }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/setup-custom-search-domains.sh", From a76e7ec2e71bc6541dd839e6f2111262d28b845f Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Thu, 14 May 2026 14:33:56 -0500 Subject: [PATCH 80/87] Add missing files referenced in packer_source.sh --- .../packer/vhd-image-builder-arm64-gb200.json | 105 ++++++++++++++++++ 1 file changed, 105 insertions(+) diff --git a/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json b/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json index 4b8372b15b6..0380bffe9c5 100644 --- a/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json 
+++ b/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json @@ -87,6 +87,11 @@ "source": "vhdbuilder/lister/bin/lister", "destination": "/home/packer/lister" }, + { + "type": "file", + "source": "image-fetcher/bin/image-fetcher-linux-arm64", + "destination": "/home/packer/image-fetcher" + }, { "type": "file", "source": "aks-node-controller/bin/aks-node-controller-linux-arm64", @@ -132,6 +137,16 @@ "source": "vhdbuilder/packer/cleanup-vhd.sh", "destination": "/home/packer/cleanup-vhd.sh" }, + { + "type": "file", + "source": "vhdbuilder/packer/post-deprovision-walinuxagent.sh", + "destination": "/home/packer/post-deprovision-walinuxagent.sh" + }, + { + "type": "file", + "source": "vhdbuilder/packer/install_walinuxagent.py", + "destination": "/home/packer/install_walinuxagent.py" + }, { "type": "file", "source": "vhdbuilder/packer/packer_source.sh", @@ -332,11 +347,61 @@ "source": "vhdbuilder/packer/pre-install-dependencies.sh", "destination": "/home/packer/pre-install-dependencies.sh" }, + { + "type": "file", + "source": "vhdbuilder/packer/install-ig.sh", + "destination": "/home/packer/install-ig.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/inspektor-gadget/baseline/usr/share/inspektor-gadget/import_gadgets.sh", + "destination": "/home/packer/ig-import-gadgets.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/inspektor-gadget/baseline/usr/share/inspektor-gadget/remove_gadgets.sh", + "destination": "/home/packer/ig-remove-gadgets.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/inspektor-gadget/baseline/usr/lib/systemd/system/ig-import-gadgets.service", + "destination": "/home/packer/ig-import-gadgets.service" + }, { "type": "file", "source": "vhdbuilder/packer/install-dependencies.sh", "destination": "/home/packer/install-dependencies.sh" }, + { + "type": "file", + "source": "vhdbuilder/packer/install-node-exporter.sh", + "destination": "/home/packer/install-node-exporter.sh" + }, + 
{ + "type": "file", + "source": "parts/linux/cloud-init/artifacts/node-exporter/node-exporter-startup.sh", + "destination": "/home/packer/node-exporter-startup.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/node-exporter/baseline/etc/systemd/system/node-exporter.service", + "destination": "/home/packer/node-exporter.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/node-exporter/baseline/etc/systemd/system/node-exporter-restart.service", + "destination": "/home/packer/node-exporter-restart.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/node-exporter/baseline/etc/systemd/system/node-exporter-restart.path", + "destination": "/home/packer/node-exporter-restart.path" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/node-exporter/baseline/etc/node-exporter.d/web-config.yml", + "destination": "/home/packer/node-exporter-web-config.yml" + }, { "type": "file", "source": "vhdbuilder/packer/generate-disk-usage.sh", @@ -652,6 +717,46 @@ "source": "parts/linux/cloud-init/artifacts/localdns-delegate.conf", "destination": "/home/packer/localdns-delegate.conf" }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/localdns_exporter.sh", + "destination": "/home/packer/localdns_exporter.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/localdns-exporter.socket", + "destination": "/home/packer/localdns-exporter.socket" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/localdns-exporter@.service", + "destination": "/home/packer/localdns-exporter@.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-localdns-hosts-setup.sh", + "destination": "/home/packer/aks-localdns-hosts-setup.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-localdns-hosts-setup.service", + "destination": "/home/packer/aks-localdns-hosts-setup.service" + }, + { + 
"type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-localdns-hosts-setup.timer", + "destination": "/home/packer/aks-localdns-hosts-setup.timer" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/configure-azure-network.sh", + "destination": "/home/packer/configure-azure-network.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/99-azure-network.rules", + "destination": "/home/packer/99-azure-network.rules" + }, { "type": "file", "source": "parts/linux/cloud-init/artifacts/10_azure_nvidia", From 352087afb051981de361ecae2bdc3b2623115335 Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Thu, 14 May 2026 15:33:03 -0500 Subject: [PATCH 81/87] Update package version for libcap2-bin --- vhdbuilder/packer/gb200-mai-bom.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vhdbuilder/packer/gb200-mai-bom.json b/vhdbuilder/packer/gb200-mai-bom.json index 621bc8e9fba..2dad95fea11 100644 --- a/vhdbuilder/packer/gb200-mai-bom.json +++ b/vhdbuilder/packer/gb200-mai-bom.json @@ -30,7 +30,7 @@ "datacenter-gpu-manager-4-proprietary-cuda13": "1:4.4.1-1", "datacenter-gpu-manager-4-multinode": "1:4.4.1-1", "datacenter-gpu-manager-4-multinode-cuda13": "1:4.4.1-1", - "libcap2-bin": "1:2.66-5ubuntu2.2", + "libcap2-bin": "1:2.66-5ubuntu2.4", "k8s-device-plugin": "0.17.3-ubuntu24.04u5", "nvidia-imex": "580.105.08-1", "librdmacm-dev": "2507mlnx58-1.2507097.0214", From dfc1db0a40ddd37e8f36c28db4b3c701864fd1af Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Thu, 14 May 2026 20:49:09 -0500 Subject: [PATCH 82/87] Increase disk size for GB200 image --- vhdbuilder/packer/post-install-dependencies.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vhdbuilder/packer/post-install-dependencies.sh b/vhdbuilder/packer/post-install-dependencies.sh index 3113d49569a..c6481f3e34c 100644 --- a/vhdbuilder/packer/post-install-dependencies.sh +++ b/vhdbuilder/packer/post-install-dependencies.sh @@ -19,7 +19,7 @@ 
VHD_LOGS_FILEPATH=/opt/azure/vhd-install.complete PERFORMANCE_DATA_FILE=/opt/azure/vhd-build-performance-data.json # Hardcode the desired size of the OS disk so we don't accidently rely on extra disk space -if [ "$OS" = "$FLATCAR_OS_NAME" ] || isACL "$OS" "$OS_VARIANT"; then +if [ "$OS" = "$FLATCAR_OS_NAME" ] || isACL "$OS" "$OS_VARIANT" || grep -q "GB200" <<< "$FEATURE_FLAGS"; then MAX_BLOCK_COUNT=60397977 # 60 GB DISK_SIZE_GB=60 else From 3e0d69b0e0549e4365cf50d274bf4f6859800eb3 Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Thu, 14 May 2026 23:18:44 -0500 Subject: [PATCH 83/87] Fix waagent script invocation --- vhdbuilder/packer/vhd-image-builder-arm64-gb200.json | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json b/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json index 0380bffe9c5..8cbc65dfd8f 100644 --- a/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json +++ b/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json @@ -805,7 +805,7 @@ { "type": "shell", "inline": [ - "sudo FEATURE_FLAGS={{user `feature_flags`}} BUILD_NUMBER={{user `build_number`}} BUILD_ID={{user `build_id`}} COMMIT={{user `commit`}} HYPERV_GENERATION={{user `hyperv_generation`}} CONTAINER_RUNTIME={{user `container_runtime`}} TELEPORTD_PLUGIN_DOWNLOAD_URL={{user `teleportd_plugin_download_url`}} ENABLE_FIPS={{user `enable_fips`}} IMG_SKU={{user `img_sku`}} VHD_BUILD_TIMESTAMP={{user `vhd_build_timestamp`}} /bin/bash -ux /home/packer/pre-install-dependencies.sh" + "sudo FEATURE_FLAGS={{user `feature_flags`}} BUILD_NUMBER={{user `build_number`}} BUILD_ID={{user `build_id`}} COMMIT={{user `commit`}} HYPERV_GENERATION={{user `hyperv_generation`}} CONTAINER_RUNTIME={{user `container_runtime`}} TELEPORTD_PLUGIN_DOWNLOAD_URL={{user `teleportd_plugin_download_url`}} ENABLE_FIPS={{user `enable_fips`}} IMG_SKU={{user `img_sku`}} VHD_BUILD_TIMESTAMP={{user `vhd_build_timestamp`}} /bin/bash -eux 
/home/packer/pre-install-dependencies.sh" ] }, { @@ -818,7 +818,7 @@ { "type": "shell", "inline": [ - "sudo CONTINUE_ON_LOCAL_REPO_DOWNLOAD_ERROR={{user `continue_on_local_repo_download_error`}} LOCAL_DOCA_REPO_URL=\"{{user `local_doca_repo_url`}}\" FEATURE_FLAGS={{user `feature_flags`}} BUILD_NUMBER={{user `build_number`}} BUILD_ID={{user `build_id`}} COMMIT={{user `commit`}} HYPERV_GENERATION={{user `hyperv_generation`}} CONTAINER_RUNTIME={{user `container_runtime`}} TELEPORTD_PLUGIN_DOWNLOAD_URL={{user `teleportd_plugin_download_url`}} ENABLE_FIPS={{user `enable_fips`}} IMG_SKU={{user `img_sku`}} PRIVATE_PACKAGES_URL={{user `private_packages_url`}} VHD_BUILD_TIMESTAMP={{user `vhd_build_timestamp`}} /bin/bash -ux /home/packer/install-dependencies.sh" + "sudo CONTINUE_ON_LOCAL_REPO_DOWNLOAD_ERROR={{user `continue_on_local_repo_download_error`}} LOCAL_DOCA_REPO_URL=\"{{user `local_doca_repo_url`}}\" FEATURE_FLAGS={{user `feature_flags`}} BUILD_NUMBER={{user `build_number`}} BUILD_ID={{user `build_id`}} COMMIT={{user `commit`}} HYPERV_GENERATION={{user `hyperv_generation`}} CONTAINER_RUNTIME={{user `container_runtime`}} TELEPORTD_PLUGIN_DOWNLOAD_URL={{user `teleportd_plugin_download_url`}} ENABLE_FIPS={{user `enable_fips`}} IMG_SKU={{user `img_sku`}} PRIVATE_PACKAGES_URL={{user `private_packages_url`}} VHD_BUILD_TIMESTAMP={{user `vhd_build_timestamp`}} /bin/bash -eux /home/packer/install-dependencies.sh" ] }, { @@ -851,7 +851,7 @@ { "type": "shell", "inline": [ - "sudo FEATURE_FLAGS={{user `feature_flags`}} BUILD_NUMBER={{user `build_number`}} BUILD_ID={{user `build_id`}} COMMIT={{user `commit`}} HYPERV_GENERATION={{user `hyperv_generation`}} CONTAINER_RUNTIME={{user `container_runtime`}} TELEPORTD_PLUGIN_DOWNLOAD_URL={{user `teleportd_plugin_download_url`}} ENABLE_FIPS={{user `enable_fips`}} IMG_SKU={{user `img_sku`}} /bin/bash -ux /home/packer/post-install-dependencies.sh" + "sudo FEATURE_FLAGS={{user `feature_flags`}} BUILD_NUMBER={{user `build_number`}} 
BUILD_ID={{user `build_id`}} COMMIT={{user `commit`}} HYPERV_GENERATION={{user `hyperv_generation`}} CONTAINER_RUNTIME={{user `container_runtime`}} TELEPORTD_PLUGIN_DOWNLOAD_URL={{user `teleportd_plugin_download_url`}} ENABLE_FIPS={{user `enable_fips`}} IMG_SKU={{user `img_sku`}} /bin/bash -eux /home/packer/post-install-dependencies.sh" ] }, { @@ -862,7 +862,7 @@ { "type": "shell", "inline": [ - "sudo SKU_NAME={{user `sku_name`}} IMAGE_VERSION={{user `image_version`}} CONTAINER_RUNTIME={{user `container_runtime`}} /bin/bash -ux /home/packer/list-images.sh" + "sudo SKU_NAME={{user `sku_name`}} IMAGE_VERSION={{user `image_version`}} CONTAINER_RUNTIME={{user `container_runtime`}} /bin/bash -eux /home/packer/list-images.sh" ] }, { @@ -901,7 +901,7 @@ "inline": [ "sudo /bin/bash -eux /home/packer/cis.sh", "sudo /bin/bash -eux /opt/azure/containers/cleanup-vhd.sh", - "sudo /usr/sbin/waagent -force -deprovision+user && export HISTSIZE=0 && sync || exit 125" + "sudo /bin/bash -c '/usr/sbin/waagent -force -deprovision+user && export HISTSIZE=0 && /opt/azure/containers/post-deprovision-walinuxagent.sh' || exit 125" ] } ], From 8e9352f19dc0b2e75c609790182fa1b1dadc2051 Mon Sep 17 00:00:00 2001 From: Keith Pimm Date: Fri, 15 May 2026 08:23:04 -0500 Subject: [PATCH 84/87] Fix containerd overwrite problem --- .../linux/cloud-init/artifacts/cse_config.sh | 31 ++++++----- .../cloud-init/artifacts/cse_config_spec.sh | 53 +++++++++++++++++++ 2 files changed, 71 insertions(+), 13 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/cse_config.sh b/parts/linux/cloud-init/artifacts/cse_config.sh index c3d1a5cf939..9cad4a628e8 100755 --- a/parts/linux/cloud-init/artifacts/cse_config.sh +++ b/parts/linux/cloud-init/artifacts/cse_config.sh @@ -345,22 +345,27 @@ LimitNOFILE=1048576 EOF mkdir -p /etc/containerd - # Remove in case this is an existing symlink - rm -f /etc/containerd/config.toml - if [ "${GPU_NODE}" = "true" ]; then - # Check VM tag directly to determine if GPU drivers 
should be skipped - export -f should_skip_nvidia_drivers - should_skip=$(should_skip_nvidia_drivers) - if [ "$?" -eq 0 ] && [ "${should_skip}" = "true" ]; then - echo "Generating non-GPU containerd config for GPU node due to VM tags" - echo "${CONTAINERD_CONFIG_NO_GPU_CONTENT}" | base64 -d > /etc/containerd/config.toml || exit $ERR_FILE_WATCH_TIMEOUT + + if grep -q 'BinaryName = "/usr/bin/nvidia-container-runtime"' /etc/containerd/config.toml 2>/dev/null; then + echo "NVIDIA containerd config already exists at /etc/containerd/config.toml, skipping generation" + else + # Remove in case this is an existing symlink or non-NVIDIA config + rm -f /etc/containerd/config.toml + if [ "${GPU_NODE}" = "true" ]; then + # Check VM tag directly to determine if GPU drivers should be skipped + export -f should_skip_nvidia_drivers + should_skip=$(should_skip_nvidia_drivers) + if [ "$?" -eq 0 ] && [ "${should_skip}" = "true" ]; then + echo "Generating non-GPU containerd config for GPU node due to VM tags" + echo "${CONTAINERD_CONFIG_NO_GPU_CONTENT}" | base64 -d > /etc/containerd/config.toml || exit $ERR_FILE_WATCH_TIMEOUT + else + echo "Generating GPU containerd config..." + echo "${CONTAINERD_CONFIG_CONTENT}" | base64 -d > /etc/containerd/config.toml || exit $ERR_FILE_WATCH_TIMEOUT + fi else - echo "Generating GPU containerd config..." + echo "Generating containerd config..." echo "${CONTAINERD_CONFIG_CONTENT}" | base64 -d > /etc/containerd/config.toml || exit $ERR_FILE_WATCH_TIMEOUT fi - else - echo "Generating containerd config..." 
- echo "${CONTAINERD_CONFIG_CONTENT}" | base64 -d > /etc/containerd/config.toml || exit $ERR_FILE_WATCH_TIMEOUT fi export -f should_e2e_mock_azure_china_cloud diff --git a/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh b/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh index eae2b8a27dc..5051528c554 100755 --- a/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh +++ b/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh @@ -330,6 +330,59 @@ Describe 'cse_config.sh' End End + Describe 'ensureContainerd' + It 'should not overwrite an existing NVIDIA containerd config' + grep() { + echo "grep $@" + return 0 + } + + mkdir() { + echo "mkdir $@" + } + + rm() { + echo "rm $@" + } + + tee() { + echo "tee $@" + cat >/dev/null + } + + retrycmd_if_failure() { + echo "retrycmd_if_failure $@" + return 0 + } + + systemctlEnableAndStartNoBlock() { + echo "systemctlEnableAndStartNoBlock $@" + return 0 + } + + should_e2e_mock_azure_china_cloud() { + echo "false" + } + + GPU_NODE="false" + TARGET_CLOUD="AzurePublicCloud" + BOOTSTRAP_PROFILE_CONTAINER_REGISTRY_SERVER="" + ERR_SYSCTL_RELOAD=1 + ERR_SYSTEMCTL_START_FAIL=1 + + When call ensureContainerd + + The output should include 'grep -q BinaryName = "/usr/bin/nvidia-container-runtime" /etc/containerd/config.toml' + The output should include "NVIDIA containerd config already exists at /etc/containerd/config.toml, skipping generation" + The output should not include "rm -f /etc/containerd/config.toml" + The output should not include "Generating containerd config" + The output should not include "Generating GPU containerd config" + The output should not include "Generating non-GPU containerd config" + The output should include "systemctlEnableAndStartNoBlock containerd 30" + The status should be success + End + End + Describe 'configureContainerdRegistryHost' It 'should configure registry host correctly if MCR_REPOSITORY_BASE is unset' mkdir() { From 30dedb678d31cac72a22d27e23ec1f3f05847506 Mon Sep 17 
00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 15 May 2026 18:03:04 +0000 Subject: [PATCH 85/87] fix: address GB200 review feedback in dependency install logic Agent-Logs-Url: https://github.com/Azure/AgentBaker/sessions/8b4cdf6e-94c7-4c74-b5e2-8983bd8d7c23 Co-authored-by: cameronmeissner <24923771+cameronmeissner@users.noreply.github.com> --- vhdbuilder/packer/install-dependencies.sh | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 0f3753c3c34..945f2a94689 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -721,15 +721,11 @@ EOF fi if grep -q "GB200" <<< "$FEATURE_FLAGS"; then - # The GB200 feature flag should only be set for arm64 and Ubuntu 24.04, but validate - if [ ${UBUNTU_RELEASE} = "24.04" ]; then - # Need to replicate all functionality from github.com/azure/aks-gpu/install.sh. + # GB200 setup is only supported on arm64 Ubuntu 24.04. + if [ "${CPU_ARCH}" = "arm64" ] && [ "${UBUNTU_RELEASE}" = "24.04" ]; then + # Replicate all functionality from github.com/azure/aks-gpu/install.sh. # aks-gpu is designed to run at node boot/join time, whereas the GB200 VHD is set up # to have all drivers installed at VHD build time. - # - # TODO(abenn135): move all GPU installation logic back into the AgentBaker repo, and - # invoke it where we need it, either at VHD build time or at node boot time (for example - # if we do not know at VHD build time whether we will want GPU drivers installed or not). # 1. Blacklist nouveau driver cat << EOF >> /etc/modprobe.d/blacklist-nouveau.conf @@ -775,10 +771,6 @@ EOF systemctl enable nvidia-dcgm-exporter systemctl enable nvidia-device-plugin systemctl enable openibd - - # One additional request from MAI: Disable the AKS node problem detector. 
When this file is present, the Azure AKS VM Extension assumes the NPD has been installed on the VHD and skips installing it at provision time. - mkdir -p /etc/node-problem-detector.d/ - touch /etc/node-problem-detector.d/skip_vhd_npd fi fi From 332acf0ee3f6fdc6755a79ab7b930a2dcd9f5b9e Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 15 May 2026 18:48:49 +0000 Subject: [PATCH 86/87] fix: restore GB200 NPD skip marker behavior Agent-Logs-Url: https://github.com/Azure/AgentBaker/sessions/7402cb4e-9c9d-4f11-8a2f-e49e30d5a461 Co-authored-by: keith-ms <153014933+keith-ms@users.noreply.github.com> --- vhdbuilder/packer/install-dependencies.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 945f2a94689..dc54748ed98 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -771,6 +771,10 @@ EOF systemctl enable nvidia-dcgm-exporter systemctl enable nvidia-device-plugin systemctl enable openibd + + # One additional request from MAI: Disable the AKS node problem detector. When this file is present, the Azure AKS VM Extension assumes the NPD has been installed on the VHD and skips installing it at provision time. 
+ mkdir -p /etc/node-problem-detector.d/ + touch /etc/node-problem-detector.d/skip_vhd_npd fi fi From eeb56b871020d02a2c4f19dba7a951a16acc5112 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 15 May 2026 18:49:18 +0000 Subject: [PATCH 87/87] chore: clarify NPD skip marker comment wording Agent-Logs-Url: https://github.com/Azure/AgentBaker/sessions/7402cb4e-9c9d-4f11-8a2f-e49e30d5a461 Co-authored-by: keith-ms <153014933+keith-ms@users.noreply.github.com> --- vhdbuilder/packer/install-dependencies.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index dc54748ed98..ddd056e97ef 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -772,7 +772,8 @@ EOF systemctl enable nvidia-device-plugin systemctl enable openibd - # One additional request from MAI: Disable the AKS node problem detector. When this file is present, the Azure AKS VM Extension assumes the NPD has been installed on the VHD and skips installing it at provision time. + # One additional request from MAI: signal that NPD is pre-installed on the VHD. + # When this file is present, the Azure AKS VM Extension skips installing NPD at provision time. mkdir -p /etc/node-problem-detector.d/ touch /etc/node-problem-detector.d/skip_vhd_npd fi