diff --git a/.pipelines/.vsts-vhd-builder-release.yaml b/.pipelines/.vsts-vhd-builder-release.yaml index bb6490d7dbd..0e49420696f 100644 --- a/.pipelines/.vsts-vhd-builder-release.yaml +++ b/.pipelines/.vsts-vhd-builder-release.yaml @@ -975,7 +975,7 @@ stages: echo '##vso[task.setvariable variable=IMG_SKU]server-arm64' echo '##vso[task.setvariable variable=IMG_VERSION]latest' echo '##vso[task.setvariable variable=HYPERV_GENERATION]V2' - echo '##vso[task.setvariable variable=AZURE_VM_SIZE]Standard_D16pds_v5' + echo '##vso[task.setvariable variable=AZURE_VM_SIZE]Standard_D32pds_v5' echo '##vso[task.setvariable variable=FEATURE_FLAGS]GB200' echo '##vso[task.setvariable variable=ARCHITECTURE]ARM64' echo '##vso[task.setvariable variable=ENABLE_FIPS]False' diff --git a/.pipelines/templates/.builder-release-template.yaml b/.pipelines/templates/.builder-release-template.yaml index f46cd7e2000..5fb7511ab14 100644 --- a/.pipelines/templates/.builder-release-template.yaml +++ b/.pipelines/templates/.builder-release-template.yaml @@ -106,6 +106,8 @@ steps: BUILD_ID: $(Build.BuildId) BUILD_DEFINITION_NAME: $(Build.DefinitionName) UA_TOKEN: $(ua-token) + LOCAL_DOCA_REPO_URL: $(LOCAL_DOCA_REPO_URL) + CONTINUE_ON_LOCAL_REPO_DOWNLOAD_ERROR: $(CONTINUE_ON_LOCAL_REPO_DOWNLOAD_ERROR) - task: AzureCLI@2 inputs: @@ -350,7 +352,7 @@ steps: TargetFolder: '$(Build.ArtifactStagingDirectory)' - task: CopyFiles@2 - condition: and(eq(variables.OS_SKU, 'Ubuntu'), in(variables.OS_VERSION, '22.04', '24.04'), in(variables.FEATURE_FLAGS, 'None', 'cvm')) + condition: and(eq(variables.OS_SKU, 'Ubuntu'), in(variables.OS_VERSION, '22.04', '24.04'), in(variables.FEATURE_FLAGS, 'None', 'cvm', 'GB200')) displayName: Copy CIS Reports inputs: SourceFolder: '$(System.DefaultWorkingDirectory)' diff --git a/packer.mk b/packer.mk index 8b61ade6561..a783a20a1f5 100755 --- a/packer.mk +++ b/packer.mk @@ -12,8 +12,13 @@ build-packer: setup-golang generate-prefetch-scripts build-image-fetcher build-a ifeq 
(${ARCHITECTURE},ARM64) @echo "${MODE}: Building with Hyper-v generation 2 ARM64 VM" ifeq (${OS_SKU},Ubuntu) +ifeq ($(findstring GB200,$(FEATURE_FLAGS)),GB200) + @echo "Using packer template file vhd-image-builder-arm64-gb200.json" + @packer build -timestamp-ui -var-file=vhdbuilder/packer/settings.json vhdbuilder/packer/vhd-image-builder-arm64-gb200.json +else @echo "Using packer template file vhd-image-builder-arm64-gen2.json" @packer build -timestamp-ui -var-file=vhdbuilder/packer/settings.json vhdbuilder/packer/vhd-image-builder-arm64-gen2.json +endif else ifeq (${OS_SKU},CBLMariner) @echo "Using packer template file vhd-image-builder-mariner-arm64.json" @packer build -timestamp-ui -var-file=vhdbuilder/packer/settings.json vhdbuilder/packer/vhd-image-builder-mariner-arm64.json diff --git a/parts/linux/cloud-init/artifacts/cse_config.sh b/parts/linux/cloud-init/artifacts/cse_config.sh index 1bccaf924c7..9cad4a628e8 100755 --- a/parts/linux/cloud-init/artifacts/cse_config.sh +++ b/parts/linux/cloud-init/artifacts/cse_config.sh @@ -345,22 +345,27 @@ LimitNOFILE=1048576 EOF mkdir -p /etc/containerd - # Remove in case this is an existing symlink - rm -f /etc/containerd/config.toml - if [ "${GPU_NODE}" = "true" ]; then - # Check VM tag directly to determine if GPU drivers should be skipped - export -f should_skip_nvidia_drivers - should_skip=$(should_skip_nvidia_drivers) - if [ "$?" 
-eq 0 ] && [ "${should_skip}" = "true" ]; then - echo "Generating non-GPU containerd config for GPU node due to VM tags" - echo "${CONTAINERD_CONFIG_NO_GPU_CONTENT}" | base64 -d > /etc/containerd/config.toml || exit $ERR_FILE_WATCH_TIMEOUT + + if grep -q 'BinaryName = "/usr/bin/nvidia-container-runtime"' /etc/containerd/config.toml 2>/dev/null; then + echo "NVIDIA containerd config already exists at /etc/containerd/config.toml, skipping generation" + else + # Remove in case this is an existing symlink or non-NVIDIA config + rm -f /etc/containerd/config.toml + if [ "${GPU_NODE}" = "true" ]; then + # Check VM tag directly to determine if GPU drivers should be skipped + export -f should_skip_nvidia_drivers + should_skip=$(should_skip_nvidia_drivers) + if [ "$?" -eq 0 ] && [ "${should_skip}" = "true" ]; then + echo "Generating non-GPU containerd config for GPU node due to VM tags" + echo "${CONTAINERD_CONFIG_NO_GPU_CONTENT}" | base64 -d > /etc/containerd/config.toml || exit $ERR_FILE_WATCH_TIMEOUT + else + echo "Generating GPU containerd config..." + echo "${CONTAINERD_CONFIG_CONTENT}" | base64 -d > /etc/containerd/config.toml || exit $ERR_FILE_WATCH_TIMEOUT + fi else - echo "Generating GPU containerd config..." + echo "Generating containerd config..." echo "${CONTAINERD_CONFIG_CONTENT}" | base64 -d > /etc/containerd/config.toml || exit $ERR_FILE_WATCH_TIMEOUT fi - else - echo "Generating containerd config..." 
- echo "${CONTAINERD_CONFIG_CONTENT}" | base64 -d > /etc/containerd/config.toml || exit $ERR_FILE_WATCH_TIMEOUT fi export -f should_e2e_mock_azure_china_cloud @@ -634,6 +639,44 @@ ensurePodInfraContainerImage() { rm -f ${POD_INFRA_CONTAINER_IMAGE_TAR} } +validateKubeletNodeLabels() { + local labels="$1" + local validated_labels="" + local delimiter="" + + # Return empty if no labels provided + if [ -z "$labels" ]; then + echo "No labels found in KUBELET_NODE_LABELS" + return 0 + fi + + # Split labels by comma and process each + IFS=',' read -ra LABEL_ARRAY <<< "$labels" + for label in "${LABEL_ARRAY[@]}"; do + # Split each label into key and value + # shellcheck disable=SC3010 + if [[ "$label" == *"="* ]]; then + key="${label%%=*}" + value="${label#*=}" + + # Check if key length exceeds 63 characters + if [ ${#key} -gt 63 ]; then + echo "Warning: Label key '$key' exceeds 63 characters, truncating to 63 characters" >&2 + key="${key:0:63}" + fi + + # Rebuild the label with potentially truncated key + validated_labels="${validated_labels}${delimiter}${key}=${value}" + fi + + # Set delimiter for subsequent labels + delimiter="," + done + + # Update the global variable with validated labels + KUBELET_NODE_LABELS="$validated_labels" +} + ensureKubelet() { KUBELET_DEFAULT_FILE=/etc/default/kubelet mkdir -p /etc/default diff --git a/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia.toml b/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia.toml new file mode 100644 index 00000000000..88aa0fa0222 --- /dev/null +++ b/parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia.toml @@ -0,0 +1,39 @@ +oom_score = -999 +version = 2 + +[metrics] + address = "0.0.0.0:10257" + +[plugins] + +[plugins."io.containerd.grpc.v1.cri"] + sandbox_image = "mcr.microsoft.com/oss/kubernetes/pause:3.6" + +[plugins."io.containerd.grpc.v1.cri".containerd] + default_runtime_name = "nvidia" + + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes] + + 
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia] + runtime_type = "io.containerd.runc.v2" + + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.nvidia.options] + BinaryName = "/usr/bin/nvidia-container-runtime" + + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc] + runtime_type = "io.containerd.runc.v2" + + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.runc.options] + BinaryName = "/usr/bin/runc" + + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.untrusted] + runtime_type = "io.containerd.runc.v2" + + [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.untrusted.options] + BinaryName = "/usr/bin/runc" + +[plugins."io.containerd.grpc.v1.cri".registry] + config_path = "/etc/containerd/certs.d" + +[plugins."io.containerd.grpc.v1.cri".registry.headers] + X-Meta-Source-Client = ["azure/aks"] diff --git a/parts/linux/cloud-init/artifacts/ubuntu/doca.list b/parts/linux/cloud-init/artifacts/ubuntu/doca.list new file mode 100644 index 00000000000..20300428613 --- /dev/null +++ b/parts/linux/cloud-init/artifacts/ubuntu/doca.list @@ -0,0 +1,2 @@ +deb [arch=arm64 signed-by=/etc/apt/keyrings/doca-net.pub] https://linux.mellanox.com/public/repo/doca/latest/ubuntu24.04/arm64-sbsa/ ./ +deb [arch=amd64 signed-by=/etc/apt/keyrings/doca-net.pub] https://linux.mellanox.com/public/repo/doca/latest/ubuntu24.04/x86_64/ ./ diff --git a/parts/linux/cloud-init/artifacts/ubuntu/doca.pub b/parts/linux/cloud-init/artifacts/ubuntu/doca.pub new file mode 100644 index 00000000000..45107ba4315 --- /dev/null +++ b/parts/linux/cloud-init/artifacts/ubuntu/doca.pub @@ -0,0 +1,81 @@ +-----BEGIN PGP PUBLIC KEY BLOCK----- +Version: GnuPG v2.0.14 (GNU/Linux) + +mQGiBFMEmE0RBACsz1qcFsYOs0LHy/pBR2ip0gnHYbZgLy00R2i7cELxmqGcESzp +6IfzIdwOX9oVsPI6NT/yvftp+BxALuD8UC52MLjdMJZ+1sXBZM4J5xnDmQMhIp0G +wCse8usM8Zad1WTKq+P0ip8Gd17WEpfwMQPKXg3npcF69zaz/ceeDavqjwCgofU0 +rb8ui7cZs+c+7U+5mrXxmcMD/R/tV8tEykQFW7PKuZ9NvvRX2XFuQD9LZRW7v+Rg 
+ebC0GAM1ZSqgI7uNUL3ZLAMgxaURLZViqKPgiw8373uoayfrnccttoZ2prHdtB5O +ZPo9vp8wJYUd+Wug2c1nuzXQtTrs/wfeJDn/PfvlEIGlXYPphsBXGQd7MbMLtW7g +u6h/A/9lmSP1fFQflTRlO5j3jXrlFkW05lMlWVZD3H75obQxHlM7eGCgnUPABBMt +aoZDZDf5P9I3xinu9qhDi7Vbz7QOkWOGr2dHLUOMqIgoKz7zRcFtbAl65AcOuEKu +KpLE/R3mRjZ7vrCPud6euEKGpvMbdevDF7GeMG3fcvVlK1ivy7RVTWVsbGFub3gg +VGVjaG5vbG9naWVzIChNZWxsYW5veCBUZWNobm9sb2dpZXMgLSBTaWduaW5nIEtl +eSB2MikgPHN1cHBvcnRAbWVsbGFub3guY29tPohiBBMRAgAiBQJTBJhNAhsDBgsJ +CAcDAgYVCAIJCgsEFgIDAQIeAQIXgAAKCRDF7YPiYiTAUFcAAJ49FBA3hy0P0gsZ +q/ZkAMrgXZaG9wCcDjMtZZETG5NEaIVg3GYqJcvI4AW5AQ0EUwSYTRAEANmBQ0WP +O3VsOrDH0VX+fa1nuKpTqyPFmrROtiI0Ux1dEsU/hpFJnFHtv+CW8ppUlMmjhw6U +olS3dqvO+fWxe1FMLVpp1BQLI6udM5j/P1IEDH7TmZD5trYFp4PxXagKO2nBeqjj +NydQckgREntGCOGPqheBRdopmlJSPlTptQavAAMFA/9BVSpmStx3BsS0z5NPSI/V +wJFeQiXFq8zDKbEVHFMjYWGqbhGWDPaLJWxxNLF1hdpbZSQCAeaESNLYG0iqXwb6 +6O79BHpGeN0AWyy2J6FJpt0zwlCDfx7fgpFKMGzIxXWiTDNmKon241ojgM1iYC2o +arjropoA0dtG6noS2KJBYIhJBBgRAgAJBQJTBJhNAhsMAAoJEMXtg+JiJMBQzxUA +oJ+aJ2l6vt1S1tIKCLVtDMH8liOBAJ45EQ867jkf6f2Anihx9XJ0LLKZvw== +=QMd9 +-----END PGP PUBLIC KEY BLOCK----- +-----BEGIN PGP PUBLIC KEY BLOCK----- +Version: GnuPG v2.0.14 (GNU/Linux) + +mQGiBFIHkboRBADGcZ0FQvQl8frNzEZIep6D+KSZY/ps70+k3ZJ+wj2mvtGZSV9t +zeEUbte7ft5HzrIniB87j1Swp+mSJIomLTkOcQunoqCCHQkuPOEMi1urUmdjpyc7 +nJjsQ63GLvH0DfmknGga4rCj3Kepn9mhJ9mqfS+/aXrz1ZP4Dk+alpi/RwCgplxo +94IruAMKoQCdJ3SmfqvszYcEAMUJ3qmCpYax4s/0XyX36emLiMioHZehq/QXdFmj +VmqqxL5QFmq9Yof8SwGBwpS8FS0VX8BTs7xAs5W7ZC7iGGo9uxuXZzeZ8vcwh6VX +OVmbtqLgXyPKqzHIDwJ8Q5Df0JQpRnCmQQaHbEcoOstSTP/3NHLFBIllPq7gqIpZ +9HQoBACvlwzvtabC9q1OAikXY5YKKbAtkmZYBa5I2qvfHV1bIRYPPHWW2shilX0N +Kz2pTR1ZlwEcz+CUhPtJgoWhkMu/Vl7NMeB0YzGmjQorHRj2mAvSbv/wvjeIMgbw +qRXIksGYiUSpTLtQYTfpJlNe0ZKzn6kHbqGUYZ92Jx2ki3gQqbQsTWVsbGFub3gg +VGVjaG5vbG9naWVzIDxzdXBwb3J0QG1lbGxhbm94LmNvbT6IYgQTEQIAIgUCUgeR +ugIbAwYLCQgHAwIGFQgCCQoLBBYCAwECHgECF4AACgkQAQSPp6nktkPsXwCfW2Rn +pgmC4zLTMBRo/hKsIvag2ToAnAtlzxpMAZGUQHBODfpGqx7MyHmUuQENBFIHkboQ 
+BADd2OqEdSDCB6KkgZ2BjURxpiDbZxEAEsTJOUBFMPSqdJN0GcqUon5Hc3yADDOF +ztdWf5XCKSp/loYvjTYM21Qq20g5EB2SU9FU6Eoq5vyU/HS3/c1wjiYv2rjMll62 +kc4oqRkM/fp9crrjArssfqMQcQRVYBS3dYdmoVdpHEH68wADBQP/XPW9r3wwGvUr +7hlFskYrSC/8s3r7vB4/mcF6UMkM4xEaP3jq8HH0SLkLbcPTa1+C/5evhmLbT12f +dub/V0/JVT9YsxS3anmvefT6EXjUntYXDLPhhRJqUCnxYjf95FX5zxudB5gMEwLh +9pmRMgqMCDsIANVv7V77DagfaWNkhqSISQQYEQIACQUCUgeRugIbDAAKCRABBI+n +qeS2Q71kAJ45i6YdS9bZGR8tDI0NfneMiU32CwCfdje+fgX5gUtag5SshjxyMrgt +DgY= +=z9pR +-----END PGP PUBLIC KEY BLOCK----- +-----BEGIN PGP PUBLIC KEY BLOCK----- +Version: GnuPG v1 + +mQENBFpbc0cBCADDST+ekKD1YJje77oDX94gRolmUlh0df4n6/xvE700M1vPAiTT +kU3WJcvwnuTZpyMGSsAQCXXQRJuQObnkPEvjVAPgh8fvghCXgVElcr6dqXu3EVze +iCkdYm08t/+FF3kg/P6VYPjgEM/GIFnKTz37LrQlUM4ArG0ENIYM9xjurnKWuV9r +JuckJcUsmZUS/D9QMM2fuurYOEWHrE8t+n2EcO4aoY2x0ogYce0vON539rJiskjz +OPhIB9G7ZFQabQnyxzEKiUUDyJsbe38XDT4eyjUR2mlHGgTY/WzGdDEtIKRBWsd3 +TV3wXt42nF9YA3oieeaTbIluyywNnOj1vyT1ABEBAAG0VU1lbGxhbm94IFRlY2hu +b2xvZ2llcyAoTWVsbGFub3ggVGVjaG5vbG9naWVzIC0gU2lnbmluZyBLZXkgdjMp +IDxzdXBwb3J0QG1lbGxhbm94LmNvbT6JATcEEwEIACEFAlpbc0cCGwMFCwkIBwMF +FQoJCAsFFgIDAQACHgECF4AACgkQoCT28ObWooFXYwgAunwBFELGlwKonnmnbi4/ +avUa8e0wRpww//DJjI0HQWjMk7oPLDbS50CVps1Mu0SxBAPYGtsFeSH6UMC6A0K4 +yoxXICVl409vYkycNu/vq6eLTbM2Y0PFvBDzRAf3rJXL0ApLuUb57ARZvc7Np7LA +v8K53PdOJUEFns8Ipp+2puEVx5dfezm7LwRca6ohoLUEdI/PobmGUeNvO5dvfiix +LvSVw2A2awihB7dcs5cpo57VxBWPs7+sYBZ0+EUJbtQEiHAyPvKs29nMeaCIwPTd +88A5RrhsEJx+QWXuG6NA4rfehy5e9j1PW3XnC2fMl6w7gNLY5I8Vq6c2MJ73NZ6y +wLkBDQRaW3NHAQgAynkQ+mf4f5cdM4/bJuRWlPxxuN3CUxN9Q6B5B1/13p6tkydP +C7S4ro8H8sSlO5FbbxihfZLPTbFNrBkd///OQYMJW/slbtT6D9dYmCIeuHObMEMb +V+Bn1bWQId2vZgr0+m0Xe3K+KqhsylsrmC1ebShMnny/V+MlOQQt+L089BNiyCB4 +70mhgM1NiJFv9EOQlXWWaMqWTxZGYkdOuFW0q8NnSGOqI5xjrAUxaHZ/1U3yPy0k +eAjX1AKJngaj86SvIzEefxq4oA2gZ8UFVO/qFH5OhfoovrEwudJEuIgGb76XOb9m +AoZlAqQLJniC97ld515ivBdSi4SZkaFbypnX4QARAQABiQEfBBgBCAAJBQJaW3NH +AhsMAAoJEKAk9vDm1qKBHhMIAJuGbb6S3nb2xAD3GjB8F2xNcZxWQ+Qz70DY5vV/ 
+WhrJl7cknXMxsbWvQupuYk6LujZraG9YoD4csZ5o+k3s3BGKVUXdZdhjaHpcAa5F +X12ADLHca5mlmdCaaORYXQ+xHYRlOKas4I6LPpZ79BauVomEnPcv/bL0kGFzDvLr +K3RdQ1n/pbcWcxxSY3InphAnslLUg0PTAME6Yay5F7WrJsnZnXApUjOlZvlPIl2c +iplivN8o85eBKQXvYRg/c5iyc0koTmkM6OXNvUy0hV9z8WhhK9O+ApXwMUMf43DS +KOIg9RxhZFQoPXptaQZDLz89sWmZaiXsyBPJyjlmaTjwHGM= +=Iy5R +-----END PGP PUBLIC KEY BLOCK----- \ No newline at end of file diff --git a/parts/linux/cloud-init/artifacts/ubuntu/format-mount-kubelet.conf b/parts/linux/cloud-init/artifacts/ubuntu/format-mount-kubelet.conf new file mode 100644 index 00000000000..5f7333e3a7b --- /dev/null +++ b/parts/linux/cloud-init/artifacts/ubuntu/format-mount-kubelet.conf @@ -0,0 +1,3 @@ +[Unit] +Requires=format-mount-nvme-root.service +After=format-mount-nvme-root.service diff --git a/parts/linux/cloud-init/artifacts/ubuntu/format-mount-nvme-root.service b/parts/linux/cloud-init/artifacts/ubuntu/format-mount-nvme-root.service new file mode 100644 index 00000000000..3748421f1cd --- /dev/null +++ b/parts/linux/cloud-init/artifacts/ubuntu/format-mount-nvme-root.service @@ -0,0 +1,13 @@ +[Unit] +Description=Format NVMe local disk and mount Kubelet there +Requires=mnt.mount +After=mnt.mount + +[Service] +Restart=on-failure +RemainAfterExit=yes +Type=oneshot +ExecStart=/bin/bash /opt/azure/containers/format-mount-nvme-root.sh + +[Install] +WantedBy=multi-user.target diff --git a/parts/linux/cloud-init/artifacts/ubuntu/format-mount-nvme-root.sh b/parts/linux/cloud-init/artifacts/ubuntu/format-mount-nvme-root.sh new file mode 100644 index 00000000000..9d336206b4c --- /dev/null +++ b/parts/linux/cloud-init/artifacts/ubuntu/format-mount-nvme-root.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash +set -o errexit +set -o nounset +set -o pipefail +set -x + +# Bind mount kubelet to local NVMe storage specifically on startup. 
+MOUNT_POINT="/mnt/aks" + + +KUBELET_MOUNT_POINT="${MOUNT_POINT}/kubelet" +KUBELET_DIR="/var/lib/kubelet" + +mkdir -p "${MOUNT_POINT}" + +SENTINEL_FILE="/opt/azure/containers/bind-sentinel" +if [ ! -e "${SENTINEL_FILE}" ]; then + # Bond (via software RAID) and format the NVMe disks if that's not already done. + if [ -e /dev/disk/azure/local/by-index/1 ] && [ ! -e /dev/md0 ]; then + mdadm --create --verbose /dev/md0 --level=0 --raid-devices=4 /dev/disk/azure/local/by-index/1 /dev/disk/azure/local/by-index/2 /dev/disk/azure/local/by-index/3 /dev/disk/azure/local/by-index/4 + mkfs.ext4 -F /dev/md0 + # Save the RAID config so mdadm --assemble --scan works on subsequent boots. + mdadm --detail --scan >> /etc/mdadm/mdadm.conf + fi + mount /dev/md0 "${MOUNT_POINT}" + mv "${KUBELET_DIR}" "${KUBELET_MOUNT_POINT}" + touch "${SENTINEL_FILE}" +else + # On subsequent boots, reassemble the RAID array from superblocks. + # Cannot use /dev/disk/azure/local/by-index/ paths here as the waagent + # udev rules that create those symlinks may not have run yet. + if [ ! -e /dev/md0 ]; then + mdadm --assemble --scan + fi + mount /dev/md0 "${MOUNT_POINT}" +fi + +# on every boot, bind mount the kubelet directory back to the expected +# location before kubelet itself may start. 
+mkdir -p "${KUBELET_DIR}" +mount --bind "${KUBELET_MOUNT_POINT}" "${KUBELET_DIR}" +chmod a+w "${KUBELET_DIR}" diff --git a/parts/linux/cloud-init/artifacts/ubuntu/modprobe-nvidia-parameters.conf b/parts/linux/cloud-init/artifacts/ubuntu/modprobe-nvidia-parameters.conf new file mode 100644 index 00000000000..0b513ace710 --- /dev/null +++ b/parts/linux/cloud-init/artifacts/ubuntu/modprobe-nvidia-parameters.conf @@ -0,0 +1,4 @@ +options nvidia NVreg_RestrictProfilingToAdminUsers=0 +options nvidia NVreg_CreateImexChannel0=1 +options nvidia NVreg_CoherentGPUMemoryMode=driver +options nvidia NVreg_RegistryDwords="RMBug5172204War=4" \ No newline at end of file diff --git a/parts/linux/cloud-init/artifacts/ubuntu/nvidia-2404.list b/parts/linux/cloud-init/artifacts/ubuntu/nvidia-2404.list new file mode 100644 index 00000000000..0c90e150d6d --- /dev/null +++ b/parts/linux/cloud-init/artifacts/ubuntu/nvidia-2404.list @@ -0,0 +1,2 @@ +deb [arch=amd64 signed-by=/etc/apt/keyrings/nvidia.pub] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64 / +deb [arch=arm64 signed-by=/etc/apt/keyrings/nvidia.pub] https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/sbsa / \ No newline at end of file diff --git a/parts/linux/cloud-init/artifacts/ubuntu/nvidia.pub b/parts/linux/cloud-init/artifacts/ubuntu/nvidia.pub new file mode 100644 index 00000000000..8aabe4805e5 --- /dev/null +++ b/parts/linux/cloud-init/artifacts/ubuntu/nvidia.pub @@ -0,0 +1,29 @@ +-----BEGIN PGP PUBLIC KEY BLOCK----- +Version: GnuPG v2.0.22 (GNU/Linux) + +mQINBGJYmlEBEAC6nJmeqByeReM+MSy4palACCnfOg4pOxffrrkldxz4jrDOZNK4 +q8KG+ZbXrkdP0e9qTFRvZzN+A6Jw3ySfoiKXRBw5l2Zp81AYkghV641OpWNjZOyL +syKEtST9LR1ttHv1ZI71pj8NVG/EnpimZPOblEJ1OpibJJCXLrbn+qcJ8JNuGTSK +6v2aLBmhR8VR/aSJpmkg7fFjcGklweTI8+Ibj72HuY9JRD/+dtUoSh7z037mWo56 +ee02lPFRD0pHOEAlLSXxFO/SDqRVMhcgHk0a8roCF+9h5Ni7ZUyxlGK/uHkqN7ED +/U/ATpGKgvk4t23eTpdRC8FXAlBZQyf/xnhQXsyF/z7+RV5CL0o1zk1LKgo+5K32 
+5ka5uZb6JSIrEPUaCPEMXu6EEY8zSFnCrRS/Vjkfvc9ViYZWzJ387WTjAhMdS7wd +PmdDWw2ASGUP4FrfCireSZiFX+ZAOspKpZdh0P5iR5XSx14XDt3jNK2EQQboaJAD +uqksItatOEYNu4JsCbc24roJvJtGhpjTnq1/dyoy6K433afU0DS2ZPLthLpGqeyK +MKNY7a2WjxhRmCSu5Zok/fGKcO62XF8a3eSj4NzCRv8LM6mG1Oekz6Zz+tdxHg19 +ufHO0et7AKE5q+5VjE438Xpl4UWbM/Voj6VPJ9uzywDcnZXpeOqeTQh2pQARAQAB +tCBjdWRhdG9vbHMgPGN1ZGF0b29sc0BudmlkaWEuY29tPokCOQQTAQIAIwUCYlia +UQIbAwcLCQgHAwIBBhUIAgkKCwQWAgMBAh4BAheAAAoJEKS0aZY7+GPM1y4QALKh +BqSozrYbe341Qu7SyxHQgjRCGi4YhI3bHCMj5F6vEOHnwiFH6YmFkxCYtqcGjca6 +iw7cCYMow/hgKLAPwkwSJ84EYpGLWx62+20rMM4OuZwauSUcY/kE2WgnQ74zbh3+ +MHs56zntJFfJ9G+NYidvwDWeZn5HIzR4CtxaxRgpiykg0s3ps6X0U+vuVcLnutBF +7r81astvlVQERFbce/6KqHK+yj843Qrhb3JEolUoOETK06nD25bVtnAxe0QEyA90 +9MpRNLfR6BdjPpxqhphDcMOhJfyubAroQUxG/7S+Yw+mtEqHrL/dz9iEYqodYiSo +zfi0b+HFI59sRkTfOBDBwb3kcARExwnvLJmqijiVqWkoJ3H67oA0XJN2nelucw+A +Hb+Jt9BWjyzKWlLFDnVHdGicyRJ0I8yqi32w8hGeXmu3tU58VWJrkXEXadBftmci +pemb6oZ/r5SCkW6kxr2PsNWcJoebUdynyOQGbVwpMtJAnjOYp0ObKOANbcIg+tsi +kyCIO5TiY3ADbBDPCeZK8xdcugXoW5WFwACGC0z+Cn0mtw8z3VGIPAMSCYmLusgW +t2+EpikwrP2inNp5Pc+YdczRAsa4s30Jpyv/UHEG5P9GKnvofaxJgnU56lJIRPzF +iCUGy6cVI0Fq777X/ME1K6A/bzZ4vRYNx8rUmVE5 +=DO7z +-----END PGP PUBLIC KEY BLOCK----- diff --git a/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh b/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh index eae2b8a27dc..5051528c554 100755 --- a/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh +++ b/spec/parts/linux/cloud-init/artifacts/cse_config_spec.sh @@ -330,6 +330,59 @@ Describe 'cse_config.sh' End End + Describe 'ensureContainerd' + It 'should not overwrite an existing NVIDIA containerd config' + grep() { + echo "grep $@" + return 0 + } + + mkdir() { + echo "mkdir $@" + } + + rm() { + echo "rm $@" + } + + tee() { + echo "tee $@" + cat >/dev/null + } + + retrycmd_if_failure() { + echo "retrycmd_if_failure $@" + return 0 + } + + systemctlEnableAndStartNoBlock() { + echo "systemctlEnableAndStartNoBlock $@" + return 0 + } + + 
should_e2e_mock_azure_china_cloud() { + echo "false" + } + + GPU_NODE="false" + TARGET_CLOUD="AzurePublicCloud" + BOOTSTRAP_PROFILE_CONTAINER_REGISTRY_SERVER="" + ERR_SYSCTL_RELOAD=1 + ERR_SYSTEMCTL_START_FAIL=1 + + When call ensureContainerd + + The output should include 'grep -q BinaryName = "/usr/bin/nvidia-container-runtime" /etc/containerd/config.toml' + The output should include "NVIDIA containerd config already exists at /etc/containerd/config.toml, skipping generation" + The output should not include "rm -f /etc/containerd/config.toml" + The output should not include "Generating containerd config" + The output should not include "Generating GPU containerd config" + The output should not include "Generating non-GPU containerd config" + The output should include "systemctlEnableAndStartNoBlock containerd 30" + The status should be success + End + End + Describe 'configureContainerdRegistryHost' It 'should configure registry host correctly if MCR_REPOSITORY_BASE is unset' mkdir() { diff --git a/vhdbuilder/packer/gb200-mai-bom.json b/vhdbuilder/packer/gb200-mai-bom.json new file mode 100644 index 00000000000..2dad95fea11 --- /dev/null +++ b/vhdbuilder/packer/gb200-mai-bom.json @@ -0,0 +1,46 @@ +{ + "versions-wave1": { + "libxnvctrl0": "580.105.08-0ubuntu1", + "libnvidia-common-580": "580.105.08-0ubuntu1", + "libnvidia-cfg1-580": "580.105.08-0ubuntu1", + "libnvidia-gpucomp-580": "580.105.08-0ubuntu1", + "libnvidia-gl-580": "580.105.08-0ubuntu1", + "nvidia-firmware-580": "580.105.08-0ubuntu1", + "nvidia-dkms-580-open": "580.105.08-0ubuntu1", + "nvidia-kernel-common-580": "580.105.08-0ubuntu1", + "nvidia-kernel-source-580-open": "580.105.08-0ubuntu1", + "libnvidia-compute-580": "580.105.08-0ubuntu1", + "libnvidia-extra-580": "580.105.08-0ubuntu1", + "libnvidia-decode-580": "580.105.08-0ubuntu1", + "libnvidia-encode-580": "580.105.08-0ubuntu1", + "xserver-xorg-video-nvidia-580": "580.105.08-0ubuntu1", + "libnvidia-fbc1-580": "580.105.08-0ubuntu1", + 
"nvidia-driver-580-open": "580.105.08-0ubuntu1" + }, + "versions-wave2": { + "cuda-toolkit-13": "13.0.2-1", + "nvidia-container-toolkit-base": "1.18.0-1", + "libnvidia-container1": "1.18.0-1", + "libnvidia-container-tools": "1.18.0-1", + "nvidia-container-toolkit": "1.18.0-1", + "datacenter-gpu-manager-exporter": "4.6.0-1", + "datacenter-gpu-manager-4-core": "1:4.4.1-1", + "datacenter-gpu-manager-4-proprietary": "1:4.4.1-1", + "datacenter-gpu-manager-4-cuda13": "1:4.4.1-1", + "datacenter-gpu-manager-4-proprietary-cuda13": "1:4.4.1-1", + "datacenter-gpu-manager-4-multinode": "1:4.4.1-1", + "datacenter-gpu-manager-4-multinode-cuda13": "1:4.4.1-1", + "libcap2-bin": "1:2.66-5ubuntu2.4", + "k8s-device-plugin": "0.17.3-ubuntu24.04u5", + "nvidia-imex": "580.105.08-1", + "librdmacm-dev": "2507mlnx58-1.2507097.0214", + "libibverbs-dev": "2507mlnx58-1.2507097.0214", + "libibverbs1": "2507mlnx58-1.2507097.0214", + "ibverbs-providers": "2507mlnx58-1.2507097.0214", + "doca-ofed": "3.1.0-091513" + }, + "doca-custom-repo": "https://linux.mellanox.com/public/repo/doca/3.1.0-091513/ubuntu24.04/arm64-sbsa/", + "kernel-versions": { + "linux-azure-nvidia": "6.14.0-1003.3" + } +} diff --git a/vhdbuilder/packer/install-dependencies.sh b/vhdbuilder/packer/install-dependencies.sh index 5bbce11fd4a..ddd056e97ef 100644 --- a/vhdbuilder/packer/install-dependencies.sh +++ b/vhdbuilder/packer/install-dependencies.sh @@ -720,6 +720,65 @@ if [ $OS = $UBUNTU_OS_NAME ] && [ "$(isARM64)" -ne 1 ]; then # No ARM64 SKU wit EOF fi +if grep -q "GB200" <<< "$FEATURE_FLAGS"; then + # GB200 setup is only supported on arm64 Ubuntu 24.04. + if [ "${CPU_ARCH}" = "arm64" ] && [ "${UBUNTU_RELEASE}" = "24.04" ]; then + # Replicate all functionality from github.com/azure/aks-gpu/install.sh. + # aks-gpu is designed to run at node boot/join time, whereas the GB200 VHD is set up + # to have all drivers installed at VHD build time. + + # 1. 
Blacklist nouveau driver + cat << EOF >> /etc/modprobe.d/blacklist-nouveau.conf +blacklist nouveau +options nouveau modeset=0 +EOF + update-initramfs -u + + # 2. install drivers + BOM_PATH="gb200-mai-bom.json" + + # Install a custom repository if a doca-custom-repo is specified + DOCA_CUSTOM_REPO=$(jq -r '.["doca-custom-repo"]' $BOM_PATH) + if [ -n "$DOCA_CUSTOM_REPO" ]; then + mv /etc/apt/sources.list.d/doca-net.list /etc/apt/sources.list.d/doca-net.list.backup + echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/doca-net.pub] $DOCA_CUSTOM_REPO ./" > /etc/apt/sources.list.d/doca-net.list + apt-get update + fi + + # Farcically, nvidia-dkms-580-open cannot be installed together with the CUDA toolkit. Something about that package changes the build environment in an incompatible way. I've seen people mention CUDA including an old version of gcc that somehow makes its way onto the PATH... + # Therefore we install the GPU driver and its dependencies first, then install all downstream reverse-dependencies (CUDA, DCGM, and so forth) second. + sudo apt-get install -y --allow-downgrades $(jq -r '.["versions-wave1"] | to_entries[] | "\(.key)=\(.value)"' $BOM_PATH) + sudo apt-get install -y --allow-downgrades $(jq -r '.["versions-wave2"] | to_entries[] | "\(.key)=\(.value)"' $BOM_PATH) + + # 3. 
Add char device symlinks for NVIDIA devices + mkdir -p "$(dirname /lib/udev/rules.d/71-nvidia-dev-char.rules)" + cat << EOF >> /lib/udev/rules.d/71-nvidia-dev-char.rules +ACTION=="add", DEVPATH=="/bus/pci/drivers/nvidia", RUN+="/usr/bin/nvidia-ctk system create-dev-char-symlinks --create-all" +EOF + + # Create systemd drop-in to override nvidia-device-plugin dependencies + mkdir -p /etc/systemd/system/nvidia-device-plugin.service.d + cat << EOF > /etc/systemd/system/nvidia-device-plugin.service.d/override.conf +[Unit] +After=kubelet.service + +[Service] +ExecStartPre=-/usr/bin/mkdir -p /var/lib/kubelet/device-plugins +EOF + + # Now we are off-piste: enable DCGM, DCGM exporter, container device plugin, and the NVIDIA containerd config. + systemctl enable nvidia-dcgm + systemctl enable nvidia-dcgm-exporter + systemctl enable nvidia-device-plugin + systemctl enable openibd + + # One additional request from MAI: signal that NPD is pre-installed on the VHD. + # When this file is present, the Azure AKS VM Extension skips installing NPD at provision time. 
+ mkdir -p /etc/node-problem-detector.d/ + touch /etc/node-problem-detector.d/skip_vhd_npd + fi +fi + if [ -d "/opt/gpu" ] && [ "$(ls -A /opt/gpu)" ]; then ls -ltr /opt/gpu/* >> ${VHD_LOGS_FILEPATH} fi diff --git a/vhdbuilder/packer/packer_source.sh b/vhdbuilder/packer/packer_source.sh index 40e922672f4..6a3567a4f9e 100644 --- a/vhdbuilder/packer/packer_source.sh +++ b/vhdbuilder/packer/packer_source.sh @@ -503,6 +503,52 @@ copyPackerFiles() { cpAndMode $NOTICE_SRC $NOTICE_DEST 444 fi + if grep -q "GB200" <<< "$FEATURE_FLAGS"; then + FMT_SH_SRC=/home/packer/format-mount-nvme-root.sh + FMT_SH_DEST=/opt/azure/containers/format-mount-nvme-root.sh + cpAndMode $FMT_SH_SRC $FMT_SH_DEST 0544 + FMT_SVC_SRC=/home/packer/format-mount-nvme-root.service + FMT_SVC_DEST=/etc/systemd/system/format-mount-nvme-root.service + cpAndMode $FMT_SVC_SRC $FMT_SVC_DEST 600 + FMT_SVC_SRC=/home/packer/format-mount-kubelet.conf + FMT_SVC_DEST=/etc/systemd/system/kubelet.service.d/11-fmtmount.conf + cpAndMode $FMT_SVC_SRC $FMT_SVC_DEST 600 + + if [ ${UBUNTU_RELEASE} = "24.04" ]; then + NVIDIA_LIST_SRC=/home/packer/nvidia-2404.list + NVIDIA_LIST_DEST=/etc/apt/sources.list.d/nvidia.list + cpAndMode $NVIDIA_LIST_SRC $NVIDIA_LIST_DEST 644 + + NVIDIA_ASC_SRC=/home/packer/nvidia.pub + NVIDIA_ASC_DEST=/etc/apt/keyrings/nvidia.pub + cpAndMode $NVIDIA_ASC_SRC $NVIDIA_ASC_DEST 644 + + # This will only currently work if changes are applied to the subscription + # the node runs in. Otherwise, until the GB200 is recognized as a GPU SKU, + # it'll be overwritten by a containerd configuration that doesn't support + # running GPU workloads. 
+ CONTAINERD_NVIDIA_TOML_SRC=/home/packer/containerd-nvidia.toml + CONTAINERD_NVIDIA_TOML_DEST=/etc/containerd/config.toml + cpAndMode $CONTAINERD_NVIDIA_TOML_SRC $CONTAINERD_NVIDIA_TOML_DEST 644 + + DOCA_LIST_SRC=/home/packer/doca.list + DOCA_LIST_DEST=/etc/apt/sources.list.d/doca-net.list + cpAndMode $DOCA_LIST_SRC $DOCA_LIST_DEST 644 + + DOCA_PUB_SRC=/home/packer/doca.pub + DOCA_PUB_DEST=/etc/apt/keyrings/doca-net.pub + cpAndMode $DOCA_PUB_SRC $DOCA_PUB_DEST 644 + + NVIDIA_MODPROBE_PARAMETERS_SRC=/home/packer/modprobe-nvidia-parameters.conf + NVIDIA_MODPROBE_PARAMETERS_DEST=/etc/modprobe.d/nvidia.conf + cpAndMode $NVIDIA_MODPROBE_PARAMETERS_SRC $NVIDIA_MODPROBE_PARAMETERS_DEST 644 + + BOM_SRC=/home/packer/gb200-mai-bom.json + BOM_DEST=/opt/azure/containers/gb200-mai-bom.json + cpAndMode $BOM_SRC $BOM_DEST 644 + fi + fi + # Always copy the VHD cleanup script responsible for prepping the instance for first boot # to disk so we can run it again if needed in subsequent builds/releases (prefetch during SIG release) cpAndMode $VHD_CLEANUP_SCRIPT_SRC $VHD_CLEANUP_SCRIPT_DEST 644 diff --git a/vhdbuilder/packer/post-install-dependencies.sh b/vhdbuilder/packer/post-install-dependencies.sh index 67cccc26da3..c6481f3e34c 100644 --- a/vhdbuilder/packer/post-install-dependencies.sh +++ b/vhdbuilder/packer/post-install-dependencies.sh @@ -19,7 +19,7 @@ VHD_LOGS_FILEPATH=/opt/azure/vhd-install.complete PERFORMANCE_DATA_FILE=/opt/azure/vhd-build-performance-data.json # Hardcode the desired size of the OS disk so we don't accidently rely on extra disk space -if [ "$OS" = "$FLATCAR_OS_NAME" ] || isACL "$OS" "$OS_VARIANT"; then +if [ "$OS" = "$FLATCAR_OS_NAME" ] || isACL "$OS" "$OS_VARIANT" || grep -q "GB200" <<< "$FEATURE_FLAGS"; then MAX_BLOCK_COUNT=60397977 # 60 GB DISK_SIZE_GB=60 else @@ -29,7 +29,7 @@ fi capture_benchmark "${SCRIPT_NAME}_source_packer_files_and_declare_variables" if [ $OS = $UBUNTU_OS_NAME ]; then - # We do not purge extra kernels from the Ubuntu 24.04 ARM 
image, since that image must dual-boot for GB200. + # We do not purge extra kernels from the Ubuntu 24.04 ARM images, since those images must dual-boot for GB200. if [ $CPU_ARCH != "arm64" ] || [ $UBUNTU_RELEASE != "24.04" ]; then # shellcheck disable=SC2021 current_kernel="$(uname -r | cut -d- -f-2)" @@ -39,6 +39,11 @@ if [ $OS = $UBUNTU_OS_NAME ]; then else dpkg --get-selections | grep -e "linux-\(headers\|modules\|image\)" | grep -v "linux-\(headers\|modules\|image\)-azure" | grep -v "$current_kernel" | tr -s '[[:space:]]' | tr '\t' ' ' | cut -d' ' -f1 | xargs -I{} apt-get --purge remove -yq {} fi + else + # However, for the 24.04 ARM images, we MUST have both -azure and -azure-nvidia kernels, so that we can run on either vanilla ARM64 hardware or GB200. + if [ $(dpkg --get-selections | grep -c "linux-image") -lt 2 ]; then + echo "ERROR: Ubuntu 24.04 ARM image is missing either the -azure or -azure-nvidia kernel, cannot continue!" && exit 1 + fi fi # remove apport diff --git a/vhdbuilder/packer/pre-install-dependencies.sh b/vhdbuilder/packer/pre-install-dependencies.sh index f5a5fd816fe..d46b2312150 100644 --- a/vhdbuilder/packer/pre-install-dependencies.sh +++ b/vhdbuilder/packer/pre-install-dependencies.sh @@ -198,18 +198,60 @@ if [[ ${UBUNTU_RELEASE//./} -ge 2204 && "${ENABLE_FIPS,,}" != "true" ]]; then fi NVIDIA_KERNEL_PACKAGE="linux-azure-nvidia" if [[ "${CPU_ARCH}" == "arm64" && "${UBUNTU_RELEASE}" = "24.04" ]]; then - # This is the ubuntu 2404arm64gen2containerd image. + # This is the ubuntu 2404arm64gen2containerd image or the 2404arm64gb200 image + # The Ubuntu PPA has early access to new kernels, such as the one in the GB300 CRD. # Uncomment if we have trouble finding the kernel package. - # sudo add-apt-repository ppa:canonical-kernel-team/ppa - sudo apt update - if apt-cache show "${NVIDIA_KERNEL_PACKAGE}" &> /dev/null; then - echo "ARM64 image. 
Installing NVIDIA kernel and its packages alongside LTS kernel" - wait_for_apt_locks - sudo apt install --no-install-recommends -y "${NVIDIA_KERNEL_PACKAGE}" - echo "after installation:" - dpkg -l | grep "linux-.*-azure-nvidia" || true + # add-apt-repository ppa:canonical-kernel-team/ppa + if grep -q "GB200" <<< "$FEATURE_FLAGS"; then + add-apt-repository ppa:canonical-kernel-team/ppa + apt-get update + BOM_PATH="gb200-mai-bom.json" + if [ -n "$(jq -r '.["kernel-versions"] | keys[]' $BOM_PATH)" ]; then + NVIDIA_KERNEL_PACKAGE=$(jq -r '.["kernel-versions"] | to_entries[] | "\(.key)=\(.value)"' $BOM_PATH) + fi + if apt-get install -s "${NVIDIA_KERNEL_PACKAGE}" &> /dev/null; then + echo "ARM64 image. Installing NVIDIA kernel and its packages alongside LTS kernel" + wait_for_apt_locks + apt install --no-install-recommends -y "${NVIDIA_KERNEL_PACKAGE}" + echo "after installation:" + dpkg -l | grep "linux-.*-azure-nvidia" || true + else + echo "ARM64 image. NVIDIA kernel not available from repo, fetching and installing dpkgs by hand" + curl -fsSL https://ports.ubuntu.com/pool/main/l/linux-azure-nvidia-6.14/linux-modules-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb > /tmp/linux-modules-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb + curl -fsSL https://ports.ubuntu.com/pool/main/l/linux-azure-nvidia-6.14/linux-azure-nvidia-6.14-cloud-tools-6.14.0-1003_6.14.0-1003.3_arm64.deb > /tmp/linux-azure-nvidia-6.14-cloud-tools-6.14.0-1003_6.14.0-1003.3_arm64.deb + curl -fsSL https://ports.ubuntu.com/pool/main/l/linux-azure-nvidia-6.14/linux-azure-nvidia-6.14-cloud-tools-common_6.14.0-1003.3_all.deb > /tmp/linux-azure-nvidia-6.14-cloud-tools-common_6.14.0-1003.3_all.deb + curl -fsSL https://ports.ubuntu.com/pool/main/l/linux-azure-nvidia-6.14/linux-azure-nvidia-6.14-headers-6.14.0-1003_6.14.0-1003.3_all.deb > /tmp/linux-azure-nvidia-6.14-headers-6.14.0-1003_6.14.0-1003.3_all.deb + curl -fsSL 
https://ports.ubuntu.com/pool/main/l/linux-azure-nvidia-6.14/linux-azure-nvidia-6.14-tools-6.14.0-1003_6.14.0-1003.3_arm64.deb > /tmp/linux-azure-nvidia-6.14-tools-6.14.0-1003_6.14.0-1003.3_arm64.deb + curl -fsSL https://ports.ubuntu.com/pool/main/l/linux-azure-nvidia-6.14/linux-cloud-tools-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb > /tmp/linux-cloud-tools-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb + curl -fsSL https://ports.ubuntu.com/pool/main/l/linux-azure-nvidia-6.14/linux-headers-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb > /tmp/linux-headers-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb + curl -fsSL https://ports.ubuntu.com/pool/main/l/linux-azure-nvidia-6.14/linux-tools-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb > /tmp/linux-tools-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb + + curl -fsSL https://ports.ubuntu.com/pool/main/l/linux-azure-nvidia-6.14/linux-image-unsigned-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb > /tmp/linux-image-unsigned-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb + + dpkg -i /tmp/linux-modules-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb + dpkg -i /tmp/linux-azure-nvidia-6.14-cloud-tools-6.14.0-1003_6.14.0-1003.3_arm64.deb + dpkg -i /tmp/linux-azure-nvidia-6.14-cloud-tools-common_6.14.0-1003.3_all.deb + dpkg -i /tmp/linux-azure-nvidia-6.14-headers-6.14.0-1003_6.14.0-1003.3_all.deb + dpkg -i /tmp/linux-azure-nvidia-6.14-tools-6.14.0-1003_6.14.0-1003.3_arm64.deb + dpkg -i /tmp/linux-cloud-tools-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb + dpkg -i /tmp/linux-headers-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb + dpkg -i /tmp/linux-tools-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb + dpkg -i /tmp/linux-image-unsigned-6.14.0-1003-azure-nvidia_6.14.0-1003.3_arm64.deb + + rm /tmp/*.deb + fi + add-apt-repository --remove ppa:canonical-kernel-team/ppa else - echo "ARM64 image. NVIDIA kernel not available, skipping installation." 
+ apt-get update + if apt-cache show "${NVIDIA_KERNEL_PACKAGE}" &> /dev/null; then + echo "ARM64 image. Installing NVIDIA kernel and its packages alongside LTS kernel" + wait_for_apt_locks + sudo apt install --no-install-recommends -y "${NVIDIA_KERNEL_PACKAGE}" + echo "after installation:" + dpkg -l | grep "linux-.*-azure-nvidia" || true + else + echo "ARM64 image. NVIDIA kernel not available, skipping installation." + fi fi fi wait_for_apt_locks diff --git a/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json b/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json new file mode 100644 index 00000000000..8cbc65dfd8f --- /dev/null +++ b/vhdbuilder/packer/vhd-image-builder-arm64-gb200.json @@ -0,0 +1,912 @@ +{ + "variables": { + "subscription_id": "{{env `AZURE_SUBSCRIPTION_ID`}}", + "gallery_subscription_id": "{{user `gallery_subscription_id`}}", + "location": "{{env `PACKER_BUILD_LOCATION`}}", + "vm_size": "{{env `AZURE_VM_SIZE`}}", + "build_definition_name": "{{env `BUILD_DEFINITION_NAME`}}", + "build_number": "{{env `BUILD_NUMBER`}}", + "build_id": "{{env `BUILD_ID`}}", + "commit": "{{env `GIT_VERSION`}}", + "feature_flags": "{{env `FEATURE_FLAGS`}}", + "image_version": "{{env `IMAGE_VERSION`}}", + "os_version": "{{env `OS_VERSION`}}", + "sku_name": "{{env `SKU_NAME`}}", + "hyperv_generation": "{{env `HYPERV_GENERATION`}}", + "sig_gallery_name": "{{env `SIG_GALLERY_NAME`}}", + "sig_image_name": "{{env `SIG_IMAGE_NAME`}}", + "sig_image_version": "{{env `SIG_IMAGE_VERSION`}}", + "container_runtime": "{{env `CONTAINER_RUNTIME`}}", + "captured_sig_version": "{{env `CAPTURED_SIG_VERSION`}}", + "enable_fips": "{{env `ENABLE_FIPS`}}", + "img_publisher": "{{env `IMG_PUBLISHER`}}", + "img_offer": "{{env `IMG_OFFER`}}", + "img_sku": "{{env `IMG_SKU`}}", + "img_version": "{{env `IMG_VERSION`}}", + "vnet_resource_group_name": "{{env `VNET_RESOURCE_GROUP_NAME`}}", + "vnet_name": "{{env `VNET_NAME`}}", + "subnet_name": "{{env `SUBNET_NAME`}}", + "private_packages_url": 
"{{env `PRIVATE_PACKAGES_URL`}}", + "branch": "{{env `BRANCH`}}", + "vhd_build_timestamp": "{{user `VHD_BUILD_TIMESTAMP`}}", + "local_doca_repo_url": "{{env `LOCAL_DOCA_REPO_URL`}}", + "continue_on_local_repo_download_error": "{{env `CONTINUE_ON_LOCAL_REPO_DOWNLOAD_ERROR`}}" + }, + "builders": [ + { + "type": "azure-arm", + "subscription_id": "{{user `subscription_id`}}", + "virtual_network_resource_group_name": "{{user `vnet_resource_group_name`}}", + "virtual_network_name": "{{user `vnet_name`}}", + "virtual_network_subnet_name": "{{user `subnet_name`}}", + "ssh_read_write_timeout": "5m", + "os_type": "Linux", + "os_disk_size_gb": 60, + "image_publisher": "{{user `img_publisher`}}", + "image_offer": "{{user `img_offer`}}", + "image_sku": "{{user `img_sku`}}", + "image_version": "{{user `img_version`}}", + "azure_tags": { + "buildDefinitionName": "{{user `build_definition_name`}}", + "buildNumber": "{{user `build_number`}}", + "buildId": "{{user `build_id`}}", + "SkipLinuxAzSecPack": "true", + "os": "Linux", + "now": "{{user `create_time`}}", + "createdBy": "aks-vhd-pipeline", + "image_sku": "{{user `img_sku`}}", + "branch": "{{user `branch`}}" + }, + "location": "{{user `location`}}", + "vm_size": "{{user `vm_size`}}", + "use_azure_cli_auth": "true", + "polling_duration_timeout": "1h", + "managed_image_storage_account_type": "Premium_LRS", + "shared_image_gallery_destination": { + "subscription": "{{user `gallery_subscription_id`}}", + "resource_group": "{{user `resource_group_name`}}", + "gallery_name": "{{user `sig_gallery_name`}}", + "image_name": "{{user `sig_image_name`}}", + "image_version": "{{user `captured_sig_version`}}", + "replication_regions": ["{{user `location`}}"] + }, + "user_assigned_managed_identities": "{{user `msi_resource_strings`}}" + } + ], + "provisioners": [ + { + "type": "shell", + "inline": [ + "sudo mkdir -p /opt/azure/containers", + "sudo mkdir -p /opt/scripts", + "sudo mkdir -p /opt/certs" + ] + }, + { + "type": "file", + "source": 
"vhdbuilder/lister/bin/lister", + "destination": "/home/packer/lister" + }, + { + "type": "file", + "source": "image-fetcher/bin/image-fetcher-linux-arm64", + "destination": "/home/packer/image-fetcher" + }, + { + "type": "file", + "source": "aks-node-controller/bin/aks-node-controller-linux-arm64", + "destination": "/home/packer/aks-node-controller" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-node-controller.service", + "destination": "/home/packer/aks-node-controller.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-node-controller-wrapper.sh", + "destination": "/home/packer/aks-node-controller-wrapper.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/cloud-init-status-check.sh", + "destination": "/home/packer/cloud-init-status-check.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ubuntu/format-mount-nvme-root.sh", + "destination": "/home/packer/format-mount-nvme-root.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ubuntu/format-mount-nvme-root.service", + "destination": "/home/packer/format-mount-nvme-root.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ubuntu/format-mount-kubelet.conf", + "destination": "/home/packer/format-mount-kubelet.conf" + }, + { + "type": "file", + "source": "vhdbuilder/packer/prefetch.sh", + "destination": "/home/packer/prefetch.sh" + }, + { + "type": "file", + "source": "vhdbuilder/packer/cleanup-vhd.sh", + "destination": "/home/packer/cleanup-vhd.sh" + }, + { + "type": "file", + "source": "vhdbuilder/packer/post-deprovision-walinuxagent.sh", + "destination": "/home/packer/post-deprovision-walinuxagent.sh" + }, + { + "type": "file", + "source": "vhdbuilder/packer/install_walinuxagent.py", + "destination": "/home/packer/install_walinuxagent.py" + }, + { + "type": "file", + "source": "vhdbuilder/packer/packer_source.sh", + "destination": 
"/home/packer/packer_source.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/cse_install.sh", + "destination": "/home/packer/provision_installs.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ubuntu/cse_install_ubuntu.sh", + "destination": "/home/packer/provision_installs_distro.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/cse_helpers.sh", + "destination": "/home/packer/provision_source.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/cse_benchmark_functions.sh", + "destination": "/home/packer/provision_source_benchmarks.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ubuntu/cse_helpers_ubuntu.sh", + "destination": "/home/packer/provision_source_distro.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/cse_config.sh", + "destination": "/home/packer/provision_configs.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/cse_main.sh", + "destination": "/home/packer/provision.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/cse_start.sh", + "destination": "/home/packer/provision_start.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/containerd_exec_start.conf", + "destination": "/home/packer/containerd_exec_start.conf" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/kubelet.service", + "destination": "/home/packer/kubelet.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/secure-tls-bootstrap.service", + "destination": "/home/packer/secure-tls-bootstrap.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/reconcile-private-hosts.sh", + "destination": "/home/packer/reconcile-private-hosts.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/block_wireserver.sh", + "destination": "/home/packer/block_wireserver.sh" + 
}, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ensure_imds_restriction.sh", + "destination": "/home/packer/ensure_imds_restriction.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/measure-tls-bootstrapping-latency.sh", + "destination": "/home/packer/measure-tls-bootstrapping-latency.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/measure-tls-bootstrapping-latency.service", + "destination": "/home/packer/measure-tls-bootstrapping-latency.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/validate-kubelet-credentials.sh", + "destination": "/home/packer/validate-kubelet-credentials.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/cse_redact_cloud_config.py", + "destination": "/home/packer/cse_redact_cloud_config.py" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/cse_send_logs.py", + "destination": "/home/packer/cse_send_logs.py" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh", + "destination": "/home/packer/init-aks-custom-cloud.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/reconcile-private-hosts.service", + "destination": "/home/packer/reconcile-private-hosts.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/mig-partition.service", + "destination": "/home/packer/mig-partition.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/bind-mount.sh", + "destination": "/home/packer/bind-mount.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/bind-mount.service", + "destination": "/home/packer/bind-mount.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/enable-dhcpv6.sh", + "destination": "/home/packer/enable-dhcpv6.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/dhcpv6.service", + 
"destination": "/home/packer/dhcpv6.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/sync-container-logs.sh", + "destination": "/home/packer/sync-container-logs.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/sync-container-logs.service", + "destination": "/home/packer/sync-container-logs.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/crictl.yaml", + "destination": "/home/packer/crictl.yaml" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ensure-no-dup.sh", + "destination": "/home/packer/ensure-no-dup.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ensure-no-dup.service", + "destination": "/home/packer/ensure-no-dup.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/setup-custom-search-domains.sh", + "destination": "/home/packer/setup-custom-search-domains.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ubuntu/ubuntu-snapshot-update.sh", + "destination": "/home/packer/ubuntu-snapshot-update.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ubuntu/snapshot-update.service", + "destination": "/home/packer/snapshot-update.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ubuntu/snapshot-update.timer", + "destination": "/home/packer/snapshot-update.timer" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/cis.sh", + "destination": "/home/packer/cis.sh" + }, + { + "type": "file", + "source": "vhdbuilder/scripts/linux/tool_installs.sh", + "destination": "/home/packer/tool_installs.sh" + }, + { + "type": "file", + "source": "vhdbuilder/scripts/linux/ubuntu/tool_installs_ubuntu.sh", + "destination": "/home/packer/tool_installs_distro.sh" + }, + { + "type": "file", + "source": "vhdbuilder/packer/pre-install-dependencies.sh", + "destination": "/home/packer/pre-install-dependencies.sh" + }, 
+ { + "type": "file", + "source": "vhdbuilder/packer/install-ig.sh", + "destination": "/home/packer/install-ig.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/inspektor-gadget/baseline/usr/share/inspektor-gadget/import_gadgets.sh", + "destination": "/home/packer/ig-import-gadgets.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/inspektor-gadget/baseline/usr/share/inspektor-gadget/remove_gadgets.sh", + "destination": "/home/packer/ig-remove-gadgets.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/inspektor-gadget/baseline/usr/lib/systemd/system/ig-import-gadgets.service", + "destination": "/home/packer/ig-import-gadgets.service" + }, + { + "type": "file", + "source": "vhdbuilder/packer/install-dependencies.sh", + "destination": "/home/packer/install-dependencies.sh" + }, + { + "type": "file", + "source": "vhdbuilder/packer/install-node-exporter.sh", + "destination": "/home/packer/install-node-exporter.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/node-exporter/node-exporter-startup.sh", + "destination": "/home/packer/node-exporter-startup.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/node-exporter/baseline/etc/systemd/system/node-exporter.service", + "destination": "/home/packer/node-exporter.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/node-exporter/baseline/etc/systemd/system/node-exporter-restart.service", + "destination": "/home/packer/node-exporter-restart.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/node-exporter/baseline/etc/systemd/system/node-exporter-restart.path", + "destination": "/home/packer/node-exporter-restart.path" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/node-exporter/baseline/etc/node-exporter.d/web-config.yml", + "destination": "/home/packer/node-exporter-web-config.yml" + }, + { + "type": "file", 
+ "source": "vhdbuilder/packer/generate-disk-usage.sh", + "destination": "/home/packer/generate-disk-usage.sh" + }, + { + "type": "file", + "source": "vhdbuilder/packer/post-install-dependencies.sh", + "destination": "/home/packer/post-install-dependencies.sh" + }, + { + "type": "file", + "source": "parts/common/components.json", + "destination": "/home/packer/components.json" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/manifest.json", + "destination": "/home/packer/manifest.json" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/sysctl-d-60-CIS.conf", + "destination": "/home/packer/sysctl-d-60-CIS.conf" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/sshd_config", + "destination": "/home/packer/sshd_config" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/rsyslog-d-60-CIS.conf", + "destination": "/home/packer/rsyslog-d-60-CIS.conf" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/logrotate-d-rsyslog-CIS.conf", + "destination": "/home/packer/logrotate-d-rsyslog-CIS.conf" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/etc-issue", + "destination": "/home/packer/etc-issue" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/etc-issue.net", + "destination": "/home/packer/etc-issue.net" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/modprobe-CIS.conf", + "destination": "/home/packer/modprobe-CIS.conf" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/faillock-CIS.conf", + "destination": "/home/packer/faillock-CIS.conf" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/pwquality-CIS.conf", + "destination": "/home/packer/pwquality-CIS.conf" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/pam-d-su", + "destination": "/home/packer/pam-d-su" + }, + { + "type": "file", + "source": 
"parts/linux/cloud-init/artifacts/pam-d-common-account", + "destination": "/home/packer/pam-d-common-account" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/pam-d-common-auth", + "destination": "/home/packer/pam-d-common-auth" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/pam-d-common-auth-2204", + "destination": "/home/packer/pam-d-common-auth-2204" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/pam-d-common-password", + "destination": "/home/packer/pam-d-common-password" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/profile-d-cis.sh", + "destination": "/home/packer/profile-d-cis.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/profile-d-path.sh", + "destination": "/home/packer/profile-d-path.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/disk_queue.sh", + "destination": "/home/packer/disk_queue.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/disk_queue.service", + "destination": "/home/packer/disk_queue.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/cgroup-memory-telemetry.sh", + "destination": "/home/packer/cgroup-memory-telemetry.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/cgroup-memory-telemetry.service", + "destination": "/home/packer/cgroup-memory-telemetry.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/cgroup-memory-telemetry.timer", + "destination": "/home/packer/cgroup-memory-telemetry.timer" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/cgroup-pressure-telemetry.sh", + "destination": "/home/packer/cgroup-pressure-telemetry.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/cgroup-pressure-telemetry.service", + "destination": "/home/packer/cgroup-pressure-telemetry.service" + }, + { + "type": "file", + "source": 
"parts/linux/cloud-init/artifacts/cgroup-pressure-telemetry.timer", + "destination": "/home/packer/cgroup-pressure-telemetry.timer" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/update_certs.service", + "destination": "/home/packer/update_certs.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/update_certs.path", + "destination": "/home/packer/update_certs.path" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/update_certs.sh", + "destination": "/home/packer/update_certs.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ci-syslog-watcher.path", + "destination": "/home/packer/ci-syslog-watcher.path" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ci-syslog-watcher.service", + "destination": "/home/packer/ci-syslog-watcher.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ci-syslog-watcher.sh", + "destination": "/home/packer/ci-syslog-watcher.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-diagnostic.py", + "destination": "/home/packer/aks-diagnostic.py" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-log-collector.sh", + "destination": "/home/packer/aks-log-collector.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-log-collector-send.py", + "destination": "/home/packer/aks-log-collector-send.py" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-log-collector.service", + "destination": "/home/packer/aks-log-collector.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-log-collector.slice", + "destination": "/home/packer/aks-log-collector.slice" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-log-collector.timer", + "destination": "/home/packer/aks-log-collector.timer" + }, + { + "type": "file", + "source": 
"parts/linux/cloud-init/artifacts/aks-check-network.sh", + "destination": "/home/packer/aks-check-network.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-check-network.service", + "destination": "/home/packer/aks-check-network.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-logrotate.sh", + "destination": "/home/packer/logrotate.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-logrotate.service", + "destination": "/home/packer/logrotate.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-logrotate.timer", + "destination": "/home/packer/logrotate.timer" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-logrotate-override.conf", + "destination": "/home/packer/override.conf" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-rsyslog", + "destination": "/home/packer/rsyslog" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ipv6_nftables", + "destination": "/home/packer/ipv6_nftables" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ipv6_nftables.service", + "destination": "/home/packer/ipv6_nftables.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ipv6_nftables.sh", + "destination": "/home/packer/ipv6_nftables.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/apt-preferences", + "destination": "/home/packer/apt-preferences" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/kms.service", + "destination": "/home/packer/kms.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/mig-partition.sh", + "destination": "/home/packer/mig-partition.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/docker_clear_mount_propagation_flags.conf", + "destination": 
"/home/packer/docker_clear_mount_propagation_flags.conf" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/nvidia-modprobe.service", + "destination": "/home/packer/nvidia-modprobe.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/nvidia-docker-daemon.json", + "destination": "/home/packer/nvidia-docker-daemon.json" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/pam-d-common-auth", + "destination": "/home/packer/pam-d-common-auth" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/pam-d-common-password", + "destination": "/home/packer/pam-d-common-password" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/pam-d-su", + "destination": "/home/packer/pam-d-su" + }, + { + "type": "file", + "source": "vhdbuilder/notice.txt", + "destination": "/home/packer/NOTICE.txt" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/localdns.sh", + "destination": "/home/packer/localdns.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/localdns.service", + "destination": "/home/packer/localdns.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/localdns-delegate.conf", + "destination": "/home/packer/localdns-delegate.conf" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/localdns_exporter.sh", + "destination": "/home/packer/localdns_exporter.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/localdns-exporter.socket", + "destination": "/home/packer/localdns-exporter.socket" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/localdns-exporter@.service", + "destination": "/home/packer/localdns-exporter@.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-localdns-hosts-setup.sh", + "destination": "/home/packer/aks-localdns-hosts-setup.sh" + }, + { + "type": "file", + "source": 
"parts/linux/cloud-init/artifacts/aks-localdns-hosts-setup.service", + "destination": "/home/packer/aks-localdns-hosts-setup.service" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/aks-localdns-hosts-setup.timer", + "destination": "/home/packer/aks-localdns-hosts-setup.timer" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/configure-azure-network.sh", + "destination": "/home/packer/configure-azure-network.sh" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/99-azure-network.rules", + "destination": "/home/packer/99-azure-network.rules" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/10_azure_nvidia", + "destination": "/home/packer/10_azure_nvidia" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/51-azure-nvidia.cfg", + "destination": "/home/packer/51-azure-nvidia.cfg" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ubuntu/doca.list", + "destination": "/home/packer/doca.list" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ubuntu/doca.pub", + "destination": "/home/packer/doca.pub" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ubuntu/nvidia-2404.list", + "destination": "/home/packer/nvidia-2404.list" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ubuntu/nvidia.pub", + "destination": "/home/packer/nvidia.pub" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ubuntu/containerd-nvidia.toml", + "destination": "/home/packer/containerd-nvidia.toml" + }, + { + "type": "file", + "source": "parts/linux/cloud-init/artifacts/ubuntu/modprobe-nvidia-parameters.conf", + "destination": "/home/packer/modprobe-nvidia-parameters.conf" + }, + { + "type": "file", + "source": "vhdbuilder/packer/gb200-mai-bom.json", + "destination": "/home/packer/gb200-mai-bom.json" + }, + { + "type": "shell", + "inline": [ + "sudo FEATURE_FLAGS={{user 
`feature_flags`}} BUILD_NUMBER={{user `build_number`}} BUILD_ID={{user `build_id`}} COMMIT={{user `commit`}} HYPERV_GENERATION={{user `hyperv_generation`}} CONTAINER_RUNTIME={{user `container_runtime`}} TELEPORTD_PLUGIN_DOWNLOAD_URL={{user `teleportd_plugin_download_url`}} ENABLE_FIPS={{user `enable_fips`}} IMG_SKU={{user `img_sku`}} VHD_BUILD_TIMESTAMP={{user `vhd_build_timestamp`}} /bin/bash -eux /home/packer/pre-install-dependencies.sh" + ] + }, + { + "type": "shell", + "inline": "sudo reboot", + "expect_disconnect": true, + "skip_clean": true, + "pause_after": "60s" + }, + { + "type": "shell", + "inline": [ + "sudo CONTINUE_ON_LOCAL_REPO_DOWNLOAD_ERROR={{user `continue_on_local_repo_download_error`}} LOCAL_DOCA_REPO_URL=\"{{user `local_doca_repo_url`}}\" FEATURE_FLAGS={{user `feature_flags`}} BUILD_NUMBER={{user `build_number`}} BUILD_ID={{user `build_id`}} COMMIT={{user `commit`}} HYPERV_GENERATION={{user `hyperv_generation`}} CONTAINER_RUNTIME={{user `container_runtime`}} TELEPORTD_PLUGIN_DOWNLOAD_URL={{user `teleportd_plugin_download_url`}} ENABLE_FIPS={{user `enable_fips`}} IMG_SKU={{user `img_sku`}} PRIVATE_PACKAGES_URL={{user `private_packages_url`}} VHD_BUILD_TIMESTAMP={{user `vhd_build_timestamp`}} /bin/bash -eux /home/packer/install-dependencies.sh" + ] + }, + { + "type": "file", + "direction": "download", + "source": "/var/log/bcc_installation.log", + "destination": "bcc-tools-installation.log" + }, + { + "type": "shell", + "inline": ["sudo rm /var/log/bcc_installation.log"] + }, + { + "type": "shell", + "inline": ["sudo /bin/bash /home/packer/generate-disk-usage.sh"] + }, + { + "type": "file", + "direction": "download", + "source": "/opt/azure/disk-usage.txt", + "destination": "disk-usage.txt" + }, + { + "type": "shell", + "inline": "sudo reboot", + "expect_disconnect": true, + "skip_clean": true, + "pause_after": "60s" + }, + { + "type": "shell", + "inline": [ + "sudo FEATURE_FLAGS={{user `feature_flags`}} BUILD_NUMBER={{user `build_number`}} 
BUILD_ID={{user `build_id`}} COMMIT={{user `commit`}} HYPERV_GENERATION={{user `hyperv_generation`}} CONTAINER_RUNTIME={{user `container_runtime`}} TELEPORTD_PLUGIN_DOWNLOAD_URL={{user `teleportd_plugin_download_url`}} ENABLE_FIPS={{user `enable_fips`}} IMG_SKU={{user `img_sku`}} /bin/bash -eux /home/packer/post-install-dependencies.sh" + ] + }, + { + "type": "file", + "source": "vhdbuilder/packer/list-images.sh", + "destination": "/home/packer/list-images.sh" + }, + { + "type": "shell", + "inline": [ + "sudo SKU_NAME={{user `sku_name`}} IMAGE_VERSION={{user `image_version`}} CONTAINER_RUNTIME={{user `container_runtime`}} /bin/bash -eux /home/packer/list-images.sh" + ] + }, + { + "type": "file", + "direction": "download", + "source": "/opt/azure/containers/image-bom.json", + "destination": "image-bom.json" + }, + { + "type": "file", + "direction": "download", + "source": "/opt/azure/vhd-install.complete", + "destination": "release-notes.txt" + }, + { + "type": "file", + "direction": "download", + "source": "/opt/azure/vhd-build-performance-data.json", + "destination": "vhd-build-performance-data.json" + }, + { + "type": "file", + "direction": "download", + "source": "/opt/azure/vhd-grid-compatibility-data.json", + "destination": "vhd-grid-compatibility-data.json" + }, + { + "type": "shell", + "inline": [ + "sudo rm /opt/azure/vhd-build-performance-data.json", + "sudo rm /opt/azure/vhd-grid-compatibility-data.json" + ] + }, + { + "type": "shell", + "inline": [ + "sudo /bin/bash -eux /home/packer/cis.sh", + "sudo /bin/bash -eux /opt/azure/containers/cleanup-vhd.sh", + "sudo /bin/bash -c '/usr/sbin/waagent -force -deprovision+user && export HISTSIZE=0 && /opt/azure/containers/post-deprovision-walinuxagent.sh' || exit 125" + ] + } + ], + "error-cleanup-provisioner": { + "type": "shell", + "inline": ["sudo /bin/bash /home/packer/generate-disk-usage.sh"] + } +}