From f994c7354e45a82671242416e2e5c34f09106449 Mon Sep 17 00:00:00 2001 From: Nishchay Date: Sun, 3 May 2026 20:42:27 -0700 Subject: [PATCH] feat: add support for producing a compressed package for VHD copy --- .github/workflows/ci.yaml | 182 +++++++++++++++++++++++----------- .github/workflows/main.yaml | 119 ++++++++++++++++++++-- .gitignore | 1 + README.md | 101 +++++++++++++++---- build_package.sh | 176 +++++++++++++++++++++++++++++++++ compile_package.sh | 174 ++++++++++++++++++++++++++++++++ download.sh | 89 ++++++++++++++--- install.sh | 8 +- install_package.sh | 192 ++++++++++++++++++++++++++++++++++++ package_manager_helpers.sh | 47 ++++++++- 10 files changed, 983 insertions(+), 106 deletions(-) create mode 100644 .gitignore create mode 100755 build_package.sh create mode 100644 compile_package.sh create mode 100644 install_package.sh diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 2ae57bd..3d2e615 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -1,4 +1,4 @@ -name: CI test build +name: CI build on: pull_request: branches: @@ -50,11 +50,11 @@ jobs: bump_each_commit: false version_format: "${{ steps.load_config.outputs.cuda_version }}-${{ steps.timestamp.outputs.timestamp }}" id: semver - - name: 'Check version' + - name: Check version run: | echo "version is ${{ steps.semver.outputs.version }}" echo "version is ${{ steps.semver.outputs.version_tag }}" - - name: 'Build and Push' + - name: Build CUDA image run: | set -x echo "tag is: " @@ -99,11 +99,11 @@ jobs: bump_each_commit: false version_format: "${{ steps.load_config.outputs.cuda_version }}-${{ steps.timestamp.outputs.timestamp }}" id: semver - - name: 'Check version' + - name: Check version run: | echo "version is ${{ steps.semver.outputs.version }}" echo "version is ${{ steps.semver.outputs.version_tag }}" - - name: 'Build and Push' + - name: Build CUDA arm64 image run: | set -x echo "tag is: " @@ -115,53 +115,125 @@ jobs: rm -rf /tmp/.buildx-cache mv /tmp/.buildx-cache-new /tmp/.buildx-cache grid: - runs-on: ubuntu-latest - strategy: - matrix: - driver_kind: ["grid"] - steps: - - uses: actions/checkout@v6 - with: - fetch-depth: 0 - - name: Load GRID config - id: load_config - run: | - grid_version=$(yq e '.grid.version' driver_config.yml) - grid_url=$(yq e '.grid.url' driver_config.yml) - echo "GRID_VERSION=$grid_version" - echo "GRID_URL=$grid_url" - echo "grid_version=$grid_version" >> $GITHUB_OUTPUT - echo "grid_url=$grid_url" >> $GITHUB_OUTPUT - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v4 - - name: Cache Docker layers - uses: actions/cache@v5 - with: - path: /tmp/.buildx-cache - key: ${{ runner.os }}-buildx-${{ matrix.driver_kind}}-${{ steps.load_config.outputs.grid_version }}-${{ github.sha }} - restore-keys: | - ${{ runner.os }}-buildx-${{ matrix.driver_kind}}-${{ steps.load_config.outputs.grid_version }} - - name: Generate timestamp - id: timestamp - run: echo "timestamp=$(date +'%Y%m%d%H%M%S')" >> $GITHUB_OUTPUT - - uses: paulhatch/semantic-version@v6.0.2 - with: - bump_each_commit: false - version_format: "${{ steps.load_config.outputs.grid_version }}-${{ steps.timestamp.outputs.timestamp }}" - id: semver - - name: 'Check version' - run: | - echo "version is ${{ steps.semver.outputs.version }}" - echo "version is ${{ steps.semver.outputs.version_tag }}" - - name: 'Build and Push' - run: | - set -x - echo "tag is: " - echo ${{ steps.semver.outputs.version }} - docker buildx build --build-arg DRIVER_URL=${{ steps.load_config.outputs.grid_url }} --build-arg DRIVER_KIND=${{ matrix.driver_kind }} --build-arg DRIVER_VERSION=${{ steps.load_config.outputs.grid_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu-grid:${{ steps.semver.outputs.version }} . - docker images - - name: Move cache - run: | - rm -rf /tmp/.buildx-cache - mv /tmp/.buildx-cache-new /tmp/.buildx-cache - + runs-on: ubuntu-latest + strategy: + matrix: + driver_kind: ["grid"] + steps: + - uses: actions/checkout@v6 + with: + fetch-depth: 0 + - name: Load GRID config + id: load_config + run: | + grid_version=$(yq e '.grid.version' driver_config.yml) + grid_url=$(yq e '.grid.url' driver_config.yml) + echo "GRID_VERSION=$grid_version" + echo "GRID_URL=$grid_url" + echo "grid_version=$grid_version" >> $GITHUB_OUTPUT + echo "grid_url=$grid_url" >> $GITHUB_OUTPUT + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v4 + - name: Cache Docker layers + uses: actions/cache@v5 + with: + path: /tmp/.buildx-cache + key: ${{ runner.os }}-buildx-${{ matrix.driver_kind}}-${{ steps.load_config.outputs.grid_version }}-${{ github.sha }} + restore-keys: | + ${{ runner.os }}-buildx-${{ matrix.driver_kind}}-${{ steps.load_config.outputs.grid_version }} + - name: Generate timestamp + id: timestamp + run: echo "timestamp=$(date +'%Y%m%d%H%M%S')" >> $GITHUB_OUTPUT + - uses: paulhatch/semantic-version@v6.0.2 + with: + bump_each_commit: false + version_format: "${{ steps.load_config.outputs.grid_version }}-${{ steps.timestamp.outputs.timestamp }}" + id: semver + - name: Check version + run: | + echo "version is ${{ steps.semver.outputs.version }}" + echo "version is ${{ steps.semver.outputs.version_tag }}" + - name: Build GRID image + run: | + set -x + echo "tag is: " + echo ${{ steps.semver.outputs.version }} + docker buildx build --build-arg DRIVER_URL=${{ steps.load_config.outputs.grid_url }} --build-arg DRIVER_KIND=${{ matrix.driver_kind }} --build-arg DRIVER_VERSION=${{ steps.load_config.outputs.grid_version }} --cache-from=type=local,src=/tmp/.buildx-cache --cache-to=type=local,dest=/tmp/.buildx-cache-new --output=type=docker -t ${{ secrets.AZURE_REGISTRY_SERVER }}/public/aks/aks-gpu-grid:${{ steps.semver.outputs.version }} . + docker images + - name: Move cache + run: | + rm -rf /tmp/.buildx-cache + mv /tmp/.buildx-cache-new /tmp/.buildx-cache + package-cuda: + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v6 + with: + fetch-depth: 0 + - name: Load CUDA config + id: load_config + run: | + cuda_version=$(yq e '.cuda.version' driver_config.yml) + echo "CUDA_VERSION=$cuda_version" + echo "cuda_version=$cuda_version" >> $GITHUB_OUTPUT + - name: Build CUDA amd64 package + run: | + bash ./build_package.sh \ + --driver-kind cuda \ + --driver-version "${{ steps.load_config.outputs.cuda_version }}" \ + --target-arch amd64 \ + --distro 24.04 + - uses: actions/upload-artifact@v4 + with: + name: aks-gpu-cuda-amd64 + path: dist/*.tar.gz + package-cuda-arm64: + runs-on: ubuntu-24.04-arm + steps: + - uses: actions/checkout@v6 + with: + fetch-depth: 0 + - name: Load CUDA config + id: load_config + run: | + cuda_version=$(yq e '.cuda.version' driver_config.yml) + echo "CUDA_VERSION=$cuda_version" + echo "cuda_version=$cuda_version" >> $GITHUB_OUTPUT + - name: Build CUDA arm64 package + run: | + bash ./build_package.sh \ + --driver-kind cuda \ + --driver-version "${{ steps.load_config.outputs.cuda_version }}" \ + --target-arch arm64 \ + --distro 24.04 + - uses: actions/upload-artifact@v4 + with: + name: aks-gpu-cuda-arm64 + path: dist/*.tar.gz + package-grid: + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v6 + with: + fetch-depth: 0 + - name: Load GRID config + id: load_config + run: | + grid_version=$(yq e '.grid.version' driver_config.yml) + grid_url=$(yq e '.grid.url' driver_config.yml) + echo "GRID_VERSION=$grid_version" + echo "GRID_URL=$grid_url" + echo "grid_version=$grid_version" >> $GITHUB_OUTPUT + echo "grid_url=$grid_url" >> $GITHUB_OUTPUT + - name: Build GRID amd64 package + run: | + bash ./build_package.sh \ + --driver-kind grid \ + --driver-version "${{ steps.load_config.outputs.grid_version }}" \ + --driver-url "${{ steps.load_config.outputs.grid_url }}" \ + --target-arch amd64 \ + --distro 24.04 + - uses: actions/upload-artifact@v4 + with: + name: aks-gpu-grid-amd64 + path: dist/*.tar.gz diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index c7f650c..da13a62 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -7,7 +7,7 @@ on: permissions: id-token: write - contents: read + contents: write jobs: cuda: runs-on: ubuntu-latest @@ -53,17 +53,17 @@ jobs: bump_each_commit: false version_format: "${{ steps.load_config.outputs.cuda_version }}-${{ steps.timestamp.outputs.timestamp }}" id: semver - - name: 'Check version' + - name: Check version run: | echo "version is ${{ steps.semver.outputs.version }}" echo "version is ${{ steps.semver.outputs.version_tag }}" - - name: 'Azure CLI login' + - name: Azure CLI login uses: azure/login@v3 with: client-id: ${{ secrets.AZURE_CLIENT_ID }} tenant-id: ${{ secrets.AZURE_TENANT_ID }} subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} - - name: 'Build and Push' + - name: Build and Push CUDA image run: | set -x echo "tag is: " @@ -110,17 +110,17 @@ jobs: bump_each_commit: false version_format: "${{ steps.load_config.outputs.cuda_version }}-${{ steps.timestamp.outputs.timestamp }}" id: semver - - name: 'Check version' + - name: Check version run: | echo "version is ${{ steps.semver.outputs.version }}" echo "version is ${{ steps.semver.outputs.version_tag }}" - - name: 'Azure CLI login' + - name: Azure CLI login uses: azure/login@v3 with: client-id: ${{ secrets.AZURE_CLIENT_ID }} tenant-id: ${{ secrets.AZURE_TENANT_ID }} subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} - - name: 'Build and Push' + - name: Build and Push CUDA arm64 image run: | set -x echo "tag is: " @@ -169,17 +169,17 @@ jobs: bump_each_commit: false version_format: "${{ steps.load_config.outputs.grid_version }}-${{ steps.timestamp.outputs.timestamp }}" id: semver - - name: 'Check version' + - name: Check version run: | echo "version is ${{ steps.semver.outputs.version }}" echo "version is ${{ steps.semver.outputs.version_tag }}" - - name: 'Azure CLI login' + - name: Azure CLI login uses: azure/login@v3 with: client-id: ${{ secrets.AZURE_CLIENT_ID }} tenant-id: ${{ secrets.AZURE_TENANT_ID }} subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }} - - name: 'Build and Push' + - name: Build and Push GRID image run: | set -x echo "tag is: " @@ -192,3 +192,102 @@ jobs: run: | rm -rf /tmp/.buildx-cache mv /tmp/.buildx-cache-new /tmp/.buildx-cache + package-cuda: + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v6 + with: + fetch-depth: 0 + - name: Load CUDA config + id: load_config + run: | + cuda_version=$(yq e '.cuda.version' driver_config.yml) + echo "CUDA_VERSION=$cuda_version" + echo "cuda_version=$cuda_version" >> $GITHUB_OUTPUT + - name: Build CUDA amd64 package + run: | + bash ./build_package.sh \ + --driver-kind cuda \ + --driver-version "${{ steps.load_config.outputs.cuda_version }}" \ + --target-arch amd64 \ + --distro 24.04 + - uses: actions/upload-artifact@v4 + with: + name: aks-gpu-cuda-amd64 + path: dist/*.tar.gz + package-cuda-arm64: + runs-on: ubuntu-24.04-arm + steps: + - uses: actions/checkout@v6 + with: + fetch-depth: 0 + - name: Load CUDA config + id: load_config + run: | + cuda_version=$(yq e '.cuda.version' driver_config.yml) + echo "CUDA_VERSION=$cuda_version" + echo "cuda_version=$cuda_version" >> $GITHUB_OUTPUT + - name: Build CUDA arm64 package + run: | + bash ./build_package.sh \ + --driver-kind cuda \ + --driver-version "${{ steps.load_config.outputs.cuda_version }}" \ + --target-arch arm64 \ + --distro 24.04 + - uses: actions/upload-artifact@v4 + with: + name: aks-gpu-cuda-arm64 + path: dist/*.tar.gz + package-grid: + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v6 + with: + fetch-depth: 0 + - name: Load GRID config + id: load_config + run: | + grid_version=$(yq e '.grid.version' driver_config.yml) + grid_url=$(yq e '.grid.url' driver_config.yml) + echo "GRID_VERSION=$grid_version" + echo "GRID_URL=$grid_url" + echo "grid_version=$grid_version" >> $GITHUB_OUTPUT + echo "grid_url=$grid_url" >> $GITHUB_OUTPUT + - name: Build GRID amd64 package + run: | + bash ./build_package.sh \ + --driver-kind grid \ + --driver-version "${{ steps.load_config.outputs.grid_version }}" \ + --driver-url "${{ steps.load_config.outputs.grid_url }}" \ + --target-arch amd64 \ + --distro 24.04 + - uses: actions/upload-artifact@v4 + with: + name: aks-gpu-grid-amd64 + path: dist/*.tar.gz + publish-release: + runs-on: ubuntu-24.04 + needs: + - package-cuda + - package-cuda-arm64 + - package-grid + steps: + - uses: actions/download-artifact@v4 + with: + path: release-assets + merge-multiple: true + - name: Create or update release assets + env: + GH_TOKEN: ${{ github.token }} + run: | + short_sha="${GITHUB_SHA::12}" + release_tag="gpu-packages-${short_sha}" + release_title="GPU packages ${short_sha}" + + if gh release view "${release_tag}" >/dev/null 2>&1; then + gh release upload "${release_tag}" release-assets/*.tar.gz --clobber + else + gh release create "${release_tag}" release-assets/*.tar.gz \ + --title "${release_title}" \ + --notes "Host-executable GPU driver packages built from commit ${GITHUB_SHA}." + fi diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..849ddff --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +dist/ diff --git a/README.md b/README.md index 307a592..8f31e0e 100644 --- a/README.md +++ b/README.md @@ -1,34 +1,99 @@ -# Driver container image for AKS VHD +# GPU driver packages for AKS VHD -This repo provides steps to build a container image with all components required for -Kubernetes Nvidia GPU integration. Run it as a privileged container in the host PID namespace. -It will enter the host mount namespace and install the nvidia drivers, container runtime, -and associated libraries on the host, validating their functionality +This repo builds self-contained tar.gz packages with all components required for +Kubernetes NVIDIA GPU integration on Ubuntu hosts. Copy a package to the target VM, +extract it, run the bundled `compile_package.sh` during image build if you want +a kernel-specific precompiled NVIDIA installer, and run `install_package.sh` +later on the host to install the NVIDIA drivers, container runtime tooling, and +associated libraries. ## Build + +### CUDA amd64 +``` +bash ./build_package.sh \ + --driver-kind cuda \ + --driver-version "$(yq e '.cuda.version' driver_config.yml)" \ + --target-arch amd64 \ + --distro 24.04 +``` + +### CUDA arm64 +```bash +bash ./build_package.sh \ + --driver-kind cuda \ + --driver-version "$(yq e '.cuda.version' driver_config.yml)" \ + --target-arch arm64 \ + --distro 24.04 ``` -docker build -f Dockerfile --build-arg DRIVER_VERSION=??? -t docker.io/alexeldeib/aks-gpu:latest . -docker push docker.io/alexeldeib/aks-gpu:latest -``` -#### For DRIVER_VERSION, following versions are known to work : -- 470.82.01 -- 510.47.03 -- 515.65.01 +### GRID amd64 +```bash +bash ./build_package.sh \ + --driver-kind grid \ + --driver-version "$(yq e '.grid.version' driver_config.yml)" \ + --driver-url "$(yq e '.grid.url' driver_config.yml)" \ + --target-arch amd64 \ + --distro 24.04 +``` + +Artifacts are written to `./dist`. + +Pushes to `main` also publish the generated tar.gz files to a GitHub Release named +`gpu-packages-`, which makes the packages easy to download from downstream +automation outside the original workflow run. ## Run + +### Compile during VHD build ```bash -mkdir -p /opt/{actions,gpu} -ctr image pull docker.io/alexeldeib/aks-gpu:latest -ctr run --privileged --net-host --with-ns pid:/proc/1/ns/pid --mount type=bind,src=/opt/gpu,dst=/mnt/gpu,options=rbind --mount type=bind,src=/opt/actions,dst=/mnt/actions,options=rbind -t docker.io/alexeldeib/aks-gpu:latest gpuinstall /entrypoint.sh install +tar -C /opt -xzf dist/aks-gpu-cuda--ubuntu-24.04-amd64.tar.gz +cd /opt/aks-gpu-cuda--ubuntu-24.04-amd64 +sudo bash ./compile_package.sh ``` -or Docker (untested...) +`compile_package.sh` generates a kernel-specific extracted installer tree at +`nvidia-custom/`, writes matching metadata to `nvidia-custom.metadata`, and +prunes the package root down to the runtime-only payload so the image does not +keep the large build-time sources around. + +### Install at runtime ```bash -docker run -it --privileged --net=host --pid=host -v /opt/gpu:/mnt/gpu -v /opt/actions:/mnt/actions --rm docker.io/alexeldeib/aks-gpu:latest install +cd /opt/aks-gpu-cuda--ubuntu-24.04-amd64 +sudo bash ./install_package.sh ``` -Note the `--with-ns pid:/proc/1/ns/pid` and `--privileged`, as well as the bind mounts, these are key. +If `nvidia-custom/` exists and matches the current kernel and package metadata, +`install_package.sh` runs `nvidia-installer` directly from that extracted +precompiled tree and skips both runtime self-extraction and runtime driver +compilation. Older package layouts with `nvidia-custom.run` still work as a +fallback. Otherwise it falls back to the original compile-and-install path when +the extracted NVIDIA sources are still present. + +For fast-path experiments, `install_package.sh` also supports +`SKIP_LDCONFIG=1` to skip the linker cache refresh after installation; the +script manually `insmod`s the installed NVIDIA modules before validating with +`nvidia-smi`. + +The installer removes the extracted payload directory when it completes successfully, +so extract packages into a disposable working directory such as `/tmp`. + +## Legacy container compatibility + +The old privileged container flow still works during migration. + +```bash +docker build \ + --build-arg DRIVER_KIND=cuda \ + --build-arg DRIVER_VERSION="$(yq e '.cuda.version' driver_config.yml)" \ + -f Dockerfile \ + -t aks-gpu:legacy-cuda . + +docker run -it --privileged --net=host --pid=host \ + -v /opt/gpu:/mnt/gpu \ + -v /opt/actions:/mnt/actions \ + --rm aks-gpu:legacy-cuda install +``` ## Fabric manager installation diff --git a/build_package.sh b/build_package.sh new file mode 100755 index 0000000..3707295 --- /dev/null +++ b/build_package.sh @@ -0,0 +1,176 @@ +#!/usr/bin/env bash +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +usage() { + cat <<'EOF' +Usage: build_package.sh --driver-kind --driver-version [options] + +Options: + --driver-url Required for GRID packages. + --target-arch Target architecture: amd64 or arm64. Defaults to the build host arch. + --distro Ubuntu distro version. Defaults to the current host VERSION_ID. + --output-dir Directory for generated tar.gz artifacts. Defaults to ./dist. +EOF +} + +normalize_arch() { + case "${1}" in + amd64|x86_64) + echo "amd64" + ;; + arm64|aarch64) + echo "arm64" + ;; + *) + echo "Unsupported architecture: ${1}" >&2 + exit 1 + ;; + esac +} + +detect_host_arch() { + normalize_arch "$(uname -m)" +} + +copy_static_assets() { + local package_root="${1}" + local asset + + for asset in \ + 10-nvidia-runtime.toml \ + 71-nvidia-char-dev.rules \ + blacklist-nouveau.conf \ + compile_package.sh \ + fm_run_package_installer.sh \ + install_package.sh \ + package_manager_helpers.sh + do + cp "${SCRIPT_DIR}/${asset}" "${package_root}/${asset}" + done +} + +write_config() { + local package_root="${1}" + + cat > "${package_root}/config.sh" <&2 + usage + exit 1 + ;; + esac +done + +if [[ -z "${DRIVER_KIND}" || -z "${DRIVER_VERSION}" ]]; then + echo "--driver-kind and --driver-version are required" >&2 + usage + exit 1 +fi + +if [[ "${DRIVER_KIND}" != "cuda" && "${DRIVER_KIND}" != "grid" ]]; then + echo "Unsupported driver kind: ${DRIVER_KIND}" >&2 + exit 1 +fi + +if [[ "${DRIVER_KIND}" == "grid" && -z "${DRIVER_URL}" ]]; then + echo "--driver-url is required for GRID packages" >&2 + exit 1 +fi + +TARGET_ARCH="${TARGET_ARCH:-$(detect_host_arch)}" + +if [[ -f /etc/os-release ]]; then + source /etc/os-release +fi + +HOST_DISTRO="${VERSION_ID:-}" + +if [[ -z "${HOST_DISTRO}" ]]; then + echo "Package builds must run on an Ubuntu host with /etc/os-release available." >&2 + exit 1 +fi + +DISTRO="${DISTRO:-${HOST_DISTRO}}" + +if [[ "${DISTRO}" != "${HOST_DISTRO}" ]]; then + echo "Package builds must run on the target Ubuntu release. Host is ${HOST_DISTRO}, requested ${DISTRO}." >&2 + exit 1 +fi + +if [[ "${TARGET_ARCH}" != "$(detect_host_arch)" ]]; then + echo "Package builds must run on the target architecture. Host is $(detect_host_arch), requested ${TARGET_ARCH}." >&2 + exit 1 +fi + +if [[ "${DRIVER_KIND}" == "grid" && "${TARGET_ARCH}" != "amd64" ]]; then + echo "GRID packages are only supported on amd64" >&2 + exit 1 +fi + +artifact_name="aks-gpu-${DRIVER_KIND}-${DRIVER_VERSION}-ubuntu-${DISTRO}-${TARGET_ARCH}" +workdir="$(mktemp -d)" +package_root="${workdir}/${artifact_name}" + +cleanup() { + rm -rf "${workdir}" +} + +trap cleanup EXIT + +mkdir -p "${package_root}" "${OUTPUT_DIR}" + +copy_static_assets "${package_root}" +write_config "${package_root}" + +TARGETARCH="${TARGET_ARCH}" GPU_ROOT="${package_root}" DRIVER_URL="${DRIVER_URL}" bash "${SCRIPT_DIR}/download.sh" + +tar -C "${workdir}" -czf "${OUTPUT_DIR}/${artifact_name}.tar.gz" "${artifact_name}" + +echo "Created ${OUTPUT_DIR}/${artifact_name}.tar.gz" diff --git a/compile_package.sh b/compile_package.sh new file mode 100644 index 0000000..4666121 --- /dev/null +++ b/compile_package.sh @@ -0,0 +1,174 @@ +#!/usr/bin/env bash +set -euxo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +GPU_ROOT="${GPU_ROOT:-${SCRIPT_DIR}}" + +if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then + cat <<'EOF' +Usage: compile_package.sh + +Builds a kernel-specific NVIDIA custom installer tree for the currently +running kernel and saves it as ./nvidia-custom inside the package root. +EOF + exit 0 +fi + +source "${GPU_ROOT}/config.sh" + +PS4='+ $(date -u -I"seconds" | cut -c1-19) ' + +get_driver_runfile_name() { + local arch="${1:-$(uname -m)}" + + if [[ "${DRIVER_KIND}" == "cuda" ]]; then + echo "NVIDIA-Linux-${arch}-${DRIVER_VERSION}" + return 0 + fi + + if [[ "${DRIVER_KIND}" == "grid" ]]; then + if [[ "${arch}" != "x86_64" ]]; then + echo "GRID driver is only supported on x86_64 architecture" >&2 + exit 1 + fi + + echo "NVIDIA-Linux-x86_64-${DRIVER_VERSION}-grid-azure" + return 0 + fi + + echo "Invalid driver kind: ${DRIVER_KIND}" >&2 + exit 1 +} + +get_fabric_manager_arch() { + case "$(uname -m)" in + arm64|aarch64) + echo "sbsa" + ;; + amd64|x86_64) + echo "x86_64" + ;; + *) + uname -m + ;; + esac +} + +KERNEL_NAME="$(uname -r)" +LOG_FILE_NAME="/var/log/nvidia-precompile-$(date +%s).log" +KERNEL_SOURCE_PATH="/lib/modules/${KERNEL_NAME}/build" +RUNFILE="$(get_driver_runfile_name)" +SOURCE_RUNFILE="${GPU_ROOT}/${RUNFILE}.run" +PRECOMPILED_METADATA="${GPU_ROOT}/nvidia-custom.metadata" +PRECOMPILED_INSTALLER_DIR_NAME="nvidia-custom" +PRECOMPILED_INSTALLER_DIR="${GPU_ROOT}/${PRECOMPILED_INSTALLER_DIR_NAME}" +BUILD_WORKDIR="$(mktemp -d)" +EXTRACT_WORKDIR="$(mktemp -d)" + +cleanup() { + rm -rf "${BUILD_WORKDIR}" "${EXTRACT_WORKDIR}" +} + +trap cleanup EXIT + +is_runtime_entry() { + local entry_name="${1}" + local fm_arch apt_package + + case "${entry_name}" in + 10-nvidia-runtime.toml|71-nvidia-char-dev.rules|blacklist-nouveau.conf|config.sh|install_package.sh|package_manager_helpers.sh|nvidia-custom|nvidia-custom.metadata) + return 0 + ;; + esac + + if [[ "${DRIVER_KIND}" == "cuda" ]]; then + fm_arch="$(get_fabric_manager_arch)" + if [[ "${entry_name}" == "fabricmanager-linux-${fm_arch}-${DRIVER_VERSION}" ]]; then + return 0 + fi + fi + + for apt_package in $NVIDIA_PACKAGES; do + if [[ "${entry_name}" == ${apt_package}_${NVIDIA_CONTAINER_TOOLKIT_VER}* ]]; then + return 0 + fi + done + + return 1 +} + +prune_runtime_payload() { + local path entry_name + + shopt -s nullglob dotglob + for path in "${GPU_ROOT}"/*; do + entry_name="$(basename "${path}")" + if ! is_runtime_entry "${entry_name}"; then + rm -rf "${path}" + fi + done + shopt -u nullglob dotglob +} + +if [[ ! -f "${SOURCE_RUNFILE}" ]]; then + echo "Expected to find source runfile '${SOURCE_RUNFILE}', but it does not exist" >&2 + exit 1 +fi + +if [[ ! -d "${KERNEL_SOURCE_PATH}" ]]; then + echo "Expected to find kernel headers at '${KERNEL_SOURCE_PATH}', but they do not exist" >&2 + exit 1 +fi + +rm -f "${PRECOMPILED_METADATA}" +rm -rf "${PRECOMPILED_INSTALLER_DIR}" + +pushd "${BUILD_WORKDIR}" +sh "${SOURCE_RUNFILE}" \ + --ui=none \ + --no-questions \ + --accept-license \ + --no-dkms \ + --add-this-kernel \ + --kernel-source-path="${KERNEL_SOURCE_PATH}" \ + --log-file-name="${LOG_FILE_NAME}" +popd + +GENERATED_RUNFILE="${BUILD_WORKDIR}/${RUNFILE}-custom.run" + +if [[ ! -f "${GENERATED_RUNFILE}" ]]; then + echo "Expected to find generated precompiled runfile '${GENERATED_RUNFILE}', but it does not exist" >&2 + exit 1 +fi + +pushd "${EXTRACT_WORKDIR}" +sh "${GENERATED_RUNFILE}" -x +popd + +GENERATED_INSTALLER_DIR="" +for candidate in "${EXTRACT_WORKDIR}"/*; do + if [[ -d "${candidate}" && -x "${candidate}/nvidia-installer" ]]; then + GENERATED_INSTALLER_DIR="${candidate}" + break + fi +done + +if [[ -z "${GENERATED_INSTALLER_DIR}" ]]; then + echo "Expected to find an extracted precompiled installer tree in '${EXTRACT_WORKDIR}', but none was created" >&2 + exit 1 +fi + +mv "${GENERATED_INSTALLER_DIR}" "${PRECOMPILED_INSTALLER_DIR}" +cat > "${PRECOMPILED_METADATA}" <&2 + exit 1 +fi + +if [[ -z "${DRIVER_KIND:-}" ]]; then + echo "DRIVER_KIND must be set in ${GPU_ROOT}/config.sh" >&2 + exit 1 +fi + +normalize_arch() { + case "${1}" in + amd64|x86_64) + echo "amd64" + ;; + arm64|aarch64) + echo "arm64" + ;; + *) + echo "Unsupported architecture: ${1}" >&2 + exit 1 + ;; + esac +} + +TARGETARCH="${TARGETARCH:-$(normalize_arch "$(uname -m)")}" workdir="$(mktemp -d)" +apt_root="${workdir}/apt" + +cleanup() { + rm -rf "${workdir}" +} + +trap cleanup EXIT + pushd "$workdir" || exit NVIDIA_DRIVER_ARCH=$TARGETARCH @@ -26,6 +63,10 @@ if [[ "${DRIVER_KIND}" == "cuda" ]]; then RUNFILE="NVIDIA-Linux-${NVIDIA_DRIVER_ARCH}-${DRIVER_VERSION}" curl -fsSLO https://us.download.nvidia.com/tesla/${DRIVER_VERSION}/${RUNFILE}.run elif [[ "${DRIVER_KIND}" == "grid" ]]; then + if [[ -z "${DRIVER_URL:-}" ]]; then + echo "DRIVER_URL must be set when DRIVER_KIND=grid" >&2 + exit 1 + fi RUNFILE="NVIDIA-Linux-${NVIDIA_DRIVER_ARCH}-${DRIVER_VERSION}-grid-azure" curl -fsSLO "${DRIVER_URL}" else @@ -34,18 +75,17 @@ else fi # download nvidia drivers, move to permanent cache -mv ${RUNFILE}.run /opt/gpu/${RUNFILE}.run -pushd /opt/gpu +mv ${RUNFILE}.run "${GPU_ROOT}/${RUNFILE}.run" +pushd "${GPU_ROOT}" # extract runfile, takes some time, so do ahead of time -sh /opt/gpu/${RUNFILE}.run -x -rm /opt/gpu/${RUNFILE}.run +sh "${GPU_ROOT}/${RUNFILE}.run" -x popd install_fabric_manager () { curl -fsSLO https://developer.download.nvidia.com/compute/nvidia-driver/redist/fabricmanager/linux-${NVIDIA_FM_ARCH}/fabricmanager-linux-${NVIDIA_FM_ARCH}-${DRIVER_VERSION}-archive.tar.xz tar -xvf fabricmanager-linux-${NVIDIA_FM_ARCH}-${DRIVER_VERSION}-archive.tar.xz - mv fabricmanager-linux-${NVIDIA_FM_ARCH}-${DRIVER_VERSION}-archive /opt/gpu/fabricmanager-linux-${NVIDIA_FM_ARCH}-${DRIVER_VERSION} - mv /opt/gpu/fm_run_package_installer.sh /opt/gpu/fabricmanager-linux-${NVIDIA_FM_ARCH}-${DRIVER_VERSION}/sbin/fm_run_package_installer.sh + mv fabricmanager-linux-${NVIDIA_FM_ARCH}-${DRIVER_VERSION}-archive "${GPU_ROOT}/fabricmanager-linux-${NVIDIA_FM_ARCH}-${DRIVER_VERSION}" + cp "${GPU_ROOT}/fm_run_package_installer.sh" "${GPU_ROOT}/fabricmanager-linux-${NVIDIA_FM_ARCH}-${DRIVER_VERSION}/sbin/fm_run_package_installer.sh" } if [[ "${DRIVER_KIND}" == "cuda" ]]; then @@ -53,21 +93,38 @@ if [[ "${DRIVER_KIND}" == "cuda" ]]; then install_fabric_manager fi +apt_get_with_temp_root() { + apt-get \ + -o Dir::Etc::sourcelist=/dev/null \ + -o Dir::Etc::sourceparts="${apt_root}/etc/apt/sources.list.d" \ + -o Dir::Etc::trusted=/dev/null \ + -o Dir::Etc::trustedparts="${apt_root}/etc/apt/trusted.gpg.d" \ + -o Dir::State::status="${apt_root}/var/lib/dpkg/status" \ + -o Dir::State::lists="${apt_root}/var/lib/apt/lists" \ + -o Dir::Cache::archives="${apt_root}/var/cache/apt/archives" \ + "$@" +} + +mkdir -p \ + "${apt_root}/etc/apt/sources.list.d" \ + "${apt_root}/etc/apt/trusted.gpg.d" \ + "${apt_root}/var/lib/apt/lists/partial" \ + "${apt_root}/var/cache/apt/archives/partial" \ + "${apt_root}/var/lib/dpkg" +touch "${apt_root}/var/lib/dpkg/status" # configure nvidia apt repo to cache packages curl -fsSLO https://nvidia.github.io/libnvidia-container/gpgkey -gpg --dearmor -o aptnvidia.gpg gpgkey -mv aptnvidia.gpg /etc/apt/trusted.gpg.d/aptnvidia.gpg -curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list -o /etc/apt/sources.list.d/nvidia-container-toolkit.list +gpg --dearmor -o "${apt_root}/etc/apt/trusted.gpg.d/aptnvidia.gpg" gpgkey +curl -fsSL https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list -o "${apt_root}/etc/apt/sources.list.d/nvidia-container-toolkit.list" -apt update -chmod 644 /etc/apt/trusted.gpg.d/* +apt_get_with_temp_root update +chmod 644 "${apt_root}/etc/apt/trusted.gpg.d/"* # download nvidia debian packages for nvidia-container-runtime compat for apt_package in $NVIDIA_PACKAGES; do - apt-get download ${apt_package}=${NVIDIA_CONTAINER_TOOLKIT_VER}* - mv ${apt_package}_${NVIDIA_CONTAINER_TOOLKIT_VER}* /opt/gpu + apt_get_with_temp_root download ${apt_package}=${NVIDIA_CONTAINER_TOOLKIT_VER}* + mv ${apt_package}_${NVIDIA_CONTAINER_TOOLKIT_VER}* "${GPU_ROOT}" done popd || exit -rm -r "$workdir" diff --git a/install.sh b/install.sh index 26aede0..d498f2d 100644 --- a/install.sh +++ b/install.sh @@ -31,7 +31,7 @@ use_package_manager_with_retries wait_for_dpkg_lock install_cached_nvidia_packag # blacklist nouveau driver, nvidia driver dependency cp /opt/gpu/blacklist-nouveau.conf /etc/modprobe.d/blacklist-nouveau.conf -update-initramfs -u +update_initramfs_for_nouveau_blacklist # clean up lingering files from previous install set +e @@ -95,11 +95,7 @@ nvidia-smi # install fabricmanager for nvlink based systems if [[ "${DRIVER_KIND}" == "cuda" ]]; then - NVIDIA_FM_ARCH=$(uname -m) - if [ $NVIDIA_FM_ARCH = "arm64" ]; then - # NVIDIA uses the name "SBSA" for ARM64 platforms for the fabric manager. See https://en.wikipedia.org/wiki/Server_Base_System_Architecture - NVIDIA_FM_ARCH="sbsa" - fi + NVIDIA_FM_ARCH="$(get_fabric_manager_arch)" bash /opt/gpu/fabricmanager-linux-${NVIDIA_FM_ARCH}-${DRIVER_VERSION}/sbin/fm_run_package_installer.sh fi diff --git a/install_package.sh b/install_package.sh new file mode 100644 index 0000000..6e74c55 --- /dev/null +++ b/install_package.sh @@ -0,0 +1,192 @@ +#!/usr/bin/env bash +set -euxo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +GPU_ROOT="${GPU_ROOT:-${SCRIPT_DIR}}" + +source "${GPU_ROOT}/config.sh" +source "${GPU_ROOT}/package_manager_helpers.sh" + +trap 'PS4="+ "' exit +PS4='+ $(date -u -I"seconds" | cut -c1-19) ' + +get_driver_runfile_name() { + local arch="${1:-$(uname -m)}" + + if [[ "${DRIVER_KIND}" == "cuda" ]]; then + echo "NVIDIA-Linux-${arch}-${DRIVER_VERSION}" + return 0 + fi + + if [[ "${DRIVER_KIND}" == "grid" ]]; then + if [[ "${arch}" != "x86_64" ]]; then + echo "GRID driver is only supported on x86_64 architecture" >&2 + exit 1 + fi + + echo "NVIDIA-Linux-x86_64-${DRIVER_VERSION}-grid-azure" + return 0 + fi + + echo "Invalid driver kind: ${DRIVER_KIND}" >&2 + exit 1 +} + +KERNEL_NAME="$(uname -r)" +LOG_FILE_NAME="/var/log/nvidia-installer-$(date +%s).log" +RUNFILE="$(get_driver_runfile_name)" +PRECOMPILED_RUNFILE="${GPU_ROOT}/nvidia-custom.run" +PRECOMPILED_METADATA="${GPU_ROOT}/nvidia-custom.metadata" +PRECOMPILED_INSTALLER_DIR_NAME="nvidia-custom" +PRECOMPILED_INSTALLER_DIR="${GPU_ROOT}/${PRECOMPILED_INSTALLER_DIR_NAME}" +USED_PRECOMPILED_ARTIFACT=0 + +set +euo pipefail +open_devices="$(lsof /dev/nvidia* 2>/dev/null)" +echo "Open devices: $open_devices" + +open_gridd="$(lsof /usr/bin/nvidia-gridd 2>/dev/null)" +echo "Open gridd: $open_gridd" + +set -euo pipefail + +install_cached_nvidia_packages() { +for apt_package in $NVIDIA_PACKAGES; do + dpkg -i --force-overwrite "${GPU_ROOT}/${apt_package}_${NVIDIA_CONTAINER_TOOLKIT_VER}"* +done +} + +find_installed_nvidia_module() { + local file_name="${1}" + + find "/lib/modules/${KERNEL_NAME}" -type f -name "${file_name}" | head -n1 +} + +is_nvidia_module_loaded() { + local module_name="${1}" + + lsmod | awk '{print $1}' | grep -qx "${module_name}" +} + +insmod_installed_nvidia_module() { + local module_name="${1}" + local file_name="${2}" + local required="${3:-0}" + local module_path="" + + if is_nvidia_module_loaded "${module_name}"; then + return 0 + fi + + module_path="$(find_installed_nvidia_module "${file_name}")" + if [[ -z "${module_path}" ]]; then + if [[ "${required}" == "1" ]]; then + echo "Expected to find installed NVIDIA module '${file_name}' under /lib/modules/${KERNEL_NAME}, but it does not exist" >&2 + exit 1 + fi + + return 0 + fi + + insmod "${module_path}" +} + +load_installed_nvidia_modules() { + insmod_installed_nvidia_module "nvidia" "nvidia.ko" 1 + insmod_installed_nvidia_module "nvidia_modeset" "nvidia-modeset.ko" + insmod_installed_nvidia_module "nvidia_uvm" "nvidia-uvm.ko" +} + +use_package_manager_with_retries wait_for_dpkg_lock install_cached_nvidia_packages 10 3 + +cp "${GPU_ROOT}/blacklist-nouveau.conf" /etc/modprobe.d/blacklist-nouveau.conf +update_initramfs_for_nouveau_blacklist + +set +e +umount -l /usr/lib/$(uname -m)-linux-gnu || true +umount -l /tmp/overlay || true +rm -r /tmp/overlay || true +set -e + +mkdir /tmp/overlay +mount -t tmpfs tmpfs /tmp/overlay +mkdir /tmp/overlay/{workdir,lib64} +mkdir -p ${GPU_DEST}/lib64 +mount -t overlay overlay -o lowerdir=/usr/lib/$(uname -m)-linux-gnu,upperdir=/tmp/overlay/lib64,workdir=/tmp/overlay/workdir /usr/lib/$(uname -m)-linux-gnu + +if [[ -f "${PRECOMPILED_METADATA}" ]]; then + # shellcheck disable=SC1090 + source "${PRECOMPILED_METADATA}" + + PRECOMPILED_INSTALLER_DIR_NAME="${PRECOMPILED_INSTALLER_DIR_NAME:-nvidia-custom}" + PRECOMPILED_INSTALLER_DIR="${GPU_ROOT}/${PRECOMPILED_INSTALLER_DIR_NAME}" + + if [[ "${PRECOMPILED_KERNEL_NAME}" != "${KERNEL_NAME}" ]]; then + echo "Precompiled installer targets kernel '${PRECOMPILED_KERNEL_NAME}', but current kernel is '${KERNEL_NAME}'" >&2 + exit 1 + fi + + if [[ "${PRECOMPILED_DRIVER_VERSION}" != "${DRIVER_VERSION}" || "${PRECOMPILED_DRIVER_KIND}" != "${DRIVER_KIND}" || "${PRECOMPILED_ARCH}" != "$(uname -m)" ]]; then + echo "Precompiled installer metadata does not match the current package configuration" >&2 + exit 1 + fi +elif [[ -d "${PRECOMPILED_INSTALLER_DIR}" || -f "${PRECOMPILED_RUNFILE}" ]]; then + echo "Expected to find precompiled metadata '${PRECOMPILED_METADATA}', but it does not exist" >&2 + exit 1 +fi + +if [[ -d "${PRECOMPILED_INSTALLER_DIR}" ]]; then + echo "Installing from precompiled installer directory ${PRECOMPILED_INSTALLER_DIR}" + pushd "${PRECOMPILED_INSTALLER_DIR}" + ./nvidia-installer -s --skip-depmod --no-opengl-files --no-install-libglvnd --log-file-name="${LOG_FILE_NAME}" -a --no-drm --no-dkms + popd + USED_PRECOMPILED_ARTIFACT=1 +elif [[ -f "${PRECOMPILED_RUNFILE}" ]]; then + echo "Installing from precompiled runfile ${PRECOMPILED_RUNFILE}" + sh "${PRECOMPILED_RUNFILE}" -s --skip-depmod --no-opengl-files --no-install-libglvnd --log-file-name="${LOG_FILE_NAME}" -a --no-drm --no-dkms + USED_PRECOMPILED_ARTIFACT=1 +elif [[ -d "${GPU_ROOT}/${RUNFILE}" ]]; then + echo "Precompiled runfile not found, falling back to local compilation" + pushd "${GPU_ROOT}" + "${GPU_ROOT}/${RUNFILE}/nvidia-installer" -s -k="${KERNEL_NAME}" --skip-depmod --no-opengl-files --no-install-libglvnd --log-file-name="${LOG_FILE_NAME}" -a --no-drm --dkms + popd +else + echo "Neither a precompiled installer, precompiled runfile, nor extracted installer sources are available in '${GPU_ROOT}'" >&2 + exit 1 +fi + +load_installed_nvidia_modules +nvidia-smi + +cp -a /tmp/overlay/lib64 ${GPU_DEST}/lib64 + +echo "${GPU_DEST}/lib64" > /etc/ld.so.conf.d/nvidia.conf +ldconfig + +set +e +umount -l /usr/lib/$(uname -m)-linux-gnu +umount /tmp/overlay +rm -r /tmp/overlay +set -e + +if [[ "${USED_PRECOMPILED_ARTIFACT}" -eq 0 ]]; then + dkms status +fi +nvidia-modprobe -u -c0 + +cp -r /usr/bin/lib64/lib64/* /usr/lib/$(uname -m)-linux-gnu/ +nvidia-smi + +if [[ "${DRIVER_KIND}" == "cuda" ]]; then + NVIDIA_FM_ARCH="$(get_fabric_manager_arch)" + bash "${GPU_ROOT}/fabricmanager-linux-${NVIDIA_FM_ARCH}-${DRIVER_VERSION}/sbin/fm_run_package_installer.sh" +fi + +mkdir -p /etc/containerd/config.d +cp "${GPU_ROOT}/10-nvidia-runtime.toml" /etc/containerd/config.d/10-nvidia-runtime.toml + +mkdir -p "$(dirname /lib/udev/rules.d/71-nvidia-dev-char.rules)" +cp "${GPU_ROOT}/71-nvidia-char-dev.rules" /lib/udev/rules.d/71-nvidia-dev-char.rules +/usr/bin/nvidia-ctk system create-dev-char-symlinks --create-all + +rm -r "${GPU_ROOT}" diff --git a/package_manager_helpers.sh b/package_manager_helpers.sh index cea6dfd..f56e74e 100755 --- a/package_manager_helpers.sh +++ b/package_manager_helpers.sh @@ -12,6 +12,20 @@ wait_for_dpkg_lock() { done } +get_fabric_manager_arch() { + case "$(uname -m)" in + arm64|aarch64) + echo "sbsa" + ;; + amd64|x86_64) + echo "x86_64" + ;; + *) + uname -m + ;; + esac +} + use_package_manager_with_retries() { local wait_for_locks=$1 local install_dependencies=$2 @@ -28,4 +42,35 @@ use_package_manager_with_retries() { else sleep "$sleep_duration" fi done -} \ No newline at end of file +} + +update_initramfs_for_nouveau_blacklist() { + local kernel_name initrd_path + + kernel_name="$(uname -r)" + initrd_path="/boot/initrd.img-${kernel_name}" + + if ! command -v update-initramfs >/dev/null 2>&1; then + echo "Skipping initramfs update because update-initramfs is unavailable" + return 0 + fi + + if ! command -v lsinitramfs >/dev/null 2>&1; then + echo "lsinitramfs is unavailable; updating initramfs conservatively" + update-initramfs -u -k "${kernel_name}" + return 0 + fi + + if [[ ! -f "${initrd_path}" ]]; then + echo "Skipping initramfs update because ${initrd_path} does not exist" + return 0 + fi + + if lsinitramfs "${initrd_path}" | grep -Eq '(^|/)kernel/.*/nouveau\.ko(\.[^.]+)?$'; then + echo "Updating initramfs to apply nouveau blacklist for ${kernel_name}" + update-initramfs -u -k "${kernel_name}" + return 0 + fi + + echo "Skipping initramfs update because nouveau is not present in ${initrd_path}" +}