From 680faa13c6e2e8d575c0d24697ff77f8058643e6 Mon Sep 17 00:00:00 2001 From: John Paul Bunn Date: Wed, 7 May 2025 01:19:12 +0000 Subject: [PATCH 1/2] Adding LATEST driver installer daemonset for cgpu --- .../cos/daemonset-confidential-latest.yaml | 208 ++++++++++++++++++ 1 file changed, 208 insertions(+) create mode 100644 nvidia-driver-installer/cos/daemonset-confidential-latest.yaml diff --git a/nvidia-driver-installer/cos/daemonset-confidential-latest.yaml b/nvidia-driver-installer/cos/daemonset-confidential-latest.yaml new file mode 100644 index 000000000..d1183fa60 --- /dev/null +++ b/nvidia-driver-installer/cos/daemonset-confidential-latest.yaml @@ -0,0 +1,208 @@ +# Copyright 2025 Google Inc. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# The Dockerfile and other source for this daemonset are in +# https://cos.googlesource.com/cos/tools/+/refs/heads/master/src/cmd/cos_gpu_installer/ +# +# This is the same as ../../daemonset.yaml except that it assumes that the +# docker image is present on the node instead of downloading from GCR. This +# allows easier upgrades because GKE can preload the correct image on the +# node and the daemonset can just use that image. + +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nvidia-driver-installer + namespace: kube-system + labels: + k8s-app: nvidia-driver-installer +spec: + selector: + matchLabels: + k8s-app: nvidia-driver-installer + updateStrategy: + type: RollingUpdate + template: + metadata: + labels: + name: nvidia-driver-installer + k8s-app: nvidia-driver-installer + spec: + priorityClassName: system-node-critical + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: cloud.google.com/gke-accelerator + operator: Exists + - key: cloud.google.com/gke-gpu-driver-version + operator: DoesNotExist + - key: cloud.google.com/gke-confidential-nodes-instance-type + operator: In + values: + - TDX + tolerations: + - operator: "Exists" + hostNetwork: true + hostPID: true + volumes: + - name: dev + hostPath: + path: /dev + - name: vulkan-icd-mount + hostPath: + path: /home/kubernetes/bin/nvidia/vulkan/icd.d + - name: nvidia-install-dir-host + hostPath: + path: /home/kubernetes/bin/nvidia + - name: root-mount + hostPath: + path: / + - name: cos-tools + hostPath: + path: /var/lib/cos-tools + - name: nvidia-config + hostPath: + path: /etc/nvidia + initContainers: + - image: "cos-nvidia-installer:fixed" + imagePullPolicy: Never + name: nvidia-driver-installer + resources: + requests: + cpu: 150m + securityContext: + privileged: true + env: + - name: NVIDIA_INSTALL_DIR_HOST + value: /home/kubernetes/bin/nvidia + - name: NVIDIA_INSTALL_DIR_CONTAINER + value: /usr/local/nvidia + - name: VULKAN_ICD_DIR_HOST + value: /home/kubernetes/bin/nvidia/vulkan/icd.d + - name: VULKAN_ICD_DIR_CONTAINER + value: /etc/vulkan/icd.d + - name: ROOT_MOUNT_DIR + value: /root + - name: COS_TOOLS_DIR_HOST + value: /var/lib/cos-tools + - name: COS_TOOLS_DIR_CONTAINER + value: /build/cos-tools + volumeMounts: + - name: nvidia-install-dir-host + mountPath: /usr/local/nvidia + - name: vulkan-icd-mount + mountPath: /etc/vulkan/icd.d + - name: dev + mountPath: /dev + - name: root-mount + mountPath: /root + - name: cos-tools + mountPath: /build/cos-tools + - name: nvidia-config + mountPath: /etc/nvidia + command: + - bash + - -c + - | + echo "Checking for existing GPU driver modules" + LABELS=$( curl --retry 5 -H "Metadata-Flavor:Google" http://metadata.google.internal/computeMetadata/v1/instance/attributes/kube-labels || exit 1 ) + IFS=,; for label in $LABELS; do + IFS==; read -r LABEL VALUE <<< "$label" + if [[ "${LABEL}" == "cloud.google.com/gke-confidential-nodes-instance-type" ]]; then + CONFIDENTIAL_INSTANCE_TYPE=$VALUE + echo "${CONFIDENTIAL_INSTANCE_TYPE}" > /etc/nvidia/confidential_node_type.txt + fi + done + LABELS=$( curl --retry 5 -H "Metadata-Flavor:Google" http://metadata.google.internal/computeMetadata/v1/instance/attributes/kube-labels || exit 1 ) + IFS=,; for label in $LABELS; do + IFS==; read -r LABEL VALUE <<< "$label" + if [[ "${LABEL}" == "cloud.google.com/gke-confidential-nodes-instance-type" ]]; then + CONFIDENTIAL_INSTANCE_TYPE=$VALUE + echo "${CONFIDENTIAL_INSTANCE_TYPE}" > /etc/nvidia/confidential_node_type.txt + fi + done + if lsmod | grep nvidia; then + echo "GPU driver is already installed, the installed version may or may not be the driver version being tried to install, skipping installation" + exit 0 + else + if [[ "${CONFIDENTIAL_INSTANCE_TYPE}" == "TDX" ]]; then + echo "No GPU driver module detected, installing now" + /cos-gpu-installer install --no-verify --version=latest || exit 1 + sbin/modprobe -d /root drm_kms_helper; /sbin/insmod /usr/local/nvidia/drivers/nvidia.ko; sbin/insmod /usr/local/nvidia/drivers/nvidia-uvm.ko; /sbin/insmod /usr/local/nvidia/drivers/nvidia-modeset.ko; /sbin/insmod /usr/local/nvidia/drivers/nvidia-drm.ko + /usr/local/nvidia/bin/nvidia-modprobe -c0 -u -m + chmod 755 /root/home/kubernetes/bin/nvidia + else + echo "Confidential GPU is not supported on this VM, skipping driver installation" + fi + fi + - image: "gcr.io/gke-release/nvidia-persistenced-installer@sha256:e875101ea7bddcef6e628359e3a8f02fdfbcd05f6efe75bc7ad9457ef4020a04" + name: "nvidia-persistenced-installer" + restartPolicy: Always + securityContext: + privileged: true + env: + - name: LD_LIBRARY_PATH + value: /usr/local/nvidia/lib64 + - name: NVIDIA_INSTALL_DIR_HOST + value: /home/kubernetes/bin/nvidia + - name: NVIDIA_INSTALL_DIR_CONTAINER + value: /usr/local/nvidia + - name: VULKAN_ICD_DIR_HOST + value: /home/kubernetes/bin/nvidia/vulkan/icd.d + - name: VULKAN_ICD_DIR_CONTAINER + value: /etc/vulkan/icd.d + - name: ROOT_MOUNT_DIR + value: /root + - name: COS_TOOLS_DIR_HOST + value: /var/lib/cos-tools + - name: COS_TOOLS_DIR_CONTAINER + value: /build/cos-tools + volumeMounts: + - name: nvidia-install-dir-host + mountPath: /usr/local/nvidia + mountPropagation: HostToContainer + - name: root-mount + mountPath: /root + mountPropagation: HostToContainer + - name: nvidia-config + mountPath: /etc/nvidia + mountPropagation: HostToContainer + - name: vulkan-icd-mount + mountPath: /etc/vulkan/icd.d + - name: dev + mountPath: /dev + - name: cos-tools + mountPath: /build/cos-tools + - image: "gcr.io/gke-release/nvidia-partition-gpu@sha256:116be6b7335c1d34366223b9a3780fe80d862fcf06cd2c580426fdc1697af693" + name: partition-gpus + env: + - name: LD_LIBRARY_PATH + value: /usr/local/nvidia/lib64 + resources: + requests: + cpu: 150m + securityContext: + privileged: true + volumeMounts: + - name: nvidia-install-dir-host + mountPath: /usr/local/nvidia + - name: dev + mountPath: /dev + - name: nvidia-config + mountPath: /etc/nvidia + containers: + - image: "gke.gcr.io/pause:3.8@sha256:880e63f94b145e46f1b1082bb71b85e21f16b99b180b9996407d61240ceb9830" + name: pause \ No newline at end of file From 6c7d9272bc4246fb5ba71c910183712c81e2e583 Mon Sep 17 00:00:00 2001 From: John Paul Bunn Date: Mon, 16 Jun 2025 23:40:28 +0000 Subject: [PATCH 2/2] Remove dupes from earlier double-commit. --- .../cos/daemonset-confidential-latest.yaml | 8 -------- nvidia-driver-installer/cos/daemonset-confidential.yaml | 8 -------- 2 files changed, 16 deletions(-) diff --git a/nvidia-driver-installer/cos/daemonset-confidential-latest.yaml b/nvidia-driver-installer/cos/daemonset-confidential-latest.yaml index d1183fa60..d96987c39 100644 --- a/nvidia-driver-installer/cos/daemonset-confidential-latest.yaml +++ b/nvidia-driver-installer/cos/daemonset-confidential-latest.yaml @@ -126,14 +126,6 @@ spec: echo "${CONFIDENTIAL_INSTANCE_TYPE}" > /etc/nvidia/confidential_node_type.txt fi done - LABELS=$( curl --retry 5 -H "Metadata-Flavor:Google" http://metadata.google.internal/computeMetadata/v1/instance/attributes/kube-labels || exit 1 ) - IFS=,; for label in $LABELS; do - IFS==; read -r LABEL VALUE <<< "$label" - if [[ "${LABEL}" == "cloud.google.com/gke-confidential-nodes-instance-type" ]]; then - CONFIDENTIAL_INSTANCE_TYPE=$VALUE - echo "${CONFIDENTIAL_INSTANCE_TYPE}" > /etc/nvidia/confidential_node_type.txt - fi - done if lsmod | grep nvidia; then echo "GPU driver is already installed, the installed version may or may not be the driver version being tried to install, skipping installation" exit 0 diff --git a/nvidia-driver-installer/cos/daemonset-confidential.yaml b/nvidia-driver-installer/cos/daemonset-confidential.yaml index a512f58c0..cccaf380c 100644 --- a/nvidia-driver-installer/cos/daemonset-confidential.yaml +++ b/nvidia-driver-installer/cos/daemonset-confidential.yaml @@ -126,14 +126,6 @@ spec: echo "${CONFIDENTIAL_INSTANCE_TYPE}" > /etc/nvidia/confidential_node_type.txt fi done - LABELS=$( curl --retry 5 -H "Metadata-Flavor:Google" http://metadata.google.internal/computeMetadata/v1/instance/attributes/kube-labels || exit 1 ) - IFS=,; for label in $LABELS; do - IFS==; read -r LABEL VALUE <<< "$label" - if [[ "${LABEL}" == "cloud.google.com/gke-confidential-nodes-instance-type" ]]; then - CONFIDENTIAL_INSTANCE_TYPE=$VALUE - echo "${CONFIDENTIAL_INSTANCE_TYPE}" > /etc/nvidia/confidential_node_type.txt - fi - done if lsmod | grep nvidia; then echo "GPU driver is already installed, the installed version may or may not be the driver version being tried to install, skipping installation" exit 0