Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ presubmit: vet
@./build/check_boilerplate.sh

TAG=$(shell cat VERSION)
REGISTRY?=gcr.io/google-containers
REGISTRY?=us-west1-docker.pkg.dev/yshangguan-gke-dev/sgy-test-image
IMAGE=nvidia-gpu-device-plugin
PARTITION_GPU_IMAGE=nvidia-partition-gpu
FASTSOCKET_INSTALLER_IMAGE=fastsocket-installer
Expand Down
8 changes: 7 additions & 1 deletion cmd/nvidia_gpu/nvidia_gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
gpumanager "github.com/GoogleCloudPlatform/container-engine-accelerators/pkg/gpu/nvidia"
healthcheck "github.com/GoogleCloudPlatform/container-engine-accelerators/pkg/gpu/nvidia/health_check"
"github.com/GoogleCloudPlatform/container-engine-accelerators/pkg/gpu/nvidia/metrics"
util "github.com/GoogleCloudPlatform/container-engine-accelerators/pkg/gpu/nvidia/util"
"github.com/NVIDIA/go-nvml/pkg/nvml"
"github.com/golang/glog"
pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
Expand Down Expand Up @@ -139,7 +140,12 @@ func main() {
}

if *enableHealthMonitoring {
hc := healthcheck.NewGPUHealthChecker(ngm.ListPhysicalDevices(), ngm.Health, ngm.ListHealthCriticalXid())
kubeClient, err := util.BuildKubeClient()
if err != nil {
glog.Infof("Failed to build kube client: %v", err)
return
}
hc := healthcheck.NewGPUHealthChecker(ngm.ListPhysicalDevices(), ngm.Health, ngm.ListHealthCriticalXid(), kubeClient)
if err := hc.Start(); err != nil {
glog.Infof("Failed to start GPU Health Checker: %v", err)
return
Expand Down
95 changes: 0 additions & 95 deletions daemonset.yaml

This file was deleted.

Loading