diff --git a/helm-charts-k8s/Chart.lock b/helm-charts-k8s/Chart.lock
index be08b5b4..92deade4 100644
--- a/helm-charts-k8s/Chart.lock
+++ b/helm-charts-k8s/Chart.lock
@@ -9,4 +9,4 @@ dependencies:
   repository: file://./charts/remediation
   version: v1.0.0
 digest: sha256:41fa6a6232514acebf6abdcb1bccaf087e134b9f413b8fa33a7fec1f58a99e07
-generated: "2026-01-05T19:49:07.882445585Z"
+generated: "2026-01-07T10:51:28.442192317Z"
diff --git a/helm-charts-k8s/README.md b/helm-charts-k8s/README.md
index 203d9ef4..216745a6 100644
--- a/helm-charts-k8s/README.md
+++ b/helm-charts-k8s/README.md
@@ -13,6 +13,7 @@ AMD GPU Operator simplifies the deployment and management of AMD Instinct GPU ac
 * AMD GPU Operator Controller
 * K8s Device Plugin
 * K8s Node Labeller
+* Device Config Manager
 * Device Metrics Exporter
 * Device Test Runner
 * Node Feature Discovery Operator
@@ -24,6 +25,7 @@ AMD GPU Operator simplifies the deployment and management of AMD Instinct GPU ac
 * Comprehensive metrics collection and export
 * Easy deployment of AMD GPU device plugin for Kubernetes
 * Automated labeling of nodes with AMD GPU capabilities
+* GPU partitioning configuration and management via Device Config Manager
 * Compatibility with standard Kubernetes environments
 * Efficient GPU resource allocation for containerized workloads
 * GPU health monitoring and troubleshooting
diff --git a/helm-charts-openshift/Chart.lock b/helm-charts-openshift/Chart.lock
index d9d69628..682e0e64 100644
--- a/helm-charts-openshift/Chart.lock
+++ b/helm-charts-openshift/Chart.lock
@@ -6,4 +6,4 @@ dependencies:
   repository: file://./charts/kmm
   version: v1.0.0
 digest: sha256:25200c34a5cc846a1275e5bf3fc637b19e909dc68de938189c5278d77d03f5ac
-generated: "2025-12-09T09:27:48.895675076Z"
+generated: "2026-01-07T10:51:41.178709798Z"
diff --git a/internal/controllers/mock_remediation_handler.go b/internal/controllers/mock_remediation_handler.go
index cfff50ce..7fb93822 100644
--- a/internal/controllers/mock_remediation_handler.go
+++ b/internal/controllers/mock_remediation_handler.go
@@ -569,20 +569,6 @@ func (mr *MockremediationMgrHelperAPIMockRecorder) isRemediationDisabled(ctx, de
 	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "isRemediationDisabled", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).isRemediationDisabled), ctx, devConfig)
 }
 
-// isStatusSynced mocks base method.
-func (m *MockremediationMgrHelperAPI) isStatusSynced(ctx context.Context) bool {
-	m.ctrl.T.Helper()
-	ret := m.ctrl.Call(m, "isStatusSynced", ctx)
-	ret0, _ := ret[0].(bool)
-	return ret0
-}
-
-// isStatusSynced indicates an expected call of isStatusSynced.
-func (mr *MockremediationMgrHelperAPIMockRecorder) isStatusSynced(ctx any) *gomock.Call {
-	mr.mock.ctrl.T.Helper()
-	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "isStatusSynced", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).isStatusSynced), ctx)
-}
-
 // isWorkflowSchedulableOnNode mocks base method.
 func (m *MockremediationMgrHelperAPI) isWorkflowSchedulableOnNode(ctx context.Context, devConfig *v1alpha1.DeviceConfig, node *v1.Node, mapping ConditionWorkflowMapping) bool {
 	m.ctrl.T.Helper()
diff --git a/internal/controllers/remediation_handler.go b/internal/controllers/remediation_handler.go
index 678bf052..f77c9d6b 100644
--- a/internal/controllers/remediation_handler.go
+++ b/internal/controllers/remediation_handler.go
@@ -165,14 +165,11 @@ func (n *remediationMgr) HandleRemediation(ctx context.Context, devConfig *amdv1
 		return res, err
 	}
 
-	// If statusSynced is false, we need to populate the internal map from the status CR
-	if !n.helper.isStatusSynced(ctx) {
-		if err := n.helper.syncInternalMapFromStatusCR(ctx, devConfig.Namespace); err != nil {
-			logger.Error(err, "Failed to sync internal map from status CR")
-			return res, err
-		}
-		logger.Info("Internal map synced from status CR successfully")
+	if err := n.helper.syncInternalMapFromStatusCR(ctx, devConfig.Namespace); err != nil {
+		logger.Error(err, "Failed to sync internal map from status CR")
+		return res, err
 	}
+	logger.Info("Internal map synced from status CR successfully")
 
 	var mappingsList []ConditionWorkflowMapping
 	if err = yaml.Unmarshal([]byte(configMap.Data["workflow"]), &mappingsList); err != nil {
@@ -304,7 +301,6 @@ type remediationMgrHelperAPI interface {
 	isRecoveryPolicyViolated(ctx context.Context, nodeName string, mapping *ConditionWorkflowMapping) bool
 	canResumeWorkflowOnNode(ctx context.Context, node *v1.Node, mapping *ConditionWorkflowMapping) bool
 	syncInternalMapFromStatusCR(ctx context.Context, namespace string) error
-	isStatusSynced(ctx context.Context) bool
 	isNodeLabelledForForceResume(ctx context.Context, node *v1.Node) bool
 	removeForceResumeWorkflowLabelFromNode(ctx context.Context, node *v1.Node) error
 	isNodeLabelledForAbortWorkflow(node *v1.Node) bool
@@ -321,7 +317,6 @@ type remediationMgrHelper struct {
 	client               client.Client
 	k8sInterface         kubernetes.Interface
 	recoveryTracker      *sync.Map
-	statusSynced         bool
 	serviceAccountName   string
 	maxParallelWorkflows int
 }
@@ -332,7 +327,6 @@ func newRemediationMgrHelperHandler(client client.Client, k8sInterface kubernete
 		client:          client,
 		k8sInterface:    k8sInterface,
 		recoveryTracker: new(sync.Map),
-		statusSynced:    false,
 	}
 }
@@ -1057,6 +1051,12 @@ func (h *remediationMgrHelper) isWorkflowSchedulableOnNode(ctx context.Context,
 		logger.Info(fmt.Sprintf("Driver Install/Upgrade is in progress, skipping creation of workflow on node %s", node.Name))
 		return false
 	}
+
+	// if the remediation workflow for this node condition has already crossed the max threshold, skip the node
+	if h.isRecoveryPolicyViolated(ctx, node.Name, &mapping) {
+		logger.Info(fmt.Sprintf("Max remediation attempts reached for node %s on condition %s, skipping creation of workflow", node.Name, mapping.NodeCondition))
+		return false
+	}
 	return true
 }
@@ -1372,10 +1372,7 @@ func (h *remediationMgrHelper) syncInternalMapFromStatusCR(ctx context.Context,
 		return fmt.Errorf("failed to get remediation workflow status: %w", err)
 	}
 
-	if wfStatus.Status == nil {
-		h.statusSynced = true
-		return nil // Nothing to sync
-	}
+	h.recoveryTracker = new(sync.Map)
 
 	for nodeName, conditions := range wfStatus.Status {
 		for nodeCondition, attempts := range conditions {
@@ -1393,14 +1390,9 @@
 		}
 	}
 
-	h.statusSynced = true
 	return nil
 }
 
-func (h *remediationMgrHelper) isStatusSynced(ctx context.Context) bool {
-	return h.statusSynced
-}
-
 func (h *remediationMgrHelper) isRecoveryPolicyViolated(ctx context.Context, nodeName string, mapping *ConditionWorkflowMapping) bool {
 	logger := log.FromContext(ctx)
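The handler now rebuilds `recoveryTracker` from the `RemediationWorkflowStatus` CR on every reconcile (note the unconditional `h.recoveryTracker = new(sync.Map)` reset), which is what makes the `statusSynced` flag and its mock removable. The body of `isRecoveryPolicyViolated` falls outside this diff, so the following is only a sketch of how the new gate in `isWorkflowSchedulableOnNode` plausibly consults the tracker; the key scheme, the attempt ceiling, and the stored value type are assumptions, not code from this PR:

```go
// Hypothetical sketch only: recoveryTracker, ConditionWorkflowMapping, and
// WorkflowMetadata come from this diff; everything else is assumed.
func (h *remediationMgrHelper) recoveryPolicyViolatedSketch(nodeName string, mapping *ConditionWorkflowMapping) bool {
	const maxRemediationAttempts = 4 // assumed ceiling; the real limit likely comes from configuration

	// Assumed key scheme, mirroring the "<node>-<condition>" naming used by
	// the e2e helper addRemediationWorkflowStatusMetaData further below.
	key := nodeName + "-" + mapping.NodeCondition

	val, ok := h.recoveryTracker.Load(key)
	if !ok {
		return false // no remediation attempts recorded for this node/condition yet
	}
	attempts, ok := val.([]v1alpha1.WorkflowMetadata)
	if !ok {
		return false
	}
	// The policy is violated once recorded attempts reach the ceiling; the
	// production check may additionally apply a time window over StartTime.
	return len(attempts) >= maxRemediationAttempts
}
```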
diff --git a/tests/e2e/client/client.go b/tests/e2e/client/client.go
index 2de113e3..e56d5ed3 100644
--- a/tests/e2e/client/client.go
+++ b/tests/e2e/client/client.go
@@ -19,6 +19,7 @@ package client
 import (
 	"context"
 	"encoding/json"
+	"fmt"
 
 	"github.com/ROCm/gpu-operator/api/v1alpha1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -382,3 +383,91 @@ func (c *deviceConfigsClient) Delete(name string) (*v1alpha1.DeviceConfig, error
 
 	return &result, err
 }
+
+type RemediationWorkflowStatusClient struct {
+	restClient rest.Interface
+}
+
+func NewWfStatusClient(c *rest.Config) (*RemediationWorkflowStatusClient, error) {
+	config := *c
+	config.ContentConfig.GroupVersion = &v1alpha1.GroupVersion
+	config.APIPath = "/apis"
+	config.NegotiatedSerializer = scheme.Codecs.WithoutConversion()
+	config.UserAgent = rest.DefaultKubernetesUserAgent()
+
+	client, err := rest.RESTClientFor(&config)
+	if err != nil {
+		return nil, err
+	}
+
+	return &RemediationWorkflowStatusClient{restClient: client}, nil
+}
+
+func (c *RemediationWorkflowStatusClient) Create(rwfstatus *v1alpha1.RemediationWorkflowStatus) (*v1alpha1.RemediationWorkflowStatus, error) {
+	result := v1alpha1.RemediationWorkflowStatus{}
+	rwfstatus.TypeMeta = metav1.TypeMeta{
+		Kind:       "RemediationWorkflowStatus",
+		APIVersion: "amd.com/v1alpha1",
+	}
+	err := c.restClient.
+		Post().
+		Namespace(rwfstatus.Namespace).
+		Resource("remediationworkflowstatuses").
+		Body(rwfstatus).
+		Do(context.TODO()).
+		Into(&result)
+	return &result, err
+}
+
+func (c *RemediationWorkflowStatusClient) Update(rwfstatus *v1alpha1.RemediationWorkflowStatus) (*v1alpha1.RemediationWorkflowStatus, error) {
+	result := v1alpha1.RemediationWorkflowStatus{}
+	rwfstatus.TypeMeta = metav1.TypeMeta{
+		Kind:       "RemediationWorkflowStatus",
+		APIVersion: "amd.com/v1alpha1",
+	}
+	err := c.restClient.
+		Put().
+		Namespace(rwfstatus.Namespace).
+		Resource("remediationworkflowstatuses").
+		Name(rwfstatus.Name).
+		SubResource("status").
+		Body(rwfstatus).
+		Do(context.TODO()).
+		Into(&result)
+
+	if err != nil {
+		return nil, fmt.Errorf("failed to update status: %w", err)
+	}
+
+	return &result, nil
+}
+
+func (c *RemediationWorkflowStatusClient) Get(name, namespace string) (*v1alpha1.RemediationWorkflowStatus, error) {
+	result := v1alpha1.RemediationWorkflowStatus{}
+	err := c.restClient.
+		Get().
+		Namespace(namespace).
+		Resource("remediationworkflowstatuses").
+		Name(name).
+		Do(context.TODO()).
+		Into(&result)
+
+	return &result, err
+}
+
+func (c *RemediationWorkflowStatusClient) Delete(name string, namespace string) (*v1alpha1.RemediationWorkflowStatus, error) {
+	result := v1alpha1.RemediationWorkflowStatus{}
+	// delete the named object rather than issuing a collection-wide delete
+	err := c.restClient.
+		Delete().
+		Namespace(namespace).
+		Resource("remediationworkflowstatuses").
+		Name(name).
+		Do(context.TODO()).
+		Into(&result)
+
+	return &result, err
+}
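A quick usage sketch for the new REST client, separate from the suite wiring shown below in `SetUpSuite`. The kubeconfig path and the `kube-amd-gpu` namespace are placeholders; the object name `default` matches what the e2e helpers use:

```go
package main

import (
	"fmt"

	"github.com/ROCm/gpu-operator/tests/e2e/client"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	// Build a rest.Config from a kubeconfig (path is a placeholder).
	cfg, err := clientcmd.BuildConfigFromFlags("", "/root/.kube/config")
	if err != nil {
		panic(err)
	}

	wfStatusClient, err := client.NewWfStatusClient(cfg)
	if err != nil {
		panic(err)
	}

	// Read the per-node, per-condition remediation attempt bookkeeping.
	status, err := wfStatusClient.Get("default", "kube-amd-gpu")
	if err != nil {
		fmt.Println("RemediationWorkflowStatus CR not found:", err)
		return
	}
	for node, conditions := range status.Status {
		for condition, attempts := range conditions {
			fmt.Printf("%s / %s: %d recorded attempt(s)\n", node, condition, len(attempts))
		}
	}
}
```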
diff --git a/tests/e2e/cluster_test.go b/tests/e2e/cluster_test.go
index c8801b8b..50b54c22 100644
--- a/tests/e2e/cluster_test.go
+++ b/tests/e2e/cluster_test.go
@@ -122,6 +122,9 @@ func (s *E2ESuite) getDeviceConfig(c *C) *v1alpha1.DeviceConfig {
 				Port: 5001,
 			},
 			Selector: map[string]string{"feature.node.kubernetes.io/amd-gpu": "true"},
+			CommonConfig: v1alpha1.CommonConfigSpec{
+				InitContainerImage: initContainerImage,
+			},
 		},
 	}
 	insecure := true
diff --git a/tests/e2e/doc.go b/tests/e2e/doc.go
index cce60e3c..2161d9ca 100644
--- a/tests/e2e/doc.go
+++ b/tests/e2e/doc.go
@@ -42,4 +42,5 @@ type E2ESuite struct {
 	monClient      monitoringClient.Interface
 	apiClientSet   *apiextClient.Clientset
 	framework      string
+	wfStatusClient *client.RemediationWorkflowStatusClient
 }
diff --git a/tests/e2e/e2e_test.go b/tests/e2e/e2e_test.go
index c1e522b6..89b8d397 100644
--- a/tests/e2e/e2e_test.go
+++ b/tests/e2e/e2e_test.go
@@ -108,6 +108,12 @@ func (s *E2ESuite) SetUpSuite(c *C) {
 	}
 	s.dClient = dcCli
 
+	wfstatusClient, err := client.NewWfStatusClient(config)
+	if err != nil {
+		c.Fatalf("Error: %v", err.Error())
+	}
+	s.wfStatusClient = wfstatusClient
+
 	err = apiextv1.AddToScheme(scheme.Scheme)
 	if err != nil {
 		c.Fatalf("Error: %v", err.Error())
diff --git a/tests/e2e/remediation_test.go b/tests/e2e/remediation_test.go
index 9dae48ce..9955685f 100644
--- a/tests/e2e/remediation_test.go
+++ b/tests/e2e/remediation_test.go
@@ -22,6 +22,8 @@ import (
 	"strings"
 	"time"
 
+	"github.com/ROCm/gpu-operator/api/v1alpha1"
+	"github.com/ROCm/gpu-operator/tests/e2e/utils"
 	wfv1alpha1 "github.com/argoproj/argo-workflows/v3/pkg/apis/workflow/v1alpha1"
 	"github.com/stretchr/testify/assert"
 	. "gopkg.in/check.v1"
@@ -30,9 +32,11 @@ import (
 )
 
 const (
-	remediationNodeCondition    = "AMDGPUHardwareAssertionHwa"
-	npdInbandRASConfigPath      = "./yamls/config/npd/node-problem-detector-config-inband.yaml"
-	npdInbandRASErrorConfigPath = "./yamls/config/npd/node-problem-detector-error-config-inband.yaml"
+	conditionHWAssertion         = "AMDGPUHardwareAssertionHwa"
+	conditionInternalError       = "AMDGPUDeviceInternalError"
+	npdInbandRASConfigPath       = "./yamls/config/npd/node-problem-detector-config-inband.yaml"
+	npdInbandRASErrorConfigPath  = "./yamls/config/npd/node-problem-detector-error-config-inband.yaml"
+	npdInband2RASErrorConfigPath = "./yamls/config/npd/node-problem-detector-error-config-inband2.yaml"
 )
 
 func (s *E2ESuite) verifyRemediationWorkflowStatus(c *C, nodeName, status string, waitTime int) {
@@ -51,8 +55,106 @@ func (s *E2ESuite) verifyRemediationWorkflowStatus(c *C, nodeName, status string
 	}, time.Duration(waitTime)*time.Minute, 10*time.Second, "Remediation workflow did not reach expected status")
 }
 
+func (s *E2ESuite) checkWorkflowExistence(c *C, nodeName string, shouldExist bool) bool {
+	wfs, err := s.wfClient.ArgoprojV1alpha1().Workflows(s.ns).List(context.Background(), metav1.ListOptions{})
+	if err != nil {
+		logger.Infof("Error listing workflows: %v", err)
+		return false
+	}
+	exists := false
+	for _, wf := range wfs.Items {
+		if strings.Contains(wf.Name, nodeName) {
+			exists = true
+			break
+		}
+	}
+	return exists == shouldExist
+}
+
+func (s *E2ESuite) isWorkflowSuspended(c *C, nodeName string) bool {
+	wfs, err := s.wfClient.ArgoprojV1alpha1().Workflows(s.ns).List(context.Background(), metav1.ListOptions{})
+	if err != nil || len(wfs.Items) == 0 {
+		logger.Infof("Error listing workflows: %v", err)
+		return false
+	}
+	// find the workflow that belongs to this node; report false if none matches
+	var wf *wfv1alpha1.Workflow
+	for i := range wfs.Items {
+		if strings.Contains(wfs.Items[i].Name, nodeName) {
+			wf = &wfs.Items[i]
+			break
+		}
+	}
+	if wf == nil {
+		return false
+	}
+	for _, nodeStatus := range wf.Status.Nodes {
+		if nodeStatus.Type == "Suspend" && nodeStatus.Phase == "Running" {
+			return true
+		}
+	}
+	return false
+}
+
+func (s *E2ESuite) populateDeviceConfig(c *C) *v1alpha1.DeviceConfig {
+	driverEnable := false
+	remediationEnable := true
+	devCfg := s.getDeviceConfig(c)
+	devCfg.Spec.Driver.Enable = &driverEnable
+	devCfg.Spec.RemediationWorkflow.Enable = &remediationEnable
+	devCfg.Spec.MetricsExporter.Enable = &remediationEnable
+	devCfg.Spec.MetricsExporter.Image = exporterImage
+	devCfg.Spec.MetricsExporter.ImagePullPolicy = "Always"
+	devCfg.Spec.MetricsExporter.Port = 5000
+	devCfg.Spec.CommonConfig.UtilsContainer.Image = utilsContainerImage
+	devCfg.Spec.CommonConfig.UtilsContainer.ImagePullPolicy = "Always"
+	return devCfg
+}
+
+func (s *E2ESuite) addRemediationWorkflowStatusMetaData(ns, nodeName, nodeCondition string, metadataCount int, c *C) {
+	// Fetch the RemediationWorkflowStatus object; if it is not found, create a new one
+	wfstatus, err := s.wfStatusClient.Get("default", ns)
+	if err != nil {
+		logger.Infof("RemediationWorkflowStatus CR not found, creating a new one")
+		wfstatus.Name = "default"
+		wfstatus.Namespace = ns
+		wfstatus, err = s.wfStatusClient.Create(wfstatus)
+		assert.NoError(c, err, "Failed to create remediation workflow status")
+		if err != nil {
+			return
+		}
+	}
+	wfMetaData := make([]v1alpha1.WorkflowMetadata, 0)
+	for i := 0; i < metadataCount; i++ {
+		data := v1alpha1.WorkflowMetadata{
+			Name:      fmt.Sprintf("%s-%s", nodeName, nodeCondition),
+			StartTime: time.Now().UTC().Format("2006-01-02 15:04:05 UTC"),
+		}
+		wfMetaData = append(wfMetaData, data)
+	}
+	ncmap := make(map[string][]v1alpha1.WorkflowMetadata)
+	ncmap[nodeCondition] = wfMetaData
+	// replace any existing status with the synthetic attempt metadata
+	wfstatus.Status = make(map[string]map[string][]v1alpha1.WorkflowMetadata)
+	wfstatus.Status[nodeName] = ncmap
+	_, err = s.wfStatusClient.Update(wfstatus)
+	assert.NoError(c, err, "Failed to add metadata to remediation workflow status CR")
+}
+
+func (s *E2ESuite) untaintNode(nodeName string) {
+	cmd := fmt.Sprintf("kubectl taint node %s amd-gpu-unhealthy:NoSchedule-", nodeName)
+	utils.RunCommand(cmd)
+}
+
+func (s *E2ESuite) clearRemediationWorkflowStatusMetaData(ns string, c *C) {
+	wfstatus, err := s.wfStatusClient.Get("default", ns)
+	if err != nil {
+		logger.Infof("RemediationWorkflowStatus object is not found")
+		return
+	}
+	wfstatus.Status = make(map[string]map[string][]v1alpha1.WorkflowMetadata)
+	_, err = s.wfStatusClient.Update(wfstatus)
+	assert.NoError(c, err, "Failed to clear metadata from remediation workflow status CR")
+}
+
 func (s *E2ESuite) TestAutoNodeRemediationWithoutPhysicalAction(c *C) {
-	logger.Infof("Starting Auto Node Remediation Test")
+	logger.Infof("Starting Auto Node Remediation Test without physical action")
 	if s.simEnable {
 		c.Skip("Skipping for non amd gpu testbed")
 	}
@@ -69,17 +171,7 @@ func (s *E2ESuite) TestAutoNodeRemediationWithoutPhysicalAction(c *C) {
 	_, err = s.dClient.DeviceConfigs(s.ns).Get(s.cfgName, metav1.GetOptions{})
 	assert.Errorf(c, err, fmt.Sprintf("expected no config to be present. but config %v exists", s.cfgName))
but config %v exists", s.cfgName)) - driverEnable := false - remediationEnable := true - devCfg := s.getDeviceConfig(c) - devCfg.Spec.Driver.Enable = &driverEnable - devCfg.Spec.RemediationWorkflow.Enable = &remediationEnable - devCfg.Spec.MetricsExporter.Enable = &remediationEnable - devCfg.Spec.MetricsExporter.Image = exporterImage - devCfg.Spec.MetricsExporter.ImagePullPolicy = "Always" - devCfg.Spec.MetricsExporter.Port = 5000 - devCfg.Spec.CommonConfig.UtilsContainer.Image = utilsContainerImage - devCfg.Spec.CommonConfig.UtilsContainer.ImagePullPolicy = "Always" + devCfg := s.populateDeviceConfig(c) logger.Infof("Creating DeviceConfig with remediation enabled and driver disabled") s.createDeviceConfig(devCfg, c) @@ -97,16 +189,16 @@ func (s *E2ESuite) TestAutoNodeRemediationWithoutPhysicalAction(c *C) { logger.Infof("Verify if Node Problem Detector (NPD) is running on all GPU nodes") s.verifyNPDRunning(c) - logger.Infof("Verifying that node condition %s is added for the node %s", remediationNodeCondition, nodeName) - s.verifyNodeCondition(c, remediationNodeCondition, corev1.ConditionTrue) + logger.Infof("Verifying that node condition %s is added for the node %s", conditionHWAssertion, nodeName) + s.verifyNodeCondition(c, conditionHWAssertion, corev1.ConditionFalse) // Trigger error condition by modifying NPD config logger.Infof("Edit Node Problem Detector (NPD) thresholds to simulate error condition") s.updateConfigForNPD(c, npdInbandRASConfigPath, npdInbandRASErrorConfigPath) - s.verifyNodeCondition(c, remediationNodeCondition, corev1.ConditionTrue) + s.verifyNodeCondition(c, conditionHWAssertion, corev1.ConditionTrue) - // Verify remediation workflow started and completed + // Verify remediation workflow is started and running logger.Infof("Verifying remediation workflow started on the node %s", nodeName) s.verifyRemediationWorkflowStatus(c, nodeName, string(wfv1alpha1.WorkflowRunning), 5) @@ -114,9 +206,240 @@ func (s *E2ESuite) TestAutoNodeRemediationWithoutPhysicalAction(c *C) { logger.Infof("Reverting Node Problem Detector (NPD) thresholds to original configuration") s.updateConfigForNPD(c, npdInbandRASErrorConfigPath, npdInbandRASConfigPath) + //verify workflow succeeded + logger.Infof("Waiting for remediation workflow to complete on the node %s", nodeName) + s.verifyRemediationWorkflowStatus(c, nodeName, string(wfv1alpha1.WorkflowSucceeded), 70) + + logger.Infof("Verifying that node condition %s is false on the node %s", conditionHWAssertion, nodeName) + s.verifyNodeCondition(c, conditionHWAssertion, corev1.ConditionFalse) +} + +func (s *E2ESuite) TestAutoNodeRemediationWithPhysicalAction(c *C) { + logger.Infof("Starting Auto Node Remediation Test with physical action") + if s.simEnable { + c.Skip("Skipping for non amd gpu testbed") + } + + nodes, err := s.clientSet.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{ + LabelSelector: "feature.node.kubernetes.io/amd-gpu=true", + }) + assert.NoError(c, err, "Failed to list nodes with AMD GPU label") + if len(nodes.Items) == 0 { + c.Fatalf("No nodes found with AMD GPU label") + } + nodeName := nodes.Items[0].Name + + _, err = s.dClient.DeviceConfigs(s.ns).Get(s.cfgName, metav1.GetOptions{}) + assert.Errorf(c, err, fmt.Sprintf("expected no config to be present. 
but config %v exists", s.cfgName)) + + devCfg := s.populateDeviceConfig(c) + + logger.Infof("Creating DeviceConfig with remediation enabled and driver disabled") + s.createDeviceConfig(devCfg, c) + s.checkMetricsExporterStatus(devCfg, s.ns, corev1.ServiceTypeClusterIP, c) + + // Wait for cluster to be up + logger.Infof("Waiting for device config to be applied") + time.Sleep(5 * time.Second) + + // Setup NPD + logger.Infof("Setting up Node Problem Detector (NPD)") + setupNPD(npdServiceAccountPath, npdInbandRASConfigPath, npdDaemonSetPath) + defer tearDownNPD(npdServiceAccountPath, npdInbandRASConfigPath, npdDaemonSetPath) + + logger.Infof("Verify if Node Problem Detector (NPD) is running on all GPU nodes") + s.verifyNPDRunning(c) + + logger.Infof("Verifying that node condition %s is added for the node %s", conditionInternalError, nodeName) + s.verifyNodeCondition(c, conditionInternalError, corev1.ConditionFalse) + + // Trigger error condition by modifying NPD config + logger.Infof("Edit Node Problem Detector (NPD) thresholds to simulate error condition") + s.updateConfigForNPD(c, npdInbandRASConfigPath, npdInband2RASErrorConfigPath) + + s.verifyNodeCondition(c, conditionInternalError, corev1.ConditionTrue) + + // Verify remediation workflow started + logger.Infof("Verifying remediation workflow started on the node %s", nodeName) + s.verifyRemediationWorkflowStatus(c, nodeName, string(wfv1alpha1.WorkflowRunning), 5) + + //verify workflow is suspended waiting for physical action + logger.Infof("Verifying remediation workflow is suspended on the node %s", nodeName) + assert.Eventually(c, func() bool { + return s.isWorkflowSuspended(c, nodeName) + }, 5*time.Minute, 10*time.Second, "Remediation workflow did not reach suspended state") + + // resume workflow by adding label to node + err = utils.AddNodeLabel(s.clientSet, nodeName, "operator.amd.com/gpu-force-resume-workflow", "true") + assert.NoError(c, err, "Failed to add label to resume workflow") + + logger.Infof("Reverting Node Problem Detector (NPD) thresholds to original configuration") + s.updateConfigForNPD(c, npdInband2RASErrorConfigPath, npdInbandRASConfigPath) + logger.Infof("Waiting for remediation workflow to complete on the node %s", nodeName) s.verifyRemediationWorkflowStatus(c, nodeName, string(wfv1alpha1.WorkflowSucceeded), 70) - logger.Infof("Verifying that node condition %s is false on the node %s", remediationNodeCondition, nodeName) - s.verifyNodeCondition(c, remediationNodeCondition, corev1.ConditionFalse) + logger.Infof("Verifying that node condition %s is false on the node %s", conditionInternalError, nodeName) + s.verifyNodeCondition(c, conditionInternalError, corev1.ConditionFalse) +} + +func (s *E2ESuite) TestAutoNodeRemediationAbortWorkflow(c *C) { + logger.Infof("Starting Auto Node Remediation abort workflow test") + if s.simEnable { + c.Skip("Skipping for non amd gpu testbed") + } + + nodes, err := s.clientSet.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{ + LabelSelector: "feature.node.kubernetes.io/amd-gpu=true", + }) + assert.NoError(c, err, "Failed to list nodes with AMD GPU label") + if len(nodes.Items) == 0 { + c.Fatalf("No nodes found with AMD GPU label") + } + nodeName := nodes.Items[0].Name + + _, err = s.dClient.DeviceConfigs(s.ns).Get(s.cfgName, metav1.GetOptions{}) + assert.Errorf(c, err, fmt.Sprintf("expected no config to be present. 
but config %v exists", s.cfgName)) + + devCfg := s.populateDeviceConfig(c) + + logger.Infof("Creating DeviceConfig with remediation enabled and driver disabled") + s.createDeviceConfig(devCfg, c) + s.checkMetricsExporterStatus(devCfg, s.ns, corev1.ServiceTypeClusterIP, c) + + // Wait for cluster to be up + logger.Infof("Waiting for device config to be applied") + time.Sleep(5 * time.Second) + + // Setup NPD + logger.Infof("Setting up Node Problem Detector (NPD)") + setupNPD(npdServiceAccountPath, npdInbandRASConfigPath, npdDaemonSetPath) + defer tearDownNPD(npdServiceAccountPath, npdInbandRASConfigPath, npdDaemonSetPath) + + logger.Infof("Verify if Node Problem Detector (NPD) is running on all GPU nodes") + s.verifyNPDRunning(c) + + logger.Infof("Verifying that node condition %s is added for the node %s", conditionInternalError, nodeName) + s.verifyNodeCondition(c, conditionInternalError, corev1.ConditionFalse) + + // Trigger error condition by modifying NPD config + logger.Infof("Edit Node Problem Detector (NPD) thresholds to simulate error condition") + s.updateConfigForNPD(c, npdInbandRASConfigPath, npdInband2RASErrorConfigPath) + + s.verifyNodeCondition(c, conditionInternalError, corev1.ConditionTrue) + + // Verify remediation workflow started + logger.Infof("Verifying remediation workflow started on the node %s", nodeName) + s.verifyRemediationWorkflowStatus(c, nodeName, string(wfv1alpha1.WorkflowRunning), 5) + + //verify workflow is suspended waiting for physical action + logger.Infof("Verifying remediation workflow is suspended on the node %s", nodeName) + assert.Eventually(c, func() bool { + return s.isWorkflowSuspended(c, nodeName) + }, 5*time.Minute, 10*time.Second, "Remediation workflow did not reach suspended state") + + // abort workflow by adding label to node + err = utils.AddNodeLabel(s.clientSet, nodeName, "operator.amd.com/gpu-abort-workflow", "true") + assert.NoError(c, err, "Failed to add label to abort workflow") + + logger.Infof("Reverting Node Problem Detector (NPD) thresholds to original configuration") + s.updateConfigForNPD(c, npdInband2RASErrorConfigPath, npdInbandRASConfigPath) + + //verify workflow is aborted and deleted + logger.Infof("Verifying remediation workflow is aborted and deleted on the node %s", nodeName) + assert.Eventually(c, func() bool { + return s.checkWorkflowExistence(c, nodeName, false) + }, 1*time.Minute, 10*time.Second, "Remediation workflow was not aborted and deleted") + s.untaintNode(nodeName) +} + +func (s *E2ESuite) TestAutoNodeRemediationRecoveryPolicy(c *C) { + logger.Infof("Starting Auto Node Remediation recovery policy test") + if s.simEnable { + c.Skip("Skipping for non amd gpu testbed") + } + + nodes, err := s.clientSet.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{ + LabelSelector: "feature.node.kubernetes.io/amd-gpu=true", + }) + assert.NoError(c, err, "Failed to list nodes with AMD GPU label") + if len(nodes.Items) == 0 { + c.Fatalf("No nodes found with AMD GPU label") + } + nodeName := nodes.Items[0].Name + + _, err = s.dClient.DeviceConfigs(s.ns).Get(s.cfgName, metav1.GetOptions{}) + assert.Errorf(c, err, fmt.Sprintf("expected no config to be present. 
but config %v exists", s.cfgName)) + + devCfg := s.populateDeviceConfig(c) + + //Clear previous state before starting the test + logger.Infof("Clean-up RemediationWorkflowStatus CR before the test") + s.clearRemediationWorkflowStatusMetaData(devCfg.Namespace, c) + + // Pre-populate RemediationWorkflowStatus with max retries + logger.Infof("Pre-populate RemediationWorkflowStatus with max retries for node %s and condition %s", nodeName, conditionInternalError) + s.addRemediationWorkflowStatusMetaData(devCfg.Namespace, nodeName, conditionInternalError, 4, c) + + logger.Infof("Creating DeviceConfig with remediation enabled and driver disabled") + s.createDeviceConfig(devCfg, c) + s.checkMetricsExporterStatus(devCfg, s.ns, corev1.ServiceTypeClusterIP, c) + + // Wait for cluster to be up + logger.Infof("Waiting for device config to be applied") + time.Sleep(5 * time.Second) + + // Setup NPD + logger.Infof("Setting up Node Problem Detector (NPD)") + setupNPD(npdServiceAccountPath, npdInbandRASConfigPath, npdDaemonSetPath) + defer tearDownNPD(npdServiceAccountPath, npdInbandRASConfigPath, npdDaemonSetPath) + + logger.Infof("Verify if Node Problem Detector (NPD) is running on all GPU nodes") + s.verifyNPDRunning(c) + + logger.Infof("Verifying that node condition %s is added for the node %s", conditionInternalError, nodeName) + s.verifyNodeCondition(c, conditionInternalError, corev1.ConditionFalse) + + // Trigger error condition by modifying NPD config + logger.Infof("Edit Node Problem Detector (NPD) thresholds to simulate error condition") + s.updateConfigForNPD(c, npdInbandRASConfigPath, npdInband2RASErrorConfigPath) + + s.verifyNodeCondition(c, conditionInternalError, corev1.ConditionTrue) + + // Verify remediation workflow is not started due to max retries reached + logger.Infof("Verifying remediation workflow is not started on the node %s due to max retries reached", nodeName) + assert.Eventually(c, func() bool { + return s.checkWorkflowExistence(c, nodeName, false) + }, 2*time.Minute, 10*time.Second, "Remediation workflow was started despite max retries reached") + + // Clear RemediationWorkflowStatus metadata + logger.Infof("Clearing RemediationWorkflowStatus metadata for node %s and condition %s", nodeName, conditionInternalError) + s.clearRemediationWorkflowStatusMetaData(devCfg.Namespace, c) + + // Verify remediation workflow is started and running now + logger.Infof("Verifying remediation workflow is started on the node %s after clearing metadata", nodeName) + assert.Eventually(c, func() bool { + return s.checkWorkflowExistence(c, nodeName, true) + }, 2*time.Minute, 10*time.Second, "Remediation workflow was started despite max retries reached") + + //verify workflow is suspended waiting for physical action + logger.Infof("Verifying remediation workflow is suspended on the node %s", nodeName) + assert.Eventually(c, func() bool { + return s.isWorkflowSuspended(c, nodeName) + }, 3*time.Minute, 10*time.Second, "Remediation workflow did not reach suspended state") + + // abort workflow by adding label to node + logger.Infof("Aborting the suspended workflow") + err = utils.AddNodeLabel(s.clientSet, nodeName, "operator.amd.com/gpu-abort-workflow", "true") + assert.NoError(c, err, "Failed to add label to abort workflow") + + logger.Infof("Reverting Node Problem Detector (NPD) thresholds to original configuration") + s.updateConfigForNPD(c, npdInband2RASErrorConfigPath, npdInbandRASConfigPath) + + //verify workflow is aborted and deleted + logger.Infof("Verifying remediation workflow is aborted 
+	assert.Eventually(c, func() bool {
+		return s.checkWorkflowExistence(c, nodeName, false)
+	}, 1*time.Minute, 10*time.Second, "Remediation workflow was not aborted and deleted")
+	s.untaintNode(nodeName)
 }
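One design note on the recovery-policy test above: it infers the policy purely from workflow existence. Since the suite now carries `wfStatusClient`, a follow-up could also assert the attempt bookkeeping directly; a sketch (the helper name and expectation are hypothetical, not part of this change):

```go
// Hypothetical helper: wait until at least `want` attempts are recorded for a
// node/condition pair in the RemediationWorkflowStatus CR named "default".
func (s *E2ESuite) waitForRecordedAttempts(c *C, ns, nodeName, nodeCondition string, want int) {
	assert.Eventually(c, func() bool {
		wfstatus, err := s.wfStatusClient.Get("default", ns)
		if err != nil {
			return false
		}
		// Reads on nil nested maps are safe in Go and simply yield zero values.
		return len(wfstatus.Status[nodeName][nodeCondition]) >= want
	}, 2*time.Minute, 10*time.Second, "expected remediation attempts were not recorded in the status CR")
}
```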
+ "-t=-1" + ], + "timeout": "15s" + } + ] + } +kind: ConfigMap +metadata: + name: node-problem-detector-config + namespace: kube-system