diff --git a/api/v1alpha1/deviceconfig_types.go b/api/v1alpha1/deviceconfig_types.go
index 3b6c6ea6..c4f2beaa 100644
--- a/api/v1alpha1/deviceconfig_types.go
+++ b/api/v1alpha1/deviceconfig_types.go
@@ -103,6 +103,12 @@ type RemediationWorkflowSpec struct {
 	// +optional
 	// +kubebuilder:validation:Pattern=`^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$`
 	TesterImage string `json:"testerImage,omitempty"`
+
+	// MaxParallelWorkflows specifies the limit on how many remediation workflows can be executed in parallel. The default value is 0, which means no limit.
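+	// The configured value is applied to the Argo workflow controller's "parallelism" setting by the operator.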
+	//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="MaxParallelWorkflows",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:maxParallelWorkflows"}
+	// +optional
+	MaxParallelWorkflows int `json:"maxParallelWorkflows"`
 }
 
 type RegistryTLS struct {
diff --git a/api/v1alpha1/remediationwf_types.go b/api/v1alpha1/remediationwf_types.go
index 36234ca2..528f135a 100644
--- a/api/v1alpha1/remediationwf_types.go
+++ b/api/v1alpha1/remediationwf_types.go
@@ -25,11 +25,14 @@ import (
 
 //+kubebuilder:subresource:status
 
 // RemediationWorkflowStatus keeps a record of recent remediation workflow runs.
+// We maintain this information to avoid re-running remediation workflows on nodes where a pre-defined threshold is crossed.
 // +operator-sdk:csv:customresourcedefinitions:displayName="RemediationWorkflowStatus",resources={{Module,v1beta1,modules.kmm.sigs.x-k8s.io},{Daemonset,v1,apps},{services,v1,core},{Pod,v1,core}}
 type RemediationWorkflowStatus struct {
 	metav1.TypeMeta   `json:",inline"`
 	metav1.ObjectMeta `json:"metadata,omitempty"`
 
+	// Status field holds the remediation workflow run history for each node and node condition.
+	// The key is the node name; the value is a map keyed by node condition, whose value is a list of workflow metadata (workflow name and its start time).
 	Status map[string]map[string][]WorkflowMetadata `json:"status,omitempty"`
 }
diff --git a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
index 94294cea..bc7e0a73 100644
--- a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
+++ b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
@@ -32,7 +32,7 @@ metadata:
     capabilities: Seamless Upgrades
     categories: AI/Machine Learning,Monitoring
     containerImage: docker.io/rocm/gpu-operator:v1.4.0
-    createdAt: "2025-11-03T10:08:51Z"
+    createdAt: "2025-12-09T09:27:50Z"
     description: |-
       Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter
       For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/)
@@ -718,6 +718,13 @@ spec:
         path: remediationWorkflow.enable
         x-descriptors:
         - urn:alm:descriptor:com.amd.deviceconfigs:enable
+      - description: MaxParallelWorkflows specifies the limit on how many remediation
+          workflows can be executed in parallel. The default value is 0, which
+          means no limit.
+        displayName: MaxParallelWorkflows
+        path: remediationWorkflow.maxParallelWorkflows
+        x-descriptors:
+        - urn:alm:descriptor:com.amd.deviceconfigs:maxParallelWorkflows
       - description: Tester image used to run tests and verify if remediation fixed
           the reported problem.
         displayName: TesterImage
diff --git a/bundle/manifests/amd.com_deviceconfigs.yaml b/bundle/manifests/amd.com_deviceconfigs.yaml
index d4941f3b..ae180f96 100644
--- a/bundle/manifests/amd.com_deviceconfigs.yaml
+++ b/bundle/manifests/amd.com_deviceconfigs.yaml
@@ -1394,6 +1394,11 @@ spec:
                     enable remediation workflows. disabled by default enable if operator
                     should automatically handle remediation of node incase of gpu issues
                   type: boolean
+                maxParallelWorkflows:
+                  description: MaxParallelWorkflows specifies the limit on how many
+                    remediation workflows can be executed in parallel. The default
+                    value is 0, which means no limit.
+                  type: integer
                 testerImage:
                   description: Tester image used to run tests and verify if remediation
                     fixed the reported problem.
diff --git a/bundle/manifests/amd.com_remediationworkflowstatuses.yaml b/bundle/manifests/amd.com_remediationworkflowstatuses.yaml
index 7becfc99..a3678a9c 100644
--- a/bundle/manifests/amd.com_remediationworkflowstatuses.yaml
+++ b/bundle/manifests/amd.com_remediationworkflowstatuses.yaml
@@ -23,8 +23,9 @@ spec:
   - name: v1alpha1
     schema:
       openAPIV3Schema:
-        description: RemediationWorkflowStatus keeps a record of recent remediation
-          workflow runs.
+        description: |-
+          RemediationWorkflowStatus keeps a record of recent remediation workflow runs.
+          We maintain this information to avoid re-running remediation workflows on nodes where a pre-defined threshold is crossed.
         properties:
           apiVersion:
             description: |-
@@ -55,6 +56,9 @@ spec:
                   type: object
                 type: array
               type: object
+            description: |-
+              Status field holds the remediation workflow run history for each node and node condition.
+              The key is the node name; the value is a map keyed by node condition, whose value is a list of workflow metadata (workflow name and its start time).
             type: object
         type: object
     served: true
diff --git a/config/crd/bases/amd.com_deviceconfigs.yaml b/config/crd/bases/amd.com_deviceconfigs.yaml
index c4bf1868..80e1beb1 100644
--- a/config/crd/bases/amd.com_deviceconfigs.yaml
+++ b/config/crd/bases/amd.com_deviceconfigs.yaml
@@ -1390,6 +1390,11 @@ spec:
                     enable remediation workflows. disabled by default enable if operator
                     should automatically handle remediation of node incase of gpu issues
                   type: boolean
+                maxParallelWorkflows:
+                  description: MaxParallelWorkflows specifies the limit on how many
+                    remediation workflows can be executed in parallel. The default
+                    value is 0, which means no limit.
+                  type: integer
                 testerImage:
                   description: Tester image used to run tests and verify if remediation
                     fixed the reported problem.
diff --git a/config/crd/bases/amd.com_remediationworkflowstatuses.yaml b/config/crd/bases/amd.com_remediationworkflowstatuses.yaml
index 651af2d9..5708c17a 100644
--- a/config/crd/bases/amd.com_remediationworkflowstatuses.yaml
+++ b/config/crd/bases/amd.com_remediationworkflowstatuses.yaml
@@ -19,8 +19,9 @@ spec:
   - name: v1alpha1
     schema:
       openAPIV3Schema:
-        description: RemediationWorkflowStatus keeps a record of recent remediation
-          workflow runs.
+        description: |-
+          RemediationWorkflowStatus keeps a record of recent remediation workflow runs.
+          We maintain this information to avoid re-running remediation workflows on nodes where a pre-defined threshold is crossed.
         properties:
           apiVersion:
             description: |-
@@ -51,6 +52,9 @@ spec:
                   type: object
                 type: array
               type: object
+            description: |-
+              Status field holds the remediation workflow run history for each node and node condition.
+              The key is the node name; the value is a map keyed by node condition, whose value is a list of workflow metadata (workflow name and its start time).
             type: object
         type: object
     served: true
diff --git a/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml b/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml
index a189e364..5a4d28ff 100644
--- a/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml
+++ b/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml
@@ -689,6 +689,13 @@ spec:
         path: remediationWorkflow.enable
         x-descriptors:
         - urn:alm:descriptor:com.amd.deviceconfigs:enable
+      - description: MaxParallelWorkflows specifies the limit on how many remediation
+          workflows can be executed in parallel. The default value is 0, which
+          means no limit.
+        displayName: MaxParallelWorkflows
+        path: remediationWorkflow.maxParallelWorkflows
+        x-descriptors:
+        - urn:alm:descriptor:com.amd.deviceconfigs:maxParallelWorkflows
       - description: Tester image used to run tests and verify if remediation fixed
           the reported problem.
         displayName: TesterImage
diff --git a/docs/autoremediation/auto-remediation.md b/docs/autoremediation/auto-remediation.md
index 3b5d9a5d..a619d713 100644
--- a/docs/autoremediation/auto-remediation.md
+++ b/docs/autoremediation/auto-remediation.md
@@ -130,6 +130,23 @@ The most common CR users will be using will be of this form which will use the
     enable: true
 ```
 
+You can limit the number of nodes undergoing remediation simultaneously by setting the `maxParallelWorkflows` field in the Device Config custom resource. For example, to ensure no more than 5 nodes undergo remediation at the same time, set the value to 5 (as shown below). The default value is zero, which means there is no upper limit on the number of parallel workflows that can run simultaneously.
+
+```yaml
+  remediationWorkflow:
+    enable: true
+    maxParallelWorkflows: 5
+```
+
+When more workflows are triggered than the parallelism limit allows, the excess workflows are queued by the Argo workflow controller and enter a **Pending** state. They remain in the queue until a running workflow finishes and a slot within the configured parallelism limit becomes available.
+
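+To see which workflows are running and which are queued, you can list them and check their phase. A minimal sketch, assuming the operator and its workflows live in the `kube-amd-gpu` namespace:
+
+```bash
+# Workflows beyond the parallelism limit report a Pending phase until a slot frees up
+kubectl get workflows.argoproj.io -n kube-amd-gpu
+```
+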
 ## Default Workflow Template
 
 Note: `default-template` will be created on the cluster by GPU-Operator
diff --git a/hack/k8s-patch/metadata-patch/values.yaml b/hack/k8s-patch/metadata-patch/values.yaml
index b924972a..b08b2f32 100644
--- a/hack/k8s-patch/metadata-patch/values.yaml
+++ b/hack/k8s-patch/metadata-patch/values.yaml
@@ -231,7 +231,10 @@ deviceConfig:
     # -- config manager tolerations
     configManagerTolerations: []
   remediationWorkflow:
+    # -- enable/disable the remediation workflow controller
     enable: false
+    # -- Set the maximum number of remediation workflows that can run in parallel. The default is 0, which means no limit
+    maxParallelWorkflows: 0
 # AMD GPU operator controller related configs
 controllerManager:
   manager:
diff --git a/hack/k8s-patch/template-patch/default-deviceconfig.yaml b/hack/k8s-patch/template-patch/default-deviceconfig.yaml
index 3c40cff8..32bdbc5a 100644
--- a/hack/k8s-patch/template-patch/default-deviceconfig.yaml
+++ b/hack/k8s-patch/template-patch/default-deviceconfig.yaml
@@ -437,6 +437,10 @@ spec:
       {{- with .testerImage }}
       testerImage: {{ . }}
       {{- end }}
+
+      {{- with .maxParallelWorkflows }}
+      maxParallelWorkflows: {{ . }}
+      {{- end }}
     {{- end }}
 {{- end }}
diff --git a/helm-charts-k8s/Chart.lock b/helm-charts-k8s/Chart.lock
index bb567af6..058c7ecf 100644
--- a/helm-charts-k8s/Chart.lock
+++ b/helm-charts-k8s/Chart.lock
@@ -9,4 +9,4 @@ dependencies:
   repository: file://./charts/remediation
   version: v1.0.0
 digest: sha256:41fa6a6232514acebf6abdcb1bccaf087e134b9f413b8fa33a7fec1f58a99e07
-generated: "2025-11-03T10:08:37.655536804Z"
+generated: "2025-12-09T09:27:36.511662862Z"
diff --git a/helm-charts-k8s/README.md b/helm-charts-k8s/README.md
index 656de6e0..203d9ef4 100644
--- a/helm-charts-k8s/README.md
+++ b/helm-charts-k8s/README.md
@@ -231,6 +231,8 @@ Kubernetes: `>= 1.29.0-0`
 | deviceConfig.spec.metricsExporter.tolerations | list | `[]` | metrics exporter tolerations |
 | deviceConfig.spec.metricsExporter.upgradePolicy.maxUnavailable | int | `1` | the maximum number of Pods that can be unavailable during the update process |
 | deviceConfig.spec.metricsExporter.upgradePolicy.upgradeStrategy | string | `"RollingUpdate"` | the type of daemonset upgrade, RollingUpdate or OnDelete |
+| deviceConfig.spec.remediationWorkflow.enable | bool | `false` | enable/disable the remediation workflow controller |
+| deviceConfig.spec.remediationWorkflow.maxParallelWorkflows | int | `0` | Set the maximum number of remediation workflows that can run in parallel. The default is 0, which means no limit |
 | deviceConfig.spec.selector | object | `{"feature.node.kubernetes.io/amd-gpu":"true"}` | Set node selector for the default DeviceConfig |
 | deviceConfig.spec.testRunner.config | object | `{}` | test runner config map, e.g. {"name": "myConfigMap"} |
 | deviceConfig.spec.testRunner.enable | bool | `false` | enable / disable test runner |
diff --git a/helm-charts-k8s/crds/deviceconfig-crd.yaml b/helm-charts-k8s/crds/deviceconfig-crd.yaml
index 752cf141..6061e2c9 100644
--- a/helm-charts-k8s/crds/deviceconfig-crd.yaml
+++ b/helm-charts-k8s/crds/deviceconfig-crd.yaml
@@ -1396,6 +1396,11 @@ spec:
                     enable remediation workflows. disabled by default enable if operator
                     should automatically handle remediation of node incase of gpu issues
                   type: boolean
+                maxParallelWorkflows:
+                  description: MaxParallelWorkflows specifies the limit on how many
+                    remediation workflows can be executed in parallel. The default
+                    value is 0, which means no limit.
+                  type: integer
                 testerImage:
                   description: Tester image used to run tests and verify if remediation
                     fixed the reported problem.
diff --git a/helm-charts-k8s/crds/remediationworkflowstatus-crd.yaml b/helm-charts-k8s/crds/remediationworkflowstatus-crd.yaml
index aa5c0ac0..011e3ad0 100644
--- a/helm-charts-k8s/crds/remediationworkflowstatus-crd.yaml
+++ b/helm-charts-k8s/crds/remediationworkflowstatus-crd.yaml
@@ -28,8 +28,9 @@ spec:
   - name: v1alpha1
     schema:
       openAPIV3Schema:
-        description: RemediationWorkflowStatus keeps a record of recent remediation
-          workflow runs.
+        description: |-
+          RemediationWorkflowStatus keeps a record of recent remediation workflow runs.
+          We maintain this information to avoid re-running remediation workflows on nodes where a pre-defined threshold is crossed.
         properties:
           apiVersion:
             description: |-
@@ -60,6 +61,9 @@ spec:
                   type: object
                 type: array
               type: object
+            description: |-
+              Status field holds the remediation workflow run history for each node and node condition.
+              The key is the node name; the value is a map keyed by node condition, whose value is a list of workflow metadata (workflow name and its start time).
             type: object
         type: object
     served: true
diff --git a/helm-charts-k8s/templates/default-deviceconfig.yaml b/helm-charts-k8s/templates/default-deviceconfig.yaml
index 3c40cff8..32bdbc5a 100644
--- a/helm-charts-k8s/templates/default-deviceconfig.yaml
+++ b/helm-charts-k8s/templates/default-deviceconfig.yaml
@@ -437,6 +437,10 @@ spec:
       {{- with .testerImage }}
       testerImage: {{ . }}
      {{- end }}
+
+      {{- with .maxParallelWorkflows }}
+      maxParallelWorkflows: {{ . }}
+      {{- end }}
     {{- end }}
 {{- end }}
diff --git a/helm-charts-k8s/values.yaml b/helm-charts-k8s/values.yaml
index b924972a..b08b2f32 100644
--- a/helm-charts-k8s/values.yaml
+++ b/helm-charts-k8s/values.yaml
@@ -231,7 +231,10 @@ deviceConfig:
     # -- config manager tolerations
     configManagerTolerations: []
   remediationWorkflow:
+    # -- enable/disable the remediation workflow controller
     enable: false
+    # -- Set the maximum number of remediation workflows that can run in parallel. The default is 0, which means no limit
+    maxParallelWorkflows: 0
 # AMD GPU operator controller related configs
 controllerManager:
   manager:
diff --git a/helm-charts-openshift/Chart.lock b/helm-charts-openshift/Chart.lock
index 2310380d..d9d69628 100644
--- a/helm-charts-openshift/Chart.lock
+++ b/helm-charts-openshift/Chart.lock
@@ -6,4 +6,4 @@ dependencies:
   repository: file://./charts/kmm
   version: v1.0.0
 digest: sha256:25200c34a5cc846a1275e5bf3fc637b19e909dc68de938189c5278d77d03f5ac
-generated: "2025-11-03T10:08:49.883010865Z"
+generated: "2025-12-09T09:27:48.895675076Z"
diff --git a/helm-charts-openshift/crds/deviceconfig-crd.yaml b/helm-charts-openshift/crds/deviceconfig-crd.yaml
index 752cf141..6061e2c9 100644
--- a/helm-charts-openshift/crds/deviceconfig-crd.yaml
+++ b/helm-charts-openshift/crds/deviceconfig-crd.yaml
@@ -1396,6 +1396,11 @@ spec:
                     enable remediation workflows. disabled by default enable if operator
                     should automatically handle remediation of node incase of gpu issues
                   type: boolean
+                maxParallelWorkflows:
+                  description: MaxParallelWorkflows specifies the limit on how many
+                    remediation workflows can be executed in parallel. The default
+                    value is 0, which means no limit.
+                  type: integer
                 testerImage:
                   description: Tester image used to run tests and verify if remediation
                     fixed the reported problem.
diff --git a/helm-charts-openshift/crds/remediationworkflowstatus-crd.yaml b/helm-charts-openshift/crds/remediationworkflowstatus-crd.yaml
index aa5c0ac0..011e3ad0 100644
--- a/helm-charts-openshift/crds/remediationworkflowstatus-crd.yaml
+++ b/helm-charts-openshift/crds/remediationworkflowstatus-crd.yaml
@@ -28,8 +28,9 @@ spec:
   - name: v1alpha1
     schema:
       openAPIV3Schema:
-        description: RemediationWorkflowStatus keeps a record of recent remediation
-          workflow runs.
+        description: |-
+          RemediationWorkflowStatus keeps a record of recent remediation workflow runs.
+          We maintain this information to avoid re-running remediation workflows on nodes where a pre-defined threshold is crossed.
         properties:
           apiVersion:
             description: |-
@@ -60,6 +61,9 @@ spec:
                   type: object
                 type: array
               type: object
+            description: |-
+              Status field holds the remediation workflow run history for each node and node condition.
+              The key is the node name; the value is a map keyed by node condition, whose value is a list of workflow metadata (workflow name and its start time).
             type: object
         type: object
     served: true
diff --git a/internal/controllers/mock_remediation_handler.go b/internal/controllers/mock_remediation_handler.go
index 0ab77e2c..cfff50ce 100644
--- a/internal/controllers/mock_remediation_handler.go
+++ b/internal/controllers/mock_remediation_handler.go
@@ -355,12 +355,11 @@ func (mr *MockremediationMgrHelperAPIMockRecorder) getRecentRecoveryCount(nodeNa
 }
 
 // getRecoveryTrackerKey mocks base method.
-func (m *MockremediationMgrHelperAPI) getRecoveryTrackerKey(nodeName, nodeCondition string) (string, error) {
+func (m *MockremediationMgrHelperAPI) getRecoveryTrackerKey(nodeName, nodeCondition string) string {
 	m.ctrl.T.Helper()
 	ret := m.ctrl.Call(m, "getRecoveryTrackerKey", nodeName, nodeCondition)
 	ret0, _ := ret[0].(string)
-	ret1, _ := ret[1].(error)
-	return ret0, ret1
+	return ret0
 }
 
 // getRecoveryTrackerKey indicates an expected call of getRecoveryTrackerKey.
@@ -710,6 +709,20 @@ func (mr *MockremediationMgrHelperAPIMockRecorder) syncInternalMapFromStatusCR(c
 	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "syncInternalMapFromStatusCR", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).syncInternalMapFromStatusCR), ctx, namespace)
 }
 
+// updateMaxParallelWorkflows mocks base method.
+func (m *MockremediationMgrHelperAPI) updateMaxParallelWorkflows(ctx context.Context, devConfig *v1alpha1.DeviceConfig) error {
+	m.ctrl.T.Helper()
+	ret := m.ctrl.Call(m, "updateMaxParallelWorkflows", ctx, devConfig)
+	ret0, _ := ret[0].(error)
+	return ret0
+}
+
+// updateMaxParallelWorkflows indicates an expected call of updateMaxParallelWorkflows.
+func (mr *MockremediationMgrHelperAPIMockRecorder) updateMaxParallelWorkflows(ctx, devConfig any) *gomock.Call {
+	mr.mock.ctrl.T.Helper()
+	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "updateMaxParallelWorkflows", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).updateMaxParallelWorkflows), ctx, devConfig)
+}
+
 // validateNodeConditions mocks base method.
 func (m *MockremediationMgrHelperAPI) validateNodeConditions(ctx context.Context, devConfig *v1alpha1.DeviceConfig, node *v1.Node, mappings map[string]ConditionWorkflowMapping) (ConditionWorkflowMapping, error) {
 	m.ctrl.T.Helper()
diff --git a/internal/controllers/remediation/scripts/notify.sh b/internal/controllers/remediation/scripts/notify.sh
index 11c32140..39f013e7 100644
--- a/internal/controllers/remediation/scripts/notify.sh
+++ b/internal/controllers/remediation/scripts/notify.sh
@@ -17,11 +17,12 @@ involvedObject:
   kind: Node
   name: ${NODE_NAME}
   namespace: {{workflow.namespace}}
-message: ${NOTIFY_MESSAGE}
+message: '${NOTIFY_MESSAGE}'
 reason: AMDGPUUnhealthy
 reportingComponent: amd-gpu-node-remediation-workflow
 reportingInstance: amd-gpu-node-remediation-workflow
 source:
   component: {{workflow.name}}
   host: ${NODE_NAME}
-type: Warning
\ No newline at end of file
+type: Warning
+EOF
\ No newline at end of file
diff --git a/internal/controllers/remediation/scripts/test.sh b/internal/controllers/remediation/scripts/test.sh
index 20695378..5a4c947e 100644
--- a/internal/controllers/remediation/scripts/test.sh
+++ b/internal/controllers/remediation/scripts/test.sh
@@ -1,7 +1,7 @@
 set -e
 NODE_NAME="{{inputs.parameters.node_name}}"
-JOB_NAME="test-runner-manual-trigger-${NODE_NAME}"
-CM_NAME="manual-config-map-${NODE_NAME}"
+JOB_NAME="{{workflow.name}}-test-run"
+CM_NAME="{{workflow.name}}-test-configmap"
 FRAMEWORK="{{inputs.parameters.framework}}"
 RECIPE="{{inputs.parameters.recipe}}"
 ITERATIONS="{{inputs.parameters.iterations}}"
@@ -10,6 +10,9 @@ TIMEOUTSECONDS="{{inputs.parameters.timeoutSeconds}}"
 TESTRUNNERIMAGE="{{inputs.parameters.testRunnerImage}}"
 TESTRUNNERSA="{{inputs.parameters.testRunnerServiceAccount}}"
 NAMESPACE="{{inputs.parameters.namespace}}"
+INITCONTAINERIMAGE="{{inputs.parameters.initContainerImage}}"
+WFNAME="{{workflow.name}}"
+WFUID="{{workflow.uid}}"
 
 if [ -z "$FRAMEWORK" ] || [ -z "$RECIPE" ] || [ -z "$ITERATIONS" ] || [ -z "$STOPONFAILURE" ] || [ -z "$TIMEOUTSECONDS" ]; then
   echo "Validation profile incomplete, skipping configmap and job creation. Please enter framework, recipe, iterations, stopOnFailure, timeoutSeconds as per testrunner requirements"
@@ -25,6 +28,14 @@ kind: ConfigMap
 metadata:
   name: ${CM_NAME}
   namespace: ${NAMESPACE}
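+  # ownerReferences tie the ConfigMap to the workflow so Kubernetes garbage-collects it when the workflow is deleted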
+  ownerReferences:
+  - apiVersion: argoproj.io/v1alpha1
+    kind: Workflow
+    name: ${WFNAME}
+    uid: ${WFUID}
+    blockOwnerDeletion: true
+    controller: true
 data:
   config.json: |
     {
@@ -38,9 +48,9 @@ data:
         {
           "Framework": "${FRAMEWORK}",
           "Recipe": "${RECIPE}",
-          "Iterations": "${ITERATIONS}",
-          "StopOnFailure": "${STOPONFAILURE}",
-          "TimeoutSeconds": "${TIMEOUTSECONDS}"
+          "Iterations": ${ITERATIONS},
+          "StopOnFailure": ${STOPONFAILURE},
+          "TimeoutSeconds": ${TIMEOUTSECONDS}
         }
       ]
     }
@@ -56,8 +67,14 @@ kind: Job
 metadata:
   name: ${JOB_NAME}
   namespace: ${NAMESPACE}
+  ownerReferences:
+  - apiVersion: argoproj.io/v1alpha1
+    kind: Workflow
+    name: ${WFNAME}
+    uid: ${WFUID}
+    blockOwnerDeletion: true
+    controller: true
 spec:
-  ttlSecondsAfterFinished: 120
   backoffLimit: 0
   template:
     spec:
@@ -85,6 +102,21 @@ spec:
           path: /var/log/amd-test-runner
           type: DirectoryOrCreate
         name: test-runner-volume
+      - name: host-sys
+        hostPath:
+          path: /sys
+          type: Directory
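+      # The init container below blocks the test runner until the amdgpu driver is loaded on the node (checked through the mounted /host-sys)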
+      initContainers:
+      - name: driver-init
+        image: "${INITCONTAINERIMAGE}"
+        imagePullPolicy: IfNotPresent
+        command: ['sh', '-c', 'while [ ! -d /host-sys/class/kfd ] || [ ! -d /host-sys/module/amdgpu/drivers/ ]; do echo \"amdgpu driver is not loaded\"; sleep 2; done; echo \"amdgpu driver is loaded\"']
+        securityContext:
+          privileged: true
+        volumeMounts:
+        - name: host-sys
+          mountPath: /host-sys
       containers:
       - name: amd-test-runner
         image: "${TESTRUNNERIMAGE}"
@@ -133,13 +165,18 @@ echo "Overall timeout for the job is set to $timeout seconds."
 echo "Waiting for Job $JOB_NAME to complete..."
 
 while true; do
-  job_status=$(kubectl get job "$JOB_NAME" -n "$NAMESPACE" -o jsonpath='{.status.conditions[0].type}' 2>/dev/null || true)
-  if [ "$job_status" = "Complete" ]; then
+  if ! kubectl get job "$JOB_NAME" -n "$NAMESPACE" &>/dev/null; then
+    echo "Error: Job $JOB_NAME is not found in namespace $NAMESPACE"
+    exit 1
+  fi
+  isComplete=$(kubectl get job "$JOB_NAME" -n "$NAMESPACE" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}')
+  isFailure=$(kubectl get job "$JOB_NAME" -n "$NAMESPACE" -o jsonpath='{.status.conditions[?(@.type=="Failed")].status}')
+  if [ "$isComplete" = "True" ]; then
     echo "Test runner job completed successfully."
     kubectl logs -n $NAMESPACE job/$JOB_NAME
     echo "Detailed run report can be found at /var/log/amd-test-runner"
     exit 0
-  elif [ "$job_status" = "Failed" ]; then
+  elif [ "$isFailure" = "True" ]; then
     echo "Test runner job failed."
     kubectl logs -n $NAMESPACE job/$JOB_NAME
     echo "Detailed run report can be found at /var/log/amd-test-runner"
diff --git a/internal/controllers/remediation_handler.go b/internal/controllers/remediation_handler.go
index 87c888e6..678bf052 100644
--- a/internal/controllers/remediation_handler.go
+++ b/internal/controllers/remediation_handler.go
@@ -38,6 +38,7 @@ import (
 	"fmt"
 	"os"
 	"path/filepath"
+	"strconv"
 	"strings"
 	"sync"
 	"time"
@@ -51,6 +52,7 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/rest"
+	"k8s.io/client-go/util/retry"
 	"k8s.io/utils/ptr"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
@@ -78,9 +80,11 @@ const (
 	ForceResumeWorkflowLabelKey   = "operator.amd.com/gpu-force-resume-workflow"
 	ForceResumeWorkflowLabelValue = "true"
 	// Below is the label and value needed to be added to node to abort an ongoing workflow
-	AbortWorkflowLabelKey   = "operator.amd.com/gpu-abort-workflow"
-	AbortWorkflowLabelValue = "true"
-	RemediationFilesPath    = "/remediation"
+	AbortWorkflowLabelKey           = "operator.amd.com/gpu-abort-workflow"
+	AbortWorkflowLabelValue         = "true"
+	RemediationFilesPath            = "/remediation"
+	DefaultInitContainerImage       = "busybox:1.36"
+	ArgoWorkflowControllerConfigMap = "workflow-controller-configmap"
 )
 
 type RecoveryPolicyConfig struct {
@@ -150,6 +154,11 @@ func (n *remediationMgr) HandleRemediation(ctx context.Context, devConfig *amdv1
 		return res, err
 	}
 
+	// Update max parallel workflows based on DeviceConfig
+	if err := n.helper.updateMaxParallelWorkflows(ctx, devConfig); err != nil {
+		logger.Error(err, "Failed to update max parallel workflows, continuing with remediation")
+	}
+
 	// Clear any older recovery attempts from the status CR
 	if err := n.helper.dropOlderRecoveryAttemptsFromStatusCR(ctx, devConfig.Namespace); err != nil {
 		logger.Error(err, "Failed to drop older recovery attempts from status CR")
@@ -289,7 +298,7 @@ type remediationMgrHelperAPI interface {
 	registerRecoveryAttempt(ctx context.Context, nodeName string, nodeCondition string, namespace string, wfName string) error
 	registerRecoveryAttemptInternal(nodeName string, nodeCondition string, namespace string, startTime time.Time) error
 	registerRecoveryAttemptToStatusCR(ctx context.Context, nodeName string, nodeCondition string, namespace string, wfName string, startTime time.Time) error
-	getRecoveryTrackerKey(nodeName string, nodeCondition string) (string, error)
+	getRecoveryTrackerKey(nodeName string, nodeCondition string) string
 	getMaxAllowedRunsPerWindow(recoveryPolicy *RecoveryPolicyConfig) int
 	getWindowSize(recoveryPolicy *RecoveryPolicyConfig) string
 	isRecoveryPolicyViolated(ctx context.Context, nodeName string, mapping *ConditionWorkflowMapping) bool
@@ -305,14 +314,16 @@ type remediationMgrHelperAPI interface {
 	attemptResumeWorkflowOnNode(ctx context.Context, node *v1.Node, mapping ConditionWorkflowMapping, wf *workflowv1alpha1.Workflow)
 	handleSuspendedWorkflowsOnNode(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig, node *v1.Node, mapping ConditionWorkflowMapping, wf *workflowv1alpha1.Workflow) bool
 	getWorkflowTaskScriptSource(scriptFileName string) (string, error)
+	updateMaxParallelWorkflows(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig) error
 }
 
 type remediationMgrHelper struct {
-	client             client.Client
-	k8sInterface       kubernetes.Interface
-	recoveryTracker    *sync.Map
-	statusSynced       bool
-	serviceAccountName string
+	client               client.Client
+	k8sInterface         kubernetes.Interface
+	recoveryTracker      *sync.Map
+	statusSynced         bool
+	serviceAccountName   string
+	maxParallelWorkflows int
 }
 
 // Initialize remediation manager helper interface
@@ -583,7 +594,7 @@ func (h *remediationMgrHelper) createDefaultWorkflowTemplate(ctx context.Context
 		{Steps: []workflowv1alpha1.WorkflowStep{{Name: "drain", Template: "drain"}}},
 		{Steps: []workflowv1alpha1.WorkflowStep{
 			{
-				Name:        "notifyBeforeSuspend",
+				Name:        "notifybeforesuspend",
 				TemplateRef: &workflowv1alpha1.TemplateRef{Name: "event-notify-template", Template: "notify"},
 				Arguments: workflowv1alpha1.Arguments{
 					Parameters: []workflowv1alpha1.Parameter{
@@ -600,7 +611,7 @@ func (h *remediationMgrHelper) createDefaultWorkflowTemplate(ctx context.Context
 		{Steps: []workflowv1alpha1.WorkflowStep{{Name: "test", Template: "test", ContinueOn: &workflowv1alpha1.ContinueOn{Failed: true}}}},
 		{Steps: []workflowv1alpha1.WorkflowStep{
 			{
-				Name:        "notifyGpuTestFailed",
+				Name:        "notifygputestfailed",
 				TemplateRef: &workflowv1alpha1.TemplateRef{Name: "event-notify-template", Template: "notify"},
 				Arguments: workflowv1alpha1.Arguments{
 					Parameters: []workflowv1alpha1.Parameter{
@@ -613,12 +624,12 @@ func (h *remediationMgrHelper) createDefaultWorkflowTemplate(ctx context.Context
 				},
 			},
 		},
-		{Steps: []workflowv1alpha1.WorkflowStep{{Name: "failWorkflow", Template: "failWorkflow", When: "{{steps.test.exitCode}} != 0"}}},
+		{Steps: []workflowv1alpha1.WorkflowStep{{Name: "failworkflow", Template: "failworkflow", When: "{{steps.test.exitCode}} != 0"}}},
 		{Steps: []workflowv1alpha1.WorkflowStep{{Name: "wait", Template: "wait", When: "{{steps.test.exitCode}} == 0"}}},
 		{Steps: []workflowv1alpha1.WorkflowStep{{Name: "untaint", Template: "untaint", When: "{{steps.test.exitCode}} == 0"}}},
 		{Steps: []workflowv1alpha1.WorkflowStep{
 			{
-				Name:        "notifyWorkflowSucceeded",
+				Name:        "notifyworkflowsucceeded",
 				TemplateRef: &workflowv1alpha1.TemplateRef{Name: "event-notify-template", Template: "notify"},
 				Arguments: workflowv1alpha1.Arguments{
 					Parameters: []workflowv1alpha1.Parameter{
@@ -723,6 +734,10 @@ containers:
 					{
 						Name:  "namespace",
 						Value: workflowv1alpha1.AnyStringPtr("{{workflow.parameters.namespace}}"),
 					},
+					{
+						Name:  "initContainerImage",
+						Value: workflowv1alpha1.AnyStringPtr("{{workflow.parameters.initContainerImage}}"),
+					},
 				},
 			},
 			Script: &workflowv1alpha1.ScriptTemplate{
@@ -765,7 +780,7 @@ containers:
 			},
 		},
 		{
-			Name: "failWorkflow",
+			Name: "failworkflow",
 			Script: &workflowv1alpha1.ScriptTemplate{
 				Source:    `echo "Failing workflow" && exit 1`,
 				Container: utilityContainer,
@@ -832,6 +847,35 @@ func (h *remediationMgrHelper) createDefaultObjects(ctx context.Context, devConf
 	return cm, nil
 }
 
+func (h *remediationMgrHelper) updateMaxParallelWorkflows(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig) error {
+	logger := log.FromContext(ctx)
+	// Set maximum parallel workflows that can run simultaneously
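+	// Compare against the cached value so the Argo ConfigMap is only updated when the configured limit actually changes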
+	if h.maxParallelWorkflows != devConfig.Spec.RemediationWorkflow.MaxParallelWorkflows {
+		err := retry.RetryOnConflict(retry.DefaultRetry, func() error {
+			acm, err := h.getConfigMap(ctx, ArgoWorkflowControllerConfigMap, devConfig.Namespace)
+			if err != nil {
+				logger.Error(err, "Failed to fetch argo workflow controller configmap")
+				return err
+			}
+			if acm.Data == nil {
+				acm.Data = make(map[string]string)
+			}
+			// Update parallelism in Argo workflow controller configmap.
+			// https://github.com/argoproj/argo-workflows/blob/main/config/config.go#L69
+			acm.Data["parallelism"] = strconv.Itoa(devConfig.Spec.RemediationWorkflow.MaxParallelWorkflows)
+			return h.client.Update(ctx, acm)
+		})
+		if err != nil {
+			logger.Error(err, "Failed to update parallelism in argo workflow controller")
+			return err
+		}
+		h.maxParallelWorkflows = devConfig.Spec.RemediationWorkflow.MaxParallelWorkflows
+		logger.Info(fmt.Sprintf("Updated maximum parallel remediation workflows to %d", h.maxParallelWorkflows))
+	}
+	return nil
+}
+
 func (h *remediationMgrHelper) populateWorkflow(ctx context.Context, wfTemplate *workflowv1alpha1.WorkflowTemplate, mapping *ConditionWorkflowMapping, nodeName string, devConfig *amdv1alpha1.DeviceConfig) *workflowv1alpha1.Workflow {
 	wf := &workflowv1alpha1.Workflow{
 		ObjectMeta: metav1.ObjectMeta{
@@ -873,6 +917,12 @@ func (h *remediationMgrHelper) populateWorkflow(ctx context.Context, wfTemplate
 		testrunnerImage = devConfig.Spec.RemediationWorkflow.TesterImage
 	}
 
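+	// Fall back to the default busybox image when CommonConfig does not specify an init container image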
+	initContainerImage := DefaultInitContainerImage
+	if devConfig.Spec.CommonConfig.InitContainerImage != "" {
+		initContainerImage = devConfig.Spec.CommonConfig.InitContainerImage
+	}
+
 	// Pass the args required to be used in the template
 	wf.Spec.Arguments = workflowv1alpha1.Arguments{
 		Parameters: []workflowv1alpha1.Parameter{
@@ -928,6 +978,10 @@ func (h *remediationMgrHelper) populateWorkflow(ctx context.Context, wfTemplate
 				Name:  "notifySuccessMessage",
 				Value: workflowv1alpha1.AnyStringPtr(fmt.Sprintf("Remediation for node condition %s completed successfully on node %s", mapping.NodeCondition, nodeName)),
 			},
+			{
+				Name:  "initContainerImage",
+				Value: workflowv1alpha1.AnyStringPtr(initContainerImage),
+			},
 		},
 	}
 
@@ -1115,10 +1169,7 @@ func (h *remediationMgrHelper) getWorkflowUtilityImage(devConfig *amdv1alpha1.De
 
 func (h *remediationMgrHelper) getRecentRecoveryCount(nodeName string, nodeCondition string) int {
 	// get the length of the slice of attempts for the given node and condition
-	key, err := h.getRecoveryTrackerKey(nodeName, nodeCondition)
-	if err != nil {
-		return 0
-	}
+	key := h.getRecoveryTrackerKey(nodeName, nodeCondition)
 
 	attempts, ok := h.recoveryTracker.Load(key)
 	if !ok {
@@ -1132,10 +1183,7 @@ func (h *remediationMgrHelper) getRecentRecoveryCount(nodeName string, nodeCondi
 }
 
 func (h *remediationMgrHelper) dropOlderRecoveryAttemptsInternal(nodeName string, nodeCondition string, windowSize string) error {
-	key, err := h.getRecoveryTrackerKey(nodeName, nodeCondition)
-	if err != nil {
-		return fmt.Errorf("failed to get recovery tracker key: %w", err)
-	}
+	key := h.getRecoveryTrackerKey(nodeName, nodeCondition)
 
 	attempts, _ := h.recoveryTracker.LoadOrStore(key, []time.Time{})
 	if attemptsSlice, ok := attempts.([]time.Time); ok {
@@ -1265,10 +1313,7 @@ func (h *remediationMgrHelper) registerRecoveryAttemptToStatusCR(ctx context.Con
 }
 
 func (h *remediationMgrHelper) registerRecoveryAttemptInternal(nodeName string, nodeCondition string, namespace string, startTime time.Time) error {
-	key, err := h.getRecoveryTrackerKey(nodeName, nodeCondition)
-	if err != nil {
-		return fmt.Errorf("failed to get recovery tracker key: %w", err)
-	}
+	key := h.getRecoveryTrackerKey(nodeName, nodeCondition)
 
 	attempts, _ := h.recoveryTracker.LoadOrStore(key, []time.Time{})
 	if attemptsSlice, ok := attempts.([]time.Time); ok {
@@ -1281,9 +1326,8 @@ func (h *remediationMgrHelper) registerRecoveryAttemptInternal(nodeName string,
 	return nil
 }
 
-func (h *remediationMgrHelper) getRecoveryTrackerKey(nodeName string, nodeCondition string) (string, error) {
-	key := fmt.Sprintf("%s-%s", nodeName, nodeCondition)
-	return key, nil
+func (h *remediationMgrHelper) getRecoveryTrackerKey(nodeName string, nodeCondition string) string {
+	return fmt.Sprintf("%s-%s", nodeName, nodeCondition)
 }
 
 func (h *remediationMgrHelper) getMaxAllowedRunsPerWindow(recoveryPolicy *RecoveryPolicyConfig) int {
@@ -1337,10 +1381,7 @@ func (h *remediationMgrHelper) getMaxAllowedRunsPerWindow(recoveryPolicy *Recove
 
 	for nodeName, conditions := range wfStatus.Status {
 		for nodeCondition, attempts := range conditions {
-			key, err := h.getRecoveryTrackerKey(nodeName, nodeCondition)
-			if err != nil {
-				return fmt.Errorf("failed to get recovery tracker key: %w", err)
-			}
+			key := h.getRecoveryTrackerKey(nodeName, nodeCondition)
 
 			attemptTimes := make([]time.Time, len(attempts))
 			for i, attempt := range attempts {
diff --git a/tests/e2e/Makefile b/tests/e2e/Makefile
index 7ac47e7d..b87d7732 100644
--- a/tests/e2e/Makefile
+++ b/tests/e2e/Makefile
@@ -17,6 +17,7 @@ E2E_NODE_LABELLER_IMAGE_2 ?= rocm/k8s-device-plugin:labeller-1.31.0.6
 E2E_TEST_RUNNER_IMAGE ?= rocm/test-runner:v1.4.0
 E2E_KUBEVIRT_DEVICE_PLUGIN_IMAGE ?= rocm/k8s-device-plugin:latest
 E2E_KUBEVIRT_NODE_LABELLER_IMAGE ?= rocm/k8s-device-plugin:labeller-latest
+E2E_UTILS_CONTAINER_IMAGE ?= docker.io/rocm/gpu-operator-utils:v1.4.0
 
 export E2E_INIT_CONTAINER_IMAGE
 export E2E_KUBE_RBAC_PROXY_CURL_IMAGE
@@ -34,6 +35,7 @@ export E2E_TEST_RUNNER_IMAGE
 export E2E_AGFHC_TEST_RUNNER_IMAGE
 export E2E_KUBEVIRT_DEVICE_PLUGIN_IMAGE
 export E2E_KUBEVIRT_NODE_LABELLER_IMAGE
+export E2E_UTILS_CONTAINER_IMAGE
 
 export E2E_DCM_IMAGE
 export E2E_NODEAPP_IMG
diff --git a/tests/e2e/doc.go b/tests/e2e/doc.go
index 89226b2c..cce60e3c 100644
--- a/tests/e2e/doc.go
+++ b/tests/e2e/doc.go
@@ -28,7 +28,7 @@ import (
 type E2ESuite struct {
 	clientSet *kubernetes.Clientset
 	dClient   *client.DeviceConfigClient
-	wfClient  workflowclient.Interface
+	wfClient  workflowclient.Clientset
 	cfgName   string
 	registry  string
 	helmChart string
diff --git a/tests/e2e/e2e_test.go b/tests/e2e/e2e_test.go
index 32cbcb13..c1e522b6 100644
--- a/tests/e2e/e2e_test.go
+++ b/tests/e2e/e2e_test.go
@@ -138,7 +138,7 @@ func (s *E2ESuite) SetUpSuite(c *C) {
 	if err != nil {
 		c.Fatalf("Failed to create workflow client: %v", err)
 	}
-	s.wfClient = wfClient
+	s.wfClient = *wfClient
 
 	s.clusterType = utils.GetClusterType(config)
 
diff --git a/tests/e2e/init.go b/tests/e2e/init.go
index 90e90f1c..9245b0c7 100644
--- a/tests/e2e/init.go
+++ b/tests/e2e/init.go
@@ -36,6 +36,7 @@ var (
 	driverImageRepo               string
 	kubeVirtHostDevicePluginImage string
 	kubeVirtHostNodeLabellerImage string
+	utilsContainerImage           string
 )
 
 func init() {
@@ -97,4 +98,8 @@ func init() {
 	if !ok {
 		log.Fatalf("E2E_KUBEVIRT_NODE_LABELLER_IMAGE is not defined.")
 	}
+	utilsContainerImage, ok = os.LookupEnv("E2E_UTILS_CONTAINER_IMAGE")
+	if !ok {
+		log.Fatalf("E2E_UTILS_CONTAINER_IMAGE is not defined.")
+	}
 }
diff --git a/tests/e2e/remediation_test.go b/tests/e2e/remediation_test.go
new file mode 100644
index 00000000..9dae48ce
--- /dev/null
+++ b/tests/e2e/remediation_test.go
@@ -0,0 +1,123 @@
+/*
+Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package e2e
+
+import (
+	"context"
+	"fmt"
+	"strings"
+	"time"
+
+	wfv1alpha1 "github.com/argoproj/argo-workflows/v3/pkg/apis/workflow/v1alpha1"
+	"github.com/stretchr/testify/assert"
+	. "gopkg.in/check.v1"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+const (
+	remediationNodeCondition    = "AMDGPUHardwareAssertionHwa"
+	npdInbandRASConfigPath      = "./yamls/config/npd/node-problem-detector-config-inband.yaml"
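+	// The error config is identical to the base config except for its -t=-1 threshold, which forces the condition to trigger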
+	npdInbandRASErrorConfigPath = "./yamls/config/npd/node-problem-detector-error-config-inband.yaml"
+)
+
+func (s *E2ESuite) verifyRemediationWorkflowStatus(c *C, nodeName, status string, waitTime int) {
+	assert.Eventually(c, func() bool {
+		wfs, err := s.wfClient.ArgoprojV1alpha1().Workflows(s.ns).List(context.Background(), metav1.ListOptions{})
+		if err != nil {
+			logger.Infof("Error listing workflows: %v", err)
+			return false
+		}
+		for _, wf := range wfs.Items {
+			if strings.Contains(wf.Name, nodeName) && status == string(wf.Status.Phase) {
+				return true
+			}
+		}
+		return false
+	}, time.Duration(waitTime)*time.Minute, 10*time.Second, "Remediation workflow did not reach expected status")
+}
+
+func (s *E2ESuite) TestAutoNodeRemediationWithoutPhysicalAction(c *C) {
+	logger.Infof("Starting Auto Node Remediation Test")
+	if s.simEnable {
+		c.Skip("Skipping for non amd gpu testbed")
+	}
+
+	nodes, err := s.clientSet.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{
+		LabelSelector: "feature.node.kubernetes.io/amd-gpu=true",
+	})
+	assert.NoError(c, err, "Failed to list nodes with AMD GPU label")
+	if len(nodes.Items) == 0 {
+		c.Fatalf("No nodes found with AMD GPU label")
+	}
+	nodeName := nodes.Items[0].Name
+
+	_, err = s.dClient.DeviceConfigs(s.ns).Get(s.cfgName, metav1.GetOptions{})
+	assert.Errorf(c, err, fmt.Sprintf("expected no config to be present, but config %v exists", s.cfgName))
+
+	driverEnable := false
+	remediationEnable := true
+	devCfg := s.getDeviceConfig(c)
+	devCfg.Spec.Driver.Enable = &driverEnable
+	devCfg.Spec.RemediationWorkflow.Enable = &remediationEnable
+	devCfg.Spec.MetricsExporter.Enable = &remediationEnable
+	devCfg.Spec.MetricsExporter.Image = exporterImage
+	devCfg.Spec.MetricsExporter.ImagePullPolicy = "Always"
+	devCfg.Spec.MetricsExporter.Port = 5000
+	devCfg.Spec.CommonConfig.UtilsContainer.Image = utilsContainerImage
+	devCfg.Spec.CommonConfig.UtilsContainer.ImagePullPolicy = "Always"
+
+	logger.Infof("Creating DeviceConfig with remediation enabled and driver disabled")
+	s.createDeviceConfig(devCfg, c)
+	s.checkMetricsExporterStatus(devCfg, s.ns, corev1.ServiceTypeClusterIP, c)
+
+	// Wait for cluster to be up
+	logger.Infof("Waiting for device config to be applied")
+	time.Sleep(5 * time.Second)
+
+	// Setup NPD
+	logger.Infof("Setting up Node Problem Detector (NPD)")
+	setupNPD(npdServiceAccountPath, npdInbandRASConfigPath, npdDaemonSetPath)
+	defer tearDownNPD(npdServiceAccountPath, npdInbandRASConfigPath, npdDaemonSetPath)
+
+	logger.Infof("Verify if Node Problem Detector (NPD) is running on all GPU nodes")
+	s.verifyNPDRunning(c)
+
+	logger.Infof("Verifying that node condition %s is added for the node %s", remediationNodeCondition, nodeName)
+	s.verifyNodeCondition(c, remediationNodeCondition, corev1.ConditionTrue)
+
+	// Trigger error condition by modifying NPD config
+	logger.Infof("Edit Node Problem Detector (NPD) thresholds to simulate error condition")
+	s.updateConfigForNPD(c, npdInbandRASConfigPath, npdInbandRASErrorConfigPath)
+
+	s.verifyNodeCondition(c, remediationNodeCondition, corev1.ConditionTrue)
+
+	// Verify remediation workflow started and completed
+	logger.Infof("Verifying remediation workflow started on the node %s", nodeName)
+	s.verifyRemediationWorkflowStatus(c, nodeName, string(wfv1alpha1.WorkflowRunning), 5)
+
+	time.Sleep(4 * time.Minute) // wait for workflow to progress
+	logger.Infof("Reverting Node Problem Detector (NPD) thresholds to original configuration")
+	s.updateConfigForNPD(c, npdInbandRASErrorConfigPath, npdInbandRASConfigPath)
+
+	logger.Infof("Waiting for remediation workflow to complete on the node %s", nodeName)
+	s.verifyRemediationWorkflowStatus(c, nodeName, string(wfv1alpha1.WorkflowSucceeded), 70)
+
+	logger.Infof("Verifying that node condition %s is false on the node %s", remediationNodeCondition, nodeName)
+	s.verifyNodeCondition(c, remediationNodeCondition, corev1.ConditionFalse)
+}
diff --git a/tests/e2e/yamls/config/npd/node-problem-detector-config-inband.yaml b/tests/e2e/yamls/config/npd/node-problem-detector-config-inband.yaml
new file mode 100644
index 00000000..a2b6976d
--- /dev/null
+++ b/tests/e2e/yamls/config/npd/node-problem-detector-config-inband.yaml
@@ -0,0 +1,42 @@
+apiVersion: v1
+data:
+  custom-plugin-monitor.json: |
+    {
+      "plugin": "custom",
+      "pluginConfig": {
+        "invoke_interval": "30s",
+        "timeout": "15s",
+        "max_output_length": 80,
+        "concurrency": 3,
+        "enable_message_change_based_condition_update": false
+      },
+      "source": "amdgpu-custom-plugin-monitor",
+      "metricsReporting": true,
+      "conditions": [
+        {
+          "type": "AMDGPUHardwareAssertionHwa",
+          "reason": "AMDGPUIsUp",
+          "message": "AMDGPU is up"
+        }
+      ],
+      "rules": [
+        {
+          "type": "permanent",
+          "condition": "AMDGPUHardwareAssertionHwa",
+          "reason": "AMDGPU Hardware Assertion",
+          "path": "/var/lib/amd-metrics-exporter/amdgpuhealth",
+          "args": [
+            "query",
+            "inband-ras-errors",
+            "-s=CPER_SEVERITY_FATAL",
+            "--afid=30",
+            "-t=0"
+          ],
+          "timeout": "15s"
+        }
+      ]
+    }
+kind: ConfigMap
+metadata:
+  name: node-problem-detector-config
+  namespace: kube-system
diff --git a/tests/e2e/yamls/config/npd/node-problem-detector-error-config-inband.yaml b/tests/e2e/yamls/config/npd/node-problem-detector-error-config-inband.yaml
new file mode 100644
index 00000000..4ffc7d68
--- /dev/null
+++ b/tests/e2e/yamls/config/npd/node-problem-detector-error-config-inband.yaml
@@ -0,0 +1,43 @@
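+# Identical to node-problem-detector-config-inband.yaml except for "-t=-1", which forces the AMDGPUHardwareAssertionHwa condition to fire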
"-s=CPER_SEVERITY_FATAL", + "--afid=30", + "-t=0" + ], + "timeout": "15s" + } + ] + } +kind: ConfigMap +metadata: + name: node-problem-detector-config + namespace: kube-system diff --git a/tests/e2e/yamls/config/npd/node-problem-detector-error-config-inband.yaml b/tests/e2e/yamls/config/npd/node-problem-detector-error-config-inband.yaml new file mode 100644 index 00000000..4ffc7d68 --- /dev/null +++ b/tests/e2e/yamls/config/npd/node-problem-detector-error-config-inband.yaml @@ -0,0 +1,42 @@ +apiVersion: v1 +data: + custom-plugin-monitor.json: | + { + "plugin": "custom", + "pluginConfig": { + "invoke_interval": "30s", + "timeout": "15s", + "max_output_length": 80, + "concurrency": 3, + "enable_message_change_based_condition_update": false + }, + "source": "amdgpu-custom-plugin-monitor", + "metricsReporting": true, + "conditions": [ + { + "type": "AMDGPUHardwareAssertionHwa", + "reason": "AMDGPUIsUp", + "message": "AMDGPU is up" + } + ], + "rules": [ + { + "type": "permanent", + "condition": "AMDGPUHardwareAssertionHwa", + "reason": "AMDGPU Hardware Assertion", + "path": "/var/lib/amd-metrics-exporter/amdgpuhealth", + "args": [ + "query", + "inband-ras-errors", + "-s=CPER_SEVERITY_FATAL", + "--afid=30", + "-t=-1" + ], + "timeout": "15s" + } + ] + } +kind: ConfigMap +metadata: + name: node-problem-detector-config + namespace: kube-system diff --git a/tests/e2e/yamls/config/npd/node-problem-detector-rbac.yaml b/tests/e2e/yamls/config/npd/node-problem-detector-rbac.yaml index 006eb4af..4ebda390 100644 --- a/tests/e2e/yamls/config/npd/node-problem-detector-rbac.yaml +++ b/tests/e2e/yamls/config/npd/node-problem-detector-rbac.yaml @@ -19,7 +19,7 @@ rules: - apiGroups: [""] resources: ["nodes/status"] verbs: ["patch"] -- nonResourceURLs: ["/metrics", "/gpumetrics"] +- nonResourceURLs: ["/metrics", "/gpumetrics", "/inbandraserrors"] verbs: ["get"] ---