Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions api/v1alpha1/deviceconfig_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,11 @@ type RemediationWorkflowSpec struct {
// +optional
// +kubebuilder:validation:Pattern=`^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$`
TesterImage string `json:"testerImage,omitempty"`

// MaxParallelWorkflows specifies limit on how many remediation workflows can be executed in parallel. 0 is the default value and it means no limit.
//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="MaxParallelWorkflows",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:maxParallelWorkflows"}
// +optional
MaxParallelWorkflows int `json:"maxParallelWorkflows"`
}

type RegistryTLS struct {
Expand Down
3 changes: 3 additions & 0 deletions api/v1alpha1/remediationwf_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,14 @@ import (
//+kubebuilder:subresource:status

// RemediationWorkflowStatus keeps a record of recent remediation workflow runs.
// This history is maintained to avoid re-running remediation workflows on nodes where a pre-defined threshold is crossed.
// +operator-sdk:csv:customresourcedefinitions:displayName="RemediationWorkflowStatus",resources={{Module,v1beta1,modules.kmm.sigs.x-k8s.io},{Daemonset,v1,apps},{services,v1,core},{Pod,v1,core}}
type RemediationWorkflowStatus struct {
metav1.TypeMeta `json:",inline"`
metav1.ObjectMeta `json:"metadata,omitempty"`

// Status holds the remediation workflow run history for each node and node condition.
// The outer key is the node name. Its value is a map whose key is the node condition and
// whose value is the list of workflow metadata (workflow name and its start time).
Status map[string]map[string][]WorkflowMetadata `json:"status,omitempty"`
}

Expand Down
9 changes: 8 additions & 1 deletion bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ metadata:
capabilities: Seamless Upgrades
categories: AI/Machine Learning,Monitoring
containerImage: docker.io/rocm/gpu-operator:v1.4.0
createdAt: "2025-11-03T10:08:51Z"
createdAt: "2025-12-09T09:27:50Z"
description: |-
Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter
For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/)
Expand Down Expand Up @@ -718,6 +718,13 @@ spec:
path: remediationWorkflow.enable
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:enable
- description: MaxParallelWorkflows specifies limit on how many remediation
workflows can be executed in parallel. 0 is the default value and it means
no limit.
displayName: MaxParallelWorkflows
path: remediationWorkflow.maxParallelWorkflows
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:maxParallelWorkflows
- description: Tester image used to run tests and verify if remediation fixed
the reported problem.
displayName: TesterImage
Expand Down
5 changes: 5 additions & 0 deletions bundle/manifests/amd.com_deviceconfigs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1394,6 +1394,11 @@ spec:
enable remediation workflows. disabled by default
enable if operator should automatically handle remediation of node incase of gpu issues
type: boolean
maxParallelWorkflows:
description: MaxParallelWorkflows specifies limit on how many
remediation workflows can be executed in parallel. 0 is the
default value and it means no limit.
type: integer
testerImage:
description: Tester image used to run tests and verify if remediation
fixed the reported problem.
Expand Down
8 changes: 6 additions & 2 deletions bundle/manifests/amd.com_remediationworkflowstatuses.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@ spec:
- name: v1alpha1
schema:
openAPIV3Schema:
description: RemediationWorkflowStatus keeps a record of recent remediation
workflow runs.
description: |-
RemediationWorkflowStatus keeps a record of recent remediation workflow runs.
We maintain this information to avoid re-running remediation workflows on nodes where a pre-defined threshold is crossed.
properties:
apiVersion:
description: |-
Expand Down Expand Up @@ -55,6 +56,9 @@ spec:
type: object
type: array
type: object
description: |-
Status field holds remediation workflow run history for each node and node condition
Key is node name. Value is a map with key as node condition and value as list of workflow metadata(workflow name and it's start time)
type: object
type: object
served: true
Expand Down
5 changes: 5 additions & 0 deletions config/crd/bases/amd.com_deviceconfigs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1390,6 +1390,11 @@ spec:
enable remediation workflows. disabled by default
enable if operator should automatically handle remediation of node incase of gpu issues
type: boolean
maxParallelWorkflows:
description: MaxParallelWorkflows specifies limit on how many
remediation workflows can be executed in parallel. 0 is the
default value and it means no limit.
type: integer
testerImage:
description: Tester image used to run tests and verify if remediation
fixed the reported problem.
Expand Down
8 changes: 6 additions & 2 deletions config/crd/bases/amd.com_remediationworkflowstatuses.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,9 @@ spec:
- name: v1alpha1
schema:
openAPIV3Schema:
description: RemediationWorkflowStatus keeps a record of recent remediation
workflow runs.
description: |-
RemediationWorkflowStatus keeps a record of recent remediation workflow runs.
We maintain this information to avoid re-running remediation workflows on nodes where a pre-defined threshold is crossed.
properties:
apiVersion:
description: |-
Expand Down Expand Up @@ -51,6 +52,9 @@ spec:
type: object
type: array
type: object
description: |-
Status field holds remediation workflow run history for each node and node condition
Key is node name. Value is a map with key as node condition and value as list of workflow metadata(workflow name and it's start time)
type: object
type: object
served: true
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -689,6 +689,13 @@ spec:
path: remediationWorkflow.enable
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:enable
- description: MaxParallelWorkflows specifies limit on how many remediation
workflows can be executed in parallel. 0 is the default value and it means
no limit.
displayName: MaxParallelWorkflows
path: remediationWorkflow.maxParallelWorkflows
x-descriptors:
- urn:alm:descriptor:com.amd.deviceconfigs:maxParallelWorkflows
- description: Tester image used to run tests and verify if remediation fixed
the reported problem.
displayName: TesterImage
Expand Down
10 changes: 10 additions & 0 deletions docs/autoremediation/auto-remediation.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,24 +9,24 @@

The source yaml to install it is present here: https://github.com/argoproj/argo-workflows/releases/download/v3.6.5/install.yaml

It has been modified to fit the requirements of this feature. For example, the workflow server is not necessary, so it doesn't get deployed as part of the

Check failure on line 12 in docs/autoremediation/auto-remediation.md

View workflow job for this annotation

GitHub Actions / Documentation / Markdown

Trailing spaces

docs/autoremediation/auto-remediation.md:12:155 MD009/no-trailing-spaces Trailing spaces [Expected: 0 or 2; Actual: 1] https://github.com/DavidAnson/markdownlint/blob/v0.32.1/doc/md009.md
GPU Operator-packaged argo installation

## About Workflows and Workflow Templates

The workflow controller is responsible for running a workflow and managing its lifecycle.

Check failure on line 17 in docs/autoremediation/auto-remediation.md

View workflow job for this annotation

GitHub Actions / Documentation / Markdown

Trailing spaces

docs/autoremediation/auto-remediation.md:17:90 MD009/no-trailing-spaces Trailing spaces [Expected: 0 or 2; Actual: 1] https://github.com/DavidAnson/markdownlint/blob/v0.32.1/doc/md009.md

Argo Workflows uses the Kubernetes API server (etcd) as its database by default. Once a workflow is triggered, the controller maintains the running state of the workflow and persists it in the database. If the workflow controller restarts in the middle of a run, the state is still available.

A typical workflow refers to a workflow template. A workflow template can either be used to define a specific piece of work, or it can be used to orchestrate a workflow. Each task within a workflow is run inside a container.

Creating a `workflow-template` on the cluster will store the template with its steps in k8s apiserver (etcd) but not trigger any action.

Check failure on line 23 in docs/autoremediation/auto-remediation.md

View workflow job for this annotation

GitHub Actions / Documentation / Markdown

Trailing spaces

docs/autoremediation/auto-remediation.md:23:137 MD009/no-trailing-spaces Trailing spaces [Expected: 0 or 2; Actual: 1] https://github.com/DavidAnson/markdownlint/blob/v0.32.1/doc/md009.md
Creating a `workflow` which invokes a `workflow-template` will store the workflow in k8s apiserver(etcd) and also trigger the actual steps in the template.

Check failure on line 24 in docs/autoremediation/auto-remediation.md

View workflow job for this annotation

GitHub Actions / Documentation / Markdown

Trailing spaces

docs/autoremediation/auto-remediation.md:24:156 MD009/no-trailing-spaces Trailing spaces [Expected: 0 or 2; Actual: 1] https://github.com/DavidAnson/markdownlint/blob/v0.32.1/doc/md009.md
GPU Operator creates the `workflow` which invokes the `workflow-template` to trigger remediation

Check failure on line 25 in docs/autoremediation/auto-remediation.md

View workflow job for this annotation

GitHub Actions / Documentation / Markdown

Trailing spaces

docs/autoremediation/auto-remediation.md:25:97 MD009/no-trailing-spaces Trailing spaces [Expected: 0 or 2; Actual: 1] https://github.com/DavidAnson/markdownlint/blob/v0.32.1/doc/md009.md

## Configuration to be handled by the User

-> Toggling `RemediationWorkflow.Enable` to True.

Check failure on line 29 in docs/autoremediation/auto-remediation.md

View workflow job for this annotation

GitHub Actions / Documentation / Markdown

Trailing spaces

docs/autoremediation/auto-remediation.md:29:50 MD009/no-trailing-spaces Trailing spaces [Expected: 0 or 2; Actual: 1] https://github.com/DavidAnson/markdownlint/blob/v0.32.1/doc/md009.md

-> NPD daemonset is relied upon to verify that the issue is fixed during the workflow run. Hence, user needs to add this toleration to NPD daemonset so that it can continue to be scheduled during the workflow run:

Expand All @@ -43,8 +43,8 @@

## How Workflows are triggered

Node problem detector (NPD) can set the node conditions by listening to GPU health reported by device metrics exporter periodically.

Check failure on line 46 in docs/autoremediation/auto-remediation.md

View workflow job for this annotation

GitHub Actions / Documentation / Markdown

Trailing spaces

docs/autoremediation/auto-remediation.md:46:133 MD009/no-trailing-spaces Trailing spaces [Expected: 0 or 2; Actual: 1] https://github.com/DavidAnson/markdownlint/blob/v0.32.1/doc/md009.md
GPU-Operator keeps monitoring the node conditions periodically and creates appropriate workflow based on the node condition status moving to `True`. For example, the below node condition would mean node is in a bad state:

Check failure on line 47 in docs/autoremediation/auto-remediation.md

View workflow job for this annotation

GitHub Actions / Documentation / Markdown

Trailing spaces

docs/autoremediation/auto-remediation.md:47:222 MD009/no-trailing-spaces Trailing spaces [Expected: 0 or 2; Actual: 1] https://github.com/DavidAnson/markdownlint/blob/v0.32.1/doc/md009.md

```yaml
- lastHeartbeatTime: "2025-08-04T08:56:04Z"
Expand All @@ -54,7 +54,7 @@
type: AMDGPUUnhealthy
```

When the status of the node condition is `False`, it means that node condition is currently fine and in good state.

Check failure on line 57 in docs/autoremediation/auto-remediation.md

View workflow job for this annotation

GitHub Actions / Documentation / Markdown

Trailing spaces

docs/autoremediation/auto-remediation.md:57:116 MD009/no-trailing-spaces Trailing spaces [Expected: 0 or 2; Actual: 1] https://github.com/DavidAnson/markdownlint/blob/v0.32.1/doc/md009.md
These are the new fields introduced under the RemediationWorkflow field in the DeviceConfig CR:

```yaml
Expand All @@ -73,7 +73,7 @@
// +kubebuilder:default:=24
TtlForFailedWorkflows int `json:"ttlForFailedWorkflows,omitempty"`
}
```

Check failure on line 76 in docs/autoremediation/auto-remediation.md

View workflow job for this annotation

GitHub Actions / Documentation / Markdown

Trailing spaces

docs/autoremediation/auto-remediation.md:76:4 MD009/no-trailing-spaces Trailing spaces [Expected: 0 or 2; Actual: 1] https://github.com/DavidAnson/markdownlint/blob/v0.32.1/doc/md009.md
The mappings are present in the configmap referenced by the ConditionalWorkflows field.
GPU-Operator will create the `default-conditional-workflow-mappings` configmap on the cluster with some default mappings. The user can modify them if required and can add more mappings as well. If the user wants to use this default configmap, then they may leave the `RemediationWorkflow.ConditionalWorkflows` field empty in the CR. The user can also come up with their own configmap and mention the name of the configmap under `RemediationWorkflow.ConditionalWorkflows` if they do not want to use the default `default-conditional-workflow-mappings` configmap.

Expand Down Expand Up @@ -130,6 +130,16 @@
enable: true
```

You can limit the number of nodes undergoing remediation simultaneously by setting the `maxParallelWorkflows` field in the Device Config custom resource. For example, to ensure no more than 5 nodes undergo remediation at the same time, configure the value as 5 (as shown below). The default value is zero, which means there is no upper limit on the number of workflows that can run in parallel.

```yaml
remediationWorkflow:
enable: true
maxParallelWorkflows: 5
```

When more workflows are triggered beyond the above workflow parallelism limit, the excess workflows are queued by the Argo workflow controller and enter a **Pending** state. They will remain in the queue until a running workflow finishes and a "slot" within the configured parallelism limit becomes available.

## Default Workflow Template

Note: `default-template` will be created on the cluster by GPU-Operator
Expand Down
3 changes: 3 additions & 0 deletions hack/k8s-patch/metadata-patch/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,10 @@ deviceConfig:
# -- config manager tolerations
configManagerTolerations: []
remediationWorkflow:
# -- enable/disable remediation workflow controller
enable: false
# -- Set maximum number of remediation workflows that can run in parallel. Default is 0 which means no limit
maxParallelWorkflows: 0
# AMD GPU operator controller related configs
controllerManager:
manager:
Expand Down
4 changes: 4 additions & 0 deletions hack/k8s-patch/template-patch/default-deviceconfig.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,10 @@ spec:
{{- with .testerImage }}
testerImage: {{ . }}
{{- end }}

{{- with .maxParallelWorkflows }}
maxParallelWorkflows: {{ . }}
{{- end }}
{{- end }}

{{- end }}
Expand Down
2 changes: 1 addition & 1 deletion helm-charts-k8s/Chart.lock
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@ dependencies:
repository: file://./charts/remediation
version: v1.0.0
digest: sha256:41fa6a6232514acebf6abdcb1bccaf087e134b9f413b8fa33a7fec1f58a99e07
generated: "2025-11-03T10:08:37.655536804Z"
generated: "2025-12-09T09:27:36.511662862Z"
2 changes: 2 additions & 0 deletions helm-charts-k8s/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,8 @@ Kubernetes: `>= 1.29.0-0`
| deviceConfig.spec.metricsExporter.tolerations | list | `[]` | metrics exporter tolerations |
| deviceConfig.spec.metricsExporter.upgradePolicy.maxUnavailable | int | `1` | the maximum number of Pods that can be unavailable during the update process |
| deviceConfig.spec.metricsExporter.upgradePolicy.upgradeStrategy | string | `"RollingUpdate"` | the type of daemonset upgrade, RollingUpdate or OnDelete |
| deviceConfig.spec.remediationWorkflow.enable | bool | `false` | enable/disable remediation workflow controller |
| deviceConfig.spec.remediationWorkflow.maxParallelWorkflows | int | `0` | Set maximum number of remediation workflows that can run in parallel. Default is 0 which means no limit |
| deviceConfig.spec.selector | object | `{"feature.node.kubernetes.io/amd-gpu":"true"}` | Set node selector for the default DeviceConfig |
| deviceConfig.spec.testRunner.config | object | `{}` | test runner config map, e.g. {"name": "myConfigMap"} |
| deviceConfig.spec.testRunner.enable | bool | `false` | enable / disable test runner |
Expand Down
5 changes: 5 additions & 0 deletions helm-charts-k8s/crds/deviceconfig-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1396,6 +1396,11 @@ spec:
enable remediation workflows. disabled by default
enable if operator should automatically handle remediation of node incase of gpu issues
type: boolean
maxParallelWorkflows:
description: MaxParallelWorkflows specifies limit on how many remediation
workflows can be executed in parallel. 0 is the default value
and it means no limit.
type: integer
testerImage:
description: Tester image used to run tests and verify if remediation
fixed the reported problem.
Expand Down
8 changes: 6 additions & 2 deletions helm-charts-k8s/crds/remediationworkflowstatus-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,9 @@ spec:
- name: v1alpha1
schema:
openAPIV3Schema:
description: RemediationWorkflowStatus keeps a record of recent remediation
workflow runs.
description: |-
RemediationWorkflowStatus keeps a record of recent remediation workflow runs.
We maintain this information to avoid re-running remediation workflows on nodes where a pre-defined threshold is crossed.
properties:
apiVersion:
description: |-
Expand Down Expand Up @@ -60,6 +61,9 @@ spec:
type: object
type: array
type: object
description: |-
Status field holds remediation workflow run history for each node and node condition
Key is node name. Value is a map with key as node condition and value as list of workflow metadata(workflow name and it's start time)
type: object
type: object
served: true
Expand Down
4 changes: 4 additions & 0 deletions helm-charts-k8s/templates/default-deviceconfig.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,10 @@ spec:
{{- with .testerImage }}
testerImage: {{ . }}
{{- end }}

{{- with .maxParallelWorkflows }}
maxParallelWorkflows: {{ . }}
{{- end }}
{{- end }}

{{- end }}
Expand Down
3 changes: 3 additions & 0 deletions helm-charts-k8s/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,10 @@ deviceConfig:
# -- config manager tolerations
configManagerTolerations: []
remediationWorkflow:
# -- enable/disable remediation workflow controller
enable: false
# -- Set maximum number of remediation workflows that can run in parallel. Default is 0 which means no limit
maxParallelWorkflows: 0
# AMD GPU operator controller related configs
controllerManager:
manager:
Expand Down
2 changes: 1 addition & 1 deletion helm-charts-openshift/Chart.lock
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ dependencies:
repository: file://./charts/kmm
version: v1.0.0
digest: sha256:25200c34a5cc846a1275e5bf3fc637b19e909dc68de938189c5278d77d03f5ac
generated: "2025-11-03T10:08:49.883010865Z"
generated: "2025-12-09T09:27:48.895675076Z"
5 changes: 5 additions & 0 deletions helm-charts-openshift/crds/deviceconfig-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1396,6 +1396,11 @@ spec:
enable remediation workflows. disabled by default
enable if operator should automatically handle remediation of node incase of gpu issues
type: boolean
maxParallelWorkflows:
description: MaxParallelWorkflows specifies limit on how many remediation
workflows can be executed in parallel. 0 is the default value
and it means no limit.
type: integer
testerImage:
description: Tester image used to run tests and verify if remediation
fixed the reported problem.
Expand Down
8 changes: 6 additions & 2 deletions helm-charts-openshift/crds/remediationworkflowstatus-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,9 @@ spec:
- name: v1alpha1
schema:
openAPIV3Schema:
description: RemediationWorkflowStatus keeps a record of recent remediation
workflow runs.
description: |-
RemediationWorkflowStatus keeps a record of recent remediation workflow runs.
We maintain this information to avoid re-running remediation workflows on nodes where a pre-defined threshold is crossed.
properties:
apiVersion:
description: |-
Expand Down Expand Up @@ -60,6 +61,9 @@ spec:
type: object
type: array
type: object
description: |-
Status field holds remediation workflow run history for each node and node condition
Key is node name. Value is a map with key as node condition and value as list of workflow metadata(workflow name and it's start time)
type: object
type: object
served: true
Expand Down
19 changes: 16 additions & 3 deletions internal/controllers/mock_remediation_handler.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 3 additions & 2 deletions internal/controllers/remediation/scripts/notify.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,12 @@ involvedObject:
kind: Node
name: ${NODE_NAME}
namespace: {{workflow.namespace}}
message: ${NOTIFY_MESSAGE}
message: '${NOTIFY_MESSAGE}'
reason: AMDGPUUnhealthy
reportingComponent: amd-gpu-node-remediation-workflow
reportingInstance: amd-gpu-node-remediation-workflow
source:
component: {{workflow.name}}
host: ${NODE_NAME}
type: Warning
type: Warning
EOF
Loading
Loading