ROCm · sajmera-pensando · Dec 17, 2025 · Dec 5, 2025 · Dec 16, 2025
diff --git a/api/v1alpha1/deviceconfig_types.go b/api/v1alpha1/deviceconfig_types.go
@@ -103,6 +103,11 @@ type RemediationWorkflowSpec struct {
 	// +optional
 	// +kubebuilder:validation:Pattern=`^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$`
 	TesterImage string `json:"testerImage,omitempty"`
+
+	// MaxParallelWorkflows specifies limit on how many remediation workflows can be executed in parallel. 0 is the default value and it means no limit.
+	//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="MaxParallelWorkflows",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:maxParallelWorkflows"}
+	// +optional
+	MaxParallelWorkflows int `json:"maxParallelWorkflows"`
 }
 
 type RegistryTLS struct {

diff --git a/api/v1alpha1/remediationwf_types.go b/api/v1alpha1/remediationwf_types.go
@@ -25,11 +25,14 @@ import (
 //+kubebuilder:subresource:status
 
 // RemediationWorkflowStatus keeps a record of recent remediation workflow runs.
+// We maintain this information to avoid re-running remediation workflows on nodes where a pre-defined threshold is crossed.
 // +operator-sdk:csv:customresourcedefinitions:displayName="RemediationWorkflowStatus",resources={{Module,v1beta1,modules.kmm.sigs.x-k8s.io},{Daemonset,v1,apps},{services,v1,core},{Pod,v1,core}}
 type RemediationWorkflowStatus struct {
 	metav1.TypeMeta   `json:",inline"`
 	metav1.ObjectMeta `json:"metadata,omitempty"`
 
+	// Status field holds remediation workflow run history for each node and node condition
+	// Key is node name. Value is a map with key as node condition and value as list of workflow metadata (workflow name and its start time)
 	Status map[string]map[string][]WorkflowMetadata `json:"status,omitempty"`
 }
 

diff --git a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
@@ -32,7 +32,7 @@ metadata:
     capabilities: Seamless Upgrades
     categories: AI/Machine Learning,Monitoring
     containerImage: docker.io/rocm/gpu-operator:v1.4.0
-    createdAt: "2025-11-03T10:08:51Z"
+    createdAt: "2025-12-09T09:27:50Z"
     description: |-
       Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter
       For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/)
@@ -718,6 +718,13 @@ spec:
         path: remediationWorkflow.enable
         x-descriptors:
         - urn:alm:descriptor:com.amd.deviceconfigs:enable
+      - description: MaxParallelWorkflows specifies limit on how many remediation
+          workflows can be executed in parallel. 0 is the default value and it means
+          no limit.
+        displayName: MaxParallelWorkflows
+        path: remediationWorkflow.maxParallelWorkflows
+        x-descriptors:
+        - urn:alm:descriptor:com.amd.deviceconfigs:maxParallelWorkflows
       - description: Tester image used to run tests and verify if remediation fixed
           the reported problem.
         displayName: TesterImage

diff --git a/bundle/manifests/amd.com_deviceconfigs.yaml b/bundle/manifests/amd.com_deviceconfigs.yaml
@@ -1394,6 +1394,11 @@ spec:
                       enable remediation workflows. disabled by default
                       enable if operator should automatically handle remediation of node incase of gpu issues
                     type: boolean
+                  maxParallelWorkflows:
+                    description: MaxParallelWorkflows specifies limit on how many
+                      remediation workflows can be executed in parallel. 0 is the
+                      default value and it means no limit.
+                    type: integer
                   testerImage:
                     description: Tester image used to run tests and verify if remediation
                       fixed the reported problem.

diff --git a/bundle/manifests/amd.com_remediationworkflowstatuses.yaml b/bundle/manifests/amd.com_remediationworkflowstatuses.yaml
@@ -23,8 +23,9 @@ spec:
   - name: v1alpha1
     schema:
       openAPIV3Schema:
-        description: RemediationWorkflowStatus keeps a record of recent remediation
-          workflow runs.
+        description: |-
+          RemediationWorkflowStatus keeps a record of recent remediation workflow runs.
+          We maintain this information to avoid re-running remediation workflows on nodes where a pre-defined threshold is crossed.
         properties:
           apiVersion:
             description: |-
@@ -55,6 +56,9 @@ spec:
                   type: object
                 type: array
               type: object
+            description: |-
+              Status field holds remediation workflow run history for each node and node condition
+              Key is node name. Value is a map with key as node condition and value as list of workflow metadata (workflow name and its start time)
             type: object
         type: object
     served: true

diff --git a/config/crd/bases/amd.com_deviceconfigs.yaml b/config/crd/bases/amd.com_deviceconfigs.yaml
@@ -1390,6 +1390,11 @@ spec:
                       enable remediation workflows. disabled by default
                       enable if operator should automatically handle remediation of node incase of gpu issues
                     type: boolean
+                  maxParallelWorkflows:
+                    description: MaxParallelWorkflows specifies limit on how many
+                      remediation workflows can be executed in parallel. 0 is the
+                      default value and it means no limit.
+                    type: integer
                   testerImage:
                     description: Tester image used to run tests and verify if remediation
                       fixed the reported problem.

diff --git a/config/crd/bases/amd.com_remediationworkflowstatuses.yaml b/config/crd/bases/amd.com_remediationworkflowstatuses.yaml
@@ -19,8 +19,9 @@ spec:
   - name: v1alpha1
     schema:
       openAPIV3Schema:
-        description: RemediationWorkflowStatus keeps a record of recent remediation
-          workflow runs.
+        description: |-
+          RemediationWorkflowStatus keeps a record of recent remediation workflow runs.
+          We maintain this information to avoid re-running remediation workflows on nodes where a pre-defined threshold is crossed.
         properties:
           apiVersion:
             description: |-
@@ -51,6 +52,9 @@ spec:
                   type: object
                 type: array
               type: object
+            description: |-
+              Status field holds remediation workflow run history for each node and node condition
+              Key is node name. Value is a map with key as node condition and value as list of workflow metadata (workflow name and its start time)
             type: object
         type: object
     served: true

diff --git a/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml b/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml
@@ -689,6 +689,13 @@ spec:
         path: remediationWorkflow.enable
         x-descriptors:
         - urn:alm:descriptor:com.amd.deviceconfigs:enable
+      - description: MaxParallelWorkflows specifies limit on how many remediation
+          workflows can be executed in parallel. 0 is the default value and it means
+          no limit.
+        displayName: MaxParallelWorkflows
+        path: remediationWorkflow.maxParallelWorkflows
+        x-descriptors:
+        - urn:alm:descriptor:com.amd.deviceconfigs:maxParallelWorkflows
       - description: Tester image used to run tests and verify if remediation fixed
           the reported problem.
         displayName: TesterImage

diff --git a/docs/autoremediation/auto-remediation.md b/docs/autoremediation/auto-remediation.md
@@ -9,24 +9,24 @@

 The source yaml to install it is present here: https://github.com/argoproj/argo-workflows/releases/download/v3.6.5/install.yaml

 It has been modified to fit the requirements of this feature. For example, the workflow server is not necessary, so it doesn't get deployed as part of the 
 GPU Operator-packaged argo installation

 ## About Workflows and Workflow Templates

 The workflow controller is responsible for running a workflow and managing its lifecycle. 

 Argo workflows by default uses Kubernetes API Server(etcd) as its database. Once a workflow is triggered, the controller maintains the running state of the workflow and persists in the database. In case workflow controller restarts in between, we still have the state.  

 A typical workflow refers a workflow template. A workflow template can either be used to define a specific work, or it can be used to orchestrate a workflow. Each task within a workflow is run inside a container.

 Creating a `workflow-template` on the cluster will store the template with its steps in k8s apiserver (etcd) but not trigger any action. 
 Creating a `workflow` which invokes a `workflow-template` will store the workflow in k8s apiserver(etcd) and also trigger the actual steps in the template. 
 GPU Operator creates the `workflow` which invokes the `workflow-template` to trigger remediation 

 ## Configuration to be handled by the User

 -> Toggling `RemediationWorkflow.Enable` to True. 

 -> NPD daemonset is relied upon to verify that the issue is fixed during the workflow run. Hence, user needs to add this toleration to NPD daemonset so that it can continue to be scheduled during the workflow run:

@@ -43,8 +43,8 @@

 ## How Workflows are triggered

 Node problem detector (NPD) can set the node conditions by listening to GPU health reported by device metrics exporter periodically. 
 GPU-Operator keeps monitoring the node conditions periodically and creates appropriate workflow based on the node condition status moving to `True`. For example, the below node condition would mean node is in a bad state: 

 ```yaml
  - lastHeartbeatTime: "2025-08-04T08:56:04Z"
@@ -54,7 +54,7 @@
    type: AMDGPUUnhealthy
 ```

 When the status of the node condition is `False`, it means that node condition is currently fine and in good state. 
 These are the new fields introduced under the RemediationWorkflow field in the DeviceConfig CR:

 ```yaml
@@ -73,7 +73,7 @@
        // +kubebuilder:default:=24
        TtlForFailedWorkflows int `json:"ttlForFailedWorkflows,omitempty"`
    }
 ``` 
 The mappings are present in the configmap referenced by the ConditionalWorkflows field. 
 GPU-Operator will create the `default-conditional-workflow-mappings` configmap on the cluster with some default mappings. The user can modify them if required and can add more mappings as well. If the user wants to use this default configmap, then they may leave the `RemediationWorkflow.ConditionalWorkflows` field empty in the CR. The user can also come up with their own configmap and mention the name of the configmap under `RemediationWorkflow.ConditionalWorkflows` if they do not want to use the default `default-conditional-workflow-mappings` configmap.

@@ -130,6 +130,16 @@
     enable: true
 ```
 
+You can limit the number of nodes undergoing remediation simultaneously by setting the `maxParallelWorkflows` field in the DeviceConfig custom resource. For example, to ensure that no more than 5 nodes undergo remediation at the same time, set the value to 5 (as shown below). The default value is 0, which means there is no upper limit on the number of workflows that can run in parallel.
+
+```yaml
+  remediationWorkflow:
+    enable: true
+    maxParallelWorkflows: 5
+```
+
+When the number of triggered workflows exceeds this limit, the excess workflows are queued by the Argo workflow controller and enter a **Pending** state. They remain in the queue until a running workflow finishes and a "slot" within the configured parallelism limit becomes available.
+
 ## Default Workflow Template
 
 Note: `default-template` will be created on the cluster by GPU-Operator 

diff --git a/hack/k8s-patch/metadata-patch/values.yaml b/hack/k8s-patch/metadata-patch/values.yaml
@@ -231,7 +231,10 @@ deviceConfig:
       # -- config manager tolerations
       configManagerTolerations: []
     remediationWorkflow:
+      # -- enable/disable remediation workflow controller
       enable: false
+      # -- Set maximum number of remediation workflows that can run in parallel. Default is 0 which means no limit
+      maxParallelWorkflows: 0
 # AMD GPU operator controller related configs
 controllerManager:
   manager:

diff --git a/hack/k8s-patch/template-patch/default-deviceconfig.yaml b/hack/k8s-patch/template-patch/default-deviceconfig.yaml
@@ -437,6 +437,10 @@ spec:
     {{- with .testerImage }}
     testerImage: {{ . }}
     {{- end }}
+
+    {{- with .maxParallelWorkflows }}
+    maxParallelWorkflows: {{ . }}
+    {{- end }}
   {{- end }}
 
 {{- end }}

diff --git a/helm-charts-k8s/Chart.lock b/helm-charts-k8s/Chart.lock
@@ -9,4 +9,4 @@ dependencies:
   repository: file://./charts/remediation
   version: v1.0.0
 digest: sha256:41fa6a6232514acebf6abdcb1bccaf087e134b9f413b8fa33a7fec1f58a99e07
-generated: "2025-11-03T10:08:37.655536804Z"
+generated: "2025-12-09T09:27:36.511662862Z"
diff --git a/helm-charts-k8s/README.md b/helm-charts-k8s/README.md
@@ -231,6 +231,8 @@ Kubernetes: `>= 1.29.0-0`
 | deviceConfig.spec.metricsExporter.tolerations | list | `[]` | metrics exporter tolerations |
 | deviceConfig.spec.metricsExporter.upgradePolicy.maxUnavailable | int | `1` | the maximum number of Pods that can be unavailable during the update process |
 | deviceConfig.spec.metricsExporter.upgradePolicy.upgradeStrategy | string | `"RollingUpdate"` | the type of daemonset upgrade, RollingUpdate or OnDelete |
+| deviceConfig.spec.remediationWorkflow.enable | bool | `false` | enable/disable remediation workflow controller |
+| deviceConfig.spec.remediationWorkflow.maxParallelWorkflows | int | `0` | Set maximum number of remediation workflows that can run in parallel. Default is 0 which means no limit |
 | deviceConfig.spec.selector | object | `{"feature.node.kubernetes.io/amd-gpu":"true"}` | Set node selector for the default DeviceConfig |
 | deviceConfig.spec.testRunner.config | object | `{}` | test runner config map, e.g. {"name": "myConfigMap"} |
 | deviceConfig.spec.testRunner.enable | bool | `false` | enable / disable test runner |

diff --git a/helm-charts-k8s/crds/deviceconfig-crd.yaml b/helm-charts-k8s/crds/deviceconfig-crd.yaml
@@ -1396,6 +1396,11 @@ spec:
                       enable remediation workflows. disabled by default
                       enable if operator should automatically handle remediation of node incase of gpu issues
                     type: boolean
+                  maxParallelWorkflows:
+                    description: MaxParallelWorkflows specifies limit on how many remediation
+                      workflows can be executed in parallel. 0 is the default value
+                      and it means no limit.
+                    type: integer
                   testerImage:
                     description: Tester image used to run tests and verify if remediation
                       fixed the reported problem.

diff --git a/helm-charts-k8s/crds/remediationworkflowstatus-crd.yaml b/helm-charts-k8s/crds/remediationworkflowstatus-crd.yaml
@@ -28,8 +28,9 @@ spec:
   - name: v1alpha1
     schema:
       openAPIV3Schema:
-        description: RemediationWorkflowStatus keeps a record of recent remediation
-          workflow runs.
+        description: |-
+          RemediationWorkflowStatus keeps a record of recent remediation workflow runs.
+          We maintain this information to avoid re-running remediation workflows on nodes where a pre-defined threshold is crossed.
         properties:
           apiVersion:
             description: |-
@@ -60,6 +61,9 @@ spec:
                   type: object
                 type: array
               type: object
+            description: |-
+              Status field holds remediation workflow run history for each node and node condition
+              Key is node name. Value is a map with key as node condition and value as list of workflow metadata (workflow name and its start time)
             type: object
         type: object
     served: true

diff --git a/helm-charts-k8s/templates/default-deviceconfig.yaml b/helm-charts-k8s/templates/default-deviceconfig.yaml
@@ -437,6 +437,10 @@ spec:
     {{- with .testerImage }}
     testerImage: {{ . }}
     {{- end }}
+
+    {{- with .maxParallelWorkflows }}
+    maxParallelWorkflows: {{ . }}
+    {{- end }}
   {{- end }}
 
 {{- end }}

diff --git a/helm-charts-k8s/values.yaml b/helm-charts-k8s/values.yaml
@@ -231,7 +231,10 @@ deviceConfig:
       # -- config manager tolerations
       configManagerTolerations: []
     remediationWorkflow:
+      # -- enable/disable remediation workflow controller
       enable: false
+      # -- Set maximum number of remediation workflows that can run in parallel. Default is 0 which means no limit
+      maxParallelWorkflows: 0
 # AMD GPU operator controller related configs
 controllerManager:
   manager:

diff --git a/helm-charts-openshift/Chart.lock b/helm-charts-openshift/Chart.lock
@@ -6,4 +6,4 @@ dependencies:
   repository: file://./charts/kmm
   version: v1.0.0
 digest: sha256:25200c34a5cc846a1275e5bf3fc637b19e909dc68de938189c5278d77d03f5ac
-generated: "2025-11-03T10:08:49.883010865Z"
+generated: "2025-12-09T09:27:48.895675076Z"
diff --git a/helm-charts-openshift/crds/deviceconfig-crd.yaml b/helm-charts-openshift/crds/deviceconfig-crd.yaml
@@ -1396,6 +1396,11 @@ spec:
                       enable remediation workflows. disabled by default
                       enable if operator should automatically handle remediation of node incase of gpu issues
                     type: boolean
+                  maxParallelWorkflows:
+                    description: MaxParallelWorkflows specifies limit on how many remediation
+                      workflows can be executed in parallel. 0 is the default value
+                      and it means no limit.
+                    type: integer
                   testerImage:
                     description: Tester image used to run tests and verify if remediation
                       fixed the reported problem.

diff --git a/helm-charts-openshift/crds/remediationworkflowstatus-crd.yaml b/helm-charts-openshift/crds/remediationworkflowstatus-crd.yaml
@@ -28,8 +28,9 @@ spec:
   - name: v1alpha1
     schema:
       openAPIV3Schema:
-        description: RemediationWorkflowStatus keeps a record of recent remediation
-          workflow runs.
+        description: |-
+          RemediationWorkflowStatus keeps a record of recent remediation workflow runs.
+          We maintain this information to avoid re-running remediation workflows on nodes where a pre-defined threshold is crossed.
         properties:
           apiVersion:
             description: |-
@@ -60,6 +61,9 @@ spec:
                   type: object
                 type: array
               type: object
+            description: |-
+              Status field holds remediation workflow run history for each node and node condition
+              Key is node name. Value is a map with key as node condition and value as list of workflow metadata (workflow name and its start time)
             type: object
         type: object
     served: true

diff --git a/internal/controllers/mock_remediation_handler.go b/internal/controllers/mock_remediation_handler.go
diff --git a/internal/controllers/remediation/scripts/notify.sh b/internal/controllers/remediation/scripts/notify.sh
@@ -17,11 +17,12 @@ involvedObject:
   kind: Node
   name: ${NODE_NAME}
   namespace: {{workflow.namespace}}
-message: ${NOTIFY_MESSAGE}
+message: '${NOTIFY_MESSAGE}'
 reason: AMDGPUUnhealthy
 reportingComponent: amd-gpu-node-remediation-workflow
 reportingInstance: amd-gpu-node-remediation-workflow
 source:
   component: {{workflow.name}}
   host: ${NODE_NAME}
-type: Warning
+type: Warning
+EOF