diff --git a/api/v1alpha1/deviceconfig_types.go b/api/v1alpha1/deviceconfig_types.go
index 3b6c6ea6..c4f2beaa 100644
--- a/api/v1alpha1/deviceconfig_types.go
+++ b/api/v1alpha1/deviceconfig_types.go
@@ -103,6 +103,12 @@ type RemediationWorkflowSpec struct {
 	// +optional
 	// +kubebuilder:validation:Pattern=`^([a-z0-9]+(?:[._-][a-z0-9]+)*(:[0-9]+)?)(/[a-z0-9]+(?:[._-][a-z0-9]+)*)*(?::[a-z0-9._-]+)?(?:@[a-zA-Z0-9]+:[a-f0-9]+)?$`
 	TesterImage string `json:"testerImage,omitempty"`
+
+	// MaxParallelWorkflows specifies the limit on how many remediation workflows can be executed in parallel. The default value is 0, which means no limit.
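+	// The configured value is applied to the Argo workflow controller's "parallelism" setting by the operator.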
+	//+operator-sdk:csv:customresourcedefinitions:type=spec,displayName="MaxParallelWorkflows",xDescriptors={"urn:alm:descriptor:com.amd.deviceconfigs:maxParallelWorkflows"}
+	// +optional
+	MaxParallelWorkflows int `json:"maxParallelWorkflows"`
 }
 
 type RegistryTLS struct {
diff --git a/api/v1alpha1/remediationwf_types.go b/api/v1alpha1/remediationwf_types.go
index 36234ca2..528f135a 100644
--- a/api/v1alpha1/remediationwf_types.go
+++ b/api/v1alpha1/remediationwf_types.go
@@ -25,11 +25,14 @@ import (
 
 //+kubebuilder:subresource:status
 
 // RemediationWorkflowStatus keeps a record of recent remediation workflow runs.
+// We maintain this information to avoid re-running remediation workflows on nodes where a pre-defined threshold is crossed.
 // +operator-sdk:csv:customresourcedefinitions:displayName="RemediationWorkflowStatus",resources={{Module,v1beta1,modules.kmm.sigs.x-k8s.io},{Daemonset,v1,apps},{services,v1,core},{Pod,v1,core}}
 type RemediationWorkflowStatus struct {
 	metav1.TypeMeta   `json:",inline"`
 	metav1.ObjectMeta `json:"metadata,omitempty"`
 
+	// Status field holds the remediation workflow run history for each node and node condition.
+	// The key is the node name; the value is a map keyed by node condition, whose value is a list of workflow metadata (workflow name and its start time).
 	Status map[string]map[string][]WorkflowMetadata `json:"status,omitempty"`
 }
diff --git a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
index 94294cea..bc7e0a73 100644
--- a/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
+++ b/bundle/manifests/amd-gpu-operator.clusterserviceversion.yaml
@@ -32,7 +32,7 @@ metadata:
     capabilities: Seamless Upgrades
     categories: AI/Machine Learning,Monitoring
     containerImage: docker.io/rocm/gpu-operator:v1.4.0
-    createdAt: "2025-11-03T10:08:51Z"
+    createdAt: "2025-12-09T09:27:50Z"
     description: |-
       Operator responsible for deploying AMD GPU kernel drivers, device plugin, device test runner and device metrics exporter
       For more information, visit [documentation](https://instinct.docs.amd.com/projects/gpu-operator/en/latest/)
@@ -718,6 +718,13 @@ spec:
         path: remediationWorkflow.enable
         x-descriptors:
         - urn:alm:descriptor:com.amd.deviceconfigs:enable
+      - description: MaxParallelWorkflows specifies the limit on how many remediation
+          workflows can be executed in parallel. The default value is 0, which
+          means no limit.
+        displayName: MaxParallelWorkflows
+        path: remediationWorkflow.maxParallelWorkflows
+        x-descriptors:
+        - urn:alm:descriptor:com.amd.deviceconfigs:maxParallelWorkflows
       - description: Tester image used to run tests and verify if remediation fixed
           the reported problem.
         displayName: TesterImage
diff --git a/bundle/manifests/amd.com_deviceconfigs.yaml b/bundle/manifests/amd.com_deviceconfigs.yaml
index d4941f3b..ae180f96 100644
--- a/bundle/manifests/amd.com_deviceconfigs.yaml
+++ b/bundle/manifests/amd.com_deviceconfigs.yaml
@@ -1394,6 +1394,11 @@ spec:
                     enable remediation workflows. disabled by default enable if operator
                     should automatically handle remediation of node incase of gpu issues
                   type: boolean
+                maxParallelWorkflows:
+                  description: MaxParallelWorkflows specifies the limit on how many
+                    remediation workflows can be executed in parallel. The default
+                    value is 0, which means no limit.
+                  type: integer
                 testerImage:
                   description: Tester image used to run tests and verify if remediation
                     fixed the reported problem.
diff --git a/bundle/manifests/amd.com_remediationworkflowstatuses.yaml b/bundle/manifests/amd.com_remediationworkflowstatuses.yaml
index 7becfc99..a3678a9c 100644
--- a/bundle/manifests/amd.com_remediationworkflowstatuses.yaml
+++ b/bundle/manifests/amd.com_remediationworkflowstatuses.yaml
@@ -23,8 +23,9 @@ spec:
   - name: v1alpha1
     schema:
       openAPIV3Schema:
-        description: RemediationWorkflowStatus keeps a record of recent remediation
-          workflow runs.
+        description: |-
+          RemediationWorkflowStatus keeps a record of recent remediation workflow runs.
+          We maintain this information to avoid re-running remediation workflows on nodes where a pre-defined threshold is crossed.
         properties:
           apiVersion:
             description: |-
@@ -55,6 +56,9 @@ spec:
                   type: object
                 type: array
               type: object
+            description: |-
+              Status field holds the remediation workflow run history for each node and node condition.
+              The key is the node name; the value is a map keyed by node condition, whose value is a list of workflow metadata (workflow name and its start time).
             type: object
         type: object
     served: true
diff --git a/config/crd/bases/amd.com_deviceconfigs.yaml b/config/crd/bases/amd.com_deviceconfigs.yaml
index c4bf1868..80e1beb1 100644
--- a/config/crd/bases/amd.com_deviceconfigs.yaml
+++ b/config/crd/bases/amd.com_deviceconfigs.yaml
@@ -1390,6 +1390,11 @@ spec:
                     enable remediation workflows. disabled by default enable if operator
                     should automatically handle remediation of node incase of gpu issues
                   type: boolean
+                maxParallelWorkflows:
+                  description: MaxParallelWorkflows specifies the limit on how many
+                    remediation workflows can be executed in parallel. The default
+                    value is 0, which means no limit.
+                  type: integer
                 testerImage:
                   description: Tester image used to run tests and verify if remediation
                     fixed the reported problem.
diff --git a/config/crd/bases/amd.com_remediationworkflowstatuses.yaml b/config/crd/bases/amd.com_remediationworkflowstatuses.yaml
index 651af2d9..5708c17a 100644
--- a/config/crd/bases/amd.com_remediationworkflowstatuses.yaml
+++ b/config/crd/bases/amd.com_remediationworkflowstatuses.yaml
@@ -19,8 +19,9 @@ spec:
   - name: v1alpha1
     schema:
       openAPIV3Schema:
-        description: RemediationWorkflowStatus keeps a record of recent remediation
-          workflow runs.
+        description: |-
+          RemediationWorkflowStatus keeps a record of recent remediation workflow runs.
+          We maintain this information to avoid re-running remediation workflows on nodes where a pre-defined threshold is crossed.
         properties:
           apiVersion:
             description: |-
@@ -51,6 +52,9 @@ spec:
                   type: object
                 type: array
               type: object
+            description: |-
+              Status field holds the remediation workflow run history for each node and node condition.
+              The key is the node name; the value is a map keyed by node condition, whose value is a list of workflow metadata (workflow name and its start time).
             type: object
         type: object
     served: true
diff --git a/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml b/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml
index a189e364..5a4d28ff 100644
--- a/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml
+++ b/config/manifests/bases/amd-gpu-operator.clusterserviceversion.yaml
@@ -689,6 +689,13 @@ spec:
         path: remediationWorkflow.enable
         x-descriptors:
         - urn:alm:descriptor:com.amd.deviceconfigs:enable
+      - description: MaxParallelWorkflows specifies the limit on how many remediation
+          workflows can be executed in parallel. The default value is 0, which
+          means no limit.
+        displayName: MaxParallelWorkflows
+        path: remediationWorkflow.maxParallelWorkflows
+        x-descriptors:
+        - urn:alm:descriptor:com.amd.deviceconfigs:maxParallelWorkflows
       - description: Tester image used to run tests and verify if remediation fixed
           the reported problem.
         displayName: TesterImage
diff --git a/docs/autoremediation/auto-remediation.md b/docs/autoremediation/auto-remediation.md
index 3b5d9a5d..a619d713 100644
--- a/docs/autoremediation/auto-remediation.md
+++ b/docs/autoremediation/auto-remediation.md
@@ -130,6 +130,23 @@ The most common CR users will be using will be of this form which will use the
     enable: true
 ```
 
+You can limit the number of nodes undergoing remediation simultaneously by setting the `maxParallelWorkflows` field in the Device Config custom resource. For example, to ensure no more than 5 nodes undergo remediation at the same time, set the value to 5 (as shown below). The default value is zero, which means there is no upper limit on the number of parallel workflows that can run simultaneously.
+
+```yaml
+  remediationWorkflow:
+    enable: true
+    maxParallelWorkflows: 5
+```
+
+When more workflows are triggered than the parallelism limit allows, the excess workflows are queued by the Argo workflow controller and enter a **Pending** state. They remain in the queue until a running workflow finishes and a slot within the configured parallelism limit becomes available.
+
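+To see which workflows are running and which are queued, you can list them and check their phase. A minimal sketch, assuming the operator and its workflows live in the `kube-amd-gpu` namespace:
+
+```bash
+# Workflows beyond the parallelism limit report a Pending phase until a slot frees up
+kubectl get workflows.argoproj.io -n kube-amd-gpu
+```
+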
 ## Default Workflow Template
 
 Note: `default-template` will be created on the cluster by GPU-Operator
diff --git a/hack/k8s-patch/metadata-patch/values.yaml b/hack/k8s-patch/metadata-patch/values.yaml
index b924972a..b08b2f32 100644
--- a/hack/k8s-patch/metadata-patch/values.yaml
+++ b/hack/k8s-patch/metadata-patch/values.yaml
@@ -231,7 +231,10 @@ deviceConfig:
     # -- config manager tolerations
     configManagerTolerations: []
   remediationWorkflow:
+    # -- enable/disable the remediation workflow controller
     enable: false
+    # -- Set the maximum number of remediation workflows that can run in parallel. The default is 0, which means no limit
+    maxParallelWorkflows: 0
 # AMD GPU operator controller related configs
 controllerManager:
   manager:
diff --git a/hack/k8s-patch/template-patch/default-deviceconfig.yaml b/hack/k8s-patch/template-patch/default-deviceconfig.yaml
index 3c40cff8..32bdbc5a 100644
--- a/hack/k8s-patch/template-patch/default-deviceconfig.yaml
+++ b/hack/k8s-patch/template-patch/default-deviceconfig.yaml
@@ -437,6 +437,10 @@ spec:
       {{- with .testerImage }}
       testerImage: {{ . }}
       {{- end }}
+
+      {{- with .maxParallelWorkflows }}
+      maxParallelWorkflows: {{ . }}
+      {{- end }}
     {{- end }}
 {{- end }}
diff --git a/helm-charts-k8s/Chart.lock b/helm-charts-k8s/Chart.lock
index bb567af6..058c7ecf 100644
--- a/helm-charts-k8s/Chart.lock
+++ b/helm-charts-k8s/Chart.lock
@@ -9,4 +9,4 @@ dependencies:
   repository: file://./charts/remediation
   version: v1.0.0
 digest: sha256:41fa6a6232514acebf6abdcb1bccaf087e134b9f413b8fa33a7fec1f58a99e07
-generated: "2025-11-03T10:08:37.655536804Z"
+generated: "2025-12-09T09:27:36.511662862Z"
diff --git a/helm-charts-k8s/README.md b/helm-charts-k8s/README.md
index 656de6e0..203d9ef4 100644
--- a/helm-charts-k8s/README.md
+++ b/helm-charts-k8s/README.md
@@ -231,6 +231,8 @@ Kubernetes: `>= 1.29.0-0`
 | deviceConfig.spec.metricsExporter.tolerations | list | `[]` | metrics exporter tolerations |
 | deviceConfig.spec.metricsExporter.upgradePolicy.maxUnavailable | int | `1` | the maximum number of Pods that can be unavailable during the update process |
 | deviceConfig.spec.metricsExporter.upgradePolicy.upgradeStrategy | string | `"RollingUpdate"` | the type of daemonset upgrade, RollingUpdate or OnDelete |
+| deviceConfig.spec.remediationWorkflow.enable | bool | `false` | enable/disable the remediation workflow controller |
+| deviceConfig.spec.remediationWorkflow.maxParallelWorkflows | int | `0` | Set the maximum number of remediation workflows that can run in parallel. The default is 0, which means no limit |
 | deviceConfig.spec.selector | object | `{"feature.node.kubernetes.io/amd-gpu":"true"}` | Set node selector for the default DeviceConfig |
 | deviceConfig.spec.testRunner.config | object | `{}` | test runner config map, e.g. {"name": "myConfigMap"} |
 | deviceConfig.spec.testRunner.enable | bool | `false` | enable / disable test runner |
diff --git a/helm-charts-k8s/crds/deviceconfig-crd.yaml b/helm-charts-k8s/crds/deviceconfig-crd.yaml
index 752cf141..6061e2c9 100644
--- a/helm-charts-k8s/crds/deviceconfig-crd.yaml
+++ b/helm-charts-k8s/crds/deviceconfig-crd.yaml
@@ -1396,6 +1396,11 @@ spec:
                     enable remediation workflows. disabled by default enable if operator
                     should automatically handle remediation of node incase of gpu issues
                   type: boolean
+                maxParallelWorkflows:
+                  description: MaxParallelWorkflows specifies the limit on how many
+                    remediation workflows can be executed in parallel. The default
+                    value is 0, which means no limit.
+                  type: integer
                 testerImage:
                   description: Tester image used to run tests and verify if remediation
                     fixed the reported problem.
diff --git a/helm-charts-k8s/crds/remediationworkflowstatus-crd.yaml b/helm-charts-k8s/crds/remediationworkflowstatus-crd.yaml
index aa5c0ac0..011e3ad0 100644
--- a/helm-charts-k8s/crds/remediationworkflowstatus-crd.yaml
+++ b/helm-charts-k8s/crds/remediationworkflowstatus-crd.yaml
@@ -28,8 +28,9 @@ spec:
   - name: v1alpha1
     schema:
       openAPIV3Schema:
-        description: RemediationWorkflowStatus keeps a record of recent remediation
-          workflow runs.
+        description: |-
+          RemediationWorkflowStatus keeps a record of recent remediation workflow runs.
+          We maintain this information to avoid re-running remediation workflows on nodes where a pre-defined threshold is crossed.
         properties:
           apiVersion:
             description: |-
@@ -60,6 +61,9 @@ spec:
                   type: object
                 type: array
               type: object
+            description: |-
+              Status field holds the remediation workflow run history for each node and node condition.
+              The key is the node name; the value is a map keyed by node condition, whose value is a list of workflow metadata (workflow name and its start time).
             type: object
         type: object
     served: true
diff --git a/helm-charts-k8s/templates/default-deviceconfig.yaml b/helm-charts-k8s/templates/default-deviceconfig.yaml
index 3c40cff8..32bdbc5a 100644
--- a/helm-charts-k8s/templates/default-deviceconfig.yaml
+++ b/helm-charts-k8s/templates/default-deviceconfig.yaml
@@ -437,6 +437,10 @@ spec:
       {{- with .testerImage }}
       testerImage: {{ . }}
      {{- end }}
+
+      {{- with .maxParallelWorkflows }}
+      maxParallelWorkflows: {{ . }}
+      {{- end }}
     {{- end }}
 {{- end }}
diff --git a/helm-charts-k8s/values.yaml b/helm-charts-k8s/values.yaml
index b924972a..b08b2f32 100644
--- a/helm-charts-k8s/values.yaml
+++ b/helm-charts-k8s/values.yaml
@@ -231,7 +231,10 @@ deviceConfig:
     # -- config manager tolerations
     configManagerTolerations: []
   remediationWorkflow:
+    # -- enable/disable the remediation workflow controller
     enable: false
+    # -- Set the maximum number of remediation workflows that can run in parallel. The default is 0, which means no limit
+    maxParallelWorkflows: 0
 # AMD GPU operator controller related configs
 controllerManager:
   manager:
diff --git a/helm-charts-openshift/Chart.lock b/helm-charts-openshift/Chart.lock
index 2310380d..d9d69628 100644
--- a/helm-charts-openshift/Chart.lock
+++ b/helm-charts-openshift/Chart.lock
@@ -6,4 +6,4 @@ dependencies:
   repository: file://./charts/kmm
   version: v1.0.0
 digest: sha256:25200c34a5cc846a1275e5bf3fc637b19e909dc68de938189c5278d77d03f5ac
-generated: "2025-11-03T10:08:49.883010865Z"
+generated: "2025-12-09T09:27:48.895675076Z"
diff --git a/helm-charts-openshift/crds/deviceconfig-crd.yaml b/helm-charts-openshift/crds/deviceconfig-crd.yaml
index 752cf141..6061e2c9 100644
--- a/helm-charts-openshift/crds/deviceconfig-crd.yaml
+++ b/helm-charts-openshift/crds/deviceconfig-crd.yaml
@@ -1396,6 +1396,11 @@ spec:
                     enable remediation workflows. disabled by default enable if operator
                     should automatically handle remediation of node incase of gpu issues
                   type: boolean
+                maxParallelWorkflows:
+                  description: MaxParallelWorkflows specifies the limit on how many
+                    remediation workflows can be executed in parallel. The default
+                    value is 0, which means no limit.
+                  type: integer
                 testerImage:
                   description: Tester image used to run tests and verify if remediation
                     fixed the reported problem.
diff --git a/helm-charts-openshift/crds/remediationworkflowstatus-crd.yaml b/helm-charts-openshift/crds/remediationworkflowstatus-crd.yaml
index aa5c0ac0..011e3ad0 100644
--- a/helm-charts-openshift/crds/remediationworkflowstatus-crd.yaml
+++ b/helm-charts-openshift/crds/remediationworkflowstatus-crd.yaml
@@ -28,8 +28,9 @@ spec:
   - name: v1alpha1
     schema:
       openAPIV3Schema:
-        description: RemediationWorkflowStatus keeps a record of recent remediation
-          workflow runs.
+        description: |-
+          RemediationWorkflowStatus keeps a record of recent remediation workflow runs.
+          We maintain this information to avoid re-running remediation workflows on nodes where a pre-defined threshold is crossed.
         properties:
           apiVersion:
             description: |-
@@ -60,6 +61,9 @@ spec:
                   type: object
                 type: array
               type: object
+            description: |-
+              Status field holds the remediation workflow run history for each node and node condition.
+              The key is the node name; the value is a map keyed by node condition, whose value is a list of workflow metadata (workflow name and its start time).
             type: object
         type: object
     served: true
diff --git a/internal/controllers/mock_remediation_handler.go b/internal/controllers/mock_remediation_handler.go
index 0ab77e2c..cfff50ce 100644
--- a/internal/controllers/mock_remediation_handler.go
+++ b/internal/controllers/mock_remediation_handler.go
@@ -355,12 +355,11 @@ func (mr *MockremediationMgrHelperAPIMockRecorder) getRecentRecoveryCount(nodeNa
 }
 
 // getRecoveryTrackerKey mocks base method.
-func (m *MockremediationMgrHelperAPI) getRecoveryTrackerKey(nodeName, nodeCondition string) (string, error) {
+func (m *MockremediationMgrHelperAPI) getRecoveryTrackerKey(nodeName, nodeCondition string) string {
 	m.ctrl.T.Helper()
 	ret := m.ctrl.Call(m, "getRecoveryTrackerKey", nodeName, nodeCondition)
 	ret0, _ := ret[0].(string)
-	ret1, _ := ret[1].(error)
-	return ret0, ret1
+	return ret0
 }
 
 // getRecoveryTrackerKey indicates an expected call of getRecoveryTrackerKey.
@@ -710,6 +709,20 @@ func (mr *MockremediationMgrHelperAPIMockRecorder) syncInternalMapFromStatusCR(c
 	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "syncInternalMapFromStatusCR", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).syncInternalMapFromStatusCR), ctx, namespace)
 }
 
+// updateMaxParallelWorkflows mocks base method.
+func (m *MockremediationMgrHelperAPI) updateMaxParallelWorkflows(ctx context.Context, devConfig *v1alpha1.DeviceConfig) error {
+	m.ctrl.T.Helper()
+	ret := m.ctrl.Call(m, "updateMaxParallelWorkflows", ctx, devConfig)
+	ret0, _ := ret[0].(error)
+	return ret0
+}
+
+// updateMaxParallelWorkflows indicates an expected call of updateMaxParallelWorkflows.
+func (mr *MockremediationMgrHelperAPIMockRecorder) updateMaxParallelWorkflows(ctx, devConfig any) *gomock.Call {
+	mr.mock.ctrl.T.Helper()
+	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "updateMaxParallelWorkflows", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).updateMaxParallelWorkflows), ctx, devConfig)
+}
+
 // validateNodeConditions mocks base method.
 func (m *MockremediationMgrHelperAPI) validateNodeConditions(ctx context.Context, devConfig *v1alpha1.DeviceConfig, node *v1.Node, mappings map[string]ConditionWorkflowMapping) (ConditionWorkflowMapping, error) {
 	m.ctrl.T.Helper()
diff --git a/internal/controllers/remediation/scripts/notify.sh b/internal/controllers/remediation/scripts/notify.sh
index 11c32140..39f013e7 100644
--- a/internal/controllers/remediation/scripts/notify.sh
+++ b/internal/controllers/remediation/scripts/notify.sh
@@ -17,11 +17,12 @@ involvedObject:
   kind: Node
   name: ${NODE_NAME}
   namespace: {{workflow.namespace}}
-message: ${NOTIFY_MESSAGE}
+message: '${NOTIFY_MESSAGE}'
 reason: AMDGPUUnhealthy
 reportingComponent: amd-gpu-node-remediation-workflow
 reportingInstance: amd-gpu-node-remediation-workflow
 source:
   component: {{workflow.name}}
   host: ${NODE_NAME}
-type: Warning
\ No newline at end of file
+type: Warning
+EOF
\ No newline at end of file
diff --git a/internal/controllers/remediation/scripts/test.sh b/internal/controllers/remediation/scripts/test.sh
index 20695378..5a4c947e 100644
--- a/internal/controllers/remediation/scripts/test.sh
+++ b/internal/controllers/remediation/scripts/test.sh
@@ -1,7 +1,7 @@
 set -e
 NODE_NAME="{{inputs.parameters.node_name}}"
-JOB_NAME="test-runner-manual-trigger-${NODE_NAME}"
-CM_NAME="manual-config-map-${NODE_NAME}"
+JOB_NAME="{{workflow.name}}-test-run"
+CM_NAME="{{workflow.name}}-test-configmap"
 FRAMEWORK="{{inputs.parameters.framework}}"
 RECIPE="{{inputs.parameters.recipe}}"
 ITERATIONS="{{inputs.parameters.iterations}}"
@@ -10,6 +10,9 @@ TIMEOUTSECONDS="{{inputs.parameters.timeoutSeconds}}"
 TESTRUNNERIMAGE="{{inputs.parameters.testRunnerImage}}"
 TESTRUNNERSA="{{inputs.parameters.testRunnerServiceAccount}}"
 NAMESPACE="{{inputs.parameters.namespace}}"
+INITCONTAINERIMAGE="{{inputs.parameters.initContainerImage}}"
+WFNAME="{{workflow.name}}"
+WFUID="{{workflow.uid}}"
 
 if [ -z "$FRAMEWORK" ] || [ -z "$RECIPE" ] || [ -z "$ITERATIONS" ] || [ -z "$STOPONFAILURE" ] || [ -z "$TIMEOUTSECONDS" ]; then
   echo "Validation profile incomplete, skipping configmap and job creation. Please enter framework, recipe, iterations, stopOnFailure, timeoutSeconds as per testrunner requirements"
@@ -25,6 +28,14 @@ kind: ConfigMap
 metadata:
   name: ${CM_NAME}
   namespace: ${NAMESPACE}
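+  # ownerReferences tie the ConfigMap to the workflow so Kubernetes garbage-collects it when the workflow is deleted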
+  ownerReferences:
+  - apiVersion: argoproj.io/v1alpha1
+    kind: Workflow
+    name: ${WFNAME}
+    uid: ${WFUID}
+    blockOwnerDeletion: true
+    controller: true
 data:
   config.json: |
     {
@@ -38,9 +48,9 @@ data:
         {
           "Framework": "${FRAMEWORK}",
           "Recipe": "${RECIPE}",
-          "Iterations": "${ITERATIONS}",
-          "StopOnFailure": "${STOPONFAILURE}",
-          "TimeoutSeconds": "${TIMEOUTSECONDS}"
+          "Iterations": ${ITERATIONS},
+          "StopOnFailure": ${STOPONFAILURE},
+          "TimeoutSeconds": ${TIMEOUTSECONDS}
         }
       ]
     }
@@ -56,8 +67,14 @@ kind: Job
 metadata:
   name: ${JOB_NAME}
   namespace: ${NAMESPACE}
+  ownerReferences:
+  - apiVersion: argoproj.io/v1alpha1
+    kind: Workflow
+    name: ${WFNAME}
+    uid: ${WFUID}
+    blockOwnerDeletion: true
+    controller: true
 spec:
-  ttlSecondsAfterFinished: 120
   backoffLimit: 0
   template:
     spec:
@@ -85,6 +102,21 @@ spec:
           path: /var/log/amd-test-runner
           type: DirectoryOrCreate
         name: test-runner-volume
+      - name: host-sys
+        hostPath:
+          path: /sys
+          type: Directory
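+      # The init container below blocks the test runner until the amdgpu driver is loaded on the node (checked through the mounted /host-sys)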
+      initContainers:
+      - name: driver-init
+        image: "${INITCONTAINERIMAGE}"
+        imagePullPolicy: IfNotPresent
+        command: ['sh', '-c', 'while [ ! -d /host-sys/class/kfd ] || [ ! -d /host-sys/module/amdgpu/drivers/ ]; do echo \"amdgpu driver is not loaded\"; sleep 2; done; echo \"amdgpu driver is loaded\"']
+        securityContext:
+          privileged: true
+        volumeMounts:
+        - name: host-sys
+          mountPath: /host-sys
       containers:
       - name: amd-test-runner
         image: "${TESTRUNNERIMAGE}"
@@ -133,13 +165,18 @@ echo "Overall timeout for the job is set to $timeout seconds."
 echo "Waiting for Job $JOB_NAME to complete..."
 
 while true; do
-  job_status=$(kubectl get job "$JOB_NAME" -n "$NAMESPACE" -o jsonpath='{.status.conditions[0].type}' 2>/dev/null || true)
-  if [ "$job_status" = "Complete" ]; then
+  if ! kubectl get job "$JOB_NAME" -n "$NAMESPACE" &>/dev/null; then
+    echo "Error: Job $JOB_NAME is not found in namespace $NAMESPACE"
+    exit 1
+  fi
+  isComplete=$(kubectl get job "$JOB_NAME" -n "$NAMESPACE" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}')
+  isFailure=$(kubectl get job "$JOB_NAME" -n "$NAMESPACE" -o jsonpath='{.status.conditions[?(@.type=="Failed")].status}')
+  if [ "$isComplete" = "True" ]; then
     echo "Test runner job completed successfully."
     kubectl logs -n $NAMESPACE job/$JOB_NAME
     echo "Detailed run report can be found at /var/log/amd-test-runner"
     exit 0
-  elif [ "$job_status" = "Failed" ]; then
+  elif [ "$isFailure" = "True" ]; then
     echo "Test runner job failed."
     kubectl logs -n $NAMESPACE job/$JOB_NAME
     echo "Detailed run report can be found at /var/log/amd-test-runner"
diff --git a/internal/controllers/remediation_handler.go b/internal/controllers/remediation_handler.go
index 87c888e6..678bf052 100644
--- a/internal/controllers/remediation_handler.go
+++ b/internal/controllers/remediation_handler.go
@@ -38,6 +38,7 @@ import (
 	"fmt"
 	"os"
 	"path/filepath"
+	"strconv"
 	"strings"
 	"sync"
 	"time"
@@ -51,6 +52,7 @@ import (
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/rest"
+	"k8s.io/client-go/util/retry"
 	"k8s.io/utils/ptr"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
@@ -78,9 +80,11 @@ const (
 	ForceResumeWorkflowLabelKey   = "operator.amd.com/gpu-force-resume-workflow"
 	ForceResumeWorkflowLabelValue = "true"
 	// Below is the label and value needed to be added to node to abort an ongoing workflow
-	AbortWorkflowLabelKey   = "operator.amd.com/gpu-abort-workflow"
-	AbortWorkflowLabelValue = "true"
-	RemediationFilesPath    = "/remediation"
+	AbortWorkflowLabelKey           = "operator.amd.com/gpu-abort-workflow"
+	AbortWorkflowLabelValue         = "true"
+	RemediationFilesPath            = "/remediation"
+	DefaultInitContainerImage       = "busybox:1.36"
+	ArgoWorkflowControllerConfigMap = "workflow-controller-configmap"
 )
 
 type RecoveryPolicyConfig struct {
@@ -150,6 +154,11 @@ func (n *remediationMgr) HandleRemediation(ctx context.Context, devConfig *amdv1
 		return res, err
 	}
 
+	// Update max parallel workflows based on DeviceConfig
+	if err := n.helper.updateMaxParallelWorkflows(ctx, devConfig); err != nil {
+		logger.Error(err, "Failed to update max parallel workflows, continuing with remediation")
+	}
+
 	// Clear any older recovery attempts from the status CR
 	if err := n.helper.dropOlderRecoveryAttemptsFromStatusCR(ctx, devConfig.Namespace); err != nil {
 		logger.Error(err, "Failed to drop older recovery attempts from status CR")
@@ -289,7 +298,7 @@ type remediationMgrHelperAPI interface {
 	registerRecoveryAttempt(ctx context.Context, nodeName string, nodeCondition string, namespace string, wfName string) error
 	registerRecoveryAttemptInternal(nodeName string, nodeCondition string, namespace string, startTime time.Time) error
 	registerRecoveryAttemptToStatusCR(ctx context.Context, nodeName string, nodeCondition string, namespace string, wfName string, startTime time.Time) error
-	getRecoveryTrackerKey(nodeName string, nodeCondition string) (string, error)
+	getRecoveryTrackerKey(nodeName string, nodeCondition string) string
 	getMaxAllowedRunsPerWindow(recoveryPolicy *RecoveryPolicyConfig) int
 	getWindowSize(recoveryPolicy *RecoveryPolicyConfig) string
 	isRecoveryPolicyViolated(ctx context.Context, nodeName string, mapping *ConditionWorkflowMapping) bool
@@ -305,14 +314,16 @@ type remediationMgrHelperAPI interface {
 	attemptResumeWorkflowOnNode(ctx context.Context, node *v1.Node, mapping ConditionWorkflowMapping, wf *workflowv1alpha1.Workflow)
 	handleSuspendedWorkflowsOnNode(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig, node *v1.Node, mapping ConditionWorkflowMapping, wf *workflowv1alpha1.Workflow) bool
 	getWorkflowTaskScriptSource(scriptFileName string) (string, error)
+	updateMaxParallelWorkflows(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig) error
 }
 
 type remediationMgrHelper struct {
-	client             client.Client
-	k8sInterface       kubernetes.Interface
-	recoveryTracker    *sync.Map
-	statusSynced       bool
-	serviceAccountName string
+	client               client.Client
+	k8sInterface         kubernetes.Interface
+	recoveryTracker      *sync.Map
+	statusSynced         bool
+	serviceAccountName   string
+	maxParallelWorkflows int
 }
 
 // Initialize remediation manager helper interface
@@ -583,7 +594,7 @@ func (h *remediationMgrHelper) createDefaultWorkflowTemplate(ctx context.Context
 		{Steps: []workflowv1alpha1.WorkflowStep{{Name: "drain", Template: "drain"}}},
 		{Steps: []workflowv1alpha1.WorkflowStep{
 			{
-				Name:        "notifyBeforeSuspend",
+				Name:        "notifybeforesuspend",
 				TemplateRef: &workflowv1alpha1.TemplateRef{Name: "event-notify-template", Template: "notify"},
 				Arguments: workflowv1alpha1.Arguments{
 					Parameters: []workflowv1alpha1.Parameter{
@@ -600,7 +611,7 @@ func (h *remediationMgrHelper) createDefaultWorkflowTemplate(ctx context.Context
 		{Steps: []workflowv1alpha1.WorkflowStep{{Name: "test", Template: "test", ContinueOn: &workflowv1alpha1.ContinueOn{Failed: true}}}},
 		{Steps: []workflowv1alpha1.WorkflowStep{
 			{
-				Name:        "notifyGpuTestFailed",
+				Name:        "notifygputestfailed",
 				TemplateRef: &workflowv1alpha1.TemplateRef{Name: "event-notify-template", Template: "notify"},
 				Arguments: workflowv1alpha1.Arguments{
 					Parameters: []workflowv1alpha1.Parameter{
@@ -613,12 +624,12 @@ func (h *remediationMgrHelper) createDefaultWorkflowTemplate(ctx context.Context
 				},
 			},
 		},
-		{Steps: []workflowv1alpha1.WorkflowStep{{Name: "failWorkflow", Template: "failWorkflow", When: "{{steps.test.exitCode}} != 0"}}},
+		{Steps: []workflowv1alpha1.WorkflowStep{{Name: "failworkflow", Template: "failworkflow", When: "{{steps.test.exitCode}} != 0"}}},
 		{Steps: []workflowv1alpha1.WorkflowStep{{Name: "wait", Template: "wait", When: "{{steps.test.exitCode}} == 0"}}},
 		{Steps: []workflowv1alpha1.WorkflowStep{{Name: "untaint", Template: "untaint", When: "{{steps.test.exitCode}} == 0"}}},
 		{Steps: []workflowv1alpha1.WorkflowStep{
 			{
-				Name:        "notifyWorkflowSucceeded",
+				Name:        "notifyworkflowsucceeded",
 				TemplateRef: &workflowv1alpha1.TemplateRef{Name: "event-notify-template", Template: "notify"},
 				Arguments: workflowv1alpha1.Arguments{
 					Parameters: []workflowv1alpha1.Parameter{
@@ -723,6 +734,10 @@ containers:
 					{
 						Name:  "namespace",
 						Value: workflowv1alpha1.AnyStringPtr("{{workflow.parameters.namespace}}"),
 					},
+					{
+						Name:  "initContainerImage",
+						Value: workflowv1alpha1.AnyStringPtr("{{workflow.parameters.initContainerImage}}"),
+					},
 				},
 			},
 			Script: &workflowv1alpha1.ScriptTemplate{
@@ -765,7 +780,7 @@ containers:
 			},
 		},
 		{
-			Name: "failWorkflow",
+			Name: "failworkflow",
 			Script: &workflowv1alpha1.ScriptTemplate{
 				Source:    `echo "Failing workflow" && exit 1`,
 				Container: utilityContainer,
@@ -832,6 +847,35 @@ func (h *remediationMgrHelper) createDefaultObjects(ctx context.Context, devConf
 	return cm, nil
 }
 
+func (h *remediationMgrHelper) updateMaxParallelWorkflows(ctx context.Context, devConfig *amdv1alpha1.DeviceConfig) error {
+	logger := log.FromContext(ctx)
+	// Set maximum parallel workflows that can run simultaneously
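+	// Compare against the cached value so the Argo ConfigMap is only updated when the configured limit actually changes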
+	if h.maxParallelWorkflows != devConfig.Spec.RemediationWorkflow.MaxParallelWorkflows {
+		err := retry.RetryOnConflict(retry.DefaultRetry, func() error {
+			acm, err := h.getConfigMap(ctx, ArgoWorkflowControllerConfigMap, devConfig.Namespace)
+			if err != nil {
+				logger.Error(err, "Failed to fetch argo workflow controller configmap")
+				return err
+			}
+			if acm.Data == nil {
+				acm.Data = make(map[string]string)
+			}
+			// Update parallelism in Argo workflow controller configmap.
+			// https://github.com/argoproj/argo-workflows/blob/main/config/config.go#L69
+			acm.Data["parallelism"] = strconv.Itoa(devConfig.Spec.RemediationWorkflow.MaxParallelWorkflows)
+			return h.client.Update(ctx, acm)
+		})
+		if err != nil {
+			logger.Error(err, "Failed to update parallelism in argo workflow controller")
+			return err
+		}
+		h.maxParallelWorkflows = devConfig.Spec.RemediationWorkflow.MaxParallelWorkflows
+		logger.Info(fmt.Sprintf("Updated maximum parallel remediation workflows to %d", h.maxParallelWorkflows))
+	}
+	return nil
+}
+
 func (h *remediationMgrHelper) populateWorkflow(ctx context.Context, wfTemplate *workflowv1alpha1.WorkflowTemplate, mapping *ConditionWorkflowMapping, nodeName string, devConfig *amdv1alpha1.DeviceConfig) *workflowv1alpha1.Workflow {
 	wf := &workflowv1alpha1.Workflow{
 		ObjectMeta: metav1.ObjectMeta{
@@ -873,6 +917,12 @@ func (h *remediationMgrHelper) populateWorkflow(ctx context.Context, wfTemplate
 		testrunnerImage = devConfig.Spec.RemediationWorkflow.TesterImage
 	}
 
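+	// Fall back to the default busybox image when CommonConfig does not specify an init container image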
+	initContainerImage := DefaultInitContainerImage
+	if devConfig.Spec.CommonConfig.InitContainerImage != "" {
+		initContainerImage = devConfig.Spec.CommonConfig.InitContainerImage
+	}
+
 	// Pass the args required to be used in the template
 	wf.Spec.Arguments = workflowv1alpha1.Arguments{
 		Parameters: []workflowv1alpha1.Parameter{
@@ -928,6 +978,10 @@ func (h *remediationMgrHelper) populateWorkflow(ctx context.Context, wfTemplate
 				Name:  "notifySuccessMessage",
 				Value: workflowv1alpha1.AnyStringPtr(fmt.Sprintf("Remediation for node condition %s completed successfully on node %s", mapping.NodeCondition, nodeName)),
 			},
+			{
+				Name:  "initContainerImage",
+				Value: workflowv1alpha1.AnyStringPtr(initContainerImage),
+			},
 		},
 	}
 
@@ -1115,10 +1169,7 @@ func (h *remediationMgrHelper) getWorkflowUtilityImage(devConfig *amdv1alpha1.De
 
 func (h *remediationMgrHelper) getRecentRecoveryCount(nodeName string, nodeCondition string) int {
 	// get the length of the slice of attempts for the given node and condition
-	key, err := h.getRecoveryTrackerKey(nodeName, nodeCondition)
-	if err != nil {
-		return 0
-	}
+	key := h.getRecoveryTrackerKey(nodeName, nodeCondition)
 
 	attempts, ok := h.recoveryTracker.Load(key)
 	if !ok {
@@ -1132,10 +1183,7 @@ func (h *remediationMgrHelper) getRecentRecoveryCount(nodeName string, nodeCondi
 }
 
 func (h *remediationMgrHelper) dropOlderRecoveryAttemptsInternal(nodeName string, nodeCondition string, windowSize string) error {
-	key, err := h.getRecoveryTrackerKey(nodeName, nodeCondition)
-	if err != nil {
-		return fmt.Errorf("failed to get recovery tracker key: %w", err)
-	}
+	key := h.getRecoveryTrackerKey(nodeName, nodeCondition)
 
 	attempts, _ := h.recoveryTracker.LoadOrStore(key, []time.Time{})
 	if attemptsSlice, ok := attempts.([]time.Time); ok {
@@ -1265,10 +1313,7 @@ func (h *remediationMgrHelper) registerRecoveryAttemptToStatusCR(ctx context.Con
 }
 
 func (h *remediationMgrHelper) registerRecoveryAttemptInternal(nodeName string, nodeCondition string, namespace string, startTime time.Time) error {
-	key, err := h.getRecoveryTrackerKey(nodeName, nodeCondition)
-	if err != nil {
-		return fmt.Errorf("failed to get recovery tracker key: %w", err)
-	}
+	key := h.getRecoveryTrackerKey(nodeName, nodeCondition)
 
 	attempts, _ := h.recoveryTracker.LoadOrStore(key, []time.Time{})
 	if attemptsSlice, ok := attempts.([]time.Time); ok {
@@ -1281,9 +1326,8 @@ func (h *remediationMgrHelper) registerRecoveryAttemptInternal(nodeName string,
 	return nil
 }
 
-func (h *remediationMgrHelper) getRecoveryTrackerKey(nodeName string, nodeCondition string) (string, error) {
-	key := fmt.Sprintf("%s-%s", nodeName, nodeCondition)
-	return key, nil
+func (h *remediationMgrHelper) getRecoveryTrackerKey(nodeName string, nodeCondition string) string {
+	return fmt.Sprintf("%s-%s", nodeName, nodeCondition)
 }
 
 func (h *remediationMgrHelper) getMaxAllowedRunsPerWindow(recoveryPolicy *RecoveryPolicyConfig) int {
@@ -1337,10 +1381,7 @@ func (h *remediationMgrHelper) getMaxAllowedRunsPerWindow(recoveryPolicy *Recove
 
 	for nodeName, conditions := range wfStatus.Status {
 		for nodeCondition, attempts := range conditions {
-			key, err := h.getRecoveryTrackerKey(nodeName, nodeCondition)
-			if err != nil {
-				return fmt.Errorf("failed to get recovery tracker key: %w", err)
-			}
+			key := h.getRecoveryTrackerKey(nodeName, nodeCondition)
 
 			attemptTimes := make([]time.Time, len(attempts))
 			for i, attempt := range attempts {
diff --git a/tests/e2e/Makefile b/tests/e2e/Makefile
index 7ac47e7d..b87d7732 100644
--- a/tests/e2e/Makefile
+++ b/tests/e2e/Makefile
@@ -17,6 +17,7 @@ E2E_NODE_LABELLER_IMAGE_2 ?= rocm/k8s-device-plugin:labeller-1.31.0.6
 E2E_TEST_RUNNER_IMAGE ?= rocm/test-runner:v1.4.0
 E2E_KUBEVIRT_DEVICE_PLUGIN_IMAGE ?= rocm/k8s-device-plugin:latest
 E2E_KUBEVIRT_NODE_LABELLER_IMAGE ?= rocm/k8s-device-plugin:labeller-latest
+E2E_UTILS_CONTAINER_IMAGE ?= docker.io/rocm/gpu-operator-utils:v1.4.0
 
 export E2E_INIT_CONTAINER_IMAGE
 export E2E_KUBE_RBAC_PROXY_CURL_IMAGE
@@ -34,6 +35,7 @@ export E2E_TEST_RUNNER_IMAGE
 export E2E_AGFHC_TEST_RUNNER_IMAGE
 export E2E_KUBEVIRT_DEVICE_PLUGIN_IMAGE
 export E2E_KUBEVIRT_NODE_LABELLER_IMAGE
+export E2E_UTILS_CONTAINER_IMAGE
 
 export E2E_DCM_IMAGE
 export E2E_NODEAPP_IMG
diff --git a/tests/e2e/doc.go b/tests/e2e/doc.go
index 89226b2c..cce60e3c 100644
--- a/tests/e2e/doc.go
+++ b/tests/e2e/doc.go
@@ -28,7 +28,7 @@ import (
 type E2ESuite struct {
 	clientSet *kubernetes.Clientset
 	dClient   *client.DeviceConfigClient
-	wfClient  workflowclient.Interface
+	wfClient  workflowclient.Clientset
 	cfgName   string
 	registry  string
 	helmChart string
diff --git a/tests/e2e/e2e_test.go b/tests/e2e/e2e_test.go
index 32cbcb13..c1e522b6 100644
--- a/tests/e2e/e2e_test.go
+++ b/tests/e2e/e2e_test.go
@@ -138,7 +138,7 @@ func (s *E2ESuite) SetUpSuite(c *C) {
 	if err != nil {
 		c.Fatalf("Failed to create workflow client: %v", err)
 	}
-	s.wfClient = wfClient
+	s.wfClient = *wfClient
 
 	s.clusterType = utils.GetClusterType(config)
 
diff --git a/tests/e2e/init.go b/tests/e2e/init.go
index 90e90f1c..9245b0c7 100644
--- a/tests/e2e/init.go
+++ b/tests/e2e/init.go
@@ -36,6 +36,7 @@ var (
 	driverImageRepo               string
 	kubeVirtHostDevicePluginImage string
 	kubeVirtHostNodeLabellerImage string
+	utilsContainerImage           string
 )
 
 func init() {
@@ -97,4 +98,8 @@ func init() {
 	if !ok {
 		log.Fatalf("E2E_KUBEVIRT_NODE_LABELLER_IMAGE is not defined.")
 	}
+	utilsContainerImage, ok = os.LookupEnv("E2E_UTILS_CONTAINER_IMAGE")
+	if !ok {
+		log.Fatalf("E2E_UTILS_CONTAINER_IMAGE is not defined.")
+	}
 }
diff --git a/tests/e2e/remediation_test.go b/tests/e2e/remediation_test.go
new file mode 100644
index 00000000..9dae48ce
--- /dev/null
+++ b/tests/e2e/remediation_test.go
@@ -0,0 +1,123 @@
+/*
+Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package e2e
+
+import (
+	"context"
+	"fmt"
+	"strings"
+	"time"
+
+	wfv1alpha1 "github.com/argoproj/argo-workflows/v3/pkg/apis/workflow/v1alpha1"
+	"github.com/stretchr/testify/assert"
+	. "gopkg.in/check.v1"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+const (
+	remediationNodeCondition    = "AMDGPUHardwareAssertionHwa"
+	npdInbandRASConfigPath      = "./yamls/config/npd/node-problem-detector-config-inband.yaml"
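+	// The error config is identical to the base config except for its -t=-1 threshold, which forces the condition to trigger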
+	npdInbandRASErrorConfigPath = "./yamls/config/npd/node-problem-detector-error-config-inband.yaml"
+)
+
+func (s *E2ESuite) verifyRemediationWorkflowStatus(c *C, nodeName, status string, waitTime int) {
+	assert.Eventually(c, func() bool {
+		wfs, err := s.wfClient.ArgoprojV1alpha1().Workflows(s.ns).List(context.Background(), metav1.ListOptions{})
+		if err != nil {
+			logger.Infof("Error listing workflows: %v", err)
+			return false
+		}
+		for _, wf := range wfs.Items {
+			if strings.Contains(wf.Name, nodeName) && status == string(wf.Status.Phase) {
+				return true
+			}
+		}
+		return false
+	}, time.Duration(waitTime)*time.Minute, 10*time.Second, "Remediation workflow did not reach expected status")
+}
+
+func (s *E2ESuite) TestAutoNodeRemediationWithoutPhysicalAction(c *C) {
+	logger.Infof("Starting Auto Node Remediation Test")
+	if s.simEnable {
+		c.Skip("Skipping for non amd gpu testbed")
+	}
+
+	nodes, err := s.clientSet.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{
+		LabelSelector: "feature.node.kubernetes.io/amd-gpu=true",
+	})
+	assert.NoError(c, err, "Failed to list nodes with AMD GPU label")
+	if len(nodes.Items) == 0 {
+		c.Fatalf("No nodes found with AMD GPU label")
+	}
+	nodeName := nodes.Items[0].Name
+
+	_, err = s.dClient.DeviceConfigs(s.ns).Get(s.cfgName, metav1.GetOptions{})
+	assert.Errorf(c, err, fmt.Sprintf("expected no config to be present, but config %v exists", s.cfgName))
+
+	driverEnable := false
+	remediationEnable := true
+	devCfg := s.getDeviceConfig(c)
+	devCfg.Spec.Driver.Enable = &driverEnable
+	devCfg.Spec.RemediationWorkflow.Enable = &remediationEnable
+	devCfg.Spec.MetricsExporter.Enable = &remediationEnable
+	devCfg.Spec.MetricsExporter.Image = exporterImage
+	devCfg.Spec.MetricsExporter.ImagePullPolicy = "Always"
+	devCfg.Spec.MetricsExporter.Port = 5000
+	devCfg.Spec.CommonConfig.UtilsContainer.Image = utilsContainerImage
+	devCfg.Spec.CommonConfig.UtilsContainer.ImagePullPolicy = "Always"
+
+	logger.Infof("Creating DeviceConfig with remediation enabled and driver disabled")
+	s.createDeviceConfig(devCfg, c)
+	s.checkMetricsExporterStatus(devCfg, s.ns, corev1.ServiceTypeClusterIP, c)
+
+	// Wait for cluster to be up
+	logger.Infof("Waiting for device config to be applied")
+	time.Sleep(5 * time.Second)
+
+	// Setup NPD
+	logger.Infof("Setting up Node Problem Detector (NPD)")
+	setupNPD(npdServiceAccountPath, npdInbandRASConfigPath, npdDaemonSetPath)
+	defer tearDownNPD(npdServiceAccountPath, npdInbandRASConfigPath, npdDaemonSetPath)
+
+	logger.Infof("Verify if Node Problem Detector (NPD) is running on all GPU nodes")
+	s.verifyNPDRunning(c)
+
+	logger.Infof("Verifying that node condition %s is added for the node %s", remediationNodeCondition, nodeName)
+	s.verifyNodeCondition(c, remediationNodeCondition, corev1.ConditionTrue)
+
+	// Trigger error condition by modifying NPD config
+	logger.Infof("Edit Node Problem Detector (NPD) thresholds to simulate error condition")
+	s.updateConfigForNPD(c, npdInbandRASConfigPath, npdInbandRASErrorConfigPath)
+
+	s.verifyNodeCondition(c, remediationNodeCondition, corev1.ConditionTrue)
+
+	// Verify remediation workflow started and completed
+	logger.Infof("Verifying remediation workflow started on the node %s", nodeName)
+	s.verifyRemediationWorkflowStatus(c, nodeName, string(wfv1alpha1.WorkflowRunning), 5)
+
+	time.Sleep(4 * time.Minute) // wait for workflow to progress
+	logger.Infof("Reverting Node Problem Detector (NPD) thresholds to original configuration")
+	s.updateConfigForNPD(c, npdInbandRASErrorConfigPath, npdInbandRASConfigPath)
+
+	logger.Infof("Waiting for remediation workflow to complete on the node %s", nodeName)
+	s.verifyRemediationWorkflowStatus(c, nodeName, string(wfv1alpha1.WorkflowSucceeded), 70)
+
+	logger.Infof("Verifying that node condition %s is false on the node %s", remediationNodeCondition, nodeName)
+	s.verifyNodeCondition(c, remediationNodeCondition, corev1.ConditionFalse)
+}
diff --git a/tests/e2e/yamls/config/npd/node-problem-detector-config-inband.yaml b/tests/e2e/yamls/config/npd/node-problem-detector-config-inband.yaml
new file mode 100644
index 00000000..a2b6976d
--- /dev/null
+++ b/tests/e2e/yamls/config/npd/node-problem-detector-config-inband.yaml
@@ -0,0 +1,42 @@
+apiVersion: v1
+data:
+  custom-plugin-monitor.json: |
+    {
+      "plugin": "custom",
+      "pluginConfig": {
+        "invoke_interval": "30s",
+        "timeout": "15s",
+        "max_output_length": 80,
+        "concurrency": 3,
+        "enable_message_change_based_condition_update": false
+      },
+      "source": "amdgpu-custom-plugin-monitor",
+      "metricsReporting": true,
+      "conditions": [
+        {
+          "type": "AMDGPUHardwareAssertionHwa",
+          "reason": "AMDGPUIsUp",
+          "message": "AMDGPU is up"
+        }
+      ],
+      "rules": [
+        {
+          "type": "permanent",
+          "condition": "AMDGPUHardwareAssertionHwa",
+          "reason": "AMDGPU Hardware Assertion",
+          "path": "/var/lib/amd-metrics-exporter/amdgpuhealth",
+          "args": [
+            "query",
+            "inband-ras-errors",
+            "-s=CPER_SEVERITY_FATAL",
+            "--afid=30",
+            "-t=0"
+          ],
+          "timeout": "15s"
+        }
+      ]
+    }
+kind: ConfigMap
+metadata:
+  name: node-problem-detector-config
+  namespace: kube-system
diff --git a/tests/e2e/yamls/config/npd/node-problem-detector-error-config-inband.yaml b/tests/e2e/yamls/config/npd/node-problem-detector-error-config-inband.yaml
new file mode 100644
index 00000000..4ffc7d68
--- /dev/null
+++ b/tests/e2e/yamls/config/npd/node-problem-detector-error-config-inband.yaml
@@ -0,0 +1,43 @@
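+# Identical to node-problem-detector-config-inband.yaml except for "-t=-1", which forces the AMDGPUHardwareAssertionHwa condition to fire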
"-s=CPER_SEVERITY_FATAL", + "--afid=30", + "-t=0" + ], + "timeout": "15s" + } + ] + } +kind: ConfigMap +metadata: + name: node-problem-detector-config + namespace: kube-system diff --git a/tests/e2e/yamls/config/npd/node-problem-detector-error-config-inband.yaml b/tests/e2e/yamls/config/npd/node-problem-detector-error-config-inband.yaml new file mode 100644 index 00000000..4ffc7d68 --- /dev/null +++ b/tests/e2e/yamls/config/npd/node-problem-detector-error-config-inband.yaml @@ -0,0 +1,42 @@ +apiVersion: v1 +data: + custom-plugin-monitor.json: | + { + "plugin": "custom", + "pluginConfig": { + "invoke_interval": "30s", + "timeout": "15s", + "max_output_length": 80, + "concurrency": 3, + "enable_message_change_based_condition_update": false + }, + "source": "amdgpu-custom-plugin-monitor", + "metricsReporting": true, + "conditions": [ + { + "type": "AMDGPUHardwareAssertionHwa", + "reason": "AMDGPUIsUp", + "message": "AMDGPU is up" + } + ], + "rules": [ + { + "type": "permanent", + "condition": "AMDGPUHardwareAssertionHwa", + "reason": "AMDGPU Hardware Assertion", + "path": "/var/lib/amd-metrics-exporter/amdgpuhealth", + "args": [ + "query", + "inband-ras-errors", + "-s=CPER_SEVERITY_FATAL", + "--afid=30", + "-t=-1" + ], + "timeout": "15s" + } + ] + } +kind: ConfigMap +metadata: + name: node-problem-detector-config + namespace: kube-system diff --git a/tests/e2e/yamls/config/npd/node-problem-detector-rbac.yaml b/tests/e2e/yamls/config/npd/node-problem-detector-rbac.yaml index 006eb4af..4ebda390 100644 --- a/tests/e2e/yamls/config/npd/node-problem-detector-rbac.yaml +++ b/tests/e2e/yamls/config/npd/node-problem-detector-rbac.yaml @@ -19,7 +19,7 @@ rules: - apiGroups: [""] resources: ["nodes/status"] verbs: ["patch"] -- nonResourceURLs: ["/metrics", "/gpumetrics"] +- nonResourceURLs: ["/metrics", "/gpumetrics", "/inbandraserrors"] verbs: ["get"] ---