diff --git a/helm-charts-k8s/Chart.lock b/helm-charts-k8s/Chart.lock
index be08b5b4..92deade4 100644
--- a/helm-charts-k8s/Chart.lock
+++ b/helm-charts-k8s/Chart.lock
@@ -9,4 +9,4 @@ dependencies:
   repository: file://./charts/remediation
   version: v1.0.0
 digest: sha256:41fa6a6232514acebf6abdcb1bccaf087e134b9f413b8fa33a7fec1f58a99e07
-generated: "2026-01-05T19:49:07.882445585Z"
+generated: "2026-01-07T10:51:28.442192317Z"
diff --git a/helm-charts-k8s/README.md b/helm-charts-k8s/README.md
index 203d9ef4..216745a6 100644
--- a/helm-charts-k8s/README.md
+++ b/helm-charts-k8s/README.md
@@ -13,6 +13,7 @@ AMD GPU Operator simplifies the deployment and management of AMD Instinct GPU ac
 * AMD GPU Operator Controller
 * K8s Device Plugin
 * K8s Node Labeller
+* Device Config Manager
 * Device Metrics Exporter
 * Device Test Runner
 * Node Feature Discovery Operator
@@ -24,6 +25,7 @@ AMD GPU Operator simplifies the deployment and management of AMD Instinct GPU ac
 * Comprehensive metrics collection and export
 * Easy deployment of AMD GPU device plugin for Kubernetes
 * Automated labeling of nodes with AMD GPU capabilities
+* GPU partitioning configuration and management via Device Config Manager
 * Compatibility with standard Kubernetes environments
 * Efficient GPU resource allocation for containerized workloads
 * GPU health monitoring and troubleshooting
diff --git a/helm-charts-openshift/Chart.lock b/helm-charts-openshift/Chart.lock
index d9d69628..682e0e64 100644
--- a/helm-charts-openshift/Chart.lock
+++ b/helm-charts-openshift/Chart.lock
@@ -6,4 +6,4 @@ dependencies:
   repository: file://./charts/kmm
   version: v1.0.0
 digest: sha256:25200c34a5cc846a1275e5bf3fc637b19e909dc68de938189c5278d77d03f5ac
-generated: "2025-12-09T09:27:48.895675076Z"
+generated: "2026-01-07T10:51:41.178709798Z"
diff --git a/internal/controllers/mock_remediation_handler.go b/internal/controllers/mock_remediation_handler.go
index cfff50ce..7fb93822 100644
--- a/internal/controllers/mock_remediation_handler.go
+++ b/internal/controllers/mock_remediation_handler.go
@@ -569,20 +569,6 @@ func (mr *MockremediationMgrHelperAPIMockRecorder) isRemediationDisabled(ctx, de
 	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "isRemediationDisabled", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).isRemediationDisabled), ctx, devConfig)
 }
 
-// isStatusSynced mocks base method.
-func (m *MockremediationMgrHelperAPI) isStatusSynced(ctx context.Context) bool {
-	m.ctrl.T.Helper()
-	ret := m.ctrl.Call(m, "isStatusSynced", ctx)
-	ret0, _ := ret[0].(bool)
-	return ret0
-}
-
-// isStatusSynced indicates an expected call of isStatusSynced.
-func (mr *MockremediationMgrHelperAPIMockRecorder) isStatusSynced(ctx any) *gomock.Call {
-	mr.mock.ctrl.T.Helper()
-	return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "isStatusSynced", reflect.TypeOf((*MockremediationMgrHelperAPI)(nil).isStatusSynced), ctx)
-}
-
 // isWorkflowSchedulableOnNode mocks base method.
 func (m *MockremediationMgrHelperAPI) isWorkflowSchedulableOnNode(ctx context.Context, devConfig *v1alpha1.DeviceConfig, node *v1.Node, mapping ConditionWorkflowMapping) bool {
 	m.ctrl.T.Helper()
diff --git a/internal/controllers/remediation_handler.go b/internal/controllers/remediation_handler.go
index 678bf052..f77c9d6b 100644
--- a/internal/controllers/remediation_handler.go
+++ b/internal/controllers/remediation_handler.go
@@ -165,14 +165,11 @@ func (n *remediationMgr) HandleRemediation(ctx context.Context, devConfig *amdv1
 		return res, err
 	}
 
-	// If statusSynced is false, we need to populate the internal map from the status CR
-	if !n.helper.isStatusSynced(ctx) {
-		if err := n.helper.syncInternalMapFromStatusCR(ctx, devConfig.Namespace); err != nil {
-			logger.Error(err, "Failed to sync internal map from status CR")
-			return res, err
-		}
-		logger.Info("Internal map synced from status CR successfully")
+	if err := n.helper.syncInternalMapFromStatusCR(ctx, devConfig.Namespace); err != nil {
+		logger.Error(err, "Failed to sync internal map from status CR")
+		return res, err
 	}
+	logger.Info("Internal map synced from status CR successfully")
 
 	var mappingsList []ConditionWorkflowMapping
 	if err = yaml.Unmarshal([]byte(configMap.Data["workflow"]), &mappingsList); err != nil {
@@ -304,7 +301,6 @@ type remediationMgrHelperAPI interface {
 	isRecoveryPolicyViolated(ctx context.Context, nodeName string, mapping *ConditionWorkflowMapping) bool
 	canResumeWorkflowOnNode(ctx context.Context, node *v1.Node, mapping *ConditionWorkflowMapping) bool
 	syncInternalMapFromStatusCR(ctx context.Context, namespace string) error
-	isStatusSynced(ctx context.Context) bool
 	isNodeLabelledForForceResume(ctx context.Context, node *v1.Node) bool
 	removeForceResumeWorkflowLabelFromNode(ctx context.Context, node *v1.Node) error
 	isNodeLabelledForAbortWorkflow(node *v1.Node) bool
@@ -321,7 +317,6 @@ type remediationMgrHelper struct {
 	client               client.Client
 	k8sInterface         kubernetes.Interface
 	recoveryTracker      *sync.Map
-	statusSynced         bool
 	serviceAccountName   string
 	maxParallelWorkflows int
 }
@@ -332,7 +327,6 @@ func newRemediationMgrHelperHandler(client client.Client, k8sInterface kubernete
 		client:          client,
 		k8sInterface:    k8sInterface,
 		recoveryTracker: new(sync.Map),
-		statusSynced:    false,
 	}
 }
@@ -1057,6 +1051,12 @@ func (h *remediationMgrHelper) isWorkflowSchedulableOnNode(ctx context.Context,
 		logger.Info(fmt.Sprintf("Driver Install/Upgrade is in progress, skipping creation of workflow on node %s", node.Name))
 		return false
 	}
+
+	// if the remediation workflow for this node condition has already crossed the max threshold, skip the node
+	if h.isRecoveryPolicyViolated(ctx, node.Name, &mapping) {
+		logger.Info(fmt.Sprintf("Max remediation attempts reached for node %s on condition %s, skipping creation of workflow", node.Name, mapping.NodeCondition))
+		return false
+	}
 	return true
 }
@@ -1372,10 +1372,7 @@ func (h *remediationMgrHelper) syncInternalMapFromStatusCR(ctx context.Context,
 		return fmt.Errorf("failed to get remediation workflow status: %w", err)
 	}
 
-	if wfStatus.Status == nil {
-		h.statusSynced = true
-		return nil // Nothing to sync
-	}
+	h.recoveryTracker = new(sync.Map)
 
 	for nodeName, conditions := range wfStatus.Status {
 		for nodeCondition, attempts := range conditions {
@@ -1393,14 +1390,9 @@
 		}
 	}
 
-	h.statusSynced = true
 	return nil
 }
 
-func (h *remediationMgrHelper) isStatusSynced(ctx context.Context) bool {
-	return h.statusSynced
-}
-
 func (h *remediationMgrHelper) isRecoveryPolicyViolated(ctx context.Context, nodeName string, mapping *ConditionWorkflowMapping) bool {
 	logger := log.FromContext(ctx)
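The handler now rebuilds `recoveryTracker` from the `RemediationWorkflowStatus` CR on every reconcile (note the unconditional `h.recoveryTracker = new(sync.Map)` reset), which is what makes the `statusSynced` flag and its mock removable. The body of `isRecoveryPolicyViolated` falls outside this diff, so the following is only a sketch of how the new gate in `isWorkflowSchedulableOnNode` plausibly consults the tracker; the key scheme, the attempt ceiling, and the stored value type are assumptions, not code from this PR:

```go
// Hypothetical sketch only: recoveryTracker, ConditionWorkflowMapping, and
// WorkflowMetadata come from this diff; everything else is assumed.
func (h *remediationMgrHelper) recoveryPolicyViolatedSketch(nodeName string, mapping *ConditionWorkflowMapping) bool {
	const maxRemediationAttempts = 4 // assumed ceiling; the real limit likely comes from configuration

	// Assumed key scheme, mirroring the "<node>-<condition>" naming used by
	// the e2e helper addRemediationWorkflowStatusMetaData further below.
	key := nodeName + "-" + mapping.NodeCondition

	val, ok := h.recoveryTracker.Load(key)
	if !ok {
		return false // no remediation attempts recorded for this node/condition yet
	}
	attempts, ok := val.([]v1alpha1.WorkflowMetadata)
	if !ok {
		return false
	}
	// The policy is violated once recorded attempts reach the ceiling; the
	// production check may additionally apply a time window over StartTime.
	return len(attempts) >= maxRemediationAttempts
}
```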
diff --git a/tests/e2e/client/client.go b/tests/e2e/client/client.go
index 2de113e3..e56d5ed3 100644
--- a/tests/e2e/client/client.go
+++ b/tests/e2e/client/client.go
@@ -19,6 +19,7 @@ package client
 import (
 	"context"
 	"encoding/json"
+	"fmt"
 
 	"github.com/ROCm/gpu-operator/api/v1alpha1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -382,3 +383,91 @@ func (c *deviceConfigsClient) Delete(name string) (*v1alpha1.DeviceConfig, error
 
 	return &result, err
 }
+
+type RemediationWorkflowStatusClient struct {
+	restClient rest.Interface
+}
+
+func NewWfStatusClient(c *rest.Config) (*RemediationWorkflowStatusClient, error) {
+	config := *c
+	config.ContentConfig.GroupVersion = &v1alpha1.GroupVersion
+	config.APIPath = "/apis"
+	config.NegotiatedSerializer = scheme.Codecs.WithoutConversion()
+	config.UserAgent = rest.DefaultKubernetesUserAgent()
+
+	client, err := rest.RESTClientFor(&config)
+	if err != nil {
+		return nil, err
+	}
+
+	return &RemediationWorkflowStatusClient{restClient: client}, nil
+}
+
+func (c *RemediationWorkflowStatusClient) Create(rwfstatus *v1alpha1.RemediationWorkflowStatus) (*v1alpha1.RemediationWorkflowStatus, error) {
+	result := v1alpha1.RemediationWorkflowStatus{}
+	rwfstatus.TypeMeta = metav1.TypeMeta{
+		Kind:       "RemediationWorkflowStatus",
+		APIVersion: "amd.com/v1alpha1",
+	}
+	err := c.restClient.
+		Post().
+		Namespace(rwfstatus.Namespace).
+		Resource("remediationworkflowstatuses").
+		Body(rwfstatus).
+		Do(context.TODO()).
+		Into(&result)
+	return &result, err
+}
+
+func (c *RemediationWorkflowStatusClient) Update(rwfstatus *v1alpha1.RemediationWorkflowStatus) (*v1alpha1.RemediationWorkflowStatus, error) {
+	result := v1alpha1.RemediationWorkflowStatus{}
+	rwfstatus.TypeMeta = metav1.TypeMeta{
+		Kind:       "RemediationWorkflowStatus",
+		APIVersion: "amd.com/v1alpha1",
+	}
+	err := c.restClient.
+		Put().
+		Namespace(rwfstatus.Namespace).
+		Resource("remediationworkflowstatuses").
+		Name(rwfstatus.Name).
+		SubResource("status").
+		Body(rwfstatus).
+		Do(context.TODO()).
+		Into(&result)
+
+	if err != nil {
+		return nil, fmt.Errorf("failed to update status: %w", err)
+	}
+
+	return &result, nil
+}
+
+func (c *RemediationWorkflowStatusClient) Get(name, namespace string) (*v1alpha1.RemediationWorkflowStatus, error) {
+	result := v1alpha1.RemediationWorkflowStatus{}
+	err := c.restClient.
+		Get().
+		Namespace(namespace).
+		Resource("remediationworkflowstatuses").
+		Name(name).
+		Do(context.TODO()).
+		Into(&result)
+
+	return &result, err
+}
+
+func (c *RemediationWorkflowStatusClient) Delete(name string, namespace string) (*v1alpha1.RemediationWorkflowStatus, error) {
+	result := v1alpha1.RemediationWorkflowStatus{}
+	// delete the named object rather than issuing a collection-wide delete
+	err := c.restClient.
+		Delete().
+		Namespace(namespace).
+		Resource("remediationworkflowstatuses").
+		Name(name).
+		Do(context.TODO()).
+		Into(&result)
+
+	return &result, err
+}
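A quick usage sketch for the new REST client, separate from the suite wiring shown below in `SetUpSuite`. The kubeconfig path and the `kube-amd-gpu` namespace are placeholders; the object name `default` matches what the e2e helpers use:

```go
package main

import (
	"fmt"

	"github.com/ROCm/gpu-operator/tests/e2e/client"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	// Build a rest.Config from a kubeconfig (path is a placeholder).
	cfg, err := clientcmd.BuildConfigFromFlags("", "/root/.kube/config")
	if err != nil {
		panic(err)
	}

	wfStatusClient, err := client.NewWfStatusClient(cfg)
	if err != nil {
		panic(err)
	}

	// Read the per-node, per-condition remediation attempt bookkeeping.
	status, err := wfStatusClient.Get("default", "kube-amd-gpu")
	if err != nil {
		fmt.Println("RemediationWorkflowStatus CR not found:", err)
		return
	}
	for node, conditions := range status.Status {
		for condition, attempts := range conditions {
			fmt.Printf("%s / %s: %d recorded attempt(s)\n", node, condition, len(attempts))
		}
	}
}
```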
diff --git a/tests/e2e/cluster_test.go b/tests/e2e/cluster_test.go
index c8801b8b..50b54c22 100644
--- a/tests/e2e/cluster_test.go
+++ b/tests/e2e/cluster_test.go
@@ -122,6 +122,9 @@ func (s *E2ESuite) getDeviceConfig(c *C) *v1alpha1.DeviceConfig {
 				Port: 5001,
 			},
 			Selector: map[string]string{"feature.node.kubernetes.io/amd-gpu": "true"},
+			CommonConfig: v1alpha1.CommonConfigSpec{
+				InitContainerImage: initContainerImage,
+			},
 		},
 	}
 	insecure := true
diff --git a/tests/e2e/doc.go b/tests/e2e/doc.go
index cce60e3c..2161d9ca 100644
--- a/tests/e2e/doc.go
+++ b/tests/e2e/doc.go
@@ -42,4 +42,5 @@ type E2ESuite struct {
 	monClient      monitoringClient.Interface
 	apiClientSet   *apiextClient.Clientset
 	framework      string
+	wfStatusClient *client.RemediationWorkflowStatusClient
 }
diff --git a/tests/e2e/e2e_test.go b/tests/e2e/e2e_test.go
index c1e522b6..89b8d397 100644
--- a/tests/e2e/e2e_test.go
+++ b/tests/e2e/e2e_test.go
@@ -108,6 +108,12 @@ func (s *E2ESuite) SetUpSuite(c *C) {
 	}
 	s.dClient = dcCli
 
+	wfstatusClient, err := client.NewWfStatusClient(config)
+	if err != nil {
+		c.Fatalf("Error: %v", err.Error())
+	}
+	s.wfStatusClient = wfstatusClient
+
 	err = apiextv1.AddToScheme(scheme.Scheme)
 	if err != nil {
 		c.Fatalf("Error: %v", err.Error())
diff --git a/tests/e2e/remediation_test.go b/tests/e2e/remediation_test.go
index 9dae48ce..9955685f 100644
--- a/tests/e2e/remediation_test.go
+++ b/tests/e2e/remediation_test.go
@@ -22,6 +22,8 @@ import (
 	"strings"
 	"time"
 
+	"github.com/ROCm/gpu-operator/api/v1alpha1"
+	"github.com/ROCm/gpu-operator/tests/e2e/utils"
 	wfv1alpha1 "github.com/argoproj/argo-workflows/v3/pkg/apis/workflow/v1alpha1"
 	"github.com/stretchr/testify/assert"
 	. "gopkg.in/check.v1"
@@ -30,9 +32,11 @@ import (
 )
 
 const (
-	remediationNodeCondition    = "AMDGPUHardwareAssertionHwa"
-	npdInbandRASConfigPath      = "./yamls/config/npd/node-problem-detector-config-inband.yaml"
-	npdInbandRASErrorConfigPath = "./yamls/config/npd/node-problem-detector-error-config-inband.yaml"
+	conditionHWAssertion         = "AMDGPUHardwareAssertionHwa"
+	conditionInternalError       = "AMDGPUDeviceInternalError"
+	npdInbandRASConfigPath       = "./yamls/config/npd/node-problem-detector-config-inband.yaml"
+	npdInbandRASErrorConfigPath  = "./yamls/config/npd/node-problem-detector-error-config-inband.yaml"
+	npdInband2RASErrorConfigPath = "./yamls/config/npd/node-problem-detector-error-config-inband2.yaml"
 )
 
 func (s *E2ESuite) verifyRemediationWorkflowStatus(c *C, nodeName, status string, waitTime int) {
@@ -51,8 +55,106 @@ func (s *E2ESuite) verifyRemediationWorkflowStatus(c *C, nodeName, status string
 	}, time.Duration(waitTime)*time.Minute, 10*time.Second, "Remediation workflow did not reach expected status")
 }
 
+func (s *E2ESuite) checkWorkflowExistence(c *C, nodeName string, shouldExist bool) bool {
+	wfs, err := s.wfClient.ArgoprojV1alpha1().Workflows(s.ns).List(context.Background(), metav1.ListOptions{})
+	if err != nil {
+		logger.Infof("Error listing workflows: %v", err)
+		return false
+	}
+	exists := false
+	for _, wf := range wfs.Items {
+		if strings.Contains(wf.Name, nodeName) {
+			exists = true
+			break
+		}
+	}
+	return exists == shouldExist
+}
+
+func (s *E2ESuite) isWorkflowSuspended(c *C, nodeName string) bool {
+	wfs, err := s.wfClient.ArgoprojV1alpha1().Workflows(s.ns).List(context.Background(), metav1.ListOptions{})
+	if err != nil || len(wfs.Items) == 0 {
+		logger.Infof("Error listing workflows: %v", err)
+		return false
+	}
+	// find the workflow that belongs to this node; report false if none matches
+	var wf *wfv1alpha1.Workflow
+	for i := range wfs.Items {
+		if strings.Contains(wfs.Items[i].Name, nodeName) {
+			wf = &wfs.Items[i]
+			break
+		}
+	}
+	if wf == nil {
+		return false
+	}
+	for _, nodeStatus := range wf.Status.Nodes {
+		if nodeStatus.Type == "Suspend" && nodeStatus.Phase == "Running" {
+			return true
+		}
+	}
+	return false
+}
+
+func (s *E2ESuite) populateDeviceConfig(c *C) *v1alpha1.DeviceConfig {
+	driverEnable := false
+	remediationEnable := true
+	devCfg := s.getDeviceConfig(c)
+	devCfg.Spec.Driver.Enable = &driverEnable
+	devCfg.Spec.RemediationWorkflow.Enable = &remediationEnable
+	devCfg.Spec.MetricsExporter.Enable = &remediationEnable
+	devCfg.Spec.MetricsExporter.Image = exporterImage
+	devCfg.Spec.MetricsExporter.ImagePullPolicy = "Always"
+	devCfg.Spec.MetricsExporter.Port = 5000
+	devCfg.Spec.CommonConfig.UtilsContainer.Image = utilsContainerImage
+	devCfg.Spec.CommonConfig.UtilsContainer.ImagePullPolicy = "Always"
+	return devCfg
+}
+
+func (s *E2ESuite) addRemediationWorkflowStatusMetaData(ns, nodeName, nodeCondition string, metadataCount int, c *C) {
+	// Fetch the RemediationWorkflowStatus object; if it is not found, create a new one
+	wfstatus, err := s.wfStatusClient.Get("default", ns)
+	if err != nil {
+		logger.Infof("RemediationWorkflowStatus CR not found, creating a new one")
+		wfstatus.Name = "default"
+		wfstatus.Namespace = ns
+		wfstatus, err = s.wfStatusClient.Create(wfstatus)
+		assert.NoError(c, err, "Failed to create remediation workflow status")
+		if err != nil {
+			return
+		}
+	}
+	wfMetaData := make([]v1alpha1.WorkflowMetadata, 0)
+	for i := 0; i < metadataCount; i++ {
+		data := v1alpha1.WorkflowMetadata{
+			Name:      fmt.Sprintf("%s-%s", nodeName, nodeCondition),
+			StartTime: time.Now().UTC().Format("2006-01-02 15:04:05 UTC"),
+		}
+		wfMetaData = append(wfMetaData, data)
+	}
+	ncmap := make(map[string][]v1alpha1.WorkflowMetadata)
+	ncmap[nodeCondition] = wfMetaData
+	// replace any existing status with the synthetic attempt metadata
+	wfstatus.Status = make(map[string]map[string][]v1alpha1.WorkflowMetadata)
+	wfstatus.Status[nodeName] = ncmap
+	_, err = s.wfStatusClient.Update(wfstatus)
+	assert.NoError(c, err, "Failed to add metadata to remediation workflow status CR")
+}
+
+func (s *E2ESuite) untaintNode(nodeName string) {
+	cmd := fmt.Sprintf("kubectl taint node %s amd-gpu-unhealthy:NoSchedule-", nodeName)
+	utils.RunCommand(cmd)
+}
+
+func (s *E2ESuite) clearRemediationWorkflowStatusMetaData(ns string, c *C) {
+	wfstatus, err := s.wfStatusClient.Get("default", ns)
+	if err != nil {
+		logger.Infof("RemediationWorkflowStatus object is not found")
+		return
+	}
+	wfstatus.Status = make(map[string]map[string][]v1alpha1.WorkflowMetadata)
+	_, err = s.wfStatusClient.Update(wfstatus)
+	assert.NoError(c, err, "Failed to clear metadata from remediation workflow status CR")
+}
+
 func (s *E2ESuite) TestAutoNodeRemediationWithoutPhysicalAction(c *C) {
-	logger.Infof("Starting Auto Node Remediation Test")
+	logger.Infof("Starting Auto Node Remediation Test without physical action")
 	if s.simEnable {
 		c.Skip("Skipping for non amd gpu testbed")
 	}
@@ -69,17 +171,7 @@ func (s *E2ESuite) TestAutoNodeRemediationWithoutPhysicalAction(c *C) {
 	_, err = s.dClient.DeviceConfigs(s.ns).Get(s.cfgName, metav1.GetOptions{})
 	assert.Errorf(c, err, fmt.Sprintf("expected no config to be present. but config %v exists", s.cfgName))
but config %v exists", s.cfgName)) - driverEnable := false - remediationEnable := true - devCfg := s.getDeviceConfig(c) - devCfg.Spec.Driver.Enable = &driverEnable - devCfg.Spec.RemediationWorkflow.Enable = &remediationEnable - devCfg.Spec.MetricsExporter.Enable = &remediationEnable - devCfg.Spec.MetricsExporter.Image = exporterImage - devCfg.Spec.MetricsExporter.ImagePullPolicy = "Always" - devCfg.Spec.MetricsExporter.Port = 5000 - devCfg.Spec.CommonConfig.UtilsContainer.Image = utilsContainerImage - devCfg.Spec.CommonConfig.UtilsContainer.ImagePullPolicy = "Always" + devCfg := s.populateDeviceConfig(c) logger.Infof("Creating DeviceConfig with remediation enabled and driver disabled") s.createDeviceConfig(devCfg, c) @@ -97,16 +189,16 @@ func (s *E2ESuite) TestAutoNodeRemediationWithoutPhysicalAction(c *C) { logger.Infof("Verify if Node Problem Detector (NPD) is running on all GPU nodes") s.verifyNPDRunning(c) - logger.Infof("Verifying that node condition %s is added for the node %s", remediationNodeCondition, nodeName) - s.verifyNodeCondition(c, remediationNodeCondition, corev1.ConditionTrue) + logger.Infof("Verifying that node condition %s is added for the node %s", conditionHWAssertion, nodeName) + s.verifyNodeCondition(c, conditionHWAssertion, corev1.ConditionFalse) // Trigger error condition by modifying NPD config logger.Infof("Edit Node Problem Detector (NPD) thresholds to simulate error condition") s.updateConfigForNPD(c, npdInbandRASConfigPath, npdInbandRASErrorConfigPath) - s.verifyNodeCondition(c, remediationNodeCondition, corev1.ConditionTrue) + s.verifyNodeCondition(c, conditionHWAssertion, corev1.ConditionTrue) - // Verify remediation workflow started and completed + // Verify remediation workflow is started and running logger.Infof("Verifying remediation workflow started on the node %s", nodeName) s.verifyRemediationWorkflowStatus(c, nodeName, string(wfv1alpha1.WorkflowRunning), 5) @@ -114,9 +206,240 @@ func (s *E2ESuite) TestAutoNodeRemediationWithoutPhysicalAction(c *C) { logger.Infof("Reverting Node Problem Detector (NPD) thresholds to original configuration") s.updateConfigForNPD(c, npdInbandRASErrorConfigPath, npdInbandRASConfigPath) + //verify workflow succeeded + logger.Infof("Waiting for remediation workflow to complete on the node %s", nodeName) + s.verifyRemediationWorkflowStatus(c, nodeName, string(wfv1alpha1.WorkflowSucceeded), 70) + + logger.Infof("Verifying that node condition %s is false on the node %s", conditionHWAssertion, nodeName) + s.verifyNodeCondition(c, conditionHWAssertion, corev1.ConditionFalse) +} + +func (s *E2ESuite) TestAutoNodeRemediationWithPhysicalAction(c *C) { + logger.Infof("Starting Auto Node Remediation Test with physical action") + if s.simEnable { + c.Skip("Skipping for non amd gpu testbed") + } + + nodes, err := s.clientSet.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{ + LabelSelector: "feature.node.kubernetes.io/amd-gpu=true", + }) + assert.NoError(c, err, "Failed to list nodes with AMD GPU label") + if len(nodes.Items) == 0 { + c.Fatalf("No nodes found with AMD GPU label") + } + nodeName := nodes.Items[0].Name + + _, err = s.dClient.DeviceConfigs(s.ns).Get(s.cfgName, metav1.GetOptions{}) + assert.Errorf(c, err, fmt.Sprintf("expected no config to be present. 
but config %v exists", s.cfgName)) + + devCfg := s.populateDeviceConfig(c) + + logger.Infof("Creating DeviceConfig with remediation enabled and driver disabled") + s.createDeviceConfig(devCfg, c) + s.checkMetricsExporterStatus(devCfg, s.ns, corev1.ServiceTypeClusterIP, c) + + // Wait for cluster to be up + logger.Infof("Waiting for device config to be applied") + time.Sleep(5 * time.Second) + + // Setup NPD + logger.Infof("Setting up Node Problem Detector (NPD)") + setupNPD(npdServiceAccountPath, npdInbandRASConfigPath, npdDaemonSetPath) + defer tearDownNPD(npdServiceAccountPath, npdInbandRASConfigPath, npdDaemonSetPath) + + logger.Infof("Verify if Node Problem Detector (NPD) is running on all GPU nodes") + s.verifyNPDRunning(c) + + logger.Infof("Verifying that node condition %s is added for the node %s", conditionInternalError, nodeName) + s.verifyNodeCondition(c, conditionInternalError, corev1.ConditionFalse) + + // Trigger error condition by modifying NPD config + logger.Infof("Edit Node Problem Detector (NPD) thresholds to simulate error condition") + s.updateConfigForNPD(c, npdInbandRASConfigPath, npdInband2RASErrorConfigPath) + + s.verifyNodeCondition(c, conditionInternalError, corev1.ConditionTrue) + + // Verify remediation workflow started + logger.Infof("Verifying remediation workflow started on the node %s", nodeName) + s.verifyRemediationWorkflowStatus(c, nodeName, string(wfv1alpha1.WorkflowRunning), 5) + + //verify workflow is suspended waiting for physical action + logger.Infof("Verifying remediation workflow is suspended on the node %s", nodeName) + assert.Eventually(c, func() bool { + return s.isWorkflowSuspended(c, nodeName) + }, 5*time.Minute, 10*time.Second, "Remediation workflow did not reach suspended state") + + // resume workflow by adding label to node + err = utils.AddNodeLabel(s.clientSet, nodeName, "operator.amd.com/gpu-force-resume-workflow", "true") + assert.NoError(c, err, "Failed to add label to resume workflow") + + logger.Infof("Reverting Node Problem Detector (NPD) thresholds to original configuration") + s.updateConfigForNPD(c, npdInband2RASErrorConfigPath, npdInbandRASConfigPath) + logger.Infof("Waiting for remediation workflow to complete on the node %s", nodeName) s.verifyRemediationWorkflowStatus(c, nodeName, string(wfv1alpha1.WorkflowSucceeded), 70) - logger.Infof("Verifying that node condition %s is false on the node %s", remediationNodeCondition, nodeName) - s.verifyNodeCondition(c, remediationNodeCondition, corev1.ConditionFalse) + logger.Infof("Verifying that node condition %s is false on the node %s", conditionInternalError, nodeName) + s.verifyNodeCondition(c, conditionInternalError, corev1.ConditionFalse) +} + +func (s *E2ESuite) TestAutoNodeRemediationAbortWorkflow(c *C) { + logger.Infof("Starting Auto Node Remediation abort workflow test") + if s.simEnable { + c.Skip("Skipping for non amd gpu testbed") + } + + nodes, err := s.clientSet.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{ + LabelSelector: "feature.node.kubernetes.io/amd-gpu=true", + }) + assert.NoError(c, err, "Failed to list nodes with AMD GPU label") + if len(nodes.Items) == 0 { + c.Fatalf("No nodes found with AMD GPU label") + } + nodeName := nodes.Items[0].Name + + _, err = s.dClient.DeviceConfigs(s.ns).Get(s.cfgName, metav1.GetOptions{}) + assert.Errorf(c, err, fmt.Sprintf("expected no config to be present. 
but config %v exists", s.cfgName)) + + devCfg := s.populateDeviceConfig(c) + + logger.Infof("Creating DeviceConfig with remediation enabled and driver disabled") + s.createDeviceConfig(devCfg, c) + s.checkMetricsExporterStatus(devCfg, s.ns, corev1.ServiceTypeClusterIP, c) + + // Wait for cluster to be up + logger.Infof("Waiting for device config to be applied") + time.Sleep(5 * time.Second) + + // Setup NPD + logger.Infof("Setting up Node Problem Detector (NPD)") + setupNPD(npdServiceAccountPath, npdInbandRASConfigPath, npdDaemonSetPath) + defer tearDownNPD(npdServiceAccountPath, npdInbandRASConfigPath, npdDaemonSetPath) + + logger.Infof("Verify if Node Problem Detector (NPD) is running on all GPU nodes") + s.verifyNPDRunning(c) + + logger.Infof("Verifying that node condition %s is added for the node %s", conditionInternalError, nodeName) + s.verifyNodeCondition(c, conditionInternalError, corev1.ConditionFalse) + + // Trigger error condition by modifying NPD config + logger.Infof("Edit Node Problem Detector (NPD) thresholds to simulate error condition") + s.updateConfigForNPD(c, npdInbandRASConfigPath, npdInband2RASErrorConfigPath) + + s.verifyNodeCondition(c, conditionInternalError, corev1.ConditionTrue) + + // Verify remediation workflow started + logger.Infof("Verifying remediation workflow started on the node %s", nodeName) + s.verifyRemediationWorkflowStatus(c, nodeName, string(wfv1alpha1.WorkflowRunning), 5) + + //verify workflow is suspended waiting for physical action + logger.Infof("Verifying remediation workflow is suspended on the node %s", nodeName) + assert.Eventually(c, func() bool { + return s.isWorkflowSuspended(c, nodeName) + }, 5*time.Minute, 10*time.Second, "Remediation workflow did not reach suspended state") + + // abort workflow by adding label to node + err = utils.AddNodeLabel(s.clientSet, nodeName, "operator.amd.com/gpu-abort-workflow", "true") + assert.NoError(c, err, "Failed to add label to abort workflow") + + logger.Infof("Reverting Node Problem Detector (NPD) thresholds to original configuration") + s.updateConfigForNPD(c, npdInband2RASErrorConfigPath, npdInbandRASConfigPath) + + //verify workflow is aborted and deleted + logger.Infof("Verifying remediation workflow is aborted and deleted on the node %s", nodeName) + assert.Eventually(c, func() bool { + return s.checkWorkflowExistence(c, nodeName, false) + }, 1*time.Minute, 10*time.Second, "Remediation workflow was not aborted and deleted") + s.untaintNode(nodeName) +} + +func (s *E2ESuite) TestAutoNodeRemediationRecoveryPolicy(c *C) { + logger.Infof("Starting Auto Node Remediation recovery policy test") + if s.simEnable { + c.Skip("Skipping for non amd gpu testbed") + } + + nodes, err := s.clientSet.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{ + LabelSelector: "feature.node.kubernetes.io/amd-gpu=true", + }) + assert.NoError(c, err, "Failed to list nodes with AMD GPU label") + if len(nodes.Items) == 0 { + c.Fatalf("No nodes found with AMD GPU label") + } + nodeName := nodes.Items[0].Name + + _, err = s.dClient.DeviceConfigs(s.ns).Get(s.cfgName, metav1.GetOptions{}) + assert.Errorf(c, err, fmt.Sprintf("expected no config to be present. 
but config %v exists", s.cfgName)) + + devCfg := s.populateDeviceConfig(c) + + //Clear previous state before starting the test + logger.Infof("Clean-up RemediationWorkflowStatus CR before the test") + s.clearRemediationWorkflowStatusMetaData(devCfg.Namespace, c) + + // Pre-populate RemediationWorkflowStatus with max retries + logger.Infof("Pre-populate RemediationWorkflowStatus with max retries for node %s and condition %s", nodeName, conditionInternalError) + s.addRemediationWorkflowStatusMetaData(devCfg.Namespace, nodeName, conditionInternalError, 4, c) + + logger.Infof("Creating DeviceConfig with remediation enabled and driver disabled") + s.createDeviceConfig(devCfg, c) + s.checkMetricsExporterStatus(devCfg, s.ns, corev1.ServiceTypeClusterIP, c) + + // Wait for cluster to be up + logger.Infof("Waiting for device config to be applied") + time.Sleep(5 * time.Second) + + // Setup NPD + logger.Infof("Setting up Node Problem Detector (NPD)") + setupNPD(npdServiceAccountPath, npdInbandRASConfigPath, npdDaemonSetPath) + defer tearDownNPD(npdServiceAccountPath, npdInbandRASConfigPath, npdDaemonSetPath) + + logger.Infof("Verify if Node Problem Detector (NPD) is running on all GPU nodes") + s.verifyNPDRunning(c) + + logger.Infof("Verifying that node condition %s is added for the node %s", conditionInternalError, nodeName) + s.verifyNodeCondition(c, conditionInternalError, corev1.ConditionFalse) + + // Trigger error condition by modifying NPD config + logger.Infof("Edit Node Problem Detector (NPD) thresholds to simulate error condition") + s.updateConfigForNPD(c, npdInbandRASConfigPath, npdInband2RASErrorConfigPath) + + s.verifyNodeCondition(c, conditionInternalError, corev1.ConditionTrue) + + // Verify remediation workflow is not started due to max retries reached + logger.Infof("Verifying remediation workflow is not started on the node %s due to max retries reached", nodeName) + assert.Eventually(c, func() bool { + return s.checkWorkflowExistence(c, nodeName, false) + }, 2*time.Minute, 10*time.Second, "Remediation workflow was started despite max retries reached") + + // Clear RemediationWorkflowStatus metadata + logger.Infof("Clearing RemediationWorkflowStatus metadata for node %s and condition %s", nodeName, conditionInternalError) + s.clearRemediationWorkflowStatusMetaData(devCfg.Namespace, c) + + // Verify remediation workflow is started and running now + logger.Infof("Verifying remediation workflow is started on the node %s after clearing metadata", nodeName) + assert.Eventually(c, func() bool { + return s.checkWorkflowExistence(c, nodeName, true) + }, 2*time.Minute, 10*time.Second, "Remediation workflow was started despite max retries reached") + + //verify workflow is suspended waiting for physical action + logger.Infof("Verifying remediation workflow is suspended on the node %s", nodeName) + assert.Eventually(c, func() bool { + return s.isWorkflowSuspended(c, nodeName) + }, 3*time.Minute, 10*time.Second, "Remediation workflow did not reach suspended state") + + // abort workflow by adding label to node + logger.Infof("Aborting the suspended workflow") + err = utils.AddNodeLabel(s.clientSet, nodeName, "operator.amd.com/gpu-abort-workflow", "true") + assert.NoError(c, err, "Failed to add label to abort workflow") + + logger.Infof("Reverting Node Problem Detector (NPD) thresholds to original configuration") + s.updateConfigForNPD(c, npdInband2RASErrorConfigPath, npdInbandRASConfigPath) + + //verify workflow is aborted and deleted + logger.Infof("Verifying remediation workflow is aborted 
+	assert.Eventually(c, func() bool {
+		return s.checkWorkflowExistence(c, nodeName, false)
+	}, 1*time.Minute, 10*time.Second, "Remediation workflow was not aborted and deleted")
+	s.untaintNode(nodeName)
 }
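One design note on the recovery-policy test above: it infers the policy purely from workflow existence. Since the suite now carries `wfStatusClient`, a follow-up could also assert the attempt bookkeeping directly; a sketch (the helper name and expectation are hypothetical, not part of this change):

```go
// Hypothetical helper: wait until at least `want` attempts are recorded for a
// node/condition pair in the RemediationWorkflowStatus CR named "default".
func (s *E2ESuite) waitForRecordedAttempts(c *C, ns, nodeName, nodeCondition string, want int) {
	assert.Eventually(c, func() bool {
		wfstatus, err := s.wfStatusClient.Get("default", ns)
		if err != nil {
			return false
		}
		// Reads on nil nested maps are safe in Go and simply yield zero values.
		return len(wfstatus.Status[nodeName][nodeCondition]) >= want
	}, 2*time.Minute, 10*time.Second, "expected remediation attempts were not recorded in the status CR")
}
```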
+ "-t=-1" + ], + "timeout": "15s" + } + ] + } +kind: ConfigMap +metadata: + name: node-problem-detector-config + namespace: kube-system