Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion helm-charts-k8s/Chart.lock
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@ dependencies:
repository: file://./charts/remediation
version: v1.0.0
digest: sha256:41fa6a6232514acebf6abdcb1bccaf087e134b9f413b8fa33a7fec1f58a99e07
generated: "2026-01-05T19:49:07.882445585Z"
generated: "2026-01-07T10:51:28.442192317Z"
2 changes: 2 additions & 0 deletions helm-charts-k8s/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ AMD GPU Operator simplifies the deployment and management of AMD Instinct GPU ac
* AMD GPU Operator Controller
* K8s Device Plugin
* K8s Node Labeller
* Device Config Manager
* Device Metrics Exporter
* Device Test Runner
* Node Feature Discovery Operator
Expand All @@ -24,6 +25,7 @@ AMD GPU Operator simplifies the deployment and management of AMD Instinct GPU ac
* Comprehensive metrics collection and export
* Easy deployment of AMD GPU device plugin for Kubernetes
* Automated labeling of nodes with AMD GPU capabilities
* GPU partitioning configuration and management via Device Config Manager
* Compatibility with standard Kubernetes environments
* Efficient GPU resource allocation for containerized workloads
* GPU health monitoring and troubleshooting
Expand Down
2 changes: 1 addition & 1 deletion helm-charts-openshift/Chart.lock
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@ dependencies:
repository: file://./charts/kmm
version: v1.0.0
digest: sha256:25200c34a5cc846a1275e5bf3fc637b19e909dc68de938189c5278d77d03f5ac
generated: "2025-12-09T09:27:48.895675076Z"
generated: "2026-01-07T10:51:41.178709798Z"
14 changes: 0 additions & 14 deletions internal/controllers/mock_remediation_handler.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

30 changes: 11 additions & 19 deletions internal/controllers/remediation_handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -165,14 +165,11 @@ func (n *remediationMgr) HandleRemediation(ctx context.Context, devConfig *amdv1
return res, err
}

// If statusSynced is false, we need to populate the internal map from the status CR
if !n.helper.isStatusSynced(ctx) {
if err := n.helper.syncInternalMapFromStatusCR(ctx, devConfig.Namespace); err != nil {
logger.Error(err, "Failed to sync internal map from status CR")
return res, err
}
logger.Info("Internal map synced from status CR successfully")
if err := n.helper.syncInternalMapFromStatusCR(ctx, devConfig.Namespace); err != nil {
logger.Error(err, "Failed to sync internal map from status CR")
return res, err
}
logger.Info("Internal map synced from status CR successfully")

var mappingsList []ConditionWorkflowMapping
if err = yaml.Unmarshal([]byte(configMap.Data["workflow"]), &mappingsList); err != nil {
Expand Down Expand Up @@ -304,7 +301,6 @@ type remediationMgrHelperAPI interface {
isRecoveryPolicyViolated(ctx context.Context, nodeName string, mapping *ConditionWorkflowMapping) bool
canResumeWorkflowOnNode(ctx context.Context, node *v1.Node, mapping *ConditionWorkflowMapping) bool
syncInternalMapFromStatusCR(ctx context.Context, namespace string) error
isStatusSynced(ctx context.Context) bool
isNodeLabelledForForceResume(ctx context.Context, node *v1.Node) bool
removeForceResumeWorkflowLabelFromNode(ctx context.Context, node *v1.Node) error
isNodeLabelledForAbortWorkflow(node *v1.Node) bool
Expand All @@ -321,7 +317,6 @@ type remediationMgrHelper struct {
client client.Client
k8sInterface kubernetes.Interface
recoveryTracker *sync.Map
statusSynced bool
serviceAccountName string
maxParallelWorkflows int
}
Expand All @@ -332,7 +327,6 @@ func newRemediationMgrHelperHandler(client client.Client, k8sInterface kubernete
client: client,
k8sInterface: k8sInterface,
recoveryTracker: new(sync.Map),
statusSynced: false,
}
}

Expand Down Expand Up @@ -1057,6 +1051,12 @@ func (h *remediationMgrHelper) isWorkflowSchedulableOnNode(ctx context.Context,
logger.Info(fmt.Sprintf("Driver Install/Upgrade is in progress, skipping creation of workflow on node %s", node.Name))
return false
}

// if same node condition remediation workflow has crossed max threshold, skip the node
if h.isRecoveryPolicyViolated(ctx, node.Name, &mapping) {
logger.Info(fmt.Sprintf("Max remediation attempts reached for node %s on condition %s, skipping creation of workflow", node.Name, mapping.NodeCondition))
return false
}
return true
}

Expand Down Expand Up @@ -1372,10 +1372,7 @@ func (h *remediationMgrHelper) syncInternalMapFromStatusCR(ctx context.Context,
return fmt.Errorf("failed to get remediation workflow status: %w", err)
}

if wfStatus.Status == nil {
h.statusSynced = true
return nil // Nothing to sync
}
h.recoveryTracker = new(sync.Map)

for nodeName, conditions := range wfStatus.Status {
for nodeCondition, attempts := range conditions {
Expand All @@ -1393,14 +1390,9 @@ func (h *remediationMgrHelper) syncInternalMapFromStatusCR(ctx context.Context,
}
}

h.statusSynced = true
return nil
}

func (h *remediationMgrHelper) isStatusSynced(ctx context.Context) bool {
return h.statusSynced
}

func (h *remediationMgrHelper) isRecoveryPolicyViolated(ctx context.Context, nodeName string, mapping *ConditionWorkflowMapping) bool {
logger := log.FromContext(ctx)

Expand Down
89 changes: 89 additions & 0 deletions tests/e2e/client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package client
import (
"context"
"encoding/json"
"fmt"

"github.com/ROCm/gpu-operator/api/v1alpha1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -382,3 +383,91 @@ func (c *deviceConfigsClient) Delete(name string) (*v1alpha1.DeviceConfig, error

return &result, err
}

// RemediationWorkflowStatusClient is a thin REST client for the
// RemediationWorkflowStatus custom resource (group amd.com/v1alpha1), used by
// the e2e tests to create, read, update and delete status objects.
type RemediationWorkflowStatusClient struct {
	// restClient issues requests against the amd.com/v1alpha1 API group.
	restClient rest.Interface
}

// NewWfStatusClient returns a REST client configured for the amd.com/v1alpha1
// API group, suitable for operating on RemediationWorkflowStatus objects.
// The caller's rest.Config is not modified.
func NewWfStatusClient(c *rest.Config) (*RemediationWorkflowStatusClient, error) {
	cfg := *c // shallow copy so the caller's config stays untouched
	cfg.ContentConfig.GroupVersion = &v1alpha1.GroupVersion
	cfg.APIPath = "/apis"
	cfg.NegotiatedSerializer = scheme.Codecs.WithoutConversion()
	cfg.UserAgent = rest.DefaultKubernetesUserAgent()

	rc, err := rest.RESTClientFor(&cfg)
	if err != nil {
		return nil, err
	}
	return &RemediationWorkflowStatusClient{restClient: rc}, nil
}

// Create posts a new RemediationWorkflowStatus object to the cluster and
// returns the server's representation of it.
func (c *RemediationWorkflowStatusClient) Create(rwfstatus *v1alpha1.RemediationWorkflowStatus) (*v1alpha1.RemediationWorkflowStatus, error) {
	// TypeMeta is not populated automatically on typed objects; set it so the
	// request body carries the expected kind/apiVersion.
	rwfstatus.TypeMeta = metav1.TypeMeta{
		Kind:       "RemediationWorkflowStatus",
		APIVersion: "amd.com/v1alpha1",
	}

	var created v1alpha1.RemediationWorkflowStatus
	err := c.restClient.
		Post().
		Namespace(rwfstatus.Namespace).
		Resource("remediationworkflowstatuses").
		Body(rwfstatus).
		Do(context.TODO()).
		Into(&created)
	return &created, err
}

// Update replaces the status subresource of an existing
// RemediationWorkflowStatus object and returns the updated representation.
// A non-nil error is wrapped with context; on success the error is nil.
func (c *RemediationWorkflowStatusClient) Update(rwfstatus *v1alpha1.RemediationWorkflowStatus) (*v1alpha1.RemediationWorkflowStatus, error) {
	result := v1alpha1.RemediationWorkflowStatus{}
	// TypeMeta is not populated automatically on typed objects; set it so the
	// request body carries the expected kind/apiVersion.
	rwfstatus.TypeMeta = metav1.TypeMeta{
		Kind:       "RemediationWorkflowStatus",
		APIVersion: "amd.com/v1alpha1",
	}
	err := c.restClient.
		Put().
		Namespace(rwfstatus.Namespace).
		Resource("remediationworkflowstatuses").
		Name(rwfstatus.Name).
		SubResource("status").
		Body(rwfstatus).
		Do(context.TODO()).
		Into(&result)

	if err != nil {
		return nil, fmt.Errorf("failed to update status: %w", err)
	}

	// err is provably nil here; return an explicit nil instead of the stale
	// variable so the success path reads unambiguously.
	return &result, nil
}

// Get fetches the RemediationWorkflowStatus with the given name from the
// given namespace.
func (c *RemediationWorkflowStatusClient) Get(name, namespace string) (*v1alpha1.RemediationWorkflowStatus, error) {
	var out v1alpha1.RemediationWorkflowStatus
	req := c.restClient.
		Get().
		Namespace(namespace).
		Resource("remediationworkflowstatuses").
		Name(name)
	err := req.Do(context.TODO()).Into(&out)
	return &out, err
}

// Delete removes the named RemediationWorkflowStatus object from the given
// namespace.
//
// The request addresses the object by name via Name(name). The previous
// implementation issued the DELETE against the collection URL with the object
// serialized in the request body, which does not target the named resource.
func (c *RemediationWorkflowStatusClient) Delete(name string, namespace string) (*v1alpha1.RemediationWorkflowStatus, error) {
	result := v1alpha1.RemediationWorkflowStatus{}
	err := c.restClient.
		Delete().
		Namespace(namespace).
		Resource("remediationworkflowstatuses").
		Name(name).
		Do(context.TODO()).
		Into(&result)

	return &result, err
}
3 changes: 3 additions & 0 deletions tests/e2e/cluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,9 @@ func (s *E2ESuite) getDeviceConfig(c *C) *v1alpha1.DeviceConfig {
Port: 5001,
},
Selector: map[string]string{"feature.node.kubernetes.io/amd-gpu": "true"},
CommonConfig: v1alpha1.CommonConfigSpec{
InitContainerImage: initContainerImage,
},
},
}
insecure := true
Expand Down
1 change: 1 addition & 0 deletions tests/e2e/doc.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,4 +42,5 @@ type E2ESuite struct {
monClient monitoringClient.Interface
apiClientSet *apiextClient.Clientset
framework string
wfStatusClient *client.RemediationWorkflowStatusClient
}
6 changes: 6 additions & 0 deletions tests/e2e/e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,12 @@ func (s *E2ESuite) SetUpSuite(c *C) {
}
s.dClient = dcCli

wfstatusClient, err := client.NewWfStatusClient(config)
if err != nil {
c.Fatalf("Error: %v", err.Error())
}
s.wfStatusClient = wfstatusClient

err = apiextv1.AddToScheme(scheme.Scheme)
if err != nil {
c.Fatalf("Error: %v", err.Error())
Expand Down
Loading
Loading