diff --git a/api/v1alpha1/seinodetask_types.go b/api/v1alpha1/seinodetask_types.go index 9c71413..e1b3b04 100644 --- a/api/v1alpha1/seinodetask_types.go +++ b/api/v1alpha1/seinodetask_types.go @@ -6,7 +6,7 @@ import ( // SeiNodeTaskKind discriminates the SeiNodeTask spec union. Exactly one of // the matching payload sub-structs in SeiNodeTaskSpec must be set. -// +kubebuilder:validation:Enum=GovSoftwareUpgrade;GovVote;AwaitCondition;UpdateNodeImage;AwaitNodesAtHeight;DiscoverPeers;RestartPod +// +kubebuilder:validation:Enum=GovSoftwareUpgrade;GovVote;AwaitCondition;UpdateNodeImage;AwaitNodesAtHeight;DiscoverPeers;RestartPod;MarkReady type SeiNodeTaskKind string const ( @@ -40,9 +40,9 @@ const ( // SeiNodeTaskKindDiscoverPeers backs the sidecar `discover-peers` task. // Re-resolves the target's spec.peers and writes persistent-peers into the - // on-disk config.toml (scalar merge). Disk-only: a running seid does not - // re-read config.toml, so compose with kind=RestartPod to apply. The two are - // not atomic — a DiscoverPeers success followed by a RestartPod failure + // on-disk config.toml (scalar merge). Disk-only: a running seid reads + // config.toml at startup, so compose with kind=RestartPod to apply. The two + // run independently — a DiscoverPeers success followed by a RestartPod failure // leaves config.toml ahead of the running peer set until the next restart. SeiNodeTaskKindDiscoverPeers SeiNodeTaskKind = "DiscoverPeers" @@ -54,6 +54,17 @@ const ( // The pod to delete is caller-supplied via spec.restartPod.podUID; a UID that // no longer matches the live pod completes as a no-op (see PodUID). SeiNodeTaskKindRestartPod SeiNodeTaskKind = "RestartPod" + + // SeiNodeTaskKindMarkReady backs the sidecar `mark-ready` task (fire-and-forget). + // Re-marks sidecar readiness so /v0/healthz returns 200, which unblocks the seid + // start-gate and the proxy readiness probe. Useful after a readiness-blind restart + // or image rollout to promote a parked node promptly, rather than waiting for the + // node controller's reapproval poll. Empty payload — nothing to parameterize. + // + // Completion is the submit ack: the task reports Complete once the sidecar + // accepts the request, a beat before /v0/healthz serves 200. Gate on the node + // actually serving with a following AwaitCondition/AwaitNodesAtHeight step. + SeiNodeTaskKindMarkReady SeiNodeTaskKind = "MarkReady" ) // SeiNodeTaskPhase is the high-level lifecycle state of a SeiNodeTask. @@ -99,7 +110,7 @@ const ( // Field names locked at v1alpha1 — see docs/design/seinode-task-lld.md // (PR sei-protocol/sei-k8s-controller#277). // -// +kubebuilder:validation:XValidation:rule="(has(self.govSoftwareUpgrade) ? 1 : 0) + (has(self.govVote) ? 1 : 0) + (has(self.awaitCondition) ? 1 : 0) + (has(self.updateNodeImage) ? 1 : 0) + (has(self.awaitNodesAtHeight) ? 1 : 0) + (has(self.discoverPeers) ? 1 : 0) + (has(self.restartPod) ? 1 : 0) == 1",message="exactly one of govSoftwareUpgrade, govVote, awaitCondition, updateNodeImage, awaitNodesAtHeight, discoverPeers, or restartPod must be set" +// +kubebuilder:validation:XValidation:rule="(has(self.govSoftwareUpgrade) ? 1 : 0) + (has(self.govVote) ? 1 : 0) + (has(self.awaitCondition) ? 1 : 0) + (has(self.updateNodeImage) ? 1 : 0) + (has(self.awaitNodesAtHeight) ? 1 : 0) + (has(self.discoverPeers) ? 1 : 0) + (has(self.restartPod) ? 1 : 0) + (has(self.markReady) ? 1 : 0) == 1",message="exactly one of govSoftwareUpgrade, govVote, awaitCondition, updateNodeImage, awaitNodesAtHeight, discoverPeers, restartPod, or markReady must be set" // +kubebuilder:validation:XValidation:rule="self.kind != 'GovSoftwareUpgrade' || has(self.govSoftwareUpgrade)",message="spec.govSoftwareUpgrade is required when kind=GovSoftwareUpgrade" // +kubebuilder:validation:XValidation:rule="self.kind != 'GovVote' || has(self.govVote)",message="spec.govVote is required when kind=GovVote" // +kubebuilder:validation:XValidation:rule="self.kind != 'AwaitCondition' || has(self.awaitCondition)",message="spec.awaitCondition is required when kind=AwaitCondition" @@ -108,6 +119,7 @@ const ( // +kubebuilder:validation:XValidation:rule="self.kind != 'DiscoverPeers' || has(self.discoverPeers)",message="spec.discoverPeers is required when kind=DiscoverPeers" // +kubebuilder:validation:XValidation:rule="self.kind != 'RestartPod' || has(self.restartPod)",message="spec.restartPod is required when kind=RestartPod" // +kubebuilder:validation:XValidation:rule="self.kind != 'RestartPod' || (has(self.restartPod) && size(self.restartPod.podUID) > 0)",message="spec.restartPod.podUID is required when kind=RestartPod" +// +kubebuilder:validation:XValidation:rule="self.kind != 'MarkReady' || has(self.markReady)",message="spec.markReady is required when kind=MarkReady" // +kubebuilder:validation:XValidation:rule="self.kind == oldSelf.kind",message="spec.kind is immutable" type SeiNodeTaskSpec struct { // Kind selects the task implementation. Immutable after creation. @@ -158,6 +170,10 @@ type SeiNodeTaskSpec struct { // RestartPod is the payload for kind=RestartPod. // +optional RestartPod *RestartPodPayload `json:"restartPod,omitempty"` + + // MarkReady is the payload for kind=MarkReady. + // +optional + MarkReady *MarkReadyPayload `json:"markReady,omitempty"` } // SeiNodeTaskTarget identifies the single SeiNode this task operates on. @@ -346,13 +362,11 @@ type AwaitNodesAtHeightPayload struct { // DiscoverPeersPayload is the payload for kind=DiscoverPeers. It is empty: the // task re-resolves the target SeiNode's current spec.peers (ec2Tags, static, // and label sources) and writes persistent-peers into the on-disk config.toml -// via the sidecar discover-peers task. There is nothing to parameterize — the -// peer sources are fully determined by the target's spec/status. Fields would -// only be added here if a future feature needs to override the target's -// declared peers (not in scope). +// via the sidecar discover-peers task. It is empty: the peer sources are fully +// determined by the target's spec/status, so there is nothing to parameterize. // -// Writes config.toml only; the running seid does not pick up the new peers -// until a restart. Compose with kind=RestartPod to apply. See the +// Writes config.toml only; the running seid picks up the new peers on its next +// restart. Compose with kind=RestartPod to apply. See the // SeiNodeTaskKindDiscoverPeers doc comment for the sequencing and atomicity // caveats. type DiscoverPeersPayload struct{} @@ -370,14 +384,20 @@ type RestartPodPayload struct { // a different UID appears. Content-addressed (UID, not creationTimestamp) so // the OnDelete replacement is unambiguously distinguished from the original. // - // The caller owns UID correctness: a non-empty UID that no longer matches the - // live pod (e.g. the pod was recreated out-of-band after it was read) deletes - // nothing and completes immediately as a no-op. Fetch the UID as late as - // possible — the controller does not re-validate it against the live pod. + // The caller owns UID correctness: the controller uses this UID verbatim, so a + // UID that no longer matches the live pod (e.g. the pod was recreated + // out-of-band after it was read) completes immediately as a no-op. Fetch the + // UID as late as possible. // +kubebuilder:validation:MinLength=1 PodUID string `json:"podUID"` } +// MarkReadyPayload is the payload for kind=MarkReady. It is empty: the sidecar +// mark-ready task (sidecar.MarkReadyTask) takes no inputs — it re-marks the +// target sidecar's in-process readiness flag so /v0/healthz serves 200. See the +// SeiNodeTaskKindMarkReady doc comment for the use case. +type MarkReadyPayload struct{} + // --------------------------------------------------------------------------- // Status // --------------------------------------------------------------------------- diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 61fb375..f7b50d2 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -448,6 +448,21 @@ func (in *LabelPeerSource) DeepCopy() *LabelPeerSource { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *MarkReadyPayload) DeepCopyInto(out *MarkReadyPayload) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MarkReadyPayload. +func (in *MarkReadyPayload) DeepCopy() *MarkReadyPayload { + if in == nil { + return nil + } + out := new(MarkReadyPayload) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *NetworkingConfig) DeepCopyInto(out *NetworkingConfig) { *out = *in @@ -1311,6 +1326,11 @@ func (in *SeiNodeTaskSpec) DeepCopyInto(out *SeiNodeTaskSpec) { *out = new(RestartPodPayload) **out = **in } + if in.MarkReady != nil { + in, out := &in.MarkReady, &out.MarkReady + *out = new(MarkReadyPayload) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SeiNodeTaskSpec. diff --git a/config/crd/sei.io_seinodetasks.yaml b/config/crd/sei.io_seinodetasks.yaml index 20ebf8b..75b2822 100644 --- a/config/crd/sei.io_seinodetasks.yaml +++ b/config/crd/sei.io_seinodetasks.yaml @@ -250,7 +250,11 @@ spec: - AwaitNodesAtHeight - DiscoverPeers - RestartPod + - MarkReady type: string + markReady: + description: MarkReady is the payload for kind=MarkReady. + type: object restartPod: description: RestartPod is the payload for kind=RestartPod. properties: @@ -264,10 +268,10 @@ spec: a different UID appears. Content-addressed (UID, not creationTimestamp) so the OnDelete replacement is unambiguously distinguished from the original. - The caller owns UID correctness: a non-empty UID that no longer matches the - live pod (e.g. the pod was recreated out-of-band after it was read) deletes - nothing and completes immediately as a no-op. Fetch the UID as late as - possible — the controller does not re-validate it against the live pod. + The caller owns UID correctness: the controller uses this UID verbatim, so a + UID that no longer matches the live pod (e.g. the pod was recreated + out-of-band after it was read) completes immediately as a no-op. Fetch the + UID as late as possible. minLength: 1 type: string required: @@ -343,12 +347,13 @@ spec: type: object x-kubernetes-validations: - message: exactly one of govSoftwareUpgrade, govVote, awaitCondition, - updateNodeImage, awaitNodesAtHeight, discoverPeers, or restartPod - must be set + updateNodeImage, awaitNodesAtHeight, discoverPeers, restartPod, or + markReady must be set rule: '(has(self.govSoftwareUpgrade) ? 1 : 0) + (has(self.govVote) ? 1 : 0) + (has(self.awaitCondition) ? 1 : 0) + (has(self.updateNodeImage) ? 1 : 0) + (has(self.awaitNodesAtHeight) ? 1 : 0) + (has(self.discoverPeers) - ? 1 : 0) + (has(self.restartPod) ? 1 : 0) == 1' + ? 1 : 0) + (has(self.restartPod) ? 1 : 0) + (has(self.markReady) ? + 1 : 0) == 1' - message: spec.govSoftwareUpgrade is required when kind=GovSoftwareUpgrade rule: self.kind != 'GovSoftwareUpgrade' || has(self.govSoftwareUpgrade) - message: spec.govVote is required when kind=GovVote @@ -366,6 +371,8 @@ spec: - message: spec.restartPod.podUID is required when kind=RestartPod rule: self.kind != 'RestartPod' || (has(self.restartPod) && size(self.restartPod.podUID) > 0) + - message: spec.markReady is required when kind=MarkReady + rule: self.kind != 'MarkReady' || has(self.markReady) - message: spec.kind is immutable rule: self.kind == oldSelf.kind status: diff --git a/internal/controller/nodetask/controller.go b/internal/controller/nodetask/controller.go index e7dc690..6e3dcbd 100644 --- a/internal/controller/nodetask/controller.go +++ b/internal/controller/nodetask/controller.go @@ -61,6 +61,7 @@ const ( // operator sets spec.timeoutSeconds). defaultRestartPodTimeout = 10 * time.Minute defaultDiscoverPeersTimeout = 2 * time.Minute + defaultMarkReadyTimeout = 2 * time.Minute ) // resultRequeueImmediate mirrors planner.ResultRequeueImmediate without @@ -330,6 +331,8 @@ func effectiveTimeout(cr *seiv1alpha1.SeiNodeTask) time.Duration { return defaultRestartPodTimeout case seiv1alpha1.SeiNodeTaskKindDiscoverPeers: return defaultDiscoverPeersTimeout + case seiv1alpha1.SeiNodeTaskKindMarkReady: + return defaultMarkReadyTimeout default: return 0 } @@ -359,13 +362,11 @@ func taskParamsForKind(cr *seiv1alpha1.SeiNodeTask, target *seiv1alpha1.SeiNode) // populateOutputs stamps the typed per-kind outputs on Complete. // // Sidecar-backed kinds (GovVote, GovSoftwareUpgrade, AwaitCondition, -// AwaitNodesAtHeight) intentionally do NOT populate status.outputs in this -// PR. The sidecar's TaskResult shape carries the values, but extracting -// them would require structural changes on the sidecar side (typed -// per-task result payloads). We defer that work and leave the typed -// output fields on the CRD unset and forward-compatible. Downstream -// consumers coordinate via chain queries (chain-as-medium), not -// task-to-task currying. See conversation history / PR 3 scope notes. +// AwaitNodesAtHeight) leave their typed output fields unset for now: surfacing +// the values the sidecar's TaskResult carries needs typed per-task result +// payloads on the sidecar side, which is deferred. The CRD fields stay +// forward-compatible, and downstream consumers coordinate via chain queries +// (chain-as-medium) rather than task-to-task currying. func populateOutputs(cr *seiv1alpha1.SeiNodeTask, target *seiv1alpha1.SeiNode) { switch cr.Spec.Kind { case seiv1alpha1.SeiNodeTaskKindUpdateNodeImage: diff --git a/internal/controller/nodetask/controller_test.go b/internal/controller/nodetask/controller_test.go index ff12cc9..e17846c 100644 --- a/internal/controller/nodetask/controller_test.go +++ b/internal/controller/nodetask/controller_test.go @@ -235,6 +235,7 @@ type fakeSidecarClient struct { mu sync.Mutex submitted []sidecar.TaskRequest results map[uuid.UUID]*sidecar.TaskResult + getCalls int } func newFakeSidecarClient() *fakeSidecarClient { @@ -254,6 +255,7 @@ func (f *fakeSidecarClient) SubmitTask(_ context.Context, req sidecar.TaskReques func (f *fakeSidecarClient) GetTask(_ context.Context, id uuid.UUID) (*sidecar.TaskResult, error) { f.mu.Lock() defer f.mu.Unlock() + f.getCalls++ if r, ok := f.results[id]; ok { return r, nil } @@ -907,6 +909,60 @@ func TestReconcile_DiscoverPeers_LongTargetWait_DoesNotImmediatelyTimeOut(t *tes g.Expect(getTask(t, ctx, c).Status.Phase).To(Equal(seiv1alpha1.SeiNodeTaskPhaseComplete)) } +// --------------------------------------------------------------------------- +// MarkReady +// --------------------------------------------------------------------------- + +func newMarkReadyTask() *seiv1alpha1.SeiNodeTask { + return &seiv1alpha1.SeiNodeTask{ + ObjectMeta: metav1.ObjectMeta{Name: testTaskName, Namespace: testNS, UID: "task-uid-markready", Generation: 1}, + Spec: seiv1alpha1.SeiNodeTaskSpec{ + Kind: seiv1alpha1.SeiNodeTaskKindMarkReady, + Target: seiv1alpha1.SeiNodeTaskTarget{ + NodeRef: seiv1alpha1.SeiNodeTaskNodeRef{Name: testNodeName}, + RequirePhase: seiv1alpha1.PhaseRunning, + }, + MarkReady: &seiv1alpha1.MarkReadyPayload{}, + }, + } +} + +// MarkReady is fire-and-forget (registered sidecarTask[...](true)): Execute +// completes the task the moment SubmitTask is acked, so Complete means "the +// mark-ready request was accepted" and lands at the submit reconcile, with +// Status reading back the cached terminal state. The test pins that contract: +// Complete at R2, and getCalls==0 confirms the completion served from cache. +func TestReconcile_MarkReady_EndToEnd(t *testing.T) { + g := NewWithT(t) + ctx := context.Background() + t0 := time.Now() + cr := newMarkReadyTask() + node := newRunningNode() + fakeSC := newFakeSidecarClient() + + r, c := newReconcilerWithSidecar(t, t0, fakeSC, cr, node) + + // R1: synthesize task. + _, err := r.Reconcile(ctx, req()) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(getTask(t, ctx, c).Status.Phase).To(Equal(seiv1alpha1.SeiNodeTaskPhaseRunning)) + + // R2: Execute submits mark-ready and completes immediately on the ack — + // no staged result, no further reconcile required. + _, err = r.Reconcile(ctx, req()) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(getTask(t, ctx, c).Status.Phase).To(Equal(seiv1alpha1.SeiNodeTaskPhaseComplete)) + + fakeSC.mu.Lock() + defer fakeSC.mu.Unlock() + g.Expect(fakeSC.submitted).To(HaveLen(1)) + g.Expect(fakeSC.submitted[0].Type).To(Equal(sidecar.TaskTypeMarkReady)) + // Completion served from the cached terminal state set at submit; getCalls==0 + // pins the fire-and-forget contract (a sidecarTask[...](false) regression, + // which polls GetTask to terminal, would trip this). + g.Expect(fakeSC.getCalls).To(Equal(0)) +} + // Execution-start timeout still fires: a DiscoverPeers task whose sidecar never // completes Fails(Timeout) at executionStartedAt + default, confirming the // budget is enforced (just from the right reference point). diff --git a/internal/controller/nodetask/envtest/cel_validation_test.go b/internal/controller/nodetask/envtest/cel_validation_test.go index d834b77..8b60816 100644 --- a/internal/controller/nodetask/envtest/cel_validation_test.go +++ b/internal/controller/nodetask/envtest/cel_validation_test.go @@ -53,6 +53,41 @@ func TestCEL_RestartPod_Accepted(t *testing.T) { g.Expect(testCli.Create(testCtx, snt)).To(Succeed()) } +// MarkReady with its matching empty payload is accepted. +func TestCEL_MarkReady_Accepted(t *testing.T) { + g := NewWithT(t) + ns := makeNamespace(t) + snt := baseTask(ns, "markready-ok", seiv1alpha1.SeiNodeTaskKindMarkReady) + snt.Spec.MarkReady = &seiv1alpha1.MarkReadyPayload{} + g.Expect(testCli.Create(testCtx, snt)).To(Succeed()) +} + +// kind=MarkReady with NO payload is rejected (zero payloads / kind-required rule). +func TestCEL_MarkReady_NoPayload_Rejected(t *testing.T) { + g := NewWithT(t) + ns := makeNamespace(t) + snt := baseTask(ns, "markready-nopayload", seiv1alpha1.SeiNodeTaskKindMarkReady) + err := testCli.Create(testCtx, snt) + g.Expect(err).To(HaveOccurred()) + g.Expect(err.Error()).To(Or( + ContainSubstring("exactly one"), + ContainSubstring("markReady is required"), + )) +} + +// kind=MarkReady with a second payload (markReady + restartPod) is rejected by +// the exactly-one union rule. +func TestCEL_MarkReady_MultiplePayloads_Rejected(t *testing.T) { + g := NewWithT(t) + ns := makeNamespace(t) + snt := baseTask(ns, "markready-two-payloads", seiv1alpha1.SeiNodeTaskKindMarkReady) + snt.Spec.MarkReady = &seiv1alpha1.MarkReadyPayload{} + snt.Spec.RestartPod = &seiv1alpha1.RestartPodPayload{PodUID: "pod-uid-1"} + err := testCli.Create(testCtx, snt) + g.Expect(err).To(HaveOccurred()) + g.Expect(err.Error()).To(ContainSubstring("exactly one")) +} + // kind=DiscoverPeers with NO payload is rejected (zero payloads). func TestCEL_DiscoverPeers_NoPayload_Rejected(t *testing.T) { g := NewWithT(t) diff --git a/internal/task/restart_pod.go b/internal/task/restart_pod.go index 1015986..5fee6c1 100644 --- a/internal/task/restart_pod.go +++ b/internal/task/restart_pod.go @@ -21,13 +21,13 @@ const TaskTypeRestartPod = "restart-pod" // RestartedPodUID is the content-addressed restart signal, supplied by the // caller (spec.restartPod.podUID), copied to status.task at synthesis, and // threaded through params so it is stable across reconciles. The task deletes -// only this pod and completes once an owned Ready pod with a different UID -// exists. Keying on UID rather than a creation-time epoch avoids the +// this pod and completes once a replacement owned pod (a different UID) is +// Ready. Keying on UID rather than a creation-time epoch survives the // same-second-truncation race: an OnDelete replacement always has a fresh UID. -// CEL requires a non-empty podUID for kind=RestartPod; an empty UID reaching the -// task is treated as a wait, never success. The caller owns UID correctness: a -// non-empty UID that matches no live pod deletes nothing and completes as a -// no-op (the contract is documented on the PodUID API field). +// CEL requires a non-empty podUID for kind=RestartPod. The caller owns UID +// correctness: the task acts only on a UID that matches a live pod, so a stale +// UID leaves the pod in place and completes as a no-op (the contract is +// documented on the PodUID API field). type RestartPodParams struct { NodeName string `json:"nodeName"` Namespace string `json:"namespace"` @@ -59,10 +59,11 @@ func deserializeRestartPod(id string, params json.RawMessage, cfg ExecutionConfi // controller's responsibility, and OnDelete recreates the pod at the same // (unchanged) revision so seid re-reads config.toml on start. // -// Idempotent and stateless across reconciles: the captured pod is deleted; a -// replacement pod (different UID), an empty UID (none captured), or a missing -// pod is a no-op. A missing StatefulSet/pod is a transient wait (apply-statefulset -// or scheduling may lag), not an error. +// Idempotent and stateless across reconciles: Execute deletes the pod whose UID +// matches RestartedPodUID. Any other observed state — the replacement pod (a +// different UID), an empty UID, or a missing pod — leaves the cluster untouched. +// A missing StatefulSet/pod is a transient wait (apply-statefulset or scheduling +// may lag) and reports Running for the next reconcile. func (e *restartPodExecution) Execute(ctx context.Context) error { node, sts, err := e.fetchStatefulSet(ctx) if err != nil { @@ -75,10 +76,10 @@ func (e *restartPodExecution) Execute(ctx context.Context) error { return err } - // Defense-in-depth: the synthesis site never dispatches an empty UID for - // RestartPod, so this is unreachable in practice. If one ever slips through, - // delete nothing — Status reports Running so the controller's execution - // timeout fails the task rather than completing without a restart. + // Defense-in-depth: CEL and synthesis guarantee a non-empty UID, so this is a + // backstop. An empty UID short-circuits to a Running Status, leaving the + // execution timeout to fail the task — a restart that deleted nothing stays + // out of the Complete state. if e.params.RestartedPodUID == "" { return nil } @@ -87,21 +88,20 @@ func (e *restartPodExecution) Execute(ctx context.Context) error { if err != nil { return err } - // Only delete the supplied pod. A different or absent UID means either the - // OnDelete replacement already exists (idempotent across reconciles) or the - // caller-supplied UID was already stale — either way nothing to delete. + // Delete only the supplied pod. A matching live pod is the one to restart; a + // different UID is already the OnDelete replacement (idempotent across + // reconciles) or a stale caller-supplied UID — both leave the cluster as-is. if pod == nil || pod.UID != e.params.RestartedPodUID || pod.DeletionTimestamp != nil { return nil } return e.deletePod(ctx, pod) } -// Status completes when an owned pod that is NOT the restarted pod is Ready. -// An empty RestartedPodUID never completes (it reports Running until the -// controller's execution timeout fails the task): completing on the first owned -// Ready pod would let a restart that deleted nothing masquerade as success. The -// original pod (matching UID), a terminating pod, or a missing pod also reports -// Running so the executor re-polls. +// Status completes when the replacement pod — a Ready owned pod whose UID +// differs from RestartedPodUID — exists. An empty RestartedPodUID holds the task +// at Running until the execution timeout fails it, so a restart that deleted +// nothing stays out of the Complete state. The original pod (matching UID), a +// terminating pod, or a missing pod also hold at Running so the executor re-polls. func (e *restartPodExecution) Status(ctx context.Context) ExecutionStatus { if s, done := e.isTerminal(); done { return s diff --git a/internal/task/seinodetask_params.go b/internal/task/seinodetask_params.go index 36c4056..b7fcea8 100644 --- a/internal/task/seinodetask_params.go +++ b/internal/task/seinodetask_params.go @@ -29,8 +29,8 @@ const ( // ReasonedError is an error that carries the stable SeiNodeTask condition // reason it should surface as. Every error SeiNodeTaskParamsFor returns -// implements it, so the synthesis/driveTask call sites map err→reason via -// FailureReason without type-switching on individual error values. +// implements it, so the synthesis/driveTask call sites read the reason off the +// error via FailureReason. type ReasonedError interface { error Reason() string @@ -38,8 +38,8 @@ type ReasonedError interface { // FailureReason returns the stable condition reason for a param-build error. // It unwraps to a ReasonedError when present and otherwise defaults to -// ParamsBuildFailed — the catch-all for any failure that did not carry its own -// reason (e.g. a marshal error wrapped by the caller). +// ParamsBuildFailed for any other error (e.g. a marshal error wrapped by the +// caller). func FailureReason(err error) string { var re ReasonedError if errors.As(err, &re) { @@ -49,9 +49,8 @@ func FailureReason(err error) string { } // ErrUnsupportedKind reports an unwired SeiNodeTask kind: one the CRD enum -// admits but this build does not dispatch. It carries the offending kind and -// reports reason=UnsupportedKind (a public enum, CLAUDE.md); every other -// param-build failure reports ParamsBuildFailed. +// admits but this build leaves unwired. It carries the offending kind and +// reports reason=UnsupportedKind (a public enum, CLAUDE.md). type ErrUnsupportedKind struct { Kind seiv1alpha1.SeiNodeTaskKind } @@ -95,6 +94,8 @@ func SeiNodeTaskParamsFor(cr *seiv1alpha1.SeiNodeTask, target *seiv1alpha1.SeiNo return discoverPeersParams(cr, target) case seiv1alpha1.SeiNodeTaskKindRestartPod: return restartPodParams(cr) + case seiv1alpha1.SeiNodeTaskKindMarkReady: + return markReadyParams(cr) default: return SeiNodeTaskParams{}, &ErrUnsupportedKind{Kind: cr.Spec.Kind} } @@ -254,6 +255,16 @@ func restartPodParams(cr *seiv1alpha1.SeiNodeTask) (SeiNodeTaskParams, error) { }}, nil } +// markReadyParams builds the empty sidecar mark-ready payload. CEL requires +// spec.markReady for kind=MarkReady; this guard covers the early-validation path +// (taskParamsForKind runs before the spec is admission-checked in tests). +func markReadyParams(cr *seiv1alpha1.SeiNodeTask) (SeiNodeTaskParams, error) { + if cr.Spec.MarkReady == nil { + return SeiNodeTaskParams{}, paramsErrorf("spec.markReady is required for kind=MarkReady") + } + return SeiNodeTaskParams{sidecar.TaskTypeMarkReady, sidecar.MarkReadyTask{}}, nil +} + // resolveSigningUID returns explicit when set; otherwise derives from target // via ResolveOperatorKeyringUID. With nil target (early-validation path), the // final marshal happens later from driveTask, which has target. diff --git a/internal/task/seinodetask_params_test.go b/internal/task/seinodetask_params_test.go index 8b59336..a193a36 100644 --- a/internal/task/seinodetask_params_test.go +++ b/internal/task/seinodetask_params_test.go @@ -4,6 +4,8 @@ import ( "errors" "testing" + sidecar "github.com/sei-protocol/seictl/sidecar/client" + seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" ) @@ -46,6 +48,48 @@ func TestSeiNodeTaskParamsFor_WiredKindMissingPayload_NotUnsupported(t *testing. } } +// kind=MarkReady maps to the sidecar mark-ready task with an empty payload — +// no target needed, no source building. +func TestSeiNodeTaskParamsFor_MarkReady(t *testing.T) { + cr := &seiv1alpha1.SeiNodeTask{ + Spec: seiv1alpha1.SeiNodeTaskSpec{ + Kind: seiv1alpha1.SeiNodeTaskKindMarkReady, + MarkReady: &seiv1alpha1.MarkReadyPayload{}, + }, + } + + p, err := SeiNodeTaskParamsFor(cr, nil) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if p.Type != sidecar.TaskTypeMarkReady { + t.Errorf("Type = %q, want %q", p.Type, sidecar.TaskTypeMarkReady) + } + if _, ok := p.Payload.(sidecar.MarkReadyTask); !ok { + t.Errorf("Payload = %T, want sidecar.MarkReadyTask", p.Payload) + } +} + +// kind=MarkReady with a nil payload is a param-build failure (ParamsBuildFailed), +// not an unsupported kind. CEL normally blocks this; the guard is the backstop. +func TestSeiNodeTaskParamsFor_MarkReady_NilPayload_ParamsBuildFailed(t *testing.T) { + cr := &seiv1alpha1.SeiNodeTask{ + Spec: seiv1alpha1.SeiNodeTaskSpec{Kind: seiv1alpha1.SeiNodeTaskKindMarkReady}, + } + + _, err := SeiNodeTaskParamsFor(cr, nil) + if err == nil { + t.Fatal("expected error for missing payload, got nil") + } + var unsupported *ErrUnsupportedKind + if errors.As(err, &unsupported) { + t.Error("missing-payload error must not be *ErrUnsupportedKind") + } + if got := FailureReason(err); got != ReasonParamsBuildFailed { + t.Errorf("FailureReason = %q, want %q", got, ReasonParamsBuildFailed) + } +} + // FailureReason carries the reason intrinsically: a wired-kind param failure // reports ParamsBuildFailed, an unwired kind reports UnsupportedKind. Both are // the public condition enum (CLAUDE.md); the controller maps err→reason with no diff --git a/manifests/sei.io_seinodetasks.yaml b/manifests/sei.io_seinodetasks.yaml index 20ebf8b..75b2822 100644 --- a/manifests/sei.io_seinodetasks.yaml +++ b/manifests/sei.io_seinodetasks.yaml @@ -250,7 +250,11 @@ spec: - AwaitNodesAtHeight - DiscoverPeers - RestartPod + - MarkReady type: string + markReady: + description: MarkReady is the payload for kind=MarkReady. + type: object restartPod: description: RestartPod is the payload for kind=RestartPod. properties: @@ -264,10 +268,10 @@ spec: a different UID appears. Content-addressed (UID, not creationTimestamp) so the OnDelete replacement is unambiguously distinguished from the original. - The caller owns UID correctness: a non-empty UID that no longer matches the - live pod (e.g. the pod was recreated out-of-band after it was read) deletes - nothing and completes immediately as a no-op. Fetch the UID as late as - possible — the controller does not re-validate it against the live pod. + The caller owns UID correctness: the controller uses this UID verbatim, so a + UID that no longer matches the live pod (e.g. the pod was recreated + out-of-band after it was read) completes immediately as a no-op. Fetch the + UID as late as possible. minLength: 1 type: string required: @@ -343,12 +347,13 @@ spec: type: object x-kubernetes-validations: - message: exactly one of govSoftwareUpgrade, govVote, awaitCondition, - updateNodeImage, awaitNodesAtHeight, discoverPeers, or restartPod - must be set + updateNodeImage, awaitNodesAtHeight, discoverPeers, restartPod, or + markReady must be set rule: '(has(self.govSoftwareUpgrade) ? 1 : 0) + (has(self.govVote) ? 1 : 0) + (has(self.awaitCondition) ? 1 : 0) + (has(self.updateNodeImage) ? 1 : 0) + (has(self.awaitNodesAtHeight) ? 1 : 0) + (has(self.discoverPeers) - ? 1 : 0) + (has(self.restartPod) ? 1 : 0) == 1' + ? 1 : 0) + (has(self.restartPod) ? 1 : 0) + (has(self.markReady) ? + 1 : 0) == 1' - message: spec.govSoftwareUpgrade is required when kind=GovSoftwareUpgrade rule: self.kind != 'GovSoftwareUpgrade' || has(self.govSoftwareUpgrade) - message: spec.govVote is required when kind=GovVote @@ -366,6 +371,8 @@ spec: - message: spec.restartPod.podUID is required when kind=RestartPod rule: self.kind != 'RestartPod' || (has(self.restartPod) && size(self.restartPod.podUID) > 0) + - message: spec.markReady is required when kind=MarkReady + rule: self.kind != 'MarkReady' || has(self.markReady) - message: spec.kind is immutable rule: self.kind == oldSelf.kind status: