diff --git a/api/v1alpha1/seinode_types.go b/api/v1alpha1/seinode_types.go index 85001d0..0679a9c 100644 --- a/api/v1alpha1/seinode_types.go +++ b/api/v1alpha1/seinode_types.go @@ -360,6 +360,15 @@ type SeiNodeStatus struct { // +optional ResolvedPeers []string `json:"resolvedPeers,omitempty"` + // ResolvedRPCWitnesses carries the in-cluster RPC endpoints + // (`<name>-0.<name>.<namespace>.svc.cluster.local:26657`) of the label-resolved + // peers, used as CometBFT state-sync light-client witnesses. Unlike + // ResolvedPeers these never carry an external P2P address — RPC is + // internal-only. When empty the sidecar derives witnesses from + // persistent_peers instead. + // +optional + ResolvedRPCWitnesses []string `json:"resolvedRPCWitnesses,omitempty"` + // StatefulSet references the StatefulSet the controller created for // this SeiNode. UID is the identity check: an STS with the expected // name but a different UID is not the one this controller created diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index f9777d7..e680cb9 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -1083,6 +1083,11 @@ func (in *SeiNodeStatus) DeepCopyInto(out *SeiNodeStatus) { *out = make([]string, len(*in)) copy(*out, *in) } + if in.ResolvedRPCWitnesses != nil { + in, out := &in.ResolvedRPCWitnesses, &out.ResolvedRPCWitnesses + *out = make([]string, len(*in)) + copy(*out, *in) + } if in.StatefulSet != nil { in, out := &in.StatefulSet, &out.StatefulSet *out = new(StatefulSetRef) diff --git a/config/crd/sei.io_seinodes.yaml b/config/crd/sei.io_seinodes.yaml index 159a0f3..a56e9c5 100644 --- a/config/crd/sei.io_seinodes.yaml +++ b/config/crd/sei.io_seinodes.yaml @@ -970,6 +970,17 @@ spec: items: type: string type: array + resolvedRPCWitnesses: + description: |- + ResolvedRPCWitnesses carries the in-cluster RPC endpoints + (`<name>-0.<name>.<namespace>.svc.cluster.local:26657`) of the label-resolved + peers, used as CometBFT state-sync 
light-client witnesses. Unlike + ResolvedPeers these never carry an external P2P address — RPC is + internal-only. When empty the sidecar derives witnesses from + persistent_peers instead. + items: + type: string + type: array statefulSet: description: |- StatefulSet references the StatefulSet the controller created for diff --git a/go.mod b/go.mod index 137f60b..a3cfe8c 100644 --- a/go.mod +++ b/go.mod @@ -13,7 +13,7 @@ require ( github.com/google/uuid v1.6.0 github.com/onsi/gomega v1.39.1 github.com/sei-protocol/sei-config v0.0.19 - github.com/sei-protocol/seictl v0.0.50 + github.com/sei-protocol/seictl v0.0.55 github.com/urfave/cli/v3 v3.6.1 go.opentelemetry.io/otel v1.43.0 go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.43.0 @@ -36,6 +36,7 @@ require ( filippo.io/edwards25519 v1.1.0 // indirect github.com/99designs/go-keychain v0.0.0-20191008050251-8e49817e8af4 // indirect github.com/99designs/keyring v1.2.1 // indirect + github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 // indirect github.com/BurntSushi/toml v1.5.0 // indirect github.com/DataDog/zstd v1.5.7 // indirect github.com/ProjectZKM/Ziren/crates/go-runtime/zkvm_runtime v0.0.0-20251001021608-1fe7b43fc4d6 // indirect @@ -155,6 +156,7 @@ require ( github.com/mattn/go-isatty v0.0.20 // indirect github.com/minio/minlz v1.0.1-0.20250507153514-87eb42fe8882 // indirect github.com/mitchellh/mapstructure v1.5.0 // indirect + github.com/moby/term v0.5.0 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/mtibben/percent v0.2.1 // indirect @@ -240,6 +242,7 @@ require ( gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect k8s.io/apiserver v0.35.0 // indirect + k8s.io/cli-runtime v0.36.0 // indirect k8s.io/component-base v0.35.0 // indirect k8s.io/klog/v2 v2.140.0 // indirect k8s.io/kube-openapi v0.0.0-20260317180543-43fb72c5454a // 
indirect diff --git a/go.sum b/go.sum index 1753565..c83af5b 100644 --- a/go.sum +++ b/go.sum @@ -1446,6 +1446,7 @@ github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw= github.com/lib/pq v1.10.9/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= github.com/libp2p/go-buffer-pool v0.1.0 h1:oK4mSFcQz7cTQIfqbe4MIj9gLW+mnanjyFtc6cdF0Y8= github.com/libp2p/go-buffer-pool v0.1.0/go.mod h1:N+vh8gMqimBzdKkSMVuydVDq+UV5QTWy5HSiZacSbPg= +github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de h1:9TO3cAIGXtEhnIaL+V+BEER86oLrvS+kWobKpbJuye0= github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de/go.mod h1:zAbeS9B/r2mtpb6U+EI2rYA5OAXxsYw6wTamcNW+zcE= github.com/lightstep/lightstep-tracer-common/golang/gogo v0.0.0-20190605223551-bc2310a04743/go.mod h1:qklhhLq1aX+mtWk9cPHPzaBjWImj5ULL6C7HFJtXQMM= github.com/lightstep/lightstep-tracer-go v0.18.1/go.mod h1:jlF1pusYV4pidLvZ+XD0UBX0ZE6WURAspgAczcDHrL4= @@ -1514,6 +1515,8 @@ github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyua github.com/mitchellh/mapstructure v1.5.0/go.mod h1:bFUtVrKA4DC2yAKiSyO/QUcy7e+RRV2QTWOzhPopBRo= github.com/mitchellh/pointerstructure v1.2.0 h1:O+i9nHnXS3l/9Wu7r4NrEdwA2VFTicjUEN1uBnDo34A= github.com/mitchellh/pointerstructure v1.2.0/go.mod h1:BRAsLI5zgXmw97Lf6s25bs8ohIXc3tViBH44KcwB2g4= +github.com/moby/term v0.5.0 h1:xt8Q1nalod/v7BqbG21f8mQPqH+xAaC9C3N3wfWbVP0= +github.com/moby/term v0.5.0/go.mod h1:8FzsFHVUBGZdbDsJw/ot+X+d5HLUbvklYLJ9uGfcI3Y= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= @@ -1781,6 +1784,8 @@ github.com/sei-protocol/sei-tm-db v0.0.5 h1:3WONKdSXEqdZZeLuWYfK5hP37TJpfaUa13vA github.com/sei-protocol/sei-tm-db v0.0.5/go.mod 
h1:Cpa6rGyczgthq7/0pI31jys2Fw0Nfrc+/jKdP1prVqY= github.com/sei-protocol/seictl v0.0.50 h1:zBOLIPI/G0oPsLV0DLlGnjCgckkyihOZ03llkFBytsk= github.com/sei-protocol/seictl v0.0.50/go.mod h1:yNPLcFKRTbKvsdKFuQseMHkkXTol7FXidnGKJa/bUXQ= +github.com/sei-protocol/seictl v0.0.55 h1:JZ15hoAS7ft3LL85SeYtkP3Gr/oMlEQnBjhefbDdiZ4= +github.com/sei-protocol/seictl v0.0.55/go.mod h1:sDWY/llzQPnblG/WS6uQ7vqDtshNQ0WJTJzRUgmfFpg= github.com/sei-protocol/seilog v0.0.3 h1:Zi7oWXdX5jv92dY8n482xH032LtNebC89Y+qYZlBn0Y= github.com/sei-protocol/seilog v0.0.3/go.mod h1:CKg58wraWnB3gRxWQ0v1rIVr0gmDHjkfP1bM2giKFFU= github.com/shirou/gopsutil v2.20.5+incompatible/go.mod h1:5b4v6he4MtMOwMlS0TUMTu2PcXUg8+E1lC7eC3UO/RA= @@ -3467,6 +3472,8 @@ k8s.io/apimachinery v0.35.0 h1:Z2L3IHvPVv/MJ7xRxHEtk6GoJElaAqDCCU0S6ncYok8= k8s.io/apimachinery v0.35.0/go.mod h1:jQCgFZFR1F4Ik7hvr2g84RTJSZegBc8yHgFWKn//hns= k8s.io/apiserver v0.35.0 h1:CUGo5o+7hW9GcAEF3x3usT3fX4f9r8xmgQeCBDaOgX4= k8s.io/apiserver v0.35.0/go.mod h1:QUy1U4+PrzbJaM3XGu2tQ7U9A4udRRo5cyxkFX0GEds= +k8s.io/cli-runtime v0.36.0 h1:HNxciQpQMMOKS0/GiUXcKDyA6J2FDILJj9NmP2BZrTg= +k8s.io/cli-runtime v0.36.0/go.mod h1:KObkknK9Ro5LYX+1RdiKc7C8CvGg4aX+V/Zv+E8WPHA= k8s.io/client-go v0.35.0 h1:IAW0ifFbfQQwQmga0UdoH0yvdqrbwMdq9vIFEhRpxBE= k8s.io/client-go v0.35.0/go.mod h1:q2E5AAyqcbeLGPdoRB+Nxe3KYTfPce1Dnu1myQdqz9o= k8s.io/component-base v0.35.0 h1:+yBrOhzri2S1BVqyVSvcM3PtPyx5GUxCK2tinZz1G94= diff --git a/internal/controller/node/peers.go b/internal/controller/node/peers.go index 6839a6d..e44de4f 100644 --- a/internal/controller/node/peers.go +++ b/internal/controller/node/peers.go @@ -21,35 +21,45 @@ var errNoSidecarFactory = errors.New("sidecar client factory is nil") func (r *SeiNodeReconciler) reconcilePeers(ctx context.Context, node *seiv1alpha1.SeiNode) error { var resolved []string + var witnesses []string for _, src := range node.Spec.Peers { if src.Label == nil { continue } - endpoints, err := r.resolveLabelPeers(ctx, node, src.Label) + endpoints, 
rpcWitnesses, err := r.resolveLabelPeers(ctx, node, src.Label) if err != nil { return err } resolved = append(resolved, endpoints...) + witnesses = append(witnesses, rpcWitnesses...) } slices.Sort(resolved) resolved = slices.Compact(resolved) + slices.Sort(witnesses) + witnesses = slices.Compact(witnesses) if !slices.Equal(node.Status.ResolvedPeers, resolved) { node.Status.ResolvedPeers = resolved } + if !slices.Equal(node.Status.ResolvedRPCWitnesses, witnesses) { + node.Status.ResolvedRPCWitnesses = witnesses + } return nil } -// resolveLabelPeers returns fully-composed `<node_id>@<host>:<port>` -// strings for SeiNodes matching the selector. Per-peer sidecar failures -// preserve the prior entry from Status.ResolvedPeers (so transients -// don't wedge fleet-wide reconciles) or skip with a log line. +// resolveLabelPeers returns fully-composed `<node_id>@<host>:<port>` peer +// strings AND the in-cluster RPC witness endpoints for SeiNodes matching the +// selector. Per-peer sidecar failures preserve the prior peer entry from +// Status.ResolvedPeers (so transients don't wedge fleet-wide reconciles) or +// skip with a log line. Witnesses are deterministic from peer identity (no +// node_id needed), so every matched peer yields one regardless of sidecar +// reachability. 
func (r *SeiNodeReconciler) resolveLabelPeers( ctx context.Context, node *seiv1alpha1.SeiNode, src *seiv1alpha1.LabelPeerSource, -) ([]string, error) { +) ([]string, []string, error) { logger := log.FromContext(ctx) ns := node.Namespace if src.Namespace != "" { @@ -61,17 +71,20 @@ func (r *SeiNodeReconciler) resolveLabelPeers( client.InNamespace(ns), client.MatchingLabels(src.Selector), ); err != nil { - return nil, fmt.Errorf("listing peers by label: %w", err) + return nil, nil, fmt.Errorf("listing peers by label: %w", err) } prior := indexResolvedPeersByHost(node.Status.ResolvedPeers) var endpoints []string + var witnesses []string for i := range nodeList.Items { peer := &nodeList.Items[i] if peer.Name == node.Name && peer.Namespace == node.Namespace { continue } + witnesses = append(witnesses, peerRPCAddress(peer)) + address := peerAddress(peer) var sc task.SidecarClient err := errNoSidecarFactory @@ -93,7 +106,7 @@ func (r *SeiNodeReconciler) resolveLabelPeers( } logger.Info("skipping peer until node_id is resolvable", "peer", peer.Name, "err", err) } - return endpoints, nil + return endpoints, witnesses, nil } // indexResolvedPeersByHost maps `host:port` → `<node_id>@host:port` for @@ -119,3 +132,12 @@ func peerAddress(peer *seiv1alpha1.SeiNode) string { return fmt.Sprintf("%s-0.%s.%s.svc.cluster.local:%d", peer.Name, peer.Name, peer.Namespace, seiconfig.PortP2P) } + +// peerRPCAddress returns the in-cluster headless Service DNS for a peer's RPC +// port. Unlike peerAddress it never consults Spec.ExternalAddress: the external +// NLB exposes P2P only, so a state-sync light-client witness must target the +// cluster-internal RPC endpoint or seid exits on "no witnesses connected". 
+func peerRPCAddress(peer *seiv1alpha1.SeiNode) string { + return fmt.Sprintf("%s-0.%s.%s.svc.cluster.local:%d", + peer.Name, peer.Name, peer.Namespace, seiconfig.PortRPC) +} diff --git a/internal/controller/node/peers_test.go b/internal/controller/node/peers_test.go index 4657c0f..3cbe6b8 100644 --- a/internal/controller/node/peers_test.go +++ b/internal/controller/node/peers_test.go @@ -14,6 +14,8 @@ const ( testRoleValue = "validator" testConsumerName = "consumer" testPeer1ResolvedID = "mock-node-id@peer-1-0.peer-1.default.svc.cluster.local:26656" + testWitnessNS = "arctic-1" + testWitnessRole = "syncer" ) type errStub string @@ -108,6 +110,56 @@ func TestReconcilePeers_PrefersExternalAddress(t *testing.T) { if node.Status.ResolvedPeers[0] != want { t.Errorf("resolvedPeers[0] = %q, want %q", node.Status.ResolvedPeers[0], want) } + + // The witness must be the internal RPC DNS, NOT the external P2P address: + // the NLB exposes P2P only. Writing the external address as a witness is + // the regression this fix prevents. 
+ wantWitness := "pub-peer-0.pub-peer.default.svc.cluster.local:26657" + if len(node.Status.ResolvedRPCWitnesses) != 1 || node.Status.ResolvedRPCWitnesses[0] != wantWitness { + t.Errorf("resolvedRPCWitnesses = %v, want [%q]", node.Status.ResolvedRPCWitnesses, wantWitness) + } +} + +func TestReconcilePeers_WitnessesExcludeSelfAndUseRPCPort(t *testing.T) { + const peerName = "syncer-0-1" + node := &seiv1alpha1.SeiNode{ + ObjectMeta: metav1.ObjectMeta{ + Name: "syncer-0-0", Namespace: testWitnessNS, + Labels: map[string]string{testRoleLabel: testWitnessRole}, + }, + Spec: seiv1alpha1.SeiNodeSpec{ + ChainID: testWitnessNS, + Image: "sei:latest", + Peers: []seiv1alpha1.PeerSource{ + {Label: &seiv1alpha1.LabelPeerSource{ + Selector: map[string]string{testRoleLabel: testWitnessRole}, + }}, + }, + FullNode: &seiv1alpha1.FullNodeSpec{}, + }, + } + peer := &seiv1alpha1.SeiNode{ + ObjectMeta: metav1.ObjectMeta{ + Name: peerName, Namespace: testWitnessNS, + Labels: map[string]string{testRoleLabel: testWitnessRole}, + }, + Spec: seiv1alpha1.SeiNodeSpec{ + ChainID: testWitnessNS, + Image: "sei:latest", + FullNode: &seiv1alpha1.FullNodeSpec{}, + }, + } + + r, _ := newNodeReconciler(t, node, peer) + if err := r.reconcilePeers(context.Background(), node); err != nil { + t.Fatalf("reconcilePeers: %v", err) + } + + want := peerName + "-0." + peerName + "." 
+ testWitnessNS + ".svc.cluster.local:26657" + if len(node.Status.ResolvedRPCWitnesses) != 1 || node.Status.ResolvedRPCWitnesses[0] != want { + t.Errorf("resolvedRPCWitnesses = %v, want [%q] (self excluded, RPC port)", + node.Status.ResolvedRPCWitnesses, want) + } } func TestReconcilePeers_ExcludesSelf(t *testing.T) { @@ -350,6 +402,14 @@ func TestReconcilePeers_NilSidecarFactorySkipsNewPeer(t *testing.T) { if len(node.Status.ResolvedPeers) != 0 { t.Fatalf("expected unresolvable peer to be skipped, got %d: %v", len(node.Status.ResolvedPeers), node.Status.ResolvedPeers) } + // Intentional asymmetry: the witness needs no node_id, so it is emitted + // even though the peer was skipped from persistent_peers. seid can dial a + // state-sync RPC witness it has no P2P peering with; do not "symmetrize" + // this with ResolvedPeers. + wantWitness := "peer-1-0.peer-1.default.svc.cluster.local:26657" + if len(node.Status.ResolvedRPCWitnesses) != 1 || node.Status.ResolvedRPCWitnesses[0] != wantWitness { + t.Errorf("expected witness despite skipped peer, got %v", node.Status.ResolvedRPCWitnesses) + } } // Nil factory + prior entry: preserve-prior branch fires. 
diff --git a/internal/planner/planner.go b/internal/planner/planner.go index 73a9f0f..ad01c34 100644 --- a/internal/planner/planner.go +++ b/internal/planner/planner.go @@ -609,7 +609,7 @@ func paramsForTaskType( case TaskDiscoverPeers: return discoverPeersTask(node) case TaskConfigureStateSync: - return configureStateSyncTask(snap) + return configureStateSyncTask(node) case TaskConfigValidate: return sidecar.ConfigValidateTask{} case TaskMarkReady: @@ -686,9 +686,11 @@ func discoverPeersTask(node *seiv1alpha1.SeiNode) sidecar.DiscoverPeersTask { return sidecar.DiscoverPeersTask{Sources: sources} } -func configureStateSyncTask(snap *seiv1alpha1.SnapshotSource) sidecar.ConfigureStateSyncTask { +func configureStateSyncTask(node *seiv1alpha1.SeiNode) sidecar.ConfigureStateSyncTask { + snap := node.Spec.SnapshotSource() t := sidecar.ConfigureStateSyncTask{ UseLocalSnapshot: hasS3Snapshot(snap), + RpcServers: node.Status.ResolvedRPCWitnesses, } if snap != nil { if snap.TrustPeriod != "" { diff --git a/internal/planner/statesync_witness_test.go b/internal/planner/statesync_witness_test.go new file mode 100644 index 0000000..79234dd --- /dev/null +++ b/internal/planner/statesync_witness_test.go @@ -0,0 +1,45 @@ +package planner + +import ( + "slices" + "testing" + + seiv1alpha1 "github.com/sei-protocol/sei-k8s-controller/api/v1alpha1" +) + +func TestConfigureStateSyncTask_PassesResolvedWitnesses(t *testing.T) { + witnesses := []string{ + "syncer-0-0-0.syncer-0-0.arctic-1.svc.cluster.local:26657", + "syncer-0-1-0.syncer-0-1.arctic-1.svc.cluster.local:26657", + } + node := &seiv1alpha1.SeiNode{ + Spec: seiv1alpha1.SeiNodeSpec{ + FullNode: &seiv1alpha1.FullNodeSpec{ + Snapshot: &seiv1alpha1.SnapshotSource{TrustPeriod: "168h0m0s", BackfillBlocks: 6000}, + }, + }, + Status: seiv1alpha1.SeiNodeStatus{ResolvedRPCWitnesses: witnesses}, + } + + task := configureStateSyncTask(node) + + if !slices.Equal(task.RpcServers, witnesses) { + t.Errorf("RpcServers = %v, want %v", 
task.RpcServers, witnesses) + } + if task.TrustPeriod != "168h0m0s" { + t.Errorf("TrustPeriod = %q, want 168h0m0s", task.TrustPeriod) + } + if task.BackfillBlocks != 6000 { + t.Errorf("BackfillBlocks = %d, want 6000", task.BackfillBlocks) + } +} + +// No resolved witnesses (e.g. EC2/static peers) leaves RpcServers empty so the +// sidecar falls back to deriving witnesses from persistent_peers. +func TestConfigureStateSyncTask_NoWitnessesLeavesEmpty(t *testing.T) { + node := &seiv1alpha1.SeiNode{} + task := configureStateSyncTask(node) + if len(task.RpcServers) != 0 { + t.Errorf("RpcServers = %v, want empty", task.RpcServers) + } +} diff --git a/manifests/sei.io_seinodes.yaml b/manifests/sei.io_seinodes.yaml index 159a0f3..a56e9c5 100644 --- a/manifests/sei.io_seinodes.yaml +++ b/manifests/sei.io_seinodes.yaml @@ -970,6 +970,17 @@ spec: items: type: string type: array + resolvedRPCWitnesses: + description: |- + ResolvedRPCWitnesses carries the in-cluster RPC endpoints + (`<name>-0.<name>.<namespace>.svc.cluster.local:26657`) of the label-resolved + peers, used as CometBFT state-sync light-client witnesses. Unlike + ResolvedPeers these never carry an external P2P address — RPC is + internal-only. When empty the sidecar derives witnesses from + persistent_peers instead. + items: + type: string + type: array statefulSet: description: |- StatefulSet references the StatefulSet the controller created for