From 0a5b6778e57dc698640b67b4c9c7c650f31b8446 Mon Sep 17 00:00:00 2001 From: Artur Khantimirov Date: Tue, 19 May 2026 14:05:00 +1200 Subject: [PATCH 1/2] fix(windows): register k8s-restart-job in NodePrep to avoid PIS bootstrap race MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When Windows VHDs are baked or staged via Prepared Image Specification (PIS), CSE runs in two phases: BasePrep at bake time and NodePrep at node provisioning time. Register-NodeResetScriptTask was registered in BasePrep with an -AtStartup trigger, so on the first boot from a baked VHD it fired before NodePrep got a chance to remove the temporary kubeconfig at `c:\k\config`. That kubeconfig embeds the cluster-shared "nodeclient" client certificate. As a result, kubelet started up, read the embedded certificate, persisted it under `c:\k\pki`, and skipped the standard token-based TLS bootstrap. With `CN=nodeclient` (not `CN=system:node:<nodeName>`) the API server's NodeAuthorizer rejected every kubelet request — "User \"nodeclient\" cannot create certificatesigningrequests" — and the node never registered. The agentpool ARM operation succeeded, so the failure was silent. Repro / verification (NZ Sandbox sub, Windows2022, K8s 1.34.7): - Pool WITH PIS: cert subject CN=nodeclient,O=system:nodes → never joined - Pool WITHOUT PIS: cert subject CN=system:node:<nodeName> → Ready in ~30s Move Register-NodeResetScriptTask out of BasePrep and into NodePrep, after the temporary kubeconfig removal, so the scheduled task does not exist on a baked VHD and only fires on subsequent boots after the node has been properly provisioned. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- parts/windows/kuberneteswindowssetup.ps1 | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/parts/windows/kuberneteswindowssetup.ps1 b/parts/windows/kuberneteswindowssetup.ps1 index ba392f1f7a2..86590db76b7 100644 --- a/parts/windows/kuberneteswindowssetup.ps1 +++ b/parts/windows/kuberneteswindowssetup.ps1 @@ -485,7 +485,6 @@ function BasePrep { PREPROVISION_EXTENSION Adjust-DynamicPortRange Register-LogsCleanupScriptTask - Register-NodeResetScriptTask Update-DefenderPreferences @@ -579,6 +578,11 @@ function NodePrep { Remove-Item $kubeConfigFile } + # Register AFTER temp kubeconfig removal: the -AtStartup trigger would + # otherwise race PIS-baked VHD first boot and bring kubelet up with the + # embedded "nodeclient" cert instead of doing TLS bootstrap. + Register-NodeResetScriptTask + Start-InstallGPUDriver -EnableInstall $global:ConfigGPUDriverIfNeeded -GpuDriverURL $global:GpuDriverURL if (Test-Path $CacheDir) From 0dd409036bebf133bd07c8ffa80d162924c07b71 Mon Sep 17 00:00:00 2001 From: Artur Khantimirov Date: Tue, 19 May 2026 15:11:10 +1200 Subject: [PATCH 2/2] test(e2e): add Windows VHD caching scenario with legacy TLS bootstrap Existing Test_Windows2022_VHDCaching runs with secure TLS bootstrap enabled, which masks bugs in the legacy bootstrap-token path on Windows PIS / two-stage CSE provisioning. This scenario forces kubelet onto the legacy path so stage 2's WaitUntilNodeReady catches regressions there. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- e2e/scenario_win_test.go | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/e2e/scenario_win_test.go b/e2e/scenario_win_test.go index 2ece49b1c7b..0bc8e178bad 100644 --- a/e2e/scenario_win_test.go +++ b/e2e/scenario_win_test.go @@ -440,6 +440,31 @@ func Test_Windows2022_VHDCaching(t *testing.T) { }) } +// Test_Windows2022_VHDCaching_LegacyTLSBootstrap exercises Windows PIS / +// VHD-cached provisioning with secure TLS bootstrap disabled, forcing kubelet +// to use the legacy bootstrap-token path. Catches regressions in the two-stage +// CSE flow that only surface when no secure-tls-bootstrap client is around to +// overwrite the temporary kubeconfig. +func Test_Windows2022_VHDCaching_LegacyTLSBootstrap(t *testing.T) { + RunScenario(t, &Scenario{ + Description: "VHD Caching with secure TLS bootstrap disabled", + Config: Config{ + Cluster: ClusterAzureNetwork, + VHD: config.VHDWindows2022Containerd, + VHDCaching: true, + VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { + vmss.SKU.Capacity = to.Ptr[int64](2) + }, + BootstrapConfigMutator: func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) { + if nbc.SecureTLSBootstrappingConfig == nil { + nbc.SecureTLSBootstrappingConfig = &datamodel.SecureTLSBootstrappingConfig{} + } + nbc.SecureTLSBootstrappingConfig.Enabled = false + }, + }, + }) +} + func Test_Windows2022Gen2_k8s_133(t *testing.T) { RunScenario(t, &Scenario{ Description: "Windows Server 2022 with Containerd 2- hyperv gen 2",