From 042c0f4c7acf1ff529cf19c77ab38bbaac5ba1d4 Mon Sep 17 00:00:00 2001 From: Lily Pan Date: Tue, 28 Apr 2026 10:54:13 -0700 Subject: [PATCH 01/24] enable aksnodeconfig in all e2es --- e2e/scenario_test.go | 78 ++++++++++++++++++++++++++++++++++++++++++++ e2e/test_helpers.go | 17 ++++++++++ 2 files changed, 95 insertions(+) diff --git a/e2e/scenario_test.go b/e2e/scenario_test.go index f8dad51d1e7..c5ff098f88e 100644 --- a/e2e/scenario_test.go +++ b/e2e/scenario_test.go @@ -26,6 +26,9 @@ func Test_AzureLinux3OSGuard(t *testing.T) { BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { nbc.AgentPoolProfile.LocalDNSProfile = nil }, + AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + config.LocalDnsProfile = nil + }, Validator: func(ctx context.Context, s *Scenario) {}, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) @@ -47,6 +50,9 @@ func Test_Flatcar(t *testing.T) { }, } }, + AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + config.CustomCaCerts = []string{encodedTestCert} + }, Validator: func(ctx context.Context, s *Scenario) { ValidateFileHasContent(ctx, s, "/etc/protocols", "protocols definition file") ValidateFileIsRegularFile(ctx, s, "/etc/ssl/certs/ca-certificates.crt") @@ -86,6 +92,9 @@ func Test_Flatcar_ARM64(t *testing.T) { nbc.AgentPoolProfile.VMSize = "Standard_D2pds_V5" nbc.IsARM64 = true }, + AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + config.VmSize = "Standard_D2pds_V5" + }, Validator: func(ctx context.Context, s *Scenario) { }, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { @@ -105,6 +114,9 @@ func Test_AzureLinuxV3_ARM64(t *testing.T) { nbc.AgentPoolProfile.VMSize = "Standard_D2pds_V5" nbc.IsARM64 = true }, + AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + config.VmSize = "Standard_D2pds_V5" + }, Validator: func(ctx context.Context, s *Scenario) { }, 
VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { @@ -124,6 +136,9 @@ func Test_Flatcar_AzureCNI(t *testing.T) { nbc.ContainerService.Properties.OrchestratorProfile.KubernetesConfig.NetworkPlugin = string(armcontainerservice.NetworkPluginAzure) nbc.AgentPoolProfile.KubernetesConfig.NetworkPlugin = string(armcontainerservice.NetworkPluginAzure) }, + AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + config.NetworkConfig.NetworkPlugin = aksnodeconfigv1.NetworkPlugin_NETWORK_PLUGIN_AZURE + }, Validator: func(ctx context.Context, s *Scenario) { ServiceCanRestartValidator(ctx, s, "chronyd", 10) ValidateFileHasContent(ctx, s, "/etc/systemd/system/chronyd.service.d/10-chrony-restarts.conf", "Restart=always") @@ -143,6 +158,9 @@ func Test_Ubuntu2204_AzureCNI(t *testing.T) { nbc.ContainerService.Properties.OrchestratorProfile.KubernetesConfig.NetworkPlugin = string(armcontainerservice.NetworkPluginAzure) nbc.AgentPoolProfile.KubernetesConfig.NetworkPlugin = string(armcontainerservice.NetworkPluginAzure) }, + AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + config.NetworkConfig.NetworkPlugin = aksnodeconfigv1.NetworkPlugin_NETWORK_PLUGIN_AZURE + }, Validator: func(ctx context.Context, s *Scenario) { }, }, @@ -203,6 +221,9 @@ func Test_ACL(t *testing.T) { }, } }, + AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + config.CustomCaCerts = []string{encodedTestCert} + }, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) }, @@ -230,6 +251,9 @@ func Test_ACL_ARM64(t *testing.T) { nbc.AgentPoolProfile.VMSize = "Standard_D2pds_v6" nbc.IsARM64 = true }, + AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + config.VmSize = "Standard_D2pds_v6" + }, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) vmss.SKU.Name = to.Ptr("Standard_D2pds_v6") @@ -277,6 
+301,9 @@ func Test_ACL_AzureCNI(t *testing.T) { nbc.ContainerService.Properties.OrchestratorProfile.KubernetesConfig.NetworkPlugin = string(armcontainerservice.NetworkPluginAzure) nbc.AgentPoolProfile.KubernetesConfig.NetworkPlugin = string(armcontainerservice.NetworkPluginAzure) }, + AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + config.NetworkConfig.NetworkPlugin = aksnodeconfigv1.NetworkPlugin_NETWORK_PLUGIN_AZURE + }, Validator: func(ctx context.Context, s *Scenario) { ServiceCanRestartValidator(ctx, s, "chronyd", 10) ValidateFileHasContent(ctx, s, "/etc/systemd/system/chronyd.service.d/10-chrony-restarts.conf", "Restart=always") @@ -345,6 +372,9 @@ func Test_ACL_DisableSSH(t *testing.T) { BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { nbc.SSHStatus = datamodel.SSHOff }, + AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + config.EnableSsh = to.Ptr(false) + }, SkipSSHConnectivityValidation: true, // Skip SSH connectivity validation since SSH is down SkipDefaultValidation: true, // Skip default validation since it requires SSH connectivity Validator: func(ctx context.Context, s *Scenario) { @@ -461,6 +491,9 @@ func Test_AzureLinuxV3_AzureCNI(t *testing.T) { nbc.ContainerService.Properties.OrchestratorProfile.KubernetesConfig.NetworkPlugin = string(armcontainerservice.NetworkPluginAzure) nbc.AgentPoolProfile.KubernetesConfig.NetworkPlugin = string(armcontainerservice.NetworkPluginAzure) }, + AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + config.NetworkConfig.NetworkPlugin = aksnodeconfigv1.NetworkPlugin_NETWORK_PLUGIN_AZURE + }, }, }) } @@ -479,6 +512,10 @@ func Test_AzureLinuxV3(t *testing.T) { }, } }, + AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + config.MessageOfTheDay = "Zm9vYmFyDQo=" + config.CustomCaCerts = []string{encodedTestCert} + }, Validator: func(ctx context.Context, s *Scenario) { ValidateFileHasContent(ctx, s, "/etc/motd", 
"foobar") ValidateFileHasContent(ctx, s, "/etc/dnf/automatic.conf", "emit_via = stdio") @@ -664,6 +701,10 @@ func Test_Ubuntu2204(t *testing.T) { }, } }, + AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + config.MessageOfTheDay = "Zm9vYmFyDQo=" + config.CustomCaCerts = []string{encodedTestCert} + }, Validator: func(ctx context.Context, s *Scenario) { ValidateInstalledPackageVersion(ctx, s, "moby-containerd", components.GetExpectedPackageVersions("containerd", "ubuntu", "r2204")[0]) ValidateInstalledPackageVersion(ctx, s, "moby-runc", components.GetExpectedPackageVersions("runc", "ubuntu", "r2204")[0]) @@ -685,6 +726,8 @@ func Test_Ubuntu2204FIPS(t *testing.T) { VHD: config.VHDUbuntu2204FIPSContainerd, BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { }, + AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + }, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { vmss.Properties.AdditionalCapabilities = &armcompute.AdditionalCapabilities{ EnableFips1403Encryption: to.Ptr(true), @@ -710,6 +753,8 @@ func Test_Ubuntu2004FIPS(t *testing.T) { VHD: config.VHDUbuntu2004FIPSContainerd, BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { }, + AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + }, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { }, Validator: func(ctx context.Context, s *Scenario) { @@ -732,6 +777,8 @@ func Test_Ubuntu2204Gen2FIPS(t *testing.T) { VHD: config.VHDUbuntu2204Gen2FIPSContainerd, BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { }, + AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + }, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { vmss.Properties.AdditionalCapabilities = &armcompute.AdditionalCapabilities{ EnableFips1403Encryption: to.Ptr(true), @@ -760,6 +807,8 @@ func Test_Ubuntu2204Gen2FIPSTL(t *testing.T) { VHD: 
config.VHDUbuntu2204Gen2FIPSTLContainerd, BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { }, + AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + }, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) vmss.Properties.AdditionalCapabilities = &armcompute.AdditionalCapabilities{ @@ -788,6 +837,9 @@ func Test_Ubuntu2204_EntraIDSSH(t *testing.T) { // Enable Entra ID SSH authentication nbc.SSHStatus = datamodel.EntraIDSSH }, + AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + config.DisablePubkeyAuth = to.Ptr(true) + }, SkipSSHConnectivityValidation: true, // Skip SSH connectivity validation since Entra ID SSH disables private key authentication SkipDefaultValidation: true, // Skip default validation since it requires SSH connectivity Validator: func(ctx context.Context, s *Scenario) { @@ -839,6 +891,9 @@ func Test_AzureLinuxV3_DisableSSH(t *testing.T) { BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { nbc.SSHStatus = datamodel.SSHOff }, + AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + config.EnableSsh = to.Ptr(false) + }, SkipSSHConnectivityValidation: true, // Skip SSH connectivity validation since SSH is down SkipDefaultValidation: true, // Skip default validation since it requires SSH connectivity Validator: func(ctx context.Context, s *Scenario) { @@ -858,6 +913,9 @@ func Test_Ubuntu2204_DisableSSH(t *testing.T) { BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { nbc.SSHStatus = datamodel.SSHOff }, + AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + config.EnableSsh = to.Ptr(false) + }, SkipSSHConnectivityValidation: true, // Skip SSH connectivity validation since SSH is down SkipDefaultValidation: true, // Skip default validation since it requires SSH connectivity Validator: func(ctx context.Context, s *Scenario) { @@ -877,6 
+935,9 @@ func Test_Flatcar_DisableSSH(t *testing.T) { BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { nbc.SSHStatus = datamodel.SSHOff }, + AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + config.EnableSsh = to.Ptr(false) + }, SkipSSHConnectivityValidation: true, // Skip SSH connectivity validation since SSH is down SkipDefaultValidation: true, // Skip default validation since it requires SSH connectivity Validator: func(ctx context.Context, s *Scenario) { @@ -1153,6 +1214,9 @@ func Test_Ubuntu2204ARM64(t *testing.T) { nbc.AgentPoolProfile.VMSize = "Standard_D2pds_V5" nbc.IsARM64 = true }, + AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + config.VmSize = "Standard_D2pds_V5" + }, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { vmss.SKU.Name = to.Ptr("Standard_D2pds_V5") }, @@ -1169,6 +1233,9 @@ func Test_Ubuntu2204_ArtifactStreaming(t *testing.T) { BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { nbc.EnableArtifactStreaming = true }, + AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + config.EnableArtifactStreaming = true + }, Validator: func(ctx context.Context, s *Scenario) { ValidateNonEmptyDirectory(ctx, s, "/etc/overlaybd") ValidateSystemdUnitIsRunning(ctx, s, "overlaybd-snapshotter.service") @@ -1191,6 +1258,10 @@ func Test_Ubuntu2204_ArtifactStreaming_ARM64(t *testing.T) { nbc.AgentPoolProfile.VMSize = "Standard_D2pds_V5" nbc.IsARM64 = true }, + AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + config.EnableArtifactStreaming = true + config.VmSize = "Standard_D2pds_V5" + }, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { vmss.SKU.Name = to.Ptr("Standard_D2pds_V5") }, @@ -1237,6 +1308,9 @@ func Test_AzureLinuxV3_ArtifactStreaming(t *testing.T) { BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { nbc.EnableArtifactStreaming = true }, + AKSNodeConfigMutator: 
func(config *aksnodeconfigv1.Configuration) { + config.EnableArtifactStreaming = true + }, Validator: func(ctx context.Context, s *Scenario) { ValidateNonEmptyDirectory(ctx, s, "/etc/overlaybd") ValidateSystemdUnitIsRunning(ctx, s, "overlaybd-snapshotter.service") @@ -1309,6 +1383,10 @@ func Test_Ubuntu2404_ArtifactStreaming_ARM64(t *testing.T) { nbc.AgentPoolProfile.VMSize = "Standard_D2pds_V5" nbc.IsARM64 = true }, + AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + config.EnableArtifactStreaming = true + config.VmSize = "Standard_D2pds_V5" + }, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { vmss.SKU.Name = to.Ptr("Standard_D2pds_V5") }, diff --git a/e2e/test_helpers.go b/e2e/test_helpers.go index 71312b8b108..7c7dfeec87e 100644 --- a/e2e/test_helpers.go +++ b/e2e/test_helpers.go @@ -101,12 +101,29 @@ func RunScenario(t *testing.T, s *Scenario) { require.NoError(t, err) }) } + + if supportsScriptlessAKSNodeConfig(s) { + t.Run("scriptless_anc", func(t *testing.T) { + t.Parallel() + sCopy := copyScenario(s) + if sCopy.Runtime == nil { + sCopy.Runtime = &ScenarioRuntime{} + } + sCopy.Runtime.EnableScriptlessNBCCSECmd = true + err := runScenario(t, sCopy) + require.NoError(t, err) + }) + } } func supportsScriptlessNBCCSECmd(s *Scenario) bool { return s.AKSNodeConfigMutator == nil && !s.IsWindows() && len(s.Config.CustomDataWriteFiles) <= 0 && !s.VHDCaching && !config.Config.TestPreProvision } +func supportsScriptlessAKSNodeConfig(s *Scenario) bool { + return s.AKSNodeConfigMutator != nil && !s.IsWindows() && len(s.Config.CustomDataWriteFiles) <= 0 && !s.VHDCaching && !config.Config.TestPreProvision +} + func runScenarioWithPreProvision(t *testing.T, original *Scenario) { // This is hard to understand. Some functional magic is used to run the original scenario in two stages. // 1. Stage 1: Run the original scenario with pre-provisioning enabled, but skip the main validation and validate only pre-provisioning. 
From 2c493c8b80640548c7955a2f47d71a23da9f88df Mon Sep 17 00:00:00 2001 From: Lily Pan Date: Mon, 4 May 2026 16:23:47 -0700 Subject: [PATCH 02/24] refactor customdatahack and get e2e passing --- e2e/test_helpers.go | 9 +- e2e/types.go | 7 +- e2e/vmss.go | 306 ++++++++++++++++++++------------------------ 3 files changed, 145 insertions(+), 177 deletions(-) diff --git a/e2e/test_helpers.go b/e2e/test_helpers.go index cfa9be410d7..e3d49219b6d 100644 --- a/e2e/test_helpers.go +++ b/e2e/test_helpers.go @@ -109,7 +109,7 @@ func RunScenario(t *testing.T, s *Scenario) { if sCopy.Runtime == nil { sCopy.Runtime = &ScenarioRuntime{} } - sCopy.Runtime.EnableScriptlessNBCCSECmd = true + sCopy.Runtime.EnableScriptlessANC = true err := runScenario(t, sCopy) require.NoError(t, err) }) @@ -297,9 +297,10 @@ func prepareAKSNode(ctx context.Context, s *Scenario) (*ScenarioVM, error) { s.AKSNodeConfigMutator(nodeconfig) s.Runtime.AKSNodeConfig = nodeconfig // AKSNodeConfig scenarios use aks-node-controller, not GetNodeBootstrapping. - // Clear NBC so validators that check NBC fields (e.g., ValidateScriptlessCSECmd) - // don't fire incorrectly — those validations only apply to NBC-based provisioning. - s.Runtime.NBC = nil + // NBC is kept for comparison mode (compareEnvs) where both configs are needed, + // but disable scriptless flags so validators don't fire incorrectly. 
+ nbc.EnableScriptlessCSECmd = false + nbc.EnableScriptlessNBCCSECmd = false } publicKeyData := datamodel.PublicKey{KeyData: string(config.VMSSHPublicKey)} diff --git a/e2e/types.go b/e2e/types.go index 9643b167470..0d2953430dc 100644 --- a/e2e/types.go +++ b/e2e/types.go @@ -140,6 +140,7 @@ type ScenarioRuntime struct { VM *ScenarioVM VMSSName string EnableScriptlessNBCCSECmd bool + EnableScriptlessANC bool CSETimingReport *CSETimingReport // eagerly extracted before GA can sweep events } @@ -278,11 +279,11 @@ func (s *Scenario) KubeletConfigFileEnabled() bool { if s.Runtime == nil { return false } - if nbc := s.Runtime.NBC; nbc != nil && (nbc.EnableKubeletConfigFile || - (nbc.AgentPoolProfile != nil && (nbc.AgentPoolProfile.CustomKubeletConfig != nil || nbc.AgentPoolProfile.CustomLinuxOSConfig != nil))) { + if nodeConfig := s.Runtime.AKSNodeConfig; nodeConfig != nil && nodeConfig.KubeletConfig != nil && nodeConfig.KubeletConfig.EnableKubeletConfigFile { return true } - if nodeConfig := s.Runtime.AKSNodeConfig; nodeConfig != nil && nodeConfig.KubeletConfig != nil && nodeConfig.KubeletConfig.EnableKubeletConfigFile { + if nbc := s.Runtime.NBC; nbc != nil && (nbc.EnableKubeletConfigFile || + (nbc.AgentPoolProfile != nil && (nbc.AgentPoolProfile.CustomKubeletConfig != nil || nbc.AgentPoolProfile.CustomLinuxOSConfig != nil))) { return true } return false diff --git a/e2e/vmss.go b/e2e/vmss.go index e84fe77a6fc..aacbb5334cc 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -90,26 +90,65 @@ func ConfigureAndCreateVMSS(ctx context.Context, s *Scenario) (*ScenarioVM, erro // avoiding the race condition where runcmd or boothook scripts execute before networking is available. // Flatcar cannot use boothooks (coreos-cloudinit doesn't support MIME multipart), so it uses cloud-config // with a coreos.units block to define and start the service instead. 
-func CustomDataWithHack(s *Scenario, binaryURL string) (string, error) { - cloudConfigTemplate := `#cloud-boothook +func CustomDataWithHack(s *Scenario, binaryURL, nbcCmdScript string) (string, error) { + configPath := "/opt/azure/containers/aks-node-controller-config-hack.json" + nbcCmdPath := "/opt/azure/containers/aks-node-controller-nbc-cmd-hack.sh" + + // Build provision flags conditionally based on what's provided. + var flags []string + if s.Runtime.AKSNodeConfig != nil { + flags = append(flags, "--provision-config="+configPath) + } + if nbcCmdScript != "" { + flags = append(flags, "--nbc-cmd="+nbcCmdPath) + } + provisionFlags := strings.Join(flags, " ") + + // Encode AKSNodeConfig if provided. + var encodedAksNodeConfigJSON string + if s.Runtime.AKSNodeConfig != nil { + aksNodeConfigJSON, err := nodeconfigutils.MarshalConfigurationV1(s.Runtime.AKSNodeConfig) + if err != nil { + return "", fmt.Errorf("failed to marshal nbc, error: %w", err) + } + encodedAksNodeConfigJSON = base64.StdEncoding.EncodeToString(aksNodeConfigJSON) + } + + var customData string + if s.VHD.Flatcar { + customData = buildFlatcarCloudConfig(encodedAksNodeConfigJSON, configPath, nbcCmdScript, nbcCmdPath, binaryURL, provisionFlags) + } else { + customData = buildBoothookCloudConfig(encodedAksNodeConfigJSON, configPath, nbcCmdScript, nbcCmdPath, binaryURL, provisionFlags) + } + return base64.StdEncoding.EncodeToString([]byte(customData)), nil +} + +func buildBoothookCloudConfig(encodedConfig, configPath, nbcCmdScript, nbcCmdPath, binaryURL, provisionFlags string) string { + var sb strings.Builder + sb.WriteString(`#cloud-boothook #!/bin/bash set -euo pipefail mkdir -p /opt/azure/containers /opt/azure/bin -cat <<'EOF' | base64 -d > %[1]s -%[2]s -EOF -chmod 0600 %[1]s - +`) + if encodedConfig != "" { + fmt.Fprintf(&sb, "cat <<'EOF' | base64 -d > %s\n%s\nEOF\nchmod 0600 %s\n", + configPath, encodedConfig, configPath) + } + if nbcCmdScript != "" { + fmt.Fprintf(&sb, "\ncat <<'EOF' | base64 
-d > %s\n%s\nEOF\nchmod 0755 %s\n", + nbcCmdPath, base64.StdEncoding.EncodeToString([]byte(nbcCmdScript)), nbcCmdPath) + } + fmt.Fprintf(&sb, ` cat <<'SCRIPT' > /opt/azure/bin/run-aks-node-controller-hack.sh #!/bin/bash set -euo pipefail mkdir -p /opt/azure/bin -curl -fSL --retry 10 --retry-delay 2 "%[3]s" -o /opt/azure/bin/aks-node-controller-hack +curl -fSL --retry 10 --retry-delay 2 "%s" -o /opt/azure/bin/aks-node-controller-hack chmod +x /opt/azure/bin/aks-node-controller-hack -/opt/azure/bin/aks-node-controller-hack provision --provision-config=%[1]s +/opt/azure/bin/aks-node-controller-hack provision %s SCRIPT chmod +x /opt/azure/bin/run-aks-node-controller-hack.sh @@ -130,28 +169,42 @@ UNIT systemctl daemon-reload systemctl start --no-block aks-node-controller-hack.service -` - if s.VHD.Flatcar { - // Flatcar uses coreos-cloudinit which only supports a subset of cloud-config features - // and does not handle MIME multipart or boothooks. Use coreos.units to define the service instead. - // https://github.com/flatcar/coreos-cloudinit/blob/main/Documentation/cloud-config.md#coreos-parameters - cloudConfigTemplate = `#cloud-config -write_files: -- path: %[1]s +`, binaryURL, provisionFlags) + return sb.String() +} + +func buildFlatcarCloudConfig(encodedConfig, configPath, nbcCmdScript, nbcCmdPath, binaryURL, provisionFlags string) string { + // Flatcar uses coreos-cloudinit which only supports a subset of cloud-config features + // and does not handle MIME multipart or boothooks. Use coreos.units to define the service instead. 
+ // https://github.com/flatcar/coreos-cloudinit/blob/main/Documentation/cloud-config.md#coreos-parameters + var sb strings.Builder + sb.WriteString("#cloud-config\nwrite_files:\n") + if encodedConfig != "" { + fmt.Fprintf(&sb, `- path: %s permissions: "0600" owner: root content: !!binary | - %[2]s -- path: /opt/azure/bin/run-aks-node-controller-hack.sh + %s +`, configPath, encodedConfig) + } + if nbcCmdScript != "" { + fmt.Fprintf(&sb, `- path: %s + permissions: "0755" + owner: root + content: !!binary | + %s +`, nbcCmdPath, base64.StdEncoding.EncodeToString([]byte(nbcCmdScript))) + } + fmt.Fprintf(&sb, `- path: /opt/azure/bin/run-aks-node-controller-hack.sh permissions: "0755" owner: root content: | #!/bin/bash set -euo pipefail mkdir -p /opt/azure/bin - curl -fSL --retry 10 --retry-delay 2 "%[3]s" -o /opt/azure/bin/aks-node-controller-hack + curl -fSL --retry 10 --retry-delay 2 "%s" -o /opt/azure/bin/aks-node-controller-hack chmod +x /opt/azure/bin/aks-node-controller-hack - /opt/azure/bin/aks-node-controller-hack provision --provision-config=%[1]s + /opt/azure/bin/aks-node-controller-hack provision %s # Flatcar specific configuration. 
It supports only a subset of cloud-init features https://github.com/flatcar/coreos-cloudinit/blob/main/Documentation/cloud-config.md#coreos-parameters coreos: units: @@ -167,159 +220,62 @@ coreos: ExecStart=/opt/azure/bin/run-aks-node-controller-hack.sh [Install] WantedBy=multi-user.target -` - } - - aksNodeConfigJSON, err := nodeconfigutils.MarshalConfigurationV1(s.Runtime.AKSNodeConfig) - if err != nil { - return "", fmt.Errorf("failed to marshal nbc, error: %w", err) - } - encodedAksNodeConfigJSON := base64.StdEncoding.EncodeToString(aksNodeConfigJSON) - configPath := "/opt/azure/containers/aks-node-controller-config-hack.json" - - customDataYAML := fmt.Sprintf(cloudConfigTemplate, configPath, encodedAksNodeConfigJSON, binaryURL) - return base64.StdEncoding.EncodeToString([]byte(customDataYAML)), nil +`, binaryURL, provisionFlags) + return sb.String() } // CustomDataWithNBCCmdHack is similar to baker.boothooktemplate, but it uses a hack to run new aks-node-controller binary. -// Original aks-node-controller isn't run because it fails systemd check validating aks-node-controller-config.json exists -// (check aks-node-controller.service for details). -// with a coreos.units block to define and start the service instead. -func CustomDataWithNBCCmdHack(s *Scenario, customData, binaryURL string) (string, error) { - decoded, err := base64.StdEncoding.DecodeString(customData) - require.NoError(s.T, err) - - customData = strings.Replace(string(decoded), "aks-node-controller-nbc-cmd.sh", "aks-node-controller-nbc-cmd-hack.sh", -1) - - if s.VHD.Flatcar { - // For Flatcar, customData is an ignition JSON config from baker.go's flatcarTemplate. - // Ignition's "enabled: true" only creates enable symlinks but does NOT start services, - // so we can't use ignition JSON to start the hack service reliably. - // Instead, convert to #cloud-config format with coreos.units "command: start", - // which coreos-cloudinit processes and explicitly starts the service. 
- var ignitionConfig map[string]interface{} - if err := json.Unmarshal([]byte(customData), &ignitionConfig); err != nil { - return "", fmt.Errorf("failed to parse ignition config: %w", err) - } - - // Extract the nbc-cmd-hack.sh content from the ignition storage.files - var nbcCmdContent string - if storage, ok := ignitionConfig["storage"].(map[string]interface{}); ok { - if files, ok := storage["files"].([]interface{}); ok { - for _, f := range files { - file, _ := f.(map[string]interface{}) - if file["path"] == "/opt/azure/containers/aks-node-controller-nbc-cmd-hack.sh" { - if contents, ok := file["contents"].(map[string]interface{}); ok { - source, _ := contents["source"].(string) - // source is "data:;base64," - nbcCmdContent, _ = strings.CutPrefix(source, "data:;base64,") - // As of PR #8357, baker.go's flatcarTemplate marks the file with - // `compression: gzip`, so the base64 payload decodes to gzip bytes - // rather than plaintext shell. Ignition would normally gunzip it, - // but here we re-emit via cloud-config `!!binary`, which only - // base64-decodes. We must gunzip ourselves and re-base64 the - // plaintext, otherwise the resulting nbc-cmd-hack.sh contains raw - // gzip bytes and CSE exec fails with "cannot execute binary file" - // (exit 126). 
- if compression, _ := contents["compression"].(string); compression == "gzip" { - gzBytes, err := base64.StdEncoding.DecodeString(nbcCmdContent) - if err != nil { - return "", fmt.Errorf("failed to base64-decode gzipped nbc-cmd source: %w", err) - } - gzReader, err := gzip.NewReader(bytes.NewReader(gzBytes)) - if err != nil { - return "", fmt.Errorf("failed to create gzip reader for nbc-cmd source: %w", err) - } - plain, err := io.ReadAll(gzReader) - _ = gzReader.Close() - if err != nil { - return "", fmt.Errorf("failed to gunzip nbc-cmd source: %w", err) - } - nbcCmdContent = base64.StdEncoding.EncodeToString(plain) +func GunzipCustomData(s *Scenario, customData string) (string, error) { + var ignitionConfig map[string]interface{} + if err := json.Unmarshal([]byte(customData), &ignitionConfig); err != nil { + return "", fmt.Errorf("failed to parse ignition config: %w", err) + } + + // Extract the nbc-cmd-hack.sh content from the ignition storage.files + var nbcCmdContent string + if storage, ok := ignitionConfig["storage"].(map[string]interface{}); ok { + if files, ok := storage["files"].([]interface{}); ok { + for _, f := range files { + file, _ := f.(map[string]interface{}) + if file["path"] == "/opt/azure/containers/aks-node-controller-nbc-cmd-hack.sh" { + if contents, ok := file["contents"].(map[string]interface{}); ok { + source, _ := contents["source"].(string) + // source is "data:;base64," + nbcCmdContent, _ = strings.CutPrefix(source, "data:;base64,") + // As of PR #8357, baker.go's flatcarTemplate marks the file with + // `compression: gzip`, so the base64 payload decodes to gzip bytes + // rather than plaintext shell. Ignition would normally gunzip it, + // but here we re-emit via cloud-config `!!binary`, which only + // base64-decodes. We must gunzip ourselves and re-base64 the + // plaintext, otherwise the resulting nbc-cmd-hack.sh contains raw + // gzip bytes and CSE exec fails with "cannot execute binary file" + // (exit 126). 
+ if compression, _ := contents["compression"].(string); compression == "gzip" { + gzBytes, err := base64.StdEncoding.DecodeString(nbcCmdContent) + if err != nil { + return "", fmt.Errorf("failed to base64-decode gzipped nbc-cmd source: %w", err) + } + gzReader, err := gzip.NewReader(bytes.NewReader(gzBytes)) + if err != nil { + return "", fmt.Errorf("failed to create gzip reader for nbc-cmd source: %w", err) } + plain, err := io.ReadAll(gzReader) + _ = gzReader.Close() + if err != nil { + return "", fmt.Errorf("failed to gunzip nbc-cmd source: %w", err) + } + nbcCmdContent = base64.StdEncoding.EncodeToString(plain) } } } } } - if nbcCmdContent == "" { - return "", fmt.Errorf("failed to extract nbc-cmd-hack.sh content from ignition config") - } - - // Build a #cloud-config that writes both the nbc-cmd script and hack runner, - // then starts the hack service via coreos.units command: start - cloudConfig := fmt.Sprintf(`#cloud-config -write_files: -- path: /opt/azure/containers/aks-node-controller-nbc-cmd-hack.sh - permissions: "0600" - owner: root - content: !!binary | - %[1]s -- path: /opt/azure/bin/run-aks-node-controller-hack.sh - permissions: "0755" - owner: root - content: | - #!/bin/bash - set -euo pipefail - mkdir -p /opt/azure/bin - curl -fSL --retry 10 --retry-delay 2 "%[2]s" -o /opt/azure/bin/aks-node-controller-hack - chmod +x /opt/azure/bin/aks-node-controller-hack - /opt/azure/bin/aks-node-controller-hack provision --nbc-cmd=/opt/azure/containers/aks-node-controller-nbc-cmd-hack.sh -coreos: - units: - - name: aks-node-controller-hack.service - command: start - content: | - [Unit] - Description=Downloads and runs the AKS node controller hack - After=network-online.target - Wants=network-online.target - [Service] - Type=oneshot - ExecStart=/opt/azure/bin/run-aks-node-controller-hack.sh - [Install] - WantedBy=multi-user.target -`, nbcCmdContent, binaryURL) - - return base64.StdEncoding.EncodeToString([]byte(cloudConfig)), nil + } + if nbcCmdContent == "" 
{ + return "", fmt.Errorf("failed to extract nbc-cmd-hack.sh content from ignition config") } - cloudConfigTemplate := `%s - -mkdir -p /opt/azure/bin - -cat <<'SCRIPT' > /opt/azure/bin/run-aks-node-controller-hack.sh -#!/bin/bash -set -euo pipefail -mkdir -p /opt/azure/bin -curl -fSL --retry 10 --retry-delay 2 "%s" -o /opt/azure/bin/aks-node-controller-hack -chmod +x /opt/azure/bin/aks-node-controller-hack - -/opt/azure/bin/aks-node-controller-hack provision --nbc-cmd=/opt/azure/containers/aks-node-controller-nbc-cmd-hack.sh - -SCRIPT -chmod +x /opt/azure/bin/run-aks-node-controller-hack.sh - -cat <<'UNIT' > /etc/systemd/system/aks-node-controller-hack.service -[Unit] -Description=Downloads and runs the AKS node controller hack -After=network-online.target -Wants=network-online.target - -[Service] -Type=oneshot -ExecStart=/opt/azure/bin/run-aks-node-controller-hack.sh - -[Install] -WantedBy=basic.target -UNIT - -systemctl daemon-reload -systemctl start --no-block aks-node-controller-hack.service -` - - customDataYAML := fmt.Sprintf(cloudConfigTemplate, customData, binaryURL) - return base64.StdEncoding.EncodeToString([]byte(customDataYAML)), nil + return nbcCmdContent, nil } func createVMSSModel(ctx context.Context, s *Scenario) armcompute.VirtualMachineScaleSet { @@ -329,9 +285,20 @@ func createVMSSModel(ctx context.Context, s *Scenario) armcompute.VirtualMachine require.NoError(s.T, err) var cse, customData string - if s.Runtime.AKSNodeConfig != nil { + if s.Runtime.NBC != nil { + if s.Runtime.EnableScriptlessANC { + s.Runtime.NBC.EnableKubeletConfigFile = true + } + nodeBootstrapping, err = ab.GetNodeBootstrapping(ctx, s.Runtime.NBC) + require.NoError(s.T, err) + } + + if s.Runtime.AKSNodeConfig != nil && s.Runtime.EnableScriptlessANC { cse = nodeconfigutils.CSE - customData = func() string { + + var nbcCmdScript string + nbcCmdScript = nodeBootstrapping.CSE + customData = func(nbcCmdScript string) string { if config.Config.DisableScriptLessCompilation { var 
data string var err error @@ -345,20 +312,19 @@ func createVMSSModel(ctx context.Context, s *Scenario) armcompute.VirtualMachine } binaryURL, err := CachedCompileAndUploadAKSNodeController(ctx, s.VHD.Arch) require.NoError(s.T, err, "failed to compile and upload aks-node-controller binary") - data, err := CustomDataWithHack(s, binaryURL) + data, err := CustomDataWithHack(s, binaryURL, nbcCmdScript) require.NoError(s.T, err, "failed to generate custom data from AKSNodeConfig with hack") return data - }() + }(nbcCmdScript) } else { - nodeBootstrapping, err = ab.GetNodeBootstrapping(ctx, s.Runtime.NBC) - require.NoError(s.T, err) cse = nodeBootstrapping.CSE customData = nodeBootstrapping.CustomData if s.Runtime.NBC.EnableScriptlessNBCCSECmd && !config.Config.DisableScriptLessCompilation && !s.Tags.NetworkIsolated { binaryURL, err := CachedCompileAndUploadAKSNodeController(ctx, s.VHD.Arch) require.NoError(s.T, err, "failed to compile and upload aks-node-controller binary") - customData, err = CustomDataWithNBCCmdHack(s, customData, binaryURL) + customData, err = CustomDataWithHack(s, binaryURL, customData) + customData, err = GunzipCustomData(s, customData) require.NoError(s.T, err, "failed to generate custom data with NBC cmd hack") } if len(s.Config.CustomDataWriteFiles) > 0 { From 1bb406da323e137c32837fb1c38af7b6cf0a9189 Mon Sep 17 00:00:00 2001 From: Lily Pan Date: Mon, 4 May 2026 17:05:21 -0700 Subject: [PATCH 03/24] fix scriptless vs nonscriptless e2e logic --- e2e/test_helpers.go | 5 +---- e2e/types.go | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/e2e/test_helpers.go b/e2e/test_helpers.go index e3d49219b6d..128b6409a94 100644 --- a/e2e/test_helpers.go +++ b/e2e/test_helpers.go @@ -117,7 +117,7 @@ func RunScenario(t *testing.T, s *Scenario) { } func supportsScriptlessNBCCSECmd(s *Scenario) bool { - return s.AKSNodeConfigMutator == nil && !s.IsWindows() && len(s.Config.CustomDataWriteFiles) <= 0 && !s.VHDCaching && 
!config.Config.TestPreProvision && !s.SkipScriptlessNBC + return !s.Tags.Scriptless && !s.IsWindows() && len(s.Config.CustomDataWriteFiles) <= 0 && !s.VHDCaching && !config.Config.TestPreProvision && !s.SkipScriptlessNBC } func supportsScriptlessAKSNodeConfig(s *Scenario) bool { @@ -370,9 +370,6 @@ func maybeSkipScenario(ctx context.Context, t testing.TB, s *Scenario) { s.Tags.Arch = s.VHD.Arch s.Tags.ImageName = s.VHD.Name s.Tags.VHDCaching = s.VHDCaching - if s.AKSNodeConfigMutator != nil { - s.Tags.Scriptless = true - } if config.Config.TagsToRun != "" { matches, err := s.Tags.MatchesFilters(config.Config.TagsToRun) diff --git a/e2e/types.go b/e2e/types.go index 0d2953430dc..8618a9c92d0 100644 --- a/e2e/types.go +++ b/e2e/types.go @@ -279,7 +279,7 @@ func (s *Scenario) KubeletConfigFileEnabled() bool { if s.Runtime == nil { return false } - if nodeConfig := s.Runtime.AKSNodeConfig; nodeConfig != nil && nodeConfig.KubeletConfig != nil && nodeConfig.KubeletConfig.EnableKubeletConfigFile { + if nodeConfig := s.Runtime.AKSNodeConfig; nodeConfig != nil && nodeConfig.KubeletConfig != nil && nodeConfig.KubeletConfig.EnableKubeletConfigFile && (s.Runtime.EnableScriptlessANC || s.Tags.Scriptless) { return true } if nbc := s.Runtime.NBC; nbc != nil && (nbc.EnableKubeletConfigFile || From 02b46dc22b67a319ff43513029da720bbc6dcbfb Mon Sep 17 00:00:00 2001 From: Lily Pan Date: Tue, 5 May 2026 17:16:40 -0700 Subject: [PATCH 04/24] restore --- .../pkg/nodeconfigutils/utils.go | 52 +++ e2e/vmss.go | 300 ++++++++++-------- 2 files changed, 226 insertions(+), 126 deletions(-) diff --git a/aks-node-controller/pkg/nodeconfigutils/utils.go b/aks-node-controller/pkg/nodeconfigutils/utils.go index 6fe4bed34f0..6ee893890f8 100644 --- a/aks-node-controller/pkg/nodeconfigutils/utils.go +++ b/aks-node-controller/pkg/nodeconfigutils/utils.go @@ -16,6 +16,8 @@ const ( AKSNodeConfigFilePath = "/opt/azure/containers/aks-node-controller-config.json" + NBCCmdFilePath = 
"/opt/azure/containers/aks-node-controller-nbc-cmd.sh" + boothookTemplate = `#cloud-boothook #!/bin/bash set -euo pipefail @@ -29,6 +31,28 @@ cat <<'EOF' | base64 -d >%[1]s EOF chmod 0600 %[1]s +logger -t aks-boothook "launching aks-node-controller service $(date -Ins)" +systemctl start --no-block aks-node-controller.service +` + + boothookPhase3Template = `#cloud-boothook +#!/bin/bash +set -euo pipefail + +logger -t aks-boothook "boothook start $(date -Ins)" + +mkdir -p /opt/azure/containers + +cat <<'EOF' | base64 -d >%[1]s +%[2]s +EOF +chmod 0600 %[1]s + +cat <<'EOF' | base64 -d >%[3]s +%[4]s +EOF +chmod 0600 %[3]s + logger -t aks-boothook "launching aks-node-controller service $(date -Ins)" systemctl start --no-block aks-node-controller.service ` @@ -82,6 +106,34 @@ func CustomData(cfg *aksnodeconfigv1.Configuration) (string, error) { return base64.StdEncoding.EncodeToString(customData.Bytes()), nil } +func CustomDataPhase3(cfg *aksnodeconfigv1.Configuration, nbcCSECMD string) (string, error) { + aksNodeConfigJSON, err := MarshalConfigurationV1(cfg) + if err != nil { + return "", fmt.Errorf("failed to marshal nbc, error: %w", err) + } + + encodedAksNodeConfigJSON := base64.StdEncoding.EncodeToString(aksNodeConfigJSON) + boothook := fmt.Sprintf(boothookPhase3Template, AKSNodeConfigFilePath, encodedAksNodeConfigJSON, NBCCmdFilePath, nbcCSECMD) + + var customData bytes.Buffer + writer := multipart.NewWriter(&customData) + + fmt.Fprintf(&customData, "MIME-Version: 1.0\r\n") + fmt.Fprintf(&customData, "Content-Type: multipart/mixed; boundary=%q\r\n\r\n", writer.Boundary()) + + if err := writeMIMEPart(writer, "text/cloud-boothook", boothook); err != nil { + return "", fmt.Errorf("failed to write boothook part: %w", err) + } + if err := writeMIMEPart(writer, "text/cloud-config", cloudConfigTemplate); err != nil { + return "", fmt.Errorf("failed to write cloud-config part: %w", err) + } + if err := writer.Close(); err != nil { + return "", fmt.Errorf("failed to 
finalize multipart custom data: %w", err) + } + + return base64.StdEncoding.EncodeToString(customData.Bytes()), nil +} + // CustomDataFlatcar builds base64-encoded custom data for Flatcar Container Linux nodes. // Unlike Ubuntu/Azure Linux which use cloud-init and expect MIME multipart custom data, // Flatcar uses Ignition (configured via Butane) to process machine configuration. Ignition diff --git a/e2e/vmss.go b/e2e/vmss.go index aacbb5334cc..a8621b91a74 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -90,65 +90,26 @@ func ConfigureAndCreateVMSS(ctx context.Context, s *Scenario) (*ScenarioVM, erro // avoiding the race condition where runcmd or boothook scripts execute before networking is available. // Flatcar cannot use boothooks (coreos-cloudinit doesn't support MIME multipart), so it uses cloud-config // with a coreos.units block to define and start the service instead. -func CustomDataWithHack(s *Scenario, binaryURL, nbcCmdScript string) (string, error) { - configPath := "/opt/azure/containers/aks-node-controller-config-hack.json" - nbcCmdPath := "/opt/azure/containers/aks-node-controller-nbc-cmd-hack.sh" - - // Build provision flags conditionally based on what's provided. - var flags []string - if s.Runtime.AKSNodeConfig != nil { - flags = append(flags, "--provision-config="+configPath) - } - if nbcCmdScript != "" { - flags = append(flags, "--nbc-cmd="+nbcCmdPath) - } - provisionFlags := strings.Join(flags, " ") - - // Encode AKSNodeConfig if provided. 
- var encodedAksNodeConfigJSON string - if s.Runtime.AKSNodeConfig != nil { - aksNodeConfigJSON, err := nodeconfigutils.MarshalConfigurationV1(s.Runtime.AKSNodeConfig) - if err != nil { - return "", fmt.Errorf("failed to marshal nbc, error: %w", err) - } - encodedAksNodeConfigJSON = base64.StdEncoding.EncodeToString(aksNodeConfigJSON) - } - - var customData string - if s.VHD.Flatcar { - customData = buildFlatcarCloudConfig(encodedAksNodeConfigJSON, configPath, nbcCmdScript, nbcCmdPath, binaryURL, provisionFlags) - } else { - customData = buildBoothookCloudConfig(encodedAksNodeConfigJSON, configPath, nbcCmdScript, nbcCmdPath, binaryURL, provisionFlags) - } - return base64.StdEncoding.EncodeToString([]byte(customData)), nil -} - -func buildBoothookCloudConfig(encodedConfig, configPath, nbcCmdScript, nbcCmdPath, binaryURL, provisionFlags string) string { - var sb strings.Builder - sb.WriteString(`#cloud-boothook +func CustomDataWithHack(s *Scenario, binaryURL string) (string, error) { + cloudConfigTemplate := `#cloud-boothook #!/bin/bash set -euo pipefail mkdir -p /opt/azure/containers /opt/azure/bin -`) - if encodedConfig != "" { - fmt.Fprintf(&sb, "cat <<'EOF' | base64 -d > %s\n%s\nEOF\nchmod 0600 %s\n", - configPath, encodedConfig, configPath) - } - if nbcCmdScript != "" { - fmt.Fprintf(&sb, "\ncat <<'EOF' | base64 -d > %s\n%s\nEOF\nchmod 0755 %s\n", - nbcCmdPath, base64.StdEncoding.EncodeToString([]byte(nbcCmdScript)), nbcCmdPath) - } - fmt.Fprintf(&sb, ` +cat <<'EOF' | base64 -d > %[1]s +%[2]s +EOF +chmod 0600 %[1]s + cat <<'SCRIPT' > /opt/azure/bin/run-aks-node-controller-hack.sh #!/bin/bash set -euo pipefail mkdir -p /opt/azure/bin -curl -fSL --retry 10 --retry-delay 2 "%s" -o /opt/azure/bin/aks-node-controller-hack +curl -fSL --retry 10 --retry-delay 2 "%[3]s" -o /opt/azure/bin/aks-node-controller-hack chmod +x /opt/azure/bin/aks-node-controller-hack -/opt/azure/bin/aks-node-controller-hack provision %s +/opt/azure/bin/aks-node-controller-hack provision 
--provision-config=%[1]s SCRIPT chmod +x /opt/azure/bin/run-aks-node-controller-hack.sh @@ -169,42 +130,28 @@ UNIT systemctl daemon-reload systemctl start --no-block aks-node-controller-hack.service -`, binaryURL, provisionFlags) - return sb.String() -} - -func buildFlatcarCloudConfig(encodedConfig, configPath, nbcCmdScript, nbcCmdPath, binaryURL, provisionFlags string) string { - // Flatcar uses coreos-cloudinit which only supports a subset of cloud-config features - // and does not handle MIME multipart or boothooks. Use coreos.units to define the service instead. - // https://github.com/flatcar/coreos-cloudinit/blob/main/Documentation/cloud-config.md#coreos-parameters - var sb strings.Builder - sb.WriteString("#cloud-config\nwrite_files:\n") - if encodedConfig != "" { - fmt.Fprintf(&sb, `- path: %s +` + if s.VHD.Flatcar { + // Flatcar uses coreos-cloudinit which only supports a subset of cloud-config features + // and does not handle MIME multipart or boothooks. Use coreos.units to define the service instead. 
+ // https://github.com/flatcar/coreos-cloudinit/blob/main/Documentation/cloud-config.md#coreos-parameters + cloudConfigTemplate = `#cloud-config +write_files: +- path: %[1]s permissions: "0600" owner: root content: !!binary | - %s -`, configPath, encodedConfig) - } - if nbcCmdScript != "" { - fmt.Fprintf(&sb, `- path: %s - permissions: "0755" - owner: root - content: !!binary | - %s -`, nbcCmdPath, base64.StdEncoding.EncodeToString([]byte(nbcCmdScript))) - } - fmt.Fprintf(&sb, `- path: /opt/azure/bin/run-aks-node-controller-hack.sh + %[2]s +- path: /opt/azure/bin/run-aks-node-controller-hack.sh permissions: "0755" owner: root content: | #!/bin/bash set -euo pipefail mkdir -p /opt/azure/bin - curl -fSL --retry 10 --retry-delay 2 "%s" -o /opt/azure/bin/aks-node-controller-hack + curl -fSL --retry 10 --retry-delay 2 "%[3]s" -o /opt/azure/bin/aks-node-controller-hack chmod +x /opt/azure/bin/aks-node-controller-hack - /opt/azure/bin/aks-node-controller-hack provision %s + /opt/azure/bin/aks-node-controller-hack provision --provision-config=%[1]s # Flatcar specific configuration. 
It supports only a subset of cloud-init features https://github.com/flatcar/coreos-cloudinit/blob/main/Documentation/cloud-config.md#coreos-parameters coreos: units: @@ -220,62 +167,159 @@ coreos: ExecStart=/opt/azure/bin/run-aks-node-controller-hack.sh [Install] WantedBy=multi-user.target -`, binaryURL, provisionFlags) - return sb.String() +` + } + + aksNodeConfigJSON, err := nodeconfigutils.MarshalConfigurationV1(s.Runtime.AKSNodeConfig) + if err != nil { + return "", fmt.Errorf("failed to marshal nbc, error: %w", err) + } + encodedAksNodeConfigJSON := base64.StdEncoding.EncodeToString(aksNodeConfigJSON) + configPath := "/opt/azure/containers/aks-node-controller-config-hack.json" + + customDataYAML := fmt.Sprintf(cloudConfigTemplate, configPath, encodedAksNodeConfigJSON, binaryURL) + return base64.StdEncoding.EncodeToString([]byte(customDataYAML)), nil } // CustomDataWithNBCCmdHack is similar to baker.boothooktemplate, but it uses a hack to run new aks-node-controller binary. -func GunzipCustomData(s *Scenario, customData string) (string, error) { - var ignitionConfig map[string]interface{} - if err := json.Unmarshal([]byte(customData), &ignitionConfig); err != nil { - return "", fmt.Errorf("failed to parse ignition config: %w", err) - } - - // Extract the nbc-cmd-hack.sh content from the ignition storage.files - var nbcCmdContent string - if storage, ok := ignitionConfig["storage"].(map[string]interface{}); ok { - if files, ok := storage["files"].([]interface{}); ok { - for _, f := range files { - file, _ := f.(map[string]interface{}) - if file["path"] == "/opt/azure/containers/aks-node-controller-nbc-cmd-hack.sh" { - if contents, ok := file["contents"].(map[string]interface{}); ok { - source, _ := contents["source"].(string) - // source is "data:;base64," - nbcCmdContent, _ = strings.CutPrefix(source, "data:;base64,") - // As of PR #8357, baker.go's flatcarTemplate marks the file with - // `compression: gzip`, so the base64 payload decodes to gzip bytes - // 
rather than plaintext shell. Ignition would normally gunzip it, - // but here we re-emit via cloud-config `!!binary`, which only - // base64-decodes. We must gunzip ourselves and re-base64 the - // plaintext, otherwise the resulting nbc-cmd-hack.sh contains raw - // gzip bytes and CSE exec fails with "cannot execute binary file" - // (exit 126). - if compression, _ := contents["compression"].(string); compression == "gzip" { - gzBytes, err := base64.StdEncoding.DecodeString(nbcCmdContent) - if err != nil { - return "", fmt.Errorf("failed to base64-decode gzipped nbc-cmd source: %w", err) - } - gzReader, err := gzip.NewReader(bytes.NewReader(gzBytes)) - if err != nil { - return "", fmt.Errorf("failed to create gzip reader for nbc-cmd source: %w", err) - } - plain, err := io.ReadAll(gzReader) - _ = gzReader.Close() - if err != nil { - return "", fmt.Errorf("failed to gunzip nbc-cmd source: %w", err) +// Original aks-node-controller isn't run because it fails systemd check validating aks-node-controller-config.json exists +// (check aks-node-controller.service for details). +// with a coreos.units block to define and start the service instead. +func CustomDataWithNBCCmdHack(s *Scenario, customData, binaryURL string) (string, error) { + decoded, err := base64.StdEncoding.DecodeString(customData) + require.NoError(s.T, err) + + customData = strings.Replace(string(decoded), "aks-node-controller-nbc-cmd.sh", "aks-node-controller-nbc-cmd-hack.sh", -1) + + if s.VHD.Flatcar { + // For Flatcar, customData is an ignition JSON config from baker.go's flatcarTemplate. + // Ignition's "enabled: true" only creates enable symlinks but does NOT start services, + // so we can't use ignition JSON to start the hack service reliably. + // Instead, convert to #cloud-config format with coreos.units "command: start", + // which coreos-cloudinit processes and explicitly starts the service. 
+ var ignitionConfig map[string]interface{} + if err := json.Unmarshal([]byte(customData), &ignitionConfig); err != nil { + return "", fmt.Errorf("failed to parse ignition config: %w", err) + } + + // Extract the nbc-cmd-hack.sh content from the ignition storage.files + var nbcCmdContent string + if storage, ok := ignitionConfig["storage"].(map[string]interface{}); ok { + if files, ok := storage["files"].([]interface{}); ok { + for _, f := range files { + file, _ := f.(map[string]interface{}) + if file["path"] == "/opt/azure/containers/aks-node-controller-nbc-cmd-hack.sh" { + if contents, ok := file["contents"].(map[string]interface{}); ok { + source, _ := contents["source"].(string) + // source is "data:;base64," + nbcCmdContent, _ = strings.CutPrefix(source, "data:;base64,") + // As of PR #8357, baker.go's flatcarTemplate marks the file with + // `compression: gzip`, so the base64 payload decodes to gzip bytes + // rather than plaintext shell. Ignition would normally gunzip it, + // but here we re-emit via cloud-config `!!binary`, which only + // base64-decodes. We must gunzip ourselves and re-base64 the + // plaintext, otherwise the resulting nbc-cmd-hack.sh contains raw + // gzip bytes and CSE exec fails with "cannot execute binary file" + // (exit 126). 
+ if compression, _ := contents["compression"].(string); compression == "gzip" { + gzBytes, err := base64.StdEncoding.DecodeString(nbcCmdContent) + if err != nil { + return "", fmt.Errorf("failed to base64-decode gzipped nbc-cmd source: %w", err) + } + gzReader, err := gzip.NewReader(bytes.NewReader(gzBytes)) + if err != nil { + return "", fmt.Errorf("failed to create gzip reader for nbc-cmd source: %w", err) + } + plain, err := io.ReadAll(gzReader) + _ = gzReader.Close() + if err != nil { + return "", fmt.Errorf("failed to gunzip nbc-cmd source: %w", err) + } + nbcCmdContent = base64.StdEncoding.EncodeToString(plain) } - nbcCmdContent = base64.StdEncoding.EncodeToString(plain) } } } } } + if nbcCmdContent == "" { + return "", fmt.Errorf("failed to extract nbc-cmd-hack.sh content from ignition config") + } + + // Build a #cloud-config that writes both the nbc-cmd script and hack runner, + // then starts the hack service via coreos.units command: start + cloudConfig := fmt.Sprintf(`#cloud-config +write_files: +- path: /opt/azure/containers/aks-node-controller-nbc-cmd-hack.sh + permissions: "0600" + owner: root + content: !!binary | + %[1]s +- path: /opt/azure/bin/run-aks-node-controller-hack.sh + permissions: "0755" + owner: root + content: | + #!/bin/bash + set -euo pipefail + mkdir -p /opt/azure/bin + curl -fSL --retry 10 --retry-delay 2 "%[2]s" -o /opt/azure/bin/aks-node-controller-hack + chmod +x /opt/azure/bin/aks-node-controller-hack + /opt/azure/bin/aks-node-controller-hack provision --nbc-cmd=/opt/azure/containers/aks-node-controller-nbc-cmd-hack.sh +coreos: + units: + - name: aks-node-controller-hack.service + command: start + content: | + [Unit] + Description=Downloads and runs the AKS node controller hack + After=network-online.target + Wants=network-online.target + [Service] + Type=oneshot + ExecStart=/opt/azure/bin/run-aks-node-controller-hack.sh + [Install] + WantedBy=multi-user.target +`, nbcCmdContent, binaryURL) + + return 
base64.StdEncoding.EncodeToString([]byte(cloudConfig)), nil } - if nbcCmdContent == "" { - return "", fmt.Errorf("failed to extract nbc-cmd-hack.sh content from ignition config") - } - return nbcCmdContent, nil + cloudConfigTemplate := `%s + +mkdir -p /opt/azure/bin + +cat <<'SCRIPT' > /opt/azure/bin/run-aks-node-controller-hack.sh +#!/bin/bash +set -euo pipefail +mkdir -p /opt/azure/bin +curl -fSL --retry 10 --retry-delay 2 "%s" -o /opt/azure/bin/aks-node-controller-hack +chmod +x /opt/azure/bin/aks-node-controller-hack + +/opt/azure/bin/aks-node-controller-hack provision --nbc-cmd=/opt/azure/containers/aks-node-controller-nbc-cmd-hack.sh + +SCRIPT +chmod +x /opt/azure/bin/run-aks-node-controller-hack.sh + +cat <<'UNIT' > /etc/systemd/system/aks-node-controller-hack.service +[Unit] +Description=Downloads and runs the AKS node controller hack +After=network-online.target +Wants=network-online.target + +[Service] +Type=oneshot +ExecStart=/opt/azure/bin/run-aks-node-controller-hack.sh + +[Install] +WantedBy=basic.target +UNIT + +systemctl daemon-reload +systemctl start --no-block aks-node-controller-hack.service +` + + customDataYAML := fmt.Sprintf(cloudConfigTemplate, customData, binaryURL) + return base64.StdEncoding.EncodeToString([]byte(customDataYAML)), nil } func createVMSSModel(ctx context.Context, s *Scenario) armcompute.VirtualMachineScaleSet { @@ -293,12 +337,12 @@ func createVMSSModel(ctx context.Context, s *Scenario) armcompute.VirtualMachine require.NoError(s.T, err) } - if s.Runtime.AKSNodeConfig != nil && s.Runtime.EnableScriptlessANC { + if s.Runtime.AKSNodeConfig != nil { cse = nodeconfigutils.CSE var nbcCmdScript string nbcCmdScript = nodeBootstrapping.CSE - customData = func(nbcCmdScript string) string { + customData = func() string { if config.Config.DisableScriptLessCompilation { var data string var err error @@ -312,19 +356,23 @@ func createVMSSModel(ctx context.Context, s *Scenario) armcompute.VirtualMachine } binaryURL, err := 
CachedCompileAndUploadAKSNodeController(ctx, s.VHD.Arch) require.NoError(s.T, err, "failed to compile and upload aks-node-controller binary") - data, err := CustomDataWithHack(s, binaryURL, nbcCmdScript) + data, err := CustomDataWithHack(s, binaryURL) require.NoError(s.T, err, "failed to generate custom data from AKSNodeConfig with hack") return data - }(nbcCmdScript) + }() + + if s.Runtime.EnableScriptlessANC { + customData, err = nodeconfigutils.CustomDataPhase3(s.Runtime.AKSNodeConfig, nbcCmdScript) + require.NoError(s.T, err, "failed to generate custom data for phase 3") + } } else { cse = nodeBootstrapping.CSE customData = nodeBootstrapping.CustomData - if s.Runtime.NBC.EnableScriptlessNBCCSECmd && !config.Config.DisableScriptLessCompilation && !s.Tags.NetworkIsolated { + if s.Runtime.NBC.EnableScriptlessNBCCSECmd && !config.Config.DisableScriptLessCompilation && !s.Tags.NetworkIsolated && !s.Runtime.NBC.PreProvisionOnly { binaryURL, err := CachedCompileAndUploadAKSNodeController(ctx, s.VHD.Arch) require.NoError(s.T, err, "failed to compile and upload aks-node-controller binary") - customData, err = CustomDataWithHack(s, binaryURL, customData) - customData, err = GunzipCustomData(s, customData) + customData, err = CustomDataWithNBCCmdHack(s, customData, binaryURL) require.NoError(s.T, err, "failed to generate custom data with NBC cmd hack") } if len(s.Config.CustomDataWriteFiles) > 0 { From dba6573933ca46a5bb27853c3d3a8c44c5b08e2e Mon Sep 17 00:00:00 2001 From: Lily Pan Date: Sun, 10 May 2026 21:49:00 -0700 Subject: [PATCH 05/24] add phase 3 custom data and fix e2e logic --- aks-node-controller/app.go | 1 + .../pkg/nodeconfigutils/utils.go | 5 +- e2e/scenario_test.go | 52 +++---- e2e/test_helpers.go | 2 +- e2e/vmss.go | 128 ++++++++++++------ 5 files changed, 116 insertions(+), 72 deletions(-) diff --git a/aks-node-controller/app.go b/aks-node-controller/app.go index 68aa907b0d3..221f030c750 100644 --- a/aks-node-controller/app.go +++ 
b/aks-node-controller/app.go @@ -450,6 +450,7 @@ func (a *App) Provision(ctx context.Context, flags ProvisionFlags) (*ProvisionRe // If both flags are provided, compare environments before proceeding. // This is best-effort and should not block provisioning. if flags.ProvisionConfig != "" && flags.NBCCmd != "" { + slog.Info("ProvisionConfig and NBCCmd both provided, comparing envs") compareEnvs(ctx, flags, a.eventLogger) } diff --git a/aks-node-controller/pkg/nodeconfigutils/utils.go b/aks-node-controller/pkg/nodeconfigutils/utils.go index 6ee893890f8..365b7b0dbf0 100644 --- a/aks-node-controller/pkg/nodeconfigutils/utils.go +++ b/aks-node-controller/pkg/nodeconfigutils/utils.go @@ -51,7 +51,7 @@ chmod 0600 %[1]s cat <<'EOF' | base64 -d >%[3]s %[4]s EOF -chmod 0600 %[3]s +chmod 0755 %[3]s logger -t aks-boothook "launching aks-node-controller service $(date -Ins)" systemctl start --no-block aks-node-controller.service @@ -113,7 +113,8 @@ func CustomDataPhase3(cfg *aksnodeconfigv1.Configuration, nbcCSECMD string) (str } encodedAksNodeConfigJSON := base64.StdEncoding.EncodeToString(aksNodeConfigJSON) - boothook := fmt.Sprintf(boothookPhase3Template, AKSNodeConfigFilePath, encodedAksNodeConfigJSON, NBCCmdFilePath, nbcCSECMD) + encodedNBCCSECmd := base64.StdEncoding.EncodeToString([]byte(nbcCSECMD)) + boothook := fmt.Sprintf(boothookPhase3Template, AKSNodeConfigFilePath, encodedAksNodeConfigJSON, NBCCmdFilePath, encodedNBCCSECmd) var customData bytes.Buffer writer := multipart.NewWriter(&customData) diff --git a/e2e/scenario_test.go b/e2e/scenario_test.go index d18c483d933..41d8b912044 100644 --- a/e2e/scenario_test.go +++ b/e2e/scenario_test.go @@ -26,7 +26,7 @@ func Test_AzureLinux3OSGuard(t *testing.T) { BootstrapConfigMutator: func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) { nbc.AgentPoolProfile.LocalDNSProfile = nil }, - AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + AKSNodeConfigMutator: func(_ *Cluster, config 
*aksnodeconfigv1.Configuration) { config.LocalDnsProfile = nil }, Validator: func(ctx context.Context, s *Scenario) {}, @@ -50,7 +50,7 @@ func Test_Flatcar(t *testing.T) { }, } }, - AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + AKSNodeConfigMutator: func(_ *Cluster, config *aksnodeconfigv1.Configuration) { config.CustomCaCerts = []string{encodedTestCert} }, Validator: func(ctx context.Context, s *Scenario) { @@ -92,7 +92,7 @@ func Test_Flatcar_ARM64(t *testing.T) { nbc.AgentPoolProfile.VMSize = "Standard_D2pds_V5" nbc.IsARM64 = true }, - AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + AKSNodeConfigMutator: func(_ *Cluster, config *aksnodeconfigv1.Configuration) { config.VmSize = "Standard_D2pds_V5" }, Validator: func(ctx context.Context, s *Scenario) { @@ -114,7 +114,7 @@ func Test_AzureLinuxV3_ARM64(t *testing.T) { nbc.AgentPoolProfile.VMSize = "Standard_D2pds_V5" nbc.IsARM64 = true }, - AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + AKSNodeConfigMutator: func(_ *Cluster, config *aksnodeconfigv1.Configuration) { config.VmSize = "Standard_D2pds_V5" }, Validator: func(ctx context.Context, s *Scenario) { @@ -136,7 +136,7 @@ func Test_Flatcar_AzureCNI(t *testing.T) { nbc.ContainerService.Properties.OrchestratorProfile.KubernetesConfig.NetworkPlugin = string(armcontainerservice.NetworkPluginAzure) nbc.AgentPoolProfile.KubernetesConfig.NetworkPlugin = string(armcontainerservice.NetworkPluginAzure) }, - AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + AKSNodeConfigMutator: func(_ *Cluster, config *aksnodeconfigv1.Configuration) { config.NetworkConfig.NetworkPlugin = aksnodeconfigv1.NetworkPlugin_NETWORK_PLUGIN_AZURE }, Validator: func(ctx context.Context, s *Scenario) { @@ -158,7 +158,7 @@ func Test_Ubuntu2204_AzureCNI(t *testing.T) { nbc.ContainerService.Properties.OrchestratorProfile.KubernetesConfig.NetworkPlugin = string(armcontainerservice.NetworkPluginAzure) 
nbc.AgentPoolProfile.KubernetesConfig.NetworkPlugin = string(armcontainerservice.NetworkPluginAzure) }, - AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + AKSNodeConfigMutator: func(_ *Cluster, config *aksnodeconfigv1.Configuration) { config.NetworkConfig.NetworkPlugin = aksnodeconfigv1.NetworkPlugin_NETWORK_PLUGIN_AZURE }, Validator: func(ctx context.Context, s *Scenario) { @@ -221,7 +221,7 @@ func Test_ACL(t *testing.T) { }, } }, - AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + AKSNodeConfigMutator: func(_ *Cluster, config *aksnodeconfigv1.Configuration) { config.CustomCaCerts = []string{encodedTestCert} }, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { @@ -251,7 +251,7 @@ func Test_ACL_ARM64(t *testing.T) { nbc.AgentPoolProfile.VMSize = "Standard_D2pds_v6" nbc.IsARM64 = true }, - AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + AKSNodeConfigMutator: func(_ *Cluster, config *aksnodeconfigv1.Configuration) { config.VmSize = "Standard_D2pds_v6" }, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { @@ -301,7 +301,7 @@ func Test_ACL_AzureCNI(t *testing.T) { nbc.ContainerService.Properties.OrchestratorProfile.KubernetesConfig.NetworkPlugin = string(armcontainerservice.NetworkPluginAzure) nbc.AgentPoolProfile.KubernetesConfig.NetworkPlugin = string(armcontainerservice.NetworkPluginAzure) }, - AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + AKSNodeConfigMutator: func(_ *Cluster, config *aksnodeconfigv1.Configuration) { config.NetworkConfig.NetworkPlugin = aksnodeconfigv1.NetworkPlugin_NETWORK_PLUGIN_AZURE }, Validator: func(ctx context.Context, s *Scenario) { @@ -372,7 +372,7 @@ func Test_ACL_DisableSSH(t *testing.T) { BootstrapConfigMutator: func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) { nbc.SSHStatus = datamodel.SSHOff }, - AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + AKSNodeConfigMutator: func(_ *Cluster, 
config *aksnodeconfigv1.Configuration) { config.EnableSsh = to.Ptr(false) }, SkipSSHConnectivityValidation: true, // Skip SSH connectivity validation since SSH is down @@ -492,7 +492,7 @@ func Test_AzureLinuxV3_AzureCNI(t *testing.T) { nbc.ContainerService.Properties.OrchestratorProfile.KubernetesConfig.NetworkPlugin = string(armcontainerservice.NetworkPluginAzure) nbc.AgentPoolProfile.KubernetesConfig.NetworkPlugin = string(armcontainerservice.NetworkPluginAzure) }, - AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + AKSNodeConfigMutator: func(_ *Cluster, config *aksnodeconfigv1.Configuration) { config.NetworkConfig.NetworkPlugin = aksnodeconfigv1.NetworkPlugin_NETWORK_PLUGIN_AZURE }, }, @@ -513,7 +513,7 @@ func Test_AzureLinuxV3(t *testing.T) { }, } }, - AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + AKSNodeConfigMutator: func(_ *Cluster, config *aksnodeconfigv1.Configuration) { config.MessageOfTheDay = "Zm9vYmFyDQo=" config.CustomCaCerts = []string{encodedTestCert} }, @@ -750,7 +750,7 @@ func Test_Ubuntu2204(t *testing.T) { }, } }, - AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + AKSNodeConfigMutator: func(_ *Cluster, config *aksnodeconfigv1.Configuration) { config.MessageOfTheDay = "Zm9vYmFyDQo=" config.CustomCaCerts = []string{encodedTestCert} }, @@ -775,7 +775,7 @@ func Test_Ubuntu2204FIPS(t *testing.T) { VHD: config.VHDUbuntu2204FIPSContainerd, BootstrapConfigMutator: func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) { }, - AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + AKSNodeConfigMutator: func(_ *Cluster, config *aksnodeconfigv1.Configuration) { }, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { vmss.Properties.AdditionalCapabilities = &armcompute.AdditionalCapabilities{ @@ -802,7 +802,7 @@ func Test_Ubuntu2004FIPS(t *testing.T) { VHD: config.VHDUbuntu2004FIPSContainerd, BootstrapConfigMutator: func(_ *Cluster, nbc 
*datamodel.NodeBootstrappingConfiguration) { }, - AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + AKSNodeConfigMutator: func(_ *Cluster, config *aksnodeconfigv1.Configuration) { }, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { }, @@ -826,7 +826,7 @@ func Test_Ubuntu2204Gen2FIPS(t *testing.T) { VHD: config.VHDUbuntu2204Gen2FIPSContainerd, BootstrapConfigMutator: func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) { }, - AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + AKSNodeConfigMutator: func(_ *Cluster, config *aksnodeconfigv1.Configuration) { }, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { vmss.Properties.AdditionalCapabilities = &armcompute.AdditionalCapabilities{ @@ -856,7 +856,7 @@ func Test_Ubuntu2204Gen2FIPSTL(t *testing.T) { VHD: config.VHDUbuntu2204Gen2FIPSTLContainerd, BootstrapConfigMutator: func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) { }, - AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + AKSNodeConfigMutator: func(_ *Cluster, config *aksnodeconfigv1.Configuration) { }, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) @@ -886,7 +886,7 @@ func Test_Ubuntu2204_EntraIDSSH(t *testing.T) { // Enable Entra ID SSH authentication nbc.SSHStatus = datamodel.EntraIDSSH }, - AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + AKSNodeConfigMutator: func(_ *Cluster, config *aksnodeconfigv1.Configuration) { config.DisablePubkeyAuth = to.Ptr(true) }, SkipSSHConnectivityValidation: true, // Skip SSH connectivity validation since Entra ID SSH disables private key authentication @@ -940,7 +940,7 @@ func Test_AzureLinuxV3_DisableSSH(t *testing.T) { BootstrapConfigMutator: func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) { nbc.SSHStatus = datamodel.SSHOff }, - AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + 
AKSNodeConfigMutator: func(_ *Cluster, config *aksnodeconfigv1.Configuration) { config.EnableSsh = to.Ptr(false) }, SkipSSHConnectivityValidation: true, // Skip SSH connectivity validation since SSH is down @@ -962,7 +962,7 @@ func Test_Ubuntu2204_DisableSSH(t *testing.T) { BootstrapConfigMutator: func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) { nbc.SSHStatus = datamodel.SSHOff }, - AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + AKSNodeConfigMutator: func(_ *Cluster, config *aksnodeconfigv1.Configuration) { config.EnableSsh = to.Ptr(false) }, SkipSSHConnectivityValidation: true, // Skip SSH connectivity validation since SSH is down @@ -984,7 +984,7 @@ func Test_Flatcar_DisableSSH(t *testing.T) { BootstrapConfigMutator: func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) { nbc.SSHStatus = datamodel.SSHOff }, - AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + AKSNodeConfigMutator: func(_ *Cluster, config *aksnodeconfigv1.Configuration) { config.EnableSsh = to.Ptr(false) }, SkipSSHConnectivityValidation: true, // Skip SSH connectivity validation since SSH is down @@ -1290,7 +1290,7 @@ func Test_Ubuntu2204ARM64(t *testing.T) { nbc.AgentPoolProfile.VMSize = "Standard_D2pds_V5" nbc.IsARM64 = true }, - AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + AKSNodeConfigMutator: func(_ *Cluster, config *aksnodeconfigv1.Configuration) { config.VmSize = "Standard_D2pds_V5" }, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { @@ -1309,7 +1309,7 @@ func Test_Ubuntu2204_ArtifactStreaming(t *testing.T) { BootstrapConfigMutator: func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) { nbc.EnableArtifactStreaming = true }, - AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + AKSNodeConfigMutator: func(_ *Cluster, config *aksnodeconfigv1.Configuration) { config.EnableArtifactStreaming = true }, Validator: func(ctx context.Context, s *Scenario) { @@ 
-1334,7 +1334,7 @@ func Test_Ubuntu2204_ArtifactStreaming_ARM64(t *testing.T) { nbc.AgentPoolProfile.VMSize = "Standard_D2pds_V5" nbc.IsARM64 = true }, - AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + AKSNodeConfigMutator: func(_ *Cluster, config *aksnodeconfigv1.Configuration) { config.EnableArtifactStreaming = true config.VmSize = "Standard_D2pds_V5" }, @@ -1384,7 +1384,7 @@ func Test_AzureLinuxV3_ArtifactStreaming(t *testing.T) { BootstrapConfigMutator: func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) { nbc.EnableArtifactStreaming = true }, - AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + AKSNodeConfigMutator: func(_ *Cluster, config *aksnodeconfigv1.Configuration) { config.EnableArtifactStreaming = true }, Validator: func(ctx context.Context, s *Scenario) { @@ -1459,7 +1459,7 @@ func Test_Ubuntu2404_ArtifactStreaming_ARM64(t *testing.T) { nbc.AgentPoolProfile.VMSize = "Standard_D2pds_V5" nbc.IsARM64 = true }, - AKSNodeConfigMutator: func(config *aksnodeconfigv1.Configuration) { + AKSNodeConfigMutator: func(_ *Cluster, config *aksnodeconfigv1.Configuration) { config.EnableArtifactStreaming = true config.VmSize = "Standard_D2pds_V5" }, diff --git a/e2e/test_helpers.go b/e2e/test_helpers.go index 012d92fc6f1..f7ecc055770 100644 --- a/e2e/test_helpers.go +++ b/e2e/test_helpers.go @@ -292,7 +292,7 @@ func prepareAKSNode(ctx context.Context, s *Scenario) (*ScenarioVM, error) { if s.BootstrapConfigMutator != nil { s.BootstrapConfigMutator(s.Runtime.Cluster, nbc) } - if s.AKSNodeConfigMutator != nil { + if s.AKSNodeConfigMutator != nil && (s.Runtime.EnableScriptlessANC || s.Tags.Scriptless) { nodeconfig := nbcToAKSNodeConfigV1(nbc) s.AKSNodeConfigMutator(s.Runtime.Cluster, nodeconfig) s.Runtime.AKSNodeConfig = nodeconfig diff --git a/e2e/vmss.go b/e2e/vmss.go index a5bb03ede54..c7599e171b0 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -90,26 +90,67 @@ func ConfigureAndCreateVMSS(ctx context.Context, s 
*Scenario) (*ScenarioVM, erro // avoiding the race condition where runcmd or boothook scripts execute before networking is available. // Flatcar cannot use boothooks (coreos-cloudinit doesn't support MIME multipart), so it uses cloud-config // with a coreos.units block to define and start the service instead. -func CustomDataWithHack(s *Scenario, binaryURL string) (string, error) { - cloudConfigTemplate := `#cloud-boothook +func CustomDataWithHack(s *Scenario, nbcCmdScript, binaryURL string) (string, error) { + configPath := "/opt/azure/containers/aks-node-controller-config-hack.json" + nbcCmdPath := "/opt/azure/containers/aks-node-controller-nbc-cmd-hack.sh" + + // Build provision flags conditionally based on what's provided. + var flags []string + var encodedNBCCSECmd string + if s.Runtime.AKSNodeConfig != nil { + flags = append(flags, "--provision-config="+configPath) + } + if nbcCmdScript != "" { + flags = append(flags, "--nbc-cmd="+nbcCmdPath) + encodedNBCCSECmd = base64.StdEncoding.EncodeToString([]byte(nbcCmdScript)) + } + provisionFlags := strings.Join(flags, " ") + + // Encode AKSNodeConfig if provided. 
+ var encodedAksNodeConfigJSON string + if s.Runtime.AKSNodeConfig != nil { + aksNodeConfigJSON, err := nodeconfigutils.MarshalConfigurationV1(s.Runtime.AKSNodeConfig) + if err != nil { + return "", fmt.Errorf("failed to marshal nbc, error: %w", err) + } + encodedAksNodeConfigJSON = base64.StdEncoding.EncodeToString(aksNodeConfigJSON) + } + + var customData string + if s.VHD.Flatcar { + customData = buildFlatcarCloudConfig(encodedAksNodeConfigJSON, configPath, encodedNBCCSECmd, nbcCmdPath, binaryURL, provisionFlags) + } else { + customData = buildBoothookCloudConfig(encodedAksNodeConfigJSON, configPath, encodedNBCCSECmd, nbcCmdPath, binaryURL, provisionFlags) + } + return base64.StdEncoding.EncodeToString([]byte(customData)), nil +} + +func buildBoothookCloudConfig(encodedConfig, configPath, encodedNBCCmd, nbcCmdPath, binaryURL, provisionFlags string) string { + var sb strings.Builder + sb.WriteString(`#cloud-boothook #!/bin/bash set -euo pipefail mkdir -p /opt/azure/containers /opt/azure/bin -cat <<'EOF' | base64 -d > %[1]s -%[2]s -EOF -chmod 0600 %[1]s - +`) + if encodedConfig != "" { + fmt.Fprintf(&sb, "cat <<'EOF' | base64 -d > %s\n%s\nEOF\nchmod 0600 %s\n", + configPath, encodedConfig, configPath) + } + if encodedNBCCmd != "" { + fmt.Fprintf(&sb, "\ncat <<'EOF' | base64 -d > %s\n%s\nEOF\nchmod 0755 %s\n", + nbcCmdPath, encodedNBCCmd, nbcCmdPath) + } + fmt.Fprintf(&sb, ` cat <<'SCRIPT' > /opt/azure/bin/run-aks-node-controller-hack.sh #!/bin/bash set -euo pipefail mkdir -p /opt/azure/bin -curl -fSL --retry 10 --retry-delay 2 "%[3]s" -o /opt/azure/bin/aks-node-controller-hack +curl -fSL --retry 10 --retry-delay 2 "%s" -o /opt/azure/bin/aks-node-controller-hack chmod +x /opt/azure/bin/aks-node-controller-hack -/opt/azure/bin/aks-node-controller-hack provision --provision-config=%[1]s +/opt/azure/bin/aks-node-controller-hack provision %s SCRIPT chmod +x /opt/azure/bin/run-aks-node-controller-hack.sh @@ -130,28 +171,42 @@ UNIT systemctl daemon-reload systemctl start 
--no-block aks-node-controller-hack.service -` - if s.VHD.Flatcar { - // Flatcar uses coreos-cloudinit which only supports a subset of cloud-config features - // and does not handle MIME multipart or boothooks. Use coreos.units to define the service instead. - // https://github.com/flatcar/coreos-cloudinit/blob/main/Documentation/cloud-config.md#coreos-parameters - cloudConfigTemplate = `#cloud-config -write_files: -- path: %[1]s +`, binaryURL, provisionFlags) + return sb.String() +} + +func buildFlatcarCloudConfig(encodedConfig, configPath, encodedNBCCmd, nbcCmdPath, binaryURL, provisionFlags string) string { + // Flatcar uses coreos-cloudinit which only supports a subset of cloud-config features + // and does not handle MIME multipart or boothooks. Use coreos.units to define the service instead. + // https://github.com/flatcar/coreos-cloudinit/blob/main/Documentation/cloud-config.md#coreos-parameters + var sb strings.Builder + sb.WriteString("#cloud-config\nwrite_files:\n") + if encodedConfig != "" { + fmt.Fprintf(&sb, `- path: %s permissions: "0600" owner: root content: !!binary | - %[2]s -- path: /opt/azure/bin/run-aks-node-controller-hack.sh + %s +`, configPath, encodedConfig) + } + if encodedNBCCmd != "" { + fmt.Fprintf(&sb, `- path: %s + permissions: "0755" + owner: root + content: !!binary | + %s +`, nbcCmdPath, encodedNBCCmd) + } + fmt.Fprintf(&sb, `- path: /opt/azure/bin/run-aks-node-controller-hack.sh permissions: "0755" owner: root content: | #!/bin/bash set -euo pipefail mkdir -p /opt/azure/bin - curl -fSL --retry 10 --retry-delay 2 "%[3]s" -o /opt/azure/bin/aks-node-controller-hack + curl -fSL --retry 10 --retry-delay 2 "%s" -o /opt/azure/bin/aks-node-controller-hack chmod +x /opt/azure/bin/aks-node-controller-hack - /opt/azure/bin/aks-node-controller-hack provision --provision-config=%[1]s + /opt/azure/bin/aks-node-controller-hack provision %s # Flatcar specific configuration. 
It supports only a subset of cloud-init features https://github.com/flatcar/coreos-cloudinit/blob/main/Documentation/cloud-config.md#coreos-parameters coreos: units: @@ -167,18 +222,8 @@ coreos: ExecStart=/opt/azure/bin/run-aks-node-controller-hack.sh [Install] WantedBy=multi-user.target -` - } - - aksNodeConfigJSON, err := nodeconfigutils.MarshalConfigurationV1(s.Runtime.AKSNodeConfig) - if err != nil { - return "", fmt.Errorf("failed to marshal nbc, error: %w", err) - } - encodedAksNodeConfigJSON := base64.StdEncoding.EncodeToString(aksNodeConfigJSON) - configPath := "/opt/azure/containers/aks-node-controller-config-hack.json" - - customDataYAML := fmt.Sprintf(cloudConfigTemplate, configPath, encodedAksNodeConfigJSON, binaryURL) - return base64.StdEncoding.EncodeToString([]byte(customDataYAML)), nil +`, binaryURL, provisionFlags) + return sb.String() } // CustomDataWithNBCCmdHack is similar to baker.boothooktemplate, but it uses a hack to run new aks-node-controller binary. @@ -340,9 +385,11 @@ func createVMSSModel(ctx context.Context, s *Scenario) armcompute.VirtualMachine if s.Runtime.AKSNodeConfig != nil { cse = nodeconfigutils.CSE - var nbcCmdScript string - nbcCmdScript = nodeBootstrapping.CSE - customData = func() string { + var nbcCSECmd string + if s.Runtime.EnableScriptlessANC { + nbcCSECmd = nodeBootstrapping.CSE + } + customData = func(nbcCSECmd string) string { if config.Config.DisableScriptLessCompilation { var data string var err error @@ -356,15 +403,10 @@ func createVMSSModel(ctx context.Context, s *Scenario) armcompute.VirtualMachine } binaryURL, err := CachedCompileAndUploadAKSNodeController(ctx, s.VHD.Arch) require.NoError(s.T, err, "failed to compile and upload aks-node-controller binary") - data, err := CustomDataWithHack(s, binaryURL) + data, err := CustomDataWithHack(s, nbcCSECmd, binaryURL) require.NoError(s.T, err, "failed to generate custom data from AKSNodeConfig with hack") return data - }() - - if s.Runtime.EnableScriptlessANC { - 
customData, err = nodeconfigutils.CustomDataPhase3(s.Runtime.AKSNodeConfig, nbcCmdScript) - require.NoError(s.T, err, "failed to generate custom data for phase 3") - } + }(nbcCSECmd) } else { cse = nodeBootstrapping.CSE From b8e9aa5413291911ff00c51808cdb470b01e89f8 Mon Sep 17 00:00:00 2001 From: Lily Pan Date: Mon, 18 May 2026 14:25:16 -0700 Subject: [PATCH 06/24] fix diffs --- aks-node-controller/app.go | 16 ++- aks-node-controller/helpers/const.go | 4 +- aks-node-controller/parser/helper.go | 22 ++- aks-node-controller/parser/parser.go | 8 +- .../parser/templates/localdns.toml.gtpl | 4 +- e2e/node_config.go | 133 ++++++++++++------ e2e/types.go | 2 +- e2e/vmss.go | 48 ++++--- .../linux/cloud-init/artifacts/cse_config.sh | 2 +- pkg/agent/baker.go | 52 ++++--- pkg/agent/utils.go | 12 +- 11 files changed, 186 insertions(+), 117 deletions(-) diff --git a/aks-node-controller/app.go b/aks-node-controller/app.go index 221f030c750..b68a4a81787 100644 --- a/aks-node-controller/app.go +++ b/aks-node-controller/app.go @@ -23,6 +23,14 @@ import ( "github.com/urfave/cli/v3" ) +var deprecatedCSEVars = map[string]bool{ + "CLOUD_INIT_STATUS_SCRIPT": true, + "HYPERKUBE_URL": true, + "MCR_REPOSITORY_BASE": true, + "BLOCK_OUTBOUND_NETWORK": true, + "DISABLE_PUBKEY_AUTH": true, +} + type App struct { // cmdRun is a function that runs the given command. // the goal of this field is to make it easier to test the app by mocking the command runner. 
@@ -295,11 +303,13 @@ func compareEnvs(ctx context.Context, flags ProvisionFlags, eventLogger *helpers nbcVal, inNBC := nbcEnv[key] switch { case inPC && !inNBC: - diffs = append(diffs, fmt.Sprintf("only-in-pc: %s", key)) + diffs = append(diffs, fmt.Sprintf("only-in-pc: %s = %q", key, pcVal)) case !inPC && inNBC: - diffs = append(diffs, fmt.Sprintf("only-in-nbc: %s", key)) + if !deprecatedCSEVars[key] { + diffs = append(diffs, fmt.Sprintf("only-in-nbc: %s = %q", key, nbcVal)) + } case pcVal != nbcVal: - diffs = append(diffs, fmt.Sprintf("differs: %s", key)) + diffs = append(diffs, fmt.Sprintf("differs: %s pc=%q nbc=%q", key, pcVal, nbcVal)) } } diff --git a/aks-node-controller/helpers/const.go b/aks-node-controller/helpers/const.go index e51261f25d3..73485491d01 100644 --- a/aks-node-controller/helpers/const.go +++ b/aks-node-controller/helpers/const.go @@ -7,8 +7,8 @@ const ( NetworkPluginKubenet = "kubenet" NetworkPolicyAzure = "azure" NetworkPolicyCalico = "calico" - LoadBalancerBasic = "basic" - LoadBalancerStandard = "standard" + LoadBalancerBasic = "Basic" + LoadBalancerStandard = "Standard" VMSizeStandardDc2s = "Standard_DC2s" VMSizeStandardDc4s = "Standard_DC4s" DefaultLinuxUser = "azureuser" diff --git a/aks-node-controller/parser/helper.go b/aks-node-controller/parser/helper.go index 99c69c7aa4f..9411a173703 100644 --- a/aks-node-controller/parser/helper.go +++ b/aks-node-controller/parser/helper.go @@ -471,24 +471,18 @@ func getPortRangeEndValue(portRange string) int { // createSortedKeyValuePairs creates a string with key=value pairs, sorted by key, with custom delimiter. func createSortedKeyValuePairs[T any](m map[string]T, delimiter string) string { - keys := []string{} + keys := make([]string, 0, len(m)) for key := range m { keys = append(keys, key) } // we are sorting the keys for deterministic output for readability and testing. 
sort.Strings(keys) - var buf bytes.Buffer - i := 0 + pairs := make([]string, 0, len(keys)) for _, key := range keys { - i++ - // set the last delimiter to empty string - if i == len(keys) { - delimiter = "" - } - buf.WriteString(fmt.Sprintf("%s=%v%s", key, m[key], delimiter)) + pairs = append(pairs, fmt.Sprintf("%s=%v", key, m[key])) } - return buf.String() + return strings.Join(pairs, delimiter) } func getExcludeMasterFromStandardLB(lb *aksnodeconfigv1.LoadBalancerConfig) bool { @@ -652,7 +646,7 @@ func marshalToJSON(v any) ([]byte, error) { } var rawMessage json.RawMessage = data - jsonByte, err := json.MarshalIndent(rawMessage, "", " ") + jsonByte, err := json.MarshalIndent(rawMessage, "", " ") if err != nil { log.Printf("error marshalling kubelet config file content: %v", err) return nil, err @@ -694,13 +688,13 @@ func getProxyVariables(proxyConfig *aksnodeconfigv1.HttpProxyConfig) string { proxyVars := "" if proxyConfig.GetHttpProxy() != "" { // from https://curl.se/docs/manual.html, curl uses http_proxy but uppercase for others? 
- proxyVars = fmt.Sprintf("export http_proxy=\"%s\";", proxyConfig.GetHttpProxy()) + proxyVars = fmt.Sprintf("export http_proxy=%s;", proxyConfig.GetHttpProxy()) } if proxyConfig.GetHttpsProxy() != "" { - proxyVars = fmt.Sprintf("export HTTPS_PROXY=\"%s\"; %s", proxyConfig.GetHttpsProxy(), proxyVars) + proxyVars = fmt.Sprintf("export HTTPS_PROXY=%s; %s", proxyConfig.GetHttpsProxy(), proxyVars) } if proxyConfig.GetNoProxyEntries() != nil { - proxyVars = fmt.Sprintf("export NO_PROXY=\"%s\"; %s", strings.Join(proxyConfig.GetNoProxyEntries(), ","), proxyVars) + proxyVars = fmt.Sprintf("export NO_PROXY=%s; %s", strings.Join(proxyConfig.GetNoProxyEntries(), ","), proxyVars) } return proxyVars } diff --git a/aks-node-controller/parser/parser.go b/aks-node-controller/parser/parser.go index 73c9618bb7f..0561254c8bb 100644 --- a/aks-node-controller/parser/parser.go +++ b/aks-node-controller/parser/parser.go @@ -33,7 +33,7 @@ func executeBootstrapTemplate(inputContract *aksnodeconfigv1.Configuration) (str func getCSEEnv(config *aksnodeconfigv1.Configuration) map[string]string { cloudProviderSettings := getCloudProviderSettings(config) env := map[string]string{ - "PROVISION_OUTPUT": "/var/log/azure/cluster-provision.log", + "PROVISION_OUTPUT": "/var/log/azure/cluster-provision-cse-output.log", "MOBY_VERSION": "", "CLOUDPROVIDER_BACKOFF": fmt.Sprintf("%v", cloudProviderSettings.backoff), "CLOUDPROVIDER_BACKOFF_MODE": cloudProviderSettings.backoffMode, @@ -47,7 +47,7 @@ func getCSEEnv(config *aksnodeconfigv1.Configuration) map[string]string { "CLOUDPROVIDER_RATELIMIT_BUCKET": fmt.Sprintf("%v", cloudProviderSettings.rateLimitBucket), "CLOUDPROVIDER_RATELIMIT_BUCKET_WRITE": fmt.Sprintf("%v", cloudProviderSettings.rateLimitBucketWrite), "CLI_TOOL": "ctr", - "NETWORK_MODE": "transparent", + "NETWORK_MODE": "", "ADMINUSER": getLinuxAdminUsername(config.GetLinuxAdminUsername()), "TENANT_ID": config.GetAuthConfig().GetTenantId(), "KUBERNETES_VERSION": config.GetKubernetesVersion(), @@ 
-188,13 +188,13 @@ func getCSEEnv(config *aksnodeconfigv1.Configuration) map[string]string { "LOCALDNS_GENERATED_COREFILE": getLocalDnsCorefileBase64WithHostsPlugin(config, false), "LOCALDNS_COREFILE_BASE": getLocalDnsCorefileBase64WithHostsPlugin(config, false), "LOCALDNS_COREFILE_WITH_HOSTS": getLocalDnsCorefileBase64WithHostsPlugin(config, true), - "DISABLE_PUBKEY_AUTH": fmt.Sprintf("%v", config.GetDisablePubkeyAuth()), "SERVICE_ACCOUNT_IMAGE_PULL_ENABLED": fmt.Sprintf("%v", config.GetServiceAccountImagePullProfile().GetEnabled()), "SERVICE_ACCOUNT_IMAGE_PULL_DEFAULT_CLIENT_ID": config.GetServiceAccountImagePullProfile().GetDefaultClientId(), "SERVICE_ACCOUNT_IMAGE_PULL_DEFAULT_TENANT_ID": config.GetServiceAccountImagePullProfile().GetDefaultTenantId(), "IDENTITY_BINDINGS_LOCAL_AUTHORITY_SNI": config.GetServiceAccountImagePullProfile().GetLocalAuthoritySni(), "CSE_TIMEOUT": getCSETimeout(config), - "SKIP_WAAGENT_HOLD": "true", + "SKIP_WAAGENT_HOLD": "false", + "NETWORK_ISOLATED_CLUSTER_TEST_MODE": "false", // temp: needs to be added to config } for i, cert := range config.CustomCaCerts { diff --git a/aks-node-controller/parser/templates/localdns.toml.gtpl b/aks-node-controller/parser/templates/localdns.toml.gtpl index 99584dde2a1..818b23aa421 100644 --- a/aks-node-controller/parser/templates/localdns.toml.gtpl +++ b/aks-node-controller/parser/templates/localdns.toml.gtpl @@ -131,13 +131,11 @@ health-check.localdns.local:53 { template ANY ANY internal.cloudapp.net { match "^(?:[^.]+\.){4,}internal\.cloudapp\.net\.$" rcode NXDOMAIN - fallthrough - } template ANY ANY reddog.microsoft.com { rcode NXDOMAIN } {{- end}} } -{{- end}} \ No newline at end of file +{{- end}} diff --git a/e2e/node_config.go b/e2e/node_config.go index e76537b03b7..63197dc4801 100644 --- a/e2e/node_config.go +++ b/e2e/node_config.go @@ -20,7 +20,7 @@ import ( // this is a base kubelet config for Scriptless e2e test func baseKubeletConfig() *aksnodeconfigv1.KubeletConfig { return 
&aksnodeconfigv1.KubeletConfig{ - EnableKubeletConfigFile: true, + EnableKubeletConfigFile: false, KubeletFlags: map[string]string{ "--cloud-provider": "external", "--kubeconfig": "/var/lib/kubelet/kubeconfig", @@ -58,9 +58,11 @@ func baseKubeletConfig() *aksnodeconfigv1.KubeletConfig { Webhook: &aksnodeconfigv1.KubeletWebhookAuthentication{ Enabled: true, }, + Anonymous: &aksnodeconfigv1.KubeletAnonymousAuthentication{}, }, Authorization: &aksnodeconfigv1.KubeletAuthorization{ - Mode: "Webhook", + Mode: "Webhook", + Webhook: &aksnodeconfigv1.KubeletWebhookAuthorization{}, }, EventRecordQps: to.Ptr(int32(0)), ClusterDomain: "cluster.local", @@ -81,8 +83,9 @@ func baseKubeletConfig() *aksnodeconfigv1.KubeletConfig { "nodefs.inodesFree": "5%", }, ProtectKernelDefaults: true, - FeatureGates: map[string]bool{}, - FailSwapOn: to.Ptr(false), + FeatureGates: map[string]bool{ + "RotateKubeletServerCertificate": true, + }, KubeReserved: map[string]string{ "cpu": "100m", "memory": "1638Mi", @@ -90,10 +93,6 @@ func baseKubeletConfig() *aksnodeconfigv1.KubeletConfig { EnforceNodeAllocatable: []string{ "pods", }, - AllowedUnsafeSysctls: []string{ - "kernel.msg*", - "net.ipv4.route.min_pmtu", - }, }, } } @@ -159,16 +158,55 @@ func nbcToAKSNodeConfigV1(nbc *datamodel.NodeBootstrappingConfiguration) *aksnod bootstrappingConfig.BootstrappingAuthMethod = aksnodeconfigv1.BootstrappingAuthMethod_BOOTSTRAPPING_AUTH_METHOD_SECURE_TLS_BOOTSTRAPPING } - return &aksnodeconfigv1.Configuration{ + k8sConfig := cs.Properties.OrchestratorProfile.KubernetesConfig + + // Derive UseInstanceMetadata from NBC config. + useInstanceMeta := false + if k8sConfig.UseInstanceMetadata != nil { + useInstanceMeta = *k8sConfig.UseInstanceMetadata + } + + // Base64-encode SP secret and kubelet client key to match what baker.go renders. 
+ spSecret := "" + if cs.Properties.ServicePrincipalProfile != nil && cs.Properties.ServicePrincipalProfile.Secret != "" { + spSecret = base64.StdEncoding.EncodeToString([]byte(cs.Properties.ServicePrincipalProfile.Secret)) + } + kubeletClientKey := "" + if cs.Properties.CertificateProfile != nil && cs.Properties.CertificateProfile.ClientPrivateKey != "" { + kubeletClientKey = base64.StdEncoding.EncodeToString([]byte(cs.Properties.CertificateProfile.ClientPrivateKey)) + } + + // Build HttpProxyConfig with all fields (not just NoProxyEntries) to match NBC. + httpProxyConfig := &aksnodeconfigv1.HttpProxyConfig{} + if nbc.HTTPProxyConfig != nil { + if nbc.HTTPProxyConfig.NoProxy != nil { + httpProxyConfig.NoProxyEntries = *nbc.HTTPProxyConfig.NoProxy + } + if nbc.HTTPProxyConfig.HTTPProxy != nil { + httpProxyConfig.HttpProxy = *nbc.HTTPProxyConfig.HTTPProxy + } + if nbc.HTTPProxyConfig.HTTPSProxy != nil { + httpProxyConfig.HttpsProxy = *nbc.HTTPProxyConfig.HTTPSProxy + } + if nbc.HTTPProxyConfig.TrustedCA != nil { + httpProxyConfig.ProxyTrustedCa = *nbc.HTTPProxyConfig.TrustedCA + } + } + + // Derive EnableUnattendedUpgrade from NBC (baker uses !DisableUnattendedUpgrades). 
+ enableUnattendedUpgrade := !nbc.DisableUnattendedUpgrades + //config.GetClusterConfig().GetLoadBalancerConfig().GetLoadBalancerSku() + cfg := &aksnodeconfigv1.Configuration{ Version: "v1", BootstrappingConfig: bootstrappingConfig, DisableCustomData: true, LinuxAdminUsername: "azureuser", VmSize: config.Config.DefaultVMSKU, ClusterConfig: &aksnodeconfigv1.ClusterConfig{ - Location: nbc.ContainerService.Location, - ResourceGroup: nbc.ResourceGroupName, - VmType: aksnodeconfigv1.VmType_VM_TYPE_VMSS, + Location: nbc.ContainerService.Location, + ResourceGroup: nbc.ResourceGroupName, + VmType: aksnodeconfigv1.VmType_VM_TYPE_VMSS, + UseInstanceMetadata: useInstanceMeta, ClusterNetworkConfig: &aksnodeconfigv1.ClusterNetworkConfig{ SecurityGroupName: cs.Properties.GetNSGName(), VnetName: cs.Properties.GetVirtualNetworkName(), @@ -177,26 +215,29 @@ func nbcToAKSNodeConfigV1(nbc *datamodel.NodeBootstrappingConfiguration) *aksnod RouteTable: cs.Properties.GetRouteTableName(), }, CloudProviderConfig: &aksnodeconfigv1.CloudProviderConfig{ - Backoff: cs.Properties.OrchestratorProfile.KubernetesConfig.CloudProviderBackoff, - BackoffMode: cs.Properties.OrchestratorProfile.KubernetesConfig.CloudProviderBackoffMode, - BackoffRetries: to.Ptr(int32(cs.Properties.OrchestratorProfile.KubernetesConfig.CloudProviderBackoffRetries)), - BackoffExponent: to.Ptr(cs.Properties.OrchestratorProfile.KubernetesConfig.CloudProviderBackoffExponent), - BackoffDuration: to.Ptr(int32(cs.Properties.OrchestratorProfile.KubernetesConfig.CloudProviderBackoffDuration)), - BackoffJitter: to.Ptr(cs.Properties.OrchestratorProfile.KubernetesConfig.CloudProviderBackoffJitter), - RateLimit: cs.Properties.OrchestratorProfile.KubernetesConfig.CloudProviderRateLimit, - RateLimitQps: to.Ptr(cs.Properties.OrchestratorProfile.KubernetesConfig.CloudProviderRateLimitQPS), - RateLimitQpsWrite: to.Ptr(cs.Properties.OrchestratorProfile.KubernetesConfig.CloudProviderRateLimitQPSWrite), - RateLimitBucket: 
to.Ptr(int32(cs.Properties.OrchestratorProfile.KubernetesConfig.CloudProviderRateLimitBucket)), - RateLimitBucketWrite: to.Ptr(int32(cs.Properties.OrchestratorProfile.KubernetesConfig.CloudProviderRateLimitBucketWrite)), + Backoff: k8sConfig.CloudProviderBackoff, + BackoffMode: k8sConfig.CloudProviderBackoffMode, + BackoffRetries: to.Ptr(int32(k8sConfig.CloudProviderBackoffRetries)), + BackoffExponent: to.Ptr(k8sConfig.CloudProviderBackoffExponent), + BackoffDuration: to.Ptr(int32(k8sConfig.CloudProviderBackoffDuration)), + BackoffJitter: to.Ptr(k8sConfig.CloudProviderBackoffJitter), + RateLimit: k8sConfig.CloudProviderRateLimit, + RateLimitQps: to.Ptr(k8sConfig.CloudProviderRateLimitQPS), + RateLimitQpsWrite: to.Ptr(k8sConfig.CloudProviderRateLimitQPSWrite), + RateLimitBucket: to.Ptr(int32(k8sConfig.CloudProviderRateLimitBucket)), + RateLimitBucketWrite: to.Ptr(int32(k8sConfig.CloudProviderRateLimitBucketWrite)), }, PrimaryScaleSet: nbc.PrimaryScaleSetName, + LoadBalancerConfig: &aksnodeconfigv1.LoadBalancerConfig{ + LoadBalancerSku: aksnodeconfigv1.LoadBalancerSku_LOAD_BALANCER_SKU_STANDARD, + }, }, ApiServerConfig: &aksnodeconfigv1.ApiServerConfig{ ApiServerName: cs.Properties.HostedMasterProfile.FQDN, }, AuthConfig: &aksnodeconfigv1.AuthConfig{ ServicePrincipalId: cs.Properties.ServicePrincipalProfile.ClientID, - ServicePrincipalSecret: cs.Properties.ServicePrincipalProfile.Secret, + ServicePrincipalSecret: spSecret, TenantId: nbc.TenantID, SubscriptionId: nbc.SubscriptionID, AssignedIdentityId: nbc.UserAssignedIdentityClientID, @@ -204,13 +245,13 @@ func nbcToAKSNodeConfigV1(nbc *datamodel.NodeBootstrappingConfiguration) *aksnod NetworkConfig: &aksnodeconfigv1.NetworkConfig{ NetworkPlugin: aksnodeconfigv1.NetworkPlugin_NETWORK_PLUGIN_KUBENET, CniPluginsUrl: nbc.CloudSpecConfig.KubernetesSpecConfig.CNIPluginsDownloadURL, - VnetCniPluginsUrl: cs.Properties.OrchestratorProfile.KubernetesConfig.AzureCNIURLLinux, + VnetCniPluginsUrl: k8sConfig.AzureCNIURLLinux, }, 
GpuConfig: &aksnodeconfigv1.GpuConfig{ ConfigGpuDriver: true, GpuDevicePlugin: false, }, - EnableUnattendedUpgrade: true, + EnableUnattendedUpgrade: enableUnattendedUpgrade, KubernetesVersion: cs.Properties.OrchestratorProfile.OrchestratorVersion, ContainerdConfig: &aksnodeconfigv1.ContainerdConfig{ ContainerdDownloadUrlBase: nbc.CloudSpecConfig.KubernetesSpecConfig.ContainerdDownloadURLBase, @@ -218,17 +259,15 @@ func nbcToAKSNodeConfigV1(nbc *datamodel.NodeBootstrappingConfiguration) *aksnod OutboundCommand: helpers.GetDefaultOutboundCommand(), KubernetesCaCert: base64.StdEncoding.EncodeToString([]byte(cs.Properties.CertificateProfile.CaCertificate)), KubeBinaryConfig: &aksnodeconfigv1.KubeBinaryConfig{ - KubeBinaryUrl: cs.Properties.OrchestratorProfile.KubernetesConfig.CustomKubeBinaryURL, + KubeBinaryUrl: k8sConfig.CustomKubeBinaryURL, PodInfraContainerImageUrl: nbc.K8sComponents.PodInfraContainerImageURL, }, - KubeProxyUrl: cs.Properties.OrchestratorProfile.KubernetesConfig.CustomKubeProxyImage, - HttpProxyConfig: &aksnodeconfigv1.HttpProxyConfig{ - NoProxyEntries: *nbc.HTTPProxyConfig.NoProxy, - }, + KubeProxyUrl: k8sConfig.CustomKubeProxyImage, + HttpProxyConfig: httpProxyConfig, LocalDnsProfile: &aksnodeconfigv1.LocalDnsProfile{ EnableLocalDns: true, CpuLimitInMilliCores: to.Ptr(int32(2008)), - MemoryLimitInMb: to.Ptr(int32(256)), + MemoryLimitInMb: to.Ptr(int32(128)), VnetDnsOverrides: map[string]*aksnodeconfigv1.LocalDnsOverrides{ ".": { QueryLogging: "Log", @@ -238,23 +277,23 @@ func nbcToAKSNodeConfigV1(nbc *datamodel.NodeBootstrappingConfiguration) *aksnod MaxConcurrent: to.Ptr(int32(1000)), CacheDurationInSeconds: to.Ptr(int32(3600)), ServeStaleDurationInSeconds: to.Ptr(int32(3600)), - ServeStale: "Immediate", + ServeStale: "Verify", }, "cluster.local": { QueryLogging: "Error", Protocol: "ForceTCP", ForwardDestination: "ClusterCoreDNS", - ForwardPolicy: "RoundRobin", - MaxConcurrent: to.Ptr(int32(3000)), - CacheDurationInSeconds: to.Ptr(int32(7200)), 
- ServeStaleDurationInSeconds: to.Ptr(int32(4500)), + ForwardPolicy: "Sequential", + MaxConcurrent: to.Ptr(int32(1000)), + CacheDurationInSeconds: to.Ptr(int32(3600)), + ServeStaleDurationInSeconds: to.Ptr(int32(3600)), ServeStale: "Disable", }, "testdomain456.com": { QueryLogging: "Log", Protocol: "PreferUDP", - ForwardDestination: "VnetDNS", - ForwardPolicy: "Random", + ForwardDestination: "ClusterCoreDNS", + ForwardPolicy: "Sequential", MaxConcurrent: to.Ptr(int32(1000)), CacheDurationInSeconds: to.Ptr(int32(3600)), ServeStaleDurationInSeconds: to.Ptr(int32(3600)), @@ -305,6 +344,20 @@ func nbcToAKSNodeConfigV1(nbc *datamodel.NodeBootstrappingConfiguration) *aksnod // Therefore, we require client (e.g. AKS-RP) to provide the final kubelet config that is ready to be written to the final kubelet config file on a node. KubeletConfig: baseKubeletConfig(), } + + // Populate KubeletConfig fields from NBC that aren't in the static baseKubeletConfig. + cfg.KubeletConfig.KubeletClientKey = kubeletClientKey + + // Build kubelet flags from the NBC's KubeletConfig map, filtering the same way baker.go does. + if nbc.KubeletConfig != nil { + kubeletFlags := make(map[string]string) + for key, val := range nbc.KubeletConfig { + kubeletFlags[key] = val + } + cfg.KubeletConfig.KubeletFlags = kubeletFlags + } + + return cfg } // this is huge, but accurate, so leave it here. 
@@ -386,7 +439,7 @@ func baseTemplateLinux(t testing.TB, location string, k8sVersion string, arch st AzureCNIURLLinux: "https://packages.aks.azure.com/azure-cni/v1.6.21/binaries/azure-vnet-cni-linux-amd64-v1.6.21.tgz", AzureCNIURLARM64Linux: "", AzureCNIURLWindows: "", - MaximumLoadBalancerRuleCount: 250, + MaximumLoadBalancerRuleCount: 148, PrivateAzureRegistryServer: "", NetworkPluginMode: "", }, diff --git a/e2e/types.go b/e2e/types.go index 6b4a80ff10a..e3bcd31a03d 100644 --- a/e2e/types.go +++ b/e2e/types.go @@ -279,7 +279,7 @@ func (s *Scenario) KubeletConfigFileEnabled() bool { if s.Runtime == nil { return false } - if nodeConfig := s.Runtime.AKSNodeConfig; nodeConfig != nil && nodeConfig.KubeletConfig != nil && nodeConfig.KubeletConfig.EnableKubeletConfigFile && (s.Runtime.EnableScriptlessANC || s.Tags.Scriptless) { + if nodeConfig := s.Runtime.AKSNodeConfig; nodeConfig != nil && nodeConfig.KubeletConfig != nil && nodeConfig.KubeletConfig.EnableKubeletConfigFile { return true } if nbc := s.Runtime.NBC; nbc != nil && (nbc.EnableKubeletConfigFile || diff --git a/e2e/vmss.go b/e2e/vmss.go index c7599e171b0..317ef8ef0d7 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -102,7 +102,7 @@ func CustomDataWithHack(s *Scenario, nbcCmdScript, binaryURL string) (string, er } if nbcCmdScript != "" { flags = append(flags, "--nbc-cmd="+nbcCmdPath) - encodedNBCCSECmd = base64.StdEncoding.EncodeToString([]byte(nbcCmdScript)) + encodedNBCCSECmd = gzipAndBase64Encode([]byte(nbcCmdScript)) } provisionFlags := strings.Join(flags, " ") @@ -113,7 +113,7 @@ func CustomDataWithHack(s *Scenario, nbcCmdScript, binaryURL string) (string, er if err != nil { return "", fmt.Errorf("failed to marshal nbc, error: %w", err) } - encodedAksNodeConfigJSON = base64.StdEncoding.EncodeToString(aksNodeConfigJSON) + encodedAksNodeConfigJSON = gzipAndBase64Encode(aksNodeConfigJSON) } var customData string @@ -135,11 +135,11 @@ mkdir -p /opt/azure/containers /opt/azure/bin `) if encodedConfig != "" { - 
fmt.Fprintf(&sb, "cat <<'EOF' | base64 -d > %s\n%s\nEOF\nchmod 0600 %s\n", + fmt.Fprintf(&sb, "cat <<'EOF' | base64 -d | gzip -d > %s\n%s\nEOF\nchmod 0600 %s\n", configPath, encodedConfig, configPath) } if encodedNBCCmd != "" { - fmt.Fprintf(&sb, "\ncat <<'EOF' | base64 -d > %s\n%s\nEOF\nchmod 0755 %s\n", + fmt.Fprintf(&sb, "\ncat <<'EOF' | base64 -d | gzip -d > %s\n%s\nEOF\nchmod 0755 %s\n", nbcCmdPath, encodedNBCCmd, nbcCmdPath) } fmt.Fprintf(&sb, ` @@ -182,21 +182,29 @@ func buildFlatcarCloudConfig(encodedConfig, configPath, encodedNBCCmd, nbcCmdPat var sb strings.Builder sb.WriteString("#cloud-config\nwrite_files:\n") if encodedConfig != "" { - fmt.Fprintf(&sb, `- path: %s + fmt.Fprintf(&sb, `- path: %s.gz.b64 permissions: "0600" owner: root - content: !!binary | - %s + content: | + %s `, configPath, encodedConfig) } if encodedNBCCmd != "" { - fmt.Fprintf(&sb, `- path: %s - permissions: "0755" + fmt.Fprintf(&sb, `- path: %s.gz.b64 + permissions: "0644" owner: root - content: !!binary | - %s + content: | + %s `, nbcCmdPath, encodedNBCCmd) } + // Build a decode script that decompresses the gzipped+base64 files before running ANC. 
+ var decodeSteps strings.Builder + if encodedConfig != "" { + fmt.Fprintf(&decodeSteps, " base64 -d %s.gz.b64 | gzip -d > %s && chmod 0600 %s\n", configPath, configPath, configPath) + } + if encodedNBCCmd != "" { + fmt.Fprintf(&decodeSteps, " base64 -d %s.gz.b64 | gzip -d > %s && chmod 0755 %s\n", nbcCmdPath, nbcCmdPath, nbcCmdPath) + } fmt.Fprintf(&sb, `- path: /opt/azure/bin/run-aks-node-controller-hack.sh permissions: "0755" owner: root @@ -204,7 +212,7 @@ func buildFlatcarCloudConfig(encodedConfig, configPath, encodedNBCCmd, nbcCmdPat #!/bin/bash set -euo pipefail mkdir -p /opt/azure/bin - curl -fSL --retry 10 --retry-delay 2 "%s" -o /opt/azure/bin/aks-node-controller-hack +%s curl -fSL --retry 10 --retry-delay 2 "%s" -o /opt/azure/bin/aks-node-controller-hack chmod +x /opt/azure/bin/aks-node-controller-hack /opt/azure/bin/aks-node-controller-hack provision %s # Flatcar specific configuration. It supports only a subset of cloud-init features https://github.com/flatcar/coreos-cloudinit/blob/main/Documentation/cloud-config.md#coreos-parameters @@ -222,10 +230,21 @@ coreos: ExecStart=/opt/azure/bin/run-aks-node-controller-hack.sh [Install] WantedBy=multi-user.target -`, binaryURL, provisionFlags) +`, decodeSteps.String(), binaryURL, provisionFlags) return sb.String() } +// gzipAndBase64Encode compresses data with gzip then base64-encodes it. +// This matches the production baker.go approach (getBase64EncodedGzippedCustomScriptFromStr) +// to keep custom data within the Azure 65535-byte limit. +func gzipAndBase64Encode(data []byte) string { + var buf bytes.Buffer + w := gzip.NewWriter(&buf) + w.Write(data) + w.Close() + return base64.StdEncoding.EncodeToString(buf.Bytes()) +} + // CustomDataWithNBCCmdHack is similar to baker.boothooktemplate, but it uses a hack to run new aks-node-controller binary. 
// Original aks-node-controller isn't run because it fails systemd check validating aks-node-controller-config.json exists // (check aks-node-controller.service for details). @@ -375,9 +394,6 @@ func createVMSSModel(ctx context.Context, s *Scenario) armcompute.VirtualMachine var cse, customData string if s.Runtime.NBC != nil { - if s.Runtime.EnableScriptlessANC { - s.Runtime.NBC.EnableKubeletConfigFile = true - } nodeBootstrapping, err = ab.GetNodeBootstrapping(ctx, s.Runtime.NBC) require.NoError(s.T, err) } diff --git a/parts/linux/cloud-init/artifacts/cse_config.sh b/parts/linux/cloud-init/artifacts/cse_config.sh index 1bccaf924c7..6a7c4651867 100755 --- a/parts/linux/cloud-init/artifacts/cse_config.sh +++ b/parts/linux/cloud-init/artifacts/cse_config.sh @@ -1120,7 +1120,7 @@ configureSSHPubkeyAuth() { local ssh_use_pubkey_auth # Determine the desired pubkey auth setting - if [ "${disable_pubkey_auth}" = "true" ]; then +if [ "${disable_pubkey_auth}" = "true" ]; then ssh_use_pubkey_auth="no" else ssh_use_pubkey_auth="yes" diff --git a/pkg/agent/baker.go b/pkg/agent/baker.go index 485d1739a6a..fde16daa30e 100644 --- a/pkg/agent/baker.go +++ b/pkg/agent/baker.go @@ -1483,7 +1483,6 @@ func isMariner(osSku string) bool { const sysctlTemplateString = `# This is a partial workaround to this upstream Kubernetes issue: # https://github.com/kubernetes/kubernetes/issues/41916#issuecomment-312428731 -net.ipv4.tcp_retries2=8 net.core.message_burst=80 net.core.message_cost=40 {{- if .CustomLinuxOSConfig}} @@ -1574,6 +1573,7 @@ net.ipv4.ip_local_port_range={{$s.NetIpv4IpLocalPortRange}} net.ipv4.ip_local_reserved_ports=65330 {{- end}} {{- end}} +net.ipv4.tcp_retries2=8 {{- if $s.NetNetfilterNfConntrackMax}} net.netfilter.nf_conntrack_max={{$s.NetNetfilterNfConntrackMax}} {{- end}} @@ -1608,30 +1608,29 @@ vm.vfs_cache_pressure={{$s.VMVfsCachePressure}} {{- end}} ` -const kubenetCniTemplate = ` -{ - "cniVersion": "0.3.1", - "name": "kubenet", - "plugins": [{ - "type": "bridge", - 
"bridge": "cbr0", - "mtu": 1500, - "addIf": "eth0", - "isGateway": true, - "ipMasq": false, - "promiscMode": true, - "hairpinMode": false, - "ipam": { - "type": "host-local", - "ranges": [{{range $i, $range := .PodCIDRRanges}}{{if $i}}, {{end}}[{"subnet": "{{$range}}"}]{{end}}], - "routes": [{{range $i, $route := .Routes}}{{if $i}}, {{end}}{"dst": "{{$route}}"}{{end}}] - } - }, - { - "type": "portmap", - "capabilities": {"portMappings": true}, - "externalSetMarkChain": "KUBE-MARK-MASQ" - }] +const kubenetCniTemplate = `{ + "cniVersion": "0.3.1", + "name": "kubenet", + "plugins": [{ + "type": "bridge", + "bridge": "cbr0", + "mtu": 1500, + "addIf": "eth0", + "isGateway": true, + "ipMasq": false, + "promiscMode": true, + "hairpinMode": false, + "ipam": { + "type": "host-local", + "ranges": [{{range $i, $range := .PodCIDRRanges}}{{if $i}}, {{end}}[{"subnet": "{{$range}}"}]{{end}}], + "routes": [{{range $i, $route := .Routes}}{{if $i}}, {{end}}{"dst": "{{$route}}"}{{end}}] + } + }, + { + "type": "portmap", + "capabilities": {"portMappings": true}, + "externalSetMarkChain": "KUBE-MARK-MASQ" + }] } ` @@ -1985,8 +1984,7 @@ func GenerateLocalDNSCoreFile( // (mcr.microsoft.com, packages.aks.azure.com, etc.) are included in root domain server blocks. // When false, hosts blocks are omitted — used as a fallback when enableAKSLocalDNSHostsSetup fails at // provisioning time, following the same dual-config pattern used for containerd GPU/no-GPU configs. -const localDNSCoreFileTemplateString = ` -# *********************************************************************************** +const localDNSCoreFileTemplateString = `# *********************************************************************************** # WARNING: Changes to this file will be overwritten and not persisted. 
# *********************************************************************************** # whoami (used for health check of DNS) diff --git a/pkg/agent/utils.go b/pkg/agent/utils.go index a1bae0c8e5e..d4e7227fac3 100644 --- a/pkg/agent/utils.go +++ b/pkg/agent/utils.go @@ -393,11 +393,11 @@ func GetOrderedKubeletConfigFlagString(config *datamodel.NodeBootstrappingConfig } } sort.Strings(keys) - var buf bytes.Buffer + pairs := make([]string, 0, len(keys)) for _, key := range keys { - buf.WriteString(fmt.Sprintf("%s=%s ", key, k[key])) + pairs = append(pairs, fmt.Sprintf("%s=%s", key, k[key])) } - return buf.String() + return strings.Join(pairs, " ") } func getOrderedKubeletConfigFlagWithCustomConfigurationString(customConfig, defaultConfig map[string]string) string { @@ -418,11 +418,11 @@ func getOrderedKubeletConfigFlagWithCustomConfigurationString(customConfig, defa } } sort.Strings(keys) - var buf bytes.Buffer + pairs := make([]string, 0, len(keys)) for _, key := range keys { - buf.WriteString(fmt.Sprintf("%s=%s ", key, config[key])) + pairs = append(pairs, fmt.Sprintf("%s=%s", key, config[key])) } - return buf.String() + return strings.Join(pairs, " ") } func getKubeletCustomConfiguration(properties *datamodel.Properties) map[string]string { From 9e45ddeddb0dddfa36f537ca0b2c0563c6c5b0a3 Mon Sep 17 00:00:00 2001 From: Lily Pan Date: Tue, 19 May 2026 15:04:41 -0700 Subject: [PATCH 07/24] add validator --- aks-node-controller/parser/helper.go | 2 +- e2e/validation.go | 1 + e2e/validators.go | 13 +++++++++++++ pkg/agent/baker.go | 22 +++++++++++----------- 4 files changed, 26 insertions(+), 12 deletions(-) diff --git a/aks-node-controller/parser/helper.go b/aks-node-controller/parser/helper.go index 9411a173703..d9e3b6134f1 100644 --- a/aks-node-controller/parser/helper.go +++ b/aks-node-controller/parser/helper.go @@ -402,7 +402,7 @@ func getSysctlContent(s *aksnodeconfigv1.SysctlConfig) string { m["vm.vfs_cache_pressure"] = s.GetVmVfsCachePressure() } - return 
base64.StdEncoding.EncodeToString([]byte(createSortedKeyValuePairs(m, "\n"))) + return base64.StdEncoding.EncodeToString([]byte(createSortedKeyValuePairs(m, "\n") + "\n")) } func getShouldConfigContainerdUlimits(u *aksnodeconfigv1.UlimitConfig) bool { diff --git a/e2e/validation.go b/e2e/validation.go index 3cf65423dd0..3c02e8c6191 100644 --- a/e2e/validation.go +++ b/e2e/validation.go @@ -49,6 +49,7 @@ func ValidateCommonLinux(ctx context.Context, s *Scenario) { ValidateWaagentLog(ctx, s) ValidateScriptlessCSECmd(ctx, s) ValidateScriptlessNBCCSECmd(ctx, s) + ValidateScriptlessPhase3(ctx, s) ValidateNodeExporter(ctx, s) ValidateSysctlConfig(ctx, s, map[string]string{ diff --git a/e2e/validators.go b/e2e/validators.go index 6fdddb5ab9b..de98c3f1269 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -2565,6 +2565,19 @@ func ValidateScriptlessNBCCSECmd(ctx context.Context, s *Scenario) { } } +func ValidateScriptlessPhase3(ctx context.Context, s *Scenario) { + s.T.Helper() + if s.Runtime.NBC != nil && s.Runtime.AKSNodeConfig != nil { + logFile := "/var/log/azure/aks-node-controller.log" + if !fileHasContent(ctx, s, logFile, "env compare: no differences found between provision-config and nbc-cmd env vars") { + // Grep for "differs" lines to show what's different + diffCmd := "sudo grep 'differs' " + logFile + " || true" + result := execScriptOnVMForScenarioValidateExitCode(ctx, s, diffCmd, 0, "could not grep for differences in aks-node-controller.log") + s.T.Fatalf("expected no env var differences between provision-config and nbc-cmd, but found differences:\n%s", result.stdout) + } + } +} + // ValidateRxBufferDefault validates rx buffer config using default values based on VM's CPU count func ValidateRxBufferDefault(ctx context.Context, s *Scenario) { s.T.Helper() diff --git a/pkg/agent/baker.go b/pkg/agent/baker.go index fde16daa30e..baa293edb0e 100644 --- a/pkg/agent/baker.go +++ b/pkg/agent/baker.go @@ -1481,9 +1481,7 @@ func isMariner(osSku string) bool { 
return osSku == datamodel.OSSKUCBLMariner || osSku == datamodel.OSSKUMariner || osSku == datamodel.OSSKUAzureLinux } -const sysctlTemplateString = `# This is a partial workaround to this upstream Kubernetes issue: -# https://github.com/kubernetes/kubernetes/issues/41916#issuecomment-312428731 -net.core.message_burst=80 +const sysctlTemplateString = `net.core.message_burst=80 net.core.message_cost=40 {{- if .CustomLinuxOSConfig}} {{- if .CustomLinuxOSConfig.Sysctls}} @@ -1492,11 +1490,6 @@ net.core.somaxconn={{.CustomLinuxOSConfig.Sysctls.NetCoreSomaxconn}} {{- else}} net.core.somaxconn=16384 {{- end}} -{{- if .CustomLinuxOSConfig.Sysctls.NetIpv4TcpMaxSynBacklog}} -net.ipv4.tcp_max_syn_backlog={{.CustomLinuxOSConfig.Sysctls.NetIpv4TcpMaxSynBacklog}} -{{- else}} -net.ipv4.tcp_max_syn_backlog=16384 -{{- end}} {{- if .CustomLinuxOSConfig.Sysctls.NetIpv4NeighDefaultGcThresh1}} net.ipv4.neigh.default.gc_thresh1={{.CustomLinuxOSConfig.Sysctls.NetIpv4NeighDefaultGcThresh1}} {{- else}} @@ -1512,19 +1505,27 @@ net.ipv4.neigh.default.gc_thresh3={{.CustomLinuxOSConfig.Sysctls.NetIpv4NeighDef {{- else}} net.ipv4.neigh.default.gc_thresh3=16384 {{- end}} +{{- if .CustomLinuxOSConfig.Sysctls.NetIpv4TcpMaxSynBacklog}} +net.ipv4.tcp_max_syn_backlog={{.CustomLinuxOSConfig.Sysctls.NetIpv4TcpMaxSynBacklog}} {{- else}} -net.core.somaxconn=16384 net.ipv4.tcp_max_syn_backlog=16384 +{{- end}} +net.ipv4.tcp_retries2=8 +{{- else}} +net.core.somaxconn=16384 net.ipv4.neigh.default.gc_thresh1=4096 net.ipv4.neigh.default.gc_thresh2=8192 net.ipv4.neigh.default.gc_thresh3=16384 +net.ipv4.tcp_max_syn_backlog=16384 +net.ipv4.tcp_retries2=8 {{- end}} {{- else}} net.core.somaxconn=16384 -net.ipv4.tcp_max_syn_backlog=16384 net.ipv4.neigh.default.gc_thresh1=4096 net.ipv4.neigh.default.gc_thresh2=8192 net.ipv4.neigh.default.gc_thresh3=16384 +net.ipv4.tcp_max_syn_backlog=16384 +net.ipv4.tcp_retries2=8 {{- end}} {{- if .CustomLinuxOSConfig}} {{- if .CustomLinuxOSConfig.Sysctls}} @@ -1573,7 +1574,6 @@ 
net.ipv4.ip_local_port_range={{$s.NetIpv4IpLocalPortRange}} net.ipv4.ip_local_reserved_ports=65330 {{- end}} {{- end}} -net.ipv4.tcp_retries2=8 {{- if $s.NetNetfilterNfConntrackMax}} net.netfilter.nf_conntrack_max={{$s.NetNetfilterNfConntrackMax}} {{- end}} From eda17a8446d4a22b557142b0655f85675b1cf104 Mon Sep 17 00:00:00 2001 From: Lily Pan Date: Wed, 20 May 2026 14:52:44 -0700 Subject: [PATCH 08/24] do not run scriptless phase 3 e2es for dedicated scriptless scenarios --- e2e/test_helpers.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/e2e/test_helpers.go b/e2e/test_helpers.go index f7ecc055770..71d0a8e35cb 100644 --- a/e2e/test_helpers.go +++ b/e2e/test_helpers.go @@ -121,7 +121,7 @@ func supportsScriptlessNBCCSECmd(s *Scenario) bool { } func supportsScriptlessAKSNodeConfig(s *Scenario) bool { - return s.AKSNodeConfigMutator != nil && !s.IsWindows() && len(s.Config.CustomDataWriteFiles) <= 0 && !s.VHDCaching && !config.Config.TestPreProvision + return s.AKSNodeConfigMutator != nil && s.BootstrapConfigMutator != nil && !s.IsWindows() && len(s.Config.CustomDataWriteFiles) <= 0 && !s.VHDCaching && !config.Config.TestPreProvision } func runScenarioWithPreProvision(t *testing.T, original *Scenario) { From eed133e935098d968745a9fddfdef1b58e5f5e95 Mon Sep 17 00:00:00 2001 From: Lily Pan Date: Thu, 21 May 2026 11:38:38 -0700 Subject: [PATCH 09/24] fix vnetcniplugins conversion and phase 3 validation check --- e2e/node_config.go | 9 +++++++-- e2e/validators.go | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/e2e/node_config.go b/e2e/node_config.go index 63197dc4801..74485ed0f88 100644 --- a/e2e/node_config.go +++ b/e2e/node_config.go @@ -195,7 +195,11 @@ func nbcToAKSNodeConfigV1(nbc *datamodel.NodeBootstrappingConfiguration) *aksnod // Derive EnableUnattendedUpgrade from NBC (baker uses !DisableUnattendedUpgrades). 
enableUnattendedUpgrade := !nbc.DisableUnattendedUpgrades - //config.GetClusterConfig().GetLoadBalancerConfig().GetLoadBalancerSku() + vnetCNIPluginURL := nbc.CloudSpecConfig.KubernetesSpecConfig.VnetCNILinuxPluginsDownloadURL + if nbc.IsARM64 { + vnetCNIPluginURL = nbc.CloudSpecConfig.KubernetesSpecConfig.VnetCNIARM64LinuxPluginsDownloadURL + } + cfg := &aksnodeconfigv1.Configuration{ Version: "v1", BootstrappingConfig: bootstrappingConfig, @@ -245,13 +249,14 @@ func nbcToAKSNodeConfigV1(nbc *datamodel.NodeBootstrappingConfiguration) *aksnod NetworkConfig: &aksnodeconfigv1.NetworkConfig{ NetworkPlugin: aksnodeconfigv1.NetworkPlugin_NETWORK_PLUGIN_KUBENET, CniPluginsUrl: nbc.CloudSpecConfig.KubernetesSpecConfig.CNIPluginsDownloadURL, - VnetCniPluginsUrl: k8sConfig.AzureCNIURLLinux, + VnetCniPluginsUrl: vnetCNIPluginURL, }, GpuConfig: &aksnodeconfigv1.GpuConfig{ ConfigGpuDriver: true, GpuDevicePlugin: false, }, EnableUnattendedUpgrade: enableUnattendedUpgrade, + EnableArtifactStreaming: nbc.EnableArtifactStreaming, KubernetesVersion: cs.Properties.OrchestratorProfile.OrchestratorVersion, ContainerdConfig: &aksnodeconfigv1.ContainerdConfig{ ContainerdDownloadUrlBase: nbc.CloudSpecConfig.KubernetesSpecConfig.ContainerdDownloadURLBase, diff --git a/e2e/validators.go b/e2e/validators.go index de98c3f1269..3f44985bace 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -2567,7 +2567,7 @@ func ValidateScriptlessNBCCSECmd(ctx context.Context, s *Scenario) { func ValidateScriptlessPhase3(ctx context.Context, s *Scenario) { s.T.Helper() - if s.Runtime.NBC != nil && s.Runtime.AKSNodeConfig != nil { + if s.AKSNodeConfigMutator != nil && s.BootstrapConfigMutator != nil { logFile := "/var/log/azure/aks-node-controller.log" if !fileHasContent(ctx, s, logFile, "env compare: no differences found between provision-config and nbc-cmd env vars") { // Grep for "differs" lines to show what's different From 0b51e0c01da6d1b7a249867b14f297fa590b55be Mon Sep 17 00:00:00 2001 From: 
Lily Pan Date: Fri, 22 May 2026 15:31:59 -0700 Subject: [PATCH 10/24] fix scriptless phase 3 validator condition --- e2e/validators.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/e2e/validators.go b/e2e/validators.go index 205df14fcc9..d381ccd21c5 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -2683,7 +2683,7 @@ func ValidateScriptlessNBCCSECmd(ctx context.Context, s *Scenario) { // ValidateScriptlessPhase3 validates that there are not diffs between ANC generated cse cmd NBC cse cmd vars func ValidateScriptlessPhase3(ctx context.Context, s *Scenario) { s.T.Helper() - if s.AKSNodeConfigMutator != nil && s.BootstrapConfigMutator != nil { + if s.Runtime.EnableScriptlessANC { logFile := "/var/log/azure/aks-node-controller.log" if !fileHasContent(ctx, s, logFile, "env compare: no differences found between provision-config and nbc-cmd env vars") { // Grep for "differs" lines to show what's different From b05096952b27db8da39ba82d937f6a1f35c9ca60 Mon Sep 17 00:00:00 2001 From: Lily Pan Date: Tue, 26 May 2026 17:15:45 -0700 Subject: [PATCH 11/24] fix more diffs --- aks-node-controller/app.go | 71 ++++++++++++------- .../parser/templates/containerd.toml.gtpl | 4 +- .../templates/containerd_no_GPU.toml.gtpl | 4 +- e2e/node_config.go | 2 +- pkg/agent/baker.go | 22 +++--- 5 files changed, 60 insertions(+), 43 deletions(-) diff --git a/aks-node-controller/app.go b/aks-node-controller/app.go index b68a4a81787..943aeab890a 100644 --- a/aks-node-controller/app.go +++ b/aks-node-controller/app.go @@ -23,12 +23,17 @@ import ( "github.com/urfave/cli/v3" ) -var deprecatedCSEVars = map[string]bool{ - "CLOUD_INIT_STATUS_SCRIPT": true, - "HYPERKUBE_URL": true, - "MCR_REPOSITORY_BASE": true, - "BLOCK_OUTBOUND_NETWORK": true, - "DISABLE_PUBKEY_AUTH": true, +func isDeprecatedCSEVar(key string) bool { + switch key { + case "CLOUD_INIT_STATUS_SCRIPT", + "HYPERKUBE_URL", + "MCR_REPOSITORY_BASE", + "BLOCK_OUTBOUND_NETWORK", + // DISABLE_PUBKEY_AUTH is now 
computed in scripts, and CSE var is not used + "DISABLE_PUBKEY_AUTH": + return true + } + return false } type App struct { @@ -265,14 +270,7 @@ func compareEnvs(ctx context.Context, flags ProvisionFlags, eventLogger *helpers } // Extract CSE-specific env vars from provision config by filtering out unmodified OS env vars. - osEnv := envSliceToMap(os.Environ()) - pcAllEnv := envSliceToMap(provisionConfigCmd.Env) - pcEnv := make(map[string]string, len(pcAllEnv)) - for k, v := range pcAllEnv { - if osVal, inOS := osEnv[k]; !inOS || osVal != v { - pcEnv[k] = v - } - } + pcEnv := extractCSEEnvVars(provisionConfigCmd.Env) // Parse env vars directly from the NBC command file content. nbcCmdContent, err := os.ReadFile(flags.NBCCmd) @@ -282,8 +280,36 @@ func compareEnvs(ctx context.Context, flags ProvisionFlags, eventLogger *helpers } nbcEnv := parseEnvVarsFromNBCCmdContent(string(nbcCmdContent)) - // Collect all keys from both environments. - allKeys := make(map[string]struct{}) + diffs := diffEnvMaps(pcEnv, nbcEnv) + + now := time.Now() + if len(diffs) == 0 { + slog.Info("env compare: no differences found between provision-config and nbc-cmd env vars") + eventLogger.LogEvent("CompareEnvs", "env vars match between provision-config and nbc-cmd", helpers.EventLevelInformational, now, now) + } else { + message := fmt.Sprintf("env var differences (%d): %s", len(diffs), strings.Join(diffs, "; ")) + slog.Info(message) + eventLogger.LogEvent("CompareEnvs", message, helpers.EventLevelInformational, now, now) + } +} + +// extractCSEEnvVars filters a command's env slice to only CSE-specific variables +// by removing entries that match the current OS environment. 
+func extractCSEEnvVars(cmdEnv []string) map[string]string { + osEnv := envSliceToMap(os.Environ()) + allEnv := envSliceToMap(cmdEnv) + cseEnv := make(map[string]string, len(allEnv)) + for k, v := range allEnv { + if osVal, inOS := osEnv[k]; !inOS || osVal != v { + cseEnv[k] = v + } + } + return cseEnv +} + +// diffEnvMaps compares two environment variable maps and returns a sorted list of human-readable differences. +func diffEnvMaps(pcEnv, nbcEnv map[string]string) []string { + allKeys := make(map[string]struct{}, len(pcEnv)+len(nbcEnv)) for k := range pcEnv { allKeys[k] = struct{}{} } @@ -305,23 +331,14 @@ func compareEnvs(ctx context.Context, flags ProvisionFlags, eventLogger *helpers case inPC && !inNBC: diffs = append(diffs, fmt.Sprintf("only-in-pc: %s = %q", key, pcVal)) case !inPC && inNBC: - if !deprecatedCSEVars[key] { + if !isDeprecatedCSEVar(key) { diffs = append(diffs, fmt.Sprintf("only-in-nbc: %s = %q", key, nbcVal)) } case pcVal != nbcVal: diffs = append(diffs, fmt.Sprintf("differs: %s pc=%q nbc=%q", key, pcVal, nbcVal)) } } - - now := time.Now() - if len(diffs) == 0 { - slog.Info("env compare: no differences found between provision-config and nbc-cmd env vars") - eventLogger.LogEvent("CompareEnvs", "env vars match between provision-config and nbc-cmd", helpers.EventLevelInformational, now, now) - } else { - message := fmt.Sprintf("env var differences (%d): %s", len(diffs), strings.Join(diffs, "; ")) - slog.Info(message) - eventLogger.LogEvent("CompareEnvs", message, helpers.EventLevelInformational, now, now) - } + return diffs } // parseEnvVarsFromNBCCmdContent extracts environment variable assignments from an NBC command string. 
diff --git a/aks-node-controller/parser/templates/containerd.toml.gtpl b/aks-node-controller/parser/templates/containerd.toml.gtpl index db8d87d130b..6b9d7442fdb 100644 --- a/aks-node-controller/parser/templates/containerd.toml.gtpl +++ b/aks-node-controller/parser/templates/containerd.toml.gtpl @@ -56,8 +56,8 @@ root = "{{.KubeletConfig.GetContainerDataDir}}"{{- end}} {{- if .GetEnableArtifactStreaming }} [proxy_plugins] [proxy_plugins.overlaybd] - type = "snapshot" - address = "/run/overlaybd-snapshotter/overlaybd.sock" + type = "snapshot" + address = "/run/overlaybd-snapshotter/overlaybd.sock" {{- end}} {{- if .GetIsKata }} [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.kata] diff --git a/aks-node-controller/parser/templates/containerd_no_GPU.toml.gtpl b/aks-node-controller/parser/templates/containerd_no_GPU.toml.gtpl index eda56f66eca..2eb27cab54c 100644 --- a/aks-node-controller/parser/templates/containerd_no_GPU.toml.gtpl +++ b/aks-node-controller/parser/templates/containerd_no_GPU.toml.gtpl @@ -40,8 +40,8 @@ root = "{{.KubeletConfig.GetContainerDataDir}}"{{- end}} {{- if .GetEnableArtifactStreaming }} [proxy_plugins] [proxy_plugins.overlaybd] - type = "snapshot" - address = "/run/overlaybd-snapshotter/overlaybd.sock" + type = "snapshot" + address = "/run/overlaybd-snapshotter/overlaybd.sock" {{- end}} {{- if .GetIsKata }} [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.kata] diff --git a/e2e/node_config.go b/e2e/node_config.go index 74485ed0f88..c037248f580 100644 --- a/e2e/node_config.go +++ b/e2e/node_config.go @@ -642,7 +642,7 @@ func baseTemplateLinux(t testing.TB, location string, k8sVersion string, arch st KubeBinariesSASURLBase: "https://packages.aks.azure.com/kubernetes/", WindowsTelemetryGUID: "fb801154-36b9-41bc-89c2-f4d4f05472b0", CNIPluginsDownloadURL: "https://packages.aks.azure.com/cni/cni-plugins-amd64-v0.7.6.tgz", - VnetCNILinuxPluginsDownloadURL: 
"https://packages.aks.azure.com/azure-cni/v1.1.3/binaries/azure-vnet-cni-linux-amd64-v1.1.3.tgz", + VnetCNILinuxPluginsDownloadURL: "https://packages.aks.azure.com/azure-cni/v1.6.21/binaries/azure-vnet-cni-linux-amd64-v1.6.21.tgz", VnetCNIWindowsPluginsDownloadURL: "https://packages.aks.azure.com/azure-cni/v1.1.3/binaries/azure-vnet-cni-singletenancy-windows-amd64-v1.1.3.zip", ContainerdDownloadURLBase: "https://storage.googleapis.com/cri-containerd-release/", CSIProxyDownloadURL: "https://packages.aks.azure.com/csi-proxy/v0.1.0/binaries/csi-proxy.tar.gz", diff --git a/pkg/agent/baker.go b/pkg/agent/baker.go index baa293edb0e..a5aeadd136b 100644 --- a/pkg/agent/baker.go +++ b/pkg/agent/baker.go @@ -1716,10 +1716,10 @@ root = "{{GetDataDir}}"{{- end}} type = "snapshot" address = "/run/containerd/tardev-snapshotter.sock" [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.kata-cc] - pod_annotations = ["io.katacontainers.*"] snapshotter = "tardev" runtime_type = "io.containerd.kata-cc.v2" privileged_without_host_devices = true + pod_annotations = ["io.katacontainers.*"] [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.kata-cc.options] ConfigPath = "/opt/confidential-containers/share/defaults/kata-containers/configuration-clh-snp.toml" {{- end}} @@ -1747,9 +1747,9 @@ root = "{{GetDataDir}}"{{- end}} default_runtime_name = "nvidia-container-runtime" [plugins."io.containerd.cri.v1.runtime".containerd.runtimes.nvidia-container-runtime] runtime_type = "io.containerd.runc.v2" - [plugins."io.containerd.cri.v1.runtime".containerd.runtimes.nvidia-container-runtime.options] - BinaryName = "/usr/bin/nvidia-container-runtime" - SystemdCgroup = true + [plugins."io.containerd.cri.v1.runtime".containerd.runtimes.nvidia-container-runtime.options] + BinaryName = "/usr/bin/nvidia-container-runtime" + SystemdCgroup = true [plugins."io.containerd.cri.v1.runtime".containerd.runtimes.untrusted] runtime_type = "io.containerd.runc.v2" 
[plugins."io.containerd.cri.v1.runtime".containerd.runtimes.untrusted.options] @@ -1758,13 +1758,13 @@ root = "{{GetDataDir}}"{{- end}} default_runtime_name = "runc" [plugins."io.containerd.cri.v1.runtime".containerd.runtimes.runc] runtime_type = "io.containerd.runc.v2" - [plugins."io.containerd.cri.v1.runtime".containerd.runtimes.runc.options] - BinaryName = "/usr/bin/runc" - SystemdCgroup = true + [plugins."io.containerd.cri.v1.runtime".containerd.runtimes.runc.options] + BinaryName = "/usr/bin/runc" + SystemdCgroup = true [plugins."io.containerd.cri.v1.runtime".containerd.runtimes.untrusted] runtime_type = "io.containerd.runc.v2" - [plugins."io.containerd.cri.v1.runtime".containerd.runtimes.untrusted.options] - BinaryName = "/usr/bin/runc" + [plugins."io.containerd.cri.v1.runtime".containerd.runtimes.untrusted.options] + BinaryName = "/usr/bin/runc" {{- end}} {{- if and (IsKubenet) (not HasCalicoNetworkPolicy) }} [plugins."io.containerd.cri.v1.runtime".cni] @@ -1793,10 +1793,10 @@ root = "{{GetDataDir}}"{{- end}} type = "snapshot" address = "/run/containerd/tardev-snapshotter.sock" [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.kata-cc] - pod_annotations = ["io.katacontainers.*"] snapshotter = "tardev" runtime_type = "io.containerd.kata-cc.v2" privileged_without_host_devices = true + pod_annotations = ["io.katacontainers.*"] [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.kata-cc.options] ConfigPath = "/opt/confidential-containers/share/defaults/kata-containers/configuration-clh-snp.toml" {{- end}} @@ -1915,10 +1915,10 @@ root = "{{GetDataDir}}"{{- end}} type = "snapshot" address = "/run/containerd/tardev-snapshotter.sock" [plugins."io.containerd.grpc.v1.cri".containerd.runtimes.kata-cc] - pod_annotations = ["io.katacontainers.*"] snapshotter = "tardev" runtime_type = "io.containerd.kata-cc.v2" privileged_without_host_devices = true + pod_annotations = ["io.katacontainers.*"] 
[plugins."io.containerd.grpc.v1.cri".containerd.runtimes.kata-cc.options] ConfigPath = "/opt/confidential-containers/share/defaults/kata-containers/configuration-clh-snp.toml" {{- end}} From c181e2510551ddf9c919820508cb0801520c04f8 Mon Sep 17 00:00:00 2001 From: Lily Pan Date: Wed, 27 May 2026 09:55:20 -0700 Subject: [PATCH 12/24] fix diffs --- aks-node-controller/app.go | 4 +--- aks-node-controller/parser/parser.go | 1 + pkg/agent/baker.go | 9 --------- 3 files changed, 2 insertions(+), 12 deletions(-) diff --git a/aks-node-controller/app.go b/aks-node-controller/app.go index 943aeab890a..9134805fee0 100644 --- a/aks-node-controller/app.go +++ b/aks-node-controller/app.go @@ -28,9 +28,7 @@ func isDeprecatedCSEVar(key string) bool { case "CLOUD_INIT_STATUS_SCRIPT", "HYPERKUBE_URL", "MCR_REPOSITORY_BASE", - "BLOCK_OUTBOUND_NETWORK", - // DISABLE_PUBKEY_AUTH is now computed in scripts, and CSE var is not used - "DISABLE_PUBKEY_AUTH": + "BLOCK_OUTBOUND_NETWORK": return true } return false diff --git a/aks-node-controller/parser/parser.go b/aks-node-controller/parser/parser.go index 0561254c8bb..ec2cc09cec2 100644 --- a/aks-node-controller/parser/parser.go +++ b/aks-node-controller/parser/parser.go @@ -188,6 +188,7 @@ func getCSEEnv(config *aksnodeconfigv1.Configuration) map[string]string { "LOCALDNS_GENERATED_COREFILE": getLocalDnsCorefileBase64WithHostsPlugin(config, false), "LOCALDNS_COREFILE_BASE": getLocalDnsCorefileBase64WithHostsPlugin(config, false), "LOCALDNS_COREFILE_WITH_HOSTS": getLocalDnsCorefileBase64WithHostsPlugin(config, true), + "DISABLE_PUBKEY_AUTH": fmt.Sprintf("%v", config.GetDisablePubkeyAuth()), "SERVICE_ACCOUNT_IMAGE_PULL_ENABLED": fmt.Sprintf("%v", config.GetServiceAccountImagePullProfile().GetEnabled()), "SERVICE_ACCOUNT_IMAGE_PULL_DEFAULT_CLIENT_ID": config.GetServiceAccountImagePullProfile().GetDefaultClientId(), "SERVICE_ACCOUNT_IMAGE_PULL_DEFAULT_TENANT_ID": config.GetServiceAccountImagePullProfile().GetDefaultTenantId(), diff --git 
a/pkg/agent/baker.go b/pkg/agent/baker.go index a5aeadd136b..a7d2604cfd6 100644 --- a/pkg/agent/baker.go +++ b/pkg/agent/baker.go @@ -1732,7 +1732,6 @@ root = "{{GetDataDir}}"{{- end}} snapshotter = "overlaybd" disable_snapshot_annotations = false {{- end}} - [plugins."io.containerd.cri.v1.images".pinned_images] sandbox = "{{GetPodInfraContainerSpec}}" {{- if IsKubernetesVersionGe "1.22.0"}} @@ -1741,7 +1740,6 @@ root = "{{GetDataDir}}"{{- end}} {{- end}} [plugins."io.containerd.cri.v1.images".registry.headers] X-Meta-Source-Client = ["azure/aks"] - [plugins."io.containerd.cri.v1.runtime".containerd] {{- if IsNSeriesSKU }} default_runtime_name = "nvidia-container-runtime" @@ -1772,10 +1770,8 @@ root = "{{GetDataDir}}"{{- end}} conf_dir = "/etc/cni/net.d" conf_template = "/etc/containerd/kubenet_template.conf" {{- end}} - [metrics] address = "0.0.0.0:10257" - {{- if IsArtifactStreamingEnabled }} [proxy_plugins] [proxy_plugins.overlaybd] @@ -1804,13 +1800,11 @@ root = "{{GetDataDir}}"{{- end}} containerdV2NoGPUConfigTemplate ContainerdConfigTemplate = `version = 2 oom_score = -999{{if HasDataDir }} root = "{{GetDataDir}}"{{- end}} - [plugins."io.containerd.cri.v1.images"] {{- if IsArtifactStreamingEnabled }} snapshotter = "overlaybd" disable_snapshot_annotations = false {{- end}} - [plugins."io.containerd.cri.v1.images".pinned_images] sandbox = "{{GetPodInfraContainerSpec}}" {{- if IsKubernetesVersionGe "1.22.0"}} @@ -1819,7 +1813,6 @@ root = "{{GetDataDir}}"{{- end}} {{- end}} [plugins."io.containerd.cri.v1.images".registry.headers] X-Meta-Source-Client = ["azure/aks"] - [plugins."io.containerd.cri.v1.runtime".containerd] default_runtime_name = "runc" [plugins."io.containerd.cri.v1.runtime".containerd.runtimes.runc] @@ -1837,10 +1830,8 @@ root = "{{GetDataDir}}"{{- end}} conf_dir = "/etc/cni/net.d" conf_template = "/etc/containerd/kubenet_template.conf" {{- end}} - [metrics] address = "0.0.0.0:10257" - {{- if IsArtifactStreamingEnabled }} [proxy_plugins] 
[proxy_plugins.overlaybd] From 1eb9ade993112e22d7b081c6cda889542452c474 Mon Sep 17 00:00:00 2001 From: Lily Pan Date: Wed, 27 May 2026 11:15:49 -0700 Subject: [PATCH 13/24] do not enable phase 3 in ubuntu2404 e2e for now --- aks-node-controller/parser/helper.go | 1 + aks-node-controller/parser/helper_test.go | 6 ++++-- e2e/scenario_test.go | 6 ------ 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/aks-node-controller/parser/helper.go b/aks-node-controller/parser/helper.go index d9e3b6134f1..d28a4a76d4e 100644 --- a/aks-node-controller/parser/helper.go +++ b/aks-node-controller/parser/helper.go @@ -181,6 +181,7 @@ func containerdConfigFromAKSNodeConfig(aksnodeconfig *aksnodeconfigv1.Configurat return "", fmt.Errorf("AKSNodeConfig is nil") } + // TODO: add containerdv2 support // the containerd config template is different based on whether the node is with GPU or not. _template := containerdConfigTemplate if noGPU { diff --git a/aks-node-controller/parser/helper_test.go b/aks-node-controller/parser/helper_test.go index 2592d12c01a..0f9bdbb0b06 100644 --- a/aks-node-controller/parser/helper_test.go +++ b/aks-node-controller/parser/helper_test.go @@ -166,7 +166,8 @@ net.ipv4.neigh.default.gc_thresh1=4096 net.ipv4.neigh.default.gc_thresh2=8192 net.ipv4.neigh.default.gc_thresh3=16384 net.ipv4.tcp_max_syn_backlog=16384 -net.ipv4.tcp_retries2=8`)), +net.ipv4.tcp_retries2=8 +`)), }, { name: "SysctlConfig with custom values", @@ -187,7 +188,8 @@ net.ipv4.neigh.default.gc_thresh1=4096 net.ipv4.neigh.default.gc_thresh2=8192 net.ipv4.neigh.default.gc_thresh3=16384 net.ipv4.tcp_max_syn_backlog=9999 -net.ipv4.tcp_retries2=8`)), +net.ipv4.tcp_retries2=8 +`)), }, } for _, tt := range tests { diff --git a/e2e/scenario_test.go b/e2e/scenario_test.go index 785d81082da..00a46060de7 100644 --- a/e2e/scenario_test.go +++ b/e2e/scenario_test.go @@ -850,8 +850,6 @@ func Test_Ubuntu2004FIPS(t *testing.T) { VHD: config.VHDUbuntu2004FIPSContainerd, BootstrapConfigMutator: func(_ 
*Cluster, nbc *datamodel.NodeBootstrappingConfiguration) { }, - AKSNodeConfigMutator: func(_ *Cluster, config *aksnodeconfigv1.Configuration) { - }, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { }, Validator: func(ctx context.Context, s *Scenario) { @@ -1510,10 +1508,6 @@ func Test_Ubuntu2404_ArtifactStreaming_ARM64(t *testing.T) { nbc.AgentPoolProfile.VMSize = "Standard_D2pds_V5" nbc.IsARM64 = true }, - AKSNodeConfigMutator: func(_ *Cluster, config *aksnodeconfigv1.Configuration) { - config.EnableArtifactStreaming = true - config.VmSize = "Standard_D2pds_V5" - }, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { vmss.SKU.Name = to.Ptr("Standard_D2pds_V5") }, From 093c4f7ebaffea0ca6c13c6ac0088228160a584d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 27 May 2026 18:30:25 +0000 Subject: [PATCH 14/24] fix: update unit tests to remove expected trailing space from GetOrderedKubeletConfigFlagString Co-authored-by: lilypan26 <106703606+lilypan26@users.noreply.github.com> --- pkg/agent/baker_test.go | 2 +- pkg/agent/utils_test.go | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pkg/agent/baker_test.go b/pkg/agent/baker_test.go index 20748e7acae..9ab5cb74da9 100644 --- a/pkg/agent/baker_test.go +++ b/pkg/agent/baker_test.go @@ -1060,7 +1060,7 @@ var _ = Describe("getLinuxNodeCSECommand", func() { vars, err := getDecodedVarsFromCseCmd([]byte(cseCmd)) Expect(err).NotTo(HaveOccurred()) Expect(vars).To(HaveKey("KUBELET_FLAGS")) - Expect(vars["KUBELET_FLAGS"]).To(Equal("--image-gc-high-threshold=85 --max-pods=110 --pod-max-pids=-1 ")) + Expect(vars["KUBELET_FLAGS"]).To(Equal("--image-gc-high-threshold=85 --max-pods=110 --pod-max-pids=-1")) }) It("should handle different distros", func() { diff --git a/pkg/agent/utils_test.go b/pkg/agent/utils_test.go index b8b42429ff2..94e84cb07cb 100644 --- a/pkg/agent/utils_test.go +++ b/pkg/agent/utils_test.go @@ 
-581,7 +581,7 @@ var _ = Describe("Test GetOrderedKubeletConfigFlagString", func() { AgentPoolProfile: &datamodel.AgentPoolProfile{}, } actucalStr := GetOrderedKubeletConfigFlagString(config) - expectStr := "--event-qps=0 --image-gc-high-threshold=85 --node-status-update-frequency=10s " + expectStr := "--event-qps=0 --image-gc-high-threshold=85 --node-status-update-frequency=10s" Expect(expectStr).To(Equal(actucalStr)) }) @@ -614,7 +614,7 @@ var _ = Describe("Test GetOrderedKubeletConfigFlagString", func() { AgentPoolProfile: &datamodel.AgentPoolProfile{}, } - expectStr := "--event-qps=0 --image-gc-high-threshold=85 --node-status-update-frequency=20s --seccomp-default=true --streaming-connection-idle-timeout=4h0m0s " + expectStr := "--event-qps=0 --image-gc-high-threshold=85 --node-status-update-frequency=20s --seccomp-default=true --streaming-connection-idle-timeout=4h0m0s" actucalStr := GetOrderedKubeletConfigFlagString(config) Expect(expectStr).To(Equal(actucalStr)) }) @@ -642,7 +642,7 @@ var _ = Describe("Test GetOrderedKubeletConfigFlagString", func() { AgentPoolProfile: &datamodel.AgentPoolProfile{}, } - expectedStr := "--node-labels=topology.kubernetes.io/region=southcentralus " + expectedStr := "--node-labels=topology.kubernetes.io/region=southcentralus" actualStr := GetOrderedKubeletConfigFlagString(config) Expect(expectedStr).To(Equal(actualStr)) }) @@ -677,7 +677,7 @@ var _ = Describe("Test GetOrderedKubeletConfigFlagString", func() { }, } - expectedStr := "--node-labels=topology.kubernetes.io/region=southcentralus --seccomp-default=true " + expectedStr := "--node-labels=topology.kubernetes.io/region=southcentralus --seccomp-default=true" actualStr := GetOrderedKubeletConfigFlagString(config) Expect(expectedStr).To(Equal(actualStr)) }) From 1e7befc13e018abccb8b5e2144d150ba1db6b45a Mon Sep 17 00:00:00 2001 From: Lily Pan Date: Wed, 27 May 2026 11:33:18 -0700 Subject: [PATCH 15/24] update UTs --- pkg/agent/baker_test.go | 9 +++------ 1 file changed, 3 
insertions(+), 6 deletions(-) diff --git a/pkg/agent/baker_test.go b/pkg/agent/baker_test.go index 20748e7acae..dae4aea09b7 100644 --- a/pkg/agent/baker_test.go +++ b/pkg/agent/baker_test.go @@ -27,8 +27,7 @@ import ( - KEY="VALUE WITH WHITSPACE". */ const cseRegexString = `([^=\s]+)=(\"[^\"]*\"|[^\s]*)` -const expectedlocalDNSCorefileWithoutOverrides = ` -# *********************************************************************************** +const expectedlocalDNSCorefileWithoutOverrides = `# *********************************************************************************** # WARNING: Changes to this file will be overwritten and not persisted. # *********************************************************************************** # whoami (used for health check of DNS) @@ -403,8 +402,7 @@ var _ = Describe("Assert generated customData and cseCmd", func() { Expect(err).To(BeNil()) Expect(localDNSCoreFile).ToNot(BeEmpty()) - expectedlocalDNSCorefile := ` -# *********************************************************************************** + expectedlocalDNSCorefile := `# *********************************************************************************** # WARNING: Changes to this file will be overwritten and not persisted. # *********************************************************************************** # whoami (used for health check of DNS) @@ -593,8 +591,7 @@ testdomain456.com:53 { Expect(err).To(BeNil()) Expect(localDNSCoreFile).ToNot(BeEmpty()) - expectedlocalDNSCorefile := ` -# *********************************************************************************** + expectedlocalDNSCorefile := `# *********************************************************************************** # WARNING: Changes to this file will be overwritten and not persisted. 
# *********************************************************************************** # whoami (used for health check of DNS) From 60d6e92e28464c0a0db0fb699d5fb8adf59942e9 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 27 May 2026 20:35:41 +0000 Subject: [PATCH 16/24] fix: quote proxy values in getProxyVariables to prevent command injection Co-authored-by: lilypan26 <106703606+lilypan26@users.noreply.github.com> --- aks-node-controller/parser/helper.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/aks-node-controller/parser/helper.go b/aks-node-controller/parser/helper.go index d28a4a76d4e..179ca0f1736 100644 --- a/aks-node-controller/parser/helper.go +++ b/aks-node-controller/parser/helper.go @@ -689,13 +689,13 @@ func getProxyVariables(proxyConfig *aksnodeconfigv1.HttpProxyConfig) string { proxyVars := "" if proxyConfig.GetHttpProxy() != "" { // from https://curl.se/docs/manual.html, curl uses http_proxy but uppercase for others? 
- proxyVars = fmt.Sprintf("export http_proxy=%s;", proxyConfig.GetHttpProxy()) + proxyVars = fmt.Sprintf("export http_proxy=\"%s\";", proxyConfig.GetHttpProxy()) } if proxyConfig.GetHttpsProxy() != "" { - proxyVars = fmt.Sprintf("export HTTPS_PROXY=%s; %s", proxyConfig.GetHttpsProxy(), proxyVars) + proxyVars = fmt.Sprintf("export HTTPS_PROXY=\"%s\"; %s", proxyConfig.GetHttpsProxy(), proxyVars) } if proxyConfig.GetNoProxyEntries() != nil { - proxyVars = fmt.Sprintf("export NO_PROXY=%s; %s", strings.Join(proxyConfig.GetNoProxyEntries(), ","), proxyVars) + proxyVars = fmt.Sprintf("export NO_PROXY=\"%s\"; %s", strings.Join(proxyConfig.GetNoProxyEntries(), ","), proxyVars) } return proxyVars } From e7c279c64920af0e9f32f7521d8d17f7b2903bb4 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 27 May 2026 21:04:51 +0000 Subject: [PATCH 17/24] fix: propagate gzip helper errors in e2e custom data Co-authored-by: lilypan26 <106703606+lilypan26@users.noreply.github.com> --- e2e/vmss.go | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/e2e/vmss.go b/e2e/vmss.go index 317ef8ef0d7..89bb7651a1a 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -93,6 +93,7 @@ func ConfigureAndCreateVMSS(ctx context.Context, s *Scenario) (*ScenarioVM, erro func CustomDataWithHack(s *Scenario, nbcCmdScript, binaryURL string) (string, error) { configPath := "/opt/azure/containers/aks-node-controller-config-hack.json" nbcCmdPath := "/opt/azure/containers/aks-node-controller-nbc-cmd-hack.sh" + var err error // Build provision flags conditionally based on what's provided. 
var flags []string @@ -102,7 +103,10 @@ func CustomDataWithHack(s *Scenario, nbcCmdScript, binaryURL string) (string, er } if nbcCmdScript != "" { flags = append(flags, "--nbc-cmd="+nbcCmdPath) - encodedNBCCSECmd = gzipAndBase64Encode([]byte(nbcCmdScript)) + encodedNBCCSECmd, err = gzipAndBase64Encode([]byte(nbcCmdScript)) + if err != nil { + return "", fmt.Errorf("failed to gzip nbc cmd script: %w", err) + } } provisionFlags := strings.Join(flags, " ") @@ -113,7 +117,10 @@ func CustomDataWithHack(s *Scenario, nbcCmdScript, binaryURL string) (string, er if err != nil { return "", fmt.Errorf("failed to marshal nbc, error: %w", err) } - encodedAksNodeConfigJSON = gzipAndBase64Encode(aksNodeConfigJSON) + encodedAksNodeConfigJSON, err = gzipAndBase64Encode(aksNodeConfigJSON) + if err != nil { + return "", fmt.Errorf("failed to gzip aks node config: %w", err) + } } var customData string @@ -237,12 +244,16 @@ coreos: // gzipAndBase64Encode compresses data with gzip then base64-encodes it. // This matches the production baker.go approach (getBase64EncodedGzippedCustomScriptFromStr) // to keep custom data within the Azure 65535-byte limit. -func gzipAndBase64Encode(data []byte) string { +func gzipAndBase64Encode(data []byte) (string, error) { var buf bytes.Buffer w := gzip.NewWriter(&buf) - w.Write(data) - w.Close() - return base64.StdEncoding.EncodeToString(buf.Bytes()) + if _, err := w.Write(data); err != nil { + return "", fmt.Errorf("failed to gzip data: %w", err) + } + if err := w.Close(); err != nil { + return "", fmt.Errorf("failed to finalize gzip data: %w", err) + } + return base64.StdEncoding.EncodeToString(buf.Bytes()), nil } // CustomDataWithNBCCmdHack is similar to baker.boothooktemplate, but it uses a hack to run new aks-node-controller binary. 
From f4adeb9bee80ade83101dcd4e977e42914160a8b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 27 May 2026 21:04:58 +0000 Subject: [PATCH 18/24] fix: gzip phase3 custom data payload Co-authored-by: lilypan26 <106703606+lilypan26@users.noreply.github.com> --- .../pkg/nodeconfigutils/utils.go | 16 ++++- .../pkg/nodeconfigutils/utils_test.go | 67 +++++++++++++++++++ 2 files changed, 82 insertions(+), 1 deletion(-) diff --git a/aks-node-controller/pkg/nodeconfigutils/utils.go b/aks-node-controller/pkg/nodeconfigutils/utils.go index 365b7b0dbf0..a6ff8eea811 100644 --- a/aks-node-controller/pkg/nodeconfigutils/utils.go +++ b/aks-node-controller/pkg/nodeconfigutils/utils.go @@ -2,6 +2,7 @@ package nodeconfigutils import ( "bytes" + "compress/gzip" "encoding/base64" "fmt" "mime/multipart" @@ -132,7 +133,7 @@ func CustomDataPhase3(cfg *aksnodeconfigv1.Configuration, nbcCSECMD string) (str return "", fmt.Errorf("failed to finalize multipart custom data: %w", err) } - return base64.StdEncoding.EncodeToString(customData.Bytes()), nil + return gzipAndBase64Encode(customData.Bytes()) } // CustomDataFlatcar builds base64-encoded custom data for Flatcar Container Linux nodes. 
@@ -171,6 +172,19 @@ func writeMIMEPart(writer *multipart.Writer, contentType, content string) error return err } +func gzipAndBase64Encode(data []byte) (string, error) { + var gzipped bytes.Buffer + gzipWriter := gzip.NewWriter(&gzipped) + if _, err := gzipWriter.Write(data); err != nil { + return "", fmt.Errorf("failed to gzip custom data: %w", err) + } + if err := gzipWriter.Close(); err != nil { + return "", fmt.Errorf("failed to finalize gzip custom data: %w", err) + } + + return base64.StdEncoding.EncodeToString(gzipped.Bytes()), nil +} + func MarshalConfigurationV1(cfg *aksnodeconfigv1.Configuration) ([]byte, error) { options := protojson.MarshalOptions{ UseEnumNumbers: false, diff --git a/aks-node-controller/pkg/nodeconfigutils/utils_test.go b/aks-node-controller/pkg/nodeconfigutils/utils_test.go index 598ccc3c439..c5751cac5e4 100644 --- a/aks-node-controller/pkg/nodeconfigutils/utils_test.go +++ b/aks-node-controller/pkg/nodeconfigutils/utils_test.go @@ -1,6 +1,8 @@ package nodeconfigutils import ( + "bytes" + "compress/gzip" "encoding/base64" "io" "mime" @@ -271,6 +273,71 @@ func TestCustomDataUsesMultipartBoothookAndCloudConfig(t *testing.T) { require.ErrorIs(t, err, io.EOF) } +func TestCustomDataPhase3UsesGzippedMultipartBoothookAndCloudConfig(t *testing.T) { + cfg := &aksnodeconfigv1.Configuration{ + Version: "v1", + AuthConfig: &aksnodeconfigv1.AuthConfig{ + SubscriptionId: "test-subscription", + }, + ClusterConfig: &aksnodeconfigv1.ClusterConfig{ + ResourceGroup: "test-rg", + Location: "eastus", + }, + ApiServerConfig: &aksnodeconfigv1.ApiServerConfig{ + ApiServerName: "test-api-server", + }, + } + + customData, err := CustomDataPhase3(cfg, "echo test") + require.NoError(t, err) + + decoded, err := base64.StdEncoding.DecodeString(customData) + require.NoError(t, err) + + gzipReader, err := gzip.NewReader(bytes.NewReader(decoded)) + require.NoError(t, err) + defer gzipReader.Close() + + uncompressed, err := io.ReadAll(gzipReader) + require.NoError(t, 
err) + + sections := strings.SplitN(string(uncompressed), "\r\n\r\n", 2) + require.Len(t, sections, 2) + + message := textproto.MIMEHeader{} + for _, line := range strings.Split(sections[0], "\r\n") { + if line == "" { + continue + } + parts := strings.SplitN(line, ": ", 2) + require.Len(t, parts, 2) + message.Add(parts[0], parts[1]) + } + + mediaType, params, err := mime.ParseMediaType(message.Get("Content-Type")) + require.NoError(t, err) + require.Equal(t, "multipart/mixed", mediaType) + + reader := multipart.NewReader(strings.NewReader(sections[1]), params["boundary"]) + + part, err := reader.NextPart() + require.NoError(t, err) + require.Equal(t, "text/cloud-boothook", part.Header.Get("Content-Type")) + boothook, err := io.ReadAll(part) + require.NoError(t, err) + require.Contains(t, string(boothook), AKSNodeConfigFilePath) + require.Contains(t, string(boothook), NBCCmdFilePath) + + part, err = reader.NextPart() + require.NoError(t, err) + require.Equal(t, "text/cloud-config", part.Header.Get("Content-Type")) + _, err = io.ReadAll(part) + require.NoError(t, err) + + _, err = reader.NextPart() + require.ErrorIs(t, err, io.EOF) +} + func TestMarshalUnmarshalWithPopulatedConfig(t *testing.T) { t.Run("fully populated config marshals to >100 bytes", func(t *testing.T) { cfg := &aksnodeconfigv1.Configuration{} From 50768f5a6cd0c46e5d83fcfeb306fd323469dcb0 Mon Sep 17 00:00:00 2001 From: Lily Pan Date: Wed, 27 May 2026 14:22:17 -0700 Subject: [PATCH 19/24] fix http proxy vars quotations --- aks-node-controller/app.go | 24 ++++++++++++++++++--- aks-node-controller/app_test.go | 6 ++++++ parts/linux/cloud-init/artifacts/cse_cmd.sh | 2 +- 3 files changed, 28 insertions(+), 4 deletions(-) diff --git a/aks-node-controller/app.go b/aks-node-controller/app.go index 9134805fee0..5a2c0fd69a4 100644 --- a/aks-node-controller/app.go +++ b/aks-node-controller/app.go @@ -384,14 +384,14 @@ func parseEnvVarsFromNBCCmdContent(content string) map[string]string { } // parseEnvValue 
parses the value portion of a KEY=VALUE assignment starting at position i. -// It handles concatenated quoted and unquoted segments. Returns the parsed value and the new position. +// It handles concatenated quoted (single or double) and unquoted segments. Returns the parsed value and the new position. func parseEnvValue(content string, i int) (string, int) { n := len(content) var value strings.Builder for i < n { switch { case content[i] == '"': - // Quoted section: read until closing quote. + // Double-quoted section: read until closing double quote. i++ // skip opening quote for i < n && content[i] != '"' { value.WriteByte(content[i]) @@ -400,6 +400,16 @@ func parseEnvValue(content string, i int) (string, int) { if i < n { i++ // skip closing quote } + case content[i] == '\'': + // Single-quoted section: read until closing single quote. + i++ // skip opening quote + for i < n && content[i] != '\'' { + value.WriteByte(content[i]) + i++ + } + if i < n { + i++ // skip closing quote + } case isDelimiter(content[i]): return value.String(), i default: @@ -426,7 +436,7 @@ func isEnvKeyChar(c byte) bool { return (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' } -// skipToken advances past the current non-whitespace token, respecting double-quoted sections. +// skipToken advances past the current non-whitespace token, respecting quoted sections. 
func skipToken(content string, i int) int { n := len(content) for i < n && content[i] != ' ' && content[i] != '\t' && content[i] != '\n' && content[i] != ';' { @@ -438,6 +448,14 @@ func skipToken(content string, i int) int { if i < n { i++ } + } else if content[i] == '\'' { + i++ + for i < n && content[i] != '\'' { + i++ + } + if i < n { + i++ + } } else { i++ } diff --git a/aks-node-controller/app_test.go b/aks-node-controller/app_test.go index 337109da386..16e77af6a13 100644 --- a/aks-node-controller/app_test.go +++ b/aks-node-controller/app_test.go @@ -526,6 +526,12 @@ func TestParseEnvVarsFromNBCCmdContent(t *testing.T) { assert.Equal(t, "false", got["GPU_NEEDS_FABRIC_MANAGER"]) assert.Equal(t, "900", got["CSE_TIMEOUT"]) }) + + t.Run("single-quoted values", func(t *testing.T) { + content := `PROXY_VARS='export HTTPS_PROXY="https://proxy:8443"; export http_proxy="http://proxy:8080";'` + got := parseEnvVarsFromNBCCmdContent(content) + assert.Equal(t, `export HTTPS_PROXY="https://proxy:8443"; export http_proxy="http://proxy:8080";`, got["PROXY_VARS"]) + }) } // compareEnvsConfigEnv builds a CSE env map from the test provision config, diff --git a/parts/linux/cloud-init/artifacts/cse_cmd.sh b/parts/linux/cloud-init/artifacts/cse_cmd.sh index d184b6e5356..6d774c31544 100644 --- a/parts/linux/cloud-init/artifacts/cse_cmd.sh +++ b/parts/linux/cloud-init/artifacts/cse_cmd.sh @@ -129,7 +129,7 @@ CUSTOM_SEARCH_DOMAIN_FILEPATH="{{GetCustomSearchDomainsCSEScriptFilepath}}" HTTP_PROXY_URLS="{{GetHTTPProxy}}" HTTPS_PROXY_URLS="{{GetHTTPSProxy}}" NO_PROXY_URLS="{{GetNoProxy}}" -PROXY_VARS="{{GetProxyVariables}}" +PROXY_VARS='{{GetProxyVariables}}' ENABLE_SECURE_TLS_BOOTSTRAPPING="{{EnableSecureTLSBootstrapping}}" SECURE_TLS_BOOTSTRAPPING_AAD_RESOURCE="{{GetSecureTLSBootstrappingAADResource}}" SECURE_TLS_BOOTSTRAPPING_USER_ASSIGNED_IDENTITY_ID="{{GetSecureTLSBootstrappingUserAssignedIdentityID}}" From d4ed38a03693cdd31d854d05e544cc3f3125f97b Mon Sep 17 00:00:00 2001 From: Lily 
Pan Date: Wed, 27 May 2026 15:15:18 -0700 Subject: [PATCH 20/24] address comments --- aks-node-controller/pkg/nodeconfigutils/utils.go | 14 ++++++++++---- e2e/vmss.go | 2 +- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/aks-node-controller/pkg/nodeconfigutils/utils.go b/aks-node-controller/pkg/nodeconfigutils/utils.go index a6ff8eea811..c506f611616 100644 --- a/aks-node-controller/pkg/nodeconfigutils/utils.go +++ b/aks-node-controller/pkg/nodeconfigutils/utils.go @@ -44,12 +44,12 @@ logger -t aks-boothook "boothook start $(date -Ins)" mkdir -p /opt/azure/containers -cat <<'EOF' | base64 -d >%[1]s +cat <<'EOF' | base64 -d | gzip -d >%[1]s %[2]s EOF chmod 0600 %[1]s -cat <<'EOF' | base64 -d >%[3]s +cat <<'EOF' | base64 -d | gzip -d >%[3]s %[4]s EOF chmod 0755 %[3]s @@ -113,8 +113,14 @@ func CustomDataPhase3(cfg *aksnodeconfigv1.Configuration, nbcCSECMD string) (str return "", fmt.Errorf("failed to marshal nbc, error: %w", err) } - encodedAksNodeConfigJSON := base64.StdEncoding.EncodeToString(aksNodeConfigJSON) - encodedNBCCSECmd := base64.StdEncoding.EncodeToString([]byte(nbcCSECMD)) + encodedAksNodeConfigJSON, err := gzipAndBase64Encode(aksNodeConfigJSON) + if err != nil { + return "", fmt.Errorf("failed to gzip and base64 encode nbc config: %w", err) + } + encodedNBCCSECmd, err := gzipAndBase64Encode([]byte(nbcCSECMD)) + if err != nil { + return "", fmt.Errorf("failed to gzip and base64 encode nbc cse cmd: %w", err) + } boothook := fmt.Sprintf(boothookPhase3Template, AKSNodeConfigFilePath, encodedAksNodeConfigJSON, NBCCmdFilePath, encodedNBCCSECmd) var customData bytes.Buffer diff --git a/e2e/vmss.go b/e2e/vmss.go index 89bb7651a1a..a399b999dbe 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -198,7 +198,7 @@ func buildFlatcarCloudConfig(encodedConfig, configPath, encodedNBCCmd, nbcCmdPat } if encodedNBCCmd != "" { fmt.Fprintf(&sb, `- path: %s.gz.b64 - permissions: "0644" + permissions: "0600" owner: root content: | %s From 
2ef3c72b13e9e812de384cd9c756988c781eb72b Mon Sep 17 00:00:00 2001 From: lilypan26 Date: Wed, 27 May 2026 15:16:08 -0700 Subject: [PATCH 21/24] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- parts/linux/cloud-init/artifacts/cse_config.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parts/linux/cloud-init/artifacts/cse_config.sh b/parts/linux/cloud-init/artifacts/cse_config.sh index 6a7c4651867..1bccaf924c7 100755 --- a/parts/linux/cloud-init/artifacts/cse_config.sh +++ b/parts/linux/cloud-init/artifacts/cse_config.sh @@ -1120,7 +1120,7 @@ configureSSHPubkeyAuth() { local ssh_use_pubkey_auth # Determine the desired pubkey auth setting -if [ "${disable_pubkey_auth}" = "true" ]; then + if [ "${disable_pubkey_auth}" = "true" ]; then ssh_use_pubkey_auth="no" else ssh_use_pubkey_auth="yes" From 425c7ff7bea7218185e7ae018dc81423ccafd999 Mon Sep 17 00:00:00 2001 From: lilypan26 Date: Wed, 27 May 2026 15:41:48 -0700 Subject: [PATCH 22/24] improve env compare validation Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- e2e/validators.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/e2e/validators.go b/e2e/validators.go index e2f2a1b59fb..5213f1895c3 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -2686,8 +2686,8 @@ func ValidateScriptlessPhase3(ctx context.Context, s *Scenario) { if s.Runtime.EnableScriptlessANC { logFile := "/var/log/azure/aks-node-controller.log" if !fileHasContent(ctx, s, logFile, "env compare: no differences found between provision-config and nbc-cmd env vars") { - // Grep for "differs" lines to show what's different - diffCmd := "sudo grep 'differs' " + logFile + " || true" + // Grep for all env-compare diff markers to show what's different. 
+ diffCmd := "sudo grep -E 'differs|only-in-pc|only-in-nbc|env var differences' " + logFile + " || true" result := execScriptOnVMForScenarioValidateExitCode(ctx, s, diffCmd, 0, "could not grep for differences in aks-node-controller.log") s.T.Fatalf("expected no env var differences between provision-config and nbc-cmd, but found differences:\n%s", result.stdout) } From 442c525e915d467a59abadc8b4e3ab25356bdfb3 Mon Sep 17 00:00:00 2001 From: Lily Pan Date: Wed, 27 May 2026 15:54:27 -0700 Subject: [PATCH 23/24] fix proxy var compare --- aks-node-controller/app.go | 17 ++++++++++++++++- parts/linux/cloud-init/artifacts/cse_cmd.sh | 2 +- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/aks-node-controller/app.go b/aks-node-controller/app.go index 5a2c0fd69a4..919c8687f7f 100644 --- a/aks-node-controller/app.go +++ b/aks-node-controller/app.go @@ -332,13 +332,28 @@ func diffEnvMaps(pcEnv, nbcEnv map[string]string) []string { if !isDeprecatedCSEVar(key) { diffs = append(diffs, fmt.Sprintf("only-in-nbc: %s = %q", key, nbcVal)) } - case pcVal != nbcVal: + case !envValsEqual(pcVal, nbcVal): diffs = append(diffs, fmt.Sprintf("differs: %s pc=%q nbc=%q", key, pcVal, nbcVal)) } } return diffs } +// envValsEqual compares two environment variable values, treating them as equal +// if they differ only in the presence of double quotes around substrings. +// This handles cases like PROXY_VARS where the legacy path strips inner quotes +// due to shell quoting collision while the scriptless path preserves them. +func envValsEqual(a, b string) bool { + if a == b { + return true + } + return stripDoubleQuotes(a) == stripDoubleQuotes(b) +} + +func stripDoubleQuotes(s string) string { + return strings.ReplaceAll(s, "\"", "") +} + // parseEnvVarsFromNBCCmdContent extracts environment variable assignments from an NBC command string. // The command is a bash one-liner with KEY=VALUE pairs (quoted or unquoted) interspersed with shell commands. 
// Only variables with uppercase/underscore names are extracted. diff --git a/parts/linux/cloud-init/artifacts/cse_cmd.sh b/parts/linux/cloud-init/artifacts/cse_cmd.sh index 6d774c31544..d184b6e5356 100644 --- a/parts/linux/cloud-init/artifacts/cse_cmd.sh +++ b/parts/linux/cloud-init/artifacts/cse_cmd.sh @@ -129,7 +129,7 @@ CUSTOM_SEARCH_DOMAIN_FILEPATH="{{GetCustomSearchDomainsCSEScriptFilepath}}" HTTP_PROXY_URLS="{{GetHTTPProxy}}" HTTPS_PROXY_URLS="{{GetHTTPSProxy}}" NO_PROXY_URLS="{{GetNoProxy}}" -PROXY_VARS='{{GetProxyVariables}}' +PROXY_VARS="{{GetProxyVariables}}" ENABLE_SECURE_TLS_BOOTSTRAPPING="{{EnableSecureTLSBootstrapping}}" SECURE_TLS_BOOTSTRAPPING_AAD_RESOURCE="{{GetSecureTLSBootstrappingAADResource}}" SECURE_TLS_BOOTSTRAPPING_USER_ASSIGNED_IDENTITY_ID="{{GetSecureTLSBootstrappingUserAssignedIdentityID}}" From 2126d61102eb829d68be427d755391185b92ee84 Mon Sep 17 00:00:00 2001 From: Lily Pan Date: Wed, 27 May 2026 16:19:30 -0700 Subject: [PATCH 24/24] do not log values --- aks-node-controller/app.go | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/aks-node-controller/app.go b/aks-node-controller/app.go index 919c8687f7f..1f3024a1dac 100644 --- a/aks-node-controller/app.go +++ b/aks-node-controller/app.go @@ -327,13 +327,13 @@ func diffEnvMaps(pcEnv, nbcEnv map[string]string) []string { nbcVal, inNBC := nbcEnv[key] switch { case inPC && !inNBC: - diffs = append(diffs, fmt.Sprintf("only-in-pc: %s = %q", key, pcVal)) + diffs = append(diffs, fmt.Sprintf("only-in-pc: %s", key)) case !inPC && inNBC: if !isDeprecatedCSEVar(key) { - diffs = append(diffs, fmt.Sprintf("only-in-nbc: %s = %q", key, nbcVal)) + diffs = append(diffs, fmt.Sprintf("only-in-nbc: %s", key)) } case !envValsEqual(pcVal, nbcVal): - diffs = append(diffs, fmt.Sprintf("differs: %s pc=%q nbc=%q", key, pcVal, nbcVal)) + diffs = append(diffs, fmt.Sprintf("differs: %s", key)) } } return diffs @@ -455,7 +455,8 @@ func isEnvKeyChar(c byte) bool { func skipToken(content 
string, i int) int { n := len(content) for i < n && content[i] != ' ' && content[i] != '\t' && content[i] != '\n' && content[i] != ';' { - if content[i] == '"' { + switch { + case content[i] == '"': i++ for i < n && content[i] != '"' { i++ @@ -463,7 +464,7 @@ func skipToken(content string, i int) int { if i < n { i++ } - } else if content[i] == '\'' { + case content[i] == '\'': i++ for i < n && content[i] != '\'' { i++ @@ -471,7 +472,7 @@ func skipToken(content string, i int) int { if i < n { i++ } - } else { + default: i++ } }