From 4ad14f0c38de3641fe938b43700bd12a0dcd4706 Mon Sep 17 00:00:00 2001 From: chmill Date: Thu, 30 Apr 2026 23:28:15 +0000 Subject: [PATCH 1/6] feat: node-exporter windows --- e2e/config/config.go | 2 +- e2e/scenario_win_test.go | 18 ++ e2e/validators.go | 49 +++++ parts/common/components.json | 17 ++ parts/windows/kuberneteswindowssetup.ps1 | 16 ++ parts/windows/windowscsehelper.ps1 | 6 +- .../windows-exporter-config.yml | 11 + .../windows-exporter-health.ps1 | 32 +++ staging/cse/windows/README | 2 +- staging/cse/windows/windowsexporterfunc.ps1 | 192 ++++++++++++++++++ .../cse/windows/windowsexporterfunc.tests.ps1 | 74 +++++++ .../packer/test/windows-vhd-content-test.ps1 | 46 +++++ .../packer/windows/configure-windows-vhd.ps1 | 80 ++++++++ .../windows/windows-vhd-builder-sig.json | 4 +- 14 files changed, 544 insertions(+), 5 deletions(-) create mode 100644 parts/windows/windowsexporter/windows-exporter-config.yml create mode 100644 parts/windows/windowsexporter/windows-exporter-health.ps1 create mode 100644 staging/cse/windows/windowsexporterfunc.ps1 create mode 100644 staging/cse/windows/windowsexporterfunc.tests.ps1 diff --git a/e2e/config/config.go b/e2e/config/config.go index d61db484c6e..7a0c5d050ae 100644 --- a/e2e/config/config.go +++ b/e2e/config/config.go @@ -179,7 +179,7 @@ func mustGetNewRSAKeyPair() ([]byte, []byte, string) { privateKeyFileName, err := writePrivateKeyToTempFile(privatePEMBytes) if err != nil { - panic(fmt.Sprintf("failed to write private key to temp file: %w", err)) + panic(fmt.Sprintf("failed to write private key to temp file: %v", err)) } return privatePEMBytes, publicKeyBytes, privateKeyFileName diff --git a/e2e/scenario_win_test.go b/e2e/scenario_win_test.go index 2ece49b1c7b..fb41d3ce342 100644 --- a/e2e/scenario_win_test.go +++ b/e2e/scenario_win_test.go @@ -68,6 +68,7 @@ func Test_Windows2022_AzureNetwork(t *testing.T) { ValidateCiliumIsNotRunningWindows(ctx, s) ValidateDotnetNotInstalledWindows(ctx, s) 
ValidateWindowsSystemServicesRestartConfiguration(ctx, s) + ValidateWindowsExporter(ctx, s) ValidateCollectWindowsLogsScript(ctx, s) }, }, @@ -91,6 +92,7 @@ func Test_Windows2022AzureOverlayNetworkDualStack(t *testing.T) { ValidateWindowsProcessHasCliArguments(ctx, s, "kubelet.exe", []string{"--rotate-certificates=true", "--client-ca-file=c:\\k\\ca.crt"}) ValidateCiliumIsNotRunningWindows(ctx, s) ValidateWindowsSystemServicesRestartConfiguration(ctx, s) + ValidateWindowsExporter(ctx, s) ValidateCollectWindowsLogsScript(ctx, s) }, }, @@ -115,6 +117,7 @@ func Test_Windows2022Gen2AzureNetwork(t *testing.T) { ValidateDotnetNotInstalledWindows(ctx, s) ValidateFileHasContent(ctx, s, "/AzureData/CustomDataSetupScript.log", "CSEScriptsPackageUrl used for provision is https://packages.aks.azure.com/aks/windows/cse/aks-windows-cse-scripts-current.zip") ValidateWindowsSystemServicesRestartConfiguration(ctx, s) + ValidateWindowsExporter(ctx, s) ValidateCollectWindowsLogsScript(ctx, s) }, }, @@ -139,6 +142,7 @@ func Test_Windows2022Gen2AzureOverlayNetworkDualStack(t *testing.T) { ValidateCiliumIsNotRunningWindows(ctx, s) ValidateFileHasContent(ctx, s, "/AzureData/CustomDataSetupScript.log", "CSEScriptsPackageUrl used for provision is https://packages.aks.azure.com/aks/windows/cse/aks-windows-cse-scripts-current.zip") ValidateWindowsSystemServicesRestartConfiguration(ctx, s) + ValidateWindowsExporter(ctx, s) ValidateCollectWindowsLogsScript(ctx, s) }, }, @@ -161,6 +165,7 @@ func Test_Windows23H2AzureNetwork(t *testing.T) { ValidateWindowsProcessHasCliArguments(ctx, s, "kubelet.exe", []string{"--rotate-certificates=true", "--client-ca-file=c:\\k\\ca.crt"}) ValidateCiliumIsNotRunningWindows(ctx, s) ValidateWindowsSystemServicesRestartConfiguration(ctx, s) + ValidateWindowsExporter(ctx, s) ValidateCollectWindowsLogsScript(ctx, s) }, }, @@ -184,6 +189,7 @@ func Test_Windows23H2AzureOverlayNetworkDualStack(t *testing.T) { ValidateWindowsProcessHasCliArguments(ctx, s, "kubelet.exe", 
[]string{"--rotate-certificates=true", "--client-ca-file=c:\\k\\ca.crt"}) ValidateCiliumIsNotRunningWindows(ctx, s) ValidateWindowsSystemServicesRestartConfiguration(ctx, s) + ValidateWindowsExporter(ctx, s) ValidateCollectWindowsLogsScript(ctx, s) }, }, @@ -207,6 +213,7 @@ func Test_Windows23H2Gen2AzureNetwork(t *testing.T) { ValidateCiliumIsNotRunningWindows(ctx, s) ValidateFileHasContent(ctx, s, "/AzureData/CustomDataSetupScript.log", "CSEScriptsPackageUrl used for provision is https://packages.aks.azure.com/aks/windows/cse/aks-windows-cse-scripts-current.zip") ValidateWindowsSystemServicesRestartConfiguration(ctx, s) + ValidateWindowsExporter(ctx, s) ValidateCollectWindowsLogsScript(ctx, s) }, }, @@ -231,6 +238,7 @@ func Test_Windows23H2Gen2AzureOverlayDualStack(t *testing.T) { ValidateCiliumIsNotRunningWindows(ctx, s) ValidateFileHasContent(ctx, s, "/AzureData/CustomDataSetupScript.log", "CSEScriptsPackageUrl used for provision is https://packages.aks.azure.com/aks/windows/cse/aks-windows-cse-scripts-current.zip") ValidateWindowsSystemServicesRestartConfiguration(ctx, s) + ValidateWindowsExporter(ctx, s) ValidateCollectWindowsLogsScript(ctx, s) }, }, @@ -294,6 +302,7 @@ func Test_Windows2025(t *testing.T) { ValidateCiliumIsNotRunningWindows(ctx, s) ValidateDotnetNotInstalledWindows(ctx, s) ValidateWindowsSystemServicesRestartConfiguration(ctx, s) + ValidateWindowsExporter(ctx, s) ValidateCollectWindowsLogsScript(ctx, s) }, }, @@ -319,6 +328,7 @@ func Test_Windows2025Gen2(t *testing.T) { ValidateCiliumIsNotRunningWindows(ctx, s) ValidateDotnetNotInstalledWindows(ctx, s) ValidateWindowsSystemServicesRestartConfiguration(ctx, s) + ValidateWindowsExporter(ctx, s) ValidateCollectWindowsLogsScript(ctx, s) }, }, @@ -378,6 +388,7 @@ func Test_Windows2022_SecureTLSBootstrapping_BootstrapToken_Fallback(t *testing. 
ValidateCiliumIsNotRunningWindows(ctx, s) ValidateDotnetNotInstalledWindows(ctx, s) ValidateWindowsSystemServicesRestartConfiguration(ctx, s) + ValidateWindowsExporter(ctx, s) ValidateCollectWindowsLogsScript(ctx, s) }, }, @@ -434,6 +445,7 @@ func Test_Windows2022_VHDCaching(t *testing.T) { ValidateCiliumIsNotRunningWindows(ctx, s) ValidateDotnetNotInstalledWindows(ctx, s) ValidateWindowsSystemServicesRestartConfiguration(ctx, s) + ValidateWindowsExporter(ctx, s) ValidateCollectWindowsLogsScript(ctx, s) }, }, @@ -460,6 +472,7 @@ func Test_Windows2022Gen2_k8s_133(t *testing.T) { ValidateCiliumIsNotRunningWindows(ctx, s) ValidateDotnetNotInstalledWindows(ctx, s) ValidateWindowsSystemServicesRestartConfiguration(ctx, s) + ValidateWindowsExporter(ctx, s) ValidateCollectWindowsLogsScript(ctx, s) }, }, @@ -483,6 +496,7 @@ func Test_Windows23H2_Cilium2(t *testing.T) { ValidateWindowsProcessHasCliArguments(ctx, s, "kubelet.exe", []string{"--rotate-certificates=true", "--client-ca-file=c:\\k\\ca.crt"}) ValidateCiliumIsRunningWindows(ctx, s) ValidateWindowsSystemServicesRestartConfiguration(ctx, s) + ValidateWindowsExporter(ctx, s) ValidateCollectWindowsLogsScript(ctx, s) }, }, @@ -506,6 +520,7 @@ func Test_Windows23H2Gen2_WindowsCiliumNetworking(t *testing.T) { Validator: func(ctx context.Context, s *Scenario) { ValidateWindowsCiliumIsRunning(ctx, s) ValidateWindowsSystemServicesRestartConfiguration(ctx, s) + ValidateWindowsExporter(ctx, s) ValidateCollectWindowsLogsScript(ctx, s) }, }, @@ -534,6 +549,7 @@ func Test_Windows2022_McrChinaCloud_Windows(t *testing.T) { `https://mcr.azk8s.cn`) ValidateDotnetNotInstalledWindows(ctx, s) ValidateWindowsSystemServicesRestartConfiguration(ctx, s) + ValidateWindowsExporter(ctx, s) ValidateCollectWindowsLogsScript(ctx, s) }, }, @@ -570,6 +586,7 @@ func Test_Windows2025Gen2_McrChinaCloud_Windows(t *testing.T) { `C:\ProgramData\containerd\certs.d\mcr.azk8s.cn\hosts.toml`, `https://mcr.azk8s.cn`) 
ValidateWindowsSystemServicesRestartConfiguration(ctx, s) + ValidateWindowsExporter(ctx, s) }, }, }) @@ -616,6 +633,7 @@ func Test_NetworkIsolatedCluster_Windows_WithEgress(t *testing.T) { ValidateFileDoesNotExist(ctx, s, `C:\ProgramData\containerd\certs.d\mcr.azk8s.cn\hosts.toml`) ValidateDotnetNotInstalledWindows(ctx, s) ValidateWindowsSystemServicesRestartConfiguration(ctx, s) + ValidateWindowsExporter(ctx, s) }, }, }) diff --git a/e2e/validators.go b/e2e/validators.go index 96597ada092..bc2a8fb863d 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -805,6 +805,55 @@ func ValidateWindowsSystemServicesRestartConfiguration(ctx context.Context, s *S ValidateWindowsSystemServiceRestartConfiguration(ctx, s, "kubeproxy") } +// ValidateWindowsExporter asserts that the aks-windows-exporter service registered by +// staging/cse/windows/windowsexporterfunc.ps1 is running and serving Prometheus metrics. +// +// When the VHD does not carry the windows-exporter assets (older VHDs where the +// aks-vm-extension still installs the service), the sentinel file is absent and we +// skip the validation - the extension owns the service in that mode and AgentBaker +// has no guarantee about the service state at this point in provisioning. 
+func ValidateWindowsExporter(ctx context.Context, s *Scenario) {
+	s.T.Helper()
+
+	const (
+		skipMarker   = `C:\k\skip_vhd_windows_exporter`
+		exporterExe  = `C:\k\windows-exporter\windows-exporter.exe`
+		exporterYAML = `C:\k\windows-exporter\windows-exporter-config.yml`
+		svcName      = "aks-windows-exporter"
+		endpoint     = "http://localhost:19182/metrics"
+	)
+
+	// Probe for the sentinel first and bail out early when it is absent.
+	probe := strings.Join([]string{
+		"$ErrorActionPreference = \"Stop\"",
+		fmt.Sprintf("if (-not (Test-Path '%s')) { Write-Output 'SKIP'; exit 0 }", skipMarker),
+		"Write-Output 'PRESENT'",
+	}, "\n")
+	if out := execScriptOnVMForScenario(ctx, s, probe).stdout; strings.Contains(out, "SKIP") {
+		s.T.Logf("Skipping aks-windows-exporter validation: sentinel %s not found (aks-vm-extension manages the service on this VHD)", skipMarker)
+		return
+	}
+
+	s.T.Logf("skip_vhd_windows_exporter sentinel present, validating aks-windows-exporter installation")
+
+	checks := []string{
+		"$ErrorActionPreference = \"Stop\"",
+		fmt.Sprintf("if (-not (Test-Path '%s')) { throw 'missing binary: %s' }", exporterExe, exporterExe),
+		fmt.Sprintf("if (-not (Test-Path '%s')) { throw 'missing config: %s' }", exporterYAML, exporterYAML),
+		fmt.Sprintf("$svc = Get-Service -Name %s", svcName),
+		"Write-Output $svc",
+		fmt.Sprintf("if ($svc.Status -ne 'Running') { throw \"service %s is not running: $($svc.Status)\" }", svcName),
+		fmt.Sprintf("if ($svc.StartType -ne 'Automatic') { throw \"service %s StartType is $($svc.StartType), expected Automatic\" }", svcName),
+		// Scrape /metrics once and insist on the windows_* metrics asserted below.
+		fmt.Sprintf("$resp = Invoke-WebRequest -UseBasicParsing -Uri '%s' -TimeoutSec 10", endpoint),
+		"if ($resp.StatusCode -ne 200) { throw \"metrics endpoint returned $($resp.StatusCode)\" }",
+		"if ($resp.Content -notmatch 'windows_os_info') { throw 'windows_os_info metric missing from /metrics response' }",
+		"if ($resp.Content -notmatch 'windows_cpu_time_total') { throw 'windows_cpu_time_total metric missing from /metrics response' }",
+	}
+	failure := fmt.Sprintf("aks-windows-exporter validation failed on %s", s.Runtime.VM.PrivateIP)
+	execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(checks, "\n"), 0, failure)
+}
+
 func ValidateSystemdUnitIsNotFailed(ctx context.Context, s *Scenario, serviceName string) {
 	s.T.Helper()
 	command := []string{
diff --git a/parts/common/components.json b/parts/common/components.json
index 55cc839d581..140170da80e 100644
--- a/parts/common/components.json
+++ b/parts/common/components.json
@@ -914,6 +914,23 @@
         }
       }
     },
+    {
+      "name": "windows-exporter",
+      "windowsDownloadLocation": "c:\\akse-cache\\windows-exporter\\",
+      "downloadURIs": {
+        "windows": {
+          "default": {
+            "versionsV2": [
+              {
+                "renovateTag": "",
+                "latestVersion": "0.31.2"
+              }
+            ],
+            "downloadURL": "https://packages.aks.azure.com/dalec-packages/windows-exporter/${version}/windows/amd64/windows-exporter_${version}-1_amd64.zip"
+          }
+        }
+      }
+    },
     {
       "name": "windows credential provider",
       "windowsDownloadLocation": "c:\\akse-cache\\credential-provider\\",
diff --git a/parts/windows/kuberneteswindowssetup.ps1 b/parts/windows/kuberneteswindowssetup.ps1
index ba392f1f7a2..64f85097d46 100644
--- a/parts/windows/kuberneteswindowssetup.ps1
+++ b/parts/windows/kuberneteswindowssetup.ps1
@@ -325,6 +325,12 @@ if (Test-Path -Path 'c:\AzureData\windows\securetlsbootstrapfunc.ps1') {
     Write-Log "Windows Secure TLS Bootstrap function script not found, skipping dot-source"
 }
 
+if (Test-Path -Path 'c:\AzureData\windows\windowsexporterfunc.ps1') {
+    .
c:\AzureData\windows\windowsexporterfunc.ps1 +} else { + Write-Log "Windows Exporter function script not found, skipping dot-source" +} + if (Test-Path -Path 'c:\AzureData\windows\windowsciliumnetworkingfunc.ps1') { . c:\AzureData\windows\windowsciliumnetworkingfunc.ps1 } else { @@ -508,6 +514,16 @@ function BasePrep { Install-GmsaPlugin -GmsaPackageUrl $global:WindowsGmsaPackageUrl } + # Register aks-windows-exporter when its assets are baked into the VHD. + # Wrapped in Get-Command guard for bidirectional compat with older VHDs that don't + # carry windowsexporterfunc.ps1 in the CSE script package. + if (Get-Command -Name Install-WindowsExporter -ErrorAction SilentlyContinue) { + Logs-To-Event -TaskName "AKS.WindowsCSE.InstallWindowsExporter" -TaskMessage "Install aks-windows-exporter if VHD-baked" + Install-WindowsExporter + } else { + Write-Log "Install-WindowsExporter not available; aks-vm-extension will manage windows-exporter" + } + Write-Log "BasePrep completed successfully" Logs-To-Event -TaskName "AKS.WindowsCSE.BasePrep" -TaskMessage "BasePrep completed successfully" } diff --git a/parts/windows/windowscsehelper.ps1 b/parts/windows/windowscsehelper.ps1 index 2fbac973d0e..79cca76dce3 100644 --- a/parts/windows/windowscsehelper.ps1 +++ b/parts/windows/windowscsehelper.ps1 @@ -88,9 +88,10 @@ $global:WINDOWS_CSE_ERROR_ORAS_PULL_CREDENTIAL_PROVIDER=81 # exit code for error $global:WINDOWS_CSE_ERROR_ORAS_PULL_POD_INFRA_CONTAINER=82 # exit code for error pulling pause image with oras from registry $global:WINDOWS_CSE_ERROR_NETWORK_ISOLATED_CLUSTER_CSE_NOT_CACHED=83 # exit code for cse of network isolated cluster not cached $global:WINDOWS_CSE_ERROR_ORAS_PULL_CONTAINERD=84 # exit code for error pulling containerd artifact with oras from registry +$global:WINDOWS_CSE_ERROR_WINDOWS_EXPORTER_START_FAIL=85 # exit code for failure starting aks-windows-exporter during CSE # WINDOWS_CSE_ERROR_MAX_CODE is only used in unit tests to verify whether new error code name is 
added in $global:ErrorCodeNames
 # Please use the current value of WINDOWS_CSE_ERROR_MAX_CODE as the value of the new error code and increment it by 1
-$global:WINDOWS_CSE_ERROR_MAX_CODE=85
+$global:WINDOWS_CSE_ERROR_MAX_CODE=86
 
 # Please add new error code for downloading new packages in RP code too
 $global:ErrorCodeNames = @(
@@ -178,7 +179,8 @@ $global:ErrorCodeNames = @(
     "WINDOWS_CSE_ERROR_ORAS_PULL_CREDENTIAL_PROVIDER",
     "WINDOWS_CSE_ERROR_ORAS_PULL_POD_INFRA_CONTAINER",
     "WINDOWS_CSE_ERROR_NETWORK_ISOLATED_CLUSTER_CSE_NOT_CACHED",
-    "WINDOWS_CSE_ERROR_ORAS_PULL_CONTAINERD"
+    "WINDOWS_CSE_ERROR_ORAS_PULL_CONTAINERD",
+    "WINDOWS_CSE_ERROR_WINDOWS_EXPORTER_START_FAIL"
 )
 
 # The package domain to be used
diff --git a/parts/windows/windowsexporter/windows-exporter-config.yml b/parts/windows/windowsexporter/windows-exporter-config.yml
new file mode 100644
index 00000000000..48f837af622
--- /dev/null
+++ b/parts/windows/windowsexporter/windows-exporter-config.yml
@@ -0,0 +1,11 @@
+web:
+  listen-address: ":19182"
+collectors:
+  # explicitly enabled for version: https://github.com/prometheus-community/windows_exporter/tree/v0.31.2
+  # NOTE: `cpu_info`, `container` and `process` collectors are added to default collectors
+  enabled: "cpu,logical_disk,memory,net,os,physical_disk,service,system,cpu_info,container,process"
+collector:
+  service:
+    include: "aks-windows-exporter|kubelet|kubeproxy|containerd|hns|csi-proxy"
+log:
+  level: debug
diff --git a/parts/windows/windowsexporter/windows-exporter-health.ps1 b/parts/windows/windowsexporter/windows-exporter-health.ps1
new file mode 100644
index 00000000000..206181777bb
--- /dev/null
+++ b/parts/windows/windowsexporter/windows-exporter-health.ps1
@@ -0,0 +1,32 @@
+
+function Get-Health { # Returns the /health response body when it contains "ok", otherwise "".
+    $result = (& "curl.exe" --silent "http://localhost:19182/health" 2>$null) -join "`n"
+    if ($null -ne $result -and $result.Contains("ok")) {
+        return $result
+    } else {
+        return ""
+    }
+}
+
+function Get-Version { # Returns the "version" value extracted from the /version response, or "" when the response lacks it.
+    $result = (& "curl.exe" --silent "http://localhost:19182/version" 2>$null) -join "`n"
+    if ($null -ne $result -and $result.Contains("version")) {
+        # {"version":"v0.25.1","revision":"f70fa009de541dc99ed210aa7e67c9550133ef02","branch":"HEAD","buildUser":"cloudtest@781d70d7c000002","buildDate":"20240223-08:06:57","goVersion":"go1.21.3"}
+        $version = $result -replace ".*""version"":""([^""]+)"".*", '$1'
+        return $version
+    } else {
+        return ""
+    }
+}
+
+function Get-MetricsExample { # Returns the last /metrics line mentioning windows_process_cpu_time_total, or "" when none is found.
+    # The result may be too large on a production node; call this only when testing.
+    $result = (& "curl.exe" --silent "http://localhost:19182/metrics" 2>$null)
+    $example = "windows_process_cpu_time_total"
+    if ($result -match $example) {
+        $example = $result -split "`n" | Where-Object {$_ -match $example} | Select-Object -Last 1
+        return $example
+    } else {
+        return ""
+    }
+}
diff --git a/staging/cse/windows/README b/staging/cse/windows/README
index 65e63d96c82..8512bfafb0b 100644
--- a/staging/cse/windows/README
+++ b/staging/cse/windows/README
@@ -29,7 +29,7 @@
     pushd aks-windows-cse
     unzip ../aks-windows-cse-scripts-*.zip
     rm ../*.zip
-    files=("azurecnifunc.ps1" "calicofunc.ps1" "configfunc.ps1" "containerdfunc.ps1" "containerdtemplate.toml" "kubeletfunc.ps1" "kubernetesfunc.ps1" "nvidiagpudriverfunc.ps1" "securetlsbootstrapfunc.ps1" "windowsciliumnetworkingfunc.ps1")
+    files=("azurecnifunc.ps1" "calicofunc.ps1" "configfunc.ps1" "containerdfunc.ps1" "containerdtemplate.toml" "kubeletfunc.ps1" "kubernetesfunc.ps1" "nvidiagpudriverfunc.ps1" "securetlsbootstrapfunc.ps1" "windowsciliumnetworkingfunc.ps1" "windowsexporterfunc.ps1")
     for file in ${files[@]}; do
         echo "Downloading $file from $url/$file"
         curl -O "$url/$file"
diff --git a/staging/cse/windows/windowsexporterfunc.ps1 b/staging/cse/windows/windowsexporterfunc.ps1
new file mode 100644
index 00000000000..75a5f32aa2e
--- /dev/null
+++ b/staging/cse/windows/windowsexporterfunc.ps1
@@ -0,0 +1,192 @@
+<#
+.SYNOPSIS
+    Installs and starts the aks-windows-exporter service
using assets baked into the VHD.
+
+.DESCRIPTION
+    Migrated from aks-vm-extension (see aks-windows-node-vm-extension/entrypoint.ps1).
+    Registers windows-exporter.exe as the Windows service "aks-windows-exporter" via NSSM,
+    matching the service name, port (19182), log paths, and NSSM settings the extension
+    used so existing customer dashboards/alerts continue to work.
+
+    The function is guarded so it is a no-op when running on a VHD that does not carry
+    the exporter assets. In that case the aks-vm-extension install path continues to
+    handle the service (dual-mode coexistence).
+
+    Coordination with aks-vm-extension:
+      - When C:\k\skip_vhd_windows_exporter exists, the extension's entrypoint.ps1
+        short-circuits. The sentinel is dropped by the Windows VHD build once the binary
+        and config are staged.
+#>
+
+$global:WindowsExporterInstallDir = "C:\k\windows-exporter"
+$global:WindowsExporterBinary = Join-Path $global:WindowsExporterInstallDir "windows-exporter.exe"
+$global:WindowsExporterConfig = Join-Path $global:WindowsExporterInstallDir "windows-exporter-config.yml"
+$global:WindowsExporterHealthScript = Join-Path $global:WindowsExporterInstallDir "windows-exporter-health.ps1"
+$global:WindowsExporterSkipFile = "C:\k\skip_vhd_windows_exporter"
+$global:WindowsExporterServiceName = "aks-windows-exporter"
+$global:WindowsExporterLegacyService = "windows-exporter"
+$global:WindowsExporterPort = 19182
+$global:WindowsExporterStdoutLog = "C:\k\windows-exporter.log"
+$global:WindowsExporterStderrLog = "C:\k\windows-exporter.err.log"
+$global:WindowsExporterNssm = "C:\k\nssm.exe"
+
+function Test-WindowsExporterPortInUse { # Returns $true when a TCP listener exists on $Port (netstat is the fallback when Get-NetTCPConnection throws).
+    param([Parameter(Mandatory=$true)][int]$Port)
+    try {
+        $listener = Get-NetTCPConnection -LocalPort $Port -State Listen -ErrorAction SilentlyContinue
+        return ($null -ne $listener)
+    }
+    catch {
+        $netstat = netstat -an | Select-String ":$Port\s+.*LISTENING"
+        return ($null -ne $netstat)
+    }
+}
+
+function Remove-WindowsExporterService { # Best-effort stop and NSSM removal of $ServiceName; no-op when the service is not registered.
+    param([Parameter(Mandatory=$true)][string]$ServiceName)
+
+    $svc = Get-Service $ServiceName -ErrorAction SilentlyContinue
+    if (-not $svc) {
+        return
+    }
+    Write-Log "Removing existing service $ServiceName"
+    try {
+        & $global:WindowsExporterNssm stop $ServiceName 2>&1 | Out-Null
+        Stop-Service $ServiceName -Force -ErrorAction SilentlyContinue
+        & $global:WindowsExporterNssm remove $ServiceName confirm 2>&1 | Out-Null
+    }
+    catch {
+        Write-Log "Warning: failed to fully remove $ServiceName (best-effort): $_"
+    }
+}
+
+function Remove-LegacyWindowsExporterService {
+    # Old extension versions registered a service literally named "windows-exporter" on port 19182.
+    # Remove it only while something is listening on the exporter port; the check does not identify which process owns the listener.
+    $legacy = Get-Service $global:WindowsExporterLegacyService -ErrorAction SilentlyContinue
+    if (-not $legacy) {
+        return
+    }
+    if (Test-WindowsExporterPortInUse -Port $global:WindowsExporterPort) {
+        Write-Log "Legacy service $($global:WindowsExporterLegacyService) using port $($global:WindowsExporterPort) - removing"
+        Remove-WindowsExporterService -ServiceName $global:WindowsExporterLegacyService
+    } else {
+        Write-Log "Legacy service $($global:WindowsExporterLegacyService) present but not on port $($global:WindowsExporterPort) - leaving it alone"
+    }
+}
+
+function Test-WindowsExporterHealth { # Probes the health endpoint up to RetryCount + 1 times; returns $true on the first success, $false otherwise.
+    param(
+        [int]$RetryCount = 5,
+        [int]$RetryInterval = 5
+    )
+
+    if (Test-Path $global:WindowsExporterHealthScript) {
+        . $global:WindowsExporterHealthScript
+        for ($i = 0; $i -le $RetryCount; $i++) {
+            $healthResult = Get-Health
+            if ($healthResult -ne "") {
+                Write-Log "aks-windows-exporter health check passed: $healthResult"
+                $versionResult = Get-Version
+                if ($versionResult -ne "") {
+                    Write-Log "aks-windows-exporter version $versionResult"
+                }
+                return $true
+            }
+            if ($i -lt $RetryCount) { Start-Sleep -Seconds $RetryInterval } # no wait after the final attempt
+        }
+
+        Write-Log "aks-windows-exporter health script check failed after $($RetryCount + 1) attempts"
+        return $false
+    }
+
+    Write-Log "windows-exporter health script not found at $($global:WindowsExporterHealthScript); falling back to direct health endpoint probe"
+    for ($i = 0; $i -le $RetryCount; $i++) {
+        $result = (& "curl.exe" --silent "http://localhost:$($global:WindowsExporterPort)/health" 2>$null) -join "`n"
+        if ($null -ne $result -and $result.Contains("ok")) {
+            Write-Log "aks-windows-exporter health check passed: $result"
+            return $true
+        }
+        if ($i -lt $RetryCount) { Start-Sleep -Seconds $RetryInterval } # no wait after the final attempt
+    }
+    Write-Log "aks-windows-exporter health check failed after $($RetryCount + 1) attempts"
+    return $false
+}
+
+function Install-WindowsExporter {
+    <#
+    .SYNOPSIS
+        Registers and starts the aks-windows-exporter NSSM service.
+
+    .NOTES
+        No-ops when:
+          - The VHD-build sentinel C:\k\skip_vhd_windows_exporter is absent
+            (older VHD without baked assets - aks-vm-extension still covers it).
+          - The windows-exporter binary or its config file is missing on disk (defensive).
+    #>
+
+    if (-not (Test-Path $global:WindowsExporterSkipFile)) {
+        Write-Log "skip_vhd_windows_exporter not present; aks-vm-extension will manage windows-exporter on this node"
+        return
+    }
+
+    if (-not (Test-Path $global:WindowsExporterBinary)) {
+        Write-Log "windows-exporter binary not found at $($global:WindowsExporterBinary); skipping install (older VHD?)"
+        return
+    }
+
+    if (-not (Test-Path $global:WindowsExporterConfig)) {
+        Write-Log "windows-exporter config not found at $($global:WindowsExporterConfig); skipping install"
+        return
+    }
+
+    if (-not (Test-Path $global:WindowsExporterHealthScript)) {
+        Write-Log "windows-exporter health script not found at $($global:WindowsExporterHealthScript); health validation will use direct endpoint probe"
+    }
+
+    if (-not (Test-Path $global:WindowsExporterNssm)) {
+        Write-Log "nssm.exe not found at $($global:WindowsExporterNssm); cannot install $($global:WindowsExporterServiceName)"
+        Set-ExitCode -ExitCode $global:WINDOWS_CSE_ERROR_WINDOWS_EXPORTER_START_FAIL -ErrorMessage "nssm.exe missing; cannot register aks-windows-exporter"
+        return
+    }
+
+    Write-Log "Installing $($global:WindowsExporterServiceName)"
+
+    # Always drop any prior registration before re-installing (handles VHD re-provisioning and upgrades).
+    Remove-WindowsExporterService -ServiceName $global:WindowsExporterServiceName
+
+    # Clean up stale service from older extension versions, if any.
+    Remove-LegacyWindowsExporterService
+
+    $appParameters = "--config.file=`"$($global:WindowsExporterConfig)`""
+
+    # NSSM settings mirror aks-vm-extension/aks-windows-node-vm-extension/entrypoint.ps1 Install-SystemService
+    # to preserve service behavior (logs, rotation, restart policy) that customers rely on.
+    & $global:WindowsExporterNssm install $global:WindowsExporterServiceName $global:WindowsExporterBinary | Out-Null
+    & $global:WindowsExporterNssm set $global:WindowsExporterServiceName AppDirectory $global:WindowsExporterInstallDir | Out-Null
+    & $global:WindowsExporterNssm set $global:WindowsExporterServiceName AppParameters $appParameters | Out-Null
+    & $global:WindowsExporterNssm set $global:WindowsExporterServiceName DisplayName $global:WindowsExporterServiceName | Out-Null
+    & $global:WindowsExporterNssm set $global:WindowsExporterServiceName Description $global:WindowsExporterServiceName | Out-Null
+    & $global:WindowsExporterNssm set $global:WindowsExporterServiceName Start SERVICE_AUTO_START | Out-Null
+    & $global:WindowsExporterNssm set $global:WindowsExporterServiceName ObjectName LocalSystem | Out-Null
+    & $global:WindowsExporterNssm set $global:WindowsExporterServiceName Type SERVICE_WIN32_OWN_PROCESS | Out-Null
+    & $global:WindowsExporterNssm set $global:WindowsExporterServiceName AppRestartDelay 5000 | Out-Null
+    & $global:WindowsExporterNssm set $global:WindowsExporterServiceName AppThrottle 1500 | Out-Null
+    & $global:WindowsExporterNssm set $global:WindowsExporterServiceName AppStdout $global:WindowsExporterStdoutLog | Out-Null
+    & $global:WindowsExporterNssm set $global:WindowsExporterServiceName AppStderr $global:WindowsExporterStderrLog | Out-Null
+    & $global:WindowsExporterNssm set $global:WindowsExporterServiceName AppStdoutCreationDisposition 4 | Out-Null
+    & $global:WindowsExporterNssm set $global:WindowsExporterServiceName AppStderrCreationDisposition 4 | Out-Null
+    & $global:WindowsExporterNssm set $global:WindowsExporterServiceName AppRotateFiles 1 | Out-Null
+    & $global:WindowsExporterNssm set $global:WindowsExporterServiceName AppRotateOnline 1 | Out-Null
+    & $global:WindowsExporterNssm set $global:WindowsExporterServiceName AppRotateSeconds 86400 | Out-Null
+    & $global:WindowsExporterNssm set $global:WindowsExporterServiceName AppRotateBytes 10485760 | Out-Null
+
+    & $global:WindowsExporterNssm start $global:WindowsExporterServiceName | Out-Null
+
+    if (-not (Test-WindowsExporterHealth)) {
+        Set-ExitCode -ExitCode $global:WINDOWS_CSE_ERROR_WINDOWS_EXPORTER_START_FAIL -ErrorMessage "aks-windows-exporter failed to become healthy"
+        return
+    }
+
+    Write-Log "Installed and started $($global:WindowsExporterServiceName)"
+}
diff --git a/staging/cse/windows/windowsexporterfunc.tests.ps1 b/staging/cse/windows/windowsexporterfunc.tests.ps1
new file mode 100644
index 00000000000..72699ad7b7d
--- /dev/null
+++ b/staging/cse/windows/windowsexporterfunc.tests.ps1
@@ -0,0 +1,74 @@
+Describe 'Windows exporter CSE functions' {
+    BeforeAll {
+        . $PSScriptRoot\..\..\..\parts\windows\windowscsehelper.ps1
+        . $PSCommandPath.Replace('.tests.ps1','.ps1')
+
+        function Write-Log {
+            param($Message)
+            Write-Host "LOG: $Message"
+        }
+    }
+
+    Context 'Install-WindowsExporter' {
+        BeforeEach {
+            $script:LastExitCode = $null
+            $script:LastErrorMessage = $null
+
+            Mock Set-ExitCode -MockWith {
+                param($ExitCode, $ErrorMessage)
+                $script:LastExitCode = $ExitCode
+                $script:LastErrorMessage = $ErrorMessage
+            }
+        }
+
+        It 'no-ops when the VHD sentinel is absent' {
+            Mock Test-Path -MockWith { return $false }
+
+            { Install-WindowsExporter } | Should -Not -Throw
+
+            Assert-MockCalled Set-ExitCode -Exactly -Times 0
+        }
+
+        It 'no-ops when the sentinel is present but the binary is absent' {
+            Mock Test-Path -MockWith {
+                param($Path)
+                return $Path -eq $global:WindowsExporterSkipFile
+            }
+
+            { Install-WindowsExporter } | Should -Not -Throw
+
+            Assert-MockCalled Set-ExitCode -Exactly -Times 0
+        }
+
+        It 'sets the windows-exporter error code when nssm is absent after assets are present' {
+            Mock Test-Path -MockWith {
+                param($Path)
+                return $Path -ne $global:WindowsExporterNssm
+            }
+
+            { Install-WindowsExporter } | Should -Not -Throw
+
+            Assert-MockCalled Set-ExitCode -Exactly -Times 1 -ParameterFilter {
+                $ExitCode -eq
$global:WINDOWS_CSE_ERROR_WINDOWS_EXPORTER_START_FAIL
+            }
+            $script:LastExitCode | Should -Be $global:WINDOWS_CSE_ERROR_WINDOWS_EXPORTER_START_FAIL
+        }
+    }
+
+    Context 'Test-WindowsExporterHealth' {
+        It 'uses the baked health script when it is present' {
+            $global:WindowsExporterHealthScript = Join-Path $TestDrive 'windows-exporter-health.ps1' # point the global at a stub so the function dot-sources it
+            @'
+function Get-Health {
+    return "ok"
+}
+
+function Get-Version {
+    return "v0.31.2"
+}
+'@ | Set-Content -Path $global:WindowsExporterHealthScript -Force
+
+            Test-WindowsExporterHealth -RetryCount 0 -RetryInterval 0 | Should -Be $true
+        }
+    }
+}
diff --git a/vhdbuilder/packer/test/windows-vhd-content-test.ps1 b/vhdbuilder/packer/test/windows-vhd-content-test.ps1
index 157fa4f598a..ffa45222571 100644
--- a/vhdbuilder/packer/test/windows-vhd-content-test.ps1
+++ b/vhdbuilder/packer/test/windows-vhd-content-test.ps1
@@ -604,6 +604,49 @@ function Test-ToolsToCacheOnVHD {
     }
 }
 
+function Test-WindowsExporterOnVHD {
+    # The Install-WindowsExporterOnVHD step in configure-windows-vhd.ps1 must have:
+    #   1. Extracted windows-exporter.exe into C:\k\windows-exporter
+    #   2. Placed windows-exporter-config.yml alongside it
+    #   3. Placed windows-exporter-health.ps1 alongside it
+    #   4. Created the sentinel file the aks-vm-extension honors to no-op
+    # The service is registered at CSE time (staging/cse/windows/windowsexporterfunc.ps1),
+    # so we intentionally do NOT expect the service to exist on the VHD itself.
+    $exporterDir = "C:\k\windows-exporter"
+    $expected = @(
+        (Join-Path $exporterDir "windows-exporter.exe"),
+        (Join-Path $exporterDir "windows-exporter-config.yml"),
+        (Join-Path $exporterDir "windows-exporter-health.ps1"),
+        "C:\k\skip_vhd_windows_exporter"
+    )
+
+    $missing = @() # collect every absent asset so the single failure message lists them all
+    foreach ($path in $expected)
+    {
+        if (-not (Test-Path -Path $path))
+        {
+            $missing += $path
+        }
+        else
+        {
+            Write-OutputWithTimestamp "windows-exporter asset present: $path"
+        }
+    }
+
+    if ($missing.Count -gt 0)
+    {
+        Write-ErrorWithTimestamp "Missing windows-exporter VHD assets: $($missing -join ', ')"
+        exit 1
+    }
+
+    $svc = Get-Service "aks-windows-exporter" -ErrorAction SilentlyContinue
+    if ($svc)
+    {
+        Write-ErrorWithTimestamp "Service aks-windows-exporter should not be registered on the VHD (CSE registers it at provisioning); found state: $($svc.Status)"
+        exit 1
+    }
+}
+
 function Test-ExpandVolumeTask {
     $osDrive = ((Get-WmiObject Win32_OperatingSystem -ErrorAction Stop).SystemDrive).TrimEnd(":")
     $osDisk = Get-Partition -DriveLetter $osDrive | Get-Disk
@@ -760,6 +803,9 @@ Test-WindowsDefenderPlatformUpdate
 Write-OutputWithTimestamp "Test: ToolsToCacheOnVHD"
 Test-ToolsToCacheOnVHD
 
+Write-OutputWithTimestamp "Test: WindowsExporterOnVHD"
+Test-WindowsExporterOnVHD
+
 Write-OutputWithTimestamp "Test: ExpandVolumeTask"
 Test-ExpandVolumeTask
 
diff --git a/vhdbuilder/packer/windows/configure-windows-vhd.ps1 b/vhdbuilder/packer/windows/configure-windows-vhd.ps1
index 3bef0c3ad59..e162ee118b7 100644
--- a/vhdbuilder/packer/windows/configure-windows-vhd.ps1
+++ b/vhdbuilder/packer/windows/configure-windows-vhd.ps1
@@ -760,6 +760,85 @@ function Set-WinRmServiceAutoStart
     sc.exe config winrm start=auto
 }
 
+function Install-WindowsExporterOnVHD
+{
+    # Stage windows-exporter assets into C:\k\windows-exporter so the CSE-time
+    # Install-WindowsExporter function (staging/cse/windows/windowsexporterfunc.ps1)
+    # can register the aks-windows-exporter service at node provisioning.
+    #
+    # Migrated from aks-vm-extension; the sentinel file is the coordination hook
+    # that tells the extension to no-op on nodes built from this VHD.
+    $exporterCacheDir = "c:\akse-cache\windows-exporter"
+    $exporterInstallDir = "C:\k\windows-exporter"
+    $exporterConfigSrc = "c:\k\windows-exporter-config.yml"
+    $exporterHealthSrc = "c:\k\windows-exporter-health.ps1"
+    $exporterSentinel = "C:\k\skip_vhd_windows_exporter"
+
+    if (-not (Test-Path $exporterCacheDir))
+    {
+        Write-Log "windows-exporter cache directory not found at $exporterCacheDir; skipping VHD install"
+        return # returns before the sentinel is created at the end of this function
+    }
+
+    $exporterZip = Get-ChildItem -Path $exporterCacheDir -Filter "windows-exporter_*_amd64.zip" -File |
+        Sort-Object -Property Name -Descending |
+        Select-Object -First 1 # NOTE(review): Name sort is lexical, so with several cached zips this may not be the highest semantic version
+    if (-not $exporterZip)
+    {
+        Write-Log "No windows-exporter zip found under $exporterCacheDir; skipping VHD install"
+        return # returns before the sentinel is created at the end of this function
+    }
+
+    New-Item -ItemType Directory -Path $exporterInstallDir -Force | Out-Null
+
+    Write-Log "Extracting $($exporterZip.FullName) to $exporterInstallDir"
+    Add-Type -AssemblyName System.IO.Compression.FileSystem
+    # Clean any prior extraction to avoid zip-extract "already exists" failures on re-runs
+    Get-ChildItem -Path $exporterInstallDir -Force -ErrorAction SilentlyContinue |
+        Remove-Item -Recurse -Force -ErrorAction SilentlyContinue
+    [System.IO.Compression.ZipFile]::ExtractToDirectory($exporterZip.FullName, $exporterInstallDir)
+
+    $exporterBinary = Join-Path $exporterInstallDir "windows-exporter.exe"
+    if (-not (Test-Path $exporterBinary))
+    {
+        # Some zip payloads nest the binary under a subfolder; flatten into install dir.
+        $nestedBinary = Get-ChildItem -Path $exporterInstallDir -Filter "windows-exporter.exe" -Recurse -File |
+            Select-Object -First 1
+        if ($nestedBinary)
+        {
+            Move-Item -Path $nestedBinary.FullName -Destination $exporterBinary -Force
+        }
+    }
+    if (-not (Test-Path $exporterBinary))
+    {
+        throw "windows-exporter.exe not found after extracting $($exporterZip.Name)"
+    }
+
+    if (Test-Path $exporterConfigSrc)
+    {
+        Copy-Item -Path $exporterConfigSrc -Destination (Join-Path $exporterInstallDir "windows-exporter-config.yml") -Force
+    }
+    else
+    {
+        throw "windows-exporter config not staged at $exporterConfigSrc"
+    }
+
+    if (Test-Path $exporterHealthSrc)
+    {
+        Copy-Item -Path $exporterHealthSrc -Destination (Join-Path $exporterInstallDir "windows-exporter-health.ps1") -Force
+    }
+    else
+    {
+        throw "windows-exporter health script not staged at $exporterHealthSrc"
+    }
+
+    # Drop the sentinel last so partial installs don't appear complete to CSE.
+    New-Item -ItemType File -Path $exporterSentinel -Force | Out-Null
+
+    LogFilesInDirectory $exporterInstallDir
+    Write-Log "windows-exporter staged on VHD; sentinel $exporterSentinel dropped"
+}
+
 function Set-WinRmServiceDelayedStart
 {
     # Hyper-V messes with networking components on startup after the feature is enabled
@@ -1044,6 +1123,7 @@ try
     Get-ToolsToVHD
     Get-PrivatePackagesToCacheOnVHD
     Install-WindowsCiliumNetworking
+    Install-WindowsExporterOnVHD
     # Update all the registry keys again in case the steps in between reset them. Ok, some of the steps in between do reset them. But there's a risk that the steps also need
     # the keys set.
So we kinda have to do both now :cry: Update-Registry diff --git a/vhdbuilder/packer/windows/windows-vhd-builder-sig.json b/vhdbuilder/packer/windows/windows-vhd-builder-sig.json index 1437c8b65d9..bec6cc8ba49 100644 --- a/vhdbuilder/packer/windows/windows-vhd-builder-sig.json +++ b/vhdbuilder/packer/windows/windows-vhd-builder-sig.json @@ -88,7 +88,9 @@ "parts/common/components.json", "vhdbuilder/packer/windows/windows-vhd-configuration.ps1", "vhdbuilder/packer/windows/windows_settings.json", - "vhdbuilder/packer/windows/components_json_helpers.ps1" + "vhdbuilder/packer/windows/components_json_helpers.ps1", + "parts/windows/windowsexporter/windows-exporter-config.yml", + "parts/windows/windowsexporter/windows-exporter-health.ps1" ], "destination": "c:/k/" }, From 85a6a6b8148ccceeaab81e722c35442e994bffb5 Mon Sep 17 00:00:00 2001 From: chmill Date: Wed, 13 May 2026 21:00:25 +0000 Subject: [PATCH 2/6] exporter windows --- staging/cse/windows/kubernetesfunc.ps1 | 2 + staging/cse/windows/windowsexporterfunc.ps1 | 65 +++---------------- .../packer/windows/configure-windows-vhd.ps1 | 4 +- 3 files changed, 13 insertions(+), 58 deletions(-) diff --git a/staging/cse/windows/kubernetesfunc.ps1 b/staging/cse/windows/kubernetesfunc.ps1 index a71e04daf6d..57a48a0fa77 100644 --- a/staging/cse/windows/kubernetesfunc.ps1 +++ b/staging/cse/windows/kubernetesfunc.ps1 @@ -166,6 +166,8 @@ function Update-DefenderPreferences { Add-MpPreference -ExclusionPath "C:\k\containerd.err.log" Add-MpPreference -ExclusionPath "C:\k\aks-windows-exporter.err.log" Add-MpPreference -ExclusionPath "C:\k\aks-windows-exporter.log" + Add-MpPreference -ExclusionPath "C:\k\windows-exporter.err.log" + Add-MpPreference -ExclusionPath "C:\k\windows-exporter.log" # Azure CNI Add-MpPreference -ExclusionProcess "C:\k\azurecni\bin\azure-cns.exe" diff --git a/staging/cse/windows/windowsexporterfunc.ps1 b/staging/cse/windows/windowsexporterfunc.ps1 index 75a5f32aa2e..3f9e0e3f06d 100644 --- 
a/staging/cse/windows/windowsexporterfunc.ps1 +++ b/staging/cse/windows/windowsexporterfunc.ps1 @@ -14,7 +14,7 @@ Coordination with aks-vm-extension: - When C:\k\skip_vhd_windows_exporter exists, the extension's entrypoint.ps1 - short-circuits. The sentinel is dropped by the Windows VHD build once the binary + short-circuits. The sentinel is created by the Windows VHD build once the binary and config are staged. #> @@ -24,57 +24,11 @@ $global:WindowsExporterConfig = Join-Path $global:WindowsExporterInstall $global:WindowsExporterHealthScript = Join-Path $global:WindowsExporterInstallDir "windows-exporter-health.ps1" $global:WindowsExporterSkipFile = "C:\k\skip_vhd_windows_exporter" $global:WindowsExporterServiceName = "aks-windows-exporter" -$global:WindowsExporterLegacyService = "windows-exporter" $global:WindowsExporterPort = 19182 $global:WindowsExporterStdoutLog = "C:\k\windows-exporter.log" $global:WindowsExporterStderrLog = "C:\k\windows-exporter.err.log" $global:WindowsExporterNssm = "C:\k\nssm.exe" -function Test-WindowsExporterPortInUse { - param([Parameter(Mandatory=$true)][int]$Port) - try { - $listener = Get-NetTCPConnection -LocalPort $Port -State Listen -ErrorAction SilentlyContinue - return ($null -ne $listener) - } - catch { - $netstat = netstat -an | Select-String ":$Port\s+.*LISTENING" - return ($null -ne $netstat) - } -} - -function Remove-WindowsExporterService { - param([Parameter(Mandatory=$true)][string]$ServiceName) - - $svc = Get-Service $ServiceName -ErrorAction SilentlyContinue - if (-not $svc) { - return - } - Write-Log "Removing existing service $ServiceName" - try { - & $global:WindowsExporterNssm stop $ServiceName 2>&1 | Out-Null - Stop-Service $ServiceName -Force -ErrorAction SilentlyContinue - & $global:WindowsExporterNssm remove $ServiceName confirm 2>&1 | Out-Null - } - catch { - Write-Log "Warning: failed to fully remove $ServiceName (best-effort): $_" - } -} - -function Remove-LegacyWindowsExporterService { - # Old extension 
versions registered a service literally named "windows-exporter" on port 19182. - # Only remove it if it is using the exporter port, so we don't interfere with unrelated tooling. - $legacy = Get-Service $global:WindowsExporterLegacyService -ErrorAction SilentlyContinue - if (-not $legacy) { - return - } - if (Test-WindowsExporterPortInUse -Port $global:WindowsExporterPort) { - Write-Log "Legacy service $($global:WindowsExporterLegacyService) using port $($global:WindowsExporterPort) - removing" - Remove-WindowsExporterService -ServiceName $global:WindowsExporterLegacyService - } else { - Write-Log "Legacy service $($global:WindowsExporterLegacyService) present but not on port $($global:WindowsExporterPort) - leaving it alone" - } -} - function Test-WindowsExporterHealth { param( [int]$RetryCount = 5, @@ -150,19 +104,18 @@ function Install-WindowsExporter { return } - Write-Log "Installing $($global:WindowsExporterServiceName)" - - # Always drop any prior registration before re-installing (handles VHD re-provisioning and upgrades). - Remove-WindowsExporterService -ServiceName $global:WindowsExporterServiceName - - # Clean up stale service from older extension versions, if any. - Remove-LegacyWindowsExporterService + Write-Log "Ensuring $($global:WindowsExporterServiceName) is installed and running" $appParameters = "--config.file=`"$($global:WindowsExporterConfig)`"" # NSSM settings mirror aks-vm-extension/aks-windows-node-vm-extension/entrypoint.ps1 Install-SystemService # to preserve service behavior (logs, rotation, restart policy) that customers rely on. 
- & $global:WindowsExporterNssm install $global:WindowsExporterServiceName $global:WindowsExporterBinary | Out-Null + $existingService = Get-Service $global:WindowsExporterServiceName -ErrorAction SilentlyContinue + if (-not $existingService) { + & $global:WindowsExporterNssm install $global:WindowsExporterServiceName $global:WindowsExporterBinary | Out-Null + } else { + Write-Log "$($global:WindowsExporterServiceName) is already registered; ensuring settings and running state" + } & $global:WindowsExporterNssm set $global:WindowsExporterServiceName AppDirectory $global:WindowsExporterInstallDir | Out-Null & $global:WindowsExporterNssm set $global:WindowsExporterServiceName AppParameters $appParameters | Out-Null & $global:WindowsExporterNssm set $global:WindowsExporterServiceName DisplayName $global:WindowsExporterServiceName | Out-Null @@ -188,5 +141,5 @@ function Install-WindowsExporter { return } - Write-Log "Installed and started $($global:WindowsExporterServiceName)" + Write-Log "Ensured $($global:WindowsExporterServiceName) is installed and running" } diff --git a/vhdbuilder/packer/windows/configure-windows-vhd.ps1 b/vhdbuilder/packer/windows/configure-windows-vhd.ps1 index e162ee118b7..3cb47973fcf 100644 --- a/vhdbuilder/packer/windows/configure-windows-vhd.ps1 +++ b/vhdbuilder/packer/windows/configure-windows-vhd.ps1 @@ -832,11 +832,11 @@ function Install-WindowsExporterOnVHD throw "windows-exporter health script not staged at $exporterHealthSrc" } - # Drop the sentinel last so partial installs don't appear complete to CSE. + # Create the sentinel last so partial installs don't appear complete to CSE. 
New-Item -ItemType File -Path $exporterSentinel -Force | Out-Null LogFilesInDirectory $exporterInstallDir - Write-Log "windows-exporter staged on VHD; sentinel $exporterSentinel dropped" + Write-Log "windows-exporter staged on VHD; sentinel $exporterSentinel created" } function Set-WinRmServiceDelayedStart From 1d1f93881838d9919e02f3b7cc66bcb223f48dfc Mon Sep 17 00:00:00 2001 From: chmill Date: Thu, 14 May 2026 01:38:34 +0000 Subject: [PATCH 3/6] fix: address windows exporter review feedback --- e2e/validators.go | 2 +- .../windows-exporter-health.ps1 | 18 +++++++++++++++--- staging/cse/windows/windowsexporterfunc.ps1 | 9 ++++++++- .../cse/windows/windowsexporterfunc.tests.ps1 | 12 ++++++++++++ 4 files changed, 36 insertions(+), 5 deletions(-) diff --git a/e2e/validators.go b/e2e/validators.go index bc2a8fb863d..a522b55ced1 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -840,7 +840,7 @@ func ValidateWindowsExporter(ctx context.Context, s *Scenario) { "$ErrorActionPreference = \"Stop\"", fmt.Sprintf("if (-not (Test-Path '%s')) { throw 'missing binary: %s' }", binary, binary), fmt.Sprintf("if (-not (Test-Path '%s')) { throw 'missing config: %s' }", configFile, configFile), - fmt.Sprintf("$svc = Get-Service -Name %s", serviceName), + fmt.Sprintf("$svc = Get-Service -Name '%s'", serviceName), "Write-Output $svc", fmt.Sprintf("if ($svc.Status -ne 'Running') { throw \"service %s is not running: $($svc.Status)\" }", serviceName), fmt.Sprintf("if ($svc.StartType -ne 'Automatic') { throw \"service %s StartType is $($svc.StartType), expected Automatic\" }", serviceName), diff --git a/parts/windows/windowsexporter/windows-exporter-health.ps1 b/parts/windows/windowsexporter/windows-exporter-health.ps1 index 206181777bb..bd6f55ba401 100644 --- a/parts/windows/windowsexporter/windows-exporter-health.ps1 +++ b/parts/windows/windowsexporter/windows-exporter-health.ps1 @@ -1,6 +1,18 @@ +function Invoke-WindowsExporterRequest { + 
param([Parameter(Mandatory=$true)][string]$Path) + + try { + $response = Invoke-WebRequest -UseBasicParsing -Uri "http://localhost:19182/$Path" -TimeoutSec 10 -ErrorAction Stop + return [string]$response.Content + } + catch { + return "" + } +} + function Get-Health { - $result = (& "curl.exe" --silent "http://localhost:19182/health" 2>$null) -join "`n" + $result = Invoke-WindowsExporterRequest -Path "health" if ($null -ne $result -and $result.Contains("ok")) { return $result } else { @@ -9,7 +21,7 @@ function Get-Health { } function Get-Version { - $result = (& "curl.exe" --silent "http://localhost:19182/version" 2>$null) -join "`n" + $result = Invoke-WindowsExporterRequest -Path "version" if ($null -ne $result -and $result.Contains("version")) { # {"version":"v0.25.1","revision":"f70fa009de541dc99ed210aa7e67c9550133ef02","branch":"HEAD","buildUser":"cloudtest@781d70d7c000002","buildDate":"20240223-08:06:57","goVersion":"go1.21.3"} $version = $result -replace ".*""version"":""([^""]+)"".*", '$1' @@ -21,7 +33,7 @@ function Get-Version { function Get-MetricsExample { # The result may be too large in production node. I suggest to call it only when testing. 
- $result = (& "curl.exe" "http://localhost:19182/metrics") + $result = Invoke-WindowsExporterRequest -Path "metrics" $example = "windows_process_cpu_time_total" if ($result -match $example) { $example = $result -split "`n" | Where-Object {$_ -match $example} | Select-Object -Last 1 diff --git a/staging/cse/windows/windowsexporterfunc.ps1 b/staging/cse/windows/windowsexporterfunc.ps1 index 3f9e0e3f06d..160bb29769b 100644 --- a/staging/cse/windows/windowsexporterfunc.ps1 +++ b/staging/cse/windows/windowsexporterfunc.ps1 @@ -56,7 +56,14 @@ function Test-WindowsExporterHealth { Write-Log "windows-exporter health script not found at $($global:WindowsExporterHealthScript); falling back to direct health endpoint probe" for ($i = 0; $i -le $RetryCount; $i++) { - $result = (& "curl.exe" --silent "http://localhost:$($global:WindowsExporterPort)/health" 2>$null) -join "`n" + $result = "" + try { + $response = Invoke-WebRequest -UseBasicParsing -Uri "http://localhost:$($global:WindowsExporterPort)/health" -TimeoutSec 10 -ErrorAction Stop + $result = [string]$response.Content + } + catch { + $result = "" + } if ($null -ne $result -and $result.Contains("ok")) { Write-Log "aks-windows-exporter health check passed: $result" return $true diff --git a/staging/cse/windows/windowsexporterfunc.tests.ps1 b/staging/cse/windows/windowsexporterfunc.tests.ps1 index 72699ad7b7d..71f96d73272 100644 --- a/staging/cse/windows/windowsexporterfunc.tests.ps1 +++ b/staging/cse/windows/windowsexporterfunc.tests.ps1 @@ -70,5 +70,17 @@ function Get-Version { Test-WindowsExporterHealth -RetryCount 0 -RetryInterval 0 | Should -Be $true } + + It 'uses a native PowerShell endpoint probe when the baked health script is absent' { + $global:WindowsExporterHealthScript = Join-Path $TestDrive 'missing-health.ps1' + + Mock Invoke-WebRequest -MockWith { + return @{ Content = 'ok' } + } + + Test-WindowsExporterHealth -RetryCount 0 -RetryInterval 0 | Should -Be $true + + Assert-MockCalled Invoke-WebRequest 
-Exactly -Times 1 + } } } From 94a6d6a9a7276a5e87171c71e26d993abbb264de Mon Sep 17 00:00:00 2001 From: chmill Date: Thu, 14 May 2026 19:01:29 +0000 Subject: [PATCH 4/6] more verbose validation --- e2e/validators.go | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/e2e/validators.go b/e2e/validators.go index a522b55ced1..52f3d0f0f5c 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -846,12 +846,23 @@ func ValidateWindowsExporter(ctx context.Context, s *Scenario) { fmt.Sprintf("if ($svc.StartType -ne 'Automatic') { throw \"service %s StartType is $($svc.StartType), expected Automatic\" }", serviceName), // Hit the metrics endpoint and require a windows-exporter-specific metric. fmt.Sprintf("$resp = Invoke-WebRequest -UseBasicParsing -Uri '%s' -TimeoutSec 10", metricsURL), - "if ($resp.StatusCode -ne 200) { throw \"metrics endpoint returned $($resp.StatusCode)\" }", - "if ($resp.Content -notmatch 'windows_os_info') { throw 'windows_os_info metric missing from /metrics response' }", - "if ($resp.Content -notmatch 'windows_cpu_time_total') { throw 'windows_cpu_time_total metric missing from /metrics response' }", + "$failureReasons = @()", + "if ($resp.StatusCode -ne 200) { $failureReasons += \"metrics endpoint returned $($resp.StatusCode)\" }", + "if ($resp.Content -notmatch 'windows_os_info') { $failureReasons += 'windows_os_info metric missing from /metrics response' }", + "if ($resp.Content -notmatch 'windows_cpu_time_total') { $failureReasons += 'windows_cpu_time_total metric missing from /metrics response' }", + "if ($failureReasons.Count -gt 0) {", + " Write-Output \"metrics endpoint returned status $($resp.StatusCode) with $($resp.Content.Length) bytes\"", + " Write-Output ('metrics validation failures: ' + ($failureReasons -join '; '))", + " Write-Output '--- begin /metrics response ---'", + " Write-Output $resp.Content", + " Write-Output '--- end /metrics response ---'", + " throw ($failureReasons -join '; ')", + 
"}", + "Write-Output \"metrics endpoint returned status $($resp.StatusCode) with $($resp.Content.Length) bytes\"", } - execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, + validationResult := execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, fmt.Sprintf("aks-windows-exporter validation failed on %s", s.Runtime.VM.PrivateIP)) + s.T.Logf("aks-windows-exporter validation succeeded on %s: service is Running/Automatic and %s contains windows_os_info and windows_cpu_time_total\n%s", s.Runtime.VM.PrivateIP, metricsURL, strings.TrimSpace(validationResult.stdout)) } func ValidateSystemdUnitIsNotFailed(ctx context.Context, s *Scenario, serviceName string) { From 33a62df8c433dccd38ced5b8cd435cf0ed6c7af0 Mon Sep 17 00:00:00 2001 From: chmill Date: Tue, 19 May 2026 00:16:33 +0000 Subject: [PATCH 5/6] comments --- e2e/validators.go | 24 ++++++++++++------- .../windows-exporter-health.ps1 | 18 +++++++++----- .../packer/windows/configure-windows-vhd.ps1 | 12 ++++++---- .../windows/windows-vhd-configuration.ps1 | 18 ++++++++++++++ 4 files changed, 53 insertions(+), 19 deletions(-) diff --git a/e2e/validators.go b/e2e/validators.go index 52f3d0f0f5c..e033c5d953e 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -828,7 +828,8 @@ func ValidateWindowsExporter(ctx context.Context, s *Scenario) { fmt.Sprintf("if (-not (Test-Path '%s')) { Write-Output 'SKIP'; exit 0 }", sentinel), "Write-Output 'PRESENT'", } - res := execScriptOnVMForScenario(ctx, s, strings.Join(sentinelCheck, "\n")) + res := execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(sentinelCheck, "\n"), 0, + fmt.Sprintf("could not check aks-windows-exporter sentinel %s on %s", sentinel, s.Runtime.VM.PrivateIP)) if strings.Contains(res.stdout, "SKIP") { s.T.Logf("Skipping aks-windows-exporter validation: sentinel %s not found (aks-vm-extension manages the service on this VHD)", sentinel) return @@ -846,19 +847,26 @@ func 
ValidateWindowsExporter(ctx context.Context, s *Scenario) { fmt.Sprintf("if ($svc.StartType -ne 'Automatic') { throw \"service %s StartType is $($svc.StartType), expected Automatic\" }", serviceName), // Hit the metrics endpoint and require a windows-exporter-specific metric. fmt.Sprintf("$resp = Invoke-WebRequest -UseBasicParsing -Uri '%s' -TimeoutSec 10", metricsURL), + "$metricsContent = [string]$resp.Content", "$failureReasons = @()", "if ($resp.StatusCode -ne 200) { $failureReasons += \"metrics endpoint returned $($resp.StatusCode)\" }", - "if ($resp.Content -notmatch 'windows_os_info') { $failureReasons += 'windows_os_info metric missing from /metrics response' }", - "if ($resp.Content -notmatch 'windows_cpu_time_total') { $failureReasons += 'windows_cpu_time_total metric missing from /metrics response' }", + "if ($metricsContent -notmatch 'windows_os_info') { $failureReasons += 'windows_os_info metric missing from /metrics response' }", + "if ($metricsContent -notmatch 'windows_cpu_time_total') { $failureReasons += 'windows_cpu_time_total metric missing from /metrics response' }", "if ($failureReasons.Count -gt 0) {", - " Write-Output \"metrics endpoint returned status $($resp.StatusCode) with $($resp.Content.Length) bytes\"", + " $metricsPreviewMaxChars = 65536", + " $metricsPreview = $metricsContent", + " Write-Output \"metrics endpoint returned status $($resp.StatusCode) with $($metricsContent.Length) characters\"", " Write-Output ('metrics validation failures: ' + ($failureReasons -join '; '))", - " Write-Output '--- begin /metrics response ---'", - " Write-Output $resp.Content", - " Write-Output '--- end /metrics response ---'", + " if ($metricsPreview.Length -gt $metricsPreviewMaxChars) {", + " $metricsPreview = $metricsPreview.Substring(0, $metricsPreviewMaxChars)", + " Write-Output \"metrics response truncated: showing first $metricsPreviewMaxChars of $($metricsContent.Length) characters\"", + " }", + " Write-Output '--- begin /metrics response 
preview ---'", + " Write-Output $metricsPreview", + " Write-Output '--- end /metrics response preview ---'", " throw ($failureReasons -join '; ')", "}", - "Write-Output \"metrics endpoint returned status $($resp.StatusCode) with $($resp.Content.Length) bytes\"", + "Write-Output \"metrics endpoint returned status $($resp.StatusCode) with $($metricsContent.Length) characters\"", } validationResult := execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, fmt.Sprintf("aks-windows-exporter validation failed on %s", s.Runtime.VM.PrivateIP)) diff --git a/parts/windows/windowsexporter/windows-exporter-health.ps1 b/parts/windows/windowsexporter/windows-exporter-health.ps1 index bd6f55ba401..2650d49daa3 100644 --- a/parts/windows/windowsexporter/windows-exporter-health.ps1 +++ b/parts/windows/windowsexporter/windows-exporter-health.ps1 @@ -22,13 +22,19 @@ function Get-Health { function Get-Version { $result = Invoke-WindowsExporterRequest -Path "version" - if ($null -ne $result -and $result.Contains("version")) { - # {"version":"v0.25.1","revision":"f70fa009de541dc99ed210aa7e67c9550133ef02","branch":"HEAD","buildUser":"cloudtest@781d70d7c000002","buildDate":"20240223-08:06:57","goVersion":"go1.21.3"} - $version = $result -replace ".*""version"":""([^""]+)"".*", '$1' - return $version - } else { - return "" + if ($null -ne $result -and $result -ne "") { + try { + # {"version":"v0.25.1","revision":"f70fa009de541dc99ed210aa7e67c9550133ef02","branch":"HEAD","buildUser":"cloudtest@781d70d7c000002","buildDate":"20240223-08:06:57","goVersion":"go1.21.3"} + $versionInfo = $result | ConvertFrom-Json -ErrorAction Stop + if ($null -ne $versionInfo.version -and $versionInfo.version -ne "") { + return [string]$versionInfo.version + } + } + catch { + } } + + return "" } function Get-MetricsExample { diff --git a/vhdbuilder/packer/windows/configure-windows-vhd.ps1 b/vhdbuilder/packer/windows/configure-windows-vhd.ps1 index 3cb47973fcf..1b3932f539b 100644 --- 
a/vhdbuilder/packer/windows/configure-windows-vhd.ps1 +++ b/vhdbuilder/packer/windows/configure-windows-vhd.ps1 @@ -780,14 +780,16 @@ function Install-WindowsExporterOnVHD return } - $exporterZip = Get-ChildItem -Path $exporterCacheDir -Filter "windows-exporter_*_amd64.zip" -File | - Sort-Object -Property Name -Descending | - Select-Object -First 1 + $expectedExporterZipName = [IO.Path]::GetFileName($global:windowsExporterPackageUrl.Split('?')[0]) + + $exporterZips = @(Get-ChildItem -Path $exporterCacheDir -Filter "windows-exporter_*_amd64.zip" -File) + $exporterZip = @($exporterZips | Where-Object { $_.Name -eq $expectedExporterZipName }) | Select-Object -First 1 if (-not $exporterZip) { - Write-Log "No windows-exporter zip found under $exporterCacheDir; skipping VHD install" - return + $foundExporterZipNames = @($exporterZips | ForEach-Object { $_.Name }) + throw "No expected windows-exporter zip found under $exporterCacheDir. Expected: $expectedExporterZipName. Found: $($foundExporterZipNames -join ', ')" } + Write-Log "Using windows-exporter package $($exporterZip.Name) resolved from components.json" New-Item -ItemType Directory -Path $exporterInstallDir -Force | Out-Null diff --git a/vhdbuilder/packer/windows/windows-vhd-configuration.ps1 b/vhdbuilder/packer/windows/windows-vhd-configuration.ps1 index e86ef641928..fccf453f817 100644 --- a/vhdbuilder/packer/windows/windows-vhd-configuration.ps1 +++ b/vhdbuilder/packer/windows/windows-vhd-configuration.ps1 @@ -73,6 +73,23 @@ $global:keysToSet = GetRegKeysToApply $windowsSettingsJson $global:map = GetPackagesFromComponentsJson $componentsJson $global:releaseNotesToSet = GetKeyMapForReleaseNotes $windowsSettingsJson +function Get-WindowsExporterPackageUrl +{ + $windowsExporterPackages = $global:map["c:\akse-cache\windows-exporter\"] + $windowsExporterPackageCount = 0 + if ($windowsExporterPackages -ne $null) + { + $windowsExporterPackageCount = $windowsExporterPackages.Count + } + + if 
($windowsExporterPackageCount -ne 1) + { + throw "Expected exactly one windows-exporter package from components.json, found $windowsExporterPackageCount" + } + + return $windowsExporterPackages[0] +} + $validSKU = GetWindowsBaseVersions $windowsSettingsJson if (-not ($validSKU -contains $windowsSKU)) { @@ -85,6 +102,7 @@ if (-not ($validSKU -contains $windowsSKU)) # specified by AKS PR for most of the cases. BUT as long as there's a new unpacked image version, we should keep the # versions synced. $global:defaultContainerdPackageUrl = GetDefaultContainerDFromComponentsJson $componentsJson +$global:windowsExporterPackageUrl = Get-WindowsExporterPackageUrl # defenderUpdateUrl refers to the latest windows defender platform update $global:defenderUpdateUrl = GetDefenderUpdateUrl $windowsSettingsJson From b533a0223d3ae3e1a1b4781371a99703d808735c Mon Sep 17 00:00:00 2001 From: chmill Date: Tue, 19 May 2026 21:40:45 +0000 Subject: [PATCH 6/6] out of scope --- e2e/config/config.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/e2e/config/config.go b/e2e/config/config.go index 7a0c5d050ae..d61db484c6e 100644 --- a/e2e/config/config.go +++ b/e2e/config/config.go @@ -179,7 +179,7 @@ func mustGetNewRSAKeyPair() ([]byte, []byte, string) { privateKeyFileName, err := writePrivateKeyToTempFile(privatePEMBytes) if err != nil { - panic(fmt.Sprintf("failed to write private key to temp file: %v", err)) + panic(fmt.Sprintf("failed to write private key to temp file: %w", err)) } return privatePEMBytes, publicKeyBytes, privateKeyFileName