diff --git a/e2e/config/config.go b/e2e/config/config.go index d61db484c6e..7a0c5d050ae 100644 --- a/e2e/config/config.go +++ b/e2e/config/config.go @@ -179,7 +179,7 @@ func mustGetNewRSAKeyPair() ([]byte, []byte, string) { privateKeyFileName, err := writePrivateKeyToTempFile(privatePEMBytes) if err != nil { - panic(fmt.Sprintf("failed to write private key to temp file: %w", err)) + panic(fmt.Sprintf("failed to write private key to temp file: %v", err)) } return privatePEMBytes, publicKeyBytes, privateKeyFileName diff --git a/e2e/scenario_win_test.go b/e2e/scenario_win_test.go index 2ece49b1c7b..fb41d3ce342 100644 --- a/e2e/scenario_win_test.go +++ b/e2e/scenario_win_test.go @@ -68,6 +68,7 @@ func Test_Windows2022_AzureNetwork(t *testing.T) { ValidateCiliumIsNotRunningWindows(ctx, s) ValidateDotnetNotInstalledWindows(ctx, s) ValidateWindowsSystemServicesRestartConfiguration(ctx, s) + ValidateWindowsExporter(ctx, s) ValidateCollectWindowsLogsScript(ctx, s) }, }, @@ -91,6 +92,7 @@ func Test_Windows2022AzureOverlayNetworkDualStack(t *testing.T) { ValidateWindowsProcessHasCliArguments(ctx, s, "kubelet.exe", []string{"--rotate-certificates=true", "--client-ca-file=c:\\k\\ca.crt"}) ValidateCiliumIsNotRunningWindows(ctx, s) ValidateWindowsSystemServicesRestartConfiguration(ctx, s) + ValidateWindowsExporter(ctx, s) ValidateCollectWindowsLogsScript(ctx, s) }, }, @@ -115,6 +117,7 @@ func Test_Windows2022Gen2AzureNetwork(t *testing.T) { ValidateDotnetNotInstalledWindows(ctx, s) ValidateFileHasContent(ctx, s, "/AzureData/CustomDataSetupScript.log", "CSEScriptsPackageUrl used for provision is https://packages.aks.azure.com/aks/windows/cse/aks-windows-cse-scripts-current.zip") ValidateWindowsSystemServicesRestartConfiguration(ctx, s) + ValidateWindowsExporter(ctx, s) ValidateCollectWindowsLogsScript(ctx, s) }, }, @@ -139,6 +142,7 @@ func Test_Windows2022Gen2AzureOverlayNetworkDualStack(t *testing.T) { ValidateCiliumIsNotRunningWindows(ctx, s) ValidateFileHasContent(ctx, s, 
"/AzureData/CustomDataSetupScript.log", "CSEScriptsPackageUrl used for provision is https://packages.aks.azure.com/aks/windows/cse/aks-windows-cse-scripts-current.zip") ValidateWindowsSystemServicesRestartConfiguration(ctx, s) + ValidateWindowsExporter(ctx, s) ValidateCollectWindowsLogsScript(ctx, s) }, }, @@ -161,6 +165,7 @@ func Test_Windows23H2AzureNetwork(t *testing.T) { ValidateWindowsProcessHasCliArguments(ctx, s, "kubelet.exe", []string{"--rotate-certificates=true", "--client-ca-file=c:\\k\\ca.crt"}) ValidateCiliumIsNotRunningWindows(ctx, s) ValidateWindowsSystemServicesRestartConfiguration(ctx, s) + ValidateWindowsExporter(ctx, s) ValidateCollectWindowsLogsScript(ctx, s) }, }, @@ -184,6 +189,7 @@ func Test_Windows23H2AzureOverlayNetworkDualStack(t *testing.T) { ValidateWindowsProcessHasCliArguments(ctx, s, "kubelet.exe", []string{"--rotate-certificates=true", "--client-ca-file=c:\\k\\ca.crt"}) ValidateCiliumIsNotRunningWindows(ctx, s) ValidateWindowsSystemServicesRestartConfiguration(ctx, s) + ValidateWindowsExporter(ctx, s) ValidateCollectWindowsLogsScript(ctx, s) }, }, @@ -207,6 +213,7 @@ func Test_Windows23H2Gen2AzureNetwork(t *testing.T) { ValidateCiliumIsNotRunningWindows(ctx, s) ValidateFileHasContent(ctx, s, "/AzureData/CustomDataSetupScript.log", "CSEScriptsPackageUrl used for provision is https://packages.aks.azure.com/aks/windows/cse/aks-windows-cse-scripts-current.zip") ValidateWindowsSystemServicesRestartConfiguration(ctx, s) + ValidateWindowsExporter(ctx, s) ValidateCollectWindowsLogsScript(ctx, s) }, }, @@ -231,6 +238,7 @@ func Test_Windows23H2Gen2AzureOverlayDualStack(t *testing.T) { ValidateCiliumIsNotRunningWindows(ctx, s) ValidateFileHasContent(ctx, s, "/AzureData/CustomDataSetupScript.log", "CSEScriptsPackageUrl used for provision is https://packages.aks.azure.com/aks/windows/cse/aks-windows-cse-scripts-current.zip") ValidateWindowsSystemServicesRestartConfiguration(ctx, s) + ValidateWindowsExporter(ctx, s) 
ValidateCollectWindowsLogsScript(ctx, s) }, }, @@ -294,6 +302,7 @@ func Test_Windows2025(t *testing.T) { ValidateCiliumIsNotRunningWindows(ctx, s) ValidateDotnetNotInstalledWindows(ctx, s) ValidateWindowsSystemServicesRestartConfiguration(ctx, s) + ValidateWindowsExporter(ctx, s) ValidateCollectWindowsLogsScript(ctx, s) }, }, @@ -319,6 +328,7 @@ func Test_Windows2025Gen2(t *testing.T) { ValidateCiliumIsNotRunningWindows(ctx, s) ValidateDotnetNotInstalledWindows(ctx, s) ValidateWindowsSystemServicesRestartConfiguration(ctx, s) + ValidateWindowsExporter(ctx, s) ValidateCollectWindowsLogsScript(ctx, s) }, }, @@ -378,6 +388,7 @@ func Test_Windows2022_SecureTLSBootstrapping_BootstrapToken_Fallback(t *testing. ValidateCiliumIsNotRunningWindows(ctx, s) ValidateDotnetNotInstalledWindows(ctx, s) ValidateWindowsSystemServicesRestartConfiguration(ctx, s) + ValidateWindowsExporter(ctx, s) ValidateCollectWindowsLogsScript(ctx, s) }, }, @@ -434,6 +445,7 @@ func Test_Windows2022_VHDCaching(t *testing.T) { ValidateCiliumIsNotRunningWindows(ctx, s) ValidateDotnetNotInstalledWindows(ctx, s) ValidateWindowsSystemServicesRestartConfiguration(ctx, s) + ValidateWindowsExporter(ctx, s) ValidateCollectWindowsLogsScript(ctx, s) }, }, @@ -460,6 +472,7 @@ func Test_Windows2022Gen2_k8s_133(t *testing.T) { ValidateCiliumIsNotRunningWindows(ctx, s) ValidateDotnetNotInstalledWindows(ctx, s) ValidateWindowsSystemServicesRestartConfiguration(ctx, s) + ValidateWindowsExporter(ctx, s) ValidateCollectWindowsLogsScript(ctx, s) }, }, @@ -483,6 +496,7 @@ func Test_Windows23H2_Cilium2(t *testing.T) { ValidateWindowsProcessHasCliArguments(ctx, s, "kubelet.exe", []string{"--rotate-certificates=true", "--client-ca-file=c:\\k\\ca.crt"}) ValidateCiliumIsRunningWindows(ctx, s) ValidateWindowsSystemServicesRestartConfiguration(ctx, s) + ValidateWindowsExporter(ctx, s) ValidateCollectWindowsLogsScript(ctx, s) }, }, @@ -506,6 +520,7 @@ func Test_Windows23H2Gen2_WindowsCiliumNetworking(t *testing.T) { Validator: 
func(ctx context.Context, s *Scenario) {
 				ValidateWindowsCiliumIsRunning(ctx, s)
 				ValidateWindowsSystemServicesRestartConfiguration(ctx, s)
+				ValidateWindowsExporter(ctx, s)
 				ValidateCollectWindowsLogsScript(ctx, s)
 			},
 		},
@@ -534,6 +549,7 @@ func Test_Windows2022_McrChinaCloud_Windows(t *testing.T) {
 					`https://mcr.azk8s.cn`)
 				ValidateDotnetNotInstalledWindows(ctx, s)
 				ValidateWindowsSystemServicesRestartConfiguration(ctx, s)
+				ValidateWindowsExporter(ctx, s)
 				ValidateCollectWindowsLogsScript(ctx, s)
 			},
 		},
@@ -570,6 +586,7 @@ func Test_Windows2025Gen2_McrChinaCloud_Windows(t *testing.T) {
 					`C:\ProgramData\containerd\certs.d\mcr.azk8s.cn\hosts.toml`,
 					`https://mcr.azk8s.cn`)
 				ValidateWindowsSystemServicesRestartConfiguration(ctx, s)
+				ValidateWindowsExporter(ctx, s)
 			},
 		},
 	})
@@ -616,6 +633,7 @@ func Test_NetworkIsolatedCluster_Windows_WithEgress(t *testing.T) {
 				ValidateFileDoesNotExist(ctx, s, `C:\ProgramData\containerd\certs.d\mcr.azk8s.cn\hosts.toml`)
 				ValidateDotnetNotInstalledWindows(ctx, s)
 				ValidateWindowsSystemServicesRestartConfiguration(ctx, s)
+				ValidateWindowsExporter(ctx, s)
 			},
 		},
 	})
diff --git a/e2e/validators.go b/e2e/validators.go
index 96597ada092..52f3d0f0f5c 100644
--- a/e2e/validators.go
+++ b/e2e/validators.go
@@ -805,6 +805,66 @@ func ValidateWindowsSystemServicesRestartConfiguration(ctx context.Context, s *S
 	ValidateWindowsSystemServiceRestartConfiguration(ctx, s, "kubeproxy")
 }
 
+// ValidateWindowsExporter asserts that the aks-windows-exporter service registered by
+// staging/cse/windows/windowsexporterfunc.ps1 is running and serving Prometheus metrics.
+//
+// When the VHD does not carry the windows-exporter assets (older VHDs where the
+// aks-vm-extension still installs the service), the sentinel file is absent and we
+// skip the validation - the extension owns the service in that mode and AgentBaker
+// has no guarantee about the service state at this point in provisioning.
+func ValidateWindowsExporter(ctx context.Context, s *Scenario) {
+	s.T.Helper()
+
+	const (
+		sentinel    = `C:\k\skip_vhd_windows_exporter`
+		binary      = `C:\k\windows-exporter\windows-exporter.exe`
+		configFile  = `C:\k\windows-exporter\windows-exporter-config.yml`
+		serviceName = "aks-windows-exporter"
+		metricsURL  = "http://localhost:19182/metrics"
+	)
+
+	sentinelCheck := []string{ // probe script: prints SKIP when the sentinel is absent, PRESENT otherwise
+		"$ErrorActionPreference = \"Stop\"",
+		fmt.Sprintf("if (-not (Test-Path '%s')) { Write-Output 'SKIP'; exit 0 }", sentinel),
+		"Write-Output 'PRESENT'",
+	}
+	res := execScriptOnVMForScenario(ctx, s, strings.Join(sentinelCheck, "\n"))
+	if strings.Contains(res.stdout, "SKIP") { // only an explicit SKIP short-circuits; any other output falls through to validation
+		s.T.Logf("Skipping aks-windows-exporter validation: sentinel %s not found (aks-vm-extension manages the service on this VHD)", sentinel)
+		return
+	}
+
+	s.T.Logf("skip_vhd_windows_exporter sentinel present, validating aks-windows-exporter installation")
+
+	command := []string{ // validation script: each failed check throws, so the script exits non-zero
+		"$ErrorActionPreference = \"Stop\"",
+		fmt.Sprintf("if (-not (Test-Path '%s')) { throw 'missing binary: %s' }", binary, binary),
+		fmt.Sprintf("if (-not (Test-Path '%s')) { throw 'missing config: %s' }", configFile, configFile),
+		fmt.Sprintf("$svc = Get-Service -Name '%s'", serviceName),
+		"Write-Output $svc", // echo the service object into the captured output
+		fmt.Sprintf("if ($svc.Status -ne 'Running') { throw \"service %s is not running: $($svc.Status)\" }", serviceName),
+		fmt.Sprintf("if ($svc.StartType -ne 'Automatic') { throw \"service %s StartType is $($svc.StartType), expected Automatic\" }", serviceName),
+		// Hit the metrics endpoint and require a windows-exporter-specific metric.
+		fmt.Sprintf("$resp = Invoke-WebRequest -UseBasicParsing -Uri '%s' -TimeoutSec 10", metricsURL),
+		"$failureReasons = @()", // collect every failed metrics check so a single throw reports them all
+		"if ($resp.StatusCode -ne 200) { $failureReasons += \"metrics endpoint returned $($resp.StatusCode)\" }",
+		"if ($resp.Content -notmatch 'windows_os_info') { $failureReasons += 'windows_os_info metric missing from /metrics response' }",
+		"if ($resp.Content -notmatch 'windows_cpu_time_total') { $failureReasons += 'windows_cpu_time_total metric missing from /metrics response' }",
+		"if ($failureReasons.Count -gt 0) {", // on failure, dump the whole /metrics body before throwing
+		" Write-Output \"metrics endpoint returned status $($resp.StatusCode) with $($resp.Content.Length) bytes\"",
+		" Write-Output ('metrics validation failures: ' + ($failureReasons -join '; '))",
+		" Write-Output '--- begin /metrics response ---'",
+		" Write-Output $resp.Content",
+		" Write-Output '--- end /metrics response ---'",
+		" throw ($failureReasons -join '; ')",
+		"}",
+		"Write-Output \"metrics endpoint returned status $($resp.StatusCode) with $($resp.Content.Length) bytes\"",
+	}
+	validationResult := execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, // presumably 0 is the expected exit code - confirm against the helper
+		fmt.Sprintf("aks-windows-exporter validation failed on %s", s.Runtime.VM.PrivateIP))
+	s.T.Logf("aks-windows-exporter validation succeeded on %s: service is Running/Automatic and %s contains windows_os_info and windows_cpu_time_total\n%s", s.Runtime.VM.PrivateIP, metricsURL, strings.TrimSpace(validationResult.stdout))
+}
+
 func ValidateSystemdUnitIsNotFailed(ctx context.Context, s *Scenario, serviceName string) {
 	s.T.Helper()
 	command := []string{
diff --git a/parts/common/components.json b/parts/common/components.json
index 55cc839d581..140170da80e 100644
--- a/parts/common/components.json
+++ b/parts/common/components.json
@@ -914,6 +914,23 @@
       }
     }
   },
+  {
+    "name": "windows-exporter",
+    "windowsDownloadLocation": "c:\\akse-cache\\windows-exporter\\",
+    "downloadURIs": {
+      "windows": {
+        "default": {
+          "versionsV2": [
+            {
+              "renovateTag": 
"", + "latestVersion": "0.31.2" + } + ], + "downloadURL": "https://packages.aks.azure.com/dalec-packages/windows-exporter/${version}/windows/amd64/windows-exporter_${version}-1_amd64.zip" + } + } + } + }, { "name": "windows credential provider", "windowsDownloadLocation": "c:\\akse-cache\\credential-provider\\", diff --git a/parts/windows/kuberneteswindowssetup.ps1 b/parts/windows/kuberneteswindowssetup.ps1 index ba392f1f7a2..64f85097d46 100644 --- a/parts/windows/kuberneteswindowssetup.ps1 +++ b/parts/windows/kuberneteswindowssetup.ps1 @@ -325,6 +325,12 @@ if (Test-Path -Path 'c:\AzureData\windows\securetlsbootstrapfunc.ps1') { Write-Log "Windows Secure TLS Bootstrap function script not found, skipping dot-source" } +if (Test-Path -Path 'c:\AzureData\windows\windowsexporterfunc.ps1') { + . c:\AzureData\windows\windowsexporterfunc.ps1 +} else { + Write-Log "Windows Exporter function script not found, skipping dot-source" +} + if (Test-Path -Path 'c:\AzureData\windows\windowsciliumnetworkingfunc.ps1') { . c:\AzureData\windows\windowsciliumnetworkingfunc.ps1 } else { @@ -508,6 +514,16 @@ function BasePrep { Install-GmsaPlugin -GmsaPackageUrl $global:WindowsGmsaPackageUrl } + # Register aks-windows-exporter when its assets are baked into the VHD. + # Wrapped in Get-Command guard for bidirectional compat with older VHDs that don't + # carry windowsexporterfunc.ps1 in the CSE script package. 
+    if (Get-Command -Name Install-WindowsExporter -ErrorAction SilentlyContinue) {
+        Logs-To-Event -TaskName "AKS.WindowsCSE.InstallWindowsExporter" -TaskMessage "Install aks-windows-exporter if VHD-baked"
+        Install-WindowsExporter
+    } else {
+        Write-Log "Install-WindowsExporter not available; aks-vm-extension will manage windows-exporter"
+    }
+
     Write-Log "BasePrep completed successfully"
     Logs-To-Event -TaskName "AKS.WindowsCSE.BasePrep" -TaskMessage "BasePrep completed successfully"
 }
diff --git a/parts/windows/windowscsehelper.ps1 b/parts/windows/windowscsehelper.ps1
index 2fbac973d0e..79cca76dce3 100644
--- a/parts/windows/windowscsehelper.ps1
+++ b/parts/windows/windowscsehelper.ps1
@@ -88,9 +88,10 @@ $global:WINDOWS_CSE_ERROR_ORAS_PULL_CREDENTIAL_PROVIDER=81 # exit code for error
 $global:WINDOWS_CSE_ERROR_ORAS_PULL_POD_INFRA_CONTAINER=82 # exit code for error pulling pause image with oras from registry
 $global:WINDOWS_CSE_ERROR_NETWORK_ISOLATED_CLUSTER_CSE_NOT_CACHED=83 # exit code for cse of network isolated cluster not cached
 $global:WINDOWS_CSE_ERROR_ORAS_PULL_CONTAINERD=84 # exit code for error pulling containerd artifact with oras from registry
+$global:WINDOWS_CSE_ERROR_WINDOWS_EXPORTER_START_FAIL=85 # exit code for failure starting aks-windows-exporter during CSE
 # WINDOWS_CSE_ERROR_MAX_CODE is only used in unit tests to verify whether new error code name is added in $global:ErrorCodeNames
 # Please use the current value of WINDOWS_CSE_ERROR_MAX_CODE as the value of the new error code and increment it by 1
-$global:WINDOWS_CSE_ERROR_MAX_CODE=85
+$global:WINDOWS_CSE_ERROR_MAX_CODE=86
 
 # Please add new error code for downloading new packages in RP code too
 $global:ErrorCodeNames = @(
@@ -178,7 +179,8 @@ $global:ErrorCodeNames = @(
     "WINDOWS_CSE_ERROR_ORAS_PULL_CREDENTIAL_PROVIDER",
     "WINDOWS_CSE_ERROR_ORAS_PULL_POD_INFRA_CONTAINER",
     "WINDOWS_CSE_ERROR_NETWORK_ISOLATED_CLUSTER_CSE_NOT_CACHED",
-    "WINDOWS_CSE_ERROR_ORAS_PULL_CONTAINERD"
+    "WINDOWS_CSE_ERROR_ORAS_PULL_CONTAINERD",
+    "WINDOWS_CSE_ERROR_WINDOWS_EXPORTER_START_FAIL"
 )
 
 # The package domain to be used
diff --git a/parts/windows/windowsexporter/windows-exporter-config.yml b/parts/windows/windowsexporter/windows-exporter-config.yml
new file mode 100644
index 00000000000..48f837af622
--- /dev/null
+++ b/parts/windows/windowsexporter/windows-exporter-config.yml
@@ -0,0 +1,11 @@
+web:
+  listen-address: ":19182"
+collectors:
+  # explicitly enabled for version: https://github.com/prometheus-community/windows_exporter/tree/v0.31.2
+  # NOTE: `cpu_info`, `container` and `process` collectors are added to default collectors
+  enabled: "cpu,logical_disk,memory,net,os,physical_disk,service,system,cpu_info,container,process"
+collector:
+  service:
+    include: "aks-windows-exporter|kubelet|kubeproxy|containerd|hns|csi-proxy"
+log:
+  level: debug
diff --git a/parts/windows/windowsexporter/windows-exporter-health.ps1 b/parts/windows/windowsexporter/windows-exporter-health.ps1
new file mode 100644
index 00000000000..bd6f55ba401
--- /dev/null
+++ b/parts/windows/windowsexporter/windows-exporter-health.ps1
@@ -0,0 +1,44 @@
+
+function Invoke-WindowsExporterRequest { # GETs http://localhost:19182/<Path>; returns the body as a string, or "" on any error
+    param([Parameter(Mandatory=$true)][string]$Path)
+
+    try {
+        $response = Invoke-WebRequest -UseBasicParsing -Uri "http://localhost:19182/$Path" -TimeoutSec 10 -ErrorAction Stop
+        return [string]$response.Content
+    }
+    catch {
+        return "" # best-effort probe: callers treat "" as "not available"
+    }
+}
+
+function Get-Health { # returns the /health body when it contains "ok", otherwise ""
+    $result = Invoke-WindowsExporterRequest -Path "health"
+    if ($null -ne $result -and $result.Contains("ok")) {
+        return $result
+    } else {
+        return ""
+    }
+}
+
+function Get-Version { # returns the "version" value from the /version JSON body, or "" when it cannot be extracted
+    $result = Invoke-WindowsExporterRequest -Path "version"
+    if ($null -ne $result -and $result.Contains("version")) {
+        # {"version":"v0.25.1","revision":"f70fa009de541dc99ed210aa7e67c9550133ef02","branch":"HEAD","buildUser":"cloudtest@781d70d7c000002","buildDate":"20240223-08:06:57","goVersion":"go1.21.3"}
+        if ($result -match '"version"\s*:\s*"([^"]+)"') { return $Matches[1] } # capture only the value; -replace would hand back the whole body on a pattern miss
+        return ""
+    } else {
+        return ""
+    }
+}
+
+function Get-MetricsExample { # returns the last /metrics line mentioning windows_process_cpu_time_total, or ""
+    # The /metrics response can be very large on a production node; call this only when testing.
+    $result = Invoke-WindowsExporterRequest -Path "metrics"
+    $example = "windows_process_cpu_time_total"
+    if ($result -match $example) {
+        $example = $result -split "`n" | Where-Object {$_ -match $example} | Select-Object -Last 1
+        return $example
+    } else {
+        return ""
+    }
+}
diff --git a/staging/cse/windows/README b/staging/cse/windows/README
index 65e63d96c82..8512bfafb0b 100644
--- a/staging/cse/windows/README
+++ b/staging/cse/windows/README
@@ -29,7 +29,7 @@
     pushd aks-windows-cse
     unzip ../aks-windows-cse-scripts-*.zip
     rm ../*.zip
-    files=("azurecnifunc.ps1" "calicofunc.ps1" "configfunc.ps1" "containerdfunc.ps1" "containerdtemplate.toml" "kubeletfunc.ps1" "kubernetesfunc.ps1" "nvidiagpudriverfunc.ps1" "securetlsbootstrapfunc.ps1" "windowsciliumnetworkingfunc.ps1")
+    files=("azurecnifunc.ps1" "calicofunc.ps1" "configfunc.ps1" "containerdfunc.ps1" "containerdtemplate.toml" "kubeletfunc.ps1" "kubernetesfunc.ps1" "nvidiagpudriverfunc.ps1" "securetlsbootstrapfunc.ps1" "windowsciliumnetworkingfunc.ps1" "windowsexporterfunc.ps1")
     for file in ${files[@]}; do
         echo "Downloading $file from $url/$file"
         curl -O "$url/$file"
diff --git a/staging/cse/windows/kubernetesfunc.ps1 b/staging/cse/windows/kubernetesfunc.ps1
index a71e04daf6d..57a48a0fa77 100644
--- a/staging/cse/windows/kubernetesfunc.ps1
+++ b/staging/cse/windows/kubernetesfunc.ps1
@@ -166,6 +166,8 @@ function Update-DefenderPreferences {
     Add-MpPreference -ExclusionPath "C:\k\containerd.err.log"
     Add-MpPreference -ExclusionPath "C:\k\aks-windows-exporter.err.log"
     Add-MpPreference -ExclusionPath "C:\k\aks-windows-exporter.log"
+    Add-MpPreference -ExclusionPath "C:\k\windows-exporter.err.log"
+    Add-MpPreference -ExclusionPath "C:\k\windows-exporter.log"
 
     # Azure CNI
     Add-MpPreference -ExclusionProcess 
"C:\k\azurecni\bin\azure-cns.exe" diff --git a/staging/cse/windows/windowsexporterfunc.ps1 b/staging/cse/windows/windowsexporterfunc.ps1 new file mode 100644 index 00000000000..160bb29769b --- /dev/null +++ b/staging/cse/windows/windowsexporterfunc.ps1 @@ -0,0 +1,152 @@ +<# +.SYNOPSIS + Installs and starts the aks-windows-exporter service using assets baked into the VHD. + +.DESCRIPTION + Migrated from aks-vm-extension (see aks-windows-node-vm-extension/entrypoint.ps1). + Registers windows-exporter.exe as the Windows service "aks-windows-exporter" via NSSM, + matching the service name, port (19182), log paths, and NSSM settings the extension + used so existing customer dashboards/alerts continue to work. + + The function is guarded so it is a no-op when running on a VHD that does not carry + the exporter assets. In that case the aks-vm-extension install path continues to + handle the service (dual-mode coexistence). + + Coordination with aks-vm-extension: + - When C:\k\skip_vhd_windows_exporter exists, the extension's entrypoint.ps1 + short-circuits. The sentinel is created by the Windows VHD build once the binary + and config are staged. 
+#>
+
+$global:WindowsExporterInstallDir = "C:\k\windows-exporter"
+$global:WindowsExporterBinary = Join-Path $global:WindowsExporterInstallDir "windows-exporter.exe"
+$global:WindowsExporterConfig = Join-Path $global:WindowsExporterInstallDir "windows-exporter-config.yml"
+$global:WindowsExporterHealthScript = Join-Path $global:WindowsExporterInstallDir "windows-exporter-health.ps1"
+$global:WindowsExporterSkipFile = "C:\k\skip_vhd_windows_exporter"
+$global:WindowsExporterServiceName = "aks-windows-exporter"
+$global:WindowsExporterPort = 19182
+$global:WindowsExporterStdoutLog = "C:\k\windows-exporter.log"
+$global:WindowsExporterStderrLog = "C:\k\windows-exporter.err.log"
+$global:WindowsExporterNssm = "C:\k\nssm.exe"
+
+function Test-WindowsExporterHealth { # polls the exporter up to RetryCount + 1 times; returns $true once /health reports "ok", else $false
+    param(
+        [int]$RetryCount = 5,
+        [int]$RetryInterval = 5
+    )
+
+    if (Test-Path $global:WindowsExporterHealthScript) {
+        . $global:WindowsExporterHealthScript
+        for ($i = 0; $i -le $RetryCount; $i++) {
+            $healthResult = Get-Health
+            if ($healthResult -ne "") {
+                Write-Log "aks-windows-exporter health check passed: $healthResult"
+                $versionResult = Get-Version
+                if ($versionResult -ne "") {
+                    Write-Log "aks-windows-exporter version $versionResult"
+                }
+                return $true
+            }
+            if ($i -lt $RetryCount) { Start-Sleep -Seconds $RetryInterval } # no point sleeping after the final attempt
+        }
+
+        Write-Log "aks-windows-exporter health script check failed after $($RetryCount + 1) attempts"
+        return $false
+    }
+
+    Write-Log "windows-exporter health script not found at $($global:WindowsExporterHealthScript); falling back to direct health endpoint probe"
+    for ($i = 0; $i -le $RetryCount; $i++) {
+        $result = ""
+        try {
+            $response = Invoke-WebRequest -UseBasicParsing -Uri "http://localhost:$($global:WindowsExporterPort)/health" -TimeoutSec 10 -ErrorAction Stop
+            $result = [string]$response.Content
+        }
+        catch {
+            $result = "" # request failures count as "not healthy yet" and are retried
+        }
+        if ($null -ne $result -and $result.Contains("ok")) {
+            Write-Log "aks-windows-exporter health check passed: $result"
+            return $true
+        }
+        if ($i -lt $RetryCount) { Start-Sleep -Seconds $RetryInterval } # no point sleeping after the final attempt
+    }
+    Write-Log "aks-windows-exporter health check failed after $($RetryCount + 1) attempts"
+    return $false
+}
+
+function Install-WindowsExporter {
+    <#
+    .SYNOPSIS
+        Registers and starts the aks-windows-exporter NSSM service.
+
+    .NOTES
+        No-ops when:
+          - The VHD-build sentinel C:\k\skip_vhd_windows_exporter is absent
+            (older VHD without baked assets - aks-vm-extension still covers it).
+          - The windows-exporter binary is missing on disk (defensive).
+    #>
+
+    if (-not (Test-Path $global:WindowsExporterSkipFile)) {
+        Write-Log "skip_vhd_windows_exporter not present; aks-vm-extension will manage windows-exporter on this node"
+        return
+    }
+
+    if (-not (Test-Path $global:WindowsExporterBinary)) {
+        Write-Log "windows-exporter binary not found at $($global:WindowsExporterBinary); skipping install (older VHD?)"
+        return
+    }
+
+    if (-not (Test-Path $global:WindowsExporterConfig)) {
+        Write-Log "windows-exporter config not found at $($global:WindowsExporterConfig); skipping install"
+        return
+    }
+
+    if (-not (Test-Path $global:WindowsExporterHealthScript)) {
+        Write-Log "windows-exporter health script not found at $($global:WindowsExporterHealthScript); health validation will use direct endpoint probe"
+    }
+
+    if (-not (Test-Path $global:WindowsExporterNssm)) {
+        Write-Log "nssm.exe not found at $($global:WindowsExporterNssm); cannot install $($global:WindowsExporterServiceName)"
+        Set-ExitCode -ExitCode $global:WINDOWS_CSE_ERROR_WINDOWS_EXPORTER_START_FAIL -ErrorMessage "nssm.exe missing; cannot register aks-windows-exporter"
+        return
+    }
+
+    Write-Log "Ensuring $($global:WindowsExporterServiceName) is installed and running"
+
+    $appParameters = "--config.file=`"$($global:WindowsExporterConfig)`""
+
+    # NSSM settings mirror aks-vm-extension/aks-windows-node-vm-extension/entrypoint.ps1 Install-SystemService
+    # to preserve service behavior (logs, rotation, restart policy) that customers rely on.
+    $existingService = Get-Service $global:WindowsExporterServiceName -ErrorAction SilentlyContinue
+    if (-not $existingService) {
+        & $global:WindowsExporterNssm install $global:WindowsExporterServiceName $global:WindowsExporterBinary | Out-Null
+    } else {
+        Write-Log "$($global:WindowsExporterServiceName) is already registered; ensuring settings and running state"
+    }
+    & $global:WindowsExporterNssm set $global:WindowsExporterServiceName AppDirectory $global:WindowsExporterInstallDir | Out-Null
+    & $global:WindowsExporterNssm set $global:WindowsExporterServiceName AppParameters $appParameters | Out-Null
+    & $global:WindowsExporterNssm set $global:WindowsExporterServiceName DisplayName $global:WindowsExporterServiceName | Out-Null
+    & $global:WindowsExporterNssm set $global:WindowsExporterServiceName Description $global:WindowsExporterServiceName | Out-Null
+    & $global:WindowsExporterNssm set $global:WindowsExporterServiceName Start SERVICE_AUTO_START | Out-Null
+    & $global:WindowsExporterNssm set $global:WindowsExporterServiceName ObjectName LocalSystem | Out-Null
+    & $global:WindowsExporterNssm set $global:WindowsExporterServiceName Type SERVICE_WIN32_OWN_PROCESS | Out-Null
+    & $global:WindowsExporterNssm set $global:WindowsExporterServiceName AppRestartDelay 5000 | Out-Null
+    & $global:WindowsExporterNssm set $global:WindowsExporterServiceName AppThrottle 1500 | Out-Null
+    & $global:WindowsExporterNssm set $global:WindowsExporterServiceName AppStdout $global:WindowsExporterStdoutLog | Out-Null
+    & $global:WindowsExporterNssm set $global:WindowsExporterServiceName AppStderr $global:WindowsExporterStderrLog | Out-Null
+    & $global:WindowsExporterNssm set $global:WindowsExporterServiceName AppStdoutCreationDisposition 4 | Out-Null
+    & $global:WindowsExporterNssm set $global:WindowsExporterServiceName AppStderrCreationDisposition 4 | Out-Null
+    & $global:WindowsExporterNssm set $global:WindowsExporterServiceName AppRotateFiles 1 | Out-Null
+    & $global:WindowsExporterNssm set $global:WindowsExporterServiceName AppRotateOnline 1 | Out-Null
+    & $global:WindowsExporterNssm set $global:WindowsExporterServiceName AppRotateSeconds 86400 | Out-Null
+    & $global:WindowsExporterNssm set $global:WindowsExporterServiceName AppRotateBytes 10485760 | Out-Null
+
+    & $global:WindowsExporterNssm start $global:WindowsExporterServiceName | Out-Null
+
+    if (-not (Test-WindowsExporterHealth)) {
+        Set-ExitCode -ExitCode $global:WINDOWS_CSE_ERROR_WINDOWS_EXPORTER_START_FAIL -ErrorMessage "aks-windows-exporter failed to become healthy"
+        return
+    }
+
+    Write-Log "Ensured $($global:WindowsExporterServiceName) is installed and running"
+}
diff --git a/staging/cse/windows/windowsexporterfunc.tests.ps1 b/staging/cse/windows/windowsexporterfunc.tests.ps1
new file mode 100644
index 00000000000..71f96d73272
--- /dev/null
+++ b/staging/cse/windows/windowsexporterfunc.tests.ps1
@@ -0,0 +1,86 @@
+Describe 'Windows exporter CSE functions' {
+    BeforeAll {
+        . $PSScriptRoot\..\..\..\parts\windows\windowscsehelper.ps1
+        . 
$PSCommandPath.Replace('.tests.ps1','.ps1')
+
+        function Write-Log { # test stub: echoes log messages to the host
+            param($Message)
+            Write-Host "LOG: $Message"
+        }
+    }
+
+    Context 'Install-WindowsExporter' {
+        BeforeEach {
+            $script:LastExitCode = $null
+            $script:LastErrorMessage = $null
+
+            Mock Set-ExitCode -MockWith { # record the requested exit code and message in script scope for later assertions
+                param($ExitCode, $ErrorMessage)
+                $script:LastExitCode = $ExitCode
+                $script:LastErrorMessage = $ErrorMessage
+            }
+        }
+
+        It 'no-ops when the VHD sentinel is absent' {
+            Mock Test-Path -MockWith { return $false } # every path reports missing, including the sentinel
+
+            { Install-WindowsExporter } | Should -Not -Throw
+
+            Assert-MockCalled Set-ExitCode -Exactly -Times 0
+        }
+
+        It 'no-ops when the sentinel is present but the binary is absent' {
+            Mock Test-Path -MockWith {
+                param($Path)
+                return $Path -eq $global:WindowsExporterSkipFile # only the sentinel exists
+            }
+
+            { Install-WindowsExporter } | Should -Not -Throw
+
+            Assert-MockCalled Set-ExitCode -Exactly -Times 0
+        }
+
+        It 'sets the windows-exporter error code when nssm is absent after assets are present' {
+            Mock Test-Path -MockWith {
+                param($Path)
+                return $Path -ne $global:WindowsExporterNssm # everything exists except nssm.exe
+            }
+
+            { Install-WindowsExporter } | Should -Not -Throw
+
+            Assert-MockCalled Set-ExitCode -Exactly -Times 1 -ParameterFilter {
+                $ExitCode -eq $global:WINDOWS_CSE_ERROR_WINDOWS_EXPORTER_START_FAIL
+            }
+            $script:LastExitCode | Should -Be $global:WINDOWS_CSE_ERROR_WINDOWS_EXPORTER_START_FAIL
+        }
+    }
+
+    Context 'Test-WindowsExporterHealth' {
+        It 'uses the baked health script when it is present' {
+            $global:WindowsExporterHealthScript = Join-Path $TestDrive 'windows-exporter-health.ps1' # NOTE(review): overwrites the global and never restores it
+            @'
+function Get-Health {
+    return "ok"
+}
+
+function Get-Version {
+    return "v0.31.2"
+}
+'@ | Set-Content -Path $global:WindowsExporterHealthScript -Force
+
+            Test-WindowsExporterHealth -RetryCount 0 -RetryInterval 0 | Should -Be $true # RetryCount 0 means a single attempt
+        }
+
+        It 'uses a native PowerShell endpoint probe when the baked health script is absent' {
+            $global:WindowsExporterHealthScript = Join-Path $TestDrive 'missing-health.ps1' # file is never created, forcing the fallback probe
+
+            Mock Invoke-WebRequest -MockWith {
+                return @{ Content = 'ok' }
+            }
+
+            Test-WindowsExporterHealth -RetryCount 0 -RetryInterval 0 | Should -Be $true
+
+            Assert-MockCalled Invoke-WebRequest -Exactly -Times 1
+        }
+    }
+}
diff --git a/vhdbuilder/packer/test/windows-vhd-content-test.ps1 b/vhdbuilder/packer/test/windows-vhd-content-test.ps1
index 157fa4f598a..ffa45222571 100644
--- a/vhdbuilder/packer/test/windows-vhd-content-test.ps1
+++ b/vhdbuilder/packer/test/windows-vhd-content-test.ps1
@@ -604,6 +604,49 @@ function Test-ToolsToCacheOnVHD {
     }
 }
 
+function Test-WindowsExporterOnVHD {
+    # The Install-WindowsExporterOnVHD step in configure-windows-vhd.ps1 must have:
+    # 1. Extracted windows-exporter.exe into C:\k\windows-exporter
+    # 2. Placed windows-exporter-config.yml alongside it
+    # 3. Placed windows-exporter-health.ps1 alongside it
+    # 4. Created the sentinel file the aks-vm-extension honors to no-op
+    # The service is registered at CSE time (staging/cse/windows/windowsexporterfunc.ps1),
+    # so we intentionally do NOT expect the service to exist on the VHD itself.
+ $exporterDir = "C:\k\windows-exporter" + $expected = @( + (Join-Path $exporterDir "windows-exporter.exe"), + (Join-Path $exporterDir "windows-exporter-config.yml"), + (Join-Path $exporterDir "windows-exporter-health.ps1"), + "C:\k\skip_vhd_windows_exporter" + ) + + $missing = @() + foreach ($path in $expected) + { + if (-not (Test-Path -Path $path)) + { + $missing += $path + } + else + { + Write-OutputWithTimestamp "windows-exporter asset present: $path" + } + } + + if ($missing.Count -gt 0) + { + Write-ErrorWithTimestamp "Missing windows-exporter VHD assets: $($missing -join ', ')" + exit 1 + } + + $svc = Get-Service "aks-windows-exporter" -ErrorAction SilentlyContinue + if ($svc) + { + Write-ErrorWithTimestamp "Service aks-windows-exporter should not be registered on the VHD (CSE registers it at provisioning); found state: $($svc.Status)" + exit 1 + } +} + function Test-ExpandVolumeTask { $osDrive = ((Get-WmiObject Win32_OperatingSystem -ErrorAction Stop).SystemDrive).TrimEnd(":") $osDisk = Get-Partition -DriveLetter $osDrive | Get-Disk @@ -760,6 +803,9 @@ Test-WindowsDefenderPlatformUpdate Write-OutputWithTimestamp "Test: ToolsToCacheOnVHD" Test-ToolsToCacheOnVHD +Write-OutputWithTimestamp "Test: WindowsExporterOnVHD" +Test-WindowsExporterOnVHD + Write-OutputWithTimestamp "Test: ExpandVolumeTask" Test-ExpandVolumeTask diff --git a/vhdbuilder/packer/windows/configure-windows-vhd.ps1 b/vhdbuilder/packer/windows/configure-windows-vhd.ps1 index 3bef0c3ad59..3cb47973fcf 100644 --- a/vhdbuilder/packer/windows/configure-windows-vhd.ps1 +++ b/vhdbuilder/packer/windows/configure-windows-vhd.ps1 @@ -760,6 +760,85 @@ function Set-WinRmServiceAutoStart sc.exe config winrm start=auto } +function Install-WindowsExporterOnVHD +{ + # Stage windows-exporter assets into C:\k\windows-exporter so the CSE-time + # Install-WindowsExporter function (staging/cse/windows/windowsexporterfunc.ps1) + # can register the aks-windows-exporter service at node provisioning. 
+ # + # Migrated from aks-vm-extension; the sentinel file is the coordination hook + # that tells the extension to no-op on nodes built from this VHD. Returns early (nothing staged, no sentinel) when the cache directory or a matching zip is missing; throws when the binary, config, or health script cannot be found. + $exporterCacheDir = "c:\akse-cache\windows-exporter" + $exporterInstallDir = "C:\k\windows-exporter" + $exporterConfigSrc = "c:\k\windows-exporter-config.yml" + $exporterHealthSrc = "c:\k\windows-exporter-health.ps1" + $exporterSentinel = "C:\k\skip_vhd_windows_exporter" + + if (-not (Test-Path $exporterCacheDir)) + { + Write-Log "windows-exporter cache directory not found at $exporterCacheDir; skipping VHD install" + return + } + + $exporterZip = Get-ChildItem -Path $exporterCacheDir -Filter "windows-exporter_*_amd64.zip" -File | + Sort-Object -Property Name -Descending | + Select-Object -First 1 # NOTE(review): zips are ordered by Name as strings, so a 0.9.x zip would sort above a 0.10.x one; confirm only one zip is ever cached + if (-not $exporterZip) + { + Write-Log "No windows-exporter zip found under $exporterCacheDir; skipping VHD install" + return + } + + New-Item -ItemType Directory -Path $exporterInstallDir -Force | Out-Null + + Write-Log "Extracting $($exporterZip.FullName) to $exporterInstallDir" + Add-Type -AssemblyName System.IO.Compression.FileSystem + # Clean any prior extraction to avoid zip-extract "already exists" failures on re-runs; removal errors are suppressed, so this cleanup is best effort + Get-ChildItem -Path $exporterInstallDir -Force -ErrorAction SilentlyContinue | + Remove-Item -Recurse -Force -ErrorAction SilentlyContinue + [System.IO.Compression.ZipFile]::ExtractToDirectory($exporterZip.FullName, $exporterInstallDir) + + $exporterBinary = Join-Path $exporterInstallDir "windows-exporter.exe" + if (-not (Test-Path $exporterBinary)) + { + # Some zip payloads nest the binary under a subfolder; flatten into install dir. Only the first windows-exporter.exe found is moved; other extracted files stay where they are.
+ $nestedBinary = Get-ChildItem -Path $exporterInstallDir -Filter "windows-exporter.exe" -Recurse -File | + Select-Object -First 1 + if ($nestedBinary) + { + Move-Item -Path $nestedBinary.FullName -Destination $exporterBinary -Force + } + } + if (-not (Test-Path $exporterBinary)) + { + throw "windows-exporter.exe not found after extracting $($exporterZip.Name)" + } + + if (Test-Path $exporterConfigSrc) + { + Copy-Item -Path $exporterConfigSrc -Destination (Join-Path $exporterInstallDir "windows-exporter-config.yml") -Force + } + else + { + throw "windows-exporter config not staged at $exporterConfigSrc" + } + + if (Test-Path $exporterHealthSrc) + { + Copy-Item -Path $exporterHealthSrc -Destination (Join-Path $exporterInstallDir "windows-exporter-health.ps1") -Force + } + else + { + throw "windows-exporter health script not staged at $exporterHealthSrc" + } + + # Create the sentinel last so partial installs don't appear complete to CSE. The missing-asset branches above throw, and the missing cache or zip branches return, before this line is reached. + New-Item -ItemType File -Path $exporterSentinel -Force | Out-Null + + LogFilesInDirectory $exporterInstallDir + Write-Log "windows-exporter staged on VHD; sentinel $exporterSentinel created" +} + function Set-WinRmServiceDelayedStart { # Hyper-V messes with networking components on startup after the feature is enabled @@ -1044,6 +1123,7 @@ try Get-ToolsToVHD Get-PrivatePackagesToCacheOnVHD Install-WindowsCiliumNetworking + Install-WindowsExporterOnVHD # Update all the registry keys again in case the steps in between reset them. Ok, some of the steps in between do reset them. But there's a risk that the steps also need # the keys set.
So we kinda have to do both now :cry: Update-Registry diff --git a/vhdbuilder/packer/windows/windows-vhd-builder-sig.json b/vhdbuilder/packer/windows/windows-vhd-builder-sig.json index 1437c8b65d9..bec6cc8ba49 100644 --- a/vhdbuilder/packer/windows/windows-vhd-builder-sig.json +++ b/vhdbuilder/packer/windows/windows-vhd-builder-sig.json @@ -88,7 +88,9 @@ "parts/common/components.json", "vhdbuilder/packer/windows/windows-vhd-configuration.ps1", "vhdbuilder/packer/windows/windows_settings.json", - "vhdbuilder/packer/windows/components_json_helpers.ps1" + "vhdbuilder/packer/windows/components_json_helpers.ps1", + "parts/windows/windowsexporter/windows-exporter-config.yml", + "parts/windows/windowsexporter/windows-exporter-health.ps1" ], "destination": "c:/k/" },