Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion e2e/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ func mustGetNewRSAKeyPair() ([]byte, []byte, string) {

privateKeyFileName, err := writePrivateKeyToTempFile(privatePEMBytes)
if err != nil {
panic(fmt.Sprintf("failed to write private key to temp file: %w", err))
panic(fmt.Sprintf("failed to write private key to temp file: %v", err))
}

return privatePEMBytes, publicKeyBytes, privateKeyFileName
Expand Down
18 changes: 18 additions & 0 deletions e2e/scenario_win_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ func Test_Windows2022_AzureNetwork(t *testing.T) {
ValidateCiliumIsNotRunningWindows(ctx, s)
ValidateDotnetNotInstalledWindows(ctx, s)
ValidateWindowsSystemServicesRestartConfiguration(ctx, s)
ValidateWindowsExporter(ctx, s)
ValidateCollectWindowsLogsScript(ctx, s)
},
},
Expand All @@ -91,6 +92,7 @@ func Test_Windows2022AzureOverlayNetworkDualStack(t *testing.T) {
ValidateWindowsProcessHasCliArguments(ctx, s, "kubelet.exe", []string{"--rotate-certificates=true", "--client-ca-file=c:\\k\\ca.crt"})
ValidateCiliumIsNotRunningWindows(ctx, s)
ValidateWindowsSystemServicesRestartConfiguration(ctx, s)
ValidateWindowsExporter(ctx, s)
ValidateCollectWindowsLogsScript(ctx, s)
},
},
Expand All @@ -115,6 +117,7 @@ func Test_Windows2022Gen2AzureNetwork(t *testing.T) {
ValidateDotnetNotInstalledWindows(ctx, s)
ValidateFileHasContent(ctx, s, "/AzureData/CustomDataSetupScript.log", "CSEScriptsPackageUrl used for provision is https://packages.aks.azure.com/aks/windows/cse/aks-windows-cse-scripts-current.zip")
ValidateWindowsSystemServicesRestartConfiguration(ctx, s)
ValidateWindowsExporter(ctx, s)
ValidateCollectWindowsLogsScript(ctx, s)
},
},
Expand All @@ -139,6 +142,7 @@ func Test_Windows2022Gen2AzureOverlayNetworkDualStack(t *testing.T) {
ValidateCiliumIsNotRunningWindows(ctx, s)
ValidateFileHasContent(ctx, s, "/AzureData/CustomDataSetupScript.log", "CSEScriptsPackageUrl used for provision is https://packages.aks.azure.com/aks/windows/cse/aks-windows-cse-scripts-current.zip")
ValidateWindowsSystemServicesRestartConfiguration(ctx, s)
ValidateWindowsExporter(ctx, s)
ValidateCollectWindowsLogsScript(ctx, s)
},
},
Expand All @@ -161,6 +165,7 @@ func Test_Windows23H2AzureNetwork(t *testing.T) {
ValidateWindowsProcessHasCliArguments(ctx, s, "kubelet.exe", []string{"--rotate-certificates=true", "--client-ca-file=c:\\k\\ca.crt"})
ValidateCiliumIsNotRunningWindows(ctx, s)
ValidateWindowsSystemServicesRestartConfiguration(ctx, s)
ValidateWindowsExporter(ctx, s)
ValidateCollectWindowsLogsScript(ctx, s)
},
},
Expand All @@ -184,6 +189,7 @@ func Test_Windows23H2AzureOverlayNetworkDualStack(t *testing.T) {
ValidateWindowsProcessHasCliArguments(ctx, s, "kubelet.exe", []string{"--rotate-certificates=true", "--client-ca-file=c:\\k\\ca.crt"})
ValidateCiliumIsNotRunningWindows(ctx, s)
ValidateWindowsSystemServicesRestartConfiguration(ctx, s)
ValidateWindowsExporter(ctx, s)
ValidateCollectWindowsLogsScript(ctx, s)
},
},
Expand All @@ -207,6 +213,7 @@ func Test_Windows23H2Gen2AzureNetwork(t *testing.T) {
ValidateCiliumIsNotRunningWindows(ctx, s)
ValidateFileHasContent(ctx, s, "/AzureData/CustomDataSetupScript.log", "CSEScriptsPackageUrl used for provision is https://packages.aks.azure.com/aks/windows/cse/aks-windows-cse-scripts-current.zip")
ValidateWindowsSystemServicesRestartConfiguration(ctx, s)
ValidateWindowsExporter(ctx, s)
ValidateCollectWindowsLogsScript(ctx, s)
},
},
Expand All @@ -231,6 +238,7 @@ func Test_Windows23H2Gen2AzureOverlayDualStack(t *testing.T) {
ValidateCiliumIsNotRunningWindows(ctx, s)
ValidateFileHasContent(ctx, s, "/AzureData/CustomDataSetupScript.log", "CSEScriptsPackageUrl used for provision is https://packages.aks.azure.com/aks/windows/cse/aks-windows-cse-scripts-current.zip")
ValidateWindowsSystemServicesRestartConfiguration(ctx, s)
ValidateWindowsExporter(ctx, s)
ValidateCollectWindowsLogsScript(ctx, s)
},
},
Expand Down Expand Up @@ -294,6 +302,7 @@ func Test_Windows2025(t *testing.T) {
ValidateCiliumIsNotRunningWindows(ctx, s)
ValidateDotnetNotInstalledWindows(ctx, s)
ValidateWindowsSystemServicesRestartConfiguration(ctx, s)
ValidateWindowsExporter(ctx, s)
ValidateCollectWindowsLogsScript(ctx, s)
},
},
Expand All @@ -319,6 +328,7 @@ func Test_Windows2025Gen2(t *testing.T) {
ValidateCiliumIsNotRunningWindows(ctx, s)
ValidateDotnetNotInstalledWindows(ctx, s)
ValidateWindowsSystemServicesRestartConfiguration(ctx, s)
ValidateWindowsExporter(ctx, s)
ValidateCollectWindowsLogsScript(ctx, s)
},
},
Expand Down Expand Up @@ -378,6 +388,7 @@ func Test_Windows2022_SecureTLSBootstrapping_BootstrapToken_Fallback(t *testing.
ValidateCiliumIsNotRunningWindows(ctx, s)
ValidateDotnetNotInstalledWindows(ctx, s)
ValidateWindowsSystemServicesRestartConfiguration(ctx, s)
ValidateWindowsExporter(ctx, s)
ValidateCollectWindowsLogsScript(ctx, s)
},
},
Expand Down Expand Up @@ -434,6 +445,7 @@ func Test_Windows2022_VHDCaching(t *testing.T) {
ValidateCiliumIsNotRunningWindows(ctx, s)
ValidateDotnetNotInstalledWindows(ctx, s)
ValidateWindowsSystemServicesRestartConfiguration(ctx, s)
ValidateWindowsExporter(ctx, s)
ValidateCollectWindowsLogsScript(ctx, s)
},
},
Expand All @@ -460,6 +472,7 @@ func Test_Windows2022Gen2_k8s_133(t *testing.T) {
ValidateCiliumIsNotRunningWindows(ctx, s)
ValidateDotnetNotInstalledWindows(ctx, s)
ValidateWindowsSystemServicesRestartConfiguration(ctx, s)
ValidateWindowsExporter(ctx, s)
ValidateCollectWindowsLogsScript(ctx, s)
},
},
Expand All @@ -483,6 +496,7 @@ func Test_Windows23H2_Cilium2(t *testing.T) {
ValidateWindowsProcessHasCliArguments(ctx, s, "kubelet.exe", []string{"--rotate-certificates=true", "--client-ca-file=c:\\k\\ca.crt"})
ValidateCiliumIsRunningWindows(ctx, s)
ValidateWindowsSystemServicesRestartConfiguration(ctx, s)
ValidateWindowsExporter(ctx, s)
ValidateCollectWindowsLogsScript(ctx, s)
},
},
Expand All @@ -506,6 +520,7 @@ func Test_Windows23H2Gen2_WindowsCiliumNetworking(t *testing.T) {
Validator: func(ctx context.Context, s *Scenario) {
ValidateWindowsCiliumIsRunning(ctx, s)
ValidateWindowsSystemServicesRestartConfiguration(ctx, s)
ValidateWindowsExporter(ctx, s)
ValidateCollectWindowsLogsScript(ctx, s)
},
},
Expand Down Expand Up @@ -534,6 +549,7 @@ func Test_Windows2022_McrChinaCloud_Windows(t *testing.T) {
`https://mcr.azk8s.cn`)
ValidateDotnetNotInstalledWindows(ctx, s)
ValidateWindowsSystemServicesRestartConfiguration(ctx, s)
ValidateWindowsExporter(ctx, s)
ValidateCollectWindowsLogsScript(ctx, s)
},
},
Expand Down Expand Up @@ -570,6 +586,7 @@ func Test_Windows2025Gen2_McrChinaCloud_Windows(t *testing.T) {
`C:\ProgramData\containerd\certs.d\mcr.azk8s.cn\hosts.toml`,
`https://mcr.azk8s.cn`)
ValidateWindowsSystemServicesRestartConfiguration(ctx, s)
ValidateWindowsExporter(ctx, s)
},
},
})
Expand Down Expand Up @@ -616,6 +633,7 @@ func Test_NetworkIsolatedCluster_Windows_WithEgress(t *testing.T) {
ValidateFileDoesNotExist(ctx, s, `C:\ProgramData\containerd\certs.d\mcr.azk8s.cn\hosts.toml`)
ValidateDotnetNotInstalledWindows(ctx, s)
ValidateWindowsSystemServicesRestartConfiguration(ctx, s)
ValidateWindowsExporter(ctx, s)
},
},
})
Expand Down
60 changes: 60 additions & 0 deletions e2e/validators.go
Original file line number Diff line number Diff line change
Expand Up @@ -805,6 +805,66 @@ func ValidateWindowsSystemServicesRestartConfiguration(ctx context.Context, s *S
ValidateWindowsSystemServiceRestartConfiguration(ctx, s, "kubeproxy")
}

// ValidateWindowsExporter asserts that the aks-windows-exporter service registered by
// staging/cse/windows/windowsexporterfunc.ps1 is running and serving Prometheus metrics.
//
// When the VHD does not carry the windows-exporter assets (older VHDs where the
// aks-vm-extension still installs the service), the sentinel file is absent and we
// skip the validation - the extension owns the service in that mode and AgentBaker
// has no guarantee about the service state at this point in provisioning.
func ValidateWindowsExporter(ctx context.Context, s *Scenario) {
s.T.Helper()

const (
sentinel = `C:\k\skip_vhd_windows_exporter`
binary = `C:\k\windows-exporter\windows-exporter.exe`
configFile = `C:\k\windows-exporter\windows-exporter-config.yml`
serviceName = "aks-windows-exporter"
metricsURL = "http://localhost:19182/metrics"
)

sentinelCheck := []string{
"$ErrorActionPreference = \"Stop\"",
fmt.Sprintf("if (-not (Test-Path '%s')) { Write-Output 'SKIP'; exit 0 }", sentinel),
"Write-Output 'PRESENT'",
}
res := execScriptOnVMForScenario(ctx, s, strings.Join(sentinelCheck, "\n"))
if strings.Contains(res.stdout, "SKIP") {
s.T.Logf("Skipping aks-windows-exporter validation: sentinel %s not found (aks-vm-extension manages the service on this VHD)", sentinel)
return
}
Comment thread
chmill-zz marked this conversation as resolved.

s.T.Logf("skip_vhd_windows_exporter sentinel present, validating aks-windows-exporter installation")

command := []string{
"$ErrorActionPreference = \"Stop\"",
fmt.Sprintf("if (-not (Test-Path '%s')) { throw 'missing binary: %s' }", binary, binary),
fmt.Sprintf("if (-not (Test-Path '%s')) { throw 'missing config: %s' }", configFile, configFile),
fmt.Sprintf("$svc = Get-Service -Name '%s'", serviceName),
"Write-Output $svc",
fmt.Sprintf("if ($svc.Status -ne 'Running') { throw \"service %s is not running: $($svc.Status)\" }", serviceName),
fmt.Sprintf("if ($svc.StartType -ne 'Automatic') { throw \"service %s StartType is $($svc.StartType), expected Automatic\" }", serviceName),
// Hit the metrics endpoint and require a windows-exporter-specific metric.
fmt.Sprintf("$resp = Invoke-WebRequest -UseBasicParsing -Uri '%s' -TimeoutSec 10", metricsURL),
"$failureReasons = @()",
"if ($resp.StatusCode -ne 200) { $failureReasons += \"metrics endpoint returned $($resp.StatusCode)\" }",
"if ($resp.Content -notmatch 'windows_os_info') { $failureReasons += 'windows_os_info metric missing from /metrics response' }",
"if ($resp.Content -notmatch 'windows_cpu_time_total') { $failureReasons += 'windows_cpu_time_total metric missing from /metrics response' }",
"if ($failureReasons.Count -gt 0) {",
" Write-Output \"metrics endpoint returned status $($resp.StatusCode) with $($resp.Content.Length) bytes\"",
" Write-Output ('metrics validation failures: ' + ($failureReasons -join '; '))",
" Write-Output '--- begin /metrics response ---'",
" Write-Output $resp.Content",
" Write-Output '--- end /metrics response ---'",
Comment thread
chmill-zz marked this conversation as resolved.
" throw ($failureReasons -join '; ')",
"}",
"Write-Output \"metrics endpoint returned status $($resp.StatusCode) with $($resp.Content.Length) bytes\"",
}
validationResult := execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0,
fmt.Sprintf("aks-windows-exporter validation failed on %s", s.Runtime.VM.PrivateIP))
s.T.Logf("aks-windows-exporter validation succeeded on %s: service is Running/Automatic and %s contains windows_os_info and windows_cpu_time_total\n%s", s.Runtime.VM.PrivateIP, metricsURL, strings.TrimSpace(validationResult.stdout))
}

func ValidateSystemdUnitIsNotFailed(ctx context.Context, s *Scenario, serviceName string) {
s.T.Helper()
command := []string{
Expand Down
17 changes: 17 additions & 0 deletions parts/common/components.json
Original file line number Diff line number Diff line change
Expand Up @@ -914,6 +914,23 @@
}
}
},
{
"name": "windows-exporter",
Comment thread
chmill-zz marked this conversation as resolved.
"windowsDownloadLocation": "c:\\akse-cache\\windows-exporter\\",
"downloadURIs": {
"windows": {
"default": {
"versionsV2": [
{
"renovateTag": "<DO_NOT_UPDATE>",
"latestVersion": "0.31.2"
}
],
"downloadURL": "https://packages.aks.azure.com/dalec-packages/windows-exporter/${version}/windows/amd64/windows-exporter_${version}-1_amd64.zip"
}
}
}
},
{
"name": "windows credential provider",
"windowsDownloadLocation": "c:\\akse-cache\\credential-provider\\",
Expand Down
16 changes: 16 additions & 0 deletions parts/windows/kuberneteswindowssetup.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,12 @@ if (Test-Path -Path 'c:\AzureData\windows\securetlsbootstrapfunc.ps1') {
Write-Log "Windows Secure TLS Bootstrap function script not found, skipping dot-source"
}

if (Test-Path -Path 'c:\AzureData\windows\windowsexporterfunc.ps1') {
. c:\AzureData\windows\windowsexporterfunc.ps1
} else {
Write-Log "Windows Exporter function script not found, skipping dot-source"
}

if (Test-Path -Path 'c:\AzureData\windows\windowsciliumnetworkingfunc.ps1') {
. c:\AzureData\windows\windowsciliumnetworkingfunc.ps1
} else {
Expand Down Expand Up @@ -508,6 +514,16 @@ function BasePrep {
Install-GmsaPlugin -GmsaPackageUrl $global:WindowsGmsaPackageUrl
}

# Register aks-windows-exporter when its assets are baked into the VHD.
# Wrapped in Get-Command guard for bidirectional compat with older VHDs that don't
# carry windowsexporterfunc.ps1 in the CSE script package.
if (Get-Command -Name Install-WindowsExporter -ErrorAction SilentlyContinue) {
Logs-To-Event -TaskName "AKS.WindowsCSE.InstallWindowsExporter" -TaskMessage "Install aks-windows-exporter if VHD-baked"
Install-WindowsExporter
} else {
Write-Log "Install-WindowsExporter not available; aks-vm-extension will manage windows-exporter"
}

Write-Log "BasePrep completed successfully"
Logs-To-Event -TaskName "AKS.WindowsCSE.BasePrep" -TaskMessage "BasePrep completed successfully"
}
Expand Down
6 changes: 4 additions & 2 deletions parts/windows/windowscsehelper.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,10 @@ $global:WINDOWS_CSE_ERROR_ORAS_PULL_CREDENTIAL_PROVIDER=81 # exit code for error
$global:WINDOWS_CSE_ERROR_ORAS_PULL_POD_INFRA_CONTAINER=82 # exit code for error pulling pause image with oras from registry
$global:WINDOWS_CSE_ERROR_NETWORK_ISOLATED_CLUSTER_CSE_NOT_CACHED=83 # exit code for cse of network isolated cluster not cached
$global:WINDOWS_CSE_ERROR_ORAS_PULL_CONTAINERD=84 # exit code for error pulling containerd artifact with oras from registry
$global:WINDOWS_CSE_ERROR_WINDOWS_EXPORTER_START_FAIL=85 # exit code for failure starting aks-windows-exporter during CSE
# WINDOWS_CSE_ERROR_MAX_CODE is only used in unit tests to verify whether new error code name is added in $global:ErrorCodeNames
# Please use the current value of WINDOWS_CSE_ERROR_MAX_CODE as the value of the new error code and increment it by 1
$global:WINDOWS_CSE_ERROR_MAX_CODE=85
$global:WINDOWS_CSE_ERROR_MAX_CODE=86

# Please add new error code for downloading new packages in RP code too
$global:ErrorCodeNames = @(
Expand Down Expand Up @@ -178,7 +179,8 @@ $global:ErrorCodeNames = @(
"WINDOWS_CSE_ERROR_ORAS_PULL_CREDENTIAL_PROVIDER",
"WINDOWS_CSE_ERROR_ORAS_PULL_POD_INFRA_CONTAINER",
"WINDOWS_CSE_ERROR_NETWORK_ISOLATED_CLUSTER_CSE_NOT_CACHED",
"WINDOWS_CSE_ERROR_ORAS_PULL_CONTAINERD"
"WINDOWS_CSE_ERROR_ORAS_PULL_CONTAINERD",
"WINDOWS_CSE_ERROR_WINDOWS_EXPORTER_START_FAIL"
)

# The package domain to be used
Expand Down
11 changes: 11 additions & 0 deletions parts/windows/windowsexporter/windows-exporter-config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
web:
listen-address: ":19182"
collectors:
# explicitly enabled for version: https://github.com/prometheus-community/windows_exporter/tree/v0.31.2
# NOTE: `cpu_info`, `container` and `process` collectors are added to default collectors
enabled: "cpu,logical_disk,memory,net,os,physical_disk,service,system,cpu_info,container,process"
collector:
service:
include: "aks-windows-exporter|kubelet|kubeproxy|containerd|hns|csi-proxy"
log:
level: debug
Comment thread
chmill-zz marked this conversation as resolved.
Comment thread
chmill-zz marked this conversation as resolved.
44 changes: 44 additions & 0 deletions parts/windows/windowsexporter/windows-exporter-health.ps1
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@

function Invoke-WindowsExporterRequest {
    # Performs a GET against the local windows-exporter endpoint (port 19182) and
    # returns the response content as a string. Any failure is swallowed on purpose
    # and reported to the caller as an empty string.
    param([Parameter(Mandatory=$true)][string]$Path)

    $body = ""
    $uri = "http://localhost:19182/$Path"

    try {
        $reply = Invoke-WebRequest -UseBasicParsing -Uri $uri -TimeoutSec 10 -ErrorAction Stop
        $body = [string]$reply.Content
    }
    catch {
        # Best-effort probe: callers treat "" as "no answer".
        $body = ""
    }

    return $body
}

function Get-Health {
    # Returns the body of the /health endpoint when it contains "ok"; otherwise "".
    $body = Invoke-WindowsExporterRequest -Path "health"

    # Guard clause: no body, or a body without the "ok" marker, counts as unhealthy.
    if ($null -eq $body -or -not $body.Contains("ok")) {
        return ""
    }

    return $body
}

function Get-Version {
    # Returns the value of the "version" field reported by the /version endpoint,
    # or "" when the endpoint is unreachable or the field cannot be extracted.
    #
    # Example response:
    # {"version":"v0.25.1","revision":"f70fa009de541dc99ed210aa7e67c9550133ef02","branch":"HEAD","buildUser":"cloudtest@781d70d7c000002","buildDate":"20240223-08:06:57","goVersion":"go1.21.3"}
    $result = Invoke-WindowsExporterRequest -Path "version"
    if ($null -eq $result -or -not $result.Contains("version")) {
        return ""
    }

    # Extract with -match and a capture group instead of -replace: -replace hands back
    # its input unchanged when the pattern does not match, which made the whole response
    # body look like a version string for pretty-printed or otherwise reformatted JSON.
    # The leading quote keeps "goVersion" from matching; \s* tolerates optional spaces.
    if ($result -match '"version"\s*:\s*"([^"]+)"') {
        return $Matches[1]
    }

    return ""
}

function Get-MetricsExample {
    # Returns the last line of the /metrics response that mentions
    # windows_process_cpu_time_total, or "" when the metric is absent.
    # The /metrics payload can be very large on a production node, so this is
    # intended to be called only while testing.
    $metricName = "windows_process_cpu_time_total"
    $body = Invoke-WindowsExporterRequest -Path "metrics"

    if ($body -notmatch $metricName) {
        return ""
    }

    $lastSample = $body -split "`n" |
        Where-Object { $_ -match $metricName } |
        Select-Object -Last 1
    return $lastSample
}
2 changes: 1 addition & 1 deletion staging/cse/windows/README
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ pushd aks-windows-cse
unzip ../aks-windows-cse-scripts-*.zip
rm ../*.zip

files=("azurecnifunc.ps1" "calicofunc.ps1" "configfunc.ps1" "containerdfunc.ps1" "containerdtemplate.toml" "kubeletfunc.ps1" "kubernetesfunc.ps1" "nvidiagpudriverfunc.ps1" "securetlsbootstrapfunc.ps1" "windowsciliumnetworkingfunc.ps1")
files=("azurecnifunc.ps1" "calicofunc.ps1" "configfunc.ps1" "containerdfunc.ps1" "containerdtemplate.toml" "kubeletfunc.ps1" "kubernetesfunc.ps1" "nvidiagpudriverfunc.ps1" "securetlsbootstrapfunc.ps1" "windowsciliumnetworkingfunc.ps1" "windowsexporterfunc.ps1")
for file in ${files[@]}; do
echo "Downloading $file from $url/$file"
curl -O "$url/$file"
Expand Down
2 changes: 2 additions & 0 deletions staging/cse/windows/kubernetesfunc.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,8 @@ function Update-DefenderPreferences {
Add-MpPreference -ExclusionPath "C:\k\containerd.err.log"
Add-MpPreference -ExclusionPath "C:\k\aks-windows-exporter.err.log"
Add-MpPreference -ExclusionPath "C:\k\aks-windows-exporter.log"
Add-MpPreference -ExclusionPath "C:\k\windows-exporter.err.log"
Add-MpPreference -ExclusionPath "C:\k\windows-exporter.log"

# Azure CNI
Add-MpPreference -ExclusionProcess "C:\k\azurecni\bin\azure-cns.exe"
Expand Down
Loading
Loading