From 0e25cf312befbd3c0b0ae428f0ab62084dbbe241 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 12 Mar 2026 17:07:29 -0700 Subject: [PATCH 01/70] feat: bump windows image version for 2026-03B (#8074) Co-authored-by: Jane Jung Co-authored-by: janenotjung-hue <107402425+janenotjung-hue@users.noreply.github.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: aks-node-assistant[bot] <190555641+aks-node-assistant[bot]@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani From 97de410a046cfaac8977543ada5c523c1f7762bc Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Sun, 15 Mar 2026 22:08:49 -0700 Subject: [PATCH 02/70] feat(rcv1p): unify cert bootstrap flow and add Windows CA refresh task https://eng.ms/docs/products/onecert-certificates-key-vault-and-dsms/onecert-customer-guide/autorotationandecr/overviewrcv https://eng.ms/docs/products/onecert-certificates-key-vault-and-dsms/onecert-customer-guide/autorotationandecr/rcv1ptsg cse_cmd.sh.gtpl: derive cert endpoint mode from target cloud and always run custom-cloud init script. cse_cmd.sh: same mode logic as template; remove LOCATION export. init-aks-custom-cloud.sh: merged legacy + operation-requests logic into one script with distro-aware cert install paths. parts/linux/cloud-init/artifacts/init-aks-custom-cloud-mariner.sh: removed (merged into unified script). parts/linux/cloud-init/artifacts/init-aks-custom-cloud-operation-requests.sh: removed (merged into unified script). parts/linux/cloud-init/artifacts/init-aks-custom-cloud-operation-requests-mariner.sh: removed (merged into unified script). const.go: keep only unified custom-cloud init script constant. variables.go: simplify script selection to always use unified init script. kubernetesfunc.ps1: add location-aware CA retrieval (legacy/rcv1p) and scheduled refresh task registration helper. kuberneteswindowssetup.ps1: pass location to CA retrieval and register refresh task for custom cloud. 
Signed-off-by: Ramkumar Chinchani --- aks-node-controller/parser/helper.go | 7 +- .../parser/templates/cse_cmd.sh.gtpl | 1 + .../init-aks-custom-cloud-mariner.sh | 186 --------- ...custom-cloud-operation-requests-mariner.sh | 236 ------------ ...nit-aks-custom-cloud-operation-requests.sh | 346 ----------------- .../artifacts/init-aks-custom-cloud.sh | 358 ++++++++++++++++-- parts/windows/kuberneteswindowssetup.ps1 | 4 +- pkg/agent/const.go | 9 +- pkg/agent/variables.go | 19 +- staging/cse/windows/kubernetesfunc.ps1 | 132 +++++-- 10 files changed, 454 insertions(+), 844 deletions(-) delete mode 100644 parts/linux/cloud-init/artifacts/init-aks-custom-cloud-mariner.sh delete mode 100644 parts/linux/cloud-init/artifacts/init-aks-custom-cloud-operation-requests-mariner.sh delete mode 100644 parts/linux/cloud-init/artifacts/init-aks-custom-cloud-operation-requests.sh diff --git a/aks-node-controller/parser/helper.go b/aks-node-controller/parser/helper.go index 99c69c7aa4f..cddd58e7f08 100644 --- a/aks-node-controller/parser/helper.go +++ b/aks-node-controller/parser/helper.go @@ -64,6 +64,7 @@ func getFuncMap() template.FuncMap { return template.FuncMap{ "getInitAKSCustomCloudFilepath": getInitAKSCustomCloudFilepath, "getIsAksCustomCloud": getIsAksCustomCloud, + "getCloudLocation": getCloudLocation, } } @@ -538,11 +539,15 @@ func getIsAksCustomCloud(customCloudConfig *aksnodeconfigv1.CustomCloudConfig) b return strings.EqualFold(customCloudConfig.GetCustomCloudEnvName(), helpers.AksCustomCloudName) } +func getCloudLocation(v *aksnodeconfigv1.Configuration) string { + return strings.ToLower(strings.Join(strings.Fields(v.GetClusterConfig().GetLocation()), "")) +} + /* GetCloudTargetEnv determines and returns whether the region is a sovereign cloud which have their own data compliance regulations (China/Germany/USGov) or standard. */ // Azure public cloud. 
func getCloudTargetEnv(v *aksnodeconfigv1.Configuration) string { - loc := strings.ToLower(strings.Join(strings.Fields(v.GetClusterConfig().GetLocation()), "")) + loc := getCloudLocation(v) switch { case strings.HasPrefix(loc, "china"): return "AzureChinaCloud" diff --git a/aks-node-controller/parser/templates/cse_cmd.sh.gtpl b/aks-node-controller/parser/templates/cse_cmd.sh.gtpl index b1359b071d9..d685a3444da 100644 --- a/aks-node-controller/parser/templates/cse_cmd.sh.gtpl +++ b/aks-node-controller/parser/templates/cse_cmd.sh.gtpl @@ -3,4 +3,5 @@ echo $(date),$(hostname) > ${PROVISION_OUTPUT}; REPO_DEPOT_ENDPOINT="{{.CustomCloudConfig.RepoDepotEndpoint}}" {{getInitAKSCustomCloudFilepath}} >> /var/log/azure/cluster-provision.log 2>&1; {{end}} +LOCATION="{{getCloudLocation .}}" /usr/bin/nohup /bin/bash -c "/bin/bash /opt/azure/containers/provision_start.sh" diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-mariner.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-mariner.sh deleted file mode 100644 index 587da9ba270..00000000000 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-mariner.sh +++ /dev/null @@ -1,186 +0,0 @@ -#!/bin/bash -set -x -mkdir -p /root/AzureCACertificates - -IS_MARINER=0 -IS_AZURELINUX=0 -# shellcheck disable=SC3010 -if [[ -f /etc/os-release ]]; then - . 
/etc/os-release - # shellcheck disable=SC3010 - if [[ $NAME == *"Mariner"* ]]; then - IS_MARINER=1 - elif [[ $NAME == *"Microsoft Azure Linux"* ]]; then - IS_AZURELINUX=1 - else - echo "Unknown Linux distribution" - exit 1 - fi -else - echo "Unsupported operating system" - exit 1 -fi - -echo "distribution is $distribution" -echo "Running on $NAME" - -# http://168.63.129.16 is a constant for the host's wireserver endpoint -certs=$(curl "http://168.63.129.16/machine?comp=acmspackage&type=cacertificates&ext=json") -IFS_backup=$IFS -IFS=$'\r\n' -certNames=($(echo $certs | grep -oP '(?<=Name\": \")[^\"]*')) -certBodies=($(echo $certs | grep -oP '(?<=CertBody\": \")[^\"]*')) -for i in ${!certBodies[@]}; do - echo ${certBodies[$i]} | sed 's/\\r\\n/\n/g' | sed 's/\\//g' > "/root/AzureCACertificates/$(echo ${certNames[$i]} | sed 's/.cer/.crt/g')" -done -IFS=$IFS_backup - -cp /root/AzureCACertificates/*.crt /etc/pki/ca-trust/source/anchors/ -/usr/bin/update-ca-trust - -# This section creates a cron job to poll for refreshed CA certs daily -# It can be removed if not needed or desired -action=${1:-init} -if [ "$action" = "ca-refresh" ]; then - exit -fi - -scriptPath=$0 -# Determine an absolute, canonical path to this script for use in cron. -if command -v readlink >/dev/null 2>&1; then - # Use readlink -f when available to resolve the canonical path; fall back to $0 on error. - scriptPath="$(readlink -f "$0" 2>/dev/null || printf '%s' "$0")" -fi - -if ! crontab -l 2>/dev/null | grep -q "\"$scriptPath\" ca-refresh"; then - # Quote the script path in the cron entry to avoid issues with spaces or special characters. - if ! 
(crontab -l 2>/dev/null ; printf '%s\n' "0 19 * * * \"$scriptPath\" ca-refresh") | crontab -; then - echo "Failed to install ca-refresh cron job via crontab" >&2 - fi -fi - -cloud-init status --wait - -function init_mariner_repo_depot { - local repodepot_endpoint=$1 - echo "Adding [extended] repo" - cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-extended.repo - sed -i -e "s|extras|extended|" /etc/yum.repos.d/mariner-extended.repo - sed -i -e "s|Extras|Extended|" /etc/yum.repos.d/mariner-extended.repo - - echo "Adding [nvidia] repo" - cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-nvidia.repo - sed -i -e "s|extras|nvidia|" /etc/yum.repos.d/mariner-nvidia.repo - sed -i -e "s|Extras|Nvidia|" /etc/yum.repos.d/mariner-nvidia.repo - - echo "Adding [cloud-native] repo" - cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-cloud-native.repo - sed -i -e "s|extras|cloud-native|" /etc/yum.repos.d/mariner-cloud-native.repo - sed -i -e "s|Extras|Cloud-Native|" /etc/yum.repos.d/mariner-cloud-native.repo - - echo "Pointing Mariner repos at RepoDepot..." - for f in /etc/yum.repos.d/*.repo - do - sed -i -e "s|https://packages.microsoft.com|${repodepot_endpoint}/mariner/packages.microsoft.com|" $f - echo "$f modified." - done - echo "Mariner repo setup complete." 
-} - -function init_azurelinux_repo_depot { - local repodepot_endpoint=$1 - repos=("amd" "base" "cloud-native" "extended" "ms-non-oss" "ms-oss" "nvidia") - - # tbd maybe we do this a bit nicer - rm -f /etc/yum.repos.d/azurelinux* - - for repo in "${repos[@]}"; do - output_file="/etc/yum.repos.d/azurelinux-${repo}.repo" - repo_content=( - "[azurelinux-official-$repo]" - "name=Azure Linux Official $repo \$releasever \$basearch" - "baseurl=$repodepot_endpoint/azurelinux/\$releasever/prod/$repo/\$basearch" - "gpgkey=file:///etc/pki/rpm-gpg/MICROSOFT-RPM-GPG-KEY" - "gpgcheck=1" - "repo_gpgcheck=1" - "enabled=1" - "skip_if_unavailable=True" - "sslverify=1" - ) - - rm -f "$output_file" - - for line in "${repo_content[@]}"; do - echo "$line" >> "$output_file" - done - - echo "File '$output_file' has been created." - done - echo "Azure Linux repo setup complete." -} - -dnf_makecache() { - local retries=10 - local dnf_makecache_output=/tmp/dnf-makecache.out - local i - for i in $(seq 1 $retries); do - ! (dnf makecache -y 2>&1 | tee $dnf_makecache_output | grep -E "^([WE]:.*)|([eE]rr.*)$") && \ - cat $dnf_makecache_output && break || \ - cat $dnf_makecache_output - if [ $i -eq $retries ]; then - return 1 - else sleep 5 - fi - done - echo "Executed dnf makecache -y $i times" -} - -marinerRepoDepotEndpoint="$(echo "${REPO_DEPOT_ENDPOINT}" | sed 's/\/ubuntu//')" -if [ -z "$marinerRepoDepotEndpoint" ]; then - >&2 echo "repo depot endpoint empty while running custom-cloud init script" -else - # logic taken from https://repodepot.azure.com/scripts/cloud-init/setup_repodepot.sh - if [ "$IS_MARINER" -eq 1 ]; then - echo "Initializing Mariner repo depot settings..." - init_mariner_repo_depot ${marinerRepoDepotEndpoint} - dnf_makecache || exit 1 - elif [ "$IS_AZURELINUX" -eq 1 ]; then - echo "Initializing Azure Linux repo depot settings..." 
- init_azurelinux_repo_depot ${marinerRepoDepotEndpoint} - dnf_makecache || exit 1 - else - echo "No customizations for distribution: $NAME" - fi -fi - -# Set the chrony config to use the PHC /dev/ptp0 clock -cat > /etc/chrony.conf < "/root/AzureCACertificates/$cert_filename" - echo "Successfully saved certificate: $cert_filename" - else - echo "Warning: Failed to retrieve certificate content for $cert_filename" - fi - done -} - -# Process root certificates -process_cert_operations "operationrequestsroot" - -# Process intermediate certificates -process_cert_operations "operationrequestsintermediate" - -# Copy all certificate files to the Mariner/AzureLinux system certificate directory -cp /root/AzureCACertificates/*.crt /etc/pki/ca-trust/source/anchors/ - -# Update the system certificate store using Mariner/AzureLinux command -/usr/bin/update-ca-trust - -# This section creates a cron job to poll for refreshed CA certs daily -# It can be removed if not needed or desired -action=${1:-init} -if [ "$action" = "ca-refresh" ]; then - exit -fi - -scriptPath=$0 -# Determine an absolute, canonical path to this script for use in cron. -if command -v readlink >/dev/null 2>&1; then - # Use readlink -f when available to resolve the canonical path; fall back to $0 on error. - scriptPath="$(readlink -f "$0" 2>/dev/null || printf '%s' "$0")" -fi - -if ! crontab -l 2>/dev/null | grep -q "\"$scriptPath\" ca-refresh"; then - # Quote the script path in the cron entry to avoid issues with spaces or special characters. - if ! 
(crontab -l 2>/dev/null ; printf '%s\n' "0 19 * * * \"$scriptPath\" ca-refresh") | crontab -; then - echo "Failed to install ca-refresh cron job via crontab" >&2 - fi -fi - -function init_mariner_repo_depot { - local repodepot_endpoint=$1 - echo "Adding [extended] repo" - cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-extended.repo - sed -i -e "s|extras|extended|" /etc/yum.repos.d/mariner-extended.repo - sed -i -e "s|Extras|Extended|" /etc/yum.repos.d/mariner-extended.repo - - echo "Adding [nvidia] repo" - cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-nvidia.repo - sed -i -e "s|extras|nvidia|" /etc/yum.repos.d/mariner-nvidia.repo - sed -i -e "s|Extras|Nvidia|" /etc/yum.repos.d/mariner-nvidia.repo - - echo "Adding [cloud-native] repo" - cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-cloud-native.repo - sed -i -e "s|extras|cloud-native|" /etc/yum.repos.d/mariner-cloud-native.repo - sed -i -e "s|Extras|Cloud-Native|" /etc/yum.repos.d/mariner-cloud-native.repo - - echo "Pointing Mariner repos at RepoDepot..." - for f in /etc/yum.repos.d/*.repo - do - sed -i -e "s|https://packages.microsoft.com|${repodepot_endpoint}/mariner/packages.microsoft.com|" $f - echo "$f modified." - done - echo "Mariner repo setup complete." 
-} - -function init_azurelinux_repo_depot { - local repodepot_endpoint=$1 - repos=("amd" "base" "cloud-native" "extended" "ms-non-oss" "ms-oss" "nvidia") - - # tbd maybe we do this a bit nicer - rm -f /etc/yum.repos.d/azurelinux* - - for repo in "${repos[@]}"; do - output_file="/etc/yum.repos.d/azurelinux-${repo}.repo" - repo_content=( - "[azurelinux-official-$repo]" - "name=Azure Linux Official $repo \$releasever \$basearch" - "baseurl=$repodepot_endpoint/azurelinux/\$releasever/prod/$repo/\$basearch" - "gpgkey=file:///etc/pki/rpm-gpg/MICROSOFT-RPM-GPG-KEY" - "gpgcheck=1" - "repo_gpgcheck=1" - "enabled=1" - "skip_if_unavailable=True" - "sslverify=1" - ) - - rm -f "$output_file" - - for line in "${repo_content[@]}"; do - echo "$line" >> "$output_file" - done - - echo "File '$output_file' has been created." - done -} - -cloud-init status --wait - -dnf_makecache() { - local retries=10 - local dnf_makecache_output=/tmp/dnf-makecache.out - local i - for i in $(seq 1 $retries); do - ! (dnf makecache -y 2>&1 | tee $dnf_makecache_output | grep -E "^([WE]:.*)|([eE]rr.*)$") && \ - cat $dnf_makecache_output && break || \ - cat $dnf_makecache_output - if [ $i -eq $retries ]; then - return 1 - else sleep 5 - fi - done - echo "Executed dnf makecache -y $i times" -} - -marinerRepoDepotEndpoint="$(echo "${REPO_DEPOT_ENDPOINT}" | sed 's/\/ubuntu//')" -if [ -z "$marinerRepoDepotEndpoint" ]; then - >&2 echo "repo depot endpoint empty while running custom-cloud init script" -else - # logic taken from https://repodepot.azure.com/scripts/cloud-init/setup_repodepot.sh - if [ "$IS_MARINER" -eq 1 ]; then - echo "Initializing Mariner repo depot settings..." - init_mariner_repo_depot ${marinerRepoDepotEndpoint} - dnf_makecache || exit 1 - elif [ "$IS_AZURELINUX" -eq 1 ]; then - echo "Initializing Azure Linux repo depot settings..." 
- init_azurelinux_repo_depot ${marinerRepoDepotEndpoint} - dnf_makecache || exit 1 - else - echo "No customizations for distribution: $NAME" - fi -fi - -#EOF diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-operation-requests.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-operation-requests.sh deleted file mode 100644 index 99ae86d0242..00000000000 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-operation-requests.sh +++ /dev/null @@ -1,346 +0,0 @@ -#!/bin/bash -set -x -mkdir -p /root/AzureCACertificates - -IS_FLATCAR=0 -IS_UBUNTU=0 -IS_ACL=0 -# shellcheck disable=SC3010 -if [[ -f /etc/os-release ]]; then - . /etc/os-release - # shellcheck disable=SC3010 - if [[ $NAME == *"Ubuntu"* ]]; then - IS_UBUNTU=1 - elif [[ $ID == *"flatcar"* ]]; then - IS_FLATCAR=1 - elif [[ $ID == "azurecontainerlinux" ]] || { [[ $ID == "azurelinux" ]] && [[ ${VARIANT_ID:-} == "azurecontainerlinux" ]]; }; then - IS_ACL=1 - else - echo "Unknown Linux distribution" - exit 1 - fi -else - echo "Unsupported operating system" - exit 1 -fi - -echo "distribution is $distribution" -echo "Running on $NAME" - -# http://168.63.129.16 is a constant for the host's wireserver endpoint -WIRESERVER_ENDPOINT="http://168.63.129.16" - -# Function to make HTTP request with retry logic for rate limiting -make_request_with_retry() { - local url="$1" - local max_retries=10 - local retry_delay=3 - local attempt=1 - - local response - while [ $attempt -le $max_retries ]; do - response=$(curl -f --no-progress-meter "$url") - local request_status=$? 
- - if echo "$response" | grep -q "RequestRateLimitExceeded"; then - sleep $retry_delay - retry_delay=$((retry_delay * 2)) - attempt=$((attempt + 1)) - elif [ $request_status -ne 0 ]; then - sleep $retry_delay - attempt=$((attempt + 1)) - else - echo "$response" - return 0 - fi - done - - echo "exhausted all retries, last response: $response" - return 1 -} - -# Function to process certificate operations from a given endpoint -process_cert_operations() { - local endpoint_type="$1" - local operation_response - - echo "Retrieving certificate operations for type: $endpoint_type" - operation_response=$(make_request_with_retry "${WIRESERVER_ENDPOINT}/machine?comp=acmspackage&type=$endpoint_type&ext=json") - local request_status=$? - if [ -z "$operation_response" ] || [ $request_status -ne 0 ]; then - echo "Warning: No response received or request failed for: ${WIRESERVER_ENDPOINT}/machine?comp=acmspackage&type=$endpoint_type&ext=json" - return - fi - - # Extract ResourceFileName values from the JSON response - local cert_filenames - mapfile -t cert_filenames < <(echo "$operation_response" | grep -oP '(?<="ResouceFileName": ")[^"]*') - - if [ ${#cert_filenames[@]} -eq 0 ]; then - echo "No certificate filenames found in response for $endpoint_type" - return - fi - - # Process each certificate file - for cert_filename in "${cert_filenames[@]}"; do - echo "Processing certificate file: $cert_filename" - - # Extract filename and extension - local filename="${cert_filename%.*}" - local extension="${cert_filename##*.}" - - echo "Downloading certificate: filename=$filename, extension=$extension" - - # Retrieve the actual certificate content with retry logic - local cert_content - cert_content=$(make_request_with_retry "${WIRESERVER_ENDPOINT}/machine?comp=acmspackage&type=$filename&ext=$extension") - local request_status=$? 
- if [ -z "$cert_content" ] || [ $request_status -ne 0 ]; then - echo "Warning: No response received or request failed for: ${WIRESERVER_ENDPOINT}/machine?comp=acmspackage&type=$filename&ext=$extension" - continue - fi - - if [ -n "$cert_content" ]; then - # Save the certificate to the appropriate location - echo "$cert_content" > "/root/AzureCACertificates/$cert_filename" - echo "Successfully saved certificate: $cert_filename" - else - echo "Warning: Failed to retrieve certificate content for $cert_filename" - fi - done -} - -# Process root certificates -process_cert_operations "operationrequestsroot" - -# Process intermediate certificates -process_cert_operations "operationrequestsintermediate" - -if [ "$IS_ACL" -eq 1 ]; then - cp /root/AzureCACertificates/*.crt /etc/pki/ca-trust/source/anchors/ - update-ca-trust -elif [ "${IS_FLATCAR}" -eq 0 ]; then - # Copy all certificate files to the system certificate directory - cp /root/AzureCACertificates/*.crt /usr/local/share/ca-certificates/ - - # Update the system certificate store - update-ca-certificates - - # This copies the updated bundle to the location used by OpenSSL which is commonly used - cp /etc/ssl/certs/ca-certificates.crt /usr/lib/ssl/cert.pem -else - for cert in /root/AzureCACertificates/*.crt; do - destcert="${cert##*/}" - destcert="${destcert%.*}.pem" - cp "$cert" /etc/ssl/certs/"$destcert" - done - update-ca-certificates -fi - - - -# This section creates a cron job to poll for refreshed CA certs daily -# It can be removed if not needed or desired -action=${1:-init} -if [ "$action" = "ca-refresh" ]; then - exit -fi - -function init_ubuntu_main_repo_depot { - local repodepot_endpoint="$1" - # Initialize directory for keys - mkdir -p /etc/apt/keyrings - - # This copies the updated bundle to the location used by OpenSSL which is commonly used - echo "Copying updated bundle to OpenSSL .pem file..." - cp /etc/ssl/certs/ca-certificates.crt /usr/lib/ssl/cert.pem - echo "Updated bundle copied." 
- - # Back up sources.list and sources.list.d contents - mkdir -p /etc/apt/backup/ - if [ -f "/etc/apt/sources.list" ]; then - mv /etc/apt/sources.list /etc/apt/backup/ - fi - for sources_file in /etc/apt/sources.list.d/*; do - if [ -f "$sources_file" ]; then - mv "$sources_file" /etc/apt/backup/ - fi - done - - # Set location of sources file - . /etc/os-release - aptSourceFile="/etc/apt/sources.list.d/ubuntu.sources" - - # Create main sources file - cat < /etc/apt/sources.list.d/ubuntu.sources - -Types: deb -URIs: ${repodepot_endpoint}/ubuntu -Suites: ${VERSION_CODENAME} ${VERSION_CODENAME}-updates ${VERSION_CODENAME}-backports ${VERSION_CODENAME}-security -Components: main universe restricted multiverse -Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg -EOF - - # Update the apt sources file using the RepoDepot Ubuntu URL for this cloud. Update it by replacing - # all urls with the RepoDepot Ubuntu url - ubuntuUrl=${repodepot_endpoint}/ubuntu - echo "Converting URLs in $aptSourceFile to RepoDepot URLs..." - sed -i "s,https\?://.[^ ]*,$ubuntuUrl,g" $aptSourceFile - echo "apt source URLs converted, see new file below:" - echo "" - echo "-----" - cat $aptSourceFile - echo "-----" - echo "" -} - -function check_url { - local url=$1 - echo "Checking url: $url" - - # Use curl to check the URL and capture both stdout and stderr - curl_exit_code=$(curl -s --head --request GET $url) - # Check the exit status of curl - # shellcheck disable=SC3010 - if [[ $? -ne 0 ]] || echo "$curl_exit_code" | grep -E "404 Not Found" > /dev/null; then - echo "ERROR: $url is not available. 
Please manually check if the url is valid before re-running script" - exit 1 - fi -} - -function write_to_sources_file { - local sources_list_d_file=$1 - local source_uri=$2 - shift 2 - local key_paths=("$@") - - sources_file_path="/etc/apt/sources.list.d/${sources_list_d_file}.sources" - ubuntuDist=$(lsb_release -c | awk '{print $2}') - - tee -a $sources_file_path < /dev/null - echo "$key_name key added to keyring." -} - -function derive_key_paths { - local key_names=("$@") - local key_paths=() - - for key_name in "${key_names[@]}"; do - key_paths+=("/etc/apt/keyrings/${key_name}.gpg") - done - - echo "${key_paths[*]}" -} - -function add_ms_keys { - # Add the Microsoft package server keys to keyring. - echo "Adding Microsoft keys to keyring..." - - add_key_ubuntu microsoft.asc - add_key_ubuntu msopentech.asc -} - -function aptget_update { - echo "apt-get updating..." - echo "note: depending on how many sources have been added this may take a couple minutes..." - if apt-get update | grep -q "404 Not Found"; then - echo "ERROR: apt-get update failed to find all sources. Please validate the sources or remove bad sources from your sources and try again." - exit 1 - else - echo "apt-get update complete!" - fi -} - -function init_ubuntu_pmc_repo_depot { - local repodepot_endpoint="$1" - # Add Microsoft packages source to the azure specific sources.list. - echo "Adding the packages.microsoft.com Ubuntu-$ubuntuRel repo..." - - microsoftPackageSource="$repodepot_endpoint/microsoft/ubuntu/$ubuntuRel/prod" - check_url $microsoftPackageSource - write_to_sources_file microsoft-prod $microsoftPackageSource $(derive_key_paths microsoft.asc msopentech.asc) - write_to_sources_file microsoft-prod-testing $microsoftPackageSource $(derive_key_paths microsoft.asc msopentech.asc) - echo "Ubuntu ($ubuntuRel) repo added." 
- echo "Adding packages.microsoft.com keys" - add_ms_keys $repodepot_endpoint -} - -if [ "$IS_UBUNTU" -eq 1 ]; then - scriptPath=$0 - # Determine an absolute, canonical path to this script for use in cron. - if command -v readlink >/dev/null 2>&1; then - # Use readlink -f when available to resolve the canonical path; fall back to $0 on error. - scriptPath="$(readlink -f "$0" 2>/dev/null || printf '%s' "$0")" - fi - - if ! crontab -l 2>/dev/null | grep -q "\"$scriptPath\" ca-refresh"; then - # Quote the script path in the cron entry to avoid issues with spaces or special characters. - if ! (crontab -l 2>/dev/null ; printf '%s\n' "0 19 * * * \"$scriptPath\" ca-refresh") | crontab -; then - echo "Failed to install ca-refresh cron job via crontab" >&2 - fi - fi - - cloud-init status --wait - rootRepoDepotEndpoint="$(echo "${REPO_DEPOT_ENDPOINT}" | sed 's/\/ubuntu//')" - # logic taken from https://repodepot.azure.com/scripts/cloud-init/setup_repodepot.sh - ubuntuRel=$(lsb_release --release | awk '{print $2}') - ubuntuDist=$(lsb_release -c | awk '{print $2}') - # initialize archive.ubuntu.com repo - init_ubuntu_main_repo_depot ${rootRepoDepotEndpoint} - init_ubuntu_pmc_repo_depot ${rootRepoDepotEndpoint} - # update apt list - echo "Running apt-get update" - aptget_update -elif [ "$IS_FLATCAR" -eq 1 ] || [ "$IS_ACL" -eq 1 ]; then - script_path="$(readlink -f "$0")" - svc="/etc/systemd/system/azure-ca-refresh.service" - tmr="/etc/systemd/system/azure-ca-refresh.timer" - - cat >"$svc" <"$tmr" < "/root/AzureCACertificates/$(echo ${certNames[$i]} | sed "s/.cer/.${ext}/g")" -done -IFS=$IFS_backup +WIRESERVER_ENDPOINT="http://168.63.129.16" + +function make_request_with_retry { + local url="$1" + local max_retries=10 + local retry_delay=3 + local attempt=1 + + local response + while [ $attempt -le $max_retries ]; do + response=$(curl -f --no-progress-meter "$url") + local request_status=$? 
+ + if echo "$response" | grep -q "RequestRateLimitExceeded"; then + sleep $retry_delay + retry_delay=$((retry_delay * 2)) + attempt=$((attempt + 1)) + elif [ $request_status -ne 0 ]; then + sleep $retry_delay + attempt=$((attempt + 1)) + else + echo "$response" + return 0 + fi + done -if [ "$IS_ACL" -eq 1 ]; then - cp /root/AzureCACertificates/*.crt /etc/pki/ca-trust/source/anchors/ - update-ca-trust -elif [ "$IS_FLATCAR" -eq 1 ]; then - cp /root/AzureCACertificates/*.pem /etc/ssl/certs/ - update-ca-certificates -else - cp /root/AzureCACertificates/*.crt /usr/local/share/ca-certificates/ - update-ca-certificates + echo "exhausted all retries, last response: $response" + return 1 +} - # This copies the updated bundle to the location used by OpenSSL which is commonly used - cp /etc/ssl/certs/ca-certificates.crt /usr/lib/ssl/cert.pem +function is_opted_in_for_root_certs { + local opt_in_response + + opt_in_response=$(make_request_with_retry "${WIRESERVER_ENDPOINT}/acms/isOptedInForRootCerts") + local request_status=$? 
+ if [ $request_status -ne 0 ] || [ -z "$opt_in_response" ]; then + echo "Warning: failed to determine IsOptedInForRootCerts state" + return 1 + fi + + if echo "$opt_in_response" | grep -q "IsOptedInForRootCerts=true"; then + echo "IsOptedInForRootCerts=true" + return 0 + fi + + echo "Skipping custom cloud root cert installation because IsOptedInForRootCerts is not true" + return 1 +} + +function get_trust_store_dir { + if [ "$IS_ACL" -eq 1 ] || [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 ]; then + echo "/etc/pki/ca-trust/source/anchors" + elif [ "$IS_FLATCAR" -eq 1 ]; then + echo "/etc/ssl/certs" + else + echo "/usr/local/share/ca-certificates" + fi +} + +function debug_print_trust_store { + local stage="$1" + local trust_store_dir + + trust_store_dir=$(get_trust_store_dir) + echo "Trust store contents ${stage} cert copy: ${trust_store_dir}" + ls -al "$trust_store_dir" || true +} + +function retrieve_legacy_certs { + local certs + local cert_names + local cert_bodies + local i + + certs=$(make_request_with_retry "${WIRESERVER_ENDPOINT}/machine?comp=acmspackage&type=cacertificates&ext=json") + if [ -z "$certs" ]; then + echo "Warning: failed to retrieve legacy custom cloud certificates" + return 1 + fi + + IFS_backup=$IFS + IFS=$'\r\n' + cert_names=($(echo $certs | grep -oP '(?<=Name\": \")[^\"]*')) + cert_bodies=($(echo $certs | grep -oP '(?<=CertBody\": \")[^\"]*')) + for i in ${!cert_bodies[@]}; do + echo ${cert_bodies[$i]} | sed 's/\\r\\n/\n/g' | sed 's/\\//g' > "/root/AzureCACertificates/$(echo ${cert_names[$i]} | sed 's/.cer/.crt/g')" + done + IFS=$IFS_backup +} + +function process_cert_operations { + local endpoint_type="$1" + local operation_response + + echo "Retrieving certificate operations for type: $endpoint_type" + operation_response=$(make_request_with_retry "${WIRESERVER_ENDPOINT}/machine?comp=acmspackage&type=$endpoint_type&ext=json") + local request_status=$? 
+ if [ -z "$operation_response" ] || [ $request_status -ne 0 ]; then + echo "Warning: No response received or request failed for: ${WIRESERVER_ENDPOINT}/machine?comp=acmspackage&type=$endpoint_type&ext=json" + return 1 + fi + + local cert_filenames + mapfile -t cert_filenames < <(echo "$operation_response" | grep -oP '(?<="ResouceFileName": ")[^"]*') + + if [ ${#cert_filenames[@]} -eq 0 ]; then + echo "No certificate filenames found in response for $endpoint_type" + return 1 + fi + + for cert_filename in "${cert_filenames[@]}"; do + echo "Processing certificate file: $cert_filename" + + local filename="${cert_filename%.*}" + local extension="${cert_filename##*.}" + local cert_content + + cert_content=$(make_request_with_retry "${WIRESERVER_ENDPOINT}/machine?comp=acmspackage&type=$filename&ext=$extension") + local request_status=$? + if [ -z "$cert_content" ] || [ $request_status -ne 0 ]; then + echo "Warning: No response received or request failed for: ${WIRESERVER_ENDPOINT}/machine?comp=acmspackage&type=$filename&ext=$extension" + continue + fi + + echo "$cert_content" > "/root/AzureCACertificates/$cert_filename" + echo "Successfully saved certificate: $cert_filename" + done +} + +function retrieve_rcv1p_certs { + process_cert_operations "operationrequestsroot" || return 1 + process_cert_operations "operationrequestsintermediate" || return 1 +} + +function install_certs_to_trust_store { + mkdir -p /root/AzureCACertificates + + debug_print_trust_store "before" + + if [ "$IS_ACL" -eq 1 ] || [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 ]; then + cp /root/AzureCACertificates/*.crt /etc/pki/ca-trust/source/anchors/ + update-ca-trust + elif [ "$IS_FLATCAR" -eq 1 ]; then + for cert in /root/AzureCACertificates/*.crt; do + destcert="${cert##*/}" + destcert="${destcert%.*}.pem" + cp "$cert" /etc/ssl/certs/"$destcert" + done + update-ca-certificates + else + cp /root/AzureCACertificates/*.crt /usr/local/share/ca-certificates/ + update-ca-certificates + + # This 
copies the updated bundle to the location used by OpenSSL which is commonly used + cp /etc/ssl/certs/ca-certificates.crt /usr/lib/ssl/cert.pem + fi + + debug_print_trust_store "after" +} + +# Certificate refresh behavior summary: +# - legacy mode directly attempts certificate download from wireserver and only in ussec and usnat regions. +# - rcv1p mode first checks IsOptedInForRootCerts, then downloads only when opted in. +# - Wireserver failures are treated as non-fatal, and cert trust-store updates are skipped gracefully. + +location_normalized="${LOCATION,,}" +location_normalized="${location_normalized//[[:space:]]/}" +if [ -z "$location_normalized" ]; then + echo "Warning: LOCATION is empty; defaulting custom cloud certificate endpoint mode to rcv1p" +fi + +cert_endpoint_mode="rcv1p" +case "$location_normalized" in + ussec*|usnat*) cert_endpoint_mode="legacy" ;; +esac +echo "Using custom cloud certificate endpoint mode: ${cert_endpoint_mode}" +rm -f /root/AzureCACertificates/* +if [ "$cert_endpoint_mode" = "legacy" ]; then + if retrieve_legacy_certs; then + install_certs_to_trust_store + else + echo "Warning: failed to retrieve legacy certificates from wireserver; continuing without trust store updates" + fi +elif [ "$cert_endpoint_mode" = "rcv1p" ]; then + if is_opted_in_for_root_certs; then + if retrieve_rcv1p_certs; then + install_certs_to_trust_store + else + echo "Warning: failed to retrieve rcv1p certificates from wireserver; continuing without trust store updates" + fi + fi fi # This section creates a cron job to poll for refreshed CA certs daily @@ -201,7 +371,80 @@ function init_ubuntu_pmc_repo_depot { add_ms_keys $repodepot_endpoint } -if [ "$IS_UBUNTU" -eq 1 ]; then +function init_mariner_repo_depot { + local repodepot_endpoint=$1 + echo "Adding [extended] repo" + cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-extended.repo + sed -i -e "s|extras|extended|" /etc/yum.repos.d/mariner-extended.repo + sed -i -e "s|Extras|Extended|" 
/etc/yum.repos.d/mariner-extended.repo + + echo "Adding [nvidia] repo" + cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-nvidia.repo + sed -i -e "s|extras|nvidia|" /etc/yum.repos.d/mariner-nvidia.repo + sed -i -e "s|Extras|Nvidia|" /etc/yum.repos.d/mariner-nvidia.repo + + echo "Adding [cloud-native] repo" + cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-cloud-native.repo + sed -i -e "s|extras|cloud-native|" /etc/yum.repos.d/mariner-cloud-native.repo + sed -i -e "s|Extras|Cloud-Native|" /etc/yum.repos.d/mariner-cloud-native.repo + + echo "Pointing Mariner repos at RepoDepot..." + for f in /etc/yum.repos.d/*.repo; do + sed -i -e "s|https://packages.microsoft.com|${repodepot_endpoint}/mariner/packages.microsoft.com|" $f + echo "$f modified." + done + echo "Mariner repo setup complete." +} + +function init_azurelinux_repo_depot { + local repodepot_endpoint=$1 + local repos=("amd" "base" "cloud-native" "extended" "ms-non-oss" "ms-oss" "nvidia") + + rm -f /etc/yum.repos.d/azurelinux* + + for repo in "${repos[@]}"; do + output_file="/etc/yum.repos.d/azurelinux-${repo}.repo" + repo_content=( + "[azurelinux-official-$repo]" + "name=Azure Linux Official $repo \$releasever \$basearch" + "baseurl=$repodepot_endpoint/azurelinux/\$releasever/prod/$repo/\$basearch" + "gpgkey=file:///etc/pki/rpm-gpg/MICROSOFT-RPM-GPG-KEY" + "gpgcheck=1" + "repo_gpgcheck=1" + "enabled=1" + "skip_if_unavailable=True" + "sslverify=1" + ) + + rm -f "$output_file" + + for line in "${repo_content[@]}"; do + echo "$line" >> "$output_file" + done + + echo "File '$output_file' has been created." + done + echo "Azure Linux repo setup complete." +} + +function dnf_makecache { + local retries=10 + local dnf_makecache_output=/tmp/dnf-makecache.out + local i + for i in $(seq 1 $retries); do + ! 
(dnf makecache -y 2>&1 | tee $dnf_makecache_output | grep -E "^([WE]:.*)|([eE]rr.*)$") && \ + cat $dnf_makecache_output && break || \ + cat $dnf_makecache_output + if [ $i -eq $retries ]; then + return 1 + else + sleep 5 + fi + done + echo "Executed dnf makecache -y $i times" +} + +if [ "$IS_UBUNTU" -eq 1 ] || [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 ]; then scriptPath=$0 # Determine an absolute, canonical path to this script for use in cron. if command -v readlink >/dev/null 2>&1; then @@ -260,11 +503,72 @@ EOF systemctl enable --now azure-ca-refresh.timer fi +if [ "$IS_UBUNTU" -eq 1 ]; then + rootRepoDepotEndpoint="$(echo "${REPO_DEPOT_ENDPOINT}" | sed 's/\/ubuntu//')" + if [ -n "$rootRepoDepotEndpoint" ]; then + cloud-init status --wait + ubuntuRel=$(lsb_release --release | awk '{print $2}') + ubuntuDist=$(lsb_release -c | awk '{print $2}') + init_ubuntu_main_repo_depot ${rootRepoDepotEndpoint} + init_ubuntu_pmc_repo_depot ${rootRepoDepotEndpoint} + echo "Running apt-get update" + aptget_update + else + echo "REPO_DEPOT_ENDPOINT empty, skipping Ubuntu RepoDepot initialization" + fi +elif [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 ]; then + cloud-init status --wait + + marinerRepoDepotEndpoint="$(echo "${REPO_DEPOT_ENDPOINT}" | sed 's/\/ubuntu//')" + if [ -z "$marinerRepoDepotEndpoint" ]; then + >&2 echo "repo depot endpoint empty while running custom-cloud init script" + else + if [ "$IS_MARINER" -eq 1 ]; then + echo "Initializing Mariner repo depot settings..." + init_mariner_repo_depot ${marinerRepoDepotEndpoint} + dnf_makecache || exit 1 + else + echo "Initializing Azure Linux repo depot settings..." + init_azurelinux_repo_depot ${marinerRepoDepotEndpoint} + dnf_makecache || exit 1 + fi + fi +fi + # Disable systemd-timesyncd and install chrony and uses local time source # ACL has PTP clock config compiled into chronyd with no config file or sourcedir directives, # so it uses only the local PTP clock and has no DHCP-injectable NTP sources. 
if [ "$IS_ACL" -eq 1 ]; then echo "Skipping chrony configuration for ACL (PTP clock baked into chronyd, no external NTP sources)" +elif [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 ]; then +cat > /etc/chrony.conf < $certFilePath + } + + return $true } - Write-Log "Convert CA certificates rawdata" - $caCerts=($rawData.Content) | ConvertFrom-Json - if ([string]::IsNullOrEmpty($caCerts)) { - Set-ExitCode -ExitCode $global:WINDOWS_CSE_ERROR_EMPTY_CA_CERTIFICATES -ErrorMessage "CA certificates rawdata is empty" + $optInUri = 'http://168.63.129.16/acms/isOptedInForRootCerts' + $optInResponse = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$optInUri; UseBasicParsing=$true} -Retries 5 -RetryDelaySeconds 10 + if (($optInResponse.Content -notmatch 'IsOptedInForRootCerts=true')) { + Write-Log "Skipping custom cloud root cert installation because IsOptedInForRootCerts is not true" + return $false } - $certificates = $caCerts.Certificates - for ($index = 0; $index -lt $certificates.Length ; $index++) { - $name=$certificates[$index].Name - $certFilePath = Join-Path $caFolder $name - Write-Log "Write certificate $name to $certFilePath" - $certificates[$index].CertBody > $certFilePath + $operationRequestTypes = @("operationrequestsroot", "operationrequestsintermediate") + $downloadedAny = $false + + foreach ($requestType in $operationRequestTypes) { + $operationRequestUri = "http://168.63.129.16/machine?comp=acmspackage&type=$requestType&ext=json" + $operationResponse = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$operationRequestUri; UseBasicParsing=$true} -Retries 5 -RetryDelaySeconds 10 + $operationJson = ($operationResponse.Content) | ConvertFrom-Json + + if ($null -eq $operationJson -or $null -eq $operationJson.OperationRequests) { + Write-Log "Warning: no operation requests found for $requestType" + continue + } + + foreach ($operation in $operationJson.OperationRequests) { + $resourceFileName = $operation.ResouceFileName + if 
([string]::IsNullOrEmpty($resourceFileName)) { + continue + } + + $resourceType = [IO.Path]::GetFileNameWithoutExtension($resourceFileName) + $resourceExt = [IO.Path]::GetExtension($resourceFileName).TrimStart('.') + $resourceUri = "http://168.63.129.16/machine?comp=acmspackage&type=$resourceType&ext=$resourceExt" + + $certContentResponse = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$resourceUri; UseBasicParsing=$true} -Retries 5 -RetryDelaySeconds 10 + if ([string]::IsNullOrEmpty($certContentResponse.Content)) { + Write-Log "Warning: empty certificate content for $resourceFileName" + continue + } + + $certFilePath = Join-Path $caFolder $resourceFileName + Write-Log "Write certificate $resourceFileName to $certFilePath" + $certContentResponse.Content > $certFilePath + $downloadedAny = $true + } + } + + if (-not $downloadedAny) { + Write-Log "Warning: no CA certificates were downloaded in rcv1p mode" } + + return $downloadedAny } catch { - # Catch all exceptions in this function. NOTE: exit cannot be caught. - Set-ExitCode -ExitCode $global:WINDOWS_CSE_ERROR_GET_CA_CERTIFICATES -ErrorMessage $_ + Write-Log "Warning: failed to retrieve CA certificates. 
Error: $_" + return $false } } From b2e72ac1c34b9e72d87b2caa4781ad3198684d77 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 18 Mar 2026 14:08:50 -0700 Subject: [PATCH 03/70] feat: enhance CA certificates refresh task with endpoint mode based on location Signed-off-by: Ramkumar Chinchani --- .../artifacts/init-aks-custom-cloud.sh | 33 ++++++++++++------- parts/windows/kuberneteswindowssetup.ps1 | 4 ++- staging/cse/windows/kubernetesfunc.ps1 | 15 +++++---- 3 files changed, 34 insertions(+), 18 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index fab9e105975..9f3b4fe479e 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -198,16 +198,28 @@ function install_certs_to_trust_store { # - rcv1p mode first checks IsOptedInForRootCerts, then downloads only when opted in. # - Wireserver failures are treated as non-fatal, and cert trust-store updates are skipped gracefully. 
-location_normalized="${LOCATION,,}" -location_normalized="${location_normalized//[[:space:]]/}" -if [ -z "$location_normalized" ]; then - echo "Warning: LOCATION is empty; defaulting custom cloud certificate endpoint mode to rcv1p" +# Action values: +# - init: normal provisioning path +# - ca-refresh: scheduled refresh path +action=${1:-init} +requested_cert_endpoint_mode="${2:-}" + +cert_endpoint_mode="" +if [ "$action" = "ca-refresh" ] && [ -n "$requested_cert_endpoint_mode" ]; then + cert_endpoint_mode="${requested_cert_endpoint_mode,,}" +else + location_normalized="${LOCATION,,}" + location_normalized="${location_normalized//[[:space:]]/}" + if [ -z "$location_normalized" ]; then + echo "Warning: LOCATION is empty; defaulting custom cloud certificate endpoint mode to rcv1p" + fi + + cert_endpoint_mode="rcv1p" + case "$location_normalized" in + ussec*|usnat*) cert_endpoint_mode="legacy" ;; + esac fi -cert_endpoint_mode="rcv1p" -case "$location_normalized" in - ussec*|usnat*) cert_endpoint_mode="legacy" ;; -esac echo "Using custom cloud certificate endpoint mode: ${cert_endpoint_mode}" rm -f /root/AzureCACertificates/* if [ "$cert_endpoint_mode" = "legacy" ]; then @@ -228,7 +240,6 @@ fi # This section creates a cron job to poll for refreshed CA certs daily # It can be removed if not needed or desired -action=${1:-init} if [ "$action" = "ca-refresh" ]; then exit fi @@ -454,7 +465,7 @@ if [ "$IS_UBUNTU" -eq 1 ] || [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 if ! crontab -l 2>/dev/null | grep -q "\"$scriptPath\" ca-refresh"; then # Quote the script path in the cron entry to avoid issues with spaces or special characters. - if ! (crontab -l 2>/dev/null ; printf '%s\n' "0 19 * * * \"$scriptPath\" ca-refresh") | crontab -; then + if ! 
(crontab -l 2>/dev/null ; printf '%s\n' "0 19 * * * \"$scriptPath\" ca-refresh \"$cert_endpoint_mode\"") | crontab -; then echo "Failed to install ca-refresh cron job via crontab" >&2 fi fi @@ -483,7 +494,7 @@ Wants=network-online.target [Service] Type=oneshot -ExecStart=$script_path ca-refresh +ExecStart=$script_path ca-refresh $cert_endpoint_mode EOF cat >"$tmr" < Date: Wed, 18 Mar 2026 17:14:10 -0700 Subject: [PATCH 04/70] feat: add tests for certificate endpoint mode handling in AKS custom cloud spec Signed-off-by: Ramkumar Chinchani --- .../artifacts/init_aks_custom_cloud_spec.sh | 39 +++++ staging/cse/windows/kubernetesfunc.tests.ps1 | 147 ++++++++++++++++++ 2 files changed, 186 insertions(+) create mode 100644 spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh create mode 100644 staging/cse/windows/kubernetesfunc.tests.ps1 diff --git a/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh b/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh new file mode 100644 index 00000000000..f00709306c2 --- /dev/null +++ b/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +Describe 'init-aks-custom-cloud.sh refresh mode wiring' + script_path='./parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh' + + It 'parses action and optional requested cert endpoint mode arguments' + When run grep -Eq '^action=\$\{1:-init\}$' "$script_path" + The status should eq 0 + + When run grep -Eq '^requested_cert_endpoint_mode="\$\{2:-\}"$' "$script_path" + The status should eq 0 + End + + It 'uses requested mode during ca-refresh when provided' + When run grep -Eq '^if \[ "\$action" = "ca-refresh" \] && \[ -n "\$requested_cert_endpoint_mode" \]; then$' "$script_path" + The status should eq 0 + + When run grep -Eq '^\s*cert_endpoint_mode="\$\{requested_cert_endpoint_mode,,\}"$' "$script_path" + The status should eq 0 + End + + It 'exits early in ca-refresh mode after certificate refresh 
logic' + When run grep -Eq '^if \[ "\$action" = "ca-refresh" \]; then$' "$script_path" + The status should eq 0 + + When run grep -Eq '^\s*exit$' "$script_path" + The status should eq 0 + End + + It 'passes cert endpoint mode into cron refresh command' + When run grep -Eq 'ca-refresh "\$cert_endpoint_mode"' "$script_path" + The status should eq 0 + End + + It 'passes cert endpoint mode into systemd refresh command' + When run grep -Eq '^ExecStart=\$script_path ca-refresh \$cert_endpoint_mode$' "$script_path" + The status should eq 0 + End +End diff --git a/staging/cse/windows/kubernetesfunc.tests.ps1 b/staging/cse/windows/kubernetesfunc.tests.ps1 new file mode 100644 index 00000000000..ba14ebb48ef --- /dev/null +++ b/staging/cse/windows/kubernetesfunc.tests.ps1 @@ -0,0 +1,147 @@ +if (-not (Get-PSDrive -Name C -ErrorAction SilentlyContinue)) { + New-PSDrive -Name C -PSProvider FileSystem -Root ([System.IO.Path]::GetTempPath()) | Out-Null +} + +function Write-Log { + param($Message) + Write-Host "$Message" +} + +function Logs-To-Event { + param($TaskName, $TaskMessage) + Write-Host "$TaskName $TaskMessage" +} + +function Set-ExitCode { + param($ExitCode, $ErrorMessage) + throw "Unexpected Set-ExitCode: $ExitCode $ErrorMessage" +} + +function Create-Directory { + param($FullPath, $DirectoryUsage) + if (-not (Test-Path $FullPath)) { + New-Item -Path $FullPath -ItemType Directory -Force | Out-Null + } +} + +function Get-ScheduledTask { + param($TaskName, $ErrorAction) +} + +function New-ScheduledTaskAction { + param($Execute, $Argument) +} + +function New-ScheduledTaskPrincipal { + param($UserId, $LogonType, $RunLevel) +} + +function New-JobTrigger { + param([switch]$Daily, $At, $DaysInterval) +} + +function New-ScheduledTask { + param($Action, $Principal, $Trigger, $Description) +} + +function Register-ScheduledTask { + param($TaskName, $InputObject) +} + +. $PSScriptRoot\..\..\..\parts\windows\windowscsehelper.ps1 +. 
$PSCommandPath.Replace('.tests.ps1', '.ps1') + +Describe 'Get-CustomCloudCertEndpointModeFromLocation' { + It 'returns legacy for ussec regions' { + Get-CustomCloudCertEndpointModeFromLocation -Location 'ussecwest' | Should Be 'legacy' + } + + It 'returns legacy for usnat regions' { + Get-CustomCloudCertEndpointModeFromLocation -Location 'usnatcentral' | Should Be 'legacy' + } + + It 'returns rcv1p for public regions' { + Get-CustomCloudCertEndpointModeFromLocation -Location 'southcentralus' | Should Be 'rcv1p' + } + + It 'handles mixed-case input' { + Get-CustomCloudCertEndpointModeFromLocation -Location 'UsSeCeast' | Should Be 'legacy' + } +} + +Describe 'Register-CACertificatesRefreshTask' { + BeforeEach { + $script:lastScheduledTaskArgument = $null + + Mock Logs-To-Event + Mock Write-Log + Mock New-ScheduledTaskPrincipal -MockWith { return @{ Kind = 'principal' } } + Mock New-JobTrigger -MockWith { return @{ Kind = 'trigger' } } + Mock New-ScheduledTask -MockWith { return @{ Kind = 'definition' } } + Mock Register-ScheduledTask + Mock New-ScheduledTaskAction -MockWith { + param($Execute, $Argument) + $script:lastScheduledTaskArgument = $Argument + return @{ Execute = $Execute; Argument = $Argument } + } + } + + It 'skips registration when the task already exists' { + Mock Get-ScheduledTask -MockWith { return @{ TaskName = 'aks-ca-certs-refresh-task' } } + + Register-CACertificatesRefreshTask -Location 'southcentralus' -CertEndpointMode 'rcv1p' + + Assert-MockCalled -CommandName Register-ScheduledTask -Exactly -Times 0 + Assert-MockCalled -CommandName New-ScheduledTaskAction -Exactly -Times 0 + } + + It 'creates a scheduled task that passes the explicit cert endpoint mode' { + Mock Get-ScheduledTask -MockWith { return $null } + + Register-CACertificatesRefreshTask -Location 'southcentralus' -CertEndpointMode 'rcv1p' + + Assert-MockCalled -CommandName Register-ScheduledTask -Exactly -Times 1 + $script:lastScheduledTaskArgument | Should Match 
([regex]::Escape("Get-CACertificates -Location 'southcentralus' -CertEndpointMode 'rcv1p'")) + } +} + +Describe 'Get-CACertificates' { + BeforeEach { + Mock Write-Log + Mock Create-Directory -MockWith { + param($FullPath, $DirectoryUsage) + if (-not (Test-Path $FullPath)) { + New-Item -Path $FullPath -ItemType Directory -Force | Out-Null + } + } + + if (Test-Path 'C:\ca') { + Remove-Item -Path 'C:\ca' -Recurse -Force + } + } + + It 'uses the legacy endpoint when CertEndpointMode is legacy regardless of location' { + Mock Retry-Command -MockWith { + param($Command, $Args, $Retries, $RetryDelaySeconds) + return [PSCustomObject]@{ + Content = '{"Certificates":[{"Name":"legacy.crt","CertBody":"legacy-body"}]}' + } + } + + $result = Get-CACertificates -Location 'southcentralus' -CertEndpointMode 'legacy' + + $result | Should Be $true + Assert-MockCalled -CommandName Retry-Command -Exactly -Times 1 -ParameterFilter { $Args.Uri -eq 'http://168.63.129.16/machine?comp=acmspackage&type=cacertificates&ext=json' } + Assert-MockCalled -CommandName Retry-Command -Exactly -Times 0 -ParameterFilter { $Args.Uri -eq 'http://168.63.129.16/acms/isOptedInForRootCerts' } + } + + It 'returns false when certificate retrieval throws' { + Mock Retry-Command -MockWith { + throw 'simulated retrieval failure' + } + + $result = Get-CACertificates -Location 'ussecwest' -CertEndpointMode 'rcv1p' + + $result | Should Be $false + } +} From 9a9c5eed0daac8d3f58b302ae84915624ae9969e Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 19 Mar 2026 12:44:29 -0700 Subject: [PATCH 05/70] feat: simplify certificate endpoint mode handling and refresh task registration Signed-off-by: Ramkumar Chinchani --- .../artifacts/init-aks-custom-cloud.sh | 41 ++++++++----------- parts/windows/kuberneteswindowssetup.ps1 | 4 +- .../artifacts/init_aks_custom_cloud_spec.sh | 21 ++++++---- staging/cse/windows/kubernetesfunc.ps1 | 15 +++---- staging/cse/windows/kubernetesfunc.tests.ps1 | 14 +++---- 5 files changed, 
44 insertions(+), 51 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index 9f3b4fe479e..c7176be2393 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -198,28 +198,19 @@ function install_certs_to_trust_store { # - rcv1p mode first checks IsOptedInForRootCerts, then downloads only when opted in. # - Wireserver failures are treated as non-fatal, and cert trust-store updates are skipped gracefully. -# Action values: -# - init: normal provisioning path -# - ca-refresh: scheduled refresh path -action=${1:-init} -requested_cert_endpoint_mode="${2:-}" - -cert_endpoint_mode="" -if [ "$action" = "ca-refresh" ] && [ -n "$requested_cert_endpoint_mode" ]; then - cert_endpoint_mode="${requested_cert_endpoint_mode,,}" -else - location_normalized="${LOCATION,,}" - location_normalized="${location_normalized//[[:space:]]/}" - if [ -z "$location_normalized" ]; then - echo "Warning: LOCATION is empty; defaulting custom cloud certificate endpoint mode to rcv1p" - fi +refresh_location="${2:-${LOCATION}}" - cert_endpoint_mode="rcv1p" - case "$location_normalized" in - ussec*|usnat*) cert_endpoint_mode="legacy" ;; - esac +location_normalized="${refresh_location,,}" +location_normalized="${location_normalized//[[:space:]]/}" +if [ -z "$location_normalized" ]; then + echo "Warning: LOCATION is empty; defaulting custom cloud certificate endpoint mode to rcv1p" fi +cert_endpoint_mode="rcv1p" +case "$location_normalized" in + ussec*|usnat*) cert_endpoint_mode="legacy" ;; +esac + echo "Using custom cloud certificate endpoint mode: ${cert_endpoint_mode}" rm -f /root/AzureCACertificates/* if [ "$cert_endpoint_mode" = "legacy" ]; then @@ -238,8 +229,12 @@ elif [ "$cert_endpoint_mode" = "rcv1p" ]; then fi fi -# This section creates a cron job to poll for refreshed CA certs daily -# It can be removed if not needed or 
desired +# In ca-refresh mode (invoked by the scheduled cron/systemd task with the location as arg), +# only the cert refresh above is needed; exit before running the full init path. +# Action values: +# - init (default): full provisioning path +# - ca-refresh : periodic refresh path; location is passed as arg to avoid env dependency +action=${1:-init} if [ "$action" = "ca-refresh" ]; then exit fi @@ -465,7 +460,7 @@ if [ "$IS_UBUNTU" -eq 1 ] || [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 if ! crontab -l 2>/dev/null | grep -q "\"$scriptPath\" ca-refresh"; then # Quote the script path in the cron entry to avoid issues with spaces or special characters. - if ! (crontab -l 2>/dev/null ; printf '%s\n' "0 19 * * * \"$scriptPath\" ca-refresh \"$cert_endpoint_mode\"") | crontab -; then + if ! (crontab -l 2>/dev/null ; printf '%s\n' "0 19 * * * \"$scriptPath\" ca-refresh \"$LOCATION\"") | crontab -; then echo "Failed to install ca-refresh cron job via crontab" >&2 fi fi @@ -494,7 +489,7 @@ Wants=network-online.target [Service] Type=oneshot -ExecStart=$script_path ca-refresh $cert_endpoint_mode +ExecStart=$script_path ca-refresh $LOCATION EOF cat >"$tmr" < Date: Thu, 19 Mar 2026 13:04:03 -0700 Subject: [PATCH 06/70] feat: implement conditional CA certificates refresh task registration for legacy and opted-in rcv1p modes Signed-off-by: Ramkumar Chinchani --- .../artifacts/init-aks-custom-cloud.sh | 29 +++++++++------ parts/windows/kuberneteswindowssetup.ps1 | 4 ++- .../artifacts/init_aks_custom_cloud_spec.sh | 11 ++++++ staging/cse/windows/kubernetesfunc.ps1 | 21 +++++++++++ staging/cse/windows/kubernetesfunc.tests.ps1 | 36 +++++++++++++++++++ 5 files changed, 89 insertions(+), 12 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index c7176be2393..eeb01c392fe 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ 
b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -212,8 +212,10 @@ case "$location_normalized" in esac echo "Using custom cloud certificate endpoint mode: ${cert_endpoint_mode}" +install_ca_refresh_schedule=0 rm -f /root/AzureCACertificates/* if [ "$cert_endpoint_mode" = "legacy" ]; then + install_ca_refresh_schedule=1 if retrieve_legacy_certs; then install_certs_to_trust_store else @@ -221,6 +223,7 @@ if [ "$cert_endpoint_mode" = "legacy" ]; then fi elif [ "$cert_endpoint_mode" = "rcv1p" ]; then if is_opted_in_for_root_certs; then + install_ca_refresh_schedule=1 if retrieve_rcv1p_certs; then install_certs_to_trust_store else @@ -458,10 +461,12 @@ if [ "$IS_UBUNTU" -eq 1 ] || [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 scriptPath="$(readlink -f "$0" 2>/dev/null || printf '%s' "$0")" fi - if ! crontab -l 2>/dev/null | grep -q "\"$scriptPath\" ca-refresh"; then - # Quote the script path in the cron entry to avoid issues with spaces or special characters. - if ! (crontab -l 2>/dev/null ; printf '%s\n' "0 19 * * * \"$scriptPath\" ca-refresh \"$LOCATION\"") | crontab -; then - echo "Failed to install ca-refresh cron job via crontab" >&2 + if [ "$install_ca_refresh_schedule" -eq 1 ]; then + if ! crontab -l 2>/dev/null | grep -q "\"$scriptPath\" ca-refresh"; then + # Quote the script path in the cron entry to avoid issues with spaces or special characters. + if ! 
(crontab -l 2>/dev/null ; printf '%s\n' "0 19 * * * \"$scriptPath\" ca-refresh \"$LOCATION\"") | crontab -; then + echo "Failed to install ca-refresh cron job via crontab" >&2 + fi fi fi @@ -477,11 +482,12 @@ if [ "$IS_UBUNTU" -eq 1 ] || [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 echo "Running apt-get update" aptget_update elif [ "$IS_FLATCAR" -eq 1 ] || [ "$IS_ACL" -eq 1 ]; then - script_path="$(readlink -f "$0")" - svc="/etc/systemd/system/azure-ca-refresh.service" - tmr="/etc/systemd/system/azure-ca-refresh.timer" + if [ "$install_ca_refresh_schedule" -eq 1 ]; then + script_path="$(readlink -f "$0")" + svc="/etc/systemd/system/azure-ca-refresh.service" + tmr="/etc/systemd/system/azure-ca-refresh.timer" - cat >"$svc" <"$svc" <"$tmr" <"$tmr" < Date: Thu, 19 Mar 2026 14:54:49 -0700 Subject: [PATCH 07/70] feat: enhance CA certificates refresh task registration for legacy CSE packages Signed-off-by: Ramkumar Chinchani --- parts/windows/kuberneteswindowssetup.ps1 | 9 ++++++++- .../cloud-init/artifacts/init_aks_custom_cloud_spec.sh | 10 +++++----- staging/cse/windows/kubernetesfunc.tests.ps1 | 3 --- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/parts/windows/kuberneteswindowssetup.ps1 b/parts/windows/kuberneteswindowssetup.ps1 index 63ba3dc5c23..7c42d906c6a 100644 --- a/parts/windows/kuberneteswindowssetup.ps1 +++ b/parts/windows/kuberneteswindowssetup.ps1 @@ -487,7 +487,14 @@ function BasePrep { Adjust-DynamicPortRange Register-LogsCleanupScriptTask Register-NodeResetScriptTask - if (Should-InstallCACertificatesRefreshTask -Location $Location) { + # Guard against older CSE packages that do not yet export Should-InstallCACertificatesRefreshTask. + # If the function is absent (old package), fall back to the previous unconditional behaviour so + # that legacy/ussec/usnat clusters continue to register the refresh task. 
+ if (Get-Command -Name Should-InstallCACertificatesRefreshTask -ErrorAction Ignore) { + if (Should-InstallCACertificatesRefreshTask -Location $Location) { + Register-CACertificatesRefreshTask -Location $Location + } + } elseif (Get-Command -Name Register-CACertificatesRefreshTask -ErrorAction Ignore) { Register-CACertificatesRefreshTask -Location $Location } diff --git a/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh b/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh index f85f580a8cc..8b54975d51b 100644 --- a/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh +++ b/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh @@ -18,7 +18,7 @@ Describe 'init-aks-custom-cloud.sh refresh mode wiring' When run grep -Eq '^location_normalized="\$\{refresh_location,,\}"$' "$script_path" The status should eq 0 - When run grep -Eq 'ussec\*\|usnat\*\) cert_endpoint_mode="legacy"' "$script_path" + When run grep -Eq 'ussec\*|usnat\*\) cert_endpoint_mode="legacy"' "$script_path" The status should eq 0 End @@ -26,10 +26,10 @@ Describe 'init-aks-custom-cloud.sh refresh mode wiring' When run grep -Eq '^install_ca_refresh_schedule=0$' "$script_path" The status should eq 0 - When run grep -Eq '^\s*install_ca_refresh_schedule=1$' "$script_path" + When run grep -Eq '^[[:space:]]*install_ca_refresh_schedule=1$' "$script_path" The status should eq 0 - When run grep -Eq '^\s*if \[ "\$install_ca_refresh_schedule" -eq 1 \]; then$' "$script_path" + When run grep -Eq '^[[:space:]]*if \[ "\$install_ca_refresh_schedule" -eq 1 \]; then$' "$script_path" The status should eq 0 End @@ -37,12 +37,12 @@ Describe 'init-aks-custom-cloud.sh refresh mode wiring' When run grep -Eq '^if \[ "\$action" = "ca-refresh" \]; then$' "$script_path" The status should eq 0 - When run grep -Eq '^\s*exit$' "$script_path" + When run grep -Eq '^[[:space:]]*exit$' "$script_path" The status should eq 0 End It 'passes LOCATION directly into cron refresh 
command' - When run grep -Eq 'ca-refresh \\\\"\$LOCATION\\\\"' "$script_path" + When run grep -Eq 'ca-refresh \\"\$LOCATION\\"' "$script_path" The status should eq 0 End diff --git a/staging/cse/windows/kubernetesfunc.tests.ps1 b/staging/cse/windows/kubernetesfunc.tests.ps1 index 948cd229dc0..8b062a273d0 100644 --- a/staging/cse/windows/kubernetesfunc.tests.ps1 +++ b/staging/cse/windows/kubernetesfunc.tests.ps1 @@ -74,7 +74,6 @@ Describe 'Register-CACertificatesRefreshTask' { $script:lastScheduledTaskArgument = $null Mock Logs-To-Event - Mock Write-Log Mock New-ScheduledTaskPrincipal -MockWith { return @{ Kind = 'principal' } } Mock New-JobTrigger -MockWith { return @{ Kind = 'trigger' } } Mock New-ScheduledTask -MockWith { return @{ Kind = 'definition' } } @@ -107,7 +106,6 @@ Describe 'Register-CACertificatesRefreshTask' { Describe 'Should-InstallCACertificatesRefreshTask' { BeforeEach { - Mock Write-Log } It 'returns true for legacy regions without calling the opt-in endpoint' { @@ -143,7 +141,6 @@ Describe 'Should-InstallCACertificatesRefreshTask' { Describe 'Get-CACertificates' { BeforeEach { - Mock Write-Log Mock Create-Directory -MockWith { param($FullPath, $DirectoryUsage) if (-not (Test-Path $FullPath)) { From e2bc72b5ff43a8e48a08bf0c664f8fa1f7e896b1 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 19 Mar 2026 23:27:10 -0700 Subject: [PATCH 08/70] feat: update tests for certificate endpoint mode handling and refresh schedule installation Signed-off-by: Ramkumar Chinchani --- .../artifacts/init_aks_custom_cloud_spec.sh | 12 ++++++++++-- staging/cse/windows/kubernetesfunc.tests.ps1 | 7 +++++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh b/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh index 8b54975d51b..58812659856 100644 --- a/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh +++ 
b/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh @@ -17,26 +17,34 @@ Describe 'init-aks-custom-cloud.sh refresh mode wiring' It 'always derives cert endpoint mode from refresh_location' When run grep -Eq '^location_normalized="\$\{refresh_location,,\}"$' "$script_path" The status should eq 0 + End + It 'maps ussec/usnat locations to legacy cert endpoint mode' When run grep -Eq 'ussec\*|usnat\*\) cert_endpoint_mode="legacy"' "$script_path" The status should eq 0 End - It 'installs refresh schedule only for legacy mode or opted-in rcv1p mode' + It 'initializes refresh schedule installation as disabled' When run grep -Eq '^install_ca_refresh_schedule=0$' "$script_path" The status should eq 0 + End + It 'enables refresh schedule installation for eligible certificate modes' When run grep -Eq '^[[:space:]]*install_ca_refresh_schedule=1$' "$script_path" The status should eq 0 + End + It 'gates refresh schedule installation on install_ca_refresh_schedule' When run grep -Eq '^[[:space:]]*if \[ "\$install_ca_refresh_schedule" -eq 1 \]; then$' "$script_path" The status should eq 0 End - It 'exits early in ca-refresh mode after certificate refresh logic' + It 'checks for ca-refresh mode after certificate refresh logic' When run grep -Eq '^if \[ "\$action" = "ca-refresh" \]; then$' "$script_path" The status should eq 0 + End + It 'exits early in ca-refresh mode after certificate refresh logic' When run grep -Eq '^[[:space:]]*exit$' "$script_path" The status should eq 0 End diff --git a/staging/cse/windows/kubernetesfunc.tests.ps1 b/staging/cse/windows/kubernetesfunc.tests.ps1 index 8b062a273d0..42e15c4fc25 100644 --- a/staging/cse/windows/kubernetesfunc.tests.ps1 +++ b/staging/cse/windows/kubernetesfunc.tests.ps1 @@ -48,8 +48,11 @@ function Register-ScheduledTask { param($TaskName, $InputObject) } -. $PSScriptRoot\..\..\..\parts\windows\windowscsehelper.ps1 -. 
$PSCommandPath.Replace('.tests.ps1', '.ps1') +$helperScriptPath = Join-Path $PSScriptRoot '..\..\..\parts\windows\windowscsehelper.ps1' +$scriptUnderTestPath = Join-Path $PSScriptRoot 'kubernetesfunc.ps1' + +. $helperScriptPath +. $scriptUnderTestPath Describe 'Get-CustomCloudCertEndpointModeFromLocation' { It 'returns legacy for ussec regions' { From ad01392a5c89d5edfa410811505e3b4be1b8654b Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Fri, 20 Mar 2026 07:42:47 -0700 Subject: [PATCH 09/70] feat: refactor test setup functions for improved readability and consistency Signed-off-by: Ramkumar Chinchani --- staging/cse/windows/kubernetesfunc.tests.ps1 | 102 +++++++++---------- 1 file changed, 51 insertions(+), 51 deletions(-) diff --git a/staging/cse/windows/kubernetesfunc.tests.ps1 b/staging/cse/windows/kubernetesfunc.tests.ps1 index 42e15c4fc25..3f9f403666b 100644 --- a/staging/cse/windows/kubernetesfunc.tests.ps1 +++ b/staging/cse/windows/kubernetesfunc.tests.ps1 @@ -1,58 +1,64 @@ -if (-not (Get-PSDrive -Name C -ErrorAction SilentlyContinue)) { - New-PSDrive -Name C -PSProvider FileSystem -Root ([System.IO.Path]::GetTempPath()) | Out-Null -} +BeforeAll { + if (-not (Get-PSDrive -Name C -ErrorAction SilentlyContinue)) { + New-PSDrive -Name C -PSProvider FileSystem -Root ([System.IO.Path]::GetTempPath()) | Out-Null + } -function Write-Log { - param($Message) - Write-Host "$Message" -} + function Write-Log { + param($Message) + Write-Host "$Message" + } -function Logs-To-Event { - param($TaskName, $TaskMessage) - Write-Host "$TaskName $TaskMessage" -} + function Logs-To-Event { + param($TaskName, $TaskMessage) + Write-Host "$TaskName $TaskMessage" + } -function Set-ExitCode { - param($ExitCode, $ErrorMessage) - throw "Unexpected Set-ExitCode: $ExitCode $ErrorMessage" -} + function Set-ExitCode { + param($ExitCode, $ErrorMessage) + throw "Unexpected Set-ExitCode: $ExitCode $ErrorMessage" + } -function Create-Directory { - param($FullPath, $DirectoryUsage) - if 
(-not (Test-Path $FullPath)) { - New-Item -Path $FullPath -ItemType Directory -Force | Out-Null + function Create-Directory { + param($FullPath, $DirectoryUsage) + if (-not (Test-Path $FullPath)) { + New-Item -Path $FullPath -ItemType Directory -Force | Out-Null + } } -} -function Get-ScheduledTask { - param($TaskName, $ErrorAction) -} + function Get-ScheduledTask { + param($TaskName, $ErrorAction) + } -function New-ScheduledTaskAction { - param($Execute, $Argument) -} + function New-ScheduledTaskAction { + param($Execute, $Argument) + } -function New-ScheduledTaskPrincipal { - param($UserId, $LogonType, $RunLevel) -} + function New-ScheduledTaskPrincipal { + param($UserId, $LogonType, $RunLevel) + } -function New-JobTrigger { - param([switch]$Daily, $At, $DaysInterval) -} + function New-JobTrigger { + param([switch]$Daily, $At, $DaysInterval) + } -function New-ScheduledTask { - param($Action, $Principal, $Trigger, $Description) -} + function New-ScheduledTask { + param($Action, $Principal, $Trigger, $Description) + } -function Register-ScheduledTask { - param($TaskName, $InputObject) -} + function Register-ScheduledTask { + param($TaskName, $InputObject) + } + + function Retry-Command { + param($Command, $Args, $Retries, $RetryDelaySeconds) + } -$helperScriptPath = Join-Path $PSScriptRoot '..\..\..\parts\windows\windowscsehelper.ps1' -$scriptUnderTestPath = Join-Path $PSScriptRoot 'kubernetesfunc.ps1' + $helperScriptPath = Join-Path $PSScriptRoot '..\..\..\parts\windows\windowscsehelper.ps1' + $scriptUnderTestPath = Join-Path $PSScriptRoot 'kubernetesfunc.ps1' -. $helperScriptPath -. $scriptUnderTestPath + . $helperScriptPath + . 
$scriptUnderTestPath +} Describe 'Get-CustomCloudCertEndpointModeFromLocation' { It 'returns legacy for ussec regions' { @@ -76,11 +82,11 @@ Describe 'Register-CACertificatesRefreshTask' { BeforeEach { $script:lastScheduledTaskArgument = $null - Mock Logs-To-Event + Mock Logs-To-Event -MockWith { } Mock New-ScheduledTaskPrincipal -MockWith { return @{ Kind = 'principal' } } Mock New-JobTrigger -MockWith { return @{ Kind = 'trigger' } } Mock New-ScheduledTask -MockWith { return @{ Kind = 'definition' } } - Mock Register-ScheduledTask + Mock Register-ScheduledTask -MockWith { } Mock New-ScheduledTaskAction -MockWith { param($Execute, $Argument) $script:lastScheduledTaskArgument = $Argument @@ -109,6 +115,7 @@ Describe 'Register-CACertificatesRefreshTask' { Describe 'Should-InstallCACertificatesRefreshTask' { BeforeEach { + Mock Retry-Command -MockWith { } } It 'returns true for legacy regions without calling the opt-in endpoint' { @@ -144,13 +151,6 @@ Describe 'Should-InstallCACertificatesRefreshTask' { Describe 'Get-CACertificates' { BeforeEach { - Mock Create-Directory -MockWith { - param($FullPath, $DirectoryUsage) - if (-not (Test-Path $FullPath)) { - New-Item -Path $FullPath -ItemType Directory -Force | Out-Null - } - } - if (Test-Path 'C:\ca') { Remove-Item -Path 'C:\ca' -Recurse -Force } From e649f3ef910126714f5376832f0a41bb791fcd45 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Fri, 20 Mar 2026 08:52:23 -0700 Subject: [PATCH 10/70] feat: update Get-CustomCloudCertEndpointModeFromLocation to clarify endpoint mode handling for legacy and rcv1p regions Signed-off-by: Ramkumar Chinchani --- staging/cse/windows/kubernetesfunc.ps1 | 2 ++ staging/cse/windows/kubernetesfunc.tests.ps1 | 20 ++++++++++---------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/staging/cse/windows/kubernetesfunc.ps1 b/staging/cse/windows/kubernetesfunc.ps1 index 5ae9df1e217..023542b6f3c 100644 --- a/staging/cse/windows/kubernetesfunc.ps1 +++ 
b/staging/cse/windows/kubernetesfunc.ps1 @@ -275,11 +275,13 @@ function Get-CustomCloudCertEndpointModeFromLocation { $Location ) + # ussec/usnat regions still use the legacy certificate endpoint contract. $normalizedLocation = $Location.ToLowerInvariant() if ($normalizedLocation.StartsWith("ussec") -or $normalizedLocation.StartsWith("usnat")) { return "legacy" } + # All other regions use the rcv1p endpoint mode with opt-in gating. return "rcv1p" } diff --git a/staging/cse/windows/kubernetesfunc.tests.ps1 b/staging/cse/windows/kubernetesfunc.tests.ps1 index 3f9f403666b..2e95cef1338 100644 --- a/staging/cse/windows/kubernetesfunc.tests.ps1 +++ b/staging/cse/windows/kubernetesfunc.tests.ps1 @@ -62,19 +62,19 @@ BeforeAll { Describe 'Get-CustomCloudCertEndpointModeFromLocation' { It 'returns legacy for ussec regions' { - Get-CustomCloudCertEndpointModeFromLocation -Location 'ussecwest' | Should Be 'legacy' + Get-CustomCloudCertEndpointModeFromLocation -Location 'ussecwest' | Should -Be 'legacy' } It 'returns legacy for usnat regions' { - Get-CustomCloudCertEndpointModeFromLocation -Location 'usnatcentral' | Should Be 'legacy' + Get-CustomCloudCertEndpointModeFromLocation -Location 'usnatcentral' | Should -Be 'legacy' } It 'returns rcv1p for public regions' { - Get-CustomCloudCertEndpointModeFromLocation -Location 'southcentralus' | Should Be 'rcv1p' + Get-CustomCloudCertEndpointModeFromLocation -Location 'southcentralus' | Should -Be 'rcv1p' } It 'handles mixed-case input' { - Get-CustomCloudCertEndpointModeFromLocation -Location 'UsSeCeast' | Should Be 'legacy' + Get-CustomCloudCertEndpointModeFromLocation -Location 'UsSeCeast' | Should -Be 'legacy' } } @@ -109,7 +109,7 @@ Describe 'Register-CACertificatesRefreshTask' { Register-CACertificatesRefreshTask -Location 'southcentralus' Assert-MockCalled -CommandName Register-ScheduledTask -Exactly -Times 1 - $script:lastScheduledTaskArgument | Should Match ([regex]::Escape("Get-CACertificates -Location 'southcentralus'")) 
+ $script:lastScheduledTaskArgument | Should -Match ([regex]::Escape("Get-CACertificates -Location 'southcentralus'")) } } @@ -123,7 +123,7 @@ Describe 'Should-InstallCACertificatesRefreshTask' { $result = Should-InstallCACertificatesRefreshTask -Location 'ussecwest' - $result | Should Be $true + $result | Should -Be $true Assert-MockCalled -CommandName Retry-Command -Exactly -Times 0 } @@ -134,7 +134,7 @@ Describe 'Should-InstallCACertificatesRefreshTask' { $result = Should-InstallCACertificatesRefreshTask -Location 'southcentralus' - $result | Should Be $true + $result | Should -Be $true Assert-MockCalled -CommandName Retry-Command -Exactly -Times 1 -ParameterFilter { $Args.Uri -eq 'http://168.63.129.16/acms/isOptedInForRootCerts' } } @@ -145,7 +145,7 @@ Describe 'Should-InstallCACertificatesRefreshTask' { $result = Should-InstallCACertificatesRefreshTask -Location 'southcentralus' - $result | Should Be $false + $result | Should -Be $false } } @@ -166,7 +166,7 @@ Describe 'Get-CACertificates' { $result = Get-CACertificates -Location 'ussecwest' - $result | Should Be $true + $result | Should -Be $true Assert-MockCalled -CommandName Retry-Command -Exactly -Times 1 -ParameterFilter { $Args.Uri -eq 'http://168.63.129.16/machine?comp=acmspackage&type=cacertificates&ext=json' } Assert-MockCalled -CommandName Retry-Command -Exactly -Times 0 -ParameterFilter { $Args.Uri -eq 'http://168.63.129.16/acms/isOptedInForRootCerts' } } @@ -178,6 +178,6 @@ Describe 'Get-CACertificates' { $result = Get-CACertificates -Location 'southcentralus' - $result | Should Be $false + $result | Should -Be $false } } From b12fb1e611a16f1a65c86fed54e2c1391d86d641 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Fri, 20 Mar 2026 09:52:25 -0700 Subject: [PATCH 11/70] feat: enhance tests for Should-InstallCACertificatesRefreshTask and Get-CACertificates to verify URI handling Signed-off-by: Ramkumar Chinchani --- staging/cse/windows/kubernetesfunc.tests.ps1 | 13 ++++++++++--- 1 file 
changed, 10 insertions(+), 3 deletions(-) diff --git a/staging/cse/windows/kubernetesfunc.tests.ps1 b/staging/cse/windows/kubernetesfunc.tests.ps1 index 2e95cef1338..8186bfabc4c 100644 --- a/staging/cse/windows/kubernetesfunc.tests.ps1 +++ b/staging/cse/windows/kubernetesfunc.tests.ps1 @@ -128,14 +128,18 @@ Describe 'Should-InstallCACertificatesRefreshTask' { } It 'returns true for rcv1p regions when opt-in is enabled' { + $script:lastRetryUri = $null Mock Retry-Command -MockWith { + param($Command, $Args, $Retries, $RetryDelaySeconds) + $script:lastRetryUri = $PSBoundParameters['Args'].Uri return [PSCustomObject]@{ Content = 'IsOptedInForRootCerts=true' } } $result = Should-InstallCACertificatesRefreshTask -Location 'southcentralus' $result | Should -Be $true - Assert-MockCalled -CommandName Retry-Command -Exactly -Times 1 -ParameterFilter { $Args.Uri -eq 'http://168.63.129.16/acms/isOptedInForRootCerts' } + Assert-MockCalled -CommandName Retry-Command -Exactly -Times 1 + $script:lastRetryUri | Should -Be 'http://168.63.129.16/acms/isOptedInForRootCerts' } It 'returns false for rcv1p regions when opt-in is disabled' { @@ -157,8 +161,10 @@ Describe 'Get-CACertificates' { } It 'uses the legacy endpoint when location is a ussec/usnat region' { + $script:retryUris = @() Mock Retry-Command -MockWith { param($Command, $Args, $Retries, $RetryDelaySeconds) + $script:retryUris += $PSBoundParameters['Args'].Uri return [PSCustomObject]@{ Content = '{"Certificates":[{"Name":"legacy.crt","CertBody":"legacy-body"}]}' } @@ -167,8 +173,9 @@ Describe 'Get-CACertificates' { $result = Get-CACertificates -Location 'ussecwest' $result | Should -Be $true - Assert-MockCalled -CommandName Retry-Command -Exactly -Times 1 -ParameterFilter { $Args.Uri -eq 'http://168.63.129.16/machine?comp=acmspackage&type=cacertificates&ext=json' } - Assert-MockCalled -CommandName Retry-Command -Exactly -Times 0 -ParameterFilter { $Args.Uri -eq 'http://168.63.129.16/acms/isOptedInForRootCerts' } + 
Assert-MockCalled -CommandName Retry-Command -Exactly -Times 1 + $script:retryUris | Should -Contain 'http://168.63.129.16/machine?comp=acmspackage&type=cacertificates&ext=json' + $script:retryUris | Should -Not -Contain 'http://168.63.129.16/acms/isOptedInForRootCerts' } It 'returns false when certificate retrieval throws' { From 49b2e6e22d4554ff10d48c5100674ca8c1832152 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 25 Mar 2026 16:58:14 -0700 Subject: [PATCH 12/70] feat: update cse_cmd.sh and cse_cmd.sh.gtpl to ensure consistent logging of custom cloud file paths Signed-off-by: Ramkumar Chinchani --- aks-node-controller/parser/templates/cse_cmd.sh.gtpl | 2 +- parts/linux/cloud-init/artifacts/cse_cmd.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/aks-node-controller/parser/templates/cse_cmd.sh.gtpl b/aks-node-controller/parser/templates/cse_cmd.sh.gtpl index d685a3444da..42376814388 100644 --- a/aks-node-controller/parser/templates/cse_cmd.sh.gtpl +++ b/aks-node-controller/parser/templates/cse_cmd.sh.gtpl @@ -1,7 +1,7 @@ echo $(date),$(hostname) > ${PROVISION_OUTPUT}; {{if getIsAksCustomCloud .CustomCloudConfig}} REPO_DEPOT_ENDPOINT="{{.CustomCloudConfig.RepoDepotEndpoint}}" -{{getInitAKSCustomCloudFilepath}} >> /var/log/azure/cluster-provision.log 2>&1; {{end}} LOCATION="{{getCloudLocation .}}" +{{getInitAKSCustomCloudFilepath}} >> /var/log/azure/cluster-provision.log 2>&1; /usr/bin/nohup /bin/bash -c "/bin/bash /opt/azure/containers/provision_start.sh" diff --git a/parts/linux/cloud-init/artifacts/cse_cmd.sh b/parts/linux/cloud-init/artifacts/cse_cmd.sh index d184b6e5356..21f77f0334c 100644 --- a/parts/linux/cloud-init/artifacts/cse_cmd.sh +++ b/parts/linux/cloud-init/artifacts/cse_cmd.sh @@ -18,8 +18,9 @@ fi; {{end}} {{if IsAKSCustomCloud}} REPO_DEPOT_ENDPOINT="{{AKSCustomCloudRepoDepotEndpoint}}" -{{GetInitAKSCustomCloudFilepath}} >> /var/log/azure/cluster-provision.log 2>&1; {{end}} +LOCATION={{GetVariable "location"}} 
+{{GetInitAKSCustomCloudFilepath}} >> /var/log/azure/cluster-provision.log 2>&1; ADMINUSER={{GetParameter "linuxAdminUsername"}} MOBY_VERSION={{GetParameter "mobyVersion"}} TENANT_ID={{GetVariable "tenantID"}} @@ -32,7 +33,6 @@ KUBEPROXY_URL={{GetParameter "kubeProxySpec"}} APISERVER_PUBLIC_KEY={{GetParameter "apiServerCertificate"}} SUBSCRIPTION_ID={{GetVariable "subscriptionId"}} RESOURCE_GROUP={{GetVariable "resourceGroup"}} -LOCATION={{GetVariable "location"}} VM_TYPE={{GetVariable "vmType"}} SUBNET={{GetVariable "subnetName"}} NETWORK_SECURITY_GROUP={{GetVariable "nsgName"}} From 4929b0c4591bd8b637eb5240075faf9ff2303c6f Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 26 Mar 2026 12:55:30 -0700 Subject: [PATCH 13/70] feat: update CA certificates functions for backward compatibility with optional Location parameter Signed-off-by: Ramkumar Chinchani --- staging/cse/windows/kubernetesfunc.ps1 | 35 +++++++++---- staging/cse/windows/kubernetesfunc.tests.ps1 | 52 ++++++++++++++++++++ 2 files changed, 78 insertions(+), 9 deletions(-) diff --git a/staging/cse/windows/kubernetesfunc.ps1 b/staging/cse/windows/kubernetesfunc.ps1 index 023542b6f3c..d9852e4288d 100644 --- a/staging/cse/windows/kubernetesfunc.ps1 +++ b/staging/cse/windows/kubernetesfunc.ps1 @@ -71,8 +71,8 @@ function Register-NodeResetScriptTask { function Register-CACertificatesRefreshTask { Param( - [Parameter(Mandatory = $true)][string] - $Location + [Parameter(Mandatory = $false)][string] + $Location = "" ) Logs-To-Event -TaskName "AKS.WindowsCSE.RegisterCACertificatesRefreshTask" -TaskMessage "Start to register CA certificates refresh task" @@ -84,7 +84,13 @@ function Register-CACertificatesRefreshTask { return } - $refreshCommand = "& { . 'C:\AzureData\windows\windowscsehelper.ps1'; . 
'C:\AzureData\windows\kubernetesfunc.ps1'; Get-CACertificates -Location '$Location' | Out-Null }" + # Include -Location only when it was provided, so older VHDs whose Get-CACertificates + # does not accept -Location can still execute the scheduled task successfully. + if ([string]::IsNullOrEmpty($Location)) { + $refreshCommand = "& { . 'C:\AzureData\windows\windowscsehelper.ps1'; . 'C:\AzureData\windows\kubernetesfunc.ps1'; Get-CACertificates | Out-Null }" + } else { + $refreshCommand = "& { . 'C:\AzureData\windows\windowscsehelper.ps1'; . 'C:\AzureData\windows\kubernetesfunc.ps1'; Get-CACertificates -Location '$Location' | Out-Null }" + } $action = New-ScheduledTaskAction -Execute "powershell.exe" -Argument "-NoProfile -NonInteractive -ExecutionPolicy Bypass -Command `"$refreshCommand`"" $principal = New-ScheduledTaskPrincipal -UserId SYSTEM -LogonType ServiceAccount -RunLevel Highest $trigger = New-JobTrigger -Daily -At "19:00" -DaysInterval 1 @@ -287,10 +293,14 @@ function Get-CustomCloudCertEndpointModeFromLocation { function Should-InstallCACertificatesRefreshTask { Param( - [Parameter(Mandatory = $true)][string] - $Location + [Parameter(Mandatory = $false)][string] + $Location = "" ) + # When Location is not supplied (older callers), default to legacy mode. + if ([string]::IsNullOrEmpty($Location)) { + return $true + } $certEndpointMode = Get-CustomCloudCertEndpointModeFromLocation -Location $Location if ($certEndpointMode -eq "legacy") { return $true @@ -308,15 +318,22 @@ function Should-InstallCACertificatesRefreshTask { function Get-CACertificates { Param( - [Parameter(Mandatory = $true)][string] - $Location + [Parameter(Mandatory = $false)][string] + $Location = "" ) $caFolder = "C:\ca" Create-Directory -FullPath $caFolder -DirectoryUsage "storing CA certificates" - $certEndpointMode = Get-CustomCloudCertEndpointModeFromLocation -Location $Location - Write-Log "Get CA certificates. Location: $Location. 
EndpointMode: $certEndpointMode" + # When Location is not supplied (older callers), fall back to the legacy endpoint + # which was the original behavior before the rcv1p changes. + if ([string]::IsNullOrEmpty($Location)) { + $certEndpointMode = "legacy" + Write-Log "Get CA certificates. Location not provided, defaulting to legacy endpoint mode" + } else { + $certEndpointMode = Get-CustomCloudCertEndpointModeFromLocation -Location $Location + Write-Log "Get CA certificates. Location: $Location. EndpointMode: $certEndpointMode" + } try { if ($certEndpointMode -eq "legacy") { diff --git a/staging/cse/windows/kubernetesfunc.tests.ps1 b/staging/cse/windows/kubernetesfunc.tests.ps1 index 8186bfabc4c..8ada13ee440 100644 --- a/staging/cse/windows/kubernetesfunc.tests.ps1 +++ b/staging/cse/windows/kubernetesfunc.tests.ps1 @@ -187,4 +187,56 @@ Describe 'Get-CACertificates' { $result | Should -Be $false } + + It 'falls back to legacy endpoint when called without -Location (backward compat)' { + $script:retryUris = @() + Mock Retry-Command -MockWith { + param($Command, $Args, $Retries, $RetryDelaySeconds) + $script:retryUris += $PSBoundParameters['Args'].Uri + return [PSCustomObject]@{ + Content = '{"Certificates":[{"Name":"compat.crt","CertBody":"compat-body"}]}' + } + } + + $result = Get-CACertificates + + $result | Should -Be $true + Assert-MockCalled -CommandName Retry-Command -Exactly -Times 1 + $script:retryUris | Should -Contain 'http://168.63.129.16/machine?comp=acmspackage&type=cacertificates&ext=json' + } +} + +Describe 'Should-InstallCACertificatesRefreshTask - backward compat' { + It 'returns true when called without -Location (backward compat)' { + $result = Should-InstallCACertificatesRefreshTask + + $result | Should -Be $true + } +} + +Describe 'Register-CACertificatesRefreshTask - backward compat' { + BeforeEach { + $script:lastScheduledTaskArgument = $null + + Mock Logs-To-Event -MockWith { } + Mock New-ScheduledTaskPrincipal -MockWith { return @{ Kind = 
'principal' } } + Mock New-JobTrigger -MockWith { return @{ Kind = 'trigger' } } + Mock New-ScheduledTask -MockWith { return @{ Kind = 'definition' } } + Mock Register-ScheduledTask -MockWith { } + Mock New-ScheduledTaskAction -MockWith { + param($Execute, $Argument) + $script:lastScheduledTaskArgument = $Argument + return @{ Execute = $Execute; Argument = $Argument } + } + } + + It 'creates a scheduled task without -Location when called without it (backward compat)' { + Mock Get-ScheduledTask -MockWith { return $null } + + Register-CACertificatesRefreshTask + + Assert-MockCalled -CommandName Register-ScheduledTask -Exactly -Times 1 + $script:lastScheduledTaskArgument | Should -Match ([regex]::Escape("Get-CACertificates |")) + $script:lastScheduledTaskArgument | Should -Not -Match "Location" + } } From e485641e9ab6825ef4303e1c0e3b80e13f098e0b Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Fri, 27 Mar 2026 09:10:22 -0700 Subject: [PATCH 14/70] feat: remove deprecated Ubuntu repository initialization logic from init-aks-custom-cloud.sh Signed-off-by: Ramkumar Chinchani --- .../cloud-init/artifacts/init-aks-custom-cloud.sh | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index eeb01c392fe..0c5487da414 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -469,18 +469,6 @@ if [ "$IS_UBUNTU" -eq 1 ] || [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 fi fi fi - - cloud-init status --wait - rootRepoDepotEndpoint="$(echo "${REPO_DEPOT_ENDPOINT}" | sed 's/\/ubuntu//')" - # logic taken from https://repodepot.azure.com/scripts/cloud-init/setup_repodepot.sh - ubuntuRel=$(lsb_release --release | awk '{print $2}') - ubuntuDist=$(lsb_release -c | awk '{print $2}') - # initialize archive.ubuntu.com repo - init_ubuntu_main_repo_depot 
${rootRepoDepotEndpoint} - init_ubuntu_pmc_repo_depot ${rootRepoDepotEndpoint} - # update apt list - echo "Running apt-get update" - aptget_update elif [ "$IS_FLATCAR" -eq 1 ] || [ "$IS_ACL" -eq 1 ]; then if [ "$install_ca_refresh_schedule" -eq 1 ]; then script_path="$(readlink -f "$0")" From 6b468eac3c95bc34f8a0f5f94fa35c29c79b5015 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 2 Apr 2026 14:18:47 -0700 Subject: [PATCH 15/70] Split init-aks-custom-cloud.sh to fix Flatcar/ACL customData size limit The unified init-aks-custom-cloud.sh script (~22KB) pushed Flatcar and ACL VM customData over Azure's 87,380 character limit, causing 16 E2E failures. Split the script into two files: - init-aks-custom-cloud.sh: cert refresh + scheduling (included for all clouds) - init-aks-custom-cloud-repos.sh: repo depot + chrony (custom cloud only) The main script sources the repos script at runtime if present. For non-custom-cloud VMs, only the smaller main script is embedded, reducing base64(gzip) size from 8,736 to 4,424 chars (-4,312 chars). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- .../artifacts/init-aks-custom-cloud-repos.sh | 358 ++++++++++++++++++ .../artifacts/init-aks-custom-cloud.sh | 350 +---------------- parts/linux/cloud-init/nodecustomdata.yml | 7 + pkg/agent/baker.go | 3 + pkg/agent/const.go | 5 +- pkg/agent/variables.go | 3 + 6 files changed, 381 insertions(+), 345 deletions(-) create mode 100644 parts/linux/cloud-init/artifacts/init-aks-custom-cloud-repos.sh diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-repos.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-repos.sh new file mode 100644 index 00000000000..0c68d513568 --- /dev/null +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-repos.sh @@ -0,0 +1,358 @@ +#!/bin/bash +# This script handles repo depot initialization and chrony configuration for +# AKS custom cloud environments. 
It is sourced by init-aks-custom-cloud.sh and +# inherits all variables from it (IS_UBUNTU, IS_MARINER, IS_AZURELINUX, +# IS_FLATCAR, IS_ACL, REPO_DEPOT_ENDPOINT, etc.). +# +# This script is only included in custom cloud images to keep the base +# customData size small for non-custom-cloud scenarios. + +set -x + +function init_ubuntu_main_repo_depot { + local repodepot_endpoint="$1" + # Initialize directory for keys + mkdir -p /etc/apt/keyrings + + # This copies the updated bundle to the location used by OpenSSL which is commonly used + echo "Copying updated bundle to OpenSSL .pem file..." + cp /etc/ssl/certs/ca-certificates.crt /usr/lib/ssl/cert.pem + echo "Updated bundle copied." + + # Back up sources.list and sources.list.d contents + mkdir -p /etc/apt/backup/ + if [ -f "/etc/apt/sources.list" ]; then + mv /etc/apt/sources.list /etc/apt/backup/ + fi + for sources_file in /etc/apt/sources.list.d/*; do + if [ -f "$sources_file" ]; then + mv "$sources_file" /etc/apt/backup/ + fi + done + + # Set location of sources file + . /etc/os-release + aptSourceFile="/etc/apt/sources.list.d/ubuntu.sources" + + # Create main sources file + cat < /etc/apt/sources.list.d/ubuntu.sources + +Types: deb +URIs: ${repodepot_endpoint}/ubuntu +Suites: ${VERSION_CODENAME} ${VERSION_CODENAME}-updates ${VERSION_CODENAME}-backports ${VERSION_CODENAME}-security +Components: main universe restricted multiverse +Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg +EOF + + # Update the apt sources file using the RepoDepot Ubuntu URL for this cloud. Update it by replacing + # all urls with the RepoDepot Ubuntu url + ubuntuUrl=${repodepot_endpoint}/ubuntu + echo "Converting URLs in $aptSourceFile to RepoDepot URLs..." 
+ sed -i "s,https\?://.[^ ]*,$ubuntuUrl,g" $aptSourceFile + echo "apt source URLs converted, see new file below:" + echo "" + echo "-----" + cat $aptSourceFile + echo "-----" + echo "" +} + +function check_url { + local url=$1 + echo "Checking url: $url" + + # Use curl to check the URL and capture both stdout and stderr + curl_exit_code=$(curl -s --head --request GET $url) + # Check the exit status of curl + # shellcheck disable=SC3010 + if [[ $? -ne 0 ]] || echo "$curl_exit_code" | grep -E "404 Not Found" > /dev/null; then + echo "ERROR: $url is not available. Please manually check if the url is valid before re-running script" + exit 1 + fi +} + +function write_to_sources_file { + local sources_list_d_file=$1 + local source_uri=$2 + shift 2 + local key_paths=("$@") + + sources_file_path="/etc/apt/sources.list.d/${sources_list_d_file}.sources" + ubuntuDist=$(lsb_release -c | awk '{print $2}') + + tee -a $sources_file_path < /dev/null + echo "$key_name key added to keyring." +} + +function derive_key_paths { + local key_names=("$@") + local key_paths=() + + for key_name in "${key_names[@]}"; do + key_paths+=("/etc/apt/keyrings/${key_name}.gpg") + done + + echo "${key_paths[*]}" +} + +function add_ms_keys { + # Add the Microsoft package server keys to keyring. + echo "Adding Microsoft keys to keyring..." + + add_key_ubuntu microsoft.asc + add_key_ubuntu msopentech.asc +} + +function aptget_update { + echo "apt-get updating..." + echo "note: depending on how many sources have been added this may take a couple minutes..." + if apt-get update | grep -q "404 Not Found"; then + echo "ERROR: apt-get update failed to find all sources. Please validate the sources or remove bad sources from your sources and try again." + exit 1 + else + echo "apt-get update complete!" + fi +} + +function init_ubuntu_pmc_repo_depot { + local repodepot_endpoint="$1" + # Add Microsoft packages source to the azure specific sources.list. 
+ echo "Adding the packages.microsoft.com Ubuntu-$ubuntuRel repo..." + + microsoftPackageSource="$repodepot_endpoint/microsoft/ubuntu/$ubuntuRel/prod" + check_url $microsoftPackageSource + write_to_sources_file microsoft-prod $microsoftPackageSource $(derive_key_paths microsoft.asc msopentech.asc) + write_to_sources_file microsoft-prod-testing $microsoftPackageSource $(derive_key_paths microsoft.asc msopentech.asc) + echo "Ubuntu ($ubuntuRel) repo added." + echo "Adding packages.microsoft.com keys" + add_ms_keys $repodepot_endpoint +} + +function init_mariner_repo_depot { + local repodepot_endpoint=$1 + echo "Adding [extended] repo" + cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-extended.repo + sed -i -e "s|extras|extended|" /etc/yum.repos.d/mariner-extended.repo + sed -i -e "s|Extras|Extended|" /etc/yum.repos.d/mariner-extended.repo + + echo "Adding [nvidia] repo" + cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-nvidia.repo + sed -i -e "s|extras|nvidia|" /etc/yum.repos.d/mariner-nvidia.repo + sed -i -e "s|Extras|Nvidia|" /etc/yum.repos.d/mariner-nvidia.repo + + echo "Adding [cloud-native] repo" + cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-cloud-native.repo + sed -i -e "s|extras|cloud-native|" /etc/yum.repos.d/mariner-cloud-native.repo + sed -i -e "s|Extras|Cloud-Native|" /etc/yum.repos.d/mariner-cloud-native.repo + + echo "Pointing Mariner repos at RepoDepot..." + for f in /etc/yum.repos.d/*.repo; do + sed -i -e "s|https://packages.microsoft.com|${repodepot_endpoint}/mariner/packages.microsoft.com|" $f + echo "$f modified." + done + echo "Mariner repo setup complete." 
+} + +function init_azurelinux_repo_depot { + local repodepot_endpoint=$1 + local repos=("amd" "base" "cloud-native" "extended" "ms-non-oss" "ms-oss" "nvidia") + + rm -f /etc/yum.repos.d/azurelinux* + + for repo in "${repos[@]}"; do + output_file="/etc/yum.repos.d/azurelinux-${repo}.repo" + repo_content=( + "[azurelinux-official-$repo]" + "name=Azure Linux Official $repo \$releasever \$basearch" + "baseurl=$repodepot_endpoint/azurelinux/\$releasever/prod/$repo/\$basearch" + "gpgkey=file:///etc/pki/rpm-gpg/MICROSOFT-RPM-GPG-KEY" + "gpgcheck=1" + "repo_gpgcheck=1" + "enabled=1" + "skip_if_unavailable=True" + "sslverify=1" + ) + + rm -f "$output_file" + + for line in "${repo_content[@]}"; do + echo "$line" >> "$output_file" + done + + echo "File '$output_file' has been created." + done + echo "Azure Linux repo setup complete." +} + +function dnf_makecache { + local retries=10 + local dnf_makecache_output=/tmp/dnf-makecache.out + local i + for i in $(seq 1 $retries); do + ! (dnf makecache -y 2>&1 | tee $dnf_makecache_output | grep -E "^([WE]:.*)|([eE]rr.*)$") && \ + cat $dnf_makecache_output && break || \ + cat $dnf_makecache_output + if [ $i -eq $retries ]; then + return 1 + else + sleep 5 + fi + done + echo "Executed dnf makecache -y $i times" +} + +if [ "$IS_UBUNTU" -eq 1 ]; then + rootRepoDepotEndpoint="$(echo "${REPO_DEPOT_ENDPOINT}" | sed 's/\/ubuntu//')" + if [ -n "$rootRepoDepotEndpoint" ]; then + cloud-init status --wait + ubuntuRel=$(lsb_release --release | awk '{print $2}') + ubuntuDist=$(lsb_release -c | awk '{print $2}') + init_ubuntu_main_repo_depot ${rootRepoDepotEndpoint} + init_ubuntu_pmc_repo_depot ${rootRepoDepotEndpoint} + echo "Running apt-get update" + aptget_update + else + echo "REPO_DEPOT_ENDPOINT empty, skipping Ubuntu RepoDepot initialization" + fi +elif [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 ]; then + cloud-init status --wait + + marinerRepoDepotEndpoint="$(echo "${REPO_DEPOT_ENDPOINT}" | sed 's/\/ubuntu//')" + if [ -z 
"$marinerRepoDepotEndpoint" ]; then + >&2 echo "repo depot endpoint empty while running custom-cloud init script" + else + if [ "$IS_MARINER" -eq 1 ]; then + echo "Initializing Mariner repo depot settings..." + init_mariner_repo_depot ${marinerRepoDepotEndpoint} + dnf_makecache || exit 1 + else + echo "Initializing Azure Linux repo depot settings..." + init_azurelinux_repo_depot ${marinerRepoDepotEndpoint} + dnf_makecache || exit 1 + fi + fi +fi + +# Disable systemd-timesyncd and install chrony and uses local time source +# ACL has PTP clock config compiled into chronyd with no config file or sourcedir directives, +# so it uses only the local PTP clock and has no DHCP-injectable NTP sources. +if [ "$IS_ACL" -eq 1 ]; then + echo "Skipping chrony configuration for ACL (PTP clock baked into chronyd, no external NTP sources)" +elif [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 ]; then +cat > /etc/chrony.conf < $chrony_conf < /etc/apt/sources.list.d/ubuntu.sources - -Types: deb -URIs: ${repodepot_endpoint}/ubuntu -Suites: ${VERSION_CODENAME} ${VERSION_CODENAME}-updates ${VERSION_CODENAME}-backports ${VERSION_CODENAME}-security -Components: main universe restricted multiverse -Signed-By: /usr/share/keyrings/ubuntu-archive-keyring.gpg -EOF - - # Update the apt sources file using the RepoDepot Ubuntu URL for this cloud. Update it by replacing - # all urls with the RepoDepot Ubuntu url - ubuntuUrl=${repodepot_endpoint}/ubuntu - echo "Converting URLs in $aptSourceFile to RepoDepot URLs..." - sed -i "s,https\?://.[^ ]*,$ubuntuUrl,g" $aptSourceFile - echo "apt source URLs converted, see new file below:" - echo "" - echo "-----" - cat $aptSourceFile - echo "-----" - echo "" -} - -function check_url { - local url=$1 - echo "Checking url: $url" - - # Use curl to check the URL and capture both stdout and stderr - curl_exit_code=$(curl -s --head --request GET $url) - # Check the exit status of curl - # shellcheck disable=SC3010 - if [[ $? 
-ne 0 ]] || echo "$curl_exit_code" | grep -E "404 Not Found" > /dev/null; then - echo "ERROR: $url is not available. Please manually check if the url is valid before re-running script" - exit 1 - fi -} - -function write_to_sources_file { - local sources_list_d_file=$1 - local source_uri=$2 - shift 2 - local key_paths=("$@") - - sources_file_path="/etc/apt/sources.list.d/${sources_list_d_file}.sources" - ubuntuDist=$(lsb_release -c | awk '{print $2}') - - tee -a $sources_file_path < /dev/null - echo "$key_name key added to keyring." -} - -function derive_key_paths { - local key_names=("$@") - local key_paths=() - - for key_name in "${key_names[@]}"; do - key_paths+=("/etc/apt/keyrings/${key_name}.gpg") - done - - echo "${key_paths[*]}" -} - -function add_ms_keys { - # Add the Microsoft package server keys to keyring. - echo "Adding Microsoft keys to keyring..." - - add_key_ubuntu microsoft.asc - add_key_ubuntu msopentech.asc -} - -function aptget_update { - echo "apt-get updating..." - echo "note: depending on how many sources have been added this may take a couple minutes..." - if apt-get update | grep -q "404 Not Found"; then - echo "ERROR: apt-get update failed to find all sources. Please validate the sources or remove bad sources from your sources and try again." - exit 1 - else - echo "apt-get update complete!" - fi -} - -function init_ubuntu_pmc_repo_depot { - local repodepot_endpoint="$1" - # Add Microsoft packages source to the azure specific sources.list. - echo "Adding the packages.microsoft.com Ubuntu-$ubuntuRel repo..." - - microsoftPackageSource="$repodepot_endpoint/microsoft/ubuntu/$ubuntuRel/prod" - check_url $microsoftPackageSource - write_to_sources_file microsoft-prod $microsoftPackageSource $(derive_key_paths microsoft.asc msopentech.asc) - write_to_sources_file microsoft-prod-testing $microsoftPackageSource $(derive_key_paths microsoft.asc msopentech.asc) - echo "Ubuntu ($ubuntuRel) repo added." 
- echo "Adding packages.microsoft.com keys" - add_ms_keys $repodepot_endpoint -} - -function init_mariner_repo_depot { - local repodepot_endpoint=$1 - echo "Adding [extended] repo" - cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-extended.repo - sed -i -e "s|extras|extended|" /etc/yum.repos.d/mariner-extended.repo - sed -i -e "s|Extras|Extended|" /etc/yum.repos.d/mariner-extended.repo - - echo "Adding [nvidia] repo" - cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-nvidia.repo - sed -i -e "s|extras|nvidia|" /etc/yum.repos.d/mariner-nvidia.repo - sed -i -e "s|Extras|Nvidia|" /etc/yum.repos.d/mariner-nvidia.repo - - echo "Adding [cloud-native] repo" - cp /etc/yum.repos.d/mariner-extras.repo /etc/yum.repos.d/mariner-cloud-native.repo - sed -i -e "s|extras|cloud-native|" /etc/yum.repos.d/mariner-cloud-native.repo - sed -i -e "s|Extras|Cloud-Native|" /etc/yum.repos.d/mariner-cloud-native.repo - - echo "Pointing Mariner repos at RepoDepot..." - for f in /etc/yum.repos.d/*.repo; do - sed -i -e "s|https://packages.microsoft.com|${repodepot_endpoint}/mariner/packages.microsoft.com|" $f - echo "$f modified." - done - echo "Mariner repo setup complete." -} - -function init_azurelinux_repo_depot { - local repodepot_endpoint=$1 - local repos=("amd" "base" "cloud-native" "extended" "ms-non-oss" "ms-oss" "nvidia") - - rm -f /etc/yum.repos.d/azurelinux* - - for repo in "${repos[@]}"; do - output_file="/etc/yum.repos.d/azurelinux-${repo}.repo" - repo_content=( - "[azurelinux-official-$repo]" - "name=Azure Linux Official $repo \$releasever \$basearch" - "baseurl=$repodepot_endpoint/azurelinux/\$releasever/prod/$repo/\$basearch" - "gpgkey=file:///etc/pki/rpm-gpg/MICROSOFT-RPM-GPG-KEY" - "gpgcheck=1" - "repo_gpgcheck=1" - "enabled=1" - "skip_if_unavailable=True" - "sslverify=1" - ) - - rm -f "$output_file" - - for line in "${repo_content[@]}"; do - echo "$line" >> "$output_file" - done - - echo "File '$output_file' has been created." 
- done - echo "Azure Linux repo setup complete." -} - -function dnf_makecache { - local retries=10 - local dnf_makecache_output=/tmp/dnf-makecache.out - local i - for i in $(seq 1 $retries); do - ! (dnf makecache -y 2>&1 | tee $dnf_makecache_output | grep -E "^([WE]:.*)|([eE]rr.*)$") && \ - cat $dnf_makecache_output && break || \ - cat $dnf_makecache_output - if [ $i -eq $retries ]; then - return 1 - else - sleep 5 - fi - done - echo "Executed dnf makecache -y $i times" -} - if [ "$IS_UBUNTU" -eq 1 ] || [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 ]; then scriptPath=$0 # Determine an absolute, canonical path to this script for use in cron. @@ -504,139 +293,12 @@ EOF fi fi -if [ "$IS_UBUNTU" -eq 1 ]; then - rootRepoDepotEndpoint="$(echo "${REPO_DEPOT_ENDPOINT}" | sed 's/\/ubuntu//')" - if [ -n "$rootRepoDepotEndpoint" ]; then - cloud-init status --wait - ubuntuRel=$(lsb_release --release | awk '{print $2}') - ubuntuDist=$(lsb_release -c | awk '{print $2}') - init_ubuntu_main_repo_depot ${rootRepoDepotEndpoint} - init_ubuntu_pmc_repo_depot ${rootRepoDepotEndpoint} - echo "Running apt-get update" - aptget_update - else - echo "REPO_DEPOT_ENDPOINT empty, skipping Ubuntu RepoDepot initialization" - fi -elif [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 ]; then - cloud-init status --wait - - marinerRepoDepotEndpoint="$(echo "${REPO_DEPOT_ENDPOINT}" | sed 's/\/ubuntu//')" - if [ -z "$marinerRepoDepotEndpoint" ]; then - >&2 echo "repo depot endpoint empty while running custom-cloud init script" - else - if [ "$IS_MARINER" -eq 1 ]; then - echo "Initializing Mariner repo depot settings..." - init_mariner_repo_depot ${marinerRepoDepotEndpoint} - dnf_makecache || exit 1 - else - echo "Initializing Azure Linux repo depot settings..." 
- init_azurelinux_repo_depot ${marinerRepoDepotEndpoint} - dnf_makecache || exit 1 - fi - fi -fi - -# Disable systemd-timesyncd and install chrony and uses local time source -# ACL has PTP clock config compiled into chronyd with no config file or sourcedir directives, -# so it uses only the local PTP clock and has no DHCP-injectable NTP sources. -if [ "$IS_ACL" -eq 1 ]; then - echo "Skipping chrony configuration for ACL (PTP clock baked into chronyd, no external NTP sources)" -elif [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 ]; then -cat > /etc/chrony.conf < $chrony_conf < Date: Mon, 13 Apr 2026 11:27:13 -0700 Subject: [PATCH 16/70] feat(e2e): add RCV1P cert mode end-to-end tests Add e2e test infrastructure and scenarios to validate RCV1P (Root Certificate V1P) certificate endpoint mode across all supported Linux distros and Windows versions. Infrastructure changes: - Introduce ClusterInfra struct to decouple cluster lifecycle functions from the default Azure subscription, enabling per-subscription clients - Refactor ~20 functions in cluster.go, kube.go, and aks_model.go to accept ClusterInfra instead of hardcoding config.Azure - Add NewAzureClientForSubscription() to construct ARM clients for any subscription, replacing the single-subscription NewAzureClient() - Add CreateVMManagedIdentityInRG() for identity-only creation without blob storage (RCV1P subscription doesn't need shared storage) - Add ClusterRCV1PKubenet cached cluster function and RCV1P-specific resource group/identity cache entries Config and pipeline: - Add RCV1P_SUBSCRIPTION_ID env var to config, with lazy-init of RCV1PAzure client and helper functions - Pass RCV1P_SUBSCRIPTION_ID through e2e-template.yaml and e2e_run.sh - Add dedicated e2e-rcv1p.yaml pipeline with daily schedule Test scenarios: - Linux: Ubuntu 22.04, Ubuntu 24.04, AzureLinux V3, Flatcar, ACL - Windows: Server 2022, 23H2, 2025 - All tests skip gracefully when RCV1P_SUBSCRIPTION_ID is unset Validators: - 
ValidateRCV1PCertMode (Linux): checks provisioning log for rcv1p mode, verifies certs in /root/AzureCACertificates, validates distro-specific trust store updates, confirms cron/systemd refresh schedule - ValidateRCV1PCertModeWindows: checks C:\AzureCACertificates directory and scheduled refresh task Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- .pipelines/e2e-rcv1p.yaml | 19 ++ .pipelines/scripts/e2e_run.sh | 2 + .pipelines/templates/e2e-template.yaml | 1 + e2e/aks_model.go | 29 +-- e2e/cache.go | 47 +++- e2e/cluster.go | 124 ++++++----- e2e/config/azure.go | 96 ++++---- e2e/config/config.go | 23 ++ e2e/kube.go | 14 +- e2e/scenario_rcv1p_test.go | 210 ++++++++++++++++++ e2e/scenario_rcv1p_win_test.go | 91 ++++++++ e2e/test_helpers.go | 37 ++- e2e/types.go | 68 ++++++ e2e/validators.go | 107 +++++++++ e2e/vmss.go | 37 +-- .../artifacts/init-aks-custom-cloud.sh | 3 +- .../artifacts/init_aks_custom_cloud_spec.sh | 2 +- staging/cse/windows/kubernetesfunc.ps1 | 3 +- 18 files changed, 752 insertions(+), 161 deletions(-) create mode 100644 .pipelines/e2e-rcv1p.yaml create mode 100644 e2e/scenario_rcv1p_test.go create mode 100644 e2e/scenario_rcv1p_win_test.go diff --git a/.pipelines/e2e-rcv1p.yaml b/.pipelines/e2e-rcv1p.yaml new file mode 100644 index 00000000000..5fdf9d3a5ee --- /dev/null +++ b/.pipelines/e2e-rcv1p.yaml @@ -0,0 +1,19 @@ +name: $(Date:yyyyMMdd)$(Rev:.r) +variables: + TAGS_TO_RUN: "rcv1pcertmode=true" + SKIP_E2E_TESTS: false + E2E_GO_TEST_TIMEOUT: "75m" +schedules: + - cron: "0 11 * * *" + displayName: Daily 3am PST + branches: + include: + - main + always: true +trigger: none +pr: none +jobs: + - template: ./templates/e2e-template.yaml + parameters: + name: RCV1P Cert Mode Tests + IgnoreScenariosWithMissingVhd: false diff --git a/.pipelines/scripts/e2e_run.sh b/.pipelines/scripts/e2e_run.sh index 1dcea264298..097fe250756 100644 --- a/.pipelines/scripts/e2e_run.sh +++ 
b/.pipelines/scripts/e2e_run.sh @@ -35,6 +35,7 @@ VHD_BUILD_ID="${VHD_BUILD_ID:-}" IGNORE_SCENARIOS_WITH_MISSING_VHD="${IGNORE_SCENARIOS_WITH_MISSING_VHD:-}" LOGGING_DIR="${LOGGING_DIR:-}" E2E_SUBSCRIPTION_ID="${E2E_SUBSCRIPTION_ID:-}" +RCV1P_SUBSCRIPTION_ID="${RCV1P_SUBSCRIPTION_ID:-}" ENABLE_SECURE_TLS_BOOTSTRAPPING="${ENABLE_SECURE_TLS_BOOTSTRAPPING:-true}" TAGS_TO_SKIP="${TAGS_TO_SKIP:-}" TAGS_TO_RUN="${TAGS_TO_RUN:-}" @@ -47,6 +48,7 @@ echo "VHD_BUILD_ID: ${VHD_BUILD_ID}" echo "IGNORE_SCENARIOS_WITH_MISSING_VHD: ${IGNORE_SCENARIOS_WITH_MISSING_VHD}" echo "LOGGING_DIR: ${LOGGING_DIR}" echo "E2E_SUBSCRIPTION_ID: ${E2E_SUBSCRIPTION_ID}" +echo "RCV1P_SUBSCRIPTION_ID: ${RCV1P_SUBSCRIPTION_ID}" echo "ENABLE_SECURE_TLS_BOOTSTRAPPING: ${ENABLE_SECURE_TLS_BOOTSTRAPPING}" echo "TAGS_TO_SKIP: ${TAGS_TO_SKIP}" echo "TAGS_TO_RUN: ${TAGS_TO_RUN}" diff --git a/.pipelines/templates/e2e-template.yaml b/.pipelines/templates/e2e-template.yaml index 3b4fad643d7..26d659f77ae 100644 --- a/.pipelines/templates/e2e-template.yaml +++ b/.pipelines/templates/e2e-template.yaml @@ -38,6 +38,7 @@ jobs: displayName: Run AgentBaker E2E env: E2E_SUBSCRIPTION_ID: $(E2E_SUBSCRIPTION_ID) + RCV1P_SUBSCRIPTION_ID: $(RCV1P_SUBSCRIPTION_ID) SYS_SSH_PUBLIC_KEY: $(SYS_SSH_PUBLIC_KEY) SYS_SSH_PRIVATE_KEY_B64: $(SYS_SSH_PRIVATE_KEY_B64) BUILD_SRC_DIR: $(System.DefaultWorkingDirectory) diff --git a/e2e/aks_model.go b/e2e/aks_model.go index b618ff87524..f5f999d44c9 100644 --- a/e2e/aks_model.go +++ b/e2e/aks_model.go @@ -301,22 +301,23 @@ func getFirewall(ctx context.Context, location, firewallSubnetID, publicIPID str } func addFirewallRules( - ctx context.Context, clusterModel *armcontainerservice.ManagedCluster, + ctx context.Context, infra *ClusterInfra, clusterModel *armcontainerservice.ManagedCluster, ) error { location := *clusterModel.Location defer toolkit.LogStepCtx(ctx, "adding firewall rules")() rg := *clusterModel.Properties.NodeResourceGroup - vnet, err := getClusterVNet(ctx, rg) + vnet, err 
:= getClusterVNet(ctx, infra, rg) if err != nil { return err } - // For kubenet, the AKS-managed route table must stay attached so that pod - // routes (managed by cloud-provider-azure) and firewall routes coexist. - // For Azure CNI variants, the subnet may not have any route table, so we - // create and associate a dedicated one before adding the firewall routes. - aksSubnetResp, err := config.Azure.Subnet.Get(ctx, rg, vnet.name, "aks-subnet", nil) + // Find the AKS-managed route table currently associated with the subnet. + // We add firewall routes directly to this table so that both pod routes + // (managed by cloud-provider-azure) and firewall routes coexist. Creating + // a separate route table and swapping the subnet association disconnects + // the pod routes and breaks kubenet networking. + aksSubnetResp, err := infra.Azure.Subnet.Get(ctx, rg, vnet.name, "aks-subnet", nil) if err != nil { return fmt.Errorf("failed to get AKS subnet: %w", err) } @@ -334,7 +335,7 @@ func addFirewallRules( } toolkit.Logf(ctx, "Creating subnet %s in VNet %s", firewallSubnetName, vnet.name) - subnetPoller, err := config.Azure.Subnet.BeginCreateOrUpdate( + subnetPoller, err := infra.Azure.Subnet.BeginCreateOrUpdate( ctx, rg, vnet.name, @@ -367,7 +368,7 @@ func addFirewallRules( } toolkit.Logf(ctx, "Creating public IP %s", publicIPName) - pipPoller, err := config.Azure.PublicIPAddresses.BeginCreateOrUpdate( + pipPoller, err := infra.Azure.PublicIPAddresses.BeginCreateOrUpdate( ctx, rg, publicIPName, @@ -388,7 +389,7 @@ func addFirewallRules( firewallName := "abe2e-fw" firewall := getFirewall(ctx, location, firewallSubnetID, publicIPID) - fwPoller, err := config.Azure.AzureFirewall.BeginCreateOrUpdate(ctx, rg, firewallName, *firewall, nil) + fwPoller, err := infra.Azure.AzureFirewall.BeginCreateOrUpdate(ctx, rg, firewallName, *firewall, nil) if err != nil { return fmt.Errorf("failed to start Firewall creation: %w", err) } @@ -434,7 +435,7 @@ func addFirewallRules( for _, route := 
range firewallRoutes { toolkit.Logf(ctx, "Adding route %q to AKS route table %q", *route.Name, aksRTName) - poller, err := config.Azure.Routes.BeginCreateOrUpdate(ctx, rg, aksRTName, *route.Name, route, nil) + poller, err := infra.Azure.Routes.BeginCreateOrUpdate(ctx, rg, aksRTName, *route.Name, route, nil) if err != nil { return fmt.Errorf("failed to start adding route %q: %w", *route.Name, err) } @@ -512,7 +513,7 @@ func addPrivateAzureContainerRegistry(ctx context.Context, cluster *armcontainer if err := createPrivateAzureContainerRegistryPullSecret(ctx, cluster, kube, resourceGroupName, isNonAnonymousPull); err != nil { return fmt.Errorf("create private acr pull secret: %w", err) } - vnet, err := getClusterVNet(ctx, *cluster.Properties.NodeResourceGroup) + vnet, err := getClusterVNet(ctx, DefaultClusterInfra, *cluster.Properties.NodeResourceGroup) if err != nil { return err } @@ -533,7 +534,7 @@ func addNetworkIsolatedSettings(ctx context.Context, clusterModel *armcontainers location := *clusterModel.Location defer toolkit.LogStepCtx(ctx, fmt.Sprintf("Adding network settings for network isolated cluster %s in rg %s", *clusterModel.Name, *clusterModel.Properties.NodeResourceGroup)) - vnet, err := getClusterVNet(ctx, *clusterModel.Properties.NodeResourceGroup) + vnet, err := getClusterVNet(ctx, DefaultClusterInfra, *clusterModel.Properties.NodeResourceGroup) if err != nil { return err } @@ -680,7 +681,7 @@ func createPrivateAzureContainerRegistry(ctx context.Context, cluster *armcontai } // if ACR gets recreated so should the cluster toolkit.Logf(ctx, "Private ACR deleted, deleting cluster %s", *cluster.Name) - if err := deleteCluster(ctx, *cluster.Name, resourceGroup); err != nil { + if err := deleteCluster(ctx, DefaultClusterInfra, *cluster.Name, resourceGroup); err != nil { return fmt.Errorf("failed to delete cluster: %w", err) } } else { diff --git a/e2e/cache.go b/e2e/cache.go index 1b07d383815..777acdaf559 100644 --- a/e2e/cache.go +++ b/e2e/cache.go @@ 
-10,6 +10,7 @@ import ( "github.com/Azure/azure-sdk-for-go/sdk/azcore" "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources/v3" ) // cachedFunc creates a thread-safe memoized version of a function. @@ -150,56 +151,67 @@ func clusterLatestKubernetesVersion(ctx context.Context, request ClusterRequest) if err != nil { return nil, fmt.Errorf("getting latest kubernetes version cluster model: %w", err) } - return prepareCluster(ctx, model, false, false) + return prepareCluster(ctx, DefaultClusterInfra, model, false, false) } var ClusterKubenet = cachedFunc(clusterKubenet) // clusterKubenet creates a basic cluster using kubenet networking func clusterKubenet(ctx context.Context, request ClusterRequest) (*Cluster, error) { - return prepareCluster(ctx, getKubenetClusterModel("abe2e-kubenet-v4", request.Location, request.K8sSystemPoolSKU), false, false) + return prepareCluster(ctx, DefaultClusterInfra, getKubenetClusterModel("abe2e-kubenet-v4", request.Location, request.K8sSystemPoolSKU), false, false) } var ClusterAzureNetwork = cachedFunc(clusterAzureNetwork) // clusterAzureNetwork creates a cluster with Azure CNI networking func clusterAzureNetwork(ctx context.Context, request ClusterRequest) (*Cluster, error) { - return prepareCluster(ctx, getAzureNetworkClusterModel("abe2e-azure-network-v3", request.Location, request.K8sSystemPoolSKU), false, false) + return prepareCluster(ctx, DefaultClusterInfra, getAzureNetworkClusterModel("abe2e-azure-network-v3", request.Location, request.K8sSystemPoolSKU), false, false) } var ClusterAzureBootstrapProfileCache = cachedFunc(clusterAzureBootstrapProfileCache) // clusterAzureBootstrapProfileCache creates a cluster with bootstrap profile cache but without network isolation func clusterAzureBootstrapProfileCache(ctx context.Context, request ClusterRequest) (*Cluster, error) { - return 
prepareCluster(ctx, getAzureNetworkClusterModel("abe2e-azure-bootstrapprofile-cache-v1", request.Location, request.K8sSystemPoolSKU), false, true) + return prepareCluster(ctx, DefaultClusterInfra, getAzureNetworkClusterModel("abe2e-azure-bootstrapprofile-cache-v1", request.Location, request.K8sSystemPoolSKU), false, true) } var ClusterAzureNetworkIsolated = cachedFunc(clusterAzureNetworkIsolated) // clusterAzureNetworkIsolated creates a networkisolated Azure network cluster (no internet access) func clusterAzureNetworkIsolated(ctx context.Context, request ClusterRequest) (*Cluster, error) { - return prepareCluster(ctx, getAzureNetworkClusterModel("abe2e-azure-networkisolated-v1", request.Location, request.K8sSystemPoolSKU), true, false) + return prepareCluster(ctx, DefaultClusterInfra, getAzureNetworkClusterModel("abe2e-azure-networkisolated-v1", request.Location, request.K8sSystemPoolSKU), true, false) } var ClusterAzureOverlayNetwork = cachedFunc(clusterAzureOverlayNetwork) // clusterAzureOverlayNetwork creates a cluster with Azure CNI Overlay networking func clusterAzureOverlayNetwork(ctx context.Context, request ClusterRequest) (*Cluster, error) { - return prepareCluster(ctx, getAzureOverlayNetworkClusterModel("abe2e-azure-overlay-network-v3", request.Location, request.K8sSystemPoolSKU), false, false) + return prepareCluster(ctx, DefaultClusterInfra, getAzureOverlayNetworkClusterModel("abe2e-azure-overlay-network-v3", request.Location, request.K8sSystemPoolSKU), false, false) } var ClusterAzureOverlayNetworkDualStack = cachedFunc(clusterAzureOverlayNetworkDualStack) // clusterAzureOverlayNetworkDualStack creates a dual-stack (IPv4+IPv6) Azure CNI Overlay cluster func clusterAzureOverlayNetworkDualStack(ctx context.Context, request ClusterRequest) (*Cluster, error) { - return prepareCluster(ctx, getAzureOverlayNetworkDualStackClusterModel("abe2e-azure-overlay-dualstack-v3", request.Location, request.K8sSystemPoolSKU), false, false) + return prepareCluster(ctx, 
DefaultClusterInfra, getAzureOverlayNetworkDualStackClusterModel("abe2e-azure-overlay-dualstack-v3", request.Location, request.K8sSystemPoolSKU), false, false) } var ClusterCiliumNetwork = cachedFunc(clusterCiliumNetwork) // clusterCiliumNetwork creates a cluster with Cilium CNI networking func clusterCiliumNetwork(ctx context.Context, request ClusterRequest) (*Cluster, error) { - return prepareCluster(ctx, getCiliumNetworkClusterModel("abe2e-cilium-network-v3", request.Location, request.K8sSystemPoolSKU), false, false) + return prepareCluster(ctx, DefaultClusterInfra, getCiliumNetworkClusterModel("abe2e-cilium-network-v3", request.Location, request.K8sSystemPoolSKU), false, false) +} + +var ClusterRCV1PKubenet = cachedFunc(clusterRCV1PKubenet) + +// clusterRCV1PKubenet creates a kubenet cluster in the RCV1P subscription for cert mode testing. +func clusterRCV1PKubenet(ctx context.Context, request ClusterRequest) (*Cluster, error) { + infra := RCV1PClusterInfra() + if infra == nil { + return nil, fmt.Errorf("RCV1P_SUBSCRIPTION_ID not set, cannot create RCV1P cluster") + } + return prepareCluster(ctx, infra, getKubenetClusterModel("abe2e-rcv1p-kubenet-v1", request.Location, request.K8sSystemPoolSKU), false, false) } // isNotFoundErr checks if an error represents a "not found" response from Azure API @@ -228,6 +240,25 @@ var CachedEnsureResourceGroup = cachedFunc(ensureResourceGroup) var CachedCreateVMManagedIdentity = cachedFunc(config.Azure.CreateVMManagedIdentity) var CachedCompileAndUploadAKSNodeController = cachedFunc(compileAndUploadAKSNodeController) +// CachedRCV1PEnsureResourceGroup creates the resource group in the RCV1P subscription. +var CachedRCV1PEnsureResourceGroup = cachedFunc(ensureRCV1PResourceGroup) + +// CachedRCV1PCreateVMManagedIdentity creates a VM managed identity in the RCV1P subscription. 
+var CachedRCV1PCreateVMManagedIdentity = cachedFunc(func(ctx context.Context, location string) (string, error) { + if config.RCV1PAzure == nil { + return "", fmt.Errorf("RCV1P_SUBSCRIPTION_ID not set") + } + return config.RCV1PAzure.CreateVMManagedIdentityInRG(ctx, config.RCV1PResourceGroupName(location), location) +}) + +func ensureRCV1PResourceGroup(ctx context.Context, location string) (armresources.ResourceGroup, error) { + infra := RCV1PClusterInfra() + if infra == nil { + return armresources.ResourceGroup{}, fmt.Errorf("RCV1P_SUBSCRIPTION_ID not set") + } + return ensureResourceGroupWithInfra(ctx, infra, location) +} + // VMSizeSKURequest is the cache key for Resource SKU lookups by VM size and location. type VMSizeSKURequest struct { Location string diff --git a/e2e/cluster.go b/e2e/cluster.go index 34a6eb6c168..a6a428ba91a 100644 --- a/e2e/cluster.go +++ b/e2e/cluster.go @@ -67,14 +67,14 @@ func (c *Cluster) MaxPodsPerNode() (int, error) { // This function contains complex concurrent orchestration — keep it as // minimal as possible and push all non-trivial logic into the individual // task functions it calls. 
-func prepareCluster(ctx context.Context, clusterModel *armcontainerservice.ManagedCluster, isNetworkIsolated, attachPrivateAcr bool) (*Cluster, error) { +func prepareCluster(ctx context.Context, infra *ClusterInfra, clusterModel *armcontainerservice.ManagedCluster, isNetworkIsolated, attachPrivateAcr bool) (*Cluster, error) { defer toolkit.LogStepCtx(ctx, "preparing cluster")() ctx, cancel := context.WithTimeout(ctx, config.Config.TestTimeoutCluster) defer cancel() clusterModel.Name = to.Ptr(fmt.Sprintf("%s-%s", *clusterModel.Name, hash(clusterModel))) - cluster, err := getOrCreateCluster(ctx, clusterModel) + cluster, err := getOrCreateCluster(ctx, infra, clusterModel) if err != nil { return nil, fmt.Errorf("get or create cluster: %w", err) } @@ -85,11 +85,11 @@ func prepareCluster(ctx context.Context, clusterModel *armcontainerservice.Manag // finish before other subnet writes (firewall / network-isolated setup) // to avoid Azure VNet serialisation races. bastion := dag.Go(g, func(ctx context.Context) (*Bastion, error) { - return getOrCreateBastion(ctx, cluster) + return getOrCreateBastion(ctx, infra, cluster) }) dag.Run(g, func(ctx context.Context) error { return ensureMaintenanceConfiguration(ctx, cluster) }) - subnet := dag.Go(g, func(ctx context.Context) (string, error) { return getClusterSubnetID(ctx, cluster) }) - kube := dag.Go(g, func(ctx context.Context) (*Kubeclient, error) { return getClusterKubeClient(ctx, cluster) }) + subnet := dag.Go(g, func(ctx context.Context) (string, error) { return getClusterSubnetID(ctx, infra, cluster) }) + kube := dag.Go(g, func(ctx context.Context) (*Kubeclient, error) { return getClusterKubeClient(ctx, infra, cluster) }) identity := dag.Go(g, func(ctx context.Context) (*armcontainerservice.UserAssignedIdentity, error) { return getClusterKubeletIdentity(ctx, cluster) }) @@ -101,12 +101,12 @@ func prepareCluster(ctx context.Context, clusterModel *armcontainerservice.Manag // objects whose backing VMSS no longer exist. 
var networkDeps []dag.Dep if !isNetworkIsolated { - networkDeps = append(networkDeps, dag.Run(g, func(ctx context.Context) error { return addFirewallRules(ctx, cluster) }, bastion)) + networkDeps = append(networkDeps, dag.Run(g, func(ctx context.Context) error { return addFirewallRules(ctx, infra, cluster) }, bastion)) } if isNetworkIsolated { networkDeps = append(networkDeps, dag.Run(g, func(ctx context.Context) error { return addNetworkIsolatedSettings(ctx, cluster) }, bastion)) } - dag.Run1(g, kube, func(ctx context.Context, k *Kubeclient) error { return collectGarbageVMSS(ctx, cluster, k) }, networkDeps...) + dag.Run1(g, kube, func(ctx context.Context, k *Kubeclient) error { return collectGarbageVMSS(ctx, infra, cluster, k) }, networkDeps...) needACR := isNetworkIsolated || attachPrivateAcr acrNonAnon := dag.Run2(g, kube, identity, addACR(cluster, needACR, true)) acrAnon := dag.Run2(g, kube, identity, addACR(cluster, needACR, false)) @@ -130,6 +130,7 @@ func prepareCluster(ctx context.Context, clusterModel *armcontainerservice.Manag if err := g.Wait(); err != nil { return nil, fmt.Errorf("prepare cluster tasks: %w", err) } + return &Cluster{ Model: cluster, Kube: kube.MustGet(), @@ -247,9 +248,10 @@ func hash(cluster *armcontainerservice.ManagedCluster) string { return hexHash[:5] } -func getOrCreateCluster(ctx context.Context, cluster *armcontainerservice.ManagedCluster) (*armcontainerservice.ManagedCluster, error) { +func getOrCreateCluster(ctx context.Context, infra *ClusterInfra, cluster *armcontainerservice.ManagedCluster) (*armcontainerservice.ManagedCluster, error) { defer toolkit.LogStepCtxf(ctx, "get or create cluster %s", *cluster.Name)() - existingCluster, err := getExistingCluster(ctx, *cluster.Location, *cluster.Name) + rgName := infra.ResourceGroupName(*cluster.Location) + existingCluster, err := getExistingCluster(ctx, infra, rgName, *cluster.Name) if err != nil { return nil, fmt.Errorf("failed to get existing cluster %q: %w, and wont retry", 
*cluster.Name, err) } @@ -259,13 +261,12 @@ func getOrCreateCluster(ctx context.Context, cluster *armcontainerservice.Manage return existingCluster, nil } - return createNewAKSClusterWithRetry(ctx, cluster) + return createNewAKSClusterWithRetry(ctx, infra, rgName, cluster) } // isExistingCluster checks if an AKS cluster exists. return the cluster only if its provisioning state is Succeeded and can be used. non-nil error if not retriable -func getExistingCluster(ctx context.Context, location, clusterName string) (*armcontainerservice.ManagedCluster, error) { - resourceGroupName := config.ResourceGroupName(location) - existingCluster, err := config.Azure.AKS.Get(ctx, resourceGroupName, clusterName, nil) +func getExistingCluster(ctx context.Context, infra *ClusterInfra, resourceGroupName, clusterName string) (*armcontainerservice.ManagedCluster, error) { + existingCluster, err := infra.Azure.AKS.Get(ctx, resourceGroupName, clusterName, nil) var azErr *azcore.ResponseError if errors.As(err, &azErr) { if azErr.StatusCode == 404 { @@ -278,7 +279,7 @@ func getExistingCluster(ctx context.Context, location, clusterName string) (*arm switch *existingCluster.Properties.ProvisioningState { case "Succeeded": - nodeRGExists, err := isExistingResourceGroup(ctx, *existingCluster.Properties.NodeResourceGroup) + nodeRGExists, err := isExistingResourceGroup(ctx, infra, *existingCluster.Properties.NodeResourceGroup) if err != nil { return nil, err @@ -290,28 +291,28 @@ func getExistingCluster(ctx context.Context, location, clusterName string) (*arm fallthrough case "Failed": toolkit.Logf(ctx, "##vso[task.logissue type=warning;]Cluster %s in Failed state, deleting", clusterName) - if err := deleteCluster(ctx, clusterName, resourceGroupName); err != nil { + if err := deleteCluster(ctx, infra, clusterName, resourceGroupName); err != nil { return nil, err } // Wait for Azure to confirm cluster is fully deleted before allowing recreation. 
// This prevents "Reconcile managed identity credential failed" errors where Azure's // backend still has stale references to the old cluster during the new cluster's // identity reconciliation process. - if err := waitForClusterDeletion(ctx, clusterName, resourceGroupName); err != nil { + if err := waitForClusterDeletion(ctx, infra, clusterName, resourceGroupName); err != nil { return nil, fmt.Errorf("failed waiting for cluster deletion: %w", err) } return nil, nil default: // other provisioning state, deleting, , stopping,,cancaled,cancelling,"Creating", "Updating", "Scaling", "Migrating", "Upgrading", "Starting", "Restoring": .. plus many others. toolkit.Logf(ctx, "##vso[task.logissue type=warning;]Unexpected cluster provisioning state %s: %s", clusterName, *existingCluster.Properties.ProvisioningState) - return waitUntilClusterReady(ctx, clusterName, location) + return waitUntilClusterReady(ctx, infra, clusterName, resourceGroupName) } } -func deleteCluster(ctx context.Context, clusterName, resourceGroupName string) error { +func deleteCluster(ctx context.Context, infra *ClusterInfra, clusterName, resourceGroupName string) error { defer toolkit.LogStepCtxf(ctx, "deleting cluster %s", clusterName)() // beileih: why do we do this? 
- _, err := config.Azure.AKS.Get(ctx, resourceGroupName, clusterName, nil) + _, err := infra.Azure.AKS.Get(ctx, resourceGroupName, clusterName, nil) if err != nil { var azErr *azcore.ResponseError if errors.As(err, &azErr) && azErr.StatusCode == 404 { @@ -321,7 +322,7 @@ func deleteCluster(ctx context.Context, clusterName, resourceGroupName string) e return fmt.Errorf("failed to retrieve cluster while trying to delete it %q: %w", clusterName, err) } - pollerResp, err := config.Azure.AKS.BeginDelete(ctx, resourceGroupName, clusterName, nil) + pollerResp, err := infra.Azure.AKS.BeginDelete(ctx, resourceGroupName, clusterName, nil) if err != nil { return fmt.Errorf("failed to delete cluster %q: %w", clusterName, err) } @@ -332,9 +333,9 @@ func deleteCluster(ctx context.Context, clusterName, resourceGroupName string) e return nil } -func waitForClusterDeletion(ctx context.Context, clusterName, resourceGroupName string) error { +func waitForClusterDeletion(ctx context.Context, infra *ClusterInfra, clusterName, resourceGroupName string) error { return wait.PollUntilContextCancel(ctx, 5*time.Second, true, func(ctx context.Context) (bool, error) { - _, err := config.Azure.AKS.Get(ctx, resourceGroupName, clusterName, nil) + _, err := infra.Azure.AKS.Get(ctx, resourceGroupName, clusterName, nil) if err != nil { var azErr *azcore.ResponseError if errors.As(err, &azErr) && azErr.StatusCode == 404 { @@ -346,11 +347,11 @@ func waitForClusterDeletion(ctx context.Context, clusterName, resourceGroupName }) } -func waitUntilClusterReady(ctx context.Context, name, location string) (*armcontainerservice.ManagedCluster, error) { +func waitUntilClusterReady(ctx context.Context, infra *ClusterInfra, name, resourceGroupName string) (*armcontainerservice.ManagedCluster, error) { var cluster armcontainerservice.ManagedClustersClientGetResponse err := wait.PollUntilContextCancel(ctx, time.Second, true, func(ctx context.Context) (bool, error) { var err error - cluster, err = 
config.Azure.AKS.Get(ctx, config.ResourceGroupName(location), name, nil) + cluster, err = infra.Azure.AKS.Get(ctx, resourceGroupName, name, nil) if err != nil { return false, err } @@ -369,8 +370,8 @@ func waitUntilClusterReady(ctx context.Context, name, location string) (*armcont return &cluster.ManagedCluster, nil } -func isExistingResourceGroup(ctx context.Context, resourceGroupName string) (bool, error) { - rgExistence, err := config.Azure.ResourceGroup.CheckExistence(ctx, resourceGroupName, nil) +func isExistingResourceGroup(ctx context.Context, infra *ClusterInfra, resourceGroupName string) (bool, error) { + rgExistence, err := infra.Azure.ResourceGroup.CheckExistence(ctx, resourceGroupName, nil) if err != nil { return false, fmt.Errorf("failed to get RG %q: %w", resourceGroupName, err) } @@ -378,11 +379,11 @@ func isExistingResourceGroup(ctx context.Context, resourceGroupName string) (boo return rgExistence.Success, nil } -func createNewAKSCluster(ctx context.Context, cluster *armcontainerservice.ManagedCluster) (*armcontainerservice.ManagedCluster, error) { +func createNewAKSCluster(ctx context.Context, infra *ClusterInfra, rgName string, cluster *armcontainerservice.ManagedCluster) (*armcontainerservice.ManagedCluster, error) { // Note, it seems like the operation still can start a trigger a new operation even if nothing has changes - pollerResp, err := config.Azure.AKS.BeginCreateOrUpdate( + pollerResp, err := infra.Azure.AKS.BeginCreateOrUpdate( ctx, - config.ResourceGroupName(*cluster.Location), + rgName, *cluster.Name, *cluster, nil, @@ -403,16 +404,16 @@ func createNewAKSCluster(ctx context.Context, cluster *armcontainerservice.Manag // that retries creating a cluster if it fails with a 409 Conflict error // clusters are reused, and sometimes a cluster can be in UPDATING or DELETING state // simple retry should be sufficient to avoid such conflicts -func createNewAKSClusterWithRetry(ctx context.Context, cluster *armcontainerservice.ManagedCluster) 
(*armcontainerservice.ManagedCluster, error) { +func createNewAKSClusterWithRetry(ctx context.Context, infra *ClusterInfra, rgName string, cluster *armcontainerservice.ManagedCluster) (*armcontainerservice.ManagedCluster, error) { maxRetries := 10 retryInterval := 30 * time.Second var lastErr error for attempt := 0; attempt < maxRetries; attempt++ { if attempt > 0 { - toolkit.Logf(ctx, "Attempt %d: creating or updating cluster %s in region %s and rg %s", attempt+1, *cluster.Name, *cluster.Location, config.ResourceGroupName(*cluster.Location)) + toolkit.Logf(ctx, "Attempt %d: creating or updating cluster %s in region %s and rg %s", attempt+1, *cluster.Name, *cluster.Location, rgName) } - createdCluster, err := createNewAKSCluster(ctx, cluster) + createdCluster, err := createNewAKSCluster(ctx, infra, rgName, cluster) if err == nil { return createdCluster, nil } @@ -465,7 +466,8 @@ func ensureMaintenanceConfiguration(ctx context.Context, cluster *armcontainerse } func createNewMaintenanceConfiguration(ctx context.Context, cluster *armcontainerservice.ManagedCluster) (*armcontainerservice.MaintenanceConfiguration, error) { - toolkit.Logf(ctx, "creating maintenance configuration for cluster %s in rg %s", *cluster.Name, config.ResourceGroupName(*cluster.Location)) + rgName := config.ResourceGroupName(*cluster.Location) + toolkit.Logf(ctx, "creating maintenance configuration for cluster %s in rg %s", *cluster.Name, rgName) maintenance := armcontainerservice.MaintenanceConfiguration{ Properties: &armcontainerservice.MaintenanceConfigurationProperties{ MaintenanceWindow: &armcontainerservice.MaintenanceWindow{ @@ -487,7 +489,7 @@ func createNewMaintenanceConfiguration(ctx context.Context, cluster *armcontaine }, } - _, err := config.Azure.Maintenance.CreateOrUpdate(ctx, config.ResourceGroupName(*cluster.Location), *cluster.Name, "default", maintenance, nil) + _, err := config.Azure.Maintenance.CreateOrUpdate(ctx, rgName, *cluster.Name, "default", maintenance, nil) if err != 
nil { return nil, fmt.Errorf("failed to create maintenance configuration: %w", err) } @@ -495,23 +497,23 @@ func createNewMaintenanceConfiguration(ctx context.Context, cluster *armcontaine return &maintenance, nil } -func getOrCreateBastion(ctx context.Context, cluster *armcontainerservice.ManagedCluster) (*Bastion, error) { +func getOrCreateBastion(ctx context.Context, infra *ClusterInfra, cluster *armcontainerservice.ManagedCluster) (*Bastion, error) { nodeRG := *cluster.Properties.NodeResourceGroup bastionName := fmt.Sprintf("%s-bastion", *cluster.Name) - existing, err := config.Azure.BastionHosts.Get(ctx, nodeRG, bastionName, nil) + existing, err := infra.Azure.BastionHosts.Get(ctx, nodeRG, bastionName, nil) var azErr *azcore.ResponseError if errors.As(err, &azErr) && azErr.StatusCode == http.StatusNotFound { - return createNewBastion(ctx, cluster) + return createNewBastion(ctx, infra, cluster) } if err != nil { return nil, fmt.Errorf("failed to get bastion %q in rg %q: %w", bastionName, nodeRG, err) } - return NewBastion(config.Azure.Credential, config.Config.SubscriptionID, nodeRG, *existing.BastionHost.Properties.DNSName), nil + return NewBastion(infra.Azure.Credential, infra.SubscriptionID, nodeRG, *existing.BastionHost.Properties.DNSName), nil } -func createNewBastion(ctx context.Context, cluster *armcontainerservice.ManagedCluster) (*Bastion, error) { +func createNewBastion(ctx context.Context, infra *ClusterInfra, cluster *armcontainerservice.ManagedCluster) (*Bastion, error) { nodeRG := *cluster.Properties.NodeResourceGroup location := *cluster.Location bastionName := fmt.Sprintf("%s-bastion", *cluster.Name) @@ -519,7 +521,7 @@ func createNewBastion(ctx context.Context, cluster *armcontainerservice.ManagedC publicIPName := fmt.Sprintf("%s-bastion-pip", *cluster.Name) publicIPName = sanitizeAzureResourceName(publicIPName) - vnet, err := getClusterVNet(ctx, nodeRG) + vnet, err := getClusterVNet(ctx, infra, nodeRG) if err != nil { return nil, 
fmt.Errorf("get cluster vnet in rg %q: %w", nodeRG, err) } @@ -533,7 +535,7 @@ func createNewBastion(ctx context.Context, cluster *armcontainerservice.ManagedC } var bastionSubnetID string - bastionSubnet, subnetGetErr := config.Azure.Subnet.Get(ctx, nodeRG, vnet.name, bastionSubnetName, nil) + bastionSubnet, subnetGetErr := infra.Azure.Subnet.Get(ctx, nodeRG, vnet.name, bastionSubnetName, nil) if subnetGetErr != nil { var subnetAzErr *azcore.ResponseError if !errors.As(subnetGetErr, &subnetAzErr) || subnetAzErr.StatusCode != http.StatusNotFound { @@ -546,7 +548,7 @@ func createNewBastion(ctx context.Context, cluster *armcontainerservice.ManagedC AddressPrefix: to.Ptr(bastionSubnetPrefix), }, } - subnetPoller, err := config.Azure.Subnet.BeginCreateOrUpdate(ctx, nodeRG, vnet.name, bastionSubnetName, subnetParams, nil) + subnetPoller, err := infra.Azure.Subnet.BeginCreateOrUpdate(ctx, nodeRG, vnet.name, bastionSubnetName, subnetParams, nil) if err != nil { return nil, fmt.Errorf("failed to start creating bastion subnet: %w", err) } @@ -571,7 +573,7 @@ func createNewBastion(ctx context.Context, cluster *armcontainerservice.ManagedC } toolkit.Logf(ctx, "creating bastion public IP %s (rg %s)", publicIPName, nodeRG) - pipPoller, err := config.Azure.PublicIPAddresses.BeginCreateOrUpdate(ctx, nodeRG, publicIPName, pipParams, nil) + pipPoller, err := infra.Azure.PublicIPAddresses.BeginCreateOrUpdate(ctx, nodeRG, publicIPName, pipParams, nil) if err != nil { return nil, fmt.Errorf("failed to start creating bastion public IP: %w", err) } @@ -608,7 +610,7 @@ func createNewBastion(ctx context.Context, cluster *armcontainerservice.ManagedC } toolkit.Logf(ctx, "creating bastion %s (native client/tunneling enabled) in rg %s", bastionName, nodeRG) - bastionPoller, err := config.Azure.BastionHosts.BeginCreateOrUpdate(ctx, nodeRG, bastionName, bastionHost, nil) + bastionPoller, err := infra.Azure.BastionHosts.BeginCreateOrUpdate(ctx, nodeRG, bastionName, bastionHost, nil) if err != 
nil { return nil, fmt.Errorf("failed to start creating bastion: %w", err) } @@ -617,23 +619,23 @@ func createNewBastion(ctx context.Context, cluster *armcontainerservice.ManagedC return nil, fmt.Errorf("failed to create bastion: %w", err) } - bastion := NewBastion(config.Azure.Credential, config.Config.SubscriptionID, nodeRG, *resp.BastionHost.Properties.DNSName) + bastion := NewBastion(infra.Azure.Credential, infra.SubscriptionID, nodeRG, *resp.BastionHost.Properties.DNSName) - if err := verifyBastion(ctx, cluster, bastion); err != nil { + if err := verifyBastion(ctx, infra, cluster, bastion); err != nil { return nil, fmt.Errorf("failed to verify bastion: %w", err) } return bastion, nil } -func verifyBastion(ctx context.Context, cluster *armcontainerservice.ManagedCluster, bastion *Bastion) error { +func verifyBastion(ctx context.Context, infra *ClusterInfra, cluster *armcontainerservice.ManagedCluster, bastion *Bastion) error { nodeRG := *cluster.Properties.NodeResourceGroup - vmssName, err := getSystemPoolVMSSName(ctx, cluster) + vmssName, err := getSystemPoolVMSSName(ctx, infra, cluster) if err != nil { return err } var vmssVM *armcompute.VirtualMachineScaleSetVM - pager := config.Azure.VMSSVM.NewListPager(nodeRG, vmssName, nil) + pager := infra.Azure.VMSSVM.NewListPager(nodeRG, vmssName, nil) if pager.More() { page, err := pager.NextPage(ctx) if err != nil { @@ -644,7 +646,7 @@ func verifyBastion(ctx context.Context, cluster *armcontainerservice.ManagedClus } } - vmPrivateIP, err := getPrivateIPFromVMSSVM(ctx, nodeRG, vmssName, *vmssVM.InstanceID) + vmPrivateIP, err := getPrivateIPFromVMSSVMWithClient(ctx, infra.Azure, nodeRG, vmssName, *vmssVM.InstanceID) ctx, cancel := context.WithCancel(ctx) defer cancel() @@ -666,7 +668,7 @@ func verifyBastion(ctx context.Context, cluster *armcontainerservice.ManagedClus return fmt.Errorf("Executed ssh on wrong VM, Expected %s: %s", vmssName, result.stdout) } -func getSystemPoolVMSSName(ctx context.Context, cluster 
*armcontainerservice.ManagedCluster) (string, error) { +func getSystemPoolVMSSName(ctx context.Context, infra *ClusterInfra, cluster *armcontainerservice.ManagedCluster) (string, error) { nodeRG := *cluster.Properties.NodeResourceGroup var systemPoolName string for _, pool := range cluster.Properties.AgentPoolProfiles { @@ -674,7 +676,7 @@ func getSystemPoolVMSSName(ctx context.Context, cluster *armcontainerservice.Man systemPoolName = *pool.Name } } - pager := config.Azure.VMSS.NewListPager(nodeRG, nil) + pager := infra.Azure.VMSS.NewListPager(nodeRG, nil) if pager.More() { page, err := pager.NextPage(ctx) if err != nil { @@ -706,8 +708,8 @@ type VNet struct { subnetId string } -func getClusterVNet(ctx context.Context, mcResourceGroupName string) (VNet, error) { - pager := config.Azure.VNet.NewListPager(mcResourceGroupName, nil) +func getClusterVNet(ctx context.Context, infra *ClusterInfra, mcResourceGroupName string) (VNet, error) { + pager := infra.Azure.VNet.NewListPager(mcResourceGroupName, nil) for pager.More() { nextResult, err := pager.NextPage(ctx) if err != nil { @@ -723,13 +725,13 @@ func getClusterVNet(ctx context.Context, mcResourceGroupName string) (VNet, erro return VNet{}, fmt.Errorf("failed to find aks vnet") } -func collectGarbageVMSS(ctx context.Context, cluster *armcontainerservice.ManagedCluster, kube *Kubeclient) error { +func collectGarbageVMSS(ctx context.Context, infra *ClusterInfra, cluster *armcontainerservice.ManagedCluster, kube *Kubeclient) error { defer toolkit.LogStepCtx(ctx, "collecting garbage VMSS")() rg := *cluster.Properties.NodeResourceGroup // Build a set of all existing VMSS names while deleting old ones. 
existingVMSS := map[string]struct{}{} - pager := config.Azure.VMSS.NewListPager(rg, nil) + pager := infra.Azure.VMSS.NewListPager(rg, nil) for pager.More() { page, err := pager.NextPage(ctx) if err != nil { @@ -752,7 +754,7 @@ func collectGarbageVMSS(ctx context.Context, cluster *armcontainerservice.Manage continue } - _, err := config.Azure.VMSS.BeginDelete(ctx, rg, *vmss.Name, &armcompute.VirtualMachineScaleSetsClientBeginDeleteOptions{ + _, err := infra.Azure.VMSS.BeginDelete(ctx, rg, *vmss.Name, &armcompute.VirtualMachineScaleSetsClientBeginDeleteOptions{ ForceDeletion: to.Ptr(true), }) if err != nil { @@ -812,8 +814,12 @@ func collectGarbageNodes(ctx context.Context, kube *Kubeclient, existingVMSS map } func ensureResourceGroup(ctx context.Context, location string) (armresources.ResourceGroup, error) { - resourceGroupName := config.ResourceGroupName(location) - rg, err := config.Azure.ResourceGroup.CreateOrUpdate( + return ensureResourceGroupWithInfra(ctx, DefaultClusterInfra, location) +} + +func ensureResourceGroupWithInfra(ctx context.Context, infra *ClusterInfra, location string) (armresources.ResourceGroup, error) { + resourceGroupName := infra.ResourceGroupName(location) + rg, err := infra.Azure.ResourceGroup.CreateOrUpdate( ctx, resourceGroupName, armresources.ResourceGroup{ diff --git a/e2e/config/azure.go b/e2e/config/azure.go index d0de6f04619..847db25a269 100644 --- a/e2e/config/azure.go +++ b/e2e/config/azure.go @@ -117,6 +117,10 @@ func NewHttpClient() *http.Client { } func NewAzureClient() (*AzureClient, error) { + return NewAzureClientForSubscription(Config.SubscriptionID) +} + +func NewAzureClientForSubscription(subscriptionID string) (*AzureClient, error) { httpClient := NewHttpClient() logger := runtime.NewLogPolicy(&policy.LogOptions{ IncludeBody: true, @@ -155,193 +159,183 @@ func NewAzureClient() (*AzureClient, error) { return nil, fmt.Errorf("create core client: %w", err) } - cloud.PublicIPAddresses, err = 
armnetwork.NewPublicIPAddressesClient(Config.SubscriptionID, credential, opts) + cloud.PublicIPAddresses, err = armnetwork.NewPublicIPAddressesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create public ip addresses client: %w", err) } - cloud.BastionHosts, err = armnetwork.NewBastionHostsClient(Config.SubscriptionID, credential, opts) - if err != nil { - return nil, fmt.Errorf("create bastion hosts client: %w", err) - } - - cloud.BastionHosts, err = armnetwork.NewBastionHostsClient(Config.SubscriptionID, credential, opts) + cloud.BastionHosts, err = armnetwork.NewBastionHostsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create bastion hosts client: %w", err) } - cloud.RegistriesClient, err = armcontainerregistry.NewRegistriesClient(Config.SubscriptionID, credential, opts) + cloud.RegistriesClient, err = armcontainerregistry.NewRegistriesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("failed to create registry client: %w", err) } - cloud.CacheRulesClient, err = armcontainerregistry.NewCacheRulesClient(Config.SubscriptionID, credential, opts) + cloud.CacheRulesClient, err = armcontainerregistry.NewCacheRulesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("failed to create cache rules client: %w", err) } - cloud.PrivateEndpointClient, err = armnetwork.NewPrivateEndpointsClient(Config.SubscriptionID, credential, opts) + cloud.PrivateEndpointClient, err = armnetwork.NewPrivateEndpointsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("failed to create private endpoint client: %w", err) } - cloud.PrivateZonesClient, err = armprivatedns.NewPrivateZonesClient(Config.SubscriptionID, credential, opts) + cloud.PrivateZonesClient, err = armprivatedns.NewPrivateZonesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("failed to create private dns zones client: %w", err) } - 
cloud.VirutalNetworkLinksClient, err = armprivatedns.NewVirtualNetworkLinksClient(Config.SubscriptionID, credential, opts) + cloud.VirutalNetworkLinksClient, err = armprivatedns.NewVirtualNetworkLinksClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("failed to create virtual network links client: %w", err) } - cloud.RecordSetClient, err = armprivatedns.NewRecordSetsClient(Config.SubscriptionID, credential, opts) + cloud.RecordSetClient, err = armprivatedns.NewRecordSetsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("failed to create record set client: %w", err) } - cloud.PrivateDNSZoneGroup, err = armnetwork.NewPrivateDNSZoneGroupsClient(Config.SubscriptionID, credential, opts) + cloud.PrivateDNSZoneGroup, err = armnetwork.NewPrivateDNSZoneGroupsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("failed to create private dns zone group client: %w", err) } - cloud.SecurityGroup, err = armnetwork.NewSecurityGroupsClient(Config.SubscriptionID, credential, opts) + cloud.SecurityGroup, err = armnetwork.NewSecurityGroupsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create security group client: %w", err) } - cloud.Subnet, err = armnetwork.NewSubnetsClient(Config.SubscriptionID, credential, opts) + cloud.Subnet, err = armnetwork.NewSubnetsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create subnet client: %w", err) } - cloud.RouteTables, err = armnetwork.NewRouteTablesClient(Config.SubscriptionID, credential, opts) + cloud.RouteTables, err = armnetwork.NewRouteTablesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create route tables client: %w", err) } - cloud.Routes, err = armnetwork.NewRoutesClient(Config.SubscriptionID, credential, opts) + cloud.Routes, err = armnetwork.NewRoutesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create routes 
client: %w", err) } - cloud.AKS, err = armcontainerservice.NewManagedClustersClient(Config.SubscriptionID, credential, opts) + cloud.AKS, err = armcontainerservice.NewManagedClustersClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create aks client: %w", err) } - cloud.Maintenance, err = armcontainerservice.NewMaintenanceConfigurationsClient(Config.SubscriptionID, credential, opts) + cloud.Maintenance, err = armcontainerservice.NewMaintenanceConfigurationsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("failed to create maintenance client: %w", err) } - cloud.NetworkInterfaces, err = armnetwork.NewInterfacesClient(Config.SubscriptionID, credential, opts) + cloud.NetworkInterfaces, err = armnetwork.NewInterfacesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create network interfaces client: %w", err) } - cloud.VMSS, err = armcompute.NewVirtualMachineScaleSetsClient(Config.SubscriptionID, credential, opts) + cloud.VMSS, err = armcompute.NewVirtualMachineScaleSetsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create vmss client: %w", err) } - cloud.VMSSVM, err = armcompute.NewVirtualMachineScaleSetVMsClient(Config.SubscriptionID, credential, opts) + cloud.VMSSVM, err = armcompute.NewVirtualMachineScaleSetVMsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create vmss vm client: %w", err) } - cloud.VMs, err = armcompute.NewVirtualMachinesClient(Config.SubscriptionID, credential, opts) + cloud.VMs, err = armcompute.NewVirtualMachinesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create vms client: %w", err) } - cloud.Images, err = armcompute.NewImagesClient(Config.SubscriptionID, credential, opts) + cloud.Images, err = armcompute.NewImagesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create images client: %w", err) } - cloud.Snapshots, err = 
armcompute.NewSnapshotsClient(Config.SubscriptionID, credential, opts) + cloud.Snapshots, err = armcompute.NewSnapshotsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create snapshots client: %w", err) } - cloud.GalleryImages, err = armcompute.NewGalleryImagesClient(Config.SubscriptionID, credential, opts) + cloud.GalleryImages, err = armcompute.NewGalleryImagesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create gallery images client: %w", err) } - cloud.GalleryImageVersions, err = armcompute.NewGalleryImageVersionsClient(Config.SubscriptionID, credential, opts) + cloud.GalleryImageVersions, err = armcompute.NewGalleryImageVersionsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create gallery image versions client: %w", err) } - cloud.Resource, err = armresources.NewClient(Config.SubscriptionID, credential, opts) + cloud.Resource, err = armresources.NewClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create resource client: %w", err) } - cloud.ResourceGroup, err = armresources.NewResourceGroupsClient(Config.SubscriptionID, credential, opts) + cloud.ResourceGroup, err = armresources.NewResourceGroupsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create resource group client: %w", err) } - cloud.VNet, err = armnetwork.NewVirtualNetworksClient(Config.SubscriptionID, credential, opts) + cloud.VNet, err = armnetwork.NewVirtualNetworksClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create vnet client: %w", err) } - cloud.AzureFirewall, err = armnetwork.NewAzureFirewallsClient(Config.SubscriptionID, credential, opts) + cloud.AzureFirewall, err = armnetwork.NewAzureFirewallsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create firewall client: %w", err) } - cloud.PublicIPAddresses, err = 
armnetwork.NewPublicIPAddressesClient(Config.SubscriptionID, credential, opts) - if err != nil { - return nil, fmt.Errorf("create public ip addresses client: %w", err) - } - cloud.Blob, err = azblob.NewClient(Config.BlobStorageAccountURL(), credential, nil) if err != nil { return nil, fmt.Errorf("create blob container client: %w", err) } - cloud.StorageContainers, err = armstorage.NewBlobContainersClient(Config.SubscriptionID, credential, opts) + cloud.StorageContainers, err = armstorage.NewBlobContainersClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create blob container client: %w", err) } - cloud.RoleAssignments, err = armauthorization.NewRoleAssignmentsClient(Config.SubscriptionID, credential, opts) + cloud.RoleAssignments, err = armauthorization.NewRoleAssignmentsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create role assignment client: %w", err) } - cloud.UserAssignedIdentities, err = armmsi.NewUserAssignedIdentitiesClient(Config.SubscriptionID, credential, nil) + cloud.UserAssignedIdentities, err = armmsi.NewUserAssignedIdentitiesClient(subscriptionID, credential, nil) if err != nil { return nil, fmt.Errorf("create user assigned identities client: %w", err) } - cloud.StorageAccounts, err = armstorage.NewAccountsClient(Config.SubscriptionID, credential, nil) + cloud.StorageAccounts, err = armstorage.NewAccountsClient(subscriptionID, credential, nil) if err != nil { return nil, fmt.Errorf("create storage accounts client: %w", err) } - cloud.VMSSVMRunCommands, err = armcompute.NewVirtualMachineScaleSetVMRunCommandsClient(Config.SubscriptionID, credential, opts) + cloud.VMSSVMRunCommands, err = armcompute.NewVirtualMachineScaleSetVMRunCommandsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create vmss vm run command client: %w", err) } - cloud.VMExtensionImages, err = armcompute.NewVirtualMachineExtensionImagesClient(Config.SubscriptionID, credential, opts) + 
cloud.VMExtensionImages, err = armcompute.NewVirtualMachineExtensionImagesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create vm extension images client: %w", err) } - cloud.ResourceSKUs, err = armcompute.NewResourceSKUsClient(Config.SubscriptionID, credential, opts) + cloud.ResourceSKUs, err = armcompute.NewResourceSKUsClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("create resource skus client: %w", err) } // Ensure the gallery exists - cloud.Galleries, err = armcompute.NewGalleriesClient(Config.SubscriptionID, credential, opts) + cloud.Galleries, err = armcompute.NewGalleriesClient(subscriptionID, credential, opts) if err != nil { return nil, fmt.Errorf("failed to create galleries client: %w", err) } @@ -419,6 +413,18 @@ func (a *AzureClient) CreateVMManagedIdentity(ctx context.Context, identityLocat return *identity.Properties.ClientID, nil } +// CreateVMManagedIdentityInRG creates a VM managed identity in the specified resource group +// without creating blob storage infrastructure (which belongs to the default subscription). 
+func (a *AzureClient) CreateVMManagedIdentityInRG(ctx context.Context, resourceGroupName, location string) (string, error) { + identity, err := a.UserAssignedIdentities.CreateOrUpdate(ctx, resourceGroupName, VMIdentityName, armmsi.Identity{ + Location: to.Ptr(location), + }, nil) + if err != nil { + return "", fmt.Errorf("create managed identity in RG %s: %w", resourceGroupName, err) + } + return *identity.Properties.ClientID, nil +} + func (a *AzureClient) createBlobStorageAccount(ctx context.Context) error { poller, err := a.StorageAccounts.BeginCreate(ctx, ResourceGroupName(Config.DefaultLocation), Config.BlobStorageAccount(), armstorage.AccountCreateParameters{ Kind: to.Ptr(armstorage.KindStorageV2), diff --git a/e2e/config/config.go b/e2e/config/config.go index d61db484c6e..bd3f9c677c2 100644 --- a/e2e/config/config.go +++ b/e2e/config/config.go @@ -29,6 +29,10 @@ var ( Azure = mustNewAzureClient() VMIdentityName = "abe2e-vm-identity" + // RCV1PAzure is lazily initialized when RCV1PSubscriptionID is set. + // It provides Azure clients bound to the PlatformSettingsOverride-registered subscription. 
+ RCV1PAzure *AzureClient + DefaultPollUntilDoneOptions = &runtime.PollUntilDoneOptions{ Frequency: time.Second, } @@ -40,6 +44,14 @@ func ResourceGroupName(location string) string { return "abe2e-" + location } +func RCV1PResourceGroupName(location string) string { + return "abe2e-rcv1p-" + location +} + +func (c *Configuration) RCV1PVMIdentityResourceID(location string) string { + return fmt.Sprintf("/subscriptions/%s/resourceGroups/%s/providers/Microsoft.ManagedIdentity/userAssignedIdentities/%s", c.RCV1PSubscriptionID, RCV1PResourceGroupName(location), VMIdentityName) +} + func PrivateACRNameNotAnon(location string) string { return "abe2eprivatenonanon" + location // will have anonymous pull enabled } @@ -88,6 +100,7 @@ type Configuration struct { TestTimeoutCluster time.Duration `env:"TEST_TIMEOUT_CLUSTER" envDefault:"20m"` TestTimeoutVMSS time.Duration `env:"TEST_TIMEOUT_VMSS" envDefault:"17m"` WindowsAdminPassword string `env:"WINDOWS_ADMIN_PASSWORD"` + RCV1PSubscriptionID string `env:"RCV1P_SUBSCRIPTION_ID"` } func (c *Configuration) BlobStorageAccount() string { @@ -169,6 +182,16 @@ func mustLoadConfig() *Configuration { return cfg } +func init() { + if Config.RCV1PSubscriptionID != "" && !strings.HasPrefix(Config.RCV1PSubscriptionID, "$(") { + client, err := NewAzureClientForSubscription(Config.RCV1PSubscriptionID) + if err != nil { + panic(fmt.Sprintf("failed to create RCV1P Azure client: %v", err)) + } + RCV1PAzure = client + } +} + // Returns a newly generated RSA public/private key pair with the private key in PEM format. 
func mustGetNewRSAKeyPair() ([]byte, []byte, string) { // Generate new key pair diff --git a/e2e/kube.go b/e2e/kube.go index b5f1fe18580..689b9032e95 100644 --- a/e2e/kube.go +++ b/e2e/kube.go @@ -41,10 +41,10 @@ const ( proxyPort = 8888 ) -func getClusterKubeClient(ctx context.Context, cluster *armcontainerservice.ManagedCluster) (*Kubeclient, error) { - resourceGroupName := config.ResourceGroupName(*cluster.Location) +func getClusterKubeClient(ctx context.Context, infra *ClusterInfra, cluster *armcontainerservice.ManagedCluster) (*Kubeclient, error) { + resourceGroupName := infra.ResourceGroupName(*cluster.Location) clusterName := *cluster.Name - data, err := getClusterKubeconfigBytes(ctx, resourceGroupName, clusterName) + data, err := getClusterKubeconfigBytes(ctx, infra, resourceGroupName, clusterName) if err != nil { return nil, fmt.Errorf("get cluster kubeconfig bytes: %w", err) } @@ -278,8 +278,8 @@ func logPodDebugInfo(ctx context.Context, kube *Kubeclient, pod *corev1.Pod) { toolkit.Log(ctx, string(info)) } -func getClusterKubeconfigBytes(ctx context.Context, resourceGroupName, clusterName string) ([]byte, error) { - credentialList, err := config.Azure.AKS.ListClusterAdminCredentials(ctx, resourceGroupName, clusterName, nil) +func getClusterKubeconfigBytes(ctx context.Context, infra *ClusterInfra, resourceGroupName, clusterName string) ([]byte, error) { + credentialList, err := infra.Azure.AKS.ListClusterAdminCredentials(ctx, resourceGroupName, clusterName, nil) if err != nil { return nil, fmt.Errorf("list cluster admin credentials: %w", err) } @@ -634,9 +634,9 @@ func (k *Kubeclient) GetProxyURL(ctx context.Context) (string, error) { return proxyURL, nil } -func getClusterSubnetID(ctx context.Context, cluster *armcontainerservice.ManagedCluster) (string, error) { +func getClusterSubnetID(ctx context.Context, infra *ClusterInfra, cluster *armcontainerservice.ManagedCluster) (string, error) { mcResourceGroupName := *cluster.Properties.NodeResourceGroup - 
pager := config.Azure.VNet.NewListPager(mcResourceGroupName, nil) + pager := infra.Azure.VNet.NewListPager(mcResourceGroupName, nil) for pager.More() { nextResult, err := pager.NextPage(ctx) if err != nil { diff --git a/e2e/scenario_rcv1p_test.go b/e2e/scenario_rcv1p_test.go new file mode 100644 index 00000000000..817b63cba8e --- /dev/null +++ b/e2e/scenario_rcv1p_test.go @@ -0,0 +1,210 @@ +// scenario_rcv1p_test.go contains end-to-end tests for the RCV1P (Root Certificate V1P) cert mode +// on Linux distros. RCV1P is the next-generation mechanism for distributing Azure root CA certificates +// to AKS nodes. Instead of relying on hardcoded certificate bundles, RCV1P queries the Azure wireserver +// at provisioning time to download the latest root certificates and installs them into the OS trust store. +// +// These tests require: +// - A dedicated subscription (RCV1P_SUBSCRIPTION_ID) with the Microsoft.Compute/PlatformSettingsOverride +// feature flag registered, which enables the wireserver certificate endpoint. +// - The VM opt-in tag "platformsettings.host_environment.service.platform_optedin_for_rootcerts=true" +// on each VMSS, which tells wireserver to serve certificates to this specific VM. +// +// Both conditions must be met: the subscription feature enables the endpoint, and the VM tag grants +// per-VM access. Without the tag, wireserver returns IsOptedInForRootCerts=false. +// +// The positive tests (Test_RCV1P_) verify that certificates are downloaded, installed into +// the distro-specific trust store, and a refresh schedule is created. The negative test +// (Test_RCV1P_NotOptedIn) verifies that omitting the VM tag correctly prevents cert installation. 
+package e2e + +import ( + "context" + "strings" + "testing" + + "github.com/Azure/agentbaker/e2e/config" + "github.com/Azure/agentbaker/pkg/agent/datamodel" + "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7" +) + +// rcv1pOptInTag is the ARM tag that must be set on the VM resource for wireserver to serve +// root certificates. Without this tag, wireserver returns IsOptedInForRootCerts=false even +// if the subscription has the PlatformSettingsOverride feature registered. +const rcv1pOptInTag = "platformsettings.host_environment.service.platform_optedin_for_rootcerts" + +// skipIfRCV1PNotConfigured skips the test when the RCV1P subscription is not configured. +// This happens in regular CI runs where the RCV1P variable group is not linked, causing +// Azure DevOps to pass the literal unexpanded string "$(RCV1P_SUBSCRIPTION_ID)". +func skipIfRCV1PNotConfigured(t *testing.T) { + t.Helper() + subID := config.Config.RCV1PSubscriptionID + if subID == "" || strings.HasPrefix(subID, "$(") { + t.Skip("RCV1P_SUBSCRIPTION_ID not set or not resolved, skipping RCV1P cert mode test") + } +} + +// rcv1pOptInVMConfigMutator sets the platform opt-in tag on the VMSS so that wireserver +// will serve root certificates to this VM during provisioning. +func rcv1pOptInVMConfigMutator(vmss *armcompute.VirtualMachineScaleSet) { + if vmss.Tags == nil { + vmss.Tags = map[string]*string{} + } + vmss.Tags[rcv1pOptInTag] = to.Ptr("true") +} + +// Test_RCV1P_Ubuntu2204 validates RCV1P cert download and trust store installation on Ubuntu 22.04. +// Ubuntu uses /usr/local/share/ca-certificates/ as the cert drop folder and update-ca-certificates +// to rebuild the trust bundle. 
+func Test_RCV1P_Ubuntu2204(t *testing.T) { + skipIfRCV1PNotConfigured(t) + RunScenario(t, &Scenario{ + Description: "Tests RCV1P cert mode on Ubuntu 22.04 with VM opt-in tag", + AzureClient: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + Tags: Tags{ + RCV1PCertMode: true, + }, + Config: Config{ + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDUbuntu2204Gen2Containerd, + VMConfigMutator: rcv1pOptInVMConfigMutator, + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + }, + Validator: func(ctx context.Context, s *Scenario) { + ValidateRCV1PCertMode(ctx, s) + }, + }, + }) +} + +// Test_RCV1P_Ubuntu2404 validates RCV1P cert download and trust store installation on Ubuntu 24.04. +// Covers the newer Ubuntu LTS release to ensure the cert endpoint and trust store integration +// work correctly across Ubuntu versions. +func Test_RCV1P_Ubuntu2404(t *testing.T) { + skipIfRCV1PNotConfigured(t) + RunScenario(t, &Scenario{ + Description: "Tests RCV1P cert mode on Ubuntu 24.04 with VM opt-in tag", + AzureClient: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + Tags: Tags{ + RCV1PCertMode: true, + }, + Config: Config{ + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDUbuntu2404Gen2Containerd, + VMConfigMutator: rcv1pOptInVMConfigMutator, + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + }, + Validator: func(ctx context.Context, s *Scenario) { + ValidateRCV1PCertMode(ctx, s) + }, + }, + }) +} + +// Test_RCV1P_AzureLinuxV3 validates RCV1P on Azure Linux V3, which uses a different trust store +// layout (/etc/pki/ca-trust/source/anchors/) and update command (update-ca-trust) than Ubuntu. +// This ensures the provisioning script correctly detects the distro and uses the right paths. 
+func Test_RCV1P_AzureLinuxV3(t *testing.T) { + skipIfRCV1PNotConfigured(t) + RunScenario(t, &Scenario{ + Description: "Tests RCV1P cert mode on Azure Linux V3 with VM opt-in tag", + AzureClient: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + Tags: Tags{ + RCV1PCertMode: true, + }, + Config: Config{ + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDAzureLinuxV3Gen2, + VMConfigMutator: rcv1pOptInVMConfigMutator, + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + }, + Validator: func(ctx context.Context, s *Scenario) { + ValidateRCV1PCertMode(ctx, s) + }, + }, + }) +} + +// Test_RCV1P_Flatcar validates RCV1P on Flatcar Container Linux, which has a read-only root +// filesystem and requires certificates to be placed in /etc/ssl/certs/ as .pem files. +// This is the most constrained environment for cert installation. +func Test_RCV1P_Flatcar(t *testing.T) { + skipIfRCV1PNotConfigured(t) + RunScenario(t, &Scenario{ + Description: "Tests RCV1P cert mode on Flatcar with VM opt-in tag", + AzureClient: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + Tags: Tags{ + RCV1PCertMode: true, + }, + Config: Config{ + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDFlatcarGen2, + VMConfigMutator: rcv1pOptInVMConfigMutator, + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + }, + Validator: func(ctx context.Context, s *Scenario) { + ValidateRCV1PCertMode(ctx, s) + }, + }, + }) +} + +// Test_RCV1P_ACL validates RCV1P on Azure Container Linux (ACL), which shares the same +// trust store layout as Azure Linux (/etc/pki/ca-trust/). ACL requires Trusted Launch, +// so the VMConfigMutator combines both the TrustedLaunch and opt-in tag settings. 
+func Test_RCV1P_ACL(t *testing.T) {
+	skipIfRCV1PNotConfigured(t)
+	// The VMSS mutator first applies Trusted Launch and then the RCV1P opt-in mutator.
+	// The node bootstrapping config is left untouched, using the same shared helper
+	// as the Windows RCV1P scenarios.
+	RunScenario(t, &Scenario{
+		Description:    "Tests RCV1P cert mode on ACL with VM opt-in tag",
+		AzureClient:    config.RCV1PAzure,
+		SubscriptionID: config.Config.RCV1PSubscriptionID,
+		Tags: Tags{
+			RCV1PCertMode: true,
+		},
+		Config: Config{
+			Cluster: ClusterRCV1PKubenet,
+			VHD:     config.VHDACLGen2TL,
+			VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
+				vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties)
+				rcv1pOptInVMConfigMutator(vmss)
+			},
+			BootstrapConfigMutator: EmptyBootstrapConfigMutator,
+			Validator:              ValidateRCV1PCertMode,
+		},
+	})
+}
+
+// Test_RCV1P_NotOptedIn is a negative test that validates the VM opt-in tag is required
+// for cert installation. The VM is created in the RCV1P subscription (which has
+// PlatformSettingsOverride registered) but WITHOUT the opt-in tag on the VMSS.
+// This verifies that wireserver returns IsOptedInForRootCerts=false and the provisioning
+// script correctly skips certificate download and trust store installation.
+// This test is critical because it proves the two-layer access control works:
+// subscription feature alone is not sufficient — the VM must also be explicitly tagged.
+func Test_RCV1P_NotOptedIn(t *testing.T) {
+	skipIfRCV1PNotConfigured(t)
+	// Negative scenario: Config.VMConfigMutator is deliberately left unset here, whereas the
+	// opted-in scenarios in this file set it to rcv1pOptInVMConfigMutator. The result is
+	// checked with ValidateRCV1PNotOptedIn instead of ValidateRCV1PCertMode.
+	RunScenario(t, &Scenario{
+		Description:    "Tests RCV1P cert mode without VM opt-in tag; expects no cert installation",
+		AzureClient:    config.RCV1PAzure,
+		SubscriptionID: config.Config.RCV1PSubscriptionID,
+		Tags: Tags{
+			RCV1PCertMode: true,
+		},
+		Config: Config{
+			Cluster: ClusterRCV1PKubenet,
+			VHD:     config.VHDUbuntu2204Gen2Containerd,
+			BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) {
+			},
+			Validator: func(ctx context.Context, s *Scenario) {
+				ValidateRCV1PNotOptedIn(ctx, s)
+			},
+		},
+	})
+}
diff --git a/e2e/scenario_rcv1p_win_test.go b/e2e/scenario_rcv1p_win_test.go
new file mode 100644
index 00000000000..55a35b584bb
--- /dev/null
+++ b/e2e/scenario_rcv1p_win_test.go
@@ -0,0 +1,91 @@
+// scenario_rcv1p_win_test.go contains end-to-end tests for the RCV1P cert mode on Windows.
+// Windows uses a different cert installation path than Linux: certificates are downloaded to
+// C:\ca and imported into the Windows certificate store (Cert:\LocalMachine\Root) via
+// Import-Certificate. A scheduled task (aks-ca-certs-refresh-task) is registered to
+// periodically refresh the certificates.
+//
+// These tests run against the same RCV1P subscription and require the same VM opt-in tag
+// as the Linux tests (see scenario_rcv1p_test.go for details on the two-layer access control).
+package e2e
+
+import (
+	"context"
+	"testing"
+
+	"github.com/Azure/agentbaker/e2e/config"
+	"github.com/Azure/agentbaker/pkg/agent/datamodel"
+	"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7"
+)
+
+// Test_RCV1P_Windows2022 validates RCV1P cert download and Windows certificate store
+// installation on Windows Server 2022.
+func Test_RCV1P_Windows2022(t *testing.T) {
+	skipIfRCV1PNotConfigured(t)
+
+	// Assemble the scenario first so the RunScenario call reads as a single step.
+	scenario := &Scenario{
+		Description:    "Tests RCV1P cert mode on Windows Server 2022 with VM opt-in tag",
+		AzureClient:    config.RCV1PAzure,
+		SubscriptionID: config.Config.RCV1PSubscriptionID,
+		Tags:           Tags{RCV1PCertMode: true},
+		Config: Config{
+			Cluster:                ClusterRCV1PKubenet,
+			VHD:                    config.VHDWindows2022Containerd,
+			VMConfigMutator:        rcv1pOptInVMConfigMutator,
+			BootstrapConfigMutator: EmptyBootstrapConfigMutator,
+			Validator:              ValidateRCV1PCertModeWindows,
+		},
+	}
+	RunScenario(t, scenario)
+}
+
+// Test_RCV1P_Windows23H2 runs the opted-in RCV1P scenario against the Windows Server 23H2 VHD
+// and checks the node with ValidateRCV1PCertModeWindows.
+func Test_RCV1P_Windows23H2(t *testing.T) {
+	skipIfRCV1PNotConfigured(t)
+
+	// Identical to the Windows Server 2022 scenario apart from the description and the VHD.
+	scenario := &Scenario{
+		Description:    "Tests RCV1P cert mode on Windows Server 23H2 with VM opt-in tag",
+		AzureClient:    config.RCV1PAzure,
+		SubscriptionID: config.Config.RCV1PSubscriptionID,
+		Tags:           Tags{RCV1PCertMode: true},
+		Config: Config{
+			Cluster:                ClusterRCV1PKubenet,
+			VHD:                    config.VHDWindows23H2,
+			VMConfigMutator:        rcv1pOptInVMConfigMutator,
+			BootstrapConfigMutator: EmptyBootstrapConfigMutator,
+			Validator:              ValidateRCV1PCertModeWindows,
+		},
+	}
+	RunScenario(t, scenario)
+}
+
+// Test_RCV1P_Windows2025 validates RCV1P on Windows Server 2025. This SKU requires
+// Trusted Launch, so the VMConfigMutator combines both TrustedLaunch and opt-in tag settings.
+func Test_RCV1P_Windows2025(t *testing.T) { + skipIfRCV1PNotConfigured(t) + RunScenario(t, &Scenario{ + Description: "Tests RCV1P cert mode on Windows Server 2025 with VM opt-in tag", + AzureClient: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + Tags: Tags{ + RCV1PCertMode: true, + }, + Config: Config{ + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDWindows2025, + VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { + vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) + rcv1pOptInVMConfigMutator(vmss) + }, + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + Windows2025BootstrapConfigMutator(t, nbc) + }, + Validator: func(ctx context.Context, s *Scenario) { + ValidateRCV1PCertModeWindows(ctx, s) + }, + }, + }) +} diff --git a/e2e/test_helpers.go b/e2e/test_helpers.go index 374bb80598d..c4e7ae1c630 100644 --- a/e2e/test_helpers.go +++ b/e2e/test_helpers.go @@ -211,10 +211,24 @@ func runScenario(t testing.TB, s *Scenario) error { ctx := newTestCtx(t) maybeSkipScenario(ctx, t, s) - _, err := CachedEnsureResourceGroup(ctx, s.Location) - require.NoError(t, err) - _, err = CachedCreateVMManagedIdentity(ctx, s.Location) - require.NoError(t, err) + if s.AzureClient != nil { + // RCV1P scenario: ensure RG and identity in the RCV1P subscription + _, err := CachedRCV1PEnsureResourceGroup(ctx, s.Location) + require.NoError(t, err) + _, err = CachedRCV1PCreateVMManagedIdentity(ctx, s.Location) + require.NoError(t, err) + // Also ensure default subscription infra (RG + identity + blob storage) is provisioned, + // since Windows log extraction on failure uploads to the default subscription's blob storage. 
+ _, err = CachedEnsureResourceGroup(ctx, s.Location) + require.NoError(t, err) + _, err = CachedCreateVMManagedIdentity(ctx, s.Location) + require.NoError(t, err) + } else { + _, err := CachedEnsureResourceGroup(ctx, s.Location) + require.NoError(t, err) + _, err = CachedCreateVMManagedIdentity(ctx, s.Location) + require.NoError(t, err) + } s.T = t ctrruntimelog.SetLogger(zap.New()) @@ -261,6 +275,11 @@ func prepareAKSNode(ctx context.Context, s *Scenario) (*ScenarioVM, error) { nbc, err := getBaseNBC(s.T, s.Runtime.Cluster, s.VHD) require.NoError(s.T, err) + // Override subscription ID for RCV1P scenarios + if s.SubscriptionID != "" { + nbc.SubscriptionID = s.SubscriptionID + } + nbc.EnableScriptlessCSECmd = true if s.Runtime != nil && s.Runtime.EnableScriptlessNBCCSECmd { nbc.EnableScriptlessNBCCSECmd = true @@ -610,7 +629,7 @@ func RunCommand(ctx context.Context, s *Scenario, command string) (armcompute.Ru toolkit.Logf(ctx, "Command %q took %s", command, elapsed) }() - runPoller, err := config.Azure.VMSSVM.BeginRunCommand(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *s.Runtime.VM.VM.InstanceID, armcompute.RunCommandInput{ + runPoller, err := s.GetAzure().VMSSVM.BeginRunCommand(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *s.Runtime.VM.VM.InstanceID, armcompute.RunCommandInput{ CommandID: func() *string { if s.IsWindows() { return to.Ptr("RunPowerShellScript") @@ -639,11 +658,11 @@ func CreateImage(ctx context.Context, s *Scenario) *config.Image { require.NoErrorf(s.T, err, "failed to run sysprep on Windows VM for image creation") } - vm, err := config.Azure.VMSSVM.Get(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *s.Runtime.VM.VM.InstanceID, &armcompute.VirtualMachineScaleSetVMsClientGetOptions{}) + vm, err := s.GetAzure().VMSSVM.Get(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *s.Runtime.VM.VM.InstanceID, 
&armcompute.VirtualMachineScaleSetVMsClientGetOptions{}) require.NoError(s.T, err, "Failed to get VMSS VM for image creation") s.T.Log("Deallocating VMSS VM...") - poll, err := config.Azure.VMSSVM.BeginDeallocate(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *s.Runtime.VM.VM.InstanceID, nil) + poll, err := s.GetAzure().VMSSVM.BeginDeallocate(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *s.Runtime.VM.VM.InstanceID, nil) require.NoError(s.T, err, "Failed to begin deallocate") _, err = poll.PollUntilDone(ctx, nil) require.NoError(s.T, err, "Failed to deallocate") @@ -690,7 +709,7 @@ func CreateSIGImageVersionFromDisk(ctx context.Context, s *Scenario, version str // Create the image version directly from the disk s.T.Logf("Creating gallery image version: %s in %s", version, *image.ID) - createVersionOp, err := config.Azure.GalleryImageVersions.BeginCreateOrUpdate(ctx, rg, *gallery.Name, *image.Name, version, armcompute.GalleryImageVersion{ + createVersionOp, err := s.GetAzure().GalleryImageVersions.BeginCreateOrUpdate(ctx, rg, *gallery.Name, *image.Name, version, armcompute.GalleryImageVersion{ Location: to.Ptr(s.Location), Properties: &armcompute.GalleryImageVersionProperties{ StorageProfile: &armcompute.GalleryImageVersionStorageProfile{ @@ -726,7 +745,7 @@ func CreateSIGImageVersionFromDisk(ctx context.Context, s *Scenario, version str customVHD := *s.Config.VHD customVHD.Name = *image.Name // Use the architecture-specific image name customVHD.Gallery = &config.Gallery{ - SubscriptionID: config.Config.SubscriptionID, + SubscriptionID: s.GetSubscriptionID(), ResourceGroupName: rg, Name: *gallery.Name, } diff --git a/e2e/types.go b/e2e/types.go index 9c6a3b177ce..495acb648f0 100644 --- a/e2e/types.go +++ b/e2e/types.go @@ -21,6 +21,33 @@ import ( "golang.org/x/crypto/ssh" ) +// ClusterInfra captures the Azure infrastructure scope for cluster operations. 
+// It allows cluster creation and management to target different subscriptions. +type ClusterInfra struct { + Azure *config.AzureClient + SubscriptionID string + ResourceGroupName func(location string) string +} + +// DefaultClusterInfra uses the default subscription and resource group naming. +var DefaultClusterInfra = &ClusterInfra{ + Azure: config.Azure, + SubscriptionID: config.Config.SubscriptionID, + ResourceGroupName: config.ResourceGroupName, +} + +// RCV1PClusterInfra returns the ClusterInfra for the RCV1P subscription, or nil if not configured. +func RCV1PClusterInfra() *ClusterInfra { + if config.RCV1PAzure == nil { + return nil + } + return &ClusterInfra{ + Azure: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + ResourceGroupName: config.RCV1PResourceGroupName, + } +} + type Tags struct { Name string ImageName string @@ -35,6 +62,7 @@ type Tags struct { Scriptless bool VHDCaching bool MockAzureChinaCloud bool + RCV1PCertMode bool VMSeriesCoverageTest bool } @@ -128,6 +156,14 @@ type Scenario struct { // a default size will be used. K8sSystemPoolSKU string + // AzureClient overrides the default config.Azure client for this scenario. + // When nil, config.Azure is used. + AzureClient *config.AzureClient + + // SubscriptionID overrides the default config.Config.SubscriptionID for this scenario. + // When empty, config.Config.SubscriptionID is used. + SubscriptionID string + // Runtime contains the runtime state of the scenario. It's populated in the beginning of the test run Runtime *ScenarioRuntime T testing.TB @@ -447,3 +483,35 @@ func (s *Scenario) GetContainerRegistryFQDN() string { // Default to public cloud container registry (also used by Fairfax/US Gov) return "mcr.microsoft.com" } + +// GetAzure returns the AzureClient for this scenario, falling back to the default config.Azure. 
+func (s *Scenario) GetAzure() *config.AzureClient { + if s.AzureClient != nil { + return s.AzureClient + } + return config.Azure +} + +// GetSubscriptionID returns the subscription ID for this scenario, falling back to config.Config.SubscriptionID. +func (s *Scenario) GetSubscriptionID() string { + if s.SubscriptionID != "" { + return s.SubscriptionID + } + return config.Config.SubscriptionID +} + +// GetResourceGroupName returns the resource group name for this scenario's location. +func (s *Scenario) GetResourceGroupName() string { + if s.SubscriptionID != "" && s.SubscriptionID != config.Config.SubscriptionID { + return config.RCV1PResourceGroupName(s.Location) + } + return config.ResourceGroupName(s.Location) +} + +// GetVMIdentityResourceID returns the VM identity resource ID for this scenario. +func (s *Scenario) GetVMIdentityResourceID() string { + if s.SubscriptionID != "" && s.SubscriptionID != config.Config.SubscriptionID { + return config.Config.RCV1PVMIdentityResourceID(s.Location) + } + return config.Config.VMIdentityResourceID(s.Location) +} diff --git a/e2e/validators.go b/e2e/validators.go index 96597ada092..0a2d6cc25b9 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -412,6 +412,13 @@ func ValidateNonEmptyDirectory(ctx context.Context, s *Scenario, dirName string) execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, "either could not find expected file, or something went wrong") } +func ValidateEmptyDirectory(ctx context.Context, s *Scenario, dirName string) { + s.T.Helper() + command := fmt.Sprintf("[ -d %s ] && [ -z \"$(ls -A %s)\" ]", dirName, dirName) + execScriptOnVMForScenarioValidateExitCode(ctx, s, command, 0, + fmt.Sprintf("expected directory %s to be empty or not exist", dirName)) +} + func ValidateInspektorGadget(ctx context.Context, s *Scenario) { s.T.Helper() @@ -2834,3 +2841,103 @@ func ValidateVulnerableKernelModulesDisabled(ctx context.Context, s *Scenario) { 
execScriptOnVMForScenarioValidateExitCode(ctx, s, script, 0, "Vulnerable kernel module mitigation validation failed (algif_aead/esp4/esp6/rxrpc)") } + +// ValidateRCV1PCertMode validates that the rcv1p certificate endpoint mode was used during +// Linux node provisioning, certificates were downloaded and installed, and a refresh task was scheduled. +func ValidateRCV1PCertMode(ctx context.Context, s *Scenario) { + s.T.Helper() + + // Validate the provisioning log shows rcv1p mode was selected + ValidateFileHasContent(ctx, s, "/var/log/azure/cluster-provision.log", + "Using custom cloud certificate endpoint mode: rcv1p") + + // Validate the subscription is opted in for root certs + ValidateFileHasContent(ctx, s, "/var/log/azure/cluster-provision.log", + "IsOptedInForRootCerts=true") + + // Validate certificates were downloaded + ValidateNonEmptyDirectory(ctx, s, "/root/AzureCACertificates") + + // Validate trust store was updated (distro-specific path) + trustStoreDir := rcv1pTrustStoreDir(s) + execScriptOnVMForScenarioValidateExitCode(ctx, s, + fmt.Sprintf("sudo ls -1 %s/*.crt 2>/dev/null || sudo ls -1 %s/*.pem 2>/dev/null", trustStoreDir, trustStoreDir), + 0, fmt.Sprintf("expected certificates in trust store directory %s", trustStoreDir)) + + // Validate refresh schedule was created (cron or systemd timer depending on distro) + if s.VHD.Flatcar || s.VHD.OS == config.OSACL { + // Flatcar and ACL use systemd timer + execScriptOnVMForScenarioValidateExitCode(ctx, s, + "systemctl is-enabled azure-ca-refresh.timer", + 0, "expected azure-ca-refresh.timer to be enabled") + } else { + // Ubuntu, Mariner, AzureLinux use cron + execScriptOnVMForScenarioValidateExitCode(ctx, s, + "sudo crontab -l 2>/dev/null | grep -q ca-refresh", + 0, "expected ca-refresh cron entry") + } +} + +// rcv1pTrustStoreDir returns the OS trust store directory for the given scenario's distro. 
+func rcv1pTrustStoreDir(s *Scenario) string {
+	vhdOS := s.VHD.OS
+	if vhdOS == config.OSMariner || vhdOS == config.OSAzureLinux || vhdOS == config.OSACL {
+		return "/etc/pki/ca-trust/source/anchors"
+	}
+	if vhdOS == config.OSFlatcar {
+		return "/etc/ssl/certs"
+	}
+	// Every other OS value (Ubuntu included) uses this directory.
+	return "/usr/local/share/ca-certificates"
+}
+
+// ValidateRCV1PCertModeWindows runs two PowerShell checks on the Windows node and expects both
+// to exit with code 0: that C:\ca exists and contains at least one file, and that a scheduled
+// task named aks-ca-certs-refresh-task is registered.
+// NOTE(review): neither check inspects Cert:\LocalMachine\Root or the provisioning log, so this
+// validator does not by itself prove the certificates were imported or which endpoint mode ran.
+func ValidateRCV1PCertModeWindows(ctx context.Context, s *Scenario) {
+	s.T.Helper()
+
+	// Check 1: the C:\ca folder must exist and hold at least one file.
+	caFolderScript := strings.Join([]string{
+		"$ErrorActionPreference = 'Stop'",
+		"$caFolder = 'C:\\ca'",
+		"if (-not (Test-Path $caFolder)) { throw 'CA certificates folder C:\\ca does not exist' }",
+		"$certs = Get-ChildItem -Path $caFolder -File",
+		"if ($certs.Count -eq 0) { throw 'No certificates found in C:\\ca folder' }",
+		"Write-Host \"Found $($certs.Count) certificate(s) in $caFolder\"",
+	}, "\n")
+	execScriptOnVMForScenarioValidateExitCode(ctx, s, caFolderScript, 0,
+		"expected certificates in C:\\ca")
+
+	// Check 2: the refresh scheduled task must be registered.
+	refreshTaskScript := strings.Join([]string{
+		"$ErrorActionPreference = 'Stop'",
+		"$task = Get-ScheduledTask -TaskName 'aks-ca-certs-refresh-task' -ErrorAction SilentlyContinue",
+		"if (-not $task) { throw 'aks-ca-certs-refresh-task scheduled task not found' }",
+		"Write-Host \"Scheduled task found: $($task.TaskName) (State: $($task.State))\"",
+	}, "\n")
+	execScriptOnVMForScenarioValidateExitCode(ctx, s, refreshTaskScript, 0,
+		"expected aks-ca-certs-refresh-task scheduled task")
+}
+
+// ValidateRCV1PNotOptedIn validates that when the VM does NOT have the opt-in tag,
+// wireserver returns IsOptedInForRootCerts=false and no certificates are installed,
+// even in the RCV1P subscription with PlatformSettingsOverride registered.
+func ValidateRCV1PNotOptedIn(ctx context.Context, s *Scenario) { + s.T.Helper() + + // Validate the provisioning log shows rcv1p mode was selected + ValidateFileHasContent(ctx, s, "/var/log/azure/cluster-provision.log", + "Using custom cloud certificate endpoint mode: rcv1p") + + // Validate wireserver reported not opted in + ValidateFileHasContent(ctx, s, "/var/log/azure/cluster-provision.log", + "Skipping custom cloud root cert installation because IsOptedInForRootCerts is not true") + + // Validate no certificates were downloaded + ValidateEmptyDirectory(ctx, s, "/root/AzureCACertificates") + + // Validate no refresh schedule was created + execScriptOnVMForScenarioValidateExitCode(ctx, s, + "sudo crontab -l 2>/dev/null | grep -q ca-refresh", + 1, "expected no ca-refresh cron entry when not opted in") +} diff --git a/e2e/vmss.go b/e2e/vmss.go index 4abf14f8931..7ebf839fd22 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -383,13 +383,13 @@ func createVMSSModel(ctx context.Context, s *Scenario) armcompute.VirtualMachine if config.Config.IsLocalBuild() { s.T.Logf( "VMSS portal link: https://ms.portal.azure.com/#@microsoft.onmicrosoft.com/resource/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/virtualMachineScaleSets/%s/overview", - config.Config.SubscriptionID, + s.GetSubscriptionID(), *cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, ) s.T.Logf( "Managed cluster portal link: https://ms.portal.azure.com/#@microsoft.onmicrosoft.com/resource/subscriptions/%s/resourceGroups/%s/providers/Microsoft.ContainerService/managedClusters/%s/overview", - config.Config.SubscriptionID, + s.GetSubscriptionID(), *cluster.Model.Properties.NodeResourceGroup, *cluster.Model.Name, ) @@ -401,8 +401,8 @@ func createVMSSModel(ctx context.Context, s *Scenario) armcompute.VirtualMachine model.Identity = &armcompute.VirtualMachineScaleSetIdentity{ Type: to.Ptr(armcompute.ResourceIdentityTypeSystemAssignedUserAssigned), UserAssignedIdentities: 
map[string]*armcompute.UserAssignedIdentitiesValue{ - *s.Runtime.Cluster.KubeletIdentity.ResourceID: {}, - config.Config.VMIdentityResourceID(s.Location): {}, + *s.Runtime.Cluster.KubeletIdentity.ResourceID: {}, + s.GetVMIdentityResourceID(): {}, }, } @@ -475,7 +475,7 @@ func CreateVMSSWithRetry(ctx context.Context, s *Scenario) (*ScenarioVM, error) func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*ScenarioVM, error) { defer toolkit.LogStepCtxf(ctx, "creating VMSS %s", s.Runtime.VMSSName)() vm := &ScenarioVM{} - operation, err := config.Azure.VMSS.BeginCreateOrUpdate( + operation, err := s.GetAzure().VMSS.BeginCreateOrUpdate( ctx, resourceGroupName, s.Runtime.VMSSName, @@ -492,7 +492,7 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc return vm, fmt.Errorf("failed to wait for VMSS VM: %w", err) } - vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) + vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) if err != nil { return vm, fmt.Errorf("failed to get VM private IP address: %w", err) } @@ -549,7 +549,7 @@ func waitForVMRunningState(ctx context.Context, s *Scenario, vmssVM *armcompute. 
var lastErr error for { // Get the updated VM with instance view to check power state - vm, err := config.Azure.VMSSVM.Get(ctxTimeout, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *vmssVM.InstanceID, &armcompute.VirtualMachineScaleSetVMsClientGetOptions{ + vm, err := s.GetAzure().VMSSVM.Get(ctxTimeout, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *vmssVM.InstanceID, &armcompute.VirtualMachineScaleSetVMsClientGetOptions{ Expand: to.Ptr(armcompute.InstanceViewTypesInstanceView), }) @@ -592,7 +592,7 @@ func waitForVMSSVM(ctx context.Context, s *Scenario) (*armcompute.VirtualMachine var lastErr error for { - pager := config.Azure.VMSSVM.NewListPager(*s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, &armcompute.VirtualMachineScaleSetVMsClientListOptions{ + pager := s.GetAzure().VMSSVM.NewListPager(*s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, &armcompute.VirtualMachineScaleSetVMsClientListOptions{ Expand: to.Ptr("instanceView"), }) @@ -622,9 +622,14 @@ func waitForVMSSVM(ctx context.Context, s *Scenario) (*armcompute.VirtualMachine } // getPrivateIPFromVMSSVM extracts the private IP address from a VMSS VM by querying its network interfaces. -func getPrivateIPFromVMSSVM(ctx context.Context, resourceGroup, vmssName, instanceID string) (string, error) { +func getPrivateIPFromVMSSVM(ctx context.Context, s *Scenario, resourceGroup, vmssName, instanceID string) (string, error) { + return getPrivateIPFromVMSSVMWithClient(ctx, s.GetAzure(), resourceGroup, vmssName, instanceID) +} + +// getPrivateIPFromVMSSVMWithClient extracts the private IP using the given Azure client. 
+func getPrivateIPFromVMSSVMWithClient(ctx context.Context, azure *config.AzureClient, resourceGroup, vmssName, instanceID string) (string, error) { // Query the network interface to get the IP configuration - pager := config.Azure.NetworkInterfaces.NewListVirtualMachineScaleSetVMNetworkInterfacesPager( + pager := azure.NetworkInterfaces.NewListVirtualMachineScaleSetVMNetworkInterfacesPager( resourceGroup, vmssName, instanceID, @@ -708,7 +713,7 @@ func extractBootDiagnostics(ctx context.Context, s *Scenario) error { return nil } - pager := config.Azure.VMSSVM.NewListPager(*s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, nil) + pager := s.GetAzure().VMSSVM.NewListPager(*s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, nil) for pager.More() { page, err := pager.NextPage(ctx) if err != nil { @@ -717,7 +722,7 @@ func extractBootDiagnostics(ctx context.Context, s *Scenario) error { for _, vmInstance := range page.Value { // Get boot diagnostics data - bootDiagResp, err := config.Azure.VMSSVM.RetrieveBootDiagnosticsData(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *vmInstance.InstanceID, nil) + bootDiagResp, err := s.GetAzure().VMSSVM.RetrieveBootDiagnosticsData(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, *vmInstance.InstanceID, nil) if err != nil { return fmt.Errorf("failed to get boot diagnostics for VM %s: %v", *vmInstance.InstanceID, err) } @@ -857,7 +862,7 @@ func extractLogsFromVMWindows(ctx context.Context, s *Scenario) { ctx, cancel := context.WithTimeout(ctx, 4*time.Minute) defer cancel() - pager := config.Azure.VMSSVM.NewListPager(*s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, nil) + pager := s.GetAzure().VMSSVM.NewListPager(*s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, nil) page, err := pager.NextPage(ctx) if err != nil { s.T.Logf("failed to list VMSS instances: %s", err) @@ -872,7 
+877,7 @@ func extractLogsFromVMWindows(ctx context.Context, s *Scenario) { blobPrefix := s.Runtime.VMSSName blobUrl := config.Config.BlobStorageAccountURL() + "/" + config.Config.BlobContainer + "/" + blobPrefix - client := config.Azure.VMSSVMRunCommands + client := s.GetAzure().VMSSVMRunCommands // Invoke the RunCommand on the VMSS instance s.T.Logf("uploading windows logs to blob storage at %s, may take a few minutes", blobUrl) @@ -977,7 +982,7 @@ func deleteVMSS(ctx context.Context, s *Scenario) { } return } - _, err := config.Azure.VMSS.BeginDelete(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, &armcompute.VirtualMachineScaleSetsClientBeginDeleteOptions{ + _, err := s.GetAzure().VMSS.BeginDelete(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, &armcompute.VirtualMachineScaleSetsClientBeginDeleteOptions{ ForceDeletion: to.Ptr(true), }) if err != nil { @@ -1180,7 +1185,7 @@ func getBaseVMSSModel(s *Scenario, customData, cseCmd string) armcompute.Virtual ID: to.Ptr( fmt.Sprintf( loadBalancerBackendAddressPoolIDTemplate, - config.Config.SubscriptionID, + s.GetSubscriptionID(), *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, ), ), diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index 2fd36c81434..862c2f09b6c 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -43,7 +43,7 @@ function make_request_with_retry { local response while [ $attempt -le $max_retries ]; do - response=$(curl -f --no-progress-meter "$url") + response=$(curl -f --no-progress-meter --connect-timeout 10 --max-time 30 "$url") local request_status=$? 
if echo "$response" | grep -q "RequestRateLimitExceeded"; then @@ -213,6 +213,7 @@ esac echo "Using custom cloud certificate endpoint mode: ${cert_endpoint_mode}" install_ca_refresh_schedule=0 +mkdir -p /root/AzureCACertificates rm -f /root/AzureCACertificates/* if [ "$cert_endpoint_mode" = "legacy" ]; then install_ca_refresh_schedule=1 diff --git a/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh b/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh index 58812659856..13e0f33e188 100644 --- a/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh +++ b/spec/parts/linux/cloud-init/artifacts/init_aks_custom_cloud_spec.sh @@ -20,7 +20,7 @@ Describe 'init-aks-custom-cloud.sh refresh mode wiring' End It 'maps ussec/usnat locations to legacy cert endpoint mode' - When run grep -Eq 'ussec\*|usnat\*\) cert_endpoint_mode="legacy"' "$script_path" + When run grep -Eq 'ussec\*\|usnat\*\) cert_endpoint_mode="legacy"' "$script_path" The status should eq 0 End diff --git a/staging/cse/windows/kubernetesfunc.ps1 b/staging/cse/windows/kubernetesfunc.ps1 index d9852e4288d..56df5977e87 100644 --- a/staging/cse/windows/kubernetesfunc.ps1 +++ b/staging/cse/windows/kubernetesfunc.ps1 @@ -89,7 +89,8 @@ function Register-CACertificatesRefreshTask { if ([string]::IsNullOrEmpty($Location)) { $refreshCommand = "& { . 'C:\AzureData\windows\windowscsehelper.ps1'; . 'C:\AzureData\windows\kubernetesfunc.ps1'; Get-CACertificates | Out-Null }" } else { - $refreshCommand = "& { . 'C:\AzureData\windows\windowscsehelper.ps1'; . 'C:\AzureData\windows\kubernetesfunc.ps1'; Get-CACertificates -Location '$Location' | Out-Null }" + $escapedLocation = $Location -replace "'", "''" + $refreshCommand = "& { . 'C:\AzureData\windows\windowscsehelper.ps1'; . 
'C:\AzureData\windows\kubernetesfunc.ps1'; Get-CACertificates -Location '$escapedLocation' | Out-Null }" } $action = New-ScheduledTaskAction -Execute "powershell.exe" -Argument "-NoProfile -NonInteractive -ExecutionPolicy Bypass -Command `"$refreshCommand`"" $principal = New-ScheduledTaskPrincipal -UserId SYSTEM -LogonType ServiceAccount -RunLevel Highest From ee24a233ee51cb23fe8726c93efd7e8c6fa7c8f3 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Tue, 14 Apr 2026 14:55:57 -0700 Subject: [PATCH 17/70] Address PR review feedback: fix multi-subscription, validation, and error handling - e2e/cluster.go: Pass ClusterInfra to ensureMaintenanceConfiguration and createNewMaintenanceConfiguration so RCV1P clusters use the correct subscription and resource group instead of the global default. - e2e/validators.go: Fix ValidateEmptyDirectory shell predicate to succeed when the directory is missing (not just when empty), matching the error message. Also quote dirName in the shell command. - staging/cse/windows/kubernetesfunc.ps1: Add -FailOnError switch to Get-CACertificates so initial provisioning fails fast on cert retrieval errors while the scheduled refresh task remains non-fatal. - parts/windows/kuberneteswindowssetup.ps1: Call Get-CACertificates with -FailOnError during initial provisioning. - staging/cse/windows/kubernetesfunc.tests.ps1: Add tests for -FailOnError behavior (exception and empty data paths). 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/cluster.go | 15 ++++++++------- e2e/validators.go | 2 +- parts/windows/kuberneteswindowssetup.ps1 | 2 +- staging/cse/windows/kubernetesfunc.ps1 | 10 +++++++++- staging/cse/windows/kubernetesfunc.tests.ps1 | 18 ++++++++++++++++++ 5 files changed, 37 insertions(+), 10 deletions(-) diff --git a/e2e/cluster.go b/e2e/cluster.go index a6a428ba91a..777651d2ffc 100644 --- a/e2e/cluster.go +++ b/e2e/cluster.go @@ -87,7 +87,7 @@ func prepareCluster(ctx context.Context, infra *ClusterInfra, clusterModel *armc bastion := dag.Go(g, func(ctx context.Context) (*Bastion, error) { return getOrCreateBastion(ctx, infra, cluster) }) - dag.Run(g, func(ctx context.Context) error { return ensureMaintenanceConfiguration(ctx, cluster) }) + dag.Run(g, func(ctx context.Context) error { return ensureMaintenanceConfiguration(ctx, infra, cluster) }) subnet := dag.Go(g, func(ctx context.Context) (string, error) { return getClusterSubnetID(ctx, infra, cluster) }) kube := dag.Go(g, func(ctx context.Context) (*Kubeclient, error) { return getClusterKubeClient(ctx, infra, cluster) }) identity := dag.Go(g, func(ctx context.Context) (*armcontainerservice.UserAssignedIdentity, error) { @@ -449,11 +449,12 @@ func isRetryableClusterError(err error) bool { return respErr.ErrorCode == "NotFound" && strings.Contains(err.Error(), "Reconcile managed identity credential failed") } -func ensureMaintenanceConfiguration(ctx context.Context, cluster *armcontainerservice.ManagedCluster) error { - _, err := config.Azure.Maintenance.Get(ctx, config.ResourceGroupName(*cluster.Location), *cluster.Name, "default", nil) +func ensureMaintenanceConfiguration(ctx context.Context, infra *ClusterInfra, cluster *armcontainerservice.ManagedCluster) error { + rgName := infra.ResourceGroupName(*cluster.Location) + _, err := infra.Azure.Maintenance.Get(ctx, rgName, *cluster.Name, "default", nil) var azErr 
*azcore.ResponseError if errors.As(err, &azErr) && azErr.StatusCode == 404 { - _, err = createNewMaintenanceConfiguration(ctx, cluster) + _, err = createNewMaintenanceConfiguration(ctx, infra, cluster) if err != nil { return fmt.Errorf("creating maintenance configuration for cluster %q: %w", *cluster.Name, err) } @@ -465,8 +466,8 @@ func ensureMaintenanceConfiguration(ctx context.Context, cluster *armcontainerse return nil } -func createNewMaintenanceConfiguration(ctx context.Context, cluster *armcontainerservice.ManagedCluster) (*armcontainerservice.MaintenanceConfiguration, error) { - rgName := config.ResourceGroupName(*cluster.Location) +func createNewMaintenanceConfiguration(ctx context.Context, infra *ClusterInfra, cluster *armcontainerservice.ManagedCluster) (*armcontainerservice.MaintenanceConfiguration, error) { + rgName := infra.ResourceGroupName(*cluster.Location) toolkit.Logf(ctx, "creating maintenance configuration for cluster %s in rg %s", *cluster.Name, rgName) maintenance := armcontainerservice.MaintenanceConfiguration{ Properties: &armcontainerservice.MaintenanceConfigurationProperties{ @@ -489,7 +490,7 @@ func createNewMaintenanceConfiguration(ctx context.Context, cluster *armcontaine }, } - _, err := config.Azure.Maintenance.CreateOrUpdate(ctx, rgName, *cluster.Name, "default", maintenance, nil) + _, err := infra.Azure.Maintenance.CreateOrUpdate(ctx, rgName, *cluster.Name, "default", maintenance, nil) if err != nil { return nil, fmt.Errorf("failed to create maintenance configuration: %w", err) } diff --git a/e2e/validators.go b/e2e/validators.go index 0a2d6cc25b9..fecd3f0d1ea 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -414,7 +414,7 @@ func ValidateNonEmptyDirectory(ctx context.Context, s *Scenario, dirName string) func ValidateEmptyDirectory(ctx context.Context, s *Scenario, dirName string) { s.T.Helper() - command := fmt.Sprintf("[ -d %s ] && [ -z \"$(ls -A %s)\" ]", dirName, dirName) + command := fmt.Sprintf("! 
[ -d '%s' ] || [ -z \"$(ls -A '%s')\" ]", dirName, dirName) execScriptOnVMForScenarioValidateExitCode(ctx, s, command, 0, fmt.Sprintf("expected directory %s to be empty or not exist", dirName)) } diff --git a/parts/windows/kuberneteswindowssetup.ps1 b/parts/windows/kuberneteswindowssetup.ps1 index 7c42d906c6a..63b4c2c3ff1 100644 --- a/parts/windows/kuberneteswindowssetup.ps1 +++ b/parts/windows/kuberneteswindowssetup.ps1 @@ -445,7 +445,7 @@ function BasePrep { {{end}} - Get-CACertificates -Location $Location + Get-CACertificates -Location $Location -FailOnError Write-CACert -CACertificate $global:CACertificate ` -KubeDir $global:KubeDir diff --git a/staging/cse/windows/kubernetesfunc.ps1 b/staging/cse/windows/kubernetesfunc.ps1 index 56df5977e87..159161153f0 100644 --- a/staging/cse/windows/kubernetesfunc.ps1 +++ b/staging/cse/windows/kubernetesfunc.ps1 @@ -320,7 +320,9 @@ function Should-InstallCACertificatesRefreshTask { function Get-CACertificates { Param( [Parameter(Mandatory = $false)][string] - $Location = "" + $Location = "", + [Parameter(Mandatory = $false)][switch] + $FailOnError ) $caFolder = "C:\ca" @@ -342,6 +344,9 @@ function Get-CACertificates { $rawData = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$uri; UseBasicParsing=$true} -Retries 5 -RetryDelaySeconds 10 $caCerts = ($rawData.Content) | ConvertFrom-Json if ($null -eq $caCerts -or $null -eq $caCerts.Certificates -or $caCerts.Certificates.Length -eq 0) { + if ($FailOnError) { + throw "CA certificates rawdata is empty for legacy endpoint" + } Write-Log "Warning: CA certificates rawdata is empty for legacy endpoint" return $false } @@ -406,6 +411,9 @@ function Get-CACertificates { return $downloadedAny } catch { + if ($FailOnError) { + throw "Failed to retrieve CA certificates. Error: $_" + } Write-Log "Warning: failed to retrieve CA certificates. 
Error: $_" return $false } diff --git a/staging/cse/windows/kubernetesfunc.tests.ps1 b/staging/cse/windows/kubernetesfunc.tests.ps1 index 8ada13ee440..42accc39c51 100644 --- a/staging/cse/windows/kubernetesfunc.tests.ps1 +++ b/staging/cse/windows/kubernetesfunc.tests.ps1 @@ -188,6 +188,24 @@ Describe 'Get-CACertificates' { $result | Should -Be $false } + It 'throws when certificate retrieval fails with -FailOnError' { + Mock Retry-Command -MockWith { + throw 'simulated retrieval failure' + } + + { Get-CACertificates -Location 'southcentralus' -FailOnError } | Should -Throw '*Failed to retrieve CA certificates*' + } + + It 'throws when legacy endpoint returns empty data with -FailOnError' { + Mock Retry-Command -MockWith { + return [PSCustomObject]@{ + Content = '{"Certificates":[]}' + } + } + + { Get-CACertificates -Location 'ussecwest' -FailOnError } | Should -Throw '*CA certificates rawdata is empty*' + } + It 'falls back to legacy endpoint when called without -Location (backward compat)' { $script:retryUris = @() Mock Retry-Command -MockWith { From 1f0fc8e60bb9517df878db34bf97e12a6487c90d Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Tue, 14 Apr 2026 15:09:03 -0700 Subject: [PATCH 18/70] Add Windows not-opted-in negative test for RCV1P cert mode Add Test_RCV1P_Windows_NotOptedIn which creates a Windows VM in the RCV1P subscription without the opt-in tag and validates that: - C:\ca is empty or does not exist (no certificates downloaded) - aks-ca-certs-refresh-task scheduled task is not registered This mirrors the existing Linux Test_RCV1P_NotOptedIn test to ensure the two-layer access control (subscription feature + VM tag) works on Windows. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/scenario_rcv1p_win_test.go | 25 +++++++++++++++++++++++++ e2e/validators.go | 27 +++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/e2e/scenario_rcv1p_win_test.go b/e2e/scenario_rcv1p_win_test.go index 55a35b584bb..73c3851671d 100644 --- a/e2e/scenario_rcv1p_win_test.go +++ b/e2e/scenario_rcv1p_win_test.go @@ -89,3 +89,28 @@ func Test_RCV1P_Windows2025(t *testing.T) { }, }) } + +// Test_RCV1P_Windows_NotOptedIn is a negative test that validates the VM opt-in tag is required +// for cert installation on Windows. The VM is created in the RCV1P subscription (which has +// PlatformSettingsOverride registered) but WITHOUT the opt-in tag on the VMSS. +// This verifies that wireserver returns IsOptedInForRootCerts=false and the provisioning +// script correctly skips certificate download and refresh task registration. +func Test_RCV1P_Windows_NotOptedIn(t *testing.T) { + skipIfRCV1PNotConfigured(t) + RunScenario(t, &Scenario{ + Description: "Tests RCV1P cert mode on Windows without VM opt-in tag; expects no cert installation", + AzureClient: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + Tags: Tags{ + RCV1PCertMode: true, + }, + Config: Config{ + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDWindows2022Containerd, + BootstrapConfigMutator: EmptyBootstrapConfigMutator, + Validator: func(ctx context.Context, s *Scenario) { + ValidateRCV1PNotOptedInWindows(ctx, s) + }, + }, + }) +} diff --git a/e2e/validators.go b/e2e/validators.go index fecd3f0d1ea..4773497d934 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -2941,3 +2941,30 @@ func ValidateRCV1PNotOptedIn(ctx context.Context, s *Scenario) { "sudo crontab -l 2>/dev/null | grep -q ca-refresh", 1, "expected no ca-refresh cron entry when not opted in") } + +// ValidateRCV1PNotOptedInWindows validates that when the Windows VM does NOT have the opt-in tag, 
+// no certificates are installed to C:\ca and no refresh scheduled task is registered, +// even in the RCV1P subscription with PlatformSettingsOverride registered. +func ValidateRCV1PNotOptedInWindows(ctx context.Context, s *Scenario) { + s.T.Helper() + + // Validate C:\ca is empty or does not exist + command := []string{ + "$ErrorActionPreference = 'Stop'", + "$caFolder = 'C:\\ca'", + "if ((Test-Path $caFolder) -and @(Get-ChildItem -Path $caFolder -File).Count -gt 0) { throw 'Expected C:\\ca to be empty or not exist, but found certificates' }", + "Write-Host 'C:\\ca is empty or does not exist as expected'", + } + execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, + "expected C:\\ca to be empty or not exist when not opted in") + + // Validate no refresh scheduled task was registered + command = []string{ + "$ErrorActionPreference = 'Stop'", + "$task = Get-ScheduledTask -TaskName 'aks-ca-certs-refresh-task' -ErrorAction SilentlyContinue", + "if ($task) { throw 'Expected no aks-ca-certs-refresh-task but found one' }", + "Write-Host 'No aks-ca-certs-refresh-task found as expected'", + } + execScriptOnVMForScenarioValidateExitCode(ctx, s, strings.Join(command, "\n"), 0, + "expected no aks-ca-certs-refresh-task scheduled task when not opted in") +} From 8d17c7c1abf193ed0faadaaefe3e2365d8b94fa5 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 15 Apr 2026 17:32:31 -0700 Subject: [PATCH 19/70] e2e: add VM instance-level tag update for RCV1P wireserver opt-in Wireserver checks tags on the individual VMSS VM instance, not the VMSS resource-level tags. Add VMInstanceTags field to Config and update the VM instance after it appears in the API but before CSE completes. This ensures wireserver sees the opt-in tag when init-aks-custom-cloud.sh queries IsOptedInForRootCerts during provisioning. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/scenario_rcv1p_test.go | 30 ++++++++++++++++++++++-------- e2e/scenario_rcv1p_win_test.go | 3 +++ e2e/types.go | 6 ++++++ e2e/vmss.go | 29 +++++++++++++++++++++++++++++ 4 files changed, 60 insertions(+), 8 deletions(-) diff --git a/e2e/scenario_rcv1p_test.go b/e2e/scenario_rcv1p_test.go index 817b63cba8e..0bb927798ae 100644 --- a/e2e/scenario_rcv1p_test.go +++ b/e2e/scenario_rcv1p_test.go @@ -44,8 +44,9 @@ func skipIfRCV1PNotConfigured(t *testing.T) { } } -// rcv1pOptInVMConfigMutator sets the platform opt-in tag on the VMSS so that wireserver -// will serve root certificates to this VM during provisioning. +// rcv1pOptInVMConfigMutator sets the platform opt-in tag on the VMSS resource level. +// Note: For wireserver to recognize the tag, it must also be set on the individual VM instance. +// Use VMInstanceTags in the Config to set instance-level tags (applied after VM creation). func rcv1pOptInVMConfigMutator(vmss *armcompute.VirtualMachineScaleSet) { if vmss.Tags == nil { vmss.Tags = map[string]*string{} @@ -53,6 +54,14 @@ func rcv1pOptInVMConfigMutator(vmss *armcompute.VirtualMachineScaleSet) { vmss.Tags[rcv1pOptInTag] = to.Ptr("true") } +// rcv1pVMInstanceTags returns the tags that must be set on individual VM instances +// for wireserver to serve root certificates. +func rcv1pVMInstanceTags() map[string]*string { + return map[string]*string{ + rcv1pOptInTag: to.Ptr("true"), + } +} + // Test_RCV1P_Ubuntu2204 validates RCV1P cert download and trust store installation on Ubuntu 22.04. // Ubuntu uses /usr/local/share/ca-certificates/ as the cert drop folder and update-ca-certificates // to rebuild the trust bundle. 
@@ -66,9 +75,10 @@ func Test_RCV1P_Ubuntu2204(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, - VHD: config.VHDUbuntu2204Gen2Containerd, + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDUbuntu2204Gen2Containerd, VMConfigMutator: rcv1pOptInVMConfigMutator, + VMInstanceTags: rcv1pVMInstanceTags(), BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { }, Validator: func(ctx context.Context, s *Scenario) { @@ -91,9 +101,10 @@ func Test_RCV1P_Ubuntu2404(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, - VHD: config.VHDUbuntu2404Gen2Containerd, + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDUbuntu2404Gen2Containerd, VMConfigMutator: rcv1pOptInVMConfigMutator, + VMInstanceTags: rcv1pVMInstanceTags(), BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { }, Validator: func(ctx context.Context, s *Scenario) { @@ -116,9 +127,10 @@ func Test_RCV1P_AzureLinuxV3(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, - VHD: config.VHDAzureLinuxV3Gen2, + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDAzureLinuxV3Gen2, VMConfigMutator: rcv1pOptInVMConfigMutator, + VMInstanceTags: rcv1pVMInstanceTags(), BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { }, Validator: func(ctx context.Context, s *Scenario) { @@ -144,6 +156,7 @@ func Test_RCV1P_Flatcar(t *testing.T) { Cluster: ClusterRCV1PKubenet, VHD: config.VHDFlatcarGen2, VMConfigMutator: rcv1pOptInVMConfigMutator, + VMInstanceTags: rcv1pVMInstanceTags(), BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { }, Validator: func(ctx context.Context, s *Scenario) { @@ -172,6 +185,7 @@ func Test_RCV1P_ACL(t *testing.T) { vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) rcv1pOptInVMConfigMutator(vmss) }, + VMInstanceTags: rcv1pVMInstanceTags(), BootstrapConfigMutator: func(nbc 
*datamodel.NodeBootstrappingConfiguration) { }, Validator: func(ctx context.Context, s *Scenario) { diff --git a/e2e/scenario_rcv1p_win_test.go b/e2e/scenario_rcv1p_win_test.go index 73c3851671d..0932ae5f97b 100644 --- a/e2e/scenario_rcv1p_win_test.go +++ b/e2e/scenario_rcv1p_win_test.go @@ -32,6 +32,7 @@ func Test_RCV1P_Windows2022(t *testing.T) { Cluster: ClusterRCV1PKubenet, VHD: config.VHDWindows2022Containerd, VMConfigMutator: rcv1pOptInVMConfigMutator, + VMInstanceTags: rcv1pVMInstanceTags(), BootstrapConfigMutator: EmptyBootstrapConfigMutator, Validator: func(ctx context.Context, s *Scenario) { ValidateRCV1PCertModeWindows(ctx, s) @@ -54,6 +55,7 @@ func Test_RCV1P_Windows23H2(t *testing.T) { Cluster: ClusterRCV1PKubenet, VHD: config.VHDWindows23H2, VMConfigMutator: rcv1pOptInVMConfigMutator, + VMInstanceTags: rcv1pVMInstanceTags(), BootstrapConfigMutator: EmptyBootstrapConfigMutator, Validator: func(ctx context.Context, s *Scenario) { ValidateRCV1PCertModeWindows(ctx, s) @@ -80,6 +82,7 @@ func Test_RCV1P_Windows2025(t *testing.T) { vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) rcv1pOptInVMConfigMutator(vmss) }, + VMInstanceTags: rcv1pVMInstanceTags(), BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { Windows2025BootstrapConfigMutator(t, nbc) }, diff --git a/e2e/types.go b/e2e/types.go index 495acb648f0..6350f82209d 100644 --- a/e2e/types.go +++ b/e2e/types.go @@ -253,6 +253,12 @@ type Config struct { // This prevents the Guest Agent from sweeping events before they can be read. // Only set this on CSE performance test scenarios. EagerCSETimingExtraction bool + + // VMInstanceTags are tags applied directly to VMSS VM instances after creation via BeginUpdate. + // This is needed for features like RCV1P where wireserver checks tags on the individual VM instance, + // not the VMSS resource-level tags. 
These tags are applied after the VM appears in the API but + // before CSE completes, giving wireserver time to see them before the provisioning scripts query it. + VMInstanceTags map[string]*string } func (s *Scenario) PrepareAKSNodeConfig() { diff --git a/e2e/vmss.go b/e2e/vmss.go index 7ebf839fd22..a6facf56150 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -492,6 +492,12 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc return vm, fmt.Errorf("failed to wait for VMSS VM: %w", err) } + if len(s.Config.VMInstanceTags) > 0 { + if err := updateVMInstanceTags(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID, s.Config.VMInstanceTags); err != nil { + return vm, fmt.Errorf("failed to update VM instance tags: %w", err) + } + } + vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) if err != nil { return vm, fmt.Errorf("failed to get VM private IP address: %w", err) @@ -538,6 +544,29 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc }, nil } +// updateVMInstanceTags updates tags on an individual VMSS VM instance. This is used for features +// like RCV1P where wireserver checks tags on the VM instance level, not the VMSS resource level. +// The update is done after the VM appears in the API but before CSE completes, ensuring the tags +// are visible to wireserver before provisioning scripts query it. 
+func updateVMInstanceTags(ctx context.Context, s *Scenario, resourceGroupName, vmssName, instanceID string, tags map[string]*string) error { + defer toolkit.LogStepCtxf(ctx, "updating VM instance %s/%s tags", vmssName, instanceID)() + + poller, err := s.GetAzure().VMSSVM.BeginUpdate(ctx, resourceGroupName, vmssName, instanceID, + armcompute.VirtualMachineScaleSetVM{ + Tags: tags, + }, nil) + if err != nil { + return fmt.Errorf("failed to begin VM instance tag update: %w", err) + } + + _, err = poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) + if err != nil { + return fmt.Errorf("failed to complete VM instance tag update: %w", err) + } + + return nil +} + // waitForVMRunningState polls until the VM reaches "Running" power state or the timeout elapses. func waitForVMRunningState(ctx context.Context, s *Scenario, vmssVM *armcompute.VirtualMachineScaleSetVM) error { ctxTimeout, cancel := context.WithTimeout(ctx, 3*time.Minute) From 8b65cc8770e4b2cbd072d80b60f342ad4cf83fd2 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 15 Apr 2026 21:21:17 -0700 Subject: [PATCH 20/70] e2e: use JSON injection for VM profile tags at VMSS creation time The previous approach of updating VM instance tags after creation had a race condition: the BeginUpdate took ~108s, but CSE ran init-aks-custom-cloud.sh and queried wireserver before the tag update completed. Now we marshal the VMSS model to JSON, inject tags into virtualMachineProfile, and send a raw ARM PUT request via the SDK pipeline. This ensures the tags are present at VMSS creation time and propagate to VM instances before CSE boots. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/vmss.go | 147 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 125 insertions(+), 22 deletions(-) diff --git a/e2e/vmss.go b/e2e/vmss.go index a6facf56150..bf0229060fe 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -24,6 +24,8 @@ import ( "github.com/Azure/agentbaker/pkg/agent" "github.com/Azure/agentbaker/pkg/agent/datamodel" "github.com/Azure/azure-sdk-for-go/sdk/azcore" + azruntime "github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime" + "github.com/Azure/azure-sdk-for-go/sdk/azcore/streaming" "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7" "github.com/stretchr/testify/require" @@ -475,11 +477,21 @@ func CreateVMSSWithRetry(ctx context.Context, s *Scenario) (*ScenarioVM, error) func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*ScenarioVM, error) { defer toolkit.LogStepCtxf(ctx, "creating VMSS %s", s.Runtime.VMSSName)() vm := &ScenarioVM{} + + vmssModel := createVMSSModel(ctx, s) + + // When VMInstanceTags are configured, we need to inject tags into + // virtualMachineProfile which the Go SDK doesn't expose for Uniform mode VMSS. + // We marshal the model to JSON, inject the tags, and send a raw ARM PUT request. 
+ if len(s.Config.VMInstanceTags) > 0 { + return createVMSSWithProfileTags(ctx, s, resourceGroupName, vmssModel, vm) + } + operation, err := s.GetAzure().VMSS.BeginCreateOrUpdate( ctx, resourceGroupName, s.Runtime.VMSSName, - createVMSSModel(ctx, s), + vmssModel, nil, ) if err != nil { @@ -492,12 +504,6 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc return vm, fmt.Errorf("failed to wait for VMSS VM: %w", err) } - if len(s.Config.VMInstanceTags) > 0 { - if err := updateVMInstanceTags(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID, s.Config.VMInstanceTags); err != nil { - return vm, fmt.Errorf("failed to update VM instance tags: %w", err) - } - } - vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) if err != nil { return vm, fmt.Errorf("failed to get VM private IP address: %w", err) @@ -544,27 +550,124 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc }, nil } -// updateVMInstanceTags updates tags on an individual VMSS VM instance. This is used for features -// like RCV1P where wireserver checks tags on the VM instance level, not the VMSS resource level. -// The update is done after the VM appears in the API but before CSE completes, ensuring the tags -// are visible to wireserver before provisioning scripts query it. -func updateVMInstanceTags(ctx context.Context, s *Scenario, resourceGroupName, vmssName, instanceID string, tags map[string]*string) error { - defer toolkit.LogStepCtxf(ctx, "updating VM instance %s/%s tags", vmssName, instanceID)() - - poller, err := s.GetAzure().VMSSVM.BeginUpdate(ctx, resourceGroupName, vmssName, instanceID, - armcompute.VirtualMachineScaleSetVM{ - Tags: tags, - }, nil) +// createVMSSWithProfileTags creates a VMSS using a raw ARM PUT request, injecting tags into +// virtualMachineProfile that the Go SDK doesn't expose for Uniform mode VMSS. 
This is needed +// for features like RCV1P where wireserver checks VM instance-level tags: the tags must be +// present at VMSS creation time so they propagate to VM instances before CSE runs. +func createVMSSWithProfileTags(ctx context.Context, s *Scenario, resourceGroupName string, vmssModel armcompute.VirtualMachineScaleSet, vm *ScenarioVM) (*ScenarioVM, error) { + defer toolkit.LogStepCtxf(ctx, "creating VMSS %s with VM profile tags", s.Runtime.VMSSName)() + + // Marshal the typed model to a generic map so we can inject virtualMachineProfile.tags + vmssJSON, err := json.Marshal(vmssModel) if err != nil { - return fmt.Errorf("failed to begin VM instance tag update: %w", err) + return vm, fmt.Errorf("failed to marshal VMSS model: %w", err) } - _, err = poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) + var vmssMap map[string]interface{} + if err := json.Unmarshal(vmssJSON, &vmssMap); err != nil { + return vm, fmt.Errorf("failed to unmarshal VMSS model to map: %w", err) + } + + // Inject tags into properties.virtualMachineProfile + props, ok := vmssMap["properties"].(map[string]interface{}) + if !ok { + return vm, fmt.Errorf("VMSS model missing 'properties' field") + } + vmProfile, ok := props["virtualMachineProfile"].(map[string]interface{}) + if !ok { + return vm, fmt.Errorf("VMSS model missing 'properties.virtualMachineProfile' field") + } + vmProfile["tags"] = s.Config.VMInstanceTags + s.T.Logf("injected VM profile tags: %v", s.Config.VMInstanceTags) + + // Re-marshal the modified model + modifiedBody, err := json.Marshal(vmssMap) if err != nil { - return fmt.Errorf("failed to complete VM instance tag update: %w", err) + return vm, fmt.Errorf("failed to marshal modified VMSS model: %w", err) } - return nil + // Build the ARM resource URL + subscriptionID := s.SubscriptionID + if subscriptionID == "" { + subscriptionID = config.Config.SubscriptionID + } + resourceURL := 
fmt.Sprintf("https://management.azure.com/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/virtualMachineScaleSets/%s?api-version=2025-04-01", + subscriptionID, resourceGroupName, s.Runtime.VMSSName) + + // Send raw PUT request via the SDK pipeline (includes auth, retry, logging) + req, err := azruntime.NewRequest(ctx, "PUT", resourceURL) + if err != nil { + return vm, fmt.Errorf("failed to create ARM request: %w", err) + } + req.Raw().Header.Set("Content-Type", "application/json") + if err := req.SetBody(streaming.NopCloser(bytes.NewReader(modifiedBody)), "application/json"); err != nil { + return vm, fmt.Errorf("failed to set request body: %w", err) + } + + resp, err := s.GetAzure().Core.Pipeline().Do(req) + if err != nil { + return vm, fmt.Errorf("failed to send VMSS creation request: %w", err) + } + if resp.StatusCode != 200 && resp.StatusCode != 201 { + body, _ := io.ReadAll(resp.Body) + resp.Body.Close() + return vm, fmt.Errorf("VMSS creation failed with status %d: %s", resp.StatusCode, string(body)) + } + + // Create a poller for the async operation + poller, err := azruntime.NewPoller[armcompute.VirtualMachineScaleSetsClientCreateOrUpdateResponse](resp, s.GetAzure().Core.Pipeline(), nil) + if err != nil { + return vm, fmt.Errorf("failed to create VMSS creation poller: %w", err) + } + + // Wait for VMSS VM to appear before extracting the private IP + vm.VM, err = waitForVMSSVM(ctx, s) + if err != nil { + return vm, fmt.Errorf("failed to wait for VMSS VM: %w", err) + } + + vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) + if err != nil { + return vm, fmt.Errorf("failed to get VM private IP address: %w", err) + } + + s.T.Cleanup(func() { + defer cleanupBastionTunnel(vm.SSHClient) + cleanupVMSS(ctx, s, vm) + }) + + result := "SSH Instructions: (may take a few minutes for the VM to be ready for SSH)\n========================\n" + if config.Config.KeepVMSS { + s.T.Logf("VM will be 
preserved after the test finishes, PLEASE MANUALLY DELETE THE VMSS. Set KEEP_VMSS=false to delete it automatically after the test finishes\n") + } else { + s.T.Logf("VM will be automatically deleted after the test finishes, to preserve it for debugging purposes set KEEP_VMSS=true or pause the test with a breakpoint before the test finishes or failed\n") + } + result += fmt.Sprintf(`az network bastion ssh --target-resource-id "%s" --name "%s-bastion" --resource-group %s --auth-type ssh-key --username azureuser --ssh-key %s`, *vm.VM.ID, *s.Runtime.Cluster.Model.Name, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, config.VMSSHPrivateKeyFileName) + "\n" + s.T.Log(result) + + vmssResp, err := poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) + if !s.Config.SkipSSHConnectivityValidation { + var bastErr error + vm.SSHClient, bastErr = DialSSHOverBastion(ctx, s.Runtime.Cluster.Bastion, vm.PrivateIP, config.VMSSHPrivateKey) + if bastErr != nil { + return vm, fmt.Errorf("failed to start bastion tunnel: %w", bastErr) + } + } + if err != nil { + return vm, err + } + + err = waitForVMRunningState(ctx, s, vm.VM) + if err != nil { + return vm, fmt.Errorf("failed to wait for VM to reach running state: %w", err) + } + + return &ScenarioVM{ + VMSS: &vmssResp.VirtualMachineScaleSet, + PrivateIP: vm.PrivateIP, + VM: vm.VM, + SSHClient: vm.SSHClient, + }, nil } // waitForVMRunningState polls until the VM reaches "Running" power state or the timeout elapses. From dbc0a285a26e555a28b686affcb29b559b4d44ba Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 16 Apr 2026 00:18:21 -0700 Subject: [PATCH 21/70] e2e: use lightweight PATCH for VM instance tags instead of JSON injection The ARM API does not support virtualMachineProfile.tags for Uniform mode VMSS (400 BadRequest). Instead, use a lightweight PATCH request to update tags on the VM instance after it appears. 
PATCH only modifies the tags property and should complete in seconds, unlike BeginUpdate which triggers a full model update (~108s). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/vmss.go | 149 +++++++++++++++------------------------------------- 1 file changed, 43 insertions(+), 106 deletions(-) diff --git a/e2e/vmss.go b/e2e/vmss.go index bf0229060fe..a1ee00bd4f7 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -478,20 +478,11 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc defer toolkit.LogStepCtxf(ctx, "creating VMSS %s", s.Runtime.VMSSName)() vm := &ScenarioVM{} - vmssModel := createVMSSModel(ctx, s) - - // When VMInstanceTags are configured, we need to inject tags into - // virtualMachineProfile which the Go SDK doesn't expose for Uniform mode VMSS. - // We marshal the model to JSON, inject the tags, and send a raw ARM PUT request. - if len(s.Config.VMInstanceTags) > 0 { - return createVMSSWithProfileTags(ctx, s, resourceGroupName, vmssModel, vm) - } - operation, err := s.GetAzure().VMSS.BeginCreateOrUpdate( ctx, resourceGroupName, s.Runtime.VMSSName, - vmssModel, + createVMSSModel(ctx, s), nil, ) if err != nil { @@ -504,6 +495,15 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc return vm, fmt.Errorf("failed to wait for VMSS VM: %w", err) } + // Apply VM instance tags via lightweight PATCH before CSE queries wireserver. + // This is needed for features like RCV1P where wireserver checks tags on the + // individual VM instance, not the VMSS resource-level tags. 
+ if len(s.Config.VMInstanceTags) > 0 { + if err := patchVMInstanceTags(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID, s.Config.VMInstanceTags); err != nil { + return vm, fmt.Errorf("failed to patch VM instance tags: %w", err) + } + } + vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) if err != nil { return vm, fmt.Errorf("failed to get VM private IP address: %w", err) @@ -550,124 +550,61 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc }, nil } -// createVMSSWithProfileTags creates a VMSS using a raw ARM PUT request, injecting tags into -// virtualMachineProfile that the Go SDK doesn't expose for Uniform mode VMSS. This is needed -// for features like RCV1P where wireserver checks VM instance-level tags: the tags must be -// present at VMSS creation time so they propagate to VM instances before CSE runs. -func createVMSSWithProfileTags(ctx context.Context, s *Scenario, resourceGroupName string, vmssModel armcompute.VirtualMachineScaleSet, vm *ScenarioVM) (*ScenarioVM, error) { - defer toolkit.LogStepCtxf(ctx, "creating VMSS %s with VM profile tags", s.Runtime.VMSSName)() - - // Marshal the typed model to a generic map so we can inject virtualMachineProfile.tags - vmssJSON, err := json.Marshal(vmssModel) - if err != nil { - return vm, fmt.Errorf("failed to marshal VMSS model: %w", err) - } +// patchVMInstanceTags sends a lightweight PATCH request to update tags on a VMSS VM instance. +// This is much faster than BeginUpdate (which triggers a full model update) because it only +// modifies the tags property. The PATCH typically completes in seconds rather than minutes. 
+func patchVMInstanceTags(ctx context.Context, s *Scenario, resourceGroupName, vmssName, instanceID string, tags map[string]*string) error { + defer toolkit.LogStepCtxf(ctx, "patching VM instance %s/%s tags", vmssName, instanceID)() - var vmssMap map[string]interface{} - if err := json.Unmarshal(vmssJSON, &vmssMap); err != nil { - return vm, fmt.Errorf("failed to unmarshal VMSS model to map: %w", err) - } - - // Inject tags into properties.virtualMachineProfile - props, ok := vmssMap["properties"].(map[string]interface{}) - if !ok { - return vm, fmt.Errorf("VMSS model missing 'properties' field") - } - vmProfile, ok := props["virtualMachineProfile"].(map[string]interface{}) - if !ok { - return vm, fmt.Errorf("VMSS model missing 'properties.virtualMachineProfile' field") - } - vmProfile["tags"] = s.Config.VMInstanceTags - s.T.Logf("injected VM profile tags: %v", s.Config.VMInstanceTags) - - // Re-marshal the modified model - modifiedBody, err := json.Marshal(vmssMap) - if err != nil { - return vm, fmt.Errorf("failed to marshal modified VMSS model: %w", err) - } - - // Build the ARM resource URL subscriptionID := s.SubscriptionID if subscriptionID == "" { subscriptionID = config.Config.SubscriptionID } - resourceURL := fmt.Sprintf("https://management.azure.com/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/virtualMachineScaleSets/%s?api-version=2025-04-01", - subscriptionID, resourceGroupName, s.Runtime.VMSSName) - // Send raw PUT request via the SDK pipeline (includes auth, retry, logging) - req, err := azruntime.NewRequest(ctx, "PUT", resourceURL) - if err != nil { - return vm, fmt.Errorf("failed to create ARM request: %w", err) - } - req.Raw().Header.Set("Content-Type", "application/json") - if err := req.SetBody(streaming.NopCloser(bytes.NewReader(modifiedBody)), "application/json"); err != nil { - return vm, fmt.Errorf("failed to set request body: %w", err) - } + resourceURL := 
fmt.Sprintf("https://management.azure.com/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/virtualMachineScaleSets/%s/virtualMachines/%s?api-version=2025-04-01", + subscriptionID, resourceGroupName, vmssName, instanceID) - resp, err := s.GetAzure().Core.Pipeline().Do(req) - if err != nil { - return vm, fmt.Errorf("failed to send VMSS creation request: %w", err) - } - if resp.StatusCode != 200 && resp.StatusCode != 201 { - body, _ := io.ReadAll(resp.Body) - resp.Body.Close() - return vm, fmt.Errorf("VMSS creation failed with status %d: %s", resp.StatusCode, string(body)) - } + body := struct { + Tags map[string]*string `json:"tags"` + }{Tags: tags} - // Create a poller for the async operation - poller, err := azruntime.NewPoller[armcompute.VirtualMachineScaleSetsClientCreateOrUpdateResponse](resp, s.GetAzure().Core.Pipeline(), nil) + bodyJSON, err := json.Marshal(body) if err != nil { - return vm, fmt.Errorf("failed to create VMSS creation poller: %w", err) + return fmt.Errorf("failed to marshal tag patch body: %w", err) } - // Wait for VMSS VM to appear before extracting the private IP - vm.VM, err = waitForVMSSVM(ctx, s) + req, err := azruntime.NewRequest(ctx, "PATCH", resourceURL) if err != nil { - return vm, fmt.Errorf("failed to wait for VMSS VM: %w", err) + return fmt.Errorf("failed to create PATCH request: %w", err) + } + if err := req.SetBody(streaming.NopCloser(bytes.NewReader(bodyJSON)), "application/json"); err != nil { + return fmt.Errorf("failed to set request body: %w", err) } - vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) + resp, err := s.GetAzure().Core.Pipeline().Do(req) if err != nil { - return vm, fmt.Errorf("failed to get VM private IP address: %w", err) + return fmt.Errorf("failed to send PATCH request: %w", err) } - s.T.Cleanup(func() { - defer cleanupBastionTunnel(vm.SSHClient) - cleanupVMSS(ctx, s, vm) - }) - - result := "SSH Instructions: (may take a few minutes 
for the VM to be ready for SSH)\n========================\n" - if config.Config.KeepVMSS { - s.T.Logf("VM will be preserved after the test finishes, PLEASE MANUALLY DELETE THE VMSS. Set KEEP_VMSS=false to delete it automatically after the test finishes\n") - } else { - s.T.Logf("VM will be automatically deleted after the test finishes, to preserve it for debugging purposes set KEEP_VMSS=true or pause the test with a breakpoint before the test finishes or failed\n") + if resp.StatusCode != 200 && resp.StatusCode != 202 { + respBody, _ := io.ReadAll(resp.Body) + resp.Body.Close() + return fmt.Errorf("PATCH VM instance tags failed with status %d: %s", resp.StatusCode, string(respBody)) } - result += fmt.Sprintf(`az network bastion ssh --target-resource-id "%s" --name "%s-bastion" --resource-group %s --auth-type ssh-key --username azureuser --ssh-key %s`, *vm.VM.ID, *s.Runtime.Cluster.Model.Name, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, config.VMSSHPrivateKeyFileName) + "\n" - s.T.Log(result) - vmssResp, err := poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) - if !s.Config.SkipSSHConnectivityValidation { - var bastErr error - vm.SSHClient, bastErr = DialSSHOverBastion(ctx, s.Runtime.Cluster.Bastion, vm.PrivateIP, config.VMSSHPrivateKey) - if bastErr != nil { - return vm, fmt.Errorf("failed to start bastion tunnel: %w", bastErr) + // If 202 Accepted, poll until complete + if resp.StatusCode == 202 { + poller, err := azruntime.NewPoller[struct{}](resp, s.GetAzure().Core.Pipeline(), nil) + if err != nil { + return fmt.Errorf("failed to create poller for tag PATCH: %w", err) + } + _, err = poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) + if err != nil { + return fmt.Errorf("failed to complete tag PATCH: %w", err) } - } - if err != nil { - return vm, err - } - - err = waitForVMRunningState(ctx, s, vm.VM) - if err != nil { - return vm, fmt.Errorf("failed to wait for VM to reach running state: %w", err) } - return &ScenarioVM{ - 
VMSS: &vmssResp.VirtualMachineScaleSet, - PrivateIP: vm.PrivateIP, - VM: vm.VM, - SSHClient: vm.SSHClient, - }, nil + return nil } // waitForVMRunningState polls until the VM reaches "Running" power state or the timeout elapses. From 266f9d5644e459d35160584dbf5e62410a3c4ab9 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 16 Apr 2026 10:40:31 -0700 Subject: [PATCH 22/70] Revert "e2e: use lightweight PATCH for VM instance tags instead of JSON injection" This reverts commit 03efe783c5dad08baa425e4fa43eaed022eb3dd2. Signed-off-by: Ramkumar Chinchani --- e2e/vmss.go | 149 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 106 insertions(+), 43 deletions(-) diff --git a/e2e/vmss.go b/e2e/vmss.go index a1ee00bd4f7..bf0229060fe 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -478,11 +478,20 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc defer toolkit.LogStepCtxf(ctx, "creating VMSS %s", s.Runtime.VMSSName)() vm := &ScenarioVM{} + vmssModel := createVMSSModel(ctx, s) + + // When VMInstanceTags are configured, we need to inject tags into + // virtualMachineProfile which the Go SDK doesn't expose for Uniform mode VMSS. + // We marshal the model to JSON, inject the tags, and send a raw ARM PUT request. + if len(s.Config.VMInstanceTags) > 0 { + return createVMSSWithProfileTags(ctx, s, resourceGroupName, vmssModel, vm) + } + operation, err := s.GetAzure().VMSS.BeginCreateOrUpdate( ctx, resourceGroupName, s.Runtime.VMSSName, - createVMSSModel(ctx, s), + vmssModel, nil, ) if err != nil { @@ -495,15 +504,6 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc return vm, fmt.Errorf("failed to wait for VMSS VM: %w", err) } - // Apply VM instance tags via lightweight PATCH before CSE queries wireserver. - // This is needed for features like RCV1P where wireserver checks tags on the - // individual VM instance, not the VMSS resource-level tags. 
- if len(s.Config.VMInstanceTags) > 0 { - if err := patchVMInstanceTags(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID, s.Config.VMInstanceTags); err != nil { - return vm, fmt.Errorf("failed to patch VM instance tags: %w", err) - } - } - vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) if err != nil { return vm, fmt.Errorf("failed to get VM private IP address: %w", err) @@ -550,61 +550,124 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc }, nil } -// patchVMInstanceTags sends a lightweight PATCH request to update tags on a VMSS VM instance. -// This is much faster than BeginUpdate (which triggers a full model update) because it only -// modifies the tags property. The PATCH typically completes in seconds rather than minutes. -func patchVMInstanceTags(ctx context.Context, s *Scenario, resourceGroupName, vmssName, instanceID string, tags map[string]*string) error { - defer toolkit.LogStepCtxf(ctx, "patching VM instance %s/%s tags", vmssName, instanceID)() +// createVMSSWithProfileTags creates a VMSS using a raw ARM PUT request, injecting tags into +// virtualMachineProfile that the Go SDK doesn't expose for Uniform mode VMSS. This is needed +// for features like RCV1P where wireserver checks VM instance-level tags: the tags must be +// present at VMSS creation time so they propagate to VM instances before CSE runs. 
+func createVMSSWithProfileTags(ctx context.Context, s *Scenario, resourceGroupName string, vmssModel armcompute.VirtualMachineScaleSet, vm *ScenarioVM) (*ScenarioVM, error) { + defer toolkit.LogStepCtxf(ctx, "creating VMSS %s with VM profile tags", s.Runtime.VMSSName)() + + // Marshal the typed model to a generic map so we can inject virtualMachineProfile.tags + vmssJSON, err := json.Marshal(vmssModel) + if err != nil { + return vm, fmt.Errorf("failed to marshal VMSS model: %w", err) + } + var vmssMap map[string]interface{} + if err := json.Unmarshal(vmssJSON, &vmssMap); err != nil { + return vm, fmt.Errorf("failed to unmarshal VMSS model to map: %w", err) + } + + // Inject tags into properties.virtualMachineProfile + props, ok := vmssMap["properties"].(map[string]interface{}) + if !ok { + return vm, fmt.Errorf("VMSS model missing 'properties' field") + } + vmProfile, ok := props["virtualMachineProfile"].(map[string]interface{}) + if !ok { + return vm, fmt.Errorf("VMSS model missing 'properties.virtualMachineProfile' field") + } + vmProfile["tags"] = s.Config.VMInstanceTags + s.T.Logf("injected VM profile tags: %v", s.Config.VMInstanceTags) + + // Re-marshal the modified model + modifiedBody, err := json.Marshal(vmssMap) + if err != nil { + return vm, fmt.Errorf("failed to marshal modified VMSS model: %w", err) + } + + // Build the ARM resource URL subscriptionID := s.SubscriptionID if subscriptionID == "" { subscriptionID = config.Config.SubscriptionID } + resourceURL := fmt.Sprintf("https://management.azure.com/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/virtualMachineScaleSets/%s?api-version=2025-04-01", + subscriptionID, resourceGroupName, s.Runtime.VMSSName) - resourceURL := fmt.Sprintf("https://management.azure.com/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/virtualMachineScaleSets/%s/virtualMachines/%s?api-version=2025-04-01", - subscriptionID, resourceGroupName, vmssName, instanceID) - - body := struct { - Tags 
map[string]*string `json:"tags"` - }{Tags: tags} + // Send raw PUT request via the SDK pipeline (includes auth, retry, logging) + req, err := azruntime.NewRequest(ctx, "PUT", resourceURL) + if err != nil { + return vm, fmt.Errorf("failed to create ARM request: %w", err) + } + req.Raw().Header.Set("Content-Type", "application/json") + if err := req.SetBody(streaming.NopCloser(bytes.NewReader(modifiedBody)), "application/json"); err != nil { + return vm, fmt.Errorf("failed to set request body: %w", err) + } - bodyJSON, err := json.Marshal(body) + resp, err := s.GetAzure().Core.Pipeline().Do(req) if err != nil { - return fmt.Errorf("failed to marshal tag patch body: %w", err) + return vm, fmt.Errorf("failed to send VMSS creation request: %w", err) + } + if resp.StatusCode != 200 && resp.StatusCode != 201 { + body, _ := io.ReadAll(resp.Body) + resp.Body.Close() + return vm, fmt.Errorf("VMSS creation failed with status %d: %s", resp.StatusCode, string(body)) } - req, err := azruntime.NewRequest(ctx, "PATCH", resourceURL) + // Create a poller for the async operation + poller, err := azruntime.NewPoller[armcompute.VirtualMachineScaleSetsClientCreateOrUpdateResponse](resp, s.GetAzure().Core.Pipeline(), nil) if err != nil { - return fmt.Errorf("failed to create PATCH request: %w", err) + return vm, fmt.Errorf("failed to create VMSS creation poller: %w", err) } - if err := req.SetBody(streaming.NopCloser(bytes.NewReader(bodyJSON)), "application/json"); err != nil { - return fmt.Errorf("failed to set request body: %w", err) + + // Wait for VMSS VM to appear before extracting the private IP + vm.VM, err = waitForVMSSVM(ctx, s) + if err != nil { + return vm, fmt.Errorf("failed to wait for VMSS VM: %w", err) } - resp, err := s.GetAzure().Core.Pipeline().Do(req) + vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) if err != nil { - return fmt.Errorf("failed to send PATCH request: %w", err) + return vm, fmt.Errorf("failed 
to get VM private IP address: %w", err) } - if resp.StatusCode != 200 && resp.StatusCode != 202 { - respBody, _ := io.ReadAll(resp.Body) - resp.Body.Close() - return fmt.Errorf("PATCH VM instance tags failed with status %d: %s", resp.StatusCode, string(respBody)) + s.T.Cleanup(func() { + defer cleanupBastionTunnel(vm.SSHClient) + cleanupVMSS(ctx, s, vm) + }) + + result := "SSH Instructions: (may take a few minutes for the VM to be ready for SSH)\n========================\n" + if config.Config.KeepVMSS { + s.T.Logf("VM will be preserved after the test finishes, PLEASE MANUALLY DELETE THE VMSS. Set KEEP_VMSS=false to delete it automatically after the test finishes\n") + } else { + s.T.Logf("VM will be automatically deleted after the test finishes, to preserve it for debugging purposes set KEEP_VMSS=true or pause the test with a breakpoint before the test finishes or failed\n") } + result += fmt.Sprintf(`az network bastion ssh --target-resource-id "%s" --name "%s-bastion" --resource-group %s --auth-type ssh-key --username azureuser --ssh-key %s`, *vm.VM.ID, *s.Runtime.Cluster.Model.Name, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, config.VMSSHPrivateKeyFileName) + "\n" + s.T.Log(result) - // If 202 Accepted, poll until complete - if resp.StatusCode == 202 { - poller, err := azruntime.NewPoller[struct{}](resp, s.GetAzure().Core.Pipeline(), nil) - if err != nil { - return fmt.Errorf("failed to create poller for tag PATCH: %w", err) - } - _, err = poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) - if err != nil { - return fmt.Errorf("failed to complete tag PATCH: %w", err) + vmssResp, err := poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) + if !s.Config.SkipSSHConnectivityValidation { + var bastErr error + vm.SSHClient, bastErr = DialSSHOverBastion(ctx, s.Runtime.Cluster.Bastion, vm.PrivateIP, config.VMSSHPrivateKey) + if bastErr != nil { + return vm, fmt.Errorf("failed to start bastion tunnel: %w", bastErr) } } + if err != nil { + 
return vm, err + } - return nil + err = waitForVMRunningState(ctx, s, vm.VM) + if err != nil { + return vm, fmt.Errorf("failed to wait for VM to reach running state: %w", err) + } + + return &ScenarioVM{ + VMSS: &vmssResp.VirtualMachineScaleSet, + PrivateIP: vm.PrivateIP, + VM: vm.VM, + SSHClient: vm.SSHClient, + }, nil } // waitForVMRunningState polls until the VM reaches "Running" power state or the timeout elapses. From 3907efb9efc6445fb106e0660c324496a857962e Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 16 Apr 2026 11:29:58 -0700 Subject: [PATCH 23/70] e2e: use Microsoft.Resources/tags API for VM instance tag patching For Uniform mode VMSS, VM instance tags cannot be set at creation time: - The Go SDK (armcompute v7.x) does not expose a Tags field on VirtualMachineScaleSetVMProfile. - The ARM API rejects virtualMachineProfile.tags for Uniform mode VMSS with: 'Could not find member tags on object of type VirtualMachineProfile'. - PATCH on the Compute VM instance endpoint returns 405 Method Not Allowed. - BeginUpdate (PUT) works but takes ~108s for a full VM model reconciliation, causing a race condition: CSE runs init-aks-custom-cloud.sh and queries wireserver before the tag update completes. Use the Microsoft.Resources/tags API instead, which provides a lightweight PATCH endpoint (/{resourceId}/providers/Microsoft.Resources/tags/default) that updates only tags without triggering a full VM update. The Merge operation adds tags without replacing existing ones. Also moves s.T.Cleanup() registration to immediately after waitForVMSSVM() so the VMSS is always cleaned up even if tag patching or subsequent steps fail, preventing orphaned VMSS resources. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/vmss.go | 162 ++++++++++++++++------------------------------------ 1 file changed, 50 insertions(+), 112 deletions(-) diff --git a/e2e/vmss.go b/e2e/vmss.go index bf0229060fe..65455e0fd77 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -478,20 +478,11 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc defer toolkit.LogStepCtxf(ctx, "creating VMSS %s", s.Runtime.VMSSName)() vm := &ScenarioVM{} - vmssModel := createVMSSModel(ctx, s) - - // When VMInstanceTags are configured, we need to inject tags into - // virtualMachineProfile which the Go SDK doesn't expose for Uniform mode VMSS. - // We marshal the model to JSON, inject the tags, and send a raw ARM PUT request. - if len(s.Config.VMInstanceTags) > 0 { - return createVMSSWithProfileTags(ctx, s, resourceGroupName, vmssModel, vm) - } - operation, err := s.GetAzure().VMSS.BeginCreateOrUpdate( ctx, resourceGroupName, s.Runtime.VMSSName, - vmssModel, + createVMSSModel(ctx, s), nil, ) if err != nil { @@ -504,16 +495,27 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc return vm, fmt.Errorf("failed to wait for VMSS VM: %w", err) } - vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) - if err != nil { - return vm, fmt.Errorf("failed to get VM private IP address: %w", err) - } - + // Register cleanup early so the VMSS is always deleted even if subsequent steps + // (tag patching, IP lookup, etc.) fail — preventing orphaned VMSS resources. s.T.Cleanup(func() { defer cleanupBastionTunnel(vm.SSHClient) cleanupVMSS(ctx, s, vm) }) + // Apply VM instance tags via the Microsoft.Resources/tags API before CSE queries + // wireserver. This is needed for features like RCV1P where wireserver checks tags + // on the individual VM instance, not the VMSS resource-level tags. 
+ if len(s.Config.VMInstanceTags) > 0 { + if err := patchVMInstanceTags(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID, s.Config.VMInstanceTags); err != nil { + return vm, fmt.Errorf("failed to patch VM instance tags: %w", err) + } + } + + vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) + if err != nil { + return vm, fmt.Errorf("failed to get VM private IP address: %w", err) + } + result := "SSH Instructions: (may take a few minutes for the VM to be ready for SSH)\n========================\n" if config.Config.KeepVMSS { s.T.Logf("VM will be preserved after the test finishes, PLEASE MANUALLY DELETE THE VMSS. Set KEEP_VMSS=false to delete it automatically after the test finishes\n") @@ -550,126 +552,62 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc }, nil } -// createVMSSWithProfileTags creates a VMSS using a raw ARM PUT request, injecting tags into -// virtualMachineProfile that the Go SDK doesn't expose for Uniform mode VMSS. This is needed -// for features like RCV1P where wireserver checks VM instance-level tags: the tags must be -// present at VMSS creation time so they propagate to VM instances before CSE runs. 
-func createVMSSWithProfileTags(ctx context.Context, s *Scenario, resourceGroupName string, vmssModel armcompute.VirtualMachineScaleSet, vm *ScenarioVM) (*ScenarioVM, error) { - defer toolkit.LogStepCtxf(ctx, "creating VMSS %s with VM profile tags", s.Runtime.VMSSName)() - - // Marshal the typed model to a generic map so we can inject virtualMachineProfile.tags - vmssJSON, err := json.Marshal(vmssModel) - if err != nil { - return vm, fmt.Errorf("failed to marshal VMSS model: %w", err) - } - - var vmssMap map[string]interface{} - if err := json.Unmarshal(vmssJSON, &vmssMap); err != nil { - return vm, fmt.Errorf("failed to unmarshal VMSS model to map: %w", err) - } - - // Inject tags into properties.virtualMachineProfile - props, ok := vmssMap["properties"].(map[string]interface{}) - if !ok { - return vm, fmt.Errorf("VMSS model missing 'properties' field") - } - vmProfile, ok := props["virtualMachineProfile"].(map[string]interface{}) - if !ok { - return vm, fmt.Errorf("VMSS model missing 'properties.virtualMachineProfile' field") - } - vmProfile["tags"] = s.Config.VMInstanceTags - s.T.Logf("injected VM profile tags: %v", s.Config.VMInstanceTags) - - // Re-marshal the modified model - modifiedBody, err := json.Marshal(vmssMap) - if err != nil { - return vm, fmt.Errorf("failed to marshal modified VMSS model: %w", err) - } +// patchVMInstanceTags uses the Microsoft.Resources/tags API to merge tags onto a VMSS VM +// instance. This is a lightweight PATCH that only modifies tags without triggering a full +// VM model update, completing in seconds rather than the ~108s that BeginUpdate takes. 
+func patchVMInstanceTags(ctx context.Context, s *Scenario, resourceGroupName, vmssName, instanceID string, tags map[string]*string) error { + defer toolkit.LogStepCtxf(ctx, "patching VM instance %s/%s tags via Resources API", vmssName, instanceID)() - // Build the ARM resource URL subscriptionID := s.SubscriptionID if subscriptionID == "" { subscriptionID = config.Config.SubscriptionID } - resourceURL := fmt.Sprintf("https://management.azure.com/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/virtualMachineScaleSets/%s?api-version=2025-04-01", - subscriptionID, resourceGroupName, s.Runtime.VMSSName) - // Send raw PUT request via the SDK pipeline (includes auth, retry, logging) - req, err := azruntime.NewRequest(ctx, "PUT", resourceURL) - if err != nil { - return vm, fmt.Errorf("failed to create ARM request: %w", err) - } - req.Raw().Header.Set("Content-Type", "application/json") - if err := req.SetBody(streaming.NopCloser(bytes.NewReader(modifiedBody)), "application/json"); err != nil { - return vm, fmt.Errorf("failed to set request body: %w", err) - } + // The Microsoft.Resources/tags API allows lightweight tag updates on any Azure resource. + // Using "Merge" operation to add/update tags without replacing existing ones. 
+ resourceURL := fmt.Sprintf( + "https://management.azure.com/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/virtualMachineScaleSets/%s/virtualMachines/%s/providers/Microsoft.Resources/tags/default?api-version=2021-04-01", + subscriptionID, resourceGroupName, vmssName, instanceID, + ) - resp, err := s.GetAzure().Core.Pipeline().Do(req) - if err != nil { - return vm, fmt.Errorf("failed to send VMSS creation request: %w", err) - } - if resp.StatusCode != 200 && resp.StatusCode != 201 { - body, _ := io.ReadAll(resp.Body) - resp.Body.Close() - return vm, fmt.Errorf("VMSS creation failed with status %d: %s", resp.StatusCode, string(body)) + body := struct { + Operation string `json:"operation"` + Properties struct { + Tags map[string]*string `json:"tags"` + } `json:"properties"` + }{ + Operation: "Merge", } + body.Properties.Tags = tags - // Create a poller for the async operation - poller, err := azruntime.NewPoller[armcompute.VirtualMachineScaleSetsClientCreateOrUpdateResponse](resp, s.GetAzure().Core.Pipeline(), nil) + bodyJSON, err := json.Marshal(body) if err != nil { - return vm, fmt.Errorf("failed to create VMSS creation poller: %w", err) + return fmt.Errorf("failed to marshal tag patch body: %w", err) } - // Wait for VMSS VM to appear before extracting the private IP - vm.VM, err = waitForVMSSVM(ctx, s) + req, err := azruntime.NewRequest(ctx, "PATCH", resourceURL) if err != nil { - return vm, fmt.Errorf("failed to wait for VMSS VM: %w", err) + return fmt.Errorf("failed to create PATCH request: %w", err) } - - vm.PrivateIP, err = getPrivateIPFromVMSSVM(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID) - if err != nil { - return vm, fmt.Errorf("failed to get VM private IP address: %w", err) - } - - s.T.Cleanup(func() { - defer cleanupBastionTunnel(vm.SSHClient) - cleanupVMSS(ctx, s, vm) - }) - - result := "SSH Instructions: (may take a few minutes for the VM to be ready for SSH)\n========================\n" - if 
config.Config.KeepVMSS { - s.T.Logf("VM will be preserved after the test finishes, PLEASE MANUALLY DELETE THE VMSS. Set KEEP_VMSS=false to delete it automatically after the test finishes\n") - } else { - s.T.Logf("VM will be automatically deleted after the test finishes, to preserve it for debugging purposes set KEEP_VMSS=true or pause the test with a breakpoint before the test finishes or failed\n") + if err := req.SetBody(streaming.NopCloser(bytes.NewReader(bodyJSON)), "application/json"); err != nil { + return fmt.Errorf("failed to set request body: %w", err) } - result += fmt.Sprintf(`az network bastion ssh --target-resource-id "%s" --name "%s-bastion" --resource-group %s --auth-type ssh-key --username azureuser --ssh-key %s`, *vm.VM.ID, *s.Runtime.Cluster.Model.Name, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, config.VMSSHPrivateKeyFileName) + "\n" - s.T.Log(result) - vmssResp, err := poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) - if !s.Config.SkipSSHConnectivityValidation { - var bastErr error - vm.SSHClient, bastErr = DialSSHOverBastion(ctx, s.Runtime.Cluster.Bastion, vm.PrivateIP, config.VMSSHPrivateKey) - if bastErr != nil { - return vm, fmt.Errorf("failed to start bastion tunnel: %w", bastErr) - } - } + resp, err := s.GetAzure().Core.Pipeline().Do(req) if err != nil { - return vm, err + return fmt.Errorf("failed to send tag PATCH request: %w", err) } - err = waitForVMRunningState(ctx, s, vm.VM) - if err != nil { - return vm, fmt.Errorf("failed to wait for VM to reach running state: %w", err) + if resp.StatusCode != 200 { + respBody, _ := io.ReadAll(resp.Body) + resp.Body.Close() + return fmt.Errorf("tag PATCH failed with status %d: %s", resp.StatusCode, string(respBody)) } - return &ScenarioVM{ - VMSS: &vmssResp.VirtualMachineScaleSet, - PrivateIP: vm.PrivateIP, - VM: vm.VM, - SSHClient: vm.SSHClient, - }, nil + return nil } + // waitForVMRunningState polls until the VM reaches "Running" power state or the timeout elapses. 
func waitForVMRunningState(ctx context.Context, s *Scenario, vmssVM *armcompute.VirtualMachineScaleSetVM) error { ctxTimeout, cancel := context.WithTimeout(ctx, 3*time.Minute) From 2e8b81136e5afe585def841e2206e3f1a674baec Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 16 Apr 2026 15:02:02 -0700 Subject: [PATCH 24/70] e2e: use BeginUpdate + deferred CSE for VM instance tagging Replace the Microsoft.Resources/tags API approach (which returns 405 on Uniform VMSS VM instances) with BeginUpdate (full PUT) + deferred CSE. For scenarios requiring VM instance tags (e.g., RCV1P): 1. Create VMSS without CSE extension profile 2. Wait for VMSS creation to complete 3. Apply tags via VMSSVM.BeginUpdate (~108s full PUT) 4. Re-add CSE extension via a second BeginCreateOrUpdate This ensures wireserver sees the per-VM-instance tags before CSE queries it. The delay is acceptable for E2E validation; production would use a different approach (e.g., AKS RP sets tags pre-boot). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/vmss.go | 120 +++++++++++++++++++++++++++++----------------------- 1 file changed, 67 insertions(+), 53 deletions(-) diff --git a/e2e/vmss.go b/e2e/vmss.go index 65455e0fd77..56eb5a091f8 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -24,8 +24,6 @@ import ( "github.com/Azure/agentbaker/pkg/agent" "github.com/Azure/agentbaker/pkg/agent/datamodel" "github.com/Azure/azure-sdk-for-go/sdk/azcore" - azruntime "github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime" - "github.com/Azure/azure-sdk-for-go/sdk/azcore/streaming" "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7" "github.com/stretchr/testify/require" @@ -478,11 +476,25 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc defer toolkit.LogStepCtxf(ctx, "creating VMSS %s", s.Runtime.VMSSName)() vm := &ScenarioVM{} + model := 
createVMSSModel(ctx, s) + + // For scenarios that need VM instance tags (e.g., RCV1P), we must apply tags + // before CSE runs because wireserver checks per-VM-instance tags. The only + // working method for Uniform VMSS is BeginUpdate (full PUT), which takes ~108s. + // To avoid the race, we strip the CSE extension before creation, apply tags + // via BeginUpdate, then re-add the extension in a second update. + var deferredExtensionProfile *armcompute.VirtualMachineScaleSetExtensionProfile + if len(s.Config.VMInstanceTags) > 0 && model.Properties.VirtualMachineProfile.ExtensionProfile != nil { + deferredExtensionProfile = model.Properties.VirtualMachineProfile.ExtensionProfile + model.Properties.VirtualMachineProfile.ExtensionProfile = nil + toolkit.Logf(ctx, "deferring CSE extension until VM instance tags are applied") + } + operation, err := s.GetAzure().VMSS.BeginCreateOrUpdate( ctx, resourceGroupName, s.Runtime.VMSSName, - createVMSSModel(ctx, s), + model, nil, ) if err != nil { @@ -496,18 +508,45 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc } // Register cleanup early so the VMSS is always deleted even if subsequent steps - // (tag patching, IP lookup, etc.) fail — preventing orphaned VMSS resources. + // (tag update, IP lookup, etc.) fail — preventing orphaned VMSS resources. s.T.Cleanup(func() { defer cleanupBastionTunnel(vm.SSHClient) cleanupVMSS(ctx, s, vm) }) - // Apply VM instance tags via the Microsoft.Resources/tags API before CSE queries - // wireserver. This is needed for features like RCV1P where wireserver checks tags - // on the individual VM instance, not the VMSS resource-level tags. + // Wait for initial VMSS creation to fully complete before applying tags. + vmssResp, err := operation.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) + if err != nil { + return vm, fmt.Errorf("failed to create VMSS: %w", err) + } + + // Apply VM instance tags via BeginUpdate (full PUT) and then re-add CSE. 
+ // This is needed for features like RCV1P where wireserver checks tags on + // the individual VM instance, not the VMSS resource-level tags. if len(s.Config.VMInstanceTags) > 0 { - if err := patchVMInstanceTags(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID, s.Config.VMInstanceTags); err != nil { - return vm, fmt.Errorf("failed to patch VM instance tags: %w", err) + if err := updateVMInstanceTags(ctx, s, resourceGroupName, s.Runtime.VMSSName, *vm.VM.InstanceID, s.Config.VMInstanceTags); err != nil { + return vm, fmt.Errorf("failed to update VM instance tags: %w", err) + } + + // Re-add CSE extension now that tags are in place. + if deferredExtensionProfile != nil { + toolkit.Logf(ctx, "re-adding CSE extension after tags are applied") + vmssResp.VirtualMachineScaleSet.Properties.VirtualMachineProfile.ExtensionProfile = deferredExtensionProfile + cseOp, err := s.GetAzure().VMSS.BeginCreateOrUpdate( + ctx, + resourceGroupName, + s.Runtime.VMSSName, + vmssResp.VirtualMachineScaleSet, + nil, + ) + if err != nil { + return vm, fmt.Errorf("failed to begin adding CSE extension: %w", err) + } + vmssResp2, err := cseOp.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) + if err != nil { + return vm, fmt.Errorf("failed to add CSE extension: %w", err) + } + vmssResp = vmssResp2 } } @@ -526,7 +565,6 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc result += fmt.Sprintf(`az network bastion ssh --target-resource-id "%s" --name "%s-bastion" --resource-group %s --auth-type ssh-key --username azureuser --ssh-key %s`, *vm.VM.ID, *s.Runtime.Cluster.Model.Name, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, config.VMSSHPrivateKeyFileName) + "\n" s.T.Log(result) - vmssResp, err := operation.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) if !s.Config.SkipSSHConnectivityValidation { var bastErr error vm.SSHClient, bastErr = DialSSHOverBastion(ctx, s.Runtime.Cluster.Bastion, vm.PrivateIP, config.VMSSHPrivateKey) @@ 
-534,9 +572,6 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc return vm, fmt.Errorf("failed to start bastion tunnel: %w", bastErr) } } - if err != nil { - return vm, err - } // Wait for VM to be in "Running" power state before proceeding err = waitForVMRunningState(ctx, s, vm.VM) @@ -552,56 +587,35 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc }, nil } -// patchVMInstanceTags uses the Microsoft.Resources/tags API to merge tags onto a VMSS VM -// instance. This is a lightweight PATCH that only modifies tags without triggering a full -// VM model update, completing in seconds rather than the ~108s that BeginUpdate takes. -func patchVMInstanceTags(ctx context.Context, s *Scenario, resourceGroupName, vmssName, instanceID string, tags map[string]*string) error { - defer toolkit.LogStepCtxf(ctx, "patching VM instance %s/%s tags via Resources API", vmssName, instanceID)() - - subscriptionID := s.SubscriptionID - if subscriptionID == "" { - subscriptionID = config.Config.SubscriptionID - } - - // The Microsoft.Resources/tags API allows lightweight tag updates on any Azure resource. - // Using "Merge" operation to add/update tags without replacing existing ones. - resourceURL := fmt.Sprintf( - "https://management.azure.com/subscriptions/%s/resourceGroups/%s/providers/Microsoft.Compute/virtualMachineScaleSets/%s/virtualMachines/%s/providers/Microsoft.Resources/tags/default?api-version=2021-04-01", - subscriptionID, resourceGroupName, vmssName, instanceID, - ) - - body := struct { - Operation string `json:"operation"` - Properties struct { - Tags map[string]*string `json:"tags"` - } `json:"properties"` - }{ - Operation: "Merge", - } - body.Properties.Tags = tags +// updateVMInstanceTags uses BeginUpdate (full PUT) to set tags on a VMSS VM instance. +// This is the only method that works for Uniform mode VMSS — PATCH and Microsoft.Resources/tags +// API both return 405 at this scope. 
The operation takes ~108s as it triggers full VM model +// reconciliation. This is acceptable for E2E tests where we defer CSE until tags are in place. +func updateVMInstanceTags(ctx context.Context, s *Scenario, resourceGroupName, vmssName, instanceID string, tags map[string]*string) error { + defer toolkit.LogStepCtxf(ctx, "updating VM instance %s/%s/%s tags via BeginUpdate", resourceGroupName, vmssName, instanceID)() - bodyJSON, err := json.Marshal(body) + // Get current VM instance to preserve existing state + currentVM, err := s.GetAzure().VMSSVM.Get(ctx, resourceGroupName, vmssName, instanceID, &armcompute.VirtualMachineScaleSetVMsClientGetOptions{}) if err != nil { - return fmt.Errorf("failed to marshal tag patch body: %w", err) + return fmt.Errorf("failed to get current VM instance: %w", err) } - req, err := azruntime.NewRequest(ctx, "PATCH", resourceURL) - if err != nil { - return fmt.Errorf("failed to create PATCH request: %w", err) + // Merge new tags with any existing tags + if currentVM.Tags == nil { + currentVM.Tags = make(map[string]*string) } - if err := req.SetBody(streaming.NopCloser(bytes.NewReader(bodyJSON)), "application/json"); err != nil { - return fmt.Errorf("failed to set request body: %w", err) + for k, v := range tags { + currentVM.Tags[k] = v } - resp, err := s.GetAzure().Core.Pipeline().Do(req) + poller, err := s.GetAzure().VMSSVM.BeginUpdate(ctx, resourceGroupName, vmssName, instanceID, currentVM.VirtualMachineScaleSetVM, nil) if err != nil { - return fmt.Errorf("failed to send tag PATCH request: %w", err) + return fmt.Errorf("failed to begin VM instance tag update: %w", err) } - if resp.StatusCode != 200 { - respBody, _ := io.ReadAll(resp.Body) - resp.Body.Close() - return fmt.Errorf("tag PATCH failed with status %d: %s", resp.StatusCode, string(respBody)) + _, err = poller.PollUntilDone(ctx, config.DefaultPollUntilDoneOptions) + if err != nil { + return fmt.Errorf("failed to complete VM instance tag update: %w", err) } return nil 
From 77245c4d200c50ca5fe0205822c3059ca0f174b3 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 16 Apr 2026 19:42:21 -0700 Subject: [PATCH 25/70] e2e: add feature flag check for RCV1P subscription Verify Microsoft.Compute/PlatformSettingsOverride is registered on the RCV1P subscription before running tests. This fails fast with a clear error if the feature flag is missing, rather than letting tests run and fail with opaque wireserver responses. The check runs once per test run via sync.Once. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/scenario_rcv1p_test.go | 56 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/e2e/scenario_rcv1p_test.go b/e2e/scenario_rcv1p_test.go index 0bb927798ae..eed7cf43ffd 100644 --- a/e2e/scenario_rcv1p_test.go +++ b/e2e/scenario_rcv1p_test.go @@ -19,11 +19,15 @@ package e2e import ( "context" + "fmt" + "io" "strings" + "sync" "testing" "github.com/Azure/agentbaker/e2e/config" "github.com/Azure/agentbaker/pkg/agent/datamodel" + azruntime "github.com/Azure/azure-sdk-for-go/sdk/azcore/runtime" "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7" ) @@ -36,12 +40,64 @@ const rcv1pOptInTag = "platformsettings.host_environment.service.platform_optedi // skipIfRCV1PNotConfigured skips the test when the RCV1P subscription is not configured. // This happens in regular CI runs where the RCV1P variable group is not linked, causing // Azure DevOps to pass the literal unexpanded string "$(RCV1P_SUBSCRIPTION_ID)". +// It also verifies the Microsoft.Compute/PlatformSettingsOverride feature flag is registered. 
func skipIfRCV1PNotConfigured(t *testing.T) { t.Helper() subID := config.Config.RCV1PSubscriptionID if subID == "" || strings.HasPrefix(subID, "$(") { t.Skip("RCV1P_SUBSCRIPTION_ID not set or not resolved, skipping RCV1P cert mode test") } + checkPlatformSettingsOverrideFeatureFlag(t, subID) +} + +var ( + featureFlagCheckOnce sync.Once + featureFlagCheckResult error +) + +// checkPlatformSettingsOverrideFeatureFlag verifies the Microsoft.Compute/PlatformSettingsOverride +// feature flag is registered on the given subscription. This is a prerequisite for wireserver to +// serve root certificates. The check runs only once per test run. +func checkPlatformSettingsOverrideFeatureFlag(t *testing.T, subscriptionID string) { + t.Helper() + featureFlagCheckOnce.Do(func() { + featureFlagCheckResult = verifyFeatureFlag(t.Context(), subscriptionID) + }) + if featureFlagCheckResult != nil { + t.Fatalf("RCV1P feature flag check failed: %v", featureFlagCheckResult) + } +} + +func verifyFeatureFlag(ctx context.Context, subscriptionID string) error { + url := fmt.Sprintf( + "https://management.azure.com/subscriptions/%s/providers/Microsoft.Features/providers/Microsoft.Compute/features/PlatformSettingsOverride?api-version=2021-07-01", + subscriptionID, + ) + + req, err := azruntime.NewRequest(ctx, "GET", url) + if err != nil { + return fmt.Errorf("failed to create feature flag request: %w", err) + } + + resp, err := config.RCV1PAzure.Core.Pipeline().Do(req) + if err != nil { + return fmt.Errorf("failed to query feature flag: %w", err) + } + defer resp.Body.Close() + + body, _ := io.ReadAll(resp.Body) + bodyStr := string(body) + + if resp.StatusCode != 200 { + return fmt.Errorf("feature flag query returned status %d: %s", resp.StatusCode, bodyStr) + } + + if !strings.Contains(bodyStr, `"Registered"`) { + return fmt.Errorf("Microsoft.Compute/PlatformSettingsOverride is NOT registered on subscription %s (response: %s); "+ + "wireserver will not serve root certificates without this 
feature flag", subscriptionID, bodyStr) + } + + return nil } // rcv1pOptInVMConfigMutator sets the platform opt-in tag on the VMSS resource level. From d88d517b3f15234f088358fba16cf2191be3ce65 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 16 Apr 2026 19:46:34 -0700 Subject: [PATCH 26/70] REVERT ME: poll wireserver IsOptedInForRootCerts with retry loop Experimental commit to validate whether wireserver detects VM instance tags applied via BeginUpdate after VM creation. Polls for up to ~5 minutes (30x10s). Wireserver reads IsOptedInForRootCerts from the Fabric Controller goal state (CCF/ContainerConfig), NOT directly from ARM tags. The flow is: BeginUpdate -> ARM model update -> FC generates new CCF with platformsettings.host_environment.service.platform_optedin_for_rootcerts -> FC pushes CCF to host agent -> wireserver reflects new state. FC goal state propagation can take several minutes, so the polling window is set to ~5 minutes to give adequate time for detection. Logs the full wireserver response on each attempt for diagnostics. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- .../artifacts/init-aks-custom-cloud.sh | 42 ++++++++++++++----- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index 862c2f09b6c..c63e0bc5df9 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -65,20 +65,40 @@ function make_request_with_retry { function is_opted_in_for_root_certs { local opt_in_response + local request_status + local poll_attempt=1 + local max_poll_attempts=30 + local poll_interval=10 + + # Poll wireserver for up to ~5 minutes to allow platform metadata to sync. 
+ # The VM instance tag triggers a Fabric Controller goal state (CCF) update, + # which must propagate to the host agent before wireserver can reflect it. + # FC goal state propagation can take several minutes in practice. + while [ $poll_attempt -le $max_poll_attempts ]; do + echo "is_opted_in_for_root_certs: poll attempt ${poll_attempt}/${max_poll_attempts}" + + opt_in_response=$(make_request_with_retry "${WIRESERVER_ENDPOINT}/acms/isOptedInForRootCerts") + request_status=$? + + echo "is_opted_in_for_root_certs: wireserver response (status=${request_status}): '${opt_in_response}'" + + if [ $request_status -ne 0 ] || [ -z "$opt_in_response" ]; then + echo "Warning: failed to determine IsOptedInForRootCerts state on attempt ${poll_attempt}" + elif echo "$opt_in_response" | grep -q "IsOptedInForRootCerts=true"; then + echo "IsOptedInForRootCerts=true (found on attempt ${poll_attempt})" + return 0 + fi - opt_in_response=$(make_request_with_retry "${WIRESERVER_ENDPOINT}/acms/isOptedInForRootCerts") - local request_status=$? - if [ $request_status -ne 0 ] || [ -z "$opt_in_response" ]; then - echo "Warning: failed to determine IsOptedInForRootCerts state" - return 1 - fi + if [ $poll_attempt -lt $max_poll_attempts ]; then + echo "is_opted_in_for_root_certs: not opted in yet, waiting ${poll_interval}s before retry..." 
+ sleep $poll_interval + fi - if echo "$opt_in_response" | grep -q "IsOptedInForRootCerts=true"; then - echo "IsOptedInForRootCerts=true" - return 0 - fi + poll_attempt=$((poll_attempt + 1)) + done - echo "Skipping custom cloud root cert installation because IsOptedInForRootCerts is not true" + echo "Skipping custom cloud root cert installation because IsOptedInForRootCerts is not true after ${max_poll_attempts} attempts" + echo "Last wireserver response: '${opt_in_response}'" return 1 } From 2ad18c707af9741fff6731951586570041bd0123 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Fri, 17 Apr 2026 14:15:41 -0700 Subject: [PATCH 27/70] e2e: always log PlatformSettingsOverride feature flag status Log the feature flag status on the default E2E subscription for diagnostics in every RCV1P test, even when RCV1P_SUBSCRIPTION_ID is not set. This helps diagnose wireserver IsOptedInForRootCerts behavior across subscriptions. The feature flag check is now per-subscription (cached via sync.Map) and accepts a failIfMissing parameter: true for RCV1P tests (fail if not registered), false for diagnostics (log only). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/scenario_rcv1p_test.go | 76 ++++++++++++++++++++++++++------------ 1 file changed, 53 insertions(+), 23 deletions(-) diff --git a/e2e/scenario_rcv1p_test.go b/e2e/scenario_rcv1p_test.go index eed7cf43ffd..72a010cfe4b 100644 --- a/e2e/scenario_rcv1p_test.go +++ b/e2e/scenario_rcv1p_test.go @@ -40,35 +40,70 @@ const rcv1pOptInTag = "platformsettings.host_environment.service.platform_optedi // skipIfRCV1PNotConfigured skips the test when the RCV1P subscription is not configured. // This happens in regular CI runs where the RCV1P variable group is not linked, causing // Azure DevOps to pass the literal unexpanded string "$(RCV1P_SUBSCRIPTION_ID)". -// It also verifies the Microsoft.Compute/PlatformSettingsOverride feature flag is registered. 
+// It always logs the feature flag status on the E2E subscription for diagnostics, +// and verifies the flag is registered on the RCV1P subscription when available. func skipIfRCV1PNotConfigured(t *testing.T) { t.Helper() + // Always log feature flag status on the default E2E subscription for diagnostics + logE2ESubscriptionFeatureFlag(t) + subID := config.Config.RCV1PSubscriptionID if subID == "" || strings.HasPrefix(subID, "$(") { t.Skip("RCV1P_SUBSCRIPTION_ID not set or not resolved, skipping RCV1P cert mode test") } - checkPlatformSettingsOverrideFeatureFlag(t, subID) + checkPlatformSettingsOverrideFeatureFlag(t, subID, config.RCV1PAzure, true) } var ( - featureFlagCheckOnce sync.Once - featureFlagCheckResult error + featureFlagChecks sync.Map // subscriptionID -> *featureFlagResult ) -// checkPlatformSettingsOverrideFeatureFlag verifies the Microsoft.Compute/PlatformSettingsOverride -// feature flag is registered on the given subscription. This is a prerequisite for wireserver to -// serve root certificates. The check runs only once per test run. -func checkPlatformSettingsOverrideFeatureFlag(t *testing.T, subscriptionID string) { +type featureFlagResult struct { + once sync.Once + registered bool + err error +} + +// checkPlatformSettingsOverrideFeatureFlag checks the Microsoft.Compute/PlatformSettingsOverride +// feature flag on the given subscription. When failIfMissing is true (RCV1P tests), the test +// fails if the flag is not registered. When false (diagnostics), it only logs the result. 
+func checkPlatformSettingsOverrideFeatureFlag(t *testing.T, subscriptionID string, client *config.AzureClient, failIfMissing bool) { t.Helper() - featureFlagCheckOnce.Do(func() { - featureFlagCheckResult = verifyFeatureFlag(t.Context(), subscriptionID) + val, _ := featureFlagChecks.LoadOrStore(subscriptionID, &featureFlagResult{}) + result := val.(*featureFlagResult) + result.once.Do(func() { + result.registered, result.err = queryFeatureFlag(t.Context(), subscriptionID, client) }) - if featureFlagCheckResult != nil { - t.Fatalf("RCV1P feature flag check failed: %v", featureFlagCheckResult) + + if result.err != nil { + t.Logf("PlatformSettingsOverride feature flag check on subscription %s: error: %v", subscriptionID, result.err) + if failIfMissing { + t.Fatalf("RCV1P feature flag check failed: %v", result.err) + } + return + } + + t.Logf("PlatformSettingsOverride feature flag on subscription %s: registered=%v", subscriptionID, result.registered) + if failIfMissing && !result.registered { + t.Fatalf("Microsoft.Compute/PlatformSettingsOverride is NOT registered on subscription %s; "+ + "wireserver will not serve root certificates without this feature flag", subscriptionID) } } -func verifyFeatureFlag(ctx context.Context, subscriptionID string) error { +// logE2ESubscriptionFeatureFlag logs the PlatformSettingsOverride feature flag status on the +// default E2E subscription for diagnostic purposes. This helps understand wireserver behavior +// (e.g., IsOptedInForRootCerts responses) even in non-RCV1P test runs. 
+func logE2ESubscriptionFeatureFlag(t *testing.T) { + t.Helper() + e2eAzure, err := config.NewAzureClient() + if err != nil { + t.Logf("WARNING: failed to create E2E Azure client for feature flag check: %v", err) + return + } + checkPlatformSettingsOverrideFeatureFlag(t, config.Config.SubscriptionID, e2eAzure, false) +} + +func queryFeatureFlag(ctx context.Context, subscriptionID string, client *config.AzureClient) (bool, error) { url := fmt.Sprintf( "https://management.azure.com/subscriptions/%s/providers/Microsoft.Features/providers/Microsoft.Compute/features/PlatformSettingsOverride?api-version=2021-07-01", subscriptionID, @@ -76,12 +111,12 @@ func verifyFeatureFlag(ctx context.Context, subscriptionID string) error { req, err := azruntime.NewRequest(ctx, "GET", url) if err != nil { - return fmt.Errorf("failed to create feature flag request: %w", err) + return false, fmt.Errorf("failed to create feature flag request: %w", err) } - resp, err := config.RCV1PAzure.Core.Pipeline().Do(req) + resp, err := client.Core.Pipeline().Do(req) if err != nil { - return fmt.Errorf("failed to query feature flag: %w", err) + return false, fmt.Errorf("failed to query feature flag: %w", err) } defer resp.Body.Close() @@ -89,15 +124,10 @@ func verifyFeatureFlag(ctx context.Context, subscriptionID string) error { bodyStr := string(body) if resp.StatusCode != 200 { - return fmt.Errorf("feature flag query returned status %d: %s", resp.StatusCode, bodyStr) - } - - if !strings.Contains(bodyStr, `"Registered"`) { - return fmt.Errorf("Microsoft.Compute/PlatformSettingsOverride is NOT registered on subscription %s (response: %s); "+ - "wireserver will not serve root certificates without this feature flag", subscriptionID, bodyStr) + return false, fmt.Errorf("feature flag query returned status %d: %s", resp.StatusCode, bodyStr) } - return nil + return strings.Contains(bodyStr, `"Registered"`), nil } // rcv1pOptInVMConfigMutator sets the platform opt-in tag on the VMSS resource level. 
From d6a151edcfe2f368f943e3a5aaab954e238387b8 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Fri, 17 Apr 2026 14:29:39 -0700 Subject: [PATCH 28/70] fix(windows): parse wireserver IsOptedInForRootCerts JSON with ConvertFrom-Json Same bug as Linux: wireserver returns JSON {"IsOptedInForRootCerts":true} but the script used -match "IsOptedInForRootCerts=true" (equals sign). Parse with ConvertFrom-Json and check the boolean property directly. Also add Write-Log for the wireserver response for diagnostics. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- staging/cse/windows/kubernetesfunc.ps1 | 8 ++++++-- staging/cse/windows/kubernetesfunc.tests.ps1 | 4 ++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/staging/cse/windows/kubernetesfunc.ps1 b/staging/cse/windows/kubernetesfunc.ps1 index 159161153f0..0efc6e557c2 100644 --- a/staging/cse/windows/kubernetesfunc.ps1 +++ b/staging/cse/windows/kubernetesfunc.ps1 @@ -310,7 +310,9 @@ function Should-InstallCACertificatesRefreshTask { try { $optInUri = 'http://168.63.129.16/acms/isOptedInForRootCerts' $optInResponse = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$optInUri; UseBasicParsing=$true} -Retries 5 -RetryDelaySeconds 10 - return ($optInResponse.Content -match 'IsOptedInForRootCerts=true') + Write-Log "IsOptedInForRootCerts wireserver response: $($optInResponse.Content)" + $optInJson = $optInResponse.Content | ConvertFrom-Json + return ($optInJson.IsOptedInForRootCerts -eq $true) } catch { Write-Log "Skipping CA refresh task registration because IsOptedInForRootCerts could not be determined: $_" return $false @@ -363,7 +365,9 @@ function Get-CACertificates { $optInUri = 'http://168.63.129.16/acms/isOptedInForRootCerts' $optInResponse = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$optInUri; UseBasicParsing=$true} -Retries 5 -RetryDelaySeconds 10 - if (($optInResponse.Content -notmatch 'IsOptedInForRootCerts=true')) { + 
Write-Log "IsOptedInForRootCerts wireserver response: $($optInResponse.Content)" + $optInJson = $optInResponse.Content | ConvertFrom-Json + if ($optInJson.IsOptedInForRootCerts -ne $true) { Write-Log "Skipping custom cloud root cert installation because IsOptedInForRootCerts is not true" return $false } diff --git a/staging/cse/windows/kubernetesfunc.tests.ps1 b/staging/cse/windows/kubernetesfunc.tests.ps1 index 42accc39c51..924ccf13fc5 100644 --- a/staging/cse/windows/kubernetesfunc.tests.ps1 +++ b/staging/cse/windows/kubernetesfunc.tests.ps1 @@ -132,7 +132,7 @@ Describe 'Should-InstallCACertificatesRefreshTask' { Mock Retry-Command -MockWith { param($Command, $Args, $Retries, $RetryDelaySeconds) $script:lastRetryUri = $PSBoundParameters['Args'].Uri - return [PSCustomObject]@{ Content = 'IsOptedInForRootCerts=true' } + return [PSCustomObject]@{ Content = '{"IsOptedInForRootCerts":true}' } } $result = Should-InstallCACertificatesRefreshTask -Location 'southcentralus' @@ -144,7 +144,7 @@ Describe 'Should-InstallCACertificatesRefreshTask' { It 'returns false for rcv1p regions when opt-in is disabled' { Mock Retry-Command -MockWith { - return [PSCustomObject]@{ Content = 'IsOptedInForRootCerts=false' } + return [PSCustomObject]@{ Content = '{"IsOptedInForRootCerts":false}' } } $result = Should-InstallCACertificatesRefreshTask -Location 'southcentralus' From eaaac9be3bb341fceffaf9b6d96d180c8ae916b2 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Fri, 17 Apr 2026 21:47:42 -0700 Subject: [PATCH 29/70] e2e: make RCV1P_SUBSCRIPTION_ID optional with feature flag auto-detection When RCV1P_SUBSCRIPTION_ID is not set, tests now check the default E2E subscription for PlatformSettingsOverride feature flag registration and use it for positive RCV1P tests if available (platform auto-injects the opt-in tag on those subscriptions). Negative tests (NotOptedIn) require RCV1P_SUBSCRIPTION_ID explicitly, since the platform may auto-inject the opt-in tag on the default sub. 
Helpers rcv1pAzureClient(), rcv1pSubscriptionID(), and rcv1pCluster() centralize the subscription/client/cluster selection logic. All Linux and Windows positive tests use these helpers for consistent behavior. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/validators.go | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/e2e/validators.go b/e2e/validators.go index 4773497d934..29ec1f0ba19 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -2896,6 +2896,10 @@ func rcv1pTrustStoreDir(s *Scenario) string { func ValidateRCV1PCertModeWindows(ctx context.Context, s *Scenario) { s.T.Helper() + // Validate the provisioning log shows wireserver was queried and returned opted-in + ValidateFileHasContent(ctx, s, "C:\\AzureData\\CustomDataSetupScript.log", + "IsOptedInForRootCerts wireserver response:") + // Validate CA certificates were installed to the Windows certificate store command := []string{ "$ErrorActionPreference = 'Stop'", @@ -2948,6 +2952,14 @@ func ValidateRCV1PNotOptedIn(ctx context.Context, s *Scenario) { func ValidateRCV1PNotOptedInWindows(ctx context.Context, s *Scenario) { s.T.Helper() + // Validate the provisioning log shows wireserver was queried + ValidateFileHasContent(ctx, s, "C:\\AzureData\\CustomDataSetupScript.log", + "IsOptedInForRootCerts wireserver response:") + + // Validate wireserver reported not opted in + ValidateFileHasContent(ctx, s, "C:\\AzureData\\CustomDataSetupScript.log", + "Skipping custom cloud root cert installation because IsOptedInForRootCerts is not true") + // Validate C:\ca is empty or does not exist command := []string{ "$ErrorActionPreference = 'Stop'", From e85af60c388b621db80b1d73891f523827eb30de Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Sat, 18 Apr 2026 01:56:44 -0700 Subject: [PATCH 30/70] e2e: always collect Windows CSE logs (not just on failure) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Removes the s.T.Failed() guard on extractLogsFromVMWindows so full CSE logs (CustomDataSetupScript.log, kubelet, containerd, network config) are always uploaded to blob storage, even on success. This is a temporary debug commit — revert after investigation. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/vmss.go | 6 +--- .../artifacts/init-aks-custom-cloud.sh | 29 +++++++++++++------ 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/e2e/vmss.go b/e2e/vmss.go index 56eb5a091f8..faff65e4366 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -939,11 +939,7 @@ hnsdiag list endpoints >> network_config.txt // extractLogsFromVMWindows runs a script on windows VM to collect logs and upload them to a blob storage // it then lists the blobs in the container and prints the content of each blob func extractLogsFromVMWindows(ctx context.Context, s *Scenario) { - if !s.T.Failed() { - s.T.Logf("skipping logs extraction from windows VM, as the test didn't fail") - return - } - + // Always collect Windows logs for debugging (revert this to restore failure-only collection) ctx, cancel := context.WithTimeout(ctx, 4*time.Minute) defer cancel() pager := s.GetAzure().VMSSVM.NewListPager(*s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, nil) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index c63e0bc5df9..f189c433457 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -42,24 +42,35 @@ function make_request_with_retry { local attempt=1 local response + local http_code + local curl_output while [ $attempt -le $max_retries ]; do - response=$(curl -f --no-progress-meter --connect-timeout 10 --max-time 30 "$url") - local request_status=$? 
- - if echo "$response" | grep -q "RequestRateLimitExceeded"; then + # capture response body + HTTP status code; -w appends the code after the body. + # curl stderr (connection errors) flows to the script's log naturally. + # http_code is 000 when wireserver is unreachable (connection refused/timeout). + curl_output=$(curl --no-progress-meter --connect-timeout 10 --max-time 30 -w '\n%{http_code}' "$url") || true + http_code=$(echo "$curl_output" | tail -1) + response=$(echo "$curl_output" | sed '$d') + + if echo "$response" | grep -q "RequestRateLimitExceeded" && [ "$http_code" = "403" ]; then + echo "wireserver rate limited (HTTP ${http_code}) on attempt ${attempt}/${max_retries}: ${url}" >&2 sleep $retry_delay retry_delay=$((retry_delay * 2)) attempt=$((attempt + 1)) - elif [ $request_status -ne 0 ]; then - sleep $retry_delay - attempt=$((attempt + 1)) - else + elif [ "$http_code" -ge 200 ] 2>/dev/null && [ "$http_code" -lt 300 ] 2>/dev/null; then echo "$response" return 0 + else + echo "wireserver request failed (HTTP ${http_code}) on attempt ${attempt}/${max_retries}: ${url}" >&2 + if [ -n "$response" ]; then + echo "wireserver error response: ${response}" >&2 + fi + sleep $retry_delay + attempt=$((attempt + 1)) fi done - echo "exhausted all retries, last response: $response" + echo "exhausted all retries for ${url} (last HTTP ${http_code}), last response: $response" >&2 return 1 } From 395766a8615b1fba832d4773b13cbfa1a8ccfe52 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Sun, 19 Apr 2026 12:45:40 -0700 Subject: [PATCH 31/70] fix: add wireserver HTTP error diagnostic logging for cert endpoints Log HTTP status codes and error response bodies when wireserver requests fail during RCV1P cert installation. This enables diagnosing specific failure modes (403 rate limit, 404 not delivered, 503 busy, 504 gateway timeout, 000 unreachable) without guessing from generic error messages. 
Linux (init-aks-custom-cloud.sh): - Replace curl -f with -w to capture HTTP status codes - Log attempt number, HTTP code, URL, and error body on each retry - Rate limit detection now requires both HTTP 403 and body match - All diagnostic output goes to stderr (not stdout) to avoid contaminating function return values Windows (kubernetesfunc.ps1): - Extract HTTP status code from exception Response object - Read and log wireserver error response body when available - Applied to both Should-InstallCACertificatesRefreshTask and Get-CACertificates catch blocks Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- staging/cse/windows/kubernetesfunc.ps1 | 32 +++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/staging/cse/windows/kubernetesfunc.ps1 b/staging/cse/windows/kubernetesfunc.ps1 index 0efc6e557c2..5ddb667dd99 100644 --- a/staging/cse/windows/kubernetesfunc.ps1 +++ b/staging/cse/windows/kubernetesfunc.ps1 @@ -314,7 +314,20 @@ function Should-InstallCACertificatesRefreshTask { $optInJson = $optInResponse.Content | ConvertFrom-Json return ($optInJson.IsOptedInForRootCerts -eq $true) } catch { - Write-Log "Skipping CA refresh task registration because IsOptedInForRootCerts could not be determined: $_" + $statusCode = "N/A" + $responseBody = "" + if ($_.Exception -and $_.Exception.Response) { + $statusCode = [int]$_.Exception.Response.StatusCode + try { + $stream = $_.Exception.Response.GetResponseStream() + $reader = New-Object System.IO.StreamReader($stream) + $responseBody = $reader.ReadToEnd() + } catch { } + } + Write-Log "Skipping CA refresh task registration because IsOptedInForRootCerts could not be determined (HTTP $statusCode): $_" + if ($responseBody) { + Write-Log "Wireserver error response body: $responseBody" + } return $false } } @@ -415,10 +428,23 @@ function Get-CACertificates { return $downloadedAny } catch { + $statusCode = "N/A" + $responseBody = "" + if 
($_.Exception -and $_.Exception.Response) { + $statusCode = [int]$_.Exception.Response.StatusCode + try { + $stream = $_.Exception.Response.GetResponseStream() + $reader = New-Object System.IO.StreamReader($stream) + $responseBody = $reader.ReadToEnd() + } catch { } + } + if ($responseBody) { + Write-Log "Wireserver error response body: $responseBody" + } if ($FailOnError) { - throw "Failed to retrieve CA certificates. Error: $_" + throw "Failed to retrieve CA certificates (HTTP $statusCode). Error: $_" } - Write-Log "Warning: failed to retrieve CA certificates. Error: $_" + Write-Log "Warning: failed to retrieve CA certificates (HTTP $statusCode). Error: $_" return $false } } From 6116cc510cbd18275267a1346c74bccc290218e7 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Sun, 19 Apr 2026 19:07:45 -0700 Subject: [PATCH 32/70] e2e: use testDir() for Windows CSE output log path consistency Replace filepath.Join("scenario-logs", s.T.Name()) with testDir(s.T) in getCustomScriptExtensionStatus to match the pattern used everywhere else in the e2e suite. Ensures Windows CSE output logs are written to the same directory as other scenario logs. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/test_helpers.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/e2e/test_helpers.go b/e2e/test_helpers.go index c4e7ae1c630..7eada56c231 100644 --- a/e2e/test_helpers.go +++ b/e2e/test_helpers.go @@ -470,7 +470,7 @@ func getCustomScriptExtensionStatus(s *Scenario, vmssVM *armcompute.VirtualMachi if s.IsWindows() { // Save the CSE output for Windows VMs for better troubleshooting if status.Message != nil { - logDir := filepath.Join("scenario-logs", s.T.Name()) + logDir := testDir(s.T) if err := os.MkdirAll(logDir, 0755); err == nil { logFile := filepath.Join(logDir, "windows-cse-output.log") err = os.WriteFile(logFile, []byte(*status.Message), 0644) From d71b6cd6fef2f852ef900c04ca8f3c91f5a3c223 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Tue, 21 Apr 2026 10:43:43 -0700 Subject: [PATCH 33/70] fix(e2e): filter CSE extension to fix empty Windows CSE log files The getCustomScriptExtensionStatus function iterated over all VM extensions without filtering by name. On Windows VMs with multiple extensions (e.g., ManagedIdentity + CustomScriptExtension), it could process a non-CSE extension first, write its empty status.Message to windows-cse-output.log, and return before reaching the actual CSE. 
Fix: - Filter extensions by name (vmssCSE, customscript, aksnode) - Skip empty messages to avoid overwriting with zero-byte files - Log byte count for diagnostics Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/test_helpers.go | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/e2e/test_helpers.go b/e2e/test_helpers.go index 7eada56c231..8ec512b5289 100644 --- a/e2e/test_helpers.go +++ b/e2e/test_helpers.go @@ -466,10 +466,26 @@ func validateVM(ctx context.Context, s *Scenario) { func getCustomScriptExtensionStatus(s *Scenario, vmssVM *armcompute.VirtualMachineScaleSetVM) error { for _, extension := range vmssVM.Properties.InstanceView.Extensions { + // Only process the CSE extension, skip other extensions (e.g., ManagedIdentity) + // whose empty status messages would overwrite the actual CSE output file. + // The extension name in InstanceView is typically "vmssCSE" (matching the resource name) + // but may also appear as the handler type. Match on known CSE identifiers. + if extension.Name == nil { + continue + } + name := strings.ToLower(*extension.Name) + isCSE := name == "vmsscse" || + strings.Contains(name, "customscript") || + strings.Contains(name, "aksnode") + if !isCSE { + continue + } for _, status := range extension.Statuses { if s.IsWindows() { - // Save the CSE output for Windows VMs for better troubleshooting - if status.Message != nil { + // Save the CSE output for Windows VMs for better troubleshooting. + // Only write when the message has actual content to avoid overwriting + // with an empty file from a status entry that has no output. 
+ if status.Message != nil && *status.Message != "" { logDir := testDir(s.T) if err := os.MkdirAll(logDir, 0755); err == nil { logFile := filepath.Join(logDir, "windows-cse-output.log") @@ -477,7 +493,7 @@ func getCustomScriptExtensionStatus(s *Scenario, vmssVM *armcompute.VirtualMachi if err != nil { s.T.Logf("failed to save Windows CSE output to %s: %v", logFile, err) } else { - s.T.Logf("saved Windows CSE output to %s", logFile) + s.T.Logf("saved Windows CSE output to %s (%d bytes)", logFile, len(*status.Message)) } } } From 0c1587d1b5b55f61f7454a73a917806cd115b9e4 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Tue, 21 Apr 2026 15:01:38 -0700 Subject: [PATCH 34/70] fix(e2e): re-fetch VM instance view for fresh CSE extension status The VM object passed to getCustomScriptExtensionStatus may have been fetched by waitForVMRunningState before the CSE extension finished executing, resulting in empty extension status messages. This caused windows-cse-output.log to not be written even though the CSE succeeded. Fix by re-fetching the VM with instance view expand directly in getCustomScriptExtensionStatus to ensure we get the latest extension status data including the CSE output message. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/test_helpers.go | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/e2e/test_helpers.go b/e2e/test_helpers.go index 8ec512b5289..bac2366c3a6 100644 --- a/e2e/test_helpers.go +++ b/e2e/test_helpers.go @@ -465,6 +465,25 @@ func validateVM(ctx context.Context, s *Scenario) { } func getCustomScriptExtensionStatus(s *Scenario, vmssVM *armcompute.VirtualMachineScaleSetVM) error { + // Re-fetch the VM with instance view to ensure we have fresh extension status data. + // The VM object passed in may have been fetched before the CSE finished executing, + // so the extension status message could be empty or stale. 
+ if vmssVM.InstanceID != nil { + ctx := context.Background() + freshVM, err := s.GetAzure().VMSSVM.Get(ctx, + *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, + s.Runtime.VMSSName, + *vmssVM.InstanceID, + &armcompute.VirtualMachineScaleSetVMsClientGetOptions{ + Expand: to.Ptr(armcompute.InstanceViewTypesInstanceView), + }) + if err == nil && freshVM.Properties != nil && freshVM.Properties.InstanceView != nil { + vmssVM.Properties.InstanceView = freshVM.Properties.InstanceView + } else if err != nil { + s.T.Logf("warning: failed to re-fetch VM instance view for CSE status: %v", err) + } + } + for _, extension := range vmssVM.Properties.InstanceView.Extensions { // Only process the CSE extension, skip other extensions (e.g., ManagedIdentity) // whose empty status messages would overwrite the actual CSE output file. From 2c02745012020e263e7a1b8c2730c3b90cfb0f86 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Tue, 21 Apr 2026 15:13:11 -0700 Subject: [PATCH 35/70] e2e: trim whitespace from RCV1P_SUBSCRIPTION_ID to fix gating When RCV1P_SUBSCRIPTION_ID resolves to whitespace (e.g. ' ') from an unconfigured ADO pipeline variable, hasExplicitRCV1PSubscription() incorrectly returns true because ' ' != ''. This causes the feature flag API call with an empty/whitespace subscription ID, returning 404 and t.Fatalf instead of gracefully skipping via t.Skip. Fix by applying strings.TrimSpace() in hasExplicitRCV1PSubscription(), rcv1pSubscriptionID(), and the config init() guard. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/config/config.go | 5 +++-- e2e/scenario_rcv1p_test.go | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/e2e/config/config.go b/e2e/config/config.go index bd3f9c677c2..a2b41ecdae0 100644 --- a/e2e/config/config.go +++ b/e2e/config/config.go @@ -183,8 +183,9 @@ func mustLoadConfig() *Configuration { } func init() { - if Config.RCV1PSubscriptionID != "" && !strings.HasPrefix(Config.RCV1PSubscriptionID, "$(") { - client, err := NewAzureClientForSubscription(Config.RCV1PSubscriptionID) + rcv1pSubID := strings.TrimSpace(Config.RCV1PSubscriptionID) + if rcv1pSubID != "" && !strings.HasPrefix(rcv1pSubID, "$(") { + client, err := NewAzureClientForSubscription(rcv1pSubID) if err != nil { panic(fmt.Sprintf("failed to create RCV1P Azure client: %v", err)) } diff --git a/e2e/scenario_rcv1p_test.go b/e2e/scenario_rcv1p_test.go index 72a010cfe4b..4504ad976b6 100644 --- a/e2e/scenario_rcv1p_test.go +++ b/e2e/scenario_rcv1p_test.go @@ -47,7 +47,7 @@ func skipIfRCV1PNotConfigured(t *testing.T) { // Always log feature flag status on the default E2E subscription for diagnostics logE2ESubscriptionFeatureFlag(t) - subID := config.Config.RCV1PSubscriptionID + subID := strings.TrimSpace(config.Config.RCV1PSubscriptionID) if subID == "" || strings.HasPrefix(subID, "$(") { t.Skip("RCV1P_SUBSCRIPTION_ID not set or not resolved, skipping RCV1P cert mode test") } From 0c5e3e1b663d5ff1c925572ae1797a51b066db47 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 22 Apr 2026 11:29:03 -0700 Subject: [PATCH 36/70] e2e: add gen2 Windows RCV1P tests, fix Windows2025 TrustedLaunch Add gen2 variants for all 3 Windows RCV1P tests so they run in gen2 pipeline jobs (windows-2022-containerd-gen2, windows-23H2-gen2, windows-2025-gen2) which previously skipped all RCV1P tests. 
Fix Test_RCV1P_Windows2025 which incorrectly added TrustedLaunch to a non-gen2 VHD (VHDWindows2025 has UnsupportedGen2: true), causing BadRequest errors. Removed TrustedLaunch from non-gen2 test; the new gen2 variant (Test_RCV1P_Windows2025Gen2) uses VHDWindows2025Gen2 which supports TrustedLaunch natively. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/scenario_rcv1p_win_test.go | 79 ++++++++++++++++++++++++++++++++-- 1 file changed, 76 insertions(+), 3 deletions(-) diff --git a/e2e/scenario_rcv1p_win_test.go b/e2e/scenario_rcv1p_win_test.go index 0932ae5f97b..c05b0607358 100644 --- a/e2e/scenario_rcv1p_win_test.go +++ b/e2e/scenario_rcv1p_win_test.go @@ -14,7 +14,6 @@ import ( "github.com/Azure/agentbaker/e2e/config" "github.com/Azure/agentbaker/pkg/agent/datamodel" - "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7" ) // Test_RCV1P_Windows2022 validates RCV1P cert download and Windows certificate store @@ -64,8 +63,7 @@ func Test_RCV1P_Windows23H2(t *testing.T) { }) } -// Test_RCV1P_Windows2025 validates RCV1P on Windows Server 2025. This SKU requires -// Trusted Launch, so the VMConfigMutator combines both TrustedLaunch and opt-in tag settings. +// Test_RCV1P_Windows2025 validates RCV1P on Windows Server 2025 (non-gen2). 
func Test_RCV1P_Windows2025(t *testing.T) { skipIfRCV1PNotConfigured(t) RunScenario(t, &Scenario{ @@ -82,6 +80,81 @@ func Test_RCV1P_Windows2025(t *testing.T) { vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) rcv1pOptInVMConfigMutator(vmss) }, + VMInstanceTags: rcv1pVMInstanceTags(), + BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + Windows2025BootstrapConfigMutator(t, nbc) + }, + Validator: func(ctx context.Context, s *Scenario) { + ValidateRCV1PCertModeWindows(ctx, s) + }, + }, + }) +} + +// Test_RCV1P_Windows2022Gen2 validates RCV1P cert download and Windows certificate store +// installation on Windows Server 2022 Gen2. Covers the gen2 pipeline job. +func Test_RCV1P_Windows2022Gen2(t *testing.T) { + skipIfRCV1PNotConfigured(t) + RunScenario(t, &Scenario{ + Description: "Tests RCV1P cert mode on Windows Server 2022 Gen2 with VM opt-in tag", + AzureClient: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + Tags: Tags{ + RCV1PCertMode: true, + }, + Config: Config{ + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDWindows2022ContainerdGen2, + VMConfigMutator: rcv1pOptInVMConfigMutator, + VMInstanceTags: rcv1pVMInstanceTags(), + BootstrapConfigMutator: EmptyBootstrapConfigMutator, + Validator: func(ctx context.Context, s *Scenario) { + ValidateRCV1PCertModeWindows(ctx, s) + }, + }, + }) +} + +// Test_RCV1P_Windows23H2Gen2 validates RCV1P on Windows Server 23H2 Gen2. Covers the gen2 pipeline job. 
+func Test_RCV1P_Windows23H2Gen2(t *testing.T) { + skipIfRCV1PNotConfigured(t) + RunScenario(t, &Scenario{ + Description: "Tests RCV1P cert mode on Windows Server 23H2 Gen2 with VM opt-in tag", + AzureClient: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + Tags: Tags{ + RCV1PCertMode: true, + }, + Config: Config{ + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDWindows23H2Gen2, + VMConfigMutator: rcv1pOptInVMConfigMutator, + VMInstanceTags: rcv1pVMInstanceTags(), + BootstrapConfigMutator: EmptyBootstrapConfigMutator, + Validator: func(ctx context.Context, s *Scenario) { + ValidateRCV1PCertModeWindows(ctx, s) + }, + }, + }) +} + +// Test_RCV1P_Windows2025Gen2 validates RCV1P on Windows Server 2025 Gen2. Covers the gen2 pipeline job. +func Test_RCV1P_Windows2025Gen2(t *testing.T) { + skipIfRCV1PNotConfigured(t) + RunScenario(t, &Scenario{ + Description: "Tests RCV1P cert mode on Windows Server 2025 Gen2 with VM opt-in tag", + AzureClient: config.RCV1PAzure, + SubscriptionID: config.Config.RCV1PSubscriptionID, + Tags: Tags{ + RCV1PCertMode: true, + }, + Config: Config{ + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDWindows2025Gen2, + VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { + vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) + rcv1pOptInVMConfigMutator(vmss) + }, VMInstanceTags: rcv1pVMInstanceTags(), BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { Windows2025BootstrapConfigMutator(t, nbc) From 69a5d1871ae25b449ab94c43a5caf37c46be2d88 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 22 Apr 2026 13:12:09 -0700 Subject: [PATCH 37/70] e2e: switch RCV1P tests to Azure CNI Overlay to fix IP exhaustion Windows nodes use azure-vnet plugin even in kubenet clusters, reserving multiple subnet IPs per node. Many parallel RCV1P tests sharing the same subnet causes 'No available addresses' failures at pod scheduling. 
Switch from kubenet to Azure CNI Overlay, which uses a separate virtual pod CIDR (10.244.0.0/16), eliminating subnet IP exhaustion. This is easily revertible: change ClusterRCV1POverlay -> ClusterRCV1PKubenet and ClusterAzureOverlayNetwork -> ClusterKubenet in rcv1pCluster(). Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/cache.go | 12 ++++++++++++ e2e/scenario_rcv1p_test.go | 12 ++++++------ e2e/scenario_rcv1p_win_test.go | 14 +++++++------- 3 files changed, 25 insertions(+), 13 deletions(-) diff --git a/e2e/cache.go b/e2e/cache.go index 777acdaf559..e8358db7770 100644 --- a/e2e/cache.go +++ b/e2e/cache.go @@ -214,6 +214,18 @@ func clusterRCV1PKubenet(ctx context.Context, request ClusterRequest) (*Cluster, return prepareCluster(ctx, infra, getKubenetClusterModel("abe2e-rcv1p-kubenet-v1", request.Location, request.K8sSystemPoolSKU), false, false) } +var ClusterRCV1POverlay = cachedFunc(clusterRCV1POverlay) + +// clusterRCV1POverlay creates an Azure CNI Overlay cluster in the RCV1P subscription. +// Overlay avoids subnet IP exhaustion for Windows tests by using a virtual pod CIDR.
+func clusterRCV1POverlay(ctx context.Context, request ClusterRequest) (*Cluster, error) { + infra := RCV1PClusterInfra() + if infra == nil { + return nil, fmt.Errorf("RCV1P_SUBSCRIPTION_ID not set, cannot create RCV1P overlay cluster") + } + return prepareCluster(ctx, infra, getAzureOverlayNetworkClusterModel("abe2e-rcv1p-overlay-v1", request.Location, request.K8sSystemPoolSKU), false, false) +} + // isNotFoundErr checks if an error represents a "not found" response from Azure API func isNotFoundErr(err error) bool { var respErr *azcore.ResponseError diff --git a/e2e/scenario_rcv1p_test.go b/e2e/scenario_rcv1p_test.go index 4504ad976b6..fca1c23fc1a 100644 --- a/e2e/scenario_rcv1p_test.go +++ b/e2e/scenario_rcv1p_test.go @@ -161,7 +161,7 @@ func Test_RCV1P_Ubuntu2204(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, + Cluster: ClusterRCV1POverlay, VHD: config.VHDUbuntu2204Gen2Containerd, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -187,7 +187,7 @@ func Test_RCV1P_Ubuntu2404(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, + Cluster: ClusterRCV1POverlay, VHD: config.VHDUbuntu2404Gen2Containerd, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -213,7 +213,7 @@ func Test_RCV1P_AzureLinuxV3(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, + Cluster: ClusterRCV1POverlay, VHD: config.VHDAzureLinuxV3Gen2, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -239,7 +239,7 @@ func Test_RCV1P_Flatcar(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, + Cluster: ClusterRCV1POverlay, VHD: config.VHDFlatcarGen2, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -265,7 +265,7 @@ func Test_RCV1P_ACL(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: 
ClusterRCV1PKubenet, + Cluster: ClusterRCV1POverlay, VHD: config.VHDACLGen2TL, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) @@ -298,7 +298,7 @@ func Test_RCV1P_NotOptedIn(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, + Cluster: ClusterRCV1POverlay, VHD: config.VHDUbuntu2204Gen2Containerd, BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { }, diff --git a/e2e/scenario_rcv1p_win_test.go b/e2e/scenario_rcv1p_win_test.go index c05b0607358..08f7da43d9c 100644 --- a/e2e/scenario_rcv1p_win_test.go +++ b/e2e/scenario_rcv1p_win_test.go @@ -28,7 +28,7 @@ func Test_RCV1P_Windows2022(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, + Cluster: ClusterRCV1POverlay, VHD: config.VHDWindows2022Containerd, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -51,7 +51,7 @@ func Test_RCV1P_Windows23H2(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, + Cluster: ClusterRCV1POverlay, VHD: config.VHDWindows23H2, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -74,7 +74,7 @@ func Test_RCV1P_Windows2025(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, + Cluster: ClusterRCV1POverlay, VHD: config.VHDWindows2025, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) @@ -103,7 +103,7 @@ func Test_RCV1P_Windows2022Gen2(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, + Cluster: ClusterRCV1POverlay, VHD: config.VHDWindows2022ContainerdGen2, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -126,7 +126,7 @@ func Test_RCV1P_Windows23H2Gen2(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, + 
Cluster: ClusterRCV1POverlay, VHD: config.VHDWindows23H2Gen2, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -149,7 +149,7 @@ func Test_RCV1P_Windows2025Gen2(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, + Cluster: ClusterRCV1POverlay, VHD: config.VHDWindows2025Gen2, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) @@ -181,7 +181,7 @@ func Test_RCV1P_Windows_NotOptedIn(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, + Cluster: ClusterRCV1POverlay, VHD: config.VHDWindows2022Containerd, BootstrapConfigMutator: EmptyBootstrapConfigMutator, Validator: func(ctx context.Context, s *Scenario) { From 8df18ae950a953b4dfb8e6b6f3bca84becc80c61 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 22 Apr 2026 16:00:59 -0700 Subject: [PATCH 38/70] e2e: revert RCV1P from overlay back to kubenet Overlay doesn't work for E2E Windows nodes because the E2E framework adds nodes via VMSS outside AKS's node pool flow, so overlay pod CIDR assignments from the control plane don't reach the manually-added nodes. The azure-vnet plugin reports 'no available address pools'. Kubenet IP exhaustion is intermittent and sometimes succeeds; overlay fails consistently in this E2E setup. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/cache.go | 12 ------------ e2e/scenario_rcv1p_test.go | 12 ++++++------ e2e/scenario_rcv1p_win_test.go | 14 +++++++------- 3 files changed, 13 insertions(+), 25 deletions(-) diff --git a/e2e/cache.go b/e2e/cache.go index e8358db7770..777acdaf559 100644 --- a/e2e/cache.go +++ b/e2e/cache.go @@ -214,18 +214,6 @@ func clusterRCV1PKubenet(ctx context.Context, request ClusterRequest) (*Cluster, return prepareCluster(ctx, infra, getKubenetClusterModel("abe2e-rcv1p-kubenet-v1", request.Location, request.K8sSystemPoolSKU), false, false) } -var ClusterRCV1POverlay = cachedFunc(clusterRCV1POverlay) - -// clusterRCV1POverlay creates an Azure CNI Overlay cluster in the RCV1P subscription. -// Overlay avoids subnet IP exhaustion for Windows tests by using a virtual pod CIDR. -func clusterRCV1POverlay(ctx context.Context, request ClusterRequest) (*Cluster, error) { - infra := RCV1PClusterInfra() - if infra == nil { - return nil, fmt.Errorf("RCV1P_SUBSCRIPTION_ID not set, cannot create RCV1P overlay cluster") - } - return prepareCluster(ctx, infra, getAzureOverlayNetworkClusterModel("abe2e-rcv1p-overlay-v1", request.Location, request.K8sSystemPoolSKU), false, false) -} - // isNotFoundErr checks if an error represents a "not found" response from Azure API func isNotFoundErr(err error) bool { var respErr *azcore.ResponseError diff --git a/e2e/scenario_rcv1p_test.go b/e2e/scenario_rcv1p_test.go index fca1c23fc1a..4504ad976b6 100644 --- a/e2e/scenario_rcv1p_test.go +++ b/e2e/scenario_rcv1p_test.go @@ -161,7 +161,7 @@ func Test_RCV1P_Ubuntu2204(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1POverlay, + Cluster: ClusterRCV1PKubenet, VHD: config.VHDUbuntu2204Gen2Containerd, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -187,7 +187,7 @@ func Test_RCV1P_Ubuntu2404(t *testing.T) { RCV1PCertMode: true, }, 
Config: Config{ - Cluster: ClusterRCV1POverlay, + Cluster: ClusterRCV1PKubenet, VHD: config.VHDUbuntu2404Gen2Containerd, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -213,7 +213,7 @@ func Test_RCV1P_AzureLinuxV3(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1POverlay, + Cluster: ClusterRCV1PKubenet, VHD: config.VHDAzureLinuxV3Gen2, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -239,7 +239,7 @@ func Test_RCV1P_Flatcar(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1POverlay, + Cluster: ClusterRCV1PKubenet, VHD: config.VHDFlatcarGen2, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -265,7 +265,7 @@ func Test_RCV1P_ACL(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1POverlay, + Cluster: ClusterRCV1PKubenet, VHD: config.VHDACLGen2TL, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) @@ -298,7 +298,7 @@ func Test_RCV1P_NotOptedIn(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1POverlay, + Cluster: ClusterRCV1PKubenet, VHD: config.VHDUbuntu2204Gen2Containerd, BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { }, diff --git a/e2e/scenario_rcv1p_win_test.go b/e2e/scenario_rcv1p_win_test.go index 08f7da43d9c..c05b0607358 100644 --- a/e2e/scenario_rcv1p_win_test.go +++ b/e2e/scenario_rcv1p_win_test.go @@ -28,7 +28,7 @@ func Test_RCV1P_Windows2022(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1POverlay, + Cluster: ClusterRCV1PKubenet, VHD: config.VHDWindows2022Containerd, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -51,7 +51,7 @@ func Test_RCV1P_Windows23H2(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1POverlay, + Cluster: 
ClusterRCV1PKubenet, VHD: config.VHDWindows23H2, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -74,7 +74,7 @@ func Test_RCV1P_Windows2025(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1POverlay, + Cluster: ClusterRCV1PKubenet, VHD: config.VHDWindows2025, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) @@ -103,7 +103,7 @@ func Test_RCV1P_Windows2022Gen2(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1POverlay, + Cluster: ClusterRCV1PKubenet, VHD: config.VHDWindows2022ContainerdGen2, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -126,7 +126,7 @@ func Test_RCV1P_Windows23H2Gen2(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1POverlay, + Cluster: ClusterRCV1PKubenet, VHD: config.VHDWindows23H2Gen2, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -149,7 +149,7 @@ func Test_RCV1P_Windows2025Gen2(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1POverlay, + Cluster: ClusterRCV1PKubenet, VHD: config.VHDWindows2025Gen2, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) @@ -181,7 +181,7 @@ func Test_RCV1P_Windows_NotOptedIn(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1POverlay, + Cluster: ClusterRCV1PKubenet, VHD: config.VHDWindows2022Containerd, BootstrapConfigMutator: EmptyBootstrapConfigMutator, Validator: func(ctx context.Context, s *Scenario) { From 1ae597b4710ad5a360f645881b485e885c79c545 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 22 Apr 2026 18:02:53 -0700 Subject: [PATCH 39/70] REVERT ME: use dedicated kubenet cluster for RCV1P tests Give RCV1P tests their own kubenet cluster (abe2e-rcv1p-default-kubenet-v1) instead of sharing 
ClusterKubenet with all other Windows tests. This avoids subnet IP exhaustion caused by many parallel test nodes competing for the same /24 subnet. To revert: change ClusterRCV1PDefaultKubenet back to ClusterKubenet in rcv1pCluster() and remove ClusterRCV1PDefaultKubenet from cache.go. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/cache.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/e2e/cache.go b/e2e/cache.go index 777acdaf559..3f08a6430eb 100644 --- a/e2e/cache.go +++ b/e2e/cache.go @@ -214,6 +214,15 @@ func clusterRCV1PKubenet(ctx context.Context, request ClusterRequest) (*Cluster, return prepareCluster(ctx, infra, getKubenetClusterModel("abe2e-rcv1p-kubenet-v1", request.Location, request.K8sSystemPoolSKU), false, false) } +var ClusterRCV1PDefaultKubenet = cachedFunc(clusterRCV1PDefaultKubenet) + +// clusterRCV1PDefaultKubenet creates a dedicated kubenet cluster for RCV1P tests on the default +// E2E subscription. This avoids sharing the main kubenet cluster's subnet with non-RCV1P tests, +// preventing IP exhaustion when many Windows tests run in parallel. +func clusterRCV1PDefaultKubenet(ctx context.Context, request ClusterRequest) (*Cluster, error) { + return prepareCluster(ctx, DefaultClusterInfra, getKubenetClusterModel("abe2e-rcv1p-default-kubenet-v1", request.Location, request.K8sSystemPoolSKU), false, false) +} + // isNotFoundErr checks if an error represents a "not found" response from Azure API func isNotFoundErr(err error) bool { var respErr *azcore.ResponseError From e15d8ca0d99a63844bea08d51a69a44ab034acee Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 22 Apr 2026 20:33:54 -0700 Subject: [PATCH 40/70] REVERT ME: use Azure CNI cluster for Windows RCV1P tests Windows RCV1P tests were failing with IP exhaustion because they used kubenet clusters while baseTemplateWindows() configures the NBC for Azure CNI overlay mode. 
The azure-vnet plugin on the node then tries overlay IPAM which fails for standalone VMSS nodes. Fix: use ClusterAzureNetwork (matching all other Windows tests) instead of kubenet for Windows RCV1P tests. Linux RCV1P tests stay on kubenet. Removes the unused ClusterRCV1PDefaultKubenet (dedicated kubenet cluster that didn't solve the issue since the root cause was NBC/cluster mismatch). To revert: change rcv1pWindowsCluster() back to rcv1pCluster() in scenario_rcv1p_win_test.go and remove rcv1pWindowsCluster() from scenario_rcv1p_test.go. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/cache.go | 9 --------- 1 file changed, 9 deletions(-) diff --git a/e2e/cache.go b/e2e/cache.go index 3f08a6430eb..777acdaf559 100644 --- a/e2e/cache.go +++ b/e2e/cache.go @@ -214,15 +214,6 @@ func clusterRCV1PKubenet(ctx context.Context, request ClusterRequest) (*Cluster, return prepareCluster(ctx, infra, getKubenetClusterModel("abe2e-rcv1p-kubenet-v1", request.Location, request.K8sSystemPoolSKU), false, false) } -var ClusterRCV1PDefaultKubenet = cachedFunc(clusterRCV1PDefaultKubenet) - -// clusterRCV1PDefaultKubenet creates a dedicated kubenet cluster for RCV1P tests on the default -// E2E subscription. This avoids sharing the main kubenet cluster's subnet with non-RCV1P tests, -// preventing IP exhaustion when many Windows tests run in parallel. 
-func clusterRCV1PDefaultKubenet(ctx context.Context, request ClusterRequest) (*Cluster, error) { - return prepareCluster(ctx, DefaultClusterInfra, getKubenetClusterModel("abe2e-rcv1p-default-kubenet-v1", request.Location, request.K8sSystemPoolSKU), false, false) -} - // isNotFoundErr checks if an error represents a "not found" response from Azure API func isNotFoundErr(err error) bool { var respErr *azcore.ResponseError From 4d5ca4e39f2003e84542b612d1215e91f04321e1 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 22 Apr 2026 22:45:47 -0700 Subject: [PATCH 41/70] REVERT ME: add wireserver endpoint diagnostics to Windows RCV1P validator Probes all wireserver cert endpoints (isOptedInForRootCerts, operationrequestsroot, operationrequestsintermediate, legacy cacertificates) during validation and dumps CSE log lines related to certificate operations. Uses execScriptOnVMForScenario with explicit t.Logf to ensure output is always visible in test logs, not swallowed by execScriptOnVMForScenarioValidateExitCode. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/validators.go | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/e2e/validators.go b/e2e/validators.go index 29ec1f0ba19..3377a92a954 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -2896,6 +2896,45 @@ func rcv1pTrustStoreDir(s *Scenario) string { func ValidateRCV1PCertModeWindows(ctx context.Context, s *Scenario) { s.T.Helper() + // REVERT ME: Diagnostic block — probe wireserver endpoints and dump CSE log tail from the VM + // so we can see exactly what the wireserver returns for operationrequests and what the CSE logged. 
+ diagCommand := []string{ + "$ErrorActionPreference = 'Continue'", + "Write-Host '=== DIAGNOSTIC: probing wireserver rcv1p endpoints ==='", + "try {", + " $optIn = Invoke-WebRequest -Uri 'http://168.63.129.16/acms/isOptedInForRootCerts' -UseBasicParsing -TimeoutSec 30", + " Write-Host \"isOptedInForRootCerts: $($optIn.Content)\"", + "} catch { Write-Host \"isOptedInForRootCerts ERROR: $_\" }", + "try {", + " $root = Invoke-WebRequest -Uri 'http://168.63.129.16/machine?comp=acmspackage&type=operationrequestsroot&ext=json' -UseBasicParsing -TimeoutSec 30", + " Write-Host \"operationrequestsroot status=$($root.StatusCode) length=$($root.Content.Length)\"", + " Write-Host \"operationrequestsroot content: $($root.Content)\"", + "} catch { Write-Host \"operationrequestsroot ERROR: $_\" }", + "try {", + " $intermediate = Invoke-WebRequest -Uri 'http://168.63.129.16/machine?comp=acmspackage&type=operationrequestsintermediate&ext=json' -UseBasicParsing -TimeoutSec 30", + " Write-Host \"operationrequestsintermediate status=$($intermediate.StatusCode) length=$($intermediate.Content.Length)\"", + " Write-Host \"operationrequestsintermediate content: $($intermediate.Content)\"", + "} catch { Write-Host \"operationrequestsintermediate ERROR: $_\" }", + "try {", + " $legacy = Invoke-WebRequest -Uri 'http://168.63.129.16/machine?comp=acmspackage&type=cacertificates&ext=json' -UseBasicParsing -TimeoutSec 30", + " Write-Host \"legacy cacertificates status=$($legacy.StatusCode) length=$($legacy.Content.Length)\"", + " $legacyJson = $legacy.Content | ConvertFrom-Json", + " if ($legacyJson.Certificates) { Write-Host \"legacy cert count: $($legacyJson.Certificates.Length)\" } else { Write-Host 'legacy: no Certificates array' }", + "} catch { Write-Host \"legacy cacertificates ERROR: $_\" }", + "Write-Host '=== DIAGNOSTIC: C:\\ca folder contents ==='", + "if (Test-Path 'C:\\ca') { Get-ChildItem -Path 'C:\\ca' -File | ForEach-Object { Write-Host \" $($_.Name) ($($_.Length) bytes)\" } } 
else { Write-Host 'C:\\ca does not exist' }", + "Write-Host '=== DIAGNOSTIC: CSE log tail (last 60 lines with CA/cert/wireserver) ==='", + "if (Test-Path 'C:\\AzureData\\CustomDataSetupScript.log') {", + " Get-Content 'C:\\AzureData\\CustomDataSetupScript.log' -Tail 200 | Where-Object { $_ -match 'CA |cert|wireserver|optedin|operation|acms|Write cert|Warning' } | Select-Object -Last 60 | ForEach-Object { Write-Host $_ }", + "} else { Write-Host 'CSE log not found' }", + "Write-Host '=== END DIAGNOSTIC ==='", + } + diagResult := execScriptOnVMForScenario(ctx, s, strings.Join(diagCommand, "\n")) + s.T.Logf("REVERT ME: wireserver diagnostics stdout:\n%s", diagResult.stdout) + if diagResult.stderr != "" { + s.T.Logf("REVERT ME: wireserver diagnostics stderr:\n%s", diagResult.stderr) + } + // Validate the provisioning log shows wireserver was queried and returned opted-in ValidateFileHasContent(ctx, s, "C:\\AzureData\\CustomDataSetupScript.log", "IsOptedInForRootCerts wireserver response:") From b936a2fca1f60838616e2818fcfb7c48bf7214a8 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 23 Apr 2026 07:31:15 -0700 Subject: [PATCH 42/70] fix: use correct wireserver JSON field name for rcv1p cert download The wireserver operationrequestsroot and operationrequestsintermediate endpoints return certificates under the 'OperationsInfo' field, but the Windows PowerShell code was looking for 'OperationRequests' which doesn't exist in the response. This caused the null check to skip the entire cert download loop, leaving C:\ca empty despite wireserver returning valid certificate data. The Linux implementation avoids this by using grep to extract ResouceFileName values directly from the raw JSON, bypassing the parent field name entirely. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- staging/cse/windows/kubernetesfunc.ps1 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/staging/cse/windows/kubernetesfunc.ps1 b/staging/cse/windows/kubernetesfunc.ps1 index 5ddb667dd99..b5d83453ffa 100644 --- a/staging/cse/windows/kubernetesfunc.ps1 +++ b/staging/cse/windows/kubernetesfunc.ps1 @@ -393,12 +393,12 @@ function Get-CACertificates { $operationResponse = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$operationRequestUri; UseBasicParsing=$true} -Retries 5 -RetryDelaySeconds 10 $operationJson = ($operationResponse.Content) | ConvertFrom-Json - if ($null -eq $operationJson -or $null -eq $operationJson.OperationRequests) { + if ($null -eq $operationJson -or $null -eq $operationJson.OperationsInfo) { Write-Log "Warning: no operation requests found for $requestType" continue } - foreach ($operation in $operationJson.OperationRequests) { + foreach ($operation in $operationJson.OperationsInfo) { $resourceFileName = $operation.ResouceFileName if ([string]::IsNullOrEmpty($resourceFileName)) { continue From 15c00775487f50c465acd6a63cefc726772f1d4b Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 23 Apr 2026 07:40:00 -0700 Subject: [PATCH 43/70] REVERT ME: add azcopy error logging to Windows log collection Wraps each azcopy copy call with error checking and logging to diagnose why Windows CSE log uploads consistently return BlobNotFound. Also captures RunCommand stdout/stderr (InstanceView) which was previously not logged, so we can see azcopy output and any MSI auth failures. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/vmss.go | 67 ++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 54 insertions(+), 13 deletions(-) diff --git a/e2e/vmss.go b/e2e/vmss.go index faff65e4366..bb7521af2ad 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -901,6 +901,7 @@ func extractLogsFromVMLinux(ctx context.Context, s *Scenario, vm *ScenarioVM) er return nil } +// REVERT ME: added error logging around azcopy to diagnose why blob uploads fail (BlobNotFound) const uploadLogsPowershellScript = ` param( [string]$arg1, @@ -908,18 +909,47 @@ param( [string]$arg3 ) -Invoke-WebRequest -UseBasicParsing https://aka.ms/downloadazcopy-v10-windows -OutFile azcopy.zip -Expand-Archive azcopy.zip -cd .\azcopy\* -$env:AZCOPY_AUTO_LOGIN_TYPE="MSI" -$env:AZCOPY_MSI_RESOURCE_STRING=$arg3 -C:\k\debug\collect-windows-logs.ps1 -$CollectedLogs=(Get-ChildItem . -Filter "*_logs.zip" -File)[0].Name -.\azcopy.exe copy $CollectedLogs "$arg1/collected-node-logs.zip" -.\azcopy.exe copy "C:\azuredata\CustomDataSetupScript.log" "$arg1/cse.log" -.\azcopy.exe copy "C:\AzureData\provision.complete" "$arg1/provision.complete" -.\azcopy.exe copy "C:\k\kubelet.err.log" "$arg1/kubelet.err.log" -.\azcopy.exe copy "C:\k\containerd.err.log" "$arg1/containerd.err.log" +# REVERT ME: verbose error logging for azcopy upload diagnostics +function Run-AzCopy { + param([string]$Source, [string]$Dest) + if (-not (Test-Path $Source)) { + Write-Host "AZCOPY SKIP: source not found: $Source" + return + } + Write-Host "AZCOPY: copying $Source -> $Dest" + $output = & .\azcopy.exe copy $Source $Dest 2>&1 + $exitCode = $LASTEXITCODE + Write-Host "AZCOPY: exit=$exitCode output=$output" + if ($exitCode -ne 0) { + Write-Host "AZCOPY ERROR: failed to copy $Source (exit=$exitCode)" + } +} + +try { + Write-Host "Downloading azcopy..." 
+ Invoke-WebRequest -UseBasicParsing https://aka.ms/downloadazcopy-v10-windows -OutFile azcopy.zip + Expand-Archive azcopy.zip + cd .\azcopy\* + $env:AZCOPY_AUTO_LOGIN_TYPE="MSI" + $env:AZCOPY_MSI_RESOURCE_STRING=$arg3 + Write-Host "MSI resource: $arg3" + Write-Host "Blob destination: $arg1" +} catch { + Write-Host "AZCOPY SETUP ERROR: $_" +} + +try { + C:\k\debug\collect-windows-logs.ps1 + $CollectedLogs=(Get-ChildItem . -Filter "*_logs.zip" -File)[0].Name + Run-AzCopy -Source $CollectedLogs -Dest "$arg1/collected-node-logs.zip" +} catch { + Write-Host "COLLECT-LOGS ERROR: $_" +} + +Run-AzCopy -Source "C:\azuredata\CustomDataSetupScript.log" -Dest "$arg1/cse.log" +Run-AzCopy -Source "C:\AzureData\provision.complete" -Dest "$arg1/provision.complete" +Run-AzCopy -Source "C:\k\kubelet.err.log" -Dest "$arg1/kubelet.err.log" +Run-AzCopy -Source "C:\k\containerd.err.log" -Dest "$arg1/containerd.err.log" # Collect network configuration information ipconfig /all > network_config.txt @@ -933,7 +963,7 @@ Get-NetNeighbor >> network_config.txt Get-NetConnectionProfile >> network_config.txt hnsdiag list networks >> network_config.txt hnsdiag list endpoints >> network_config.txt -.\azcopy.exe copy "network_config.txt" "$arg1/network_config.txt" +Run-AzCopy -Source "network_config.txt" -Dest "$arg1/network_config.txt" ` // extractLogsFromVMWindows runs a script on windows VM to collect logs and upload them to a blob storage @@ -1019,6 +1049,17 @@ func extractLogsFromVMWindows(ctx context.Context, s *Scenario) { respJSON, _ := json.MarshalIndent(runCommandResp, "", " ") s.T.Logf("run command executed successfully:\n%s", respJSON) + // REVERT ME: log RunCommand stdout/stderr to diagnose azcopy upload failures + if runCommandResp.Properties != nil && runCommandResp.Properties.InstanceView != nil { + iv := runCommandResp.Properties.InstanceView + if iv.Output != nil && *iv.Output != "" { + s.T.Logf("RunCommand stdout:\n%s", *iv.Output) + } + if iv.Error != nil && *iv.Error != "" { + 
s.T.Logf("RunCommand stderr:\n%s", *iv.Error) + } + } + s.T.Logf("uploaded logs to %s", blobUrl) downloadBlob := func(blobSuffix string) { From 2efa0f2ae629c770de22155d83749be060e3523e Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 23 Apr 2026 10:59:05 -0700 Subject: [PATCH 44/70] REVERT ME: enable verbose test output for azcopy/wireserver diagnostics Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- .pipelines/scripts/e2e_run.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.pipelines/scripts/e2e_run.sh b/.pipelines/scripts/e2e_run.sh index 097fe250756..77ae62dbec7 100644 --- a/.pipelines/scripts/e2e_run.sh +++ b/.pipelines/scripts/e2e_run.sh @@ -97,10 +97,11 @@ tar -xzf "$temp_file" -C bin chmod +x bin/gotestsum rm -f "$temp_file" +# REVERT ME: added -v to see t.Logf output from passing tests (azcopy/wireserver diagnostics) # gotestsum configure to only show logs for failed tests, json file for detailed logs # Run the tests! Yey! test_exit_code=0 -./bin/gotestsum --format testdox --junitfile "${BUILD_SRC_DIR}/e2e/report.xml" --jsonfile "${BUILD_SRC_DIR}/e2e/test-log.json" -- -parallel 60 -timeout "${E2E_GO_TEST_TIMEOUT}" || test_exit_code=$? +./bin/gotestsum --format testdox --junitfile "${BUILD_SRC_DIR}/e2e/report.xml" --jsonfile "${BUILD_SRC_DIR}/e2e/test-log.json" -- -v -parallel 60 -timeout "${E2E_GO_TEST_TIMEOUT}" || test_exit_code=$? # Upload test results as Azure DevOps artifacts echo "##vso[artifact.upload containerfolder=test-results;artifactname=e2e-test-log]${BUILD_SRC_DIR}/e2e/test-log.json" From ce1a29f9b921ec1d61dbd70520d42c3569a5579c Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 23 Apr 2026 20:44:17 -0700 Subject: [PATCH 45/70] REVERT ME: canary check to prove whether SSH validators are broken Adds a ValidateFileHasContent check for a nonsense string that will never exist in the CSE log. 
If this test PASSES, it proves the ExitMissingError handler in exec.go:130 is silently swallowing SSH exit codes and all Windows validators are no-ops. If this test FAILS (expected), validators are working correctly. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/validators.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/e2e/validators.go b/e2e/validators.go index 3377a92a954..54fda615272 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -2935,6 +2935,13 @@ func ValidateRCV1PCertModeWindows(ctx context.Context, s *Scenario) { s.T.Logf("REVERT ME: wireserver diagnostics stderr:\n%s", diagResult.stderr) } + // REVERT ME: Canary check — this string should NEVER exist in the CSE log. + // If this test PASSES, it proves the SSH ExitMissingError bug (exec.go:130) is + // silently swallowing failures and all validators are broken. + // If this test FAILS (as expected), validators are working correctly. + ValidateFileHasContent(ctx, s, "C:\\AzureData\\CustomDataSetupScript.log", + "CANARY_STRING_THAT_SHOULD_NEVER_EXIST_IN_ANY_LOG_FILE_EVER_12345") + // Validate the provisioning log shows wireserver was queried and returned opted-in ValidateFileHasContent(ctx, s, "C:\\AzureData\\CustomDataSetupScript.log", "IsOptedInForRootCerts wireserver response:") From fcc42a9a1e39716c1dfd5d72c947c14bdf6d123f Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 23 Apr 2026 22:34:11 -0700 Subject: [PATCH 46/70] Remove canary check - validators confirmed working The canary test proved validators are functional and our branch CSE zip is correctly delivered to VMs. Wireserver returns IsOptedInForRootCerts=true and the CSE log contains the expected RCV1P log lines. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/validators.go | 7 ------- 1 file changed, 7 deletions(-) diff --git a/e2e/validators.go b/e2e/validators.go index 54fda615272..3377a92a954 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -2935,13 +2935,6 @@ func ValidateRCV1PCertModeWindows(ctx context.Context, s *Scenario) { s.T.Logf("REVERT ME: wireserver diagnostics stderr:\n%s", diagResult.stderr) } - // REVERT ME: Canary check — this string should NEVER exist in the CSE log. - // If this test PASSES, it proves the SSH ExitMissingError bug (exec.go:130) is - // silently swallowing failures and all validators are broken. - // If this test FAILS (as expected), validators are working correctly. - ValidateFileHasContent(ctx, s, "C:\\AzureData\\CustomDataSetupScript.log", - "CANARY_STRING_THAT_SHOULD_NEVER_EXIST_IN_ANY_LOG_FILE_EVER_12345") - // Validate the provisioning log shows wireserver was queried and returned opted-in ValidateFileHasContent(ctx, s, "C:\\AzureData\\CustomDataSetupScript.log", "IsOptedInForRootCerts wireserver response:") From d37eeaeae1ddcd647ba4f8c9cd12d21c8db167a8 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Fri, 24 Apr 2026 08:34:05 -0700 Subject: [PATCH 47/70] fix: make wireserver cert retrieval failures fatal on Linux Cert installation must succeed for the selected mode (legacy or rcv1p). Previously, failures after exhausting retries were silently swallowed with a warning, leaving the node without certificates. Now failures exit 1, matching the Windows -FailOnError behavior. Retries with backoff in make_request_with_retry still handle transient wireserver issues (rate limiting, temporary unavailability). 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index f189c433457..caba6e99335 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -227,7 +227,7 @@ function install_certs_to_trust_store { # Certificate refresh behavior summary: # - legacy mode directly attempts certificate download from wireserver and only in ussec and usnat regions. # - rcv1p mode first checks IsOptedInForRootCerts, then downloads only when opted in. -# - Wireserver failures are treated as non-fatal, and cert trust-store updates are skipped gracefully. +# - Wireserver failures are fatal — cert installation must succeed for the selected mode. 
refresh_location="${2:-${LOCATION}}" @@ -251,7 +251,8 @@ if [ "$cert_endpoint_mode" = "legacy" ]; then if retrieve_legacy_certs; then install_certs_to_trust_store else - echo "Warning: failed to retrieve legacy certificates from wireserver; continuing without trust store updates" + echo "ERROR: failed to retrieve legacy certificates from wireserver after retries" + exit 1 fi elif [ "$cert_endpoint_mode" = "rcv1p" ]; then if is_opted_in_for_root_certs; then @@ -259,7 +260,8 @@ elif [ "$cert_endpoint_mode" = "rcv1p" ]; then if retrieve_rcv1p_certs; then install_certs_to_trust_store else - echo "Warning: failed to retrieve rcv1p certificates from wireserver; continuing without trust store updates" + echo "ERROR: failed to retrieve rcv1p certificates from wireserver after retries" + exit 1 fi fi fi From 5ff8a2504e24f6998611d6dd0325fe6aa462fbd8 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Sat, 25 Apr 2026 15:22:07 -0700 Subject: [PATCH 48/70] revert: remove diagnostic commits used during RCV1P development Reverts the following temporary diagnostic commits that served their purpose during RCV1P cert mode debugging and are no longer needed: - 807b5a46a8 (wireserver endpoint diagnostics in validator) Why: Added to debug cert download failures. The root cause was a JSON field name mismatch (OperationRequests vs OperationsInfo), now fixed. Diagnostic probing adds noise to validator output. - 9f6a9023fb (azcopy error logging in Windows log collection) Why: Added to debug empty CSE log uploads (BlobNotFound). Root cause was ADO job timeout (90m) racing with go test timeout (90m), fixed on main by 54aa84a (reduced go test timeout to 80m). - d083fbef53 (verbose test output with -v flag) Why: Added so t.Logf output would appear in pipeline logs for diagnostics. No longer needed; increases log noise for all tests. - 45041cbe32 (always collect Windows CSE logs) Why: Removed s.T.Failed() guard to collect logs on success too. 
Root cause of missing logs was the ADO/go-test timeout race, not the collection logic. Restored failure-only collection. - fdc6962bd2 + 11967731f7 (canary check, already net-zero) Why: Canary proved validators work correctly. Already removed by the follow-up commit; these two commits cancel each other. - 0bc8f2e48d (poll wireserver IsOptedInForRootCerts retry loop) Why: Experimental polling for FC goal-state propagation. Tags are now set at VMSS creation time, making polling unnecessary. Already reverted by later commits during development. Kept (not reverted): - 76edb18ed9: Azure CNI cluster for Windows RCV1P tests (real fix for NBC/cluster type mismatch causing IP exhaustion) - a891055eb2: Branch-built CSE zip override (required until RCV1P code ships in a published CSE package) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- .pipelines/scripts/e2e_run.sh | 3 +- e2e/validators.go | 41 +------------------- e2e/vmss.go | 72 +++++++++-------------------------- 3 files changed, 19 insertions(+), 97 deletions(-) diff --git a/.pipelines/scripts/e2e_run.sh b/.pipelines/scripts/e2e_run.sh index 77ae62dbec7..097fe250756 100644 --- a/.pipelines/scripts/e2e_run.sh +++ b/.pipelines/scripts/e2e_run.sh @@ -97,11 +97,10 @@ tar -xzf "$temp_file" -C bin chmod +x bin/gotestsum rm -f "$temp_file" -# REVERT ME: added -v to see t.Logf output from passing tests (azcopy/wireserver diagnostics) # gotestsum configure to only show logs for failed tests, json file for detailed logs # Run the tests! Yey! test_exit_code=0 -./bin/gotestsum --format testdox --junitfile "${BUILD_SRC_DIR}/e2e/report.xml" --jsonfile "${BUILD_SRC_DIR}/e2e/test-log.json" -- -v -parallel 60 -timeout "${E2E_GO_TEST_TIMEOUT}" || test_exit_code=$? +./bin/gotestsum --format testdox --junitfile "${BUILD_SRC_DIR}/e2e/report.xml" --jsonfile "${BUILD_SRC_DIR}/e2e/test-log.json" -- -parallel 60 -timeout "${E2E_GO_TEST_TIMEOUT}" || test_exit_code=$? 
# Upload test results as Azure DevOps artifacts echo "##vso[artifact.upload containerfolder=test-results;artifactname=e2e-test-log]${BUILD_SRC_DIR}/e2e/test-log.json" diff --git a/e2e/validators.go b/e2e/validators.go index 3377a92a954..9c095d86ec4 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -2896,46 +2896,7 @@ func rcv1pTrustStoreDir(s *Scenario) string { func ValidateRCV1PCertModeWindows(ctx context.Context, s *Scenario) { s.T.Helper() - // REVERT ME: Diagnostic block — probe wireserver endpoints and dump CSE log tail from the VM - // so we can see exactly what the wireserver returns for operationrequests and what the CSE logged. - diagCommand := []string{ - "$ErrorActionPreference = 'Continue'", - "Write-Host '=== DIAGNOSTIC: probing wireserver rcv1p endpoints ==='", - "try {", - " $optIn = Invoke-WebRequest -Uri 'http://168.63.129.16/acms/isOptedInForRootCerts' -UseBasicParsing -TimeoutSec 30", - " Write-Host \"isOptedInForRootCerts: $($optIn.Content)\"", - "} catch { Write-Host \"isOptedInForRootCerts ERROR: $_\" }", - "try {", - " $root = Invoke-WebRequest -Uri 'http://168.63.129.16/machine?comp=acmspackage&type=operationrequestsroot&ext=json' -UseBasicParsing -TimeoutSec 30", - " Write-Host \"operationrequestsroot status=$($root.StatusCode) length=$($root.Content.Length)\"", - " Write-Host \"operationrequestsroot content: $($root.Content)\"", - "} catch { Write-Host \"operationrequestsroot ERROR: $_\" }", - "try {", - " $intermediate = Invoke-WebRequest -Uri 'http://168.63.129.16/machine?comp=acmspackage&type=operationrequestsintermediate&ext=json' -UseBasicParsing -TimeoutSec 30", - " Write-Host \"operationrequestsintermediate status=$($intermediate.StatusCode) length=$($intermediate.Content.Length)\"", - " Write-Host \"operationrequestsintermediate content: $($intermediate.Content)\"", - "} catch { Write-Host \"operationrequestsintermediate ERROR: $_\" }", - "try {", - " $legacy = Invoke-WebRequest -Uri 
'http://168.63.129.16/machine?comp=acmspackage&type=cacertificates&ext=json' -UseBasicParsing -TimeoutSec 30", - " Write-Host \"legacy cacertificates status=$($legacy.StatusCode) length=$($legacy.Content.Length)\"", - " $legacyJson = $legacy.Content | ConvertFrom-Json", - " if ($legacyJson.Certificates) { Write-Host \"legacy cert count: $($legacyJson.Certificates.Length)\" } else { Write-Host 'legacy: no Certificates array' }", - "} catch { Write-Host \"legacy cacertificates ERROR: $_\" }", - "Write-Host '=== DIAGNOSTIC: C:\\ca folder contents ==='", - "if (Test-Path 'C:\\ca') { Get-ChildItem -Path 'C:\\ca' -File | ForEach-Object { Write-Host \" $($_.Name) ($($_.Length) bytes)\" } } else { Write-Host 'C:\\ca does not exist' }", - "Write-Host '=== DIAGNOSTIC: CSE log tail (last 60 lines with CA/cert/wireserver) ==='", - "if (Test-Path 'C:\\AzureData\\CustomDataSetupScript.log') {", - " Get-Content 'C:\\AzureData\\CustomDataSetupScript.log' -Tail 200 | Where-Object { $_ -match 'CA |cert|wireserver|optedin|operation|acms|Write cert|Warning' } | Select-Object -Last 60 | ForEach-Object { Write-Host $_ }", - "} else { Write-Host 'CSE log not found' }", - "Write-Host '=== END DIAGNOSTIC ==='", - } - diagResult := execScriptOnVMForScenario(ctx, s, strings.Join(diagCommand, "\n")) - s.T.Logf("REVERT ME: wireserver diagnostics stdout:\n%s", diagResult.stdout) - if diagResult.stderr != "" { - s.T.Logf("REVERT ME: wireserver diagnostics stderr:\n%s", diagResult.stderr) - } - - // Validate the provisioning log shows wireserver was queried and returned opted-in + // Validate the provisioning log shows wireserver was queried and returned opted-in ValidateFileHasContent(ctx, s, "C:\\AzureData\\CustomDataSetupScript.log", "IsOptedInForRootCerts wireserver response:") diff --git a/e2e/vmss.go b/e2e/vmss.go index bb7521af2ad..e4c3abc4f6a 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -901,7 +901,6 @@ func extractLogsFromVMLinux(ctx context.Context, s *Scenario, vm *ScenarioVM) er 
return nil } -// REVERT ME: added error logging around azcopy to diagnose why blob uploads fail (BlobNotFound) const uploadLogsPowershellScript = ` param( [string]$arg1, @@ -909,47 +908,18 @@ param( [string]$arg3 ) -# REVERT ME: verbose error logging for azcopy upload diagnostics -function Run-AzCopy { - param([string]$Source, [string]$Dest) - if (-not (Test-Path $Source)) { - Write-Host "AZCOPY SKIP: source not found: $Source" - return - } - Write-Host "AZCOPY: copying $Source -> $Dest" - $output = & .\azcopy.exe copy $Source $Dest 2>&1 - $exitCode = $LASTEXITCODE - Write-Host "AZCOPY: exit=$exitCode output=$output" - if ($exitCode -ne 0) { - Write-Host "AZCOPY ERROR: failed to copy $Source (exit=$exitCode)" - } -} - -try { - Write-Host "Downloading azcopy..." - Invoke-WebRequest -UseBasicParsing https://aka.ms/downloadazcopy-v10-windows -OutFile azcopy.zip - Expand-Archive azcopy.zip - cd .\azcopy\* - $env:AZCOPY_AUTO_LOGIN_TYPE="MSI" - $env:AZCOPY_MSI_RESOURCE_STRING=$arg3 - Write-Host "MSI resource: $arg3" - Write-Host "Blob destination: $arg1" -} catch { - Write-Host "AZCOPY SETUP ERROR: $_" -} - -try { - C:\k\debug\collect-windows-logs.ps1 - $CollectedLogs=(Get-ChildItem . -Filter "*_logs.zip" -File)[0].Name - Run-AzCopy -Source $CollectedLogs -Dest "$arg1/collected-node-logs.zip" -} catch { - Write-Host "COLLECT-LOGS ERROR: $_" -} - -Run-AzCopy -Source "C:\azuredata\CustomDataSetupScript.log" -Dest "$arg1/cse.log" -Run-AzCopy -Source "C:\AzureData\provision.complete" -Dest "$arg1/provision.complete" -Run-AzCopy -Source "C:\k\kubelet.err.log" -Dest "$arg1/kubelet.err.log" -Run-AzCopy -Source "C:\k\containerd.err.log" -Dest "$arg1/containerd.err.log" +Invoke-WebRequest -UseBasicParsing https://aka.ms/downloadazcopy-v10-windows -OutFile azcopy.zip +Expand-Archive azcopy.zip +cd .\azcopy\* +$env:AZCOPY_AUTO_LOGIN_TYPE="MSI" +$env:AZCOPY_MSI_RESOURCE_STRING=$arg3 +C:\k\debug\collect-windows-logs.ps1 +$CollectedLogs=(Get-ChildItem . 
-Filter "*_logs.zip" -File)[0].Name +.\azcopy.exe copy $CollectedLogs "$arg1/collected-node-logs.zip" +.\azcopy.exe copy "C:\azuredata\CustomDataSetupScript.log" "$arg1/cse.log" +.\azcopy.exe copy "C:\AzureData\provision.complete" "$arg1/provision.complete" +.\azcopy.exe copy "C:\k\kubelet.err.log" "$arg1/kubelet.err.log" +.\azcopy.exe copy "C:\k\containerd.err.log" "$arg1/containerd.err.log" # Collect network configuration information ipconfig /all > network_config.txt @@ -963,13 +933,16 @@ Get-NetNeighbor >> network_config.txt Get-NetConnectionProfile >> network_config.txt hnsdiag list networks >> network_config.txt hnsdiag list endpoints >> network_config.txt -Run-AzCopy -Source "network_config.txt" -Dest "$arg1/network_config.txt" +.\azcopy.exe copy "network_config.txt" "$arg1/network_config.txt" ` // extractLogsFromVMWindows runs a script on windows VM to collect logs and upload them to a blob storage // it then lists the blobs in the container and prints the content of each blob func extractLogsFromVMWindows(ctx context.Context, s *Scenario) { - // Always collect Windows logs for debugging (revert this to restore failure-only collection) + if !s.T.Failed() { + return + } + ctx, cancel := context.WithTimeout(ctx, 4*time.Minute) defer cancel() pager := s.GetAzure().VMSSVM.NewListPager(*s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, nil) @@ -1049,17 +1022,6 @@ func extractLogsFromVMWindows(ctx context.Context, s *Scenario) { respJSON, _ := json.MarshalIndent(runCommandResp, "", " ") s.T.Logf("run command executed successfully:\n%s", respJSON) - // REVERT ME: log RunCommand stdout/stderr to diagnose azcopy upload failures - if runCommandResp.Properties != nil && runCommandResp.Properties.InstanceView != nil { - iv := runCommandResp.Properties.InstanceView - if iv.Output != nil && *iv.Output != "" { - s.T.Logf("RunCommand stdout:\n%s", *iv.Output) - } - if iv.Error != nil && *iv.Error != "" { - s.T.Logf("RunCommand stderr:\n%s", 
*iv.Error) - } - } - s.T.Logf("uploaded logs to %s", blobUrl) downloadBlob := func(blobSuffix string) { From dfb2c10947930dc7bf818ef923cb82f70ab70370 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Sun, 26 Apr 2026 16:32:19 -0700 Subject: [PATCH 49/70] fix: make wireserver unreachable fatal for RCV1P opt-in check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wireserver unreachable after retries is now fatal (return 2 + exit 1) instead of silently skipping cert installation. If the subscription is opted in for hardened root certs but we silently fall back to the distro's default trust store, we leave a security hole — the node would trust CAs the customer explicitly intended to replace. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- .../artifacts/init-aks-custom-cloud.sh | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index caba6e99335..8bc6a52e112 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -74,6 +74,16 @@ function make_request_with_retry { return 1 } +# Returns: +# 0 - opted in (wireserver confirmed IsOptedInForRootCerts=true) +# 1 - not opted in (wireserver responded with false; valid, skip certs) +# 2 - wireserver unreachable after retries (caller must treat as fatal) +# +# Wireserver unreachable must be fatal (return 2) rather than silently skipping certs. +# If the subscription is opted in for hardened root certs but we silently fall back to +# the distro's default trust store, we leave a security hole — the node would trust CAs +# that the customer explicitly intended to replace. Failing hard surfaces the problem +# immediately instead of letting the node run with an insecure certificate configuration. 
function is_opted_in_for_root_certs { local opt_in_response local request_status @@ -255,7 +265,16 @@ if [ "$cert_endpoint_mode" = "legacy" ]; then exit 1 fi elif [ "$cert_endpoint_mode" = "rcv1p" ]; then - if is_opted_in_for_root_certs; then + is_opted_in_for_root_certs + opt_in_result=$? + if [ $opt_in_result -eq 2 ]; then + # Fatal: wireserver was unreachable after retries. We cannot determine whether + # the node should use hardened certs or the default trust store. Silently + # falling back to the distro trust store would be a security hole if the + # customer intended hardened certs, so we fail hard here. + echo "ERROR: cannot provision node — wireserver unreachable for cert opt-in check" + exit 1 + elif [ $opt_in_result -eq 0 ]; then install_ca_refresh_schedule=1 if retrieve_rcv1p_certs; then install_certs_to_trust_store From 8c89063182dd16c35565d1828f66510d308acfdd Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Mon, 27 Apr 2026 10:31:04 -0700 Subject: [PATCH 50/70] fix: use RCV1P Azure CNI cluster for Windows tests when explicit subscription set When RCV1P_SUBSCRIPTION_ID is set, Windows RCV1P positive tests set Scenario.AzureClient/SubscriptionID to the RCV1P subscription, but rcv1pWindowsCluster() always returned ClusterAzureNetwork (default subscription). This subscription mismatch would cause VMSS creation to 404 in the RCV1P subscription's node resource group. 
Fix: - Add ClusterRCV1PAzureNetwork in cache.go (Azure CNI cluster using RCV1PClusterInfra) - Branch rcv1pWindowsCluster() on hasExplicitRCV1PSubscription(), matching the pattern used by rcv1pCluster() for Linux - Fix Test_RCV1P_Windows_NotOptedIn to use ClusterRCV1PAzureNetwork instead of ClusterRCV1PKubenet (Windows needs Azure CNI) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/cache.go | 13 +++++++++++++ e2e/scenario_rcv1p_win_test.go | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/e2e/cache.go b/e2e/cache.go index 777acdaf559..6d677c90d76 100644 --- a/e2e/cache.go +++ b/e2e/cache.go @@ -214,6 +214,19 @@ func clusterRCV1PKubenet(ctx context.Context, request ClusterRequest) (*Cluster, return prepareCluster(ctx, infra, getKubenetClusterModel("abe2e-rcv1p-kubenet-v1", request.Location, request.K8sSystemPoolSKU), false, false) } +var ClusterRCV1PAzureNetwork = cachedFunc(clusterRCV1PAzureNetwork) + +// clusterRCV1PAzureNetwork creates an Azure CNI cluster in the RCV1P subscription for Windows cert mode testing. +// Windows tests require Azure CNI (not kubenet) because baseTemplateWindows() configures the NBC for +// Azure CNI overlay mode. 
+func clusterRCV1PAzureNetwork(ctx context.Context, request ClusterRequest) (*Cluster, error) { + infra := RCV1PClusterInfra() + if infra == nil { + return nil, fmt.Errorf("RCV1P_SUBSCRIPTION_ID not set, cannot create RCV1P Azure CNI cluster") + } + return prepareCluster(ctx, infra, getAzureNetworkClusterModel("abe2e-rcv1p-azure-v1", request.Location, request.K8sSystemPoolSKU), false, false) +} + // isNotFoundErr checks if an error represents a "not found" response from Azure API func isNotFoundErr(err error) bool { var respErr *azcore.ResponseError diff --git a/e2e/scenario_rcv1p_win_test.go b/e2e/scenario_rcv1p_win_test.go index c05b0607358..d089f8572ff 100644 --- a/e2e/scenario_rcv1p_win_test.go +++ b/e2e/scenario_rcv1p_win_test.go @@ -181,7 +181,7 @@ func Test_RCV1P_Windows_NotOptedIn(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, + Cluster: ClusterRCV1PAzureNetwork, VHD: config.VHDWindows2022Containerd, BootstrapConfigMutator: EmptyBootstrapConfigMutator, Validator: func(ctx context.Context, s *Scenario) { From 4ada2fee4afe8677b460feb3c17f87c293160371 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Mon, 27 Apr 2026 10:41:26 -0700 Subject: [PATCH 51/70] fix: replace legacy ca-refresh cron entry with location-aware version On custom clouds (AGC, Delos) where an older version of this script already installed a ca-refresh cron entry without the location argument, the idempotency grep would match the old entry and skip adding the new one. The old cron entry runs ca-refresh with an empty location, causing get_cert_endpoint_mode to default to rcv1p instead of legacy for ussec/usnat environments. Fix: always remove any existing ca-refresh entry for this script and re-add it with the explicit location argument, ensuring upgraded nodes get the correct endpoint mode on periodic refresh. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- .../artifacts/init-aks-custom-cloud.sh | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index 8bc6a52e112..3b06a012089 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -304,11 +304,17 @@ if [ "$IS_UBUNTU" -eq 1 ] || [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 fi if [ "$install_ca_refresh_schedule" -eq 1 ]; then - if ! crontab -l 2>/dev/null | grep -q "\"$scriptPath\" ca-refresh"; then - # Quote the script path in the cron entry to avoid issues with spaces or special characters. - if ! (crontab -l 2>/dev/null ; printf '%s\n' "0 19 * * * \"$scriptPath\" ca-refresh \"$LOCATION\"") | crontab -; then - echo "Failed to install ca-refresh cron job via crontab" >&2 - fi + # Remove any existing ca-refresh entry for this script (may lack the location argument + # from older VHDs on custom clouds like AGC/Delos) and re-add with the explicit location. + # Without the location argument, ca-refresh defaults endpoint mode to rcv1p which is + # wrong for ussec/usnat legacy environments. + local new_entry="0 19 * * * \"$scriptPath\" ca-refresh \"$LOCATION\"" + local existing + existing=$(crontab -l 2>/dev/null || true) + local filtered + filtered=$(printf '%s\n' "$existing" | grep -v "\"$scriptPath\" ca-refresh" || true) + if ! 
(printf '%s\n' "$filtered"; printf '%s\n' "$new_entry") | sed '/^$/d' | crontab -; then + echo "Failed to install ca-refresh cron job via crontab" >&2 fi fi elif [ "$IS_FLATCAR" -eq 1 ] || [ "$IS_ACL" -eq 1 ]; then From cf07a7170a4c5fc3a512ffd16a08e6d340fa58b3 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Mon, 27 Apr 2026 10:49:05 -0700 Subject: [PATCH 52/70] fix: align Windows wireserver retries to 10 to match Linux parity All wireserver Retry-Command calls in kubernetesfunc.ps1 increased from 5 to 10 retries, matching Linux make_request_with_retry which uses 10 retries with exponential backoff. Under rate-limiting or transient wireserver unavailability, 5 retries (50s) could exhaust before the endpoint recovers. Added comments explaining: - Retry count parity with Linux - Security rationale: wireserver unreachable with -FailOnError is fatal because silently falling back to the OS default trust store would be a security hole if the customer intended hardened certs Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- .../artifacts/init-aks-custom-cloud.sh | 4 +--- staging/cse/windows/kubernetesfunc.ps1 | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index 3b06a012089..fd6a9942b19 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -308,10 +308,8 @@ if [ "$IS_UBUNTU" -eq 1 ] || [ "$IS_MARINER" -eq 1 ] || [ "$IS_AZURELINUX" -eq 1 # from older VHDs on custom clouds like AGC/Delos) and re-add with the explicit location. # Without the location argument, ca-refresh defaults endpoint mode to rcv1p which is # wrong for ussec/usnat legacy environments. 
- local new_entry="0 19 * * * \"$scriptPath\" ca-refresh \"$LOCATION\"" - local existing + new_entry="0 19 * * * \"$scriptPath\" ca-refresh \"$LOCATION\"" existing=$(crontab -l 2>/dev/null || true) - local filtered filtered=$(printf '%s\n' "$existing" | grep -v "\"$scriptPath\" ca-refresh" || true) if ! (printf '%s\n' "$filtered"; printf '%s\n' "$new_entry") | sed '/^$/d' | crontab -; then echo "Failed to install ca-refresh cron job via crontab" >&2 diff --git a/staging/cse/windows/kubernetesfunc.ps1 b/staging/cse/windows/kubernetesfunc.ps1 index b5d83453ffa..7b8110a4ac7 100644 --- a/staging/cse/windows/kubernetesfunc.ps1 +++ b/staging/cse/windows/kubernetesfunc.ps1 @@ -309,7 +309,9 @@ function Should-InstallCACertificatesRefreshTask { try { $optInUri = 'http://168.63.129.16/acms/isOptedInForRootCerts' - $optInResponse = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$optInUri; UseBasicParsing=$true} -Retries 5 -RetryDelaySeconds 10 + # Use 10 retries to match Linux make_request_with_retry resilience against + # transient wireserver unavailability and rate limiting. + $optInResponse = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$optInUri; UseBasicParsing=$true} -Retries 10 -RetryDelaySeconds 10 Write-Log "IsOptedInForRootCerts wireserver response: $($optInResponse.Content)" $optInJson = $optInResponse.Content | ConvertFrom-Json return ($optInJson.IsOptedInForRootCerts -eq $true) @@ -353,10 +355,15 @@ function Get-CACertificates { Write-Log "Get CA certificates. Location: $Location. EndpointMode: $certEndpointMode" } + # Get-CACertificates downloads Azure root CA certificates from wireserver and writes them + # to the local certificate folder. When called with -FailOnError, wireserver unreachable + # after retries is fatal — silently falling back to the OS default trust store would be a + # security hole if the customer intended hardened root certs. 
This matches the Linux + # behavior in init-aks-custom-cloud.sh (is_opted_in_for_root_certs return code 2 = fatal). try { if ($certEndpointMode -eq "legacy") { $uri = 'http://168.63.129.16/machine?comp=acmspackage&type=cacertificates&ext=json' - $rawData = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$uri; UseBasicParsing=$true} -Retries 5 -RetryDelaySeconds 10 + $rawData = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$uri; UseBasicParsing=$true} -Retries 10 -RetryDelaySeconds 10 $caCerts = ($rawData.Content) | ConvertFrom-Json if ($null -eq $caCerts -or $null -eq $caCerts.Certificates -or $caCerts.Certificates.Length -eq 0) { if ($FailOnError) { @@ -377,7 +384,8 @@ function Get-CACertificates { } $optInUri = 'http://168.63.129.16/acms/isOptedInForRootCerts' - $optInResponse = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$optInUri; UseBasicParsing=$true} -Retries 5 -RetryDelaySeconds 10 + # Wireserver opt-in check: 10 retries to match Linux make_request_with_retry. 
+ $optInResponse = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$optInUri; UseBasicParsing=$true} -Retries 10 -RetryDelaySeconds 10 Write-Log "IsOptedInForRootCerts wireserver response: $($optInResponse.Content)" $optInJson = $optInResponse.Content | ConvertFrom-Json if ($optInJson.IsOptedInForRootCerts -ne $true) { @@ -390,7 +398,7 @@ function Get-CACertificates { foreach ($requestType in $operationRequestTypes) { $operationRequestUri = "http://168.63.129.16/machine?comp=acmspackage&type=$requestType&ext=json" - $operationResponse = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$operationRequestUri; UseBasicParsing=$true} -Retries 5 -RetryDelaySeconds 10 + $operationResponse = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$operationRequestUri; UseBasicParsing=$true} -Retries 10 -RetryDelaySeconds 10 $operationJson = ($operationResponse.Content) | ConvertFrom-Json if ($null -eq $operationJson -or $null -eq $operationJson.OperationsInfo) { @@ -408,7 +416,7 @@ function Get-CACertificates { $resourceExt = [IO.Path]::GetExtension($resourceFileName).TrimStart('.') $resourceUri = "http://168.63.129.16/machine?comp=acmspackage&type=$resourceType&ext=$resourceExt" - $certContentResponse = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$resourceUri; UseBasicParsing=$true} -Retries 5 -RetryDelaySeconds 10 + $certContentResponse = Retry-Command -Command 'Invoke-WebRequest' -Args @{Uri=$resourceUri; UseBasicParsing=$true} -Retries 10 -RetryDelaySeconds 10 if ([string]::IsNullOrEmpty($certContentResponse.Content)) { Write-Log "Warning: empty certificate content for $resourceFileName" continue From 09b8d20ad834c89e15f0e0c2b8b305b977136dac Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 29 Apr 2026 16:13:17 -0700 Subject: [PATCH 53/70] fix: enhance RCV1P opt-in tag handling in VMSS creation process Signed-off-by: Ramkumar Chinchani --- e2e/scenario_rcv1p_win_test.go | 1 + e2e/vmss.go | 78 ++++++++++++++++++++++++++++++++++ 2 files 
changed, 79 insertions(+) diff --git a/e2e/scenario_rcv1p_win_test.go b/e2e/scenario_rcv1p_win_test.go index d089f8572ff..ca8ca02ba17 100644 --- a/e2e/scenario_rcv1p_win_test.go +++ b/e2e/scenario_rcv1p_win_test.go @@ -14,6 +14,7 @@ import ( "github.com/Azure/agentbaker/e2e/config" "github.com/Azure/agentbaker/pkg/agent/datamodel" + "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v7" ) // Test_RCV1P_Windows2022 validates RCV1P cert download and Windows certificate store diff --git a/e2e/vmss.go b/e2e/vmss.go index e4c3abc4f6a..4738573bd0d 100644 --- a/e2e/vmss.go +++ b/e2e/vmss.go @@ -478,6 +478,11 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc model := createVMSSModel(ctx, s) + // Record whether the outgoing VMSS model includes the RCV1P opt-in tag. + // This is used after creation to detect platform auto-injection vs tags we set. + rcv1pTagKey := "platformsettings.host_environment.service.platform_optedin_for_rootcerts" + _, requestedRCV1PTag := model.Tags[rcv1pTagKey] + // For scenarios that need VM instance tags (e.g., RCV1P), we must apply tags // before CSE runs because wireserver checks per-VM-instance tags. The only // working method for Uniform VMSS is BeginUpdate (full PUT), which takes ~108s. @@ -565,6 +570,50 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc result += fmt.Sprintf(`az network bastion ssh --target-resource-id "%s" --name "%s-bastion" --resource-group %s --auth-type ssh-key --username azureuser --ssh-key %s`, *vm.VM.ID, *s.Runtime.Cluster.Model.Name, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, config.VMSSHPrivateKeyFileName) + "\n" s.T.Log(result) + // Log VMSS tags for diagnostics (visible in test-log.json via gotestsum --jsonfile). + // For RCV1P tests, compares request tags vs response tags to detect platform auto-injection. 
+ vmssID := "" + if vmssResp.ID != nil { + vmssID = *vmssResp.ID + } + if vmssResp.Tags != nil { + s.T.Logf("VMSS %s (id: %s) tags (%d):", s.Runtime.VMSSName, vmssID, len(vmssResp.Tags)) + for k, v := range vmssResp.Tags { + val := "" + if v != nil { + val = *v + } + if k == rcv1pTagKey { + if requestedRCV1PTag { + s.T.Logf(" tag: %s = %s [RCV1P opt-in tag — set by us in VMSS request]", k, val) + } else { + s.T.Logf(" tag: %s = %s [RCV1P opt-in tag — AUTO-INJECTED by platform, NOT in our VMSS request]", k, val) + } + } else { + s.T.Logf(" tag: %s = %s", k, val) + } + } + // Detect platform auto-injection: tag appeared in response but was NOT in our request. + if respVal, hasTag := vmssResp.Tags[rcv1pTagKey]; hasTag && !requestedRCV1PTag { + val := "" + if respVal != nil { + val = *respVal + } + s.T.Logf("WARNING: platform auto-injected RCV1P opt-in tag %q=%s on VMSS — "+ + "PlatformSettingsOverride feature flag may be causing auto-injection on subscription %s", + rcv1pTagKey, val, s.GetSubscriptionID()) + if s.Tags.RCV1PCertMode && strings.EqualFold(val, "true") { + s.T.Logf("WARNING: auto-injected tag value is 'true' — negative (NotOptedIn) tests will be "+ + "INVALID on this subscription because wireserver will serve certificates regardless of our intent") + } + } + if _, hasTag := vmssResp.Tags[rcv1pTagKey]; !hasTag && s.Tags.RCV1PCertMode { + s.T.Logf(" RCV1P opt-in tag %q NOT present on VMSS (not in request, not auto-injected) — "+ + "wireserver should report IsOptedInForRootCerts=false", rcv1pTagKey) + } + } else { + s.T.Logf("VMSS %s (id: %s) has no tags", s.Runtime.VMSSName, vmssID) + } if !s.Config.SkipSSHConnectivityValidation { var bastErr error vm.SSHClient, bastErr = DialSSHOverBastion(ctx, s.Runtime.Cluster.Bastion, vm.PrivateIP, config.VMSSHPrivateKey) @@ -579,6 +628,35 @@ func CreateVMSS(ctx context.Context, s *Scenario, resourceGroupName string) (*Sc return vm, fmt.Errorf("failed to wait for VM to reach running state: %w", err) } + // Log VM 
instance tags for diagnostics (visible in test-log.json via gotestsum --jsonfile) + vmInstanceID := "" + if vm.VM.ID != nil { + vmInstanceID = *vm.VM.ID + } + if vm.VM.Tags != nil { + s.T.Logf("VM instance %s (id: %s) tags (%d):", *vm.VM.InstanceID, vmInstanceID, len(vm.VM.Tags)) + for k, v := range vm.VM.Tags { + val := "" + if v != nil { + val = *v + } + if k == rcv1pTagKey { + if requestedRCV1PTag { + s.T.Logf(" tag: %s = %s [RCV1P opt-in tag — inherited from VMSS, set by us]", k, val) + } else { + s.T.Logf(" tag: %s = %s [RCV1P opt-in tag — inherited from VMSS, AUTO-INJECTED by platform]", k, val) + } + } else { + s.T.Logf(" tag: %s = %s", k, val) + } + } + if _, hasTag := vm.VM.Tags[rcv1pTagKey]; !hasTag && s.Tags.RCV1PCertMode { + s.T.Logf(" [RCV1P opt-in tag %q NOT present on VM instance — this is expected for negative tests]", rcv1pTagKey) + } + } else { + s.T.Logf("VM instance %s (id: %s) has no tags", *vm.VM.InstanceID, vmInstanceID) + } + return &ScenarioVM{ VMSS: &vmssResp.VirtualMachineScaleSet, PrivateIP: vm.PrivateIP, From b99bede44015ab68b02be716f94a56dd4ed99496 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 6 May 2026 14:41:51 -0700 Subject: [PATCH 54/70] fix: use Azure CNI cluster for Windows RCV1P tests Windows baseTemplateWindows() configures NBC with NetworkPlugin=azure and NetworkPluginMode=overlay. Using a kubenet cluster causes azure-vnet plugin IPAM failures on the node. Switch all Windows RCV1P tests to use ClusterRCV1PAzureNetwork which creates an Azure CNI overlay cluster in the RCV1P subscription. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/scenario_rcv1p_win_test.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/e2e/scenario_rcv1p_win_test.go b/e2e/scenario_rcv1p_win_test.go index ca8ca02ba17..b2188bd2dd5 100644 --- a/e2e/scenario_rcv1p_win_test.go +++ b/e2e/scenario_rcv1p_win_test.go @@ -29,7 +29,7 @@ func Test_RCV1P_Windows2022(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, + Cluster: ClusterRCV1PAzureNetwork, VHD: config.VHDWindows2022Containerd, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -52,7 +52,7 @@ func Test_RCV1P_Windows23H2(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, + Cluster: ClusterRCV1PAzureNetwork, VHD: config.VHDWindows23H2, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -75,7 +75,7 @@ func Test_RCV1P_Windows2025(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, + Cluster: ClusterRCV1PAzureNetwork, VHD: config.VHDWindows2025, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) @@ -104,7 +104,7 @@ func Test_RCV1P_Windows2022Gen2(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, + Cluster: ClusterRCV1PAzureNetwork, VHD: config.VHDWindows2022ContainerdGen2, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -127,7 +127,7 @@ func Test_RCV1P_Windows23H2Gen2(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, + Cluster: ClusterRCV1PAzureNetwork, VHD: config.VHDWindows23H2Gen2, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), @@ -150,7 +150,7 @@ func Test_RCV1P_Windows2025Gen2(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - 
Cluster: ClusterRCV1PKubenet, + Cluster: ClusterRCV1PAzureNetwork, VHD: config.VHDWindows2025Gen2, VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) From cafa6ecdc8dbf78194f24cb90fd9ae57cef14b50 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 6 May 2026 14:43:24 -0700 Subject: [PATCH 55/70] revert: drop 'REVERT ME' cluster switching commits (now superseded) The following commits are superseded by the permanent fix in c71b1eb24e which correctly assigns ClusterRCV1PAzureNetwork to Windows RCV1P tests and keeps ClusterRCV1PKubenet for Linux RCV1P tests: - 286c711c9d REVERT ME: use dedicated kubenet cluster for RCV1P tests - 4de7fe5022 REVERT ME: use Azure CNI cluster for Windows RCV1P tests Both are no-ops against the current state and can be safely squashed out during final interactive rebase before merge. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani From 00804a3def583a06dc97acf49a88b0e88357d944 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 6 May 2026 15:22:54 -0700 Subject: [PATCH 56/70] revert: drop canary validator and wireserver polling debug commits Reverts: - 5c2ed65603 (canary check that guarantees test failure) - 07d1c4402a (5-minute wireserver polling loop - provisioning regression) The canary ValidateFileHasContent for a nonexistent string causes guaranteed test failures. The wireserver polling adds up to 5 minutes of sleep to every Linux RCV1P node provisioning. Remaining diagnostic commits (wireserver endpoint probing, azcopy logging, verbose output) are kept for initial rollout observability. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/validators.go | 41 +++++++++++++++++- .../artifacts/init-aks-custom-cloud.sh | 42 +++++-------------- 2 files changed, 51 insertions(+), 32 deletions(-) diff --git a/e2e/validators.go b/e2e/validators.go index 9c095d86ec4..3377a92a954 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -2896,7 +2896,46 @@ func rcv1pTrustStoreDir(s *Scenario) string { func ValidateRCV1PCertModeWindows(ctx context.Context, s *Scenario) { s.T.Helper() - // Validate the provisioning logshows wireserver was queried and returned opted-in + // REVERT ME: Diagnostic block — probe wireserver endpoints and dump CSE log tail from the VM + // so we can see exactly what the wireserver returns for operationrequests and what the CSE logged. + diagCommand := []string{ + "$ErrorActionPreference = 'Continue'", + "Write-Host '=== DIAGNOSTIC: probing wireserver rcv1p endpoints ==='", + "try {", + " $optIn = Invoke-WebRequest -Uri 'http://168.63.129.16/acms/isOptedInForRootCerts' -UseBasicParsing -TimeoutSec 30", + " Write-Host \"isOptedInForRootCerts: $($optIn.Content)\"", + "} catch { Write-Host \"isOptedInForRootCerts ERROR: $_\" }", + "try {", + " $root = Invoke-WebRequest -Uri 'http://168.63.129.16/machine?comp=acmspackage&type=operationrequestsroot&ext=json' -UseBasicParsing -TimeoutSec 30", + " Write-Host \"operationrequestsroot status=$($root.StatusCode) length=$($root.Content.Length)\"", + " Write-Host \"operationrequestsroot content: $($root.Content)\"", + "} catch { Write-Host \"operationrequestsroot ERROR: $_\" }", + "try {", + " $intermediate = Invoke-WebRequest -Uri 'http://168.63.129.16/machine?comp=acmspackage&type=operationrequestsintermediate&ext=json' -UseBasicParsing -TimeoutSec 30", + " Write-Host \"operationrequestsintermediate status=$($intermediate.StatusCode) length=$($intermediate.Content.Length)\"", + " Write-Host \"operationrequestsintermediate content: 
$($intermediate.Content)\"", + "} catch { Write-Host \"operationrequestsintermediate ERROR: $_\" }", + "try {", + " $legacy = Invoke-WebRequest -Uri 'http://168.63.129.16/machine?comp=acmspackage&type=cacertificates&ext=json' -UseBasicParsing -TimeoutSec 30", + " Write-Host \"legacy cacertificates status=$($legacy.StatusCode) length=$($legacy.Content.Length)\"", + " $legacyJson = $legacy.Content | ConvertFrom-Json", + " if ($legacyJson.Certificates) { Write-Host \"legacy cert count: $($legacyJson.Certificates.Length)\" } else { Write-Host 'legacy: no Certificates array' }", + "} catch { Write-Host \"legacy cacertificates ERROR: $_\" }", + "Write-Host '=== DIAGNOSTIC: C:\\ca folder contents ==='", + "if (Test-Path 'C:\\ca') { Get-ChildItem -Path 'C:\\ca' -File | ForEach-Object { Write-Host \" $($_.Name) ($($_.Length) bytes)\" } } else { Write-Host 'C:\\ca does not exist' }", + "Write-Host '=== DIAGNOSTIC: CSE log tail (last 60 lines with CA/cert/wireserver) ==='", + "if (Test-Path 'C:\\AzureData\\CustomDataSetupScript.log') {", + " Get-Content 'C:\\AzureData\\CustomDataSetupScript.log' -Tail 200 | Where-Object { $_ -match 'CA |cert|wireserver|optedin|operation|acms|Write cert|Warning' } | Select-Object -Last 60 | ForEach-Object { Write-Host $_ }", + "} else { Write-Host 'CSE log not found' }", + "Write-Host '=== END DIAGNOSTIC ==='", + } + diagResult := execScriptOnVMForScenario(ctx, s, strings.Join(diagCommand, "\n")) + s.T.Logf("REVERT ME: wireserver diagnostics stdout:\n%s", diagResult.stdout) + if diagResult.stderr != "" { + s.T.Logf("REVERT ME: wireserver diagnostics stderr:\n%s", diagResult.stderr) + } + + // Validate the provisioning log shows wireserver was queried and returned opted-in ValidateFileHasContent(ctx, s, "C:\\AzureData\\CustomDataSetupScript.log", "IsOptedInForRootCerts wireserver response:") diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index 
fd6a9942b19..f8101ac7630 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -86,40 +86,20 @@ function make_request_with_retry { # immediately instead of letting the node run with an insecure certificate configuration. function is_opted_in_for_root_certs { local opt_in_response - local request_status - local poll_attempt=1 - local max_poll_attempts=30 - local poll_interval=10 - - # Poll wireserver for up to ~5 minutes to allow platform metadata to sync. - # The VM instance tag triggers a Fabric Controller goal state (CCF) update, - # which must propagate to the host agent before wireserver can reflect it. - # FC goal state propagation can take several minutes in practice. - while [ $poll_attempt -le $max_poll_attempts ]; do - echo "is_opted_in_for_root_certs: poll attempt ${poll_attempt}/${max_poll_attempts}" - - opt_in_response=$(make_request_with_retry "${WIRESERVER_ENDPOINT}/acms/isOptedInForRootCerts") - request_status=$? - - echo "is_opted_in_for_root_certs: wireserver response (status=${request_status}): '${opt_in_response}'" - - if [ $request_status -ne 0 ] || [ -z "$opt_in_response" ]; then - echo "Warning: failed to determine IsOptedInForRootCerts state on attempt ${poll_attempt}" - elif echo "$opt_in_response" | grep -q "IsOptedInForRootCerts=true"; then - echo "IsOptedInForRootCerts=true (found on attempt ${poll_attempt})" - return 0 - fi - if [ $poll_attempt -lt $max_poll_attempts ]; then - echo "is_opted_in_for_root_certs: not opted in yet, waiting ${poll_interval}s before retry..." - sleep $poll_interval - fi + opt_in_response=$(make_request_with_retry "${WIRESERVER_ENDPOINT}/acms/isOptedInForRootCerts") + local request_status=$? 
+ if [ $request_status -ne 0 ] || [ -z "$opt_in_response" ]; then + echo "Warning: failed to determine IsOptedInForRootCerts state" + return 1 + fi - poll_attempt=$((poll_attempt + 1)) - done + if echo "$opt_in_response" | grep -q "IsOptedInForRootCerts=true"; then + echo "IsOptedInForRootCerts=true" + return 0 + fi - echo "Skipping custom cloud root cert installation because IsOptedInForRootCerts is not true after ${max_poll_attempts} attempts" - echo "Last wireserver response: '${opt_in_response}'" + echo "Skipping custom cloud root cert installation because IsOptedInForRootCerts is not true" return 1 } From 5904637a087e41119c052169178ec4aea7856311 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Wed, 6 May 2026 21:57:14 -0700 Subject: [PATCH 57/70] feat(e2e): auto-detect RCV1P feature flag on E2E subscription When RCV1P_SUBSCRIPTION_ID is not explicitly set, the skip logic now checks whether the E2E subscription (E2E_SUBSCRIPTION_ID) has the PlatformSettingsOverride feature flag registered. If it does, the RCV1P tests run automatically using the E2E subscription. This enables MSFT tenant pipelines (where the E2E subscription is already enrolled) to run RCV1P tests without a separate variable. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/scenario_rcv1p_test.go | 63 ++++++++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 23 deletions(-) diff --git a/e2e/scenario_rcv1p_test.go b/e2e/scenario_rcv1p_test.go index 4504ad976b6..27f9e92e0ec 100644 --- a/e2e/scenario_rcv1p_test.go +++ b/e2e/scenario_rcv1p_test.go @@ -37,23 +37,53 @@ import ( // if the subscription has the PlatformSettingsOverride feature registered. const rcv1pOptInTag = "platformsettings.host_environment.service.platform_optedin_for_rootcerts" -// skipIfRCV1PNotConfigured skips the test when the RCV1P subscription is not configured. 
-// This happens in regular CI runs where the RCV1P variable group is not linked, causing -// Azure DevOps to pass the literal unexpanded string "$(RCV1P_SUBSCRIPTION_ID)". -// It always logs the feature flag status on the E2E subscription for diagnostics, -// and verifies the flag is registered on the RCV1P subscription when available. +// skipIfRCV1PNotConfigured skips the test when no subscription with the RCV1P feature flag +// is available. It checks in order: +// 1. Explicit RCV1P_SUBSCRIPTION_ID (dedicated RCV1P subscription) +// 2. E2E_SUBSCRIPTION_ID auto-detection (checks if the feature flag is registered) +// +// When E2E_SUBSCRIPTION_ID has the feature flag registered (e.g., MSFT tenant pipelines), +// the RCV1P tests run automatically without needing a separate variable. func skipIfRCV1PNotConfigured(t *testing.T) { t.Helper() - // Always log feature flag status on the default E2E subscription for diagnostics - logE2ESubscriptionFeatureFlag(t) subID := strings.TrimSpace(config.Config.RCV1PSubscriptionID) - if subID == "" || strings.HasPrefix(subID, "$(") { - t.Skip("RCV1P_SUBSCRIPTION_ID not set or not resolved, skipping RCV1P cert mode test") + if subID != "" && !strings.HasPrefix(subID, "$(") { + // Explicit RCV1P subscription configured — verify it has the feature flag + checkPlatformSettingsOverrideFeatureFlag(t, subID, config.RCV1PAzure, true) + return + } + + // No explicit RCV1P subscription — try auto-detecting from the E2E subscription + t.Log("RCV1P_SUBSCRIPTION_ID not set, checking if E2E subscription has PlatformSettingsOverride feature flag...") + e2eSubID := strings.TrimSpace(config.Config.SubscriptionID) + if e2eSubID == "" { + t.Skip("neither RCV1P_SUBSCRIPTION_ID nor E2E_SUBSCRIPTION_ID is set, skipping RCV1P test") + } + + e2eAzure, err := config.NewAzureClient() + if err != nil { + t.Skipf("failed to create E2E Azure client for feature flag auto-detection: %v", err) + } + + registered, err := queryFeatureFlag(t.Context(), e2eSubID, 
e2eAzure) + if err != nil { + t.Skipf("failed to query feature flag on E2E subscription %s: %v", e2eSubID, err) } - checkPlatformSettingsOverrideFeatureFlag(t, subID, config.RCV1PAzure, true) + if !registered { + t.Skipf("E2E subscription %s does not have PlatformSettingsOverride registered, skipping RCV1P test", e2eSubID) + } + + // E2E subscription is enrolled — configure RCV1P globals so the rest of the test infra works + t.Logf("auto-detected PlatformSettingsOverride on E2E subscription %s, using it for RCV1P tests", e2eSubID) + rcv1pAutoDetectOnce.Do(func() { + config.Config.RCV1PSubscriptionID = e2eSubID + config.RCV1PAzure = e2eAzure + }) } +var rcv1pAutoDetectOnce sync.Once + var ( featureFlagChecks sync.Map // subscriptionID -> *featureFlagResult ) @@ -90,19 +120,6 @@ func checkPlatformSettingsOverrideFeatureFlag(t *testing.T, subscriptionID strin } } -// logE2ESubscriptionFeatureFlag logs the PlatformSettingsOverride feature flag status on the -// default E2E subscription for diagnostic purposes. This helps understand wireserver behavior -// (e.g., IsOptedInForRootCerts responses) even in non-RCV1P test runs. 
-func logE2ESubscriptionFeatureFlag(t *testing.T) { - t.Helper() - e2eAzure, err := config.NewAzureClient() - if err != nil { - t.Logf("WARNING: failed to create E2E Azure client for feature flag check: %v", err) - return - } - checkPlatformSettingsOverrideFeatureFlag(t, config.Config.SubscriptionID, e2eAzure, false) -} - func queryFeatureFlag(ctx context.Context, subscriptionID string, client *config.AzureClient) (bool, error) { url := fmt.Sprintf( "https://management.azure.com/subscriptions/%s/providers/Microsoft.Features/providers/Microsoft.Compute/features/PlatformSettingsOverride?api-version=2021-07-01", From c709eae03f2aa9f7038ba0eae6e538cd19d7f92a Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 7 May 2026 09:33:56 -0700 Subject: [PATCH 58/70] fix(e2e): skip NotOptedIn tests on auto-detected enrolled subscriptions On subscriptions with PlatformSettingsOverride registered, the platform auto-injects the opt-in tag on ALL VMSSes, making the 'not opted in' negative test scenario impossible. Skip these tests when the RCV1P subscription was auto-detected from the E2E subscription. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/scenario_rcv1p_test.go | 22 +++++++++++++++++++++- e2e/scenario_rcv1p_win_test.go | 1 + 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/e2e/scenario_rcv1p_test.go b/e2e/scenario_rcv1p_test.go index 27f9e92e0ec..eafe8d52524 100644 --- a/e2e/scenario_rcv1p_test.go +++ b/e2e/scenario_rcv1p_test.go @@ -79,10 +79,29 @@ func skipIfRCV1PNotConfigured(t *testing.T) { rcv1pAutoDetectOnce.Do(func() { config.Config.RCV1PSubscriptionID = e2eSubID config.RCV1PAzure = e2eAzure + rcv1pAutoDetected = true }) } -var rcv1pAutoDetectOnce sync.Once +var ( + rcv1pAutoDetectOnce sync.Once + // rcv1pAutoDetected is true when the RCV1P subscription was auto-detected from the + // E2E subscription rather than explicitly set via RCV1P_SUBSCRIPTION_ID. 
On auto-detected + // (enrolled) subscriptions, the platform auto-injects the opt-in tag on ALL VMSSes, + // making "not opted in" negative tests impossible. + rcv1pAutoDetected bool +) + +// skipNotOptedInOnAutoDetect skips NotOptedIn negative tests when the RCV1P subscription was +// auto-detected. On enrolled subscriptions, the platform auto-injects the opt-in tag on ALL +// VMSSes, making it impossible to test the "not opted in" scenario. +func skipNotOptedInOnAutoDetect(t *testing.T) { + t.Helper() + if rcv1pAutoDetected { + t.Skip("skipping NotOptedIn test: RCV1P subscription was auto-detected from E2E subscription — " + + "platform auto-injects opt-in tag on all VMSSes in enrolled subscriptions") + } +} var ( featureFlagChecks sync.Map // subscriptionID -> *featureFlagResult @@ -307,6 +326,7 @@ func Test_RCV1P_ACL(t *testing.T) { // subscription feature alone is not sufficient — the VM must also be explicitly tagged. func Test_RCV1P_NotOptedIn(t *testing.T) { skipIfRCV1PNotConfigured(t) + skipNotOptedInOnAutoDetect(t) RunScenario(t, &Scenario{ Description: "Tests RCV1P cert mode without VM opt-in tag; expects no cert installation", AzureClient: config.RCV1PAzure, diff --git a/e2e/scenario_rcv1p_win_test.go b/e2e/scenario_rcv1p_win_test.go index b2188bd2dd5..948f5b3d263 100644 --- a/e2e/scenario_rcv1p_win_test.go +++ b/e2e/scenario_rcv1p_win_test.go @@ -174,6 +174,7 @@ func Test_RCV1P_Windows2025Gen2(t *testing.T) { // script correctly skips certificate download and refresh task registration. 
func Test_RCV1P_Windows_NotOptedIn(t *testing.T) { skipIfRCV1PNotConfigured(t) + skipNotOptedInOnAutoDetect(t) RunScenario(t, &Scenario{ Description: "Tests RCV1P cert mode on Windows without VM opt-in tag; expects no cert installation", AzureClient: config.RCV1PAzure, From e1959ba2b99bb95c3e8c99fb7bd1314e80c8d058 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 7 May 2026 10:36:31 -0700 Subject: [PATCH 59/70] fix(e2e): use caller context in getCustomScriptExtensionStatus Replace context.Background() with the caller's context so the VM instance view fetch respects test/scenario timeouts instead of potentially hanging indefinitely. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/test_helpers.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/e2e/test_helpers.go b/e2e/test_helpers.go index bac2366c3a6..c020d761de5 100644 --- a/e2e/test_helpers.go +++ b/e2e/test_helpers.go @@ -350,7 +350,7 @@ func prepareAKSNode(ctx context.Context, s *Scenario) (*ScenarioVM, error) { require.NoError(s.T, err, "create vmss %q, check %s for vm logs", s.Runtime.VMSSName, testDir(s.T)) } - err = getCustomScriptExtensionStatus(s, scenarioVM.VM) + err = getCustomScriptExtensionStatus(ctx, s, scenarioVM.VM) require.NoError(s.T, err) if !s.Config.SkipDefaultValidation { @@ -464,12 +464,11 @@ func validateVM(ctx context.Context, s *Scenario) { } } -func getCustomScriptExtensionStatus(s *Scenario, vmssVM *armcompute.VirtualMachineScaleSetVM) error { +func getCustomScriptExtensionStatus(ctx context.Context, s *Scenario, vmssVM *armcompute.VirtualMachineScaleSetVM) error { // Re-fetch the VM with instance view to ensure we have fresh extension status data. // The VM object passed in may have been fetched before the CSE finished executing, // so the extension status message could be empty or stale. 
if vmssVM.InstanceID != nil { - ctx := context.Background() freshVM, err := s.GetAzure().VMSSVM.Get(ctx, *s.Runtime.Cluster.Model.Properties.NodeResourceGroup, s.Runtime.VMSSName, From dee66b328ab913a7b65cf25415107a2605f45d8c Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 7 May 2026 13:15:34 -0700 Subject: [PATCH 60/70] fix(e2e): remove TrustedLaunch from non-Gen2 Windows 2025 RCV1P test The windows-2025 image does not support TrustedLaunch, only the Gen2 variant does. This matches the pattern on main where Test_Windows2025 uses EmptyVMConfigMutator without TrustedLaunch. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/scenario_rcv1p_win_test.go | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/e2e/scenario_rcv1p_win_test.go b/e2e/scenario_rcv1p_win_test.go index 948f5b3d263..3621446896b 100644 --- a/e2e/scenario_rcv1p_win_test.go +++ b/e2e/scenario_rcv1p_win_test.go @@ -75,12 +75,9 @@ func Test_RCV1P_Windows2025(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PAzureNetwork, - VHD: config.VHDWindows2025, - VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { - vmss.Properties = addTrustedLaunchToVMSS(vmss.Properties) - rcv1pOptInVMConfigMutator(vmss) - }, + Cluster: ClusterRCV1PAzureNetwork, + VHD: config.VHDWindows2025, + VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { Windows2025BootstrapConfigMutator(t, nbc) From 117adaa82caab87b44e1ece261190732297b6c22 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 7 May 2026 13:16:27 -0700 Subject: [PATCH 61/70] fix: return code 2 when wireserver is unreachable in is_opted_in_for_root_certs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The function documented return code 2 for 'wireserver unreachable' and the caller 
correctly checked for it, but the implementation returned 1 (not opted in) on request failure. This silently skipped cert installation on wireserver outages — a security hole if the subscription is enrolled for hardened certs. Now returns 2 on failure so the caller treats it as fatal, matching the documented contract. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index f8101ac7630..f22a78fd34f 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -90,8 +90,8 @@ function is_opted_in_for_root_certs { opt_in_response=$(make_request_with_retry "${WIRESERVER_ENDPOINT}/acms/isOptedInForRootCerts") local request_status=$? if [ $request_status -ne 0 ] || [ -z "$opt_in_response" ]; then - echo "Warning: failed to determine IsOptedInForRootCerts state" - return 1 + echo "ERROR: wireserver unreachable or returned empty response for IsOptedInForRootCerts" + return 2 fi if echo "$opt_in_response" | grep -q "IsOptedInForRootCerts=true"; then From 2cb6a99752e4b5441349bdb47645e2a0c8544247 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 7 May 2026 13:17:56 -0700 Subject: [PATCH 62/70] fix: throw when opted-in but no certs downloaded with -FailOnError When IsOptedInForRootCerts is true but no certificates are downloaded, Get-CACertificates only logged a warning and returned $false. Because the caller (BasePrep) doesn't check the return value, provisioning continued without the required CA set. Now throws when -FailOnError is set and no certs were downloaded, matching the fail-closed contract.
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- staging/cse/windows/kubernetesfunc.ps1 | 3 +++ staging/cse/windows/kubernetesfunc.tests.ps1 | 16 ++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/staging/cse/windows/kubernetesfunc.ps1 b/staging/cse/windows/kubernetesfunc.ps1 index 7b8110a4ac7..36000558b91 100644 --- a/staging/cse/windows/kubernetesfunc.ps1 +++ b/staging/cse/windows/kubernetesfunc.ps1 @@ -430,6 +430,9 @@ function Get-CACertificates { } if (-not $downloadedAny) { + if ($FailOnError) { + throw "No CA certificates were downloaded in rcv1p mode despite IsOptedInForRootCerts=true" + } Write-Log "Warning: no CA certificates were downloaded in rcv1p mode" } diff --git a/staging/cse/windows/kubernetesfunc.tests.ps1 b/staging/cse/windows/kubernetesfunc.tests.ps1 index 924ccf13fc5..8cf053a5d1d 100644 --- a/staging/cse/windows/kubernetesfunc.tests.ps1 +++ b/staging/cse/windows/kubernetesfunc.tests.ps1 @@ -206,6 +206,22 @@ Describe 'Get-CACertificates' { { Get-CACertificates -Location 'ussecwest' -FailOnError } | Should -Throw '*CA certificates rawdata is empty*' } + It 'throws when opted in but no certs downloaded with -FailOnError' { + $script:callCount = 0 + Mock Retry-Command -MockWith { + param($Command, $Args, $Retries, $RetryDelaySeconds) + $script:callCount++ + $uri = $PSBoundParameters['Args'].Uri + if ($uri -match 'isOptedInForRootCerts') { + return [PSCustomObject]@{ Content = '{"IsOptedInForRootCerts":true}' } + } + # Return empty operation info for cert endpoints + return [PSCustomObject]@{ Content = '{"OperationsInfo":[]}' } + } + + { Get-CACertificates -Location 'southcentralus' -FailOnError } | Should -Throw '*No CA certificates were downloaded*' + } + It 'falls back to legacy endpoint when called without -Location (backward compat)' { $script:retryUris = @() Mock Retry-Command -MockWith { From ccc57f8b7e85cb5c9619180d077ee1cd4dd6fc1d Mon Sep 17 00:00:00 2001 From: Ramkumar 
Chinchani Date: Thu, 7 May 2026 13:57:11 -0700 Subject: [PATCH 63/70] e2e: use branch-built CSE zip for Windows RCV1P tests The published CSE package (aks-windows-cse-scripts-current.zip) does not contain the RCV1P code (Get-CACertificates -Location, -FailOnError, IsOptedInForRootCerts, Register-CACertificatesRefreshTask). Without this override, Windows RCV1P E2E tests pass vacuously using the old code path. This builds a CSE zip from staging/cse/windows/ at test time, uploads it to blob storage with a SAS URL, and overrides CseScriptsPackageURL so the VMs download the branch's CSE scripts. TODO(rcv1p): remove the branch CSE zip override and rcv1pWindowsCSEMutator once the RCV1P code ships in a published CSE package. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/scenario_rcv1p_test.go | 135 +++++++++++++++++++++++++++++++++ e2e/scenario_rcv1p_win_test.go | 19 +++-- 2 files changed, 149 insertions(+), 5 deletions(-) diff --git a/e2e/scenario_rcv1p_test.go b/e2e/scenario_rcv1p_test.go index eafe8d52524..ca94f2ca67f 100644 --- a/e2e/scenario_rcv1p_test.go +++ b/e2e/scenario_rcv1p_test.go @@ -18,12 +18,16 @@ package e2e import ( + "archive/zip" "context" "fmt" "io" + "os" + "path/filepath" "strings" "sync" "testing" + "time" "github.com/Azure/agentbaker/e2e/config" "github.com/Azure/agentbaker/pkg/agent/datamodel" @@ -184,6 +188,137 @@ func rcv1pVMInstanceTags() map[string]*string { } } +// TODO(rcv1p): remove the branch CSE zip override once the RCV1P code ships in a published +// CSE package on packages.aks.azure.com. Until then, Windows E2E tests would exercise the +// old Get-CACertificates (without -Location, -FailOnError, or IsOptedInForRootCerts) from +// the released aks-windows-cse-scripts-current.zip instead of the PR's version. 
+var ( + branchCSEZipURL string + branchCSEZipErr error + branchCSEZipOnce sync.Once +) + +// getOrBuildBranchCSEPackageURL builds a CSE zip from staging/cse/windows/ (matching the +// pipeline packaging in .pipelines/scripts/windows_package_cse.sh) and uploads it to the +// E2E blob storage. Returns a SAS-signed URL. Uses sync.Once so the zip is built and +// uploaded exactly once across all parallel tests. +func getOrBuildBranchCSEPackageURL(t *testing.T) string { + t.Helper() + branchCSEZipOnce.Do(func() { + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute) + defer cancel() + branchCSEZipURL, branchCSEZipErr = buildAndUploadCSEZip(ctx) + }) + if branchCSEZipErr != nil { + t.Fatalf("failed to build/upload branch CSE zip: %v", branchCSEZipErr) + } + t.Logf("using branch CSE package URL: %s", branchCSEZipURL) + return branchCSEZipURL +} + +func buildAndUploadCSEZip(ctx context.Context) (string, error) { + repoRoot, err := findRepoRoot() + if err != nil { + return "", fmt.Errorf("find repo root: %w", err) + } + cseDir := filepath.Join(repoRoot, "staging", "cse", "windows") + + tmpFile, err := os.CreateTemp("", "aks-windows-cse-scripts-branch-*.zip") + if err != nil { + return "", fmt.Errorf("create temp file: %w", err) + } + defer os.Remove(tmpFile.Name()) + defer tmpFile.Close() + + zw := zip.NewWriter(tmpFile) + err = filepath.Walk(cseDir, func(path string, info os.FileInfo, walkErr error) error { + if walkErr != nil { + return walkErr + } + rel, err := filepath.Rel(cseDir, path) + if err != nil { + return err + } + rel = filepath.ToSlash(rel) + if rel == "." 
{ + return nil + } + // skip test files and debug helper (matches windows_package_cse.sh) + if strings.HasSuffix(rel, ".tests.ps1") || strings.Contains(rel, ".tests.suites") { + if info.IsDir() { + return filepath.SkipDir + } + return nil + } + if rel == "README" || rel == "debug/update-scripts.ps1" { + return nil + } + if info.IsDir() { + return nil + } + w, err := zw.Create(rel) + if err != nil { + return fmt.Errorf("create zip entry %s: %w", rel, err) + } + f, err := os.Open(path) + if err != nil { + return fmt.Errorf("open %s: %w", path, err) + } + defer f.Close() + _, err = io.Copy(w, f) + return err + }) + if err != nil { + return "", fmt.Errorf("build zip: %w", err) + } + if err := zw.Close(); err != nil { + return "", fmt.Errorf("close zip writer: %w", err) + } + + if _, err := tmpFile.Seek(0, io.SeekStart); err != nil { + return "", fmt.Errorf("seek temp file: %w", err) + } + + blobName := fmt.Sprintf("cse-packages/aks-windows-cse-scripts-branch-%s.zip", + time.Now().UTC().Format("20060102-150405")) + url, err := config.Azure.UploadAndGetSignedLink(ctx, blobName, tmpFile) + if err != nil { + return "", fmt.Errorf("upload CSE zip: %w", err) + } + return url, nil +} + +func findRepoRoot() (string, error) { + dir, err := os.Getwd() + if err != nil { + return "", err + } + for { + if _, err := os.Stat(filepath.Join(dir, "go.mod")); err == nil { + if filepath.Base(dir) == "e2e" { + dir = filepath.Dir(dir) + continue + } + return dir, nil + } + parent := filepath.Dir(dir) + if parent == dir { + return "", fmt.Errorf("could not find repo root (go.mod) from %s", dir) + } + dir = parent + } +} + +// rcv1pWindowsCSEMutator returns a BootstrapConfigMutator that overrides CseScriptsPackageURL +// to use the branch-built CSE zip containing the RCV1P code. +// TODO(rcv1p): remove this once the RCV1P code ships in a published CSE package. 
+func rcv1pWindowsCSEMutator(t *testing.T) func(*datamodel.NodeBootstrappingConfiguration) { + cseURL := getOrBuildBranchCSEPackageURL(t) + return func(nbc *datamodel.NodeBootstrappingConfiguration) { + nbc.ContainerService.Properties.WindowsProfile.CseScriptsPackageURL = cseURL + } +} + // Test_RCV1P_Ubuntu2204 validates RCV1P cert download and trust store installation on Ubuntu 22.04. // Ubuntu uses /usr/local/share/ca-certificates/ as the cert drop folder and update-ca-certificates // to rebuild the trust bundle. diff --git a/e2e/scenario_rcv1p_win_test.go b/e2e/scenario_rcv1p_win_test.go index 3621446896b..048deaf5cd1 100644 --- a/e2e/scenario_rcv1p_win_test.go +++ b/e2e/scenario_rcv1p_win_test.go @@ -21,6 +21,7 @@ import ( // installation on Windows Server 2022. func Test_RCV1P_Windows2022(t *testing.T) { skipIfRCV1PNotConfigured(t) + cseMutator := rcv1pWindowsCSEMutator(t) // TODO(rcv1p): remove once RCV1P ships in published CSE package RunScenario(t, &Scenario{ Description: "Tests RCV1P cert mode on Windows Server 2022 with VM opt-in tag", AzureClient: config.RCV1PAzure, @@ -33,7 +34,7 @@ func Test_RCV1P_Windows2022(t *testing.T) { VHD: config.VHDWindows2022Containerd, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), - BootstrapConfigMutator: EmptyBootstrapConfigMutator, + BootstrapConfigMutator: cseMutator, Validator: func(ctx context.Context, s *Scenario) { ValidateRCV1PCertModeWindows(ctx, s) }, @@ -44,6 +45,7 @@ func Test_RCV1P_Windows2022(t *testing.T) { // Test_RCV1P_Windows23H2 validates RCV1P on Windows Server 23H2, the annual channel release. 
func Test_RCV1P_Windows23H2(t *testing.T) { skipIfRCV1PNotConfigured(t) + cseMutator := rcv1pWindowsCSEMutator(t) // TODO(rcv1p): remove once RCV1P ships in published CSE package RunScenario(t, &Scenario{ Description: "Tests RCV1P cert mode on Windows Server 23H2 with VM opt-in tag", AzureClient: config.RCV1PAzure, @@ -56,7 +58,7 @@ func Test_RCV1P_Windows23H2(t *testing.T) { VHD: config.VHDWindows23H2, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), - BootstrapConfigMutator: EmptyBootstrapConfigMutator, + BootstrapConfigMutator: cseMutator, Validator: func(ctx context.Context, s *Scenario) { ValidateRCV1PCertModeWindows(ctx, s) }, @@ -67,6 +69,7 @@ func Test_RCV1P_Windows23H2(t *testing.T) { // Test_RCV1P_Windows2025 validates RCV1P on Windows Server 2025 (non-gen2). func Test_RCV1P_Windows2025(t *testing.T) { skipIfRCV1PNotConfigured(t) + cseMutator := rcv1pWindowsCSEMutator(t) // TODO(rcv1p): remove once RCV1P ships in published CSE package RunScenario(t, &Scenario{ Description: "Tests RCV1P cert mode on Windows Server 2025 with VM opt-in tag", AzureClient: config.RCV1PAzure, @@ -80,6 +83,7 @@ func Test_RCV1P_Windows2025(t *testing.T) { VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + cseMutator(nbc) Windows2025BootstrapConfigMutator(t, nbc) }, Validator: func(ctx context.Context, s *Scenario) { @@ -93,6 +97,7 @@ func Test_RCV1P_Windows2025(t *testing.T) { // installation on Windows Server 2022 Gen2. Covers the gen2 pipeline job. 
func Test_RCV1P_Windows2022Gen2(t *testing.T) { skipIfRCV1PNotConfigured(t) + cseMutator := rcv1pWindowsCSEMutator(t) // TODO(rcv1p): remove once RCV1P ships in published CSE package RunScenario(t, &Scenario{ Description: "Tests RCV1P cert mode on Windows Server 2022 Gen2 with VM opt-in tag", AzureClient: config.RCV1PAzure, @@ -105,7 +110,7 @@ func Test_RCV1P_Windows2022Gen2(t *testing.T) { VHD: config.VHDWindows2022ContainerdGen2, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), - BootstrapConfigMutator: EmptyBootstrapConfigMutator, + BootstrapConfigMutator: cseMutator, Validator: func(ctx context.Context, s *Scenario) { ValidateRCV1PCertModeWindows(ctx, s) }, @@ -116,6 +121,7 @@ func Test_RCV1P_Windows2022Gen2(t *testing.T) { // Test_RCV1P_Windows23H2Gen2 validates RCV1P on Windows Server 23H2 Gen2. Covers the gen2 pipeline job. func Test_RCV1P_Windows23H2Gen2(t *testing.T) { skipIfRCV1PNotConfigured(t) + cseMutator := rcv1pWindowsCSEMutator(t) // TODO(rcv1p): remove once RCV1P ships in published CSE package RunScenario(t, &Scenario{ Description: "Tests RCV1P cert mode on Windows Server 23H2 Gen2 with VM opt-in tag", AzureClient: config.RCV1PAzure, @@ -128,7 +134,7 @@ func Test_RCV1P_Windows23H2Gen2(t *testing.T) { VHD: config.VHDWindows23H2Gen2, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), - BootstrapConfigMutator: EmptyBootstrapConfigMutator, + BootstrapConfigMutator: cseMutator, Validator: func(ctx context.Context, s *Scenario) { ValidateRCV1PCertModeWindows(ctx, s) }, @@ -139,6 +145,7 @@ func Test_RCV1P_Windows23H2Gen2(t *testing.T) { // Test_RCV1P_Windows2025Gen2 validates RCV1P on Windows Server 2025 Gen2. Covers the gen2 pipeline job. 
func Test_RCV1P_Windows2025Gen2(t *testing.T) { skipIfRCV1PNotConfigured(t) + cseMutator := rcv1pWindowsCSEMutator(t) // TODO(rcv1p): remove once RCV1P ships in published CSE package RunScenario(t, &Scenario{ Description: "Tests RCV1P cert mode on Windows Server 2025 Gen2 with VM opt-in tag", AzureClient: config.RCV1PAzure, @@ -155,6 +162,7 @@ func Test_RCV1P_Windows2025Gen2(t *testing.T) { }, VMInstanceTags: rcv1pVMInstanceTags(), BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + cseMutator(nbc) Windows2025BootstrapConfigMutator(t, nbc) }, Validator: func(ctx context.Context, s *Scenario) { @@ -172,6 +180,7 @@ func Test_RCV1P_Windows2025Gen2(t *testing.T) { func Test_RCV1P_Windows_NotOptedIn(t *testing.T) { skipIfRCV1PNotConfigured(t) skipNotOptedInOnAutoDetect(t) + cseMutator := rcv1pWindowsCSEMutator(t) // TODO(rcv1p): remove once RCV1P ships in published CSE package RunScenario(t, &Scenario{ Description: "Tests RCV1P cert mode on Windows without VM opt-in tag; expects no cert installation", AzureClient: config.RCV1PAzure, @@ -182,7 +191,7 @@ func Test_RCV1P_Windows_NotOptedIn(t *testing.T) { Config: Config{ Cluster: ClusterRCV1PAzureNetwork, VHD: config.VHDWindows2022Containerd, - BootstrapConfigMutator: EmptyBootstrapConfigMutator, + BootstrapConfigMutator: cseMutator, Validator: func(ctx context.Context, s *Scenario) { ValidateRCV1PNotOptedInWindows(ctx, s) }, From 1efb2c0aaff7cf709e7842cbd55be546b154a164 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 7 May 2026 21:59:02 -0700 Subject: [PATCH 64/70] fix: parse wireserver IsOptedInForRootCerts JSON response with jq The wireserver returns JSON like {"IsOptedInForRootCerts":true} but the script was using grep for IsOptedInForRootCerts=true (equals sign), which never matches the JSON colon format. Use jq for proper JSON parsing instead. This fix was previously applied but accidentally dropped during a rebase squash/reorder. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index f22a78fd34f..5e914878426 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -94,7 +94,9 @@ function is_opted_in_for_root_certs { return 2 fi - if echo "$opt_in_response" | grep -q "IsOptedInForRootCerts=true"; then + # Wireserver may return JSON ({"IsOptedInForRootCerts":true}) or key=value + # (IsOptedInForRootCerts=true). Use jq for proper JSON parsing. + if echo "$opt_in_response" | jq -e '.IsOptedInForRootCerts == true' > /dev/null 2>&1; then echo "IsOptedInForRootCerts=true" return 0 fi From d1414df8ef4834c8dc2dd689d0ce439ef7a9bff3 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 7 May 2026 23:20:07 -0700 Subject: [PATCH 65/70] fix(e2e): update BootstrapConfigMutator signatures after rebase Adapt to upstream signature change: BootstrapConfigMutator now takes (*Cluster, *NodeBootstrappingConfiguration) instead of just (*NodeBootstrappingConfiguration). Also thread infra parameter through setupPrivateDNSForAPIServer to match getClusterVNet signature. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/cluster.go | 6 +++--- e2e/scenario_rcv1p_test.go | 16 ++++++++-------- e2e/scenario_rcv1p_win_test.go | 10 +++++----- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/e2e/cluster.go b/e2e/cluster.go index 777651d2ffc..508a4162842 100644 --- a/e2e/cluster.go +++ b/e2e/cluster.go @@ -122,7 +122,7 @@ func prepareCluster(ctx context.Context, infra *ClusterInfra, clusterModel *armc }, debugDeps...) 
if !isNetworkIsolated { dag.Run(g, func(ctx context.Context) error { - return setupPrivateDNSForAPIServer(ctx, cluster) + return setupPrivateDNSForAPIServer(ctx, infra, cluster) }) } extract := dag.Go1(g, kube, extractClusterParams(cluster)) @@ -838,7 +838,7 @@ func ensureResourceGroupWithInfra(ctx context.Context, infra *ClusterInfra, loca // setupPrivateDNSForAPIServer creates a private DNS zone for the API server FQDN // linked to the cluster VNet with an A record pointing to the current public IP. // Simulates a customer environment with minimal private DNS entries. -func setupPrivateDNSForAPIServer(ctx context.Context, cluster *armcontainerservice.ManagedCluster) error { +func setupPrivateDNSForAPIServer(ctx context.Context, infra *ClusterInfra, cluster *armcontainerservice.ManagedCluster) error { defer toolkit.LogStepCtx(ctx, "setting up private DNS for API server")() fqdn := *cluster.Properties.Fqdn @@ -864,7 +864,7 @@ func setupPrivateDNSForAPIServer(ctx context.Context, cluster *armcontainerservi return fmt.Errorf("creating private zone %q: %w", fqdn, err) } - vnet, err := getClusterVNet(ctx, nodeRG) + vnet, err := getClusterVNet(ctx, infra, nodeRG) if err != nil { return fmt.Errorf("getting cluster VNet: %w", err) } diff --git a/e2e/scenario_rcv1p_test.go b/e2e/scenario_rcv1p_test.go index ca94f2ca67f..dc90654376d 100644 --- a/e2e/scenario_rcv1p_test.go +++ b/e2e/scenario_rcv1p_test.go @@ -312,9 +312,9 @@ func findRepoRoot() (string, error) { // rcv1pWindowsCSEMutator returns a BootstrapConfigMutator that overrides CseScriptsPackageURL // to use the branch-built CSE zip containing the RCV1P code. // TODO(rcv1p): remove this once the RCV1P code ships in a published CSE package. 
-func rcv1pWindowsCSEMutator(t *testing.T) func(*datamodel.NodeBootstrappingConfiguration) { +func rcv1pWindowsCSEMutator(t *testing.T) func(*Cluster, *datamodel.NodeBootstrappingConfiguration) { cseURL := getOrBuildBranchCSEPackageURL(t) - return func(nbc *datamodel.NodeBootstrappingConfiguration) { + return func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) { nbc.ContainerService.Properties.WindowsProfile.CseScriptsPackageURL = cseURL } } @@ -336,7 +336,7 @@ func Test_RCV1P_Ubuntu2204(t *testing.T) { VHD: config.VHDUbuntu2204Gen2Containerd, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), - BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + BootstrapConfigMutator: func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) { }, Validator: func(ctx context.Context, s *Scenario) { ValidateRCV1PCertMode(ctx, s) @@ -362,7 +362,7 @@ func Test_RCV1P_Ubuntu2404(t *testing.T) { VHD: config.VHDUbuntu2404Gen2Containerd, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), - BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + BootstrapConfigMutator: func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) { }, Validator: func(ctx context.Context, s *Scenario) { ValidateRCV1PCertMode(ctx, s) @@ -388,7 +388,7 @@ func Test_RCV1P_AzureLinuxV3(t *testing.T) { VHD: config.VHDAzureLinuxV3Gen2, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), - BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + BootstrapConfigMutator: func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) { }, Validator: func(ctx context.Context, s *Scenario) { ValidateRCV1PCertMode(ctx, s) @@ -414,7 +414,7 @@ func Test_RCV1P_Flatcar(t *testing.T) { VHD: config.VHDFlatcarGen2, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), - BootstrapConfigMutator: func(nbc 
*datamodel.NodeBootstrappingConfiguration) { + BootstrapConfigMutator: func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) { }, Validator: func(ctx context.Context, s *Scenario) { ValidateRCV1PCertMode(ctx, s) @@ -443,7 +443,7 @@ func Test_RCV1P_ACL(t *testing.T) { rcv1pOptInVMConfigMutator(vmss) }, VMInstanceTags: rcv1pVMInstanceTags(), - BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + BootstrapConfigMutator: func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) { }, Validator: func(ctx context.Context, s *Scenario) { ValidateRCV1PCertMode(ctx, s) @@ -472,7 +472,7 @@ func Test_RCV1P_NotOptedIn(t *testing.T) { Config: Config{ Cluster: ClusterRCV1PKubenet, VHD: config.VHDUbuntu2204Gen2Containerd, - BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { + BootstrapConfigMutator: func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) { }, Validator: func(ctx context.Context, s *Scenario) { ValidateRCV1PNotOptedIn(ctx, s) diff --git a/e2e/scenario_rcv1p_win_test.go b/e2e/scenario_rcv1p_win_test.go index 048deaf5cd1..fe196a1e9ae 100644 --- a/e2e/scenario_rcv1p_win_test.go +++ b/e2e/scenario_rcv1p_win_test.go @@ -81,9 +81,9 @@ func Test_RCV1P_Windows2025(t *testing.T) { Cluster: ClusterRCV1PAzureNetwork, VHD: config.VHDWindows2025, VMConfigMutator: rcv1pOptInVMConfigMutator, - VMInstanceTags: rcv1pVMInstanceTags(), - BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { - cseMutator(nbc) + VMInstanceTags: rcv1pVMInstanceTags(), + BootstrapConfigMutator: func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) { + cseMutator(nil, nbc) Windows2025BootstrapConfigMutator(t, nbc) }, Validator: func(ctx context.Context, s *Scenario) { @@ -161,8 +161,8 @@ func Test_RCV1P_Windows2025Gen2(t *testing.T) { rcv1pOptInVMConfigMutator(vmss) }, VMInstanceTags: rcv1pVMInstanceTags(), - BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) { - 
cseMutator(nbc) + BootstrapConfigMutator: func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) { + cseMutator(nil, nbc) Windows2025BootstrapConfigMutator(t, nbc) }, Validator: func(ctx context.Context, s *Scenario) { From 224576e13b530aabbe4764363ff4527b917417ff Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 7 May 2026 23:24:45 -0700 Subject: [PATCH 66/70] fix: fail process_cert_operations when no cert bodies are saved Track the number of successfully saved certificates and return non-zero if all individual cert content fetches failed despite the operation endpoint returning filenames. This closes a gap where retrieve_rcv1p_certs could report success with zero certs actually downloaded. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh index 5e914878426..fc1d2793635 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud.sh @@ -166,6 +166,7 @@ function process_cert_operations { return 1 fi + local saved_count=0 for cert_filename in "${cert_filenames[@]}"; do echo "Processing certificate file: $cert_filename" @@ -182,7 +183,14 @@ function process_cert_operations { echo "$cert_content" > "/root/AzureCACertificates/$cert_filename" echo "Successfully saved certificate: $cert_filename" + saved_count=$((saved_count + 1)) done + + if [ $saved_count -eq 0 ]; then + echo "Error: all certificate content fetches failed for $endpoint_type (${#cert_filenames[@]} filenames found but 0 saved)" + return 1 + fi + echo "Saved $saved_count/${#cert_filenames[@]} certificates for $endpoint_type" } function retrieve_rcv1p_certs { From f757ce84df4c7d6c27aa69e0e6ea8453db5246e8 Mon Sep 17 00:00:00 2001 
From: Ramkumar Chinchani Date: Thu, 7 May 2026 23:25:57 -0700 Subject: [PATCH 67/70] fix: pass repodepot_endpoint explicitly to add_key_ubuntu and add_ms_keys These functions relied on bash dynamic scoping to access the caller's local repodepot_endpoint variable. Pass it as an explicit parameter to follow the repo's shell script guidelines and avoid fragile implicit variable dependencies. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- .../artifacts/init-aks-custom-cloud-repos.sh | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-repos.sh b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-repos.sh index 0c68d513568..9f5eae0119a 100644 --- a/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-repos.sh +++ b/parts/linux/cloud-init/artifacts/init-aks-custom-cloud-repos.sh @@ -92,13 +92,14 @@ EOF } function add_key_ubuntu { - local key_name=$1 + local repodepot_endpoint="$1" + local key_name="$2" - key_url="${repodepot_endpoint}/keys/${key_name}" + local key_url="${repodepot_endpoint}/keys/${key_name}" check_url $key_url echo "Adding $key_name key to keyring..." - key_data=$(wget -O - $key_url) - key_path=$(derive_key_paths $key_name) + local key_data=$(wget -O - $key_url) + local key_path=$(derive_key_paths $key_name) echo "$key_data" | gpg --dearmor | tee $key_path > /dev/null echo "$key_name key added to keyring." } @@ -115,11 +116,12 @@ function derive_key_paths { } function add_ms_keys { + local repodepot_endpoint="$1" # Add the Microsoft package server keys to keyring. echo "Adding Microsoft keys to keyring..." 
- add_key_ubuntu microsoft.asc - add_key_ubuntu msopentech.asc + add_key_ubuntu "$repodepot_endpoint" microsoft.asc + add_key_ubuntu "$repodepot_endpoint" msopentech.asc } function aptget_update { From 796d1ff6eed60fff764014692ec617609a0ff66a Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Thu, 7 May 2026 23:27:07 -0700 Subject: [PATCH 68/70] chore(e2e): remove REVERT ME wireserver diagnostic block from Windows validator Remove the always-on diagnostic block that probed wireserver endpoints and dumped CSE logs on every Windows RCV1P test run. This bloated test logs, added latency, and could leak wireserver response content into CI. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/scenario_rcv1p_test.go | 4 ++-- e2e/validators.go | 39 -------------------------------------- 2 files changed, 2 insertions(+), 41 deletions(-) diff --git a/e2e/scenario_rcv1p_test.go b/e2e/scenario_rcv1p_test.go index dc90654376d..9c7f889358a 100644 --- a/e2e/scenario_rcv1p_test.go +++ b/e2e/scenario_rcv1p_test.go @@ -410,8 +410,8 @@ func Test_RCV1P_Flatcar(t *testing.T) { RCV1PCertMode: true, }, Config: Config{ - Cluster: ClusterRCV1PKubenet, - VHD: config.VHDFlatcarGen2, + Cluster: ClusterRCV1PKubenet, + VHD: config.VHDFlatcarGen2, VMConfigMutator: rcv1pOptInVMConfigMutator, VMInstanceTags: rcv1pVMInstanceTags(), BootstrapConfigMutator: func(_ *Cluster, nbc *datamodel.NodeBootstrappingConfiguration) { diff --git a/e2e/validators.go b/e2e/validators.go index 3377a92a954..29ec1f0ba19 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -2896,45 +2896,6 @@ func rcv1pTrustStoreDir(s *Scenario) string { func ValidateRCV1PCertModeWindows(ctx context.Context, s *Scenario) { s.T.Helper() - // REVERT ME: Diagnostic block — probe wireserver endpoints and dump CSE log tail from the VM - // so we can see exactly what the wireserver returns for operationrequests and what the CSE logged. 
- diagCommand := []string{ - "$ErrorActionPreference = 'Continue'", - "Write-Host '=== DIAGNOSTIC: probing wireserver rcv1p endpoints ==='", - "try {", - " $optIn = Invoke-WebRequest -Uri 'http://168.63.129.16/acms/isOptedInForRootCerts' -UseBasicParsing -TimeoutSec 30", - " Write-Host \"isOptedInForRootCerts: $($optIn.Content)\"", - "} catch { Write-Host \"isOptedInForRootCerts ERROR: $_\" }", - "try {", - " $root = Invoke-WebRequest -Uri 'http://168.63.129.16/machine?comp=acmspackage&type=operationrequestsroot&ext=json' -UseBasicParsing -TimeoutSec 30", - " Write-Host \"operationrequestsroot status=$($root.StatusCode) length=$($root.Content.Length)\"", - " Write-Host \"operationrequestsroot content: $($root.Content)\"", - "} catch { Write-Host \"operationrequestsroot ERROR: $_\" }", - "try {", - " $intermediate = Invoke-WebRequest -Uri 'http://168.63.129.16/machine?comp=acmspackage&type=operationrequestsintermediate&ext=json' -UseBasicParsing -TimeoutSec 30", - " Write-Host \"operationrequestsintermediate status=$($intermediate.StatusCode) length=$($intermediate.Content.Length)\"", - " Write-Host \"operationrequestsintermediate content: $($intermediate.Content)\"", - "} catch { Write-Host \"operationrequestsintermediate ERROR: $_\" }", - "try {", - " $legacy = Invoke-WebRequest -Uri 'http://168.63.129.16/machine?comp=acmspackage&type=cacertificates&ext=json' -UseBasicParsing -TimeoutSec 30", - " Write-Host \"legacy cacertificates status=$($legacy.StatusCode) length=$($legacy.Content.Length)\"", - " $legacyJson = $legacy.Content | ConvertFrom-Json", - " if ($legacyJson.Certificates) { Write-Host \"legacy cert count: $($legacyJson.Certificates.Length)\" } else { Write-Host 'legacy: no Certificates array' }", - "} catch { Write-Host \"legacy cacertificates ERROR: $_\" }", - "Write-Host '=== DIAGNOSTIC: C:\\ca folder contents ==='", - "if (Test-Path 'C:\\ca') { Get-ChildItem -Path 'C:\\ca' -File | ForEach-Object { Write-Host \" $($_.Name) ($($_.Length) bytes)\" } } 
else { Write-Host 'C:\\ca does not exist' }", - "Write-Host '=== DIAGNOSTIC: CSE log tail (last 60 lines with CA/cert/wireserver) ==='", - "if (Test-Path 'C:\\AzureData\\CustomDataSetupScript.log') {", - " Get-Content 'C:\\AzureData\\CustomDataSetupScript.log' -Tail 200 | Where-Object { $_ -match 'CA |cert|wireserver|optedin|operation|acms|Write cert|Warning' } | Select-Object -Last 60 | ForEach-Object { Write-Host $_ }", - "} else { Write-Host 'CSE log not found' }", - "Write-Host '=== END DIAGNOSTIC ==='", - } - diagResult := execScriptOnVMForScenario(ctx, s, strings.Join(diagCommand, "\n")) - s.T.Logf("REVERT ME: wireserver diagnostics stdout:\n%s", diagResult.stdout) - if diagResult.stderr != "" { - s.T.Logf("REVERT ME: wireserver diagnostics stderr:\n%s", diagResult.stderr) - } - // Validate the provisioning log shows wireserver was queried and returned opted-in ValidateFileHasContent(ctx, s, "C:\\AzureData\\CustomDataSetupScript.log", "IsOptedInForRootCerts wireserver response:") From 482ec2d84cc1d15ad8446e7fff81379c16c1e8fd Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Fri, 8 May 2026 13:16:42 -0700 Subject: [PATCH 69/70] fix(e2e): use infra.Azure for private DNS operations in RCV1P subscription createPrivateZone, waitForPrivateZone, createPrivateDNSLink, and the RecordSet call in setupPrivateDNSForAPIServer were hardcoded to config.Azure (the default E2E subscription). When running RCV1P tests in a separate subscription, the MC_ resource group only exists in the RCV1P subscription, causing ResourceGroupNotFound errors. Add an azure *config.AzureClient parameter to these functions so the caller can pass the correct subscription client. setupPrivateDNSForAPIServer now uses infra.Azure; addPrivateEndpointForACR continues using config.Azure. 
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Signed-off-by: Ramkumar Chinchani --- e2e/aks_model.go | 26 +++++++++++++------------- e2e/cluster.go | 6 +++--- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/e2e/aks_model.go b/e2e/aks_model.go index f5f999d44c9..05c392dd646 100644 --- a/e2e/aks_model.go +++ b/e2e/aks_model.go @@ -621,11 +621,11 @@ func addPrivateEndpointForACR(ctx context.Context, nodeResourceGroup, privateACR privateZoneName := "privatelink.azurecr.io" var privateZone *armprivatedns.PrivateZone - if privateZone, err = createPrivateZone(ctx, nodeResourceGroup, privateZoneName); err != nil { + if privateZone, err = createPrivateZone(ctx, config.Azure, nodeResourceGroup, privateZoneName); err != nil { return err } - if err = createPrivateDNSLink(ctx, vnet, nodeResourceGroup, privateZoneName); err != nil { + if err = createPrivateDNSLink(ctx, config.Azure, vnet, nodeResourceGroup, privateZoneName); err != nil { return err } @@ -872,8 +872,8 @@ func createPrivateEndpoint(ctx context.Context, nodeResourceGroup, privateEndpoi return &resp.PrivateEndpoint, nil } -func createPrivateZone(ctx context.Context, nodeResourceGroup, privateZoneName string) (*armprivatedns.PrivateZone, error) { - pzResp, err := config.Azure.PrivateZonesClient.Get( +func createPrivateZone(ctx context.Context, azure *config.AzureClient, nodeResourceGroup, privateZoneName string) (*armprivatedns.PrivateZone, error) { + pzResp, err := azure.PrivateZonesClient.Get( ctx, nodeResourceGroup, privateZoneName, @@ -885,7 +885,7 @@ func createPrivateZone(ctx context.Context, nodeResourceGroup, privateZoneName s dnsZoneParams := armprivatedns.PrivateZone{ Location: to.Ptr("global"), } - poller, err := config.Azure.PrivateZonesClient.BeginCreateOrUpdate( + poller, err := azure.PrivateZonesClient.BeginCreateOrUpdate( ctx, nodeResourceGroup, privateZoneName, @@ -896,7 +896,7 @@ func createPrivateZone(ctx context.Context, nodeResourceGroup, privateZoneName s 
// 409 means another operation is in progress — wait and re-fetch var respErr *azcore.ResponseError if errors.As(err, &respErr) && respErr.StatusCode == 409 { - return waitForPrivateZone(ctx, nodeResourceGroup, privateZoneName) + return waitForPrivateZone(ctx, azure, nodeResourceGroup, privateZoneName) } return nil, fmt.Errorf("failed to create private dns zone in BeginCreateOrUpdate: %w", err) } @@ -909,11 +909,11 @@ func createPrivateZone(ctx context.Context, nodeResourceGroup, privateZoneName s return &resp.PrivateZone, nil } -func waitForPrivateZone(ctx context.Context, nodeResourceGroup, privateZoneName string) (*armprivatedns.PrivateZone, error) { +func waitForPrivateZone(ctx context.Context, azure *config.AzureClient, nodeResourceGroup, privateZoneName string) (*armprivatedns.PrivateZone, error) { defer toolkit.LogStepCtxf(ctx, "waiting for private DNS zone %s (409 conflict)", privateZoneName)() var zone *armprivatedns.PrivateZone err := wait.PollUntilContextTimeout(ctx, 5*time.Second, 2*time.Minute, true, func(ctx context.Context) (bool, error) { - resp, err := config.Azure.PrivateZonesClient.Get(ctx, nodeResourceGroup, privateZoneName, nil) + resp, err := azure.PrivateZonesClient.Get(ctx, nodeResourceGroup, privateZoneName, nil) if err != nil { var respErr *azcore.ResponseError if errors.As(err, &respErr) && respErr.StatusCode == 404 { @@ -930,9 +930,9 @@ func waitForPrivateZone(ctx context.Context, nodeResourceGroup, privateZoneName return zone, nil } -func createPrivateDNSLink(ctx context.Context, vnet VNet, nodeResourceGroup, privateZoneName string) error { +func createPrivateDNSLink(ctx context.Context, azure *config.AzureClient, vnet VNet, nodeResourceGroup, privateZoneName string) error { networkLinkName := "link-ABE2ETests" - _, err := config.Azure.VirutalNetworkLinksClient.Get( + _, err := azure.VirutalNetworkLinksClient.Get( ctx, nodeResourceGroup, privateZoneName, @@ -945,7 +945,7 @@ func createPrivateDNSLink(ctx context.Context, vnet VNet, 
nodeResourceGroup, pri return nil } - vnetForId, err := config.Azure.VNet.Get(ctx, nodeResourceGroup, vnet.name, nil) + vnetForId, err := azure.VNet.Get(ctx, nodeResourceGroup, vnet.name, nil) if err != nil { return fmt.Errorf("failed to get vnet: %w", err) } @@ -958,7 +958,7 @@ func createPrivateDNSLink(ctx context.Context, vnet VNet, nodeResourceGroup, pri RegistrationEnabled: to.Ptr(false), }, } - poller, err := config.Azure.VirutalNetworkLinksClient.BeginCreateOrUpdate( + poller, err := azure.VirutalNetworkLinksClient.BeginCreateOrUpdate( ctx, nodeResourceGroup, privateZoneName, @@ -972,7 +972,7 @@ func createPrivateDNSLink(ctx context.Context, vnet VNet, nodeResourceGroup, pri if errors.As(err, &respErr) && respErr.StatusCode == 409 { toolkit.Logf(ctx, "Virtual network link creation conflict (409), waiting for completion") return wait.PollUntilContextTimeout(ctx, 5*time.Second, 2*time.Minute, true, func(ctx context.Context) (bool, error) { - _, err := config.Azure.VirutalNetworkLinksClient.Get(ctx, nodeResourceGroup, privateZoneName, networkLinkName, nil) + _, err := azure.VirutalNetworkLinksClient.Get(ctx, nodeResourceGroup, privateZoneName, networkLinkName, nil) if err != nil { var respErr *azcore.ResponseError if errors.As(err, &respErr) && respErr.StatusCode == 404 { diff --git a/e2e/cluster.go b/e2e/cluster.go index 508a4162842..36bc092f6c9 100644 --- a/e2e/cluster.go +++ b/e2e/cluster.go @@ -860,7 +860,7 @@ func setupPrivateDNSForAPIServer(ctx context.Context, infra *ClusterInfra, clust } // createPrivateZone and createPrivateDNSLink handle 409 conflicts internally - if _, err := createPrivateZone(ctx, nodeRG, fqdn); err != nil { + if _, err := createPrivateZone(ctx, infra.Azure, nodeRG, fqdn); err != nil { return fmt.Errorf("creating private zone %q: %w", fqdn, err) } @@ -868,11 +868,11 @@ func setupPrivateDNSForAPIServer(ctx context.Context, infra *ClusterInfra, clust if err != nil { return fmt.Errorf("getting cluster VNet: %w", err) } - if err := 
createPrivateDNSLink(ctx, vnet, nodeRG, fqdn); err != nil { + if err := createPrivateDNSLink(ctx, infra.Azure, vnet, nodeRG, fqdn); err != nil { return fmt.Errorf("linking private zone to VNet: %w", err) } - _, err = config.Azure.RecordSetClient.CreateOrUpdate(ctx, nodeRG, fqdn, armprivatedns.RecordTypeA, "@", + _, err = infra.Azure.RecordSetClient.CreateOrUpdate(ctx, nodeRG, fqdn, armprivatedns.RecordTypeA, "@", armprivatedns.RecordSet{Properties: &armprivatedns.RecordSetProperties{TTL: to.Ptr[int64](300), ARecords: aRecords}}, nil) if err != nil { return fmt.Errorf("creating A record in zone %q: %w", fqdn, err) From 3c0baacb30c797e28e0c9853cf487140241c5895 Mon Sep 17 00:00:00 2001 From: Ramkumar Chinchani Date: Mon, 18 May 2026 14:31:59 -0700 Subject: [PATCH 70/70] fix: guard against unresolved ADO pipeline variable expressions in RCV1PSubscriptionID Signed-off-by: Ramkumar Chinchani --- e2e/config/config.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/e2e/config/config.go b/e2e/config/config.go index a2b41ecdae0..ba46d54991a 100644 --- a/e2e/config/config.go +++ b/e2e/config/config.go @@ -184,6 +184,8 @@ func mustLoadConfig() *Configuration { func init() { rcv1pSubID := strings.TrimSpace(Config.RCV1PSubscriptionID) + // Guard against ADO pipeline variable expressions that weren't resolved (e.g. "$(RCV1P_SUBSCRIPTION_ID)"). + // If the value is still a literal $(...) token, treat it as unset. if rcv1pSubID != "" && !strings.HasPrefix(rcv1pSubID, "$(") { client, err := NewAzureClientForSubscription(rcv1pSubID) if err != nil {