From b84ba074c1e274849a6976f90a4f4ebb2a49911d Mon Sep 17 00:00:00 2001 From: Segev Elmalech Date: Thu, 28 May 2026 14:49:19 +0300 Subject: [PATCH] fix(azurerm): increase cloud-init timeout and improve yum retry resilience - Bump cloud_init_timeout defaults to 1800s across agentless-gw, hub, and sonar-base-instance modules to accommodate slower installs - Make yum_retry attempts configurable via YUM_RETRY_ATTEMPTS env var - Use fast fail-over (3 attempts) for primary azure-cli install path to avoid burning cloud-init time during RHUI incidents - Fallback path retains default 10 attempts with --nobest and --disablerepo='*rhui*' for degraded RHUI scenarios --- modules/azurerm/agentless-gw/variables.tf | 2 +- modules/azurerm/hub/variables.tf | 2 +- modules/azurerm/sonar-base-instance/setup.tftpl | 17 +++++++++++------ .../azurerm/sonar-base-instance/variables.tf | 2 +- 4 files changed, 14 insertions(+), 9 deletions(-) diff --git a/modules/azurerm/agentless-gw/variables.tf b/modules/azurerm/agentless-gw/variables.tf index 70f851262..1657e833e 100644 --- a/modules/azurerm/agentless-gw/variables.tf +++ b/modules/azurerm/agentless-gw/variables.tf @@ -255,7 +255,7 @@ variable "base_directory" { variable "cloud_init_timeout" { type = number - default = 1200 + default = 1800 description = "Max time to wait for the machine to start" } diff --git a/modules/azurerm/hub/variables.tf b/modules/azurerm/hub/variables.tf index 9b8201b45..f961b5b1e 100644 --- a/modules/azurerm/hub/variables.tf +++ b/modules/azurerm/hub/variables.tf @@ -318,7 +318,7 @@ variable "dra_details" { variable "cloud_init_timeout" { type = number - default = 1200 + default = 1800 description = "Max time to wait for the machine to start" } diff --git a/modules/azurerm/sonar-base-instance/setup.tftpl b/modules/azurerm/sonar-base-instance/setup.tftpl index 22aab57ea..568545d5f 100644 --- a/modules/azurerm/sonar-base-instance/setup.tftpl +++ b/modules/azurerm/sonar-base-instance/setup.tftpl @@ -36,18 +36,19 @@ function internet_access() { # "Failed to download metadata for repo"), which otherwise poison the local # cache and make every retry fail the same way. function yum_retry() { + local max_attempts="$${YUM_RETRY_ATTEMPTS:-10}" local attempt - for attempt in {1..10}; do + for attempt in $(seq 1 "$max_attempts"); do if "$@"; then return 0 fi - echo "Command failed (attempt $attempt/10): $*" + echo "Command failed (attempt $attempt/$max_attempts): $*" echo "Cleaning yum/dnf cache and retrying..." yum clean all || true rm -rf /var/cache/yum /var/cache/dnf || true sleep $((attempt < 6 ? attempt * 10 : 60)) done - echo "All $attempt attempts failed for: $*" + echo "All $max_attempts attempts failed for: $*" echo "RHUI diagnostics:" curl -sI https://rhui-1.microsoft.com 2>&1 | head -5 || true return 1 @@ -88,9 +89,13 @@ function install_azcli_from_internet() { # Azure RHUI returns 400 on rhel-*-baseos-rhui-rpms. yum_retry rpm -Uvh --replacepkgs "$msrepo_url" - # Fall back to disabling RHUI if it's degraded; azure-cli's RHEL-side deps - # are already in the Azure RHEL PAYG base image. - yum_retry dnf install azure-cli -y \ + # Primary path uses fewer attempts (3) so we fail over fast when RHUI is + # degraded -- a real RHUI incident lasts hours, retrying 10x just burns + # cloud-init time. Fallback keeps the default 10 attempts as last resort. + # --nobest lets dnf walk back to an older azure-cli whose python pin + # matches what's already in the Azure RHEL PAYG base image (python3.9) + # when the appstream repo is unreachable. + YUM_RETRY_ATTEMPTS=3 yum_retry dnf install azure-cli -y \ || yum_retry dnf install azure-cli -y --disablerepo='*rhui*' --nobest az login --identity --allow-no-subscriptions diff --git a/modules/azurerm/sonar-base-instance/variables.tf b/modules/azurerm/sonar-base-instance/variables.tf index 862d55b1d..7a1f5158c 100644 --- a/modules/azurerm/sonar-base-instance/variables.tf +++ b/modules/azurerm/sonar-base-instance/variables.tf @@ -227,7 +227,7 @@ variable "base_directory" { variable "cloud_init_timeout" { type = number - default = 900 + default = 1800 description = "Max time to wait for the machine to start" }