From bee614b18e482db4c5e55f60c094b39bfcd30b7e Mon Sep 17 00:00:00 2001 From: Segev Elmalech Date: Tue, 19 May 2026 09:27:05 +0300 Subject: [PATCH 1/4] fix(azure): add resilient apt operations for transient mirror failures Add `apt_update_resilient` and `apt_install_resilient` helper functions that retry apt operations with list repair between attempts to handle transient archive.ubuntu.com failures (e.g. truncated InRelease files, "Splitting up into data and signature failed" errors). Changes: - Configure APT with retry settings and timeouts via /etc/apt/apt.conf.d - Wrap apt-get update/install calls with retry logic and self-repair - Add retry loop for Azure CLI installation from aka.ms - Apply the apt helpers in the db-with-agent module, with fallback to plain apt commands when the helpers are unavailable - Add `dnf_retry`/`yum_retry` helpers (retry with package-cache cleanup between attempts) to the dra-admin, dra-analytics and sonar-base-instance modules --- modules/azurerm/db-with-agent/os_params.tf | 23 ++++++-- modules/azurerm/db-with-agent/setup.tftpl | 53 +++++++++++++++++-- modules/azurerm/dra-admin/setup.tftpl | 28 ++++++++-- modules/azurerm/dra-analytics/setup.tftpl | 28 ++++++++-- .../azurerm/sonar-base-instance/setup.tftpl | 35 ++++++++---- 5 files changed, 144 insertions(+), 23 deletions(-) diff --git a/modules/azurerm/db-with-agent/os_params.tf b/modules/azurerm/db-with-agent/os_params.tf index 95f5cfb35..62ebdc6f6 100644 --- a/modules/azurerm/db-with-agent/os_params.tf +++ b/modules/azurerm/db-with-agent/os_params.tf @@ -14,11 +14,24 @@ locals { echo "Waiting for the lock to be released..." sleep 1 done - sudo apt update -y + # apt_update_resilient is defined in setup.tftpl; falls back to a plain + # update if for any reason it is not available (defense in depth).
+ if declare -F apt_update_resilient >/dev/null; then + apt_update_resilient + else + sudo apt update -y + fi EOF database_installation_commands = { PostgreSql = <<-EOF - command -v psql || sudo apt install postgresql -y + # apt_install_resilient is defined in setup.tftpl and retries on + # transient apt-mirror failures; fall back to a plain install if + # the helper is unavailable for any reason. + if declare -F apt_install_resilient >/dev/null; then + command -v psql || apt_install_resilient postgresql + else + command -v psql || sudo apt install postgresql -y + fi if ! sudo systemctl is-active --quiet postgresql; then sudo systemctl start postgresql.service echo "PostgreSQL service started successfully." @@ -27,7 +40,11 @@ locals { fi EOF MySql = <<-EOF - command -v mysql || sudo apt install mysql-server -y + if declare -F apt_install_resilient >/dev/null; then + command -v mysql || apt_install_resilient mysql-server + else + command -v mysql || sudo apt install mysql-server -y + fi if ! sudo systemctl is-active --quiet mysql; then sudo systemctl start mysql.service echo "MySQL service started successfully." diff --git a/modules/azurerm/db-with-agent/setup.tftpl b/modules/azurerm/db-with-agent/setup.tftpl index 38301c7ca..c72a1db5f 100644 --- a/modules/azurerm/db-with-agent/setup.tftpl +++ b/modules/azurerm/db-with-agent/setup.tftpl @@ -11,16 +11,64 @@ if [ -x "$(command -v apt-get)" ]; then sudo mkdir -p /etc/apt/apt.conf.d cat </dev/null } @@ -56,4 +104,3 @@ install_azcli install_agent_and_periodic_db_query_cronjob - diff --git a/modules/azurerm/dra-admin/setup.tftpl b/modules/azurerm/dra-admin/setup.tftpl index 349a001a5..d7dcc380c 100644 --- a/modules/azurerm/dra-admin/setup.tftpl +++ b/modules/azurerm/dra-admin/setup.tftpl @@ -3,11 +3,31 @@ set -x exec > >(tee /var/log/user-data.log|logger -t user-data ) 2>&1 echo BEGIN +# Retry a command on transient dnf/yum-mirror or RHUI failures (e.g. 
HTTP 400 +# from rhui-{1,2,3}.microsoft.com, "Failed to download metadata for repo", +# brief TLS hangups). Cleans the dnf cache between attempts. +function dnf_retry(){ + local attempt + for attempt in 1 2 3 4 5; do + if "$@"; then + return 0 + fi + echo "Command failed (attempt $attempt): $*. Cleaning dnf cache and retrying..." + dnf clean all || true + rm -rf /var/cache/dnf || true + sleep $((attempt * 10)) + done + return 1 +} + function install-azure-cli(){ - rpm --import https://packages.microsoft.com/keys/microsoft.asc - dnf install -y https://packages.microsoft.com/config/rhel/9.0/packages-microsoft-prod.rpm - dnf install -y https://packages.microsoft.com/config/rhel/8/packages-microsoft-prod.rpm - dnf install azure-cli -y + dnf_retry rpm --import https://packages.microsoft.com/keys/microsoft.asc + # One of the next two lines is expected to fail depending on RHEL major + # version; we don't want to block on either, so retries are bounded and + # failures are tolerated. + dnf_retry dnf install -y https://packages.microsoft.com/config/rhel/9.0/packages-microsoft-prod.rpm || true + dnf_retry dnf install -y https://packages.microsoft.com/config/rhel/8/packages-microsoft-prod.rpm || true + dnf_retry dnf install azure-cli -y az login --identity } diff --git a/modules/azurerm/dra-analytics/setup.tftpl b/modules/azurerm/dra-analytics/setup.tftpl index 4613c909b..d22585293 100644 --- a/modules/azurerm/dra-analytics/setup.tftpl +++ b/modules/azurerm/dra-analytics/setup.tftpl @@ -9,11 +9,31 @@ function wait-for-admin(){ done } +# Retry a command on transient dnf/yum-mirror or RHUI failures (e.g. HTTP 400 +# from rhui-{1,2,3}.microsoft.com, "Failed to download metadata for repo", +# brief TLS hangups). Cleans the dnf cache between attempts. +function dnf_retry(){ + local attempt + for attempt in 1 2 3 4 5; do + if "$@"; then + return 0 + fi + echo "Command failed (attempt $attempt): $*. Cleaning dnf cache and retrying..." 
+ dnf clean all || true + rm -rf /var/cache/dnf || true + sleep $((attempt * 10)) + done + return 1 +} + function install-azure-cli(){ - rpm --import https://packages.microsoft.com/keys/microsoft.asc - dnf install -y https://packages.microsoft.com/config/rhel/9.0/packages-microsoft-prod.rpm - dnf install -y https://packages.microsoft.com/config/rhel/8/packages-microsoft-prod.rpm - dnf install azure-cli -y + dnf_retry rpm --import https://packages.microsoft.com/keys/microsoft.asc + # One of the next two lines is expected to fail depending on RHEL major + # version; we don't want to block on either, so retries are bounded and + # failures are tolerated. + dnf_retry dnf install -y https://packages.microsoft.com/config/rhel/9.0/packages-microsoft-prod.rpm || true + dnf_retry dnf install -y https://packages.microsoft.com/config/rhel/8/packages-microsoft-prod.rpm || true + dnf_retry dnf install azure-cli -y az login --identity } diff --git a/modules/azurerm/sonar-base-instance/setup.tftpl b/modules/azurerm/sonar-base-instance/setup.tftpl index a59b17f34..2feba42e3 100644 --- a/modules/azurerm/sonar-base-instance/setup.tftpl +++ b/modules/azurerm/sonar-base-instance/setup.tftpl @@ -30,6 +30,25 @@ function internet_access() { fi } +# Retry a yum/dnf-based command with cache repair between attempts. Recovers +# from transient Microsoft RHUI failures (HTTP 400 on +# https://rhui-{1,2,3}.microsoft.com/.../repomd.xml, "All mirrors were tried", +# "Failed to download metadata for repo"), which otherwise poison the local +# cache and make every retry fail the same way. +function yum_retry() { + local attempt + for attempt in 1 2 3 4 5; do + if "$@"; then + return 0 + fi + echo "Command failed (attempt $attempt): $*. Cleaning yum/dnf cache and retrying..." + yum clean all || true + rm -rf /var/cache/yum /var/cache/dnf || true + sleep $((attempt * 10)) + done + return 1 +} + function install_yum_dep_from_internet() { if ! 
internet_access; then echo "Error: No outbound internet access. Either enable outbound internet access, or make sure $@ is installed in the base ami" @@ -38,12 +57,10 @@ function install_yum_dep_from_internet() { local package="$1" local package_name="$${2:-$1}" + # yum_retry takes care of cache repair between attempts, so a transient + # Microsoft RHUI 400 / repomd.xml download glitch heals automatically. if ! yum list installed "$${package_name}"; then - yum install "$${package}" -y \ - || yum install "$${package}" -y \ - || yum install "$${package}" -y \ - || yum install "$${package}" -y \ - || yum install "$${package}" -y # trying x times since sometimes there is a glitch with the entitlement server + yum_retry yum install "$${package}" -y fi } @@ -52,16 +69,16 @@ function install_azcli_from_internet() { echo "Error: No outbound internet access. Either enable outbound internet access, or make sure az cli is installed in the base image" exit 1 fi - rpm --import https://packages.microsoft.com/keys/microsoft.asc + yum_retry rpm --import https://packages.microsoft.com/keys/microsoft.asc if [ "$RHEL_MAJOR_VERSION" -eq 8 ]; then - dnf install -y https://packages.microsoft.com/config/rhel/8/packages-microsoft-prod.rpm + yum_retry dnf install -y https://packages.microsoft.com/config/rhel/8/packages-microsoft-prod.rpm elif [ "$RHEL_MAJOR_VERSION" -eq 9 ]; then - dnf install -y https://packages.microsoft.com/config/rhel/9.0/packages-microsoft-prod.rpm + yum_retry dnf install -y https://packages.microsoft.com/config/rhel/9.0/packages-microsoft-prod.rpm else echo "Unsupported RHEL version: $RHEL_MAJOR_VERSION" exit 1 fi - dnf install azure-cli -y + yum_retry dnf install azure-cli -y az login --identity --allow-no-subscriptions } From 098a5fa2c38cec87406b6baa53a237ae1ce9ae51 Mon Sep 17 00:00:00 2001 From: Segev Elmalech Date: Tue, 19 May 2026 14:44:48 +0300 Subject: [PATCH 2/4] feat(sonar_upgrade): skip sonargd-audit-directory healthcheck on POC Add 
`ignore_healthcheck_checks = ["sonargd-audit-directory"]` to the sonar upgrade workflow to prevent preflight validations from aborting on freshly-deployed POC environments, where the audit directory always contains files and trips the threshold-based WARNING. Also document the `ignore_healthcheck_checks` and `ignore_healthcheck_warnings` options in the AWS sonar_upgrade example so users know how to bypass non-actionable healthchecks. --- .github/workflows/sonar_upgrade.yml | 8 ++++++++ examples/aws/sonar_upgrade/main.tf | 10 ++++++++++ 2 files changed, 18 insertions(+) diff --git a/.github/workflows/sonar_upgrade.yml b/.github/workflows/sonar_upgrade.yml index a9154c00c..70eb12876 100644 --- a/.github/workflows/sonar_upgrade.yml +++ b/.github/workflows/sonar_upgrade.yml @@ -268,6 +268,14 @@ jobs: run_postflight_validations = true stop_on_failure = true + # Skip healthchecks that are expected to trip on a freshly-deployed POC + # environment and are not actionable for the upgrade: + # - sonargd-audit-directory: emits a WARNING when /imperva/data/sonargd/audit + # contains any files (threshold is 0). On a fresh DSF POC the directory + # always has at least one file, which would otherwise abort the upgrade + # during preflight validations. + ignore_healthcheck_checks = ["sonargd-audit-directory"] + } EOF cat ${UPGRADE_EXAMPLE_DIR}/main.tf diff --git a/examples/aws/sonar_upgrade/main.tf b/examples/aws/sonar_upgrade/main.tf index 77ed2e8fb..878a867af 100644 --- a/examples/aws/sonar_upgrade/main.tf +++ b/examples/aws/sonar_upgrade/main.tf @@ -83,4 +83,14 @@ module "sonar_upgrader" { run_postflight_validations = true stop_on_failure = true + # Optionally skip individual healthchecks that are not actionable for the upgrade. + # For example, on a freshly-deployed POC environment the "sonargd-audit-directory" + # healthcheck emits a WARNING ("Files count for /imperva/data/sonargd/audit (1) + # exceed threshold (0). 
This might indicate an ingestion issue.") which would + # otherwise abort the upgrade during preflight validations. + # ignore_healthcheck_checks = ["sonargd-audit-directory"] + + # Alternatively, ignore all WARNING-level healthchecks (FAILUREs will still abort): + # ignore_healthcheck_warnings = true + } From ddd0faf56f1a0e1607cfa90f5df54305b86e72c3 Mon Sep 17 00:00:00 2001 From: Segev Elmalech Date: Wed, 20 May 2026 16:09:48 +0300 Subject: [PATCH 3/4] fix(azurerm): remove unsupported curl flag in azcli install Drop `--retry-all-errors` from the Azure CLI install command since it requires curl >= 7.71.0, while Ubuntu 20.04 ships curl 7.68.0. The unknown option caused curl to exit 2, and without `set -o pipefail` that failure was masked by the pipe to `sudo bash`, because a pipeline reports the exit status of its last command. Add `set -o pipefail` so a failing curl is no longer hidden, and rely on the existing outer retry loop for resilience instead. --- modules/azurerm/db-with-agent/setup.tftpl | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/modules/azurerm/db-with-agent/setup.tftpl b/modules/azurerm/db-with-agent/setup.tftpl index c72a1db5f..0586987d1 100644 --- a/modules/azurerm/db-with-agent/setup.tftpl +++ b/modules/azurerm/db-with-agent/setup.tftpl @@ -2,6 +2,7 @@ set -e set -u set -x +set -o pipefail cd /root || exit 1 # Turn off interactive prompts and skip translations @@ -57,8 +58,13 @@ sudo apt-get clean apt_update_resilient function install_azcli_from_internet() { + # Note: --retry-all-errors is curl >= 7.71.0, but Ubuntu 20.04 ships + # curl 7.68.0, which exits 2 on unknown options. Rely on the outer + # attempt loop below for resilience instead. `set -o pipefail` (set at + # the top of this script) keeps a curl failure from being hidden + # behind the pipe to `sudo bash`. for attempt in 1 2 3; do - if curl -fsSL --retry 5 --retry-all-errors https://aka.ms/InstallAzureCLIDeb | sudo bash; then + if curl -fsSL --retry 5 https://aka.ms/InstallAzureCLIDeb | sudo bash; then break fi echo "Azure CLI install failed (attempt $attempt).
Repairing apt lists..." From 4f486bfcc6335b33ed6823d5319b49f6ebafa8b3 Mon Sep 17 00:00:00 2001 From: Segev Elmalech Date: Wed, 20 May 2026 18:42:24 +0300 Subject: [PATCH 4/4] fix(azure/dra): install only matching RHEL packages-microsoft-prod RPM Detect the RHEL major version and install only the corresponding packages-microsoft-prod RPM instead of attempting both RHEL 8 and 9 variants with `|| true`. The previous approach combined with dnf_retry wasted up to ~5 minutes on the guaranteed-failing install, which could exceed the downstream null_resource readiness timeout. --- modules/azurerm/dra-admin/setup.tftpl | 21 ++++++++++++++++----- modules/azurerm/dra-analytics/setup.tftpl | 21 ++++++++++++++++----- 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/modules/azurerm/dra-admin/setup.tftpl b/modules/azurerm/dra-admin/setup.tftpl index d7dcc380c..f70aed992 100644 --- a/modules/azurerm/dra-admin/setup.tftpl +++ b/modules/azurerm/dra-admin/setup.tftpl @@ -3,6 +3,8 @@ set -x exec > >(tee /var/log/user-data.log|logger -t user-data ) 2>&1 echo BEGIN +RHEL_MAJOR_VERSION=$(cat /etc/os-release | grep VERSION_ID | sed -E 's/^.*"([0-9]+)\.[0-9]+".*$/\1/') + # Retry a command on transient dnf/yum-mirror or RHUI failures (e.g. HTTP 400 # from rhui-{1,2,3}.microsoft.com, "Failed to download metadata for repo", # brief TLS hangups). Cleans the dnf cache between attempts. @@ -22,11 +24,20 @@ function dnf_retry(){ function install-azure-cli(){ dnf_retry rpm --import https://packages.microsoft.com/keys/microsoft.asc - # One of the next two lines is expected to fail depending on RHEL major - # version; we don't want to block on either, so retries are bounded and - # failures are tolerated. 
- dnf_retry dnf install -y https://packages.microsoft.com/config/rhel/9.0/packages-microsoft-prod.rpm || true - dnf_retry dnf install -y https://packages.microsoft.com/config/rhel/8/packages-microsoft-prod.rpm || true + # Install only the packages-microsoft-prod RPM that matches the running + # RHEL major version. Previously we attempted both RHEL 8 and RHEL 9 + # variants and tolerated the failing one with `|| true`, but combined + # with dnf_retry that wasted up to ~5 minutes of bounded retries on the + # impossible install -- enough to push past the downstream null_resource + # readiness timeout. + if [ "$RHEL_MAJOR_VERSION" = "9" ]; then + dnf_retry dnf install -y https://packages.microsoft.com/config/rhel/9.0/packages-microsoft-prod.rpm + elif [ "$RHEL_MAJOR_VERSION" = "8" ]; then + dnf_retry dnf install -y https://packages.microsoft.com/config/rhel/8/packages-microsoft-prod.rpm + else + echo "Unsupported RHEL version: $RHEL_MAJOR_VERSION" + exit 1 + fi dnf_retry dnf install azure-cli -y az login --identity } diff --git a/modules/azurerm/dra-analytics/setup.tftpl b/modules/azurerm/dra-analytics/setup.tftpl index d22585293..f3b555746 100644 --- a/modules/azurerm/dra-analytics/setup.tftpl +++ b/modules/azurerm/dra-analytics/setup.tftpl @@ -3,6 +3,8 @@ set -x exec > >(tee /var/log/user-data.log|logger -t user-data ) 2>&1 echo BEGIN +RHEL_MAJOR_VERSION=$(cat /etc/os-release | grep VERSION_ID | sed -E 's/^.*"([0-9]+)\.[0-9]+".*$/\1/') + function wait-for-admin(){ while ! nc -z ${admin_server_private_ip} 8443; do sleep 0.1 @@ -28,11 +30,20 @@ function dnf_retry(){ function install-azure-cli(){ dnf_retry rpm --import https://packages.microsoft.com/keys/microsoft.asc - # One of the next two lines is expected to fail depending on RHEL major - # version; we don't want to block on either, so retries are bounded and - # failures are tolerated. 
- dnf_retry dnf install -y https://packages.microsoft.com/config/rhel/9.0/packages-microsoft-prod.rpm || true - dnf_retry dnf install -y https://packages.microsoft.com/config/rhel/8/packages-microsoft-prod.rpm || true + # Install only the packages-microsoft-prod RPM that matches the running + # RHEL major version. Previously we attempted both RHEL 8 and RHEL 9 + # variants and tolerated the failing one with `|| true`, but combined + # with dnf_retry that wasted up to ~5 minutes of bounded retries on the + # impossible install -- enough to push past the downstream null_resource + # readiness timeout. + if [ "$RHEL_MAJOR_VERSION" = "9" ]; then + dnf_retry dnf install -y https://packages.microsoft.com/config/rhel/9.0/packages-microsoft-prod.rpm + elif [ "$RHEL_MAJOR_VERSION" = "8" ]; then + dnf_retry dnf install -y https://packages.microsoft.com/config/rhel/8/packages-microsoft-prod.rpm + else + echo "Unsupported RHEL version: $RHEL_MAJOR_VERSION" + exit 1 + fi dnf_retry dnf install azure-cli -y az login --identity }