# Patch overview (review notes). This text sits before the first `diff --git`
# header and is not part of any hunk; the patch lines below are unchanged.
#
# What each file's hunks do:
#   .github/workflows/sonar_upgrade.yml
#     Adds ignore_healthcheck_checks = ["sonargd-audit-directory"] to the
#     main.tf written by the workflow's heredoc, with a comment explaining that
#     the check warns whenever /imperva/data/sonargd/audit contains files.
#   examples/aws/sonar_upgrade/main.tf
#     Adds commented-out examples only: ignore_healthcheck_checks and
#     ignore_healthcheck_warnings. No active setting changes.
#   modules/azurerm/db-with-agent/os_params.tf
#     Replaces `sudo apt update -y` and the two `sudo apt install ... -y` calls
#     with apt_update_resilient / apt_install_resilient when `declare -F` finds
#     them, and keeps the plain commands as the fallback branch.
#   modules/azurerm/db-with-agent/setup.tftpl
#     Adds `set -o pipefail`. The visible tail of install_azcli retries
#     `curl ... | sudo bash` up to 3 times, calling apt_update_resilient and
#     sleeping attempt*10 seconds after each failure, then runs `az login`.
#     Also drops one blank line after install_agent_and_periodic_db_query_cronjob.
#     NOTE(review): this hunk is incomplete in this copy -- the text jumps from
#     `cat <` straight to `= 7.71.0` -- so the definitions of
#     apt_update_resilient / apt_install_resilient and the start of
#     install_azcli are not visible here. Verify against the real file.
#   modules/azurerm/dra-admin/setup.tftpl, modules/azurerm/dra-analytics/setup.tftpl
#     Both add RHEL_MAJOR_VERSION (parsed from VERSION_ID in /etc/os-release)
#     and dnf_retry (up to 5 attempts; after a failure runs `dnf clean all`,
#     removes /var/cache/dnf and sleeps attempt*10 seconds). install-azure-cli
#     now installs only the packages-microsoft-prod RPM for RHEL 9 or 8 and
#     exits 1 for any other value.
#   modules/azurerm/sonar-base-instance/setup.tftpl
#     Adds yum_retry (same loop; runs `yum clean all` and removes both
#     /var/cache/yum and /var/cache/dnf). Replaces the five chained
#     `yum install` attempts with a single yum_retry call and wraps the
#     rpm --import / dnf install steps of install_azcli_from_internet in it.
#
# NOTE(review), for the underlying files rather than this patch:
#   - dnf_retry and yum_retry clean the cache and sleep after the 5th failed
#     attempt as well, so a permanent failure is reported 50 seconds late.
#     The install_azcli loop has the same shape: the `attempt = 3` give-up
#     check comes after apt_update_resilient and a 30 second sleep.
#   - The sed expression for RHEL_MAJOR_VERSION only rewrites a quoted
#     MAJOR.MINOR value. A VERSION_ID line without a minor part is passed
#     through unchanged and ends up in the "Unsupported RHEL version" branch.
#     TODO confirm every target image reports MAJOR.MINOR.
#   - dnf_retry is duplicated verbatim in dra-admin and dra-analytics, and
#     yum_retry is the same loop with a different cleanup command.
diff --git a/.github/workflows/sonar_upgrade.yml b/.github/workflows/sonar_upgrade.yml index a9154c00c..70eb12876 100644 --- a/.github/workflows/sonar_upgrade.yml +++ b/.github/workflows/sonar_upgrade.yml @@ -268,6 +268,14 @@ jobs: run_postflight_validations = true stop_on_failure = true + # Skip healthchecks that are expected to trip on a freshly-deployed POC + # environment and are not actionable for the upgrade: + # - sonargd-audit-directory: emits a WARNING when /imperva/data/sonargd/audit + # contains any files (threshold is 0). On a fresh DSF POC the directory + # always has at least one file, which would otherwise abort the upgrade + # during preflight validations. + ignore_healthcheck_checks = ["sonargd-audit-directory"] + } EOF cat ${UPGRADE_EXAMPLE_DIR}/main.tf diff --git a/examples/aws/sonar_upgrade/main.tf b/examples/aws/sonar_upgrade/main.tf index 77ed2e8fb..878a867af 100644 --- a/examples/aws/sonar_upgrade/main.tf +++ b/examples/aws/sonar_upgrade/main.tf @@ -83,4 +83,14 @@ module "sonar_upgrader" { run_postflight_validations = true stop_on_failure = true + # Optionally skip individual healthchecks that are not actionable for the upgrade. + # For example, on a freshly-deployed POC environment the "sonargd-audit-directory" + # healthcheck emits a WARNING ("Files count for /imperva/data/sonargd/audit (1) + # exceed threshold (0). This might indicate an ingestion issue.") which would + # otherwise abort the upgrade during preflight validations. + # ignore_healthcheck_checks = ["sonargd-audit-directory"] + + # Alternatively, ignore all WARNING-level healthchecks (FAILUREs will still abort): + # ignore_healthcheck_warnings = true + } diff --git a/modules/azurerm/db-with-agent/os_params.tf b/modules/azurerm/db-with-agent/os_params.tf index 95f5cfb35..62ebdc6f6 100644 --- a/modules/azurerm/db-with-agent/os_params.tf +++ b/modules/azurerm/db-with-agent/os_params.tf @@ -14,11 +14,24 @@ locals { echo "Waiting for the lock to be released..." 
sleep 1 done - sudo apt update -y + # apt_update_resilient is defined in setup.tftpl; falls back to a plain + # update if for any reason it is not available (defense in depth). + if declare -F apt_update_resilient >/dev/null; then + apt_update_resilient + else + sudo apt update -y + fi EOF database_installation_commands = { PostgreSql = <<-EOF - command -v psql || sudo apt install postgresql -y + # apt_install_resilient is defined in setup.tftpl and retries on + # transient apt-mirror failures; fall back to a plain install if + # the helper is unavailable for any reason. + if declare -F apt_install_resilient >/dev/null; then + command -v psql || apt_install_resilient postgresql + else + command -v psql || sudo apt install postgresql -y + fi if ! sudo systemctl is-active --quiet postgresql; then sudo systemctl start postgresql.service echo "PostgreSQL service started successfully." @@ -27,7 +40,11 @@ locals { fi EOF MySql = <<-EOF - command -v mysql || sudo apt install mysql-server -y + if declare -F apt_install_resilient >/dev/null; then + command -v mysql || apt_install_resilient mysql-server + else + command -v mysql || sudo apt install mysql-server -y + fi if ! sudo systemctl is-active --quiet mysql; then sudo systemctl start mysql.service echo "MySQL service started successfully." diff --git a/modules/azurerm/db-with-agent/setup.tftpl b/modules/azurerm/db-with-agent/setup.tftpl index 38301c7ca..0586987d1 100644 --- a/modules/azurerm/db-with-agent/setup.tftpl +++ b/modules/azurerm/db-with-agent/setup.tftpl @@ -2,6 +2,7 @@ set -e set -u set -x +set -o pipefail cd /root || exit 1 # Turn off interactive prompts and skip translations @@ -11,16 +12,69 @@ if [ -x "$(command -v apt-get)" ]; then sudo mkdir -p /etc/apt/apt.conf.d cat <= 7.71.0, but Ubuntu 20.04 ships + # curl 7.68.0. 
Relying on the outer attempt loop below for resilience + # avoids breaking the script on older curl (which exits 2 on unknown + # options, while `set -o pipefail` would otherwise hide that failure + # behind the pipe to `sudo bash`). + for attempt in 1 2 3; do + if curl -fsSL --retry 5 https://aka.ms/InstallAzureCLIDeb | sudo bash; then + break + fi + echo "Azure CLI install failed (attempt $attempt). Repairing apt lists..." + apt_update_resilient || true + sleep $((attempt * 10)) + if [ "$attempt" = 3 ]; then + echo "Azure CLI install: giving up after 3 attempts" + return 1 + fi + done az login --identity --allow-no-subscriptions >/dev/null } @@ -56,4 +110,3 @@ install_azcli install_agent_and_periodic_db_query_cronjob - diff --git a/modules/azurerm/dra-admin/setup.tftpl b/modules/azurerm/dra-admin/setup.tftpl index 349a001a5..f70aed992 100644 --- a/modules/azurerm/dra-admin/setup.tftpl +++ b/modules/azurerm/dra-admin/setup.tftpl @@ -3,11 +3,42 @@ set -x exec > >(tee /var/log/user-data.log|logger -t user-data ) 2>&1 echo BEGIN +RHEL_MAJOR_VERSION=$(cat /etc/os-release | grep VERSION_ID | sed -E 's/^.*"([0-9]+)\.[0-9]+".*$/\1/') + +# Retry a command on transient dnf/yum-mirror or RHUI failures (e.g. HTTP 400 +# from rhui-{1,2,3}.microsoft.com, "Failed to download metadata for repo", +# brief TLS hangups). Cleans the dnf cache between attempts. +function dnf_retry(){ + local attempt + for attempt in 1 2 3 4 5; do + if "$@"; then + return 0 + fi + echo "Command failed (attempt $attempt): $*. Cleaning dnf cache and retrying..." 
+ dnf clean all || true + rm -rf /var/cache/dnf || true + sleep $((attempt * 10)) + done + return 1 +} + function install-azure-cli(){ - rpm --import https://packages.microsoft.com/keys/microsoft.asc - dnf install -y https://packages.microsoft.com/config/rhel/9.0/packages-microsoft-prod.rpm - dnf install -y https://packages.microsoft.com/config/rhel/8/packages-microsoft-prod.rpm - dnf install azure-cli -y + dnf_retry rpm --import https://packages.microsoft.com/keys/microsoft.asc + # Install only the packages-microsoft-prod RPM that matches the running + # RHEL major version. Previously we attempted both RHEL 8 and RHEL 9 + # variants and tolerated the failing one with `|| true`, but combined + # with dnf_retry that wasted up to ~5 minutes of bounded retries on the + # impossible install -- enough to push past the downstream null_resource + # readiness timeout. + if [ "$RHEL_MAJOR_VERSION" = "9" ]; then + dnf_retry dnf install -y https://packages.microsoft.com/config/rhel/9.0/packages-microsoft-prod.rpm + elif [ "$RHEL_MAJOR_VERSION" = "8" ]; then + dnf_retry dnf install -y https://packages.microsoft.com/config/rhel/8/packages-microsoft-prod.rpm + else + echo "Unsupported RHEL version: $RHEL_MAJOR_VERSION" + exit 1 + fi + dnf_retry dnf install azure-cli -y az login --identity } diff --git a/modules/azurerm/dra-analytics/setup.tftpl b/modules/azurerm/dra-analytics/setup.tftpl index 4613c909b..f3b555746 100644 --- a/modules/azurerm/dra-analytics/setup.tftpl +++ b/modules/azurerm/dra-analytics/setup.tftpl @@ -3,17 +3,48 @@ set -x exec > >(tee /var/log/user-data.log|logger -t user-data ) 2>&1 echo BEGIN +RHEL_MAJOR_VERSION=$(cat /etc/os-release | grep VERSION_ID | sed -E 's/^.*"([0-9]+)\.[0-9]+".*$/\1/') + function wait-for-admin(){ while ! nc -z ${admin_server_private_ip} 8443; do sleep 0.1 done } +# Retry a command on transient dnf/yum-mirror or RHUI failures (e.g. 
HTTP 400 +# from rhui-{1,2,3}.microsoft.com, "Failed to download metadata for repo", +# brief TLS hangups). Cleans the dnf cache between attempts. +function dnf_retry(){ + local attempt + for attempt in 1 2 3 4 5; do + if "$@"; then + return 0 + fi + echo "Command failed (attempt $attempt): $*. Cleaning dnf cache and retrying..." + dnf clean all || true + rm -rf /var/cache/dnf || true + sleep $((attempt * 10)) + done + return 1 +} + function install-azure-cli(){ - rpm --import https://packages.microsoft.com/keys/microsoft.asc - dnf install -y https://packages.microsoft.com/config/rhel/9.0/packages-microsoft-prod.rpm - dnf install -y https://packages.microsoft.com/config/rhel/8/packages-microsoft-prod.rpm - dnf install azure-cli -y + dnf_retry rpm --import https://packages.microsoft.com/keys/microsoft.asc + # Install only the packages-microsoft-prod RPM that matches the running + # RHEL major version. Previously we attempted both RHEL 8 and RHEL 9 + # variants and tolerated the failing one with `|| true`, but combined + # with dnf_retry that wasted up to ~5 minutes of bounded retries on the + # impossible install -- enough to push past the downstream null_resource + # readiness timeout. 
+ if [ "$RHEL_MAJOR_VERSION" = "9" ]; then + dnf_retry dnf install -y https://packages.microsoft.com/config/rhel/9.0/packages-microsoft-prod.rpm + elif [ "$RHEL_MAJOR_VERSION" = "8" ]; then + dnf_retry dnf install -y https://packages.microsoft.com/config/rhel/8/packages-microsoft-prod.rpm + else + echo "Unsupported RHEL version: $RHEL_MAJOR_VERSION" + exit 1 + fi + dnf_retry dnf install azure-cli -y az login --identity } diff --git a/modules/azurerm/sonar-base-instance/setup.tftpl b/modules/azurerm/sonar-base-instance/setup.tftpl index a59b17f34..2feba42e3 100644 --- a/modules/azurerm/sonar-base-instance/setup.tftpl +++ b/modules/azurerm/sonar-base-instance/setup.tftpl @@ -30,6 +30,25 @@ function internet_access() { fi } +# Retry a yum/dnf-based command with cache repair between attempts. Recovers +# from transient Microsoft RHUI failures (HTTP 400 on +# https://rhui-{1,2,3}.microsoft.com/.../repomd.xml, "All mirrors were tried", +# "Failed to download metadata for repo"), which otherwise poison the local +# cache and make every retry fail the same way. +function yum_retry() { + local attempt + for attempt in 1 2 3 4 5; do + if "$@"; then + return 0 + fi + echo "Command failed (attempt $attempt): $*. Cleaning yum/dnf cache and retrying..." + yum clean all || true + rm -rf /var/cache/yum /var/cache/dnf || true + sleep $((attempt * 10)) + done + return 1 +} + function install_yum_dep_from_internet() { if ! internet_access; then echo "Error: No outbound internet access. Either enable outbound internet access, or make sure $@ is installed in the base ami" @@ -38,12 +57,10 @@ function install_yum_dep_from_internet() { local package="$1" local package_name="$${2:-$1}" + # yum_retry takes care of cache repair between attempts, so a transient + # Microsoft RHUI 400 / repomd.xml download glitch heals automatically. if ! 
yum list installed "$${package_name}"; then - yum install "$${package}" -y \ - || yum install "$${package}" -y \ - || yum install "$${package}" -y \ - || yum install "$${package}" -y \ - || yum install "$${package}" -y # trying x times since sometimes there is a glitch with the entitlement server + yum_retry yum install "$${package}" -y fi } @@ -52,16 +69,16 @@ function install_azcli_from_internet() { echo "Error: No outbound internet access. Either enable outbound internet access, or make sure az cli is installed in the base image" exit 1 fi - rpm --import https://packages.microsoft.com/keys/microsoft.asc + yum_retry rpm --import https://packages.microsoft.com/keys/microsoft.asc if [ "$RHEL_MAJOR_VERSION" -eq 8 ]; then - dnf install -y https://packages.microsoft.com/config/rhel/8/packages-microsoft-prod.rpm + yum_retry dnf install -y https://packages.microsoft.com/config/rhel/8/packages-microsoft-prod.rpm elif [ "$RHEL_MAJOR_VERSION" -eq 9 ]; then - dnf install -y https://packages.microsoft.com/config/rhel/9.0/packages-microsoft-prod.rpm + yum_retry dnf install -y https://packages.microsoft.com/config/rhel/9.0/packages-microsoft-prod.rpm else echo "Unsupported RHEL version: $RHEL_MAJOR_VERSION" exit 1 fi - dnf install azure-cli -y + yum_retry dnf install azure-cli -y az login --identity --allow-no-subscriptions }