diff --git a/.github/workflows/plan-ansible.yml b/.github/workflows/plan-ansible.yml
index 3cdbe15e96..68c6835f9c 100644
--- a/.github/workflows/plan-ansible.yml
+++ b/.github/workflows/plan-ansible.yml
@@ -151,6 +151,12 @@ jobs:
             set -e
 
             if [ "$ansible_exit" -ne 0 ] && [ "$ansible_exit" -ne 2 ]; then
+              echo "::group::Ansible stderr for ${playbook_name} on ${node}"
+              cat "plan_outputs/${node}-${playbook_name}.stderr" || true
+              echo "::endgroup::"
+              echo "::group::Ansible JSON tail for ${playbook_name} on ${node}"
+              tail -n 200 "$json_file" || true
+              echo "::endgroup::"
               echo "::error::Ansible playbook $playbook failed with exit code $ansible_exit"
               return "$ansible_exit"
             fi
@@ -182,6 +188,7 @@ jobs:
             fi
 
       - name: Upload plan output
+        if: always()
         uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02  # v4
         with:
           name: plan-${{ matrix.node }}
diff --git a/.github/workflows/upgrade-k8s.yml b/.github/workflows/upgrade-k8s.yml
index 9a54a104d0..87fbc7f8a3 100644
--- a/.github/workflows/upgrade-k8s.yml
+++ b/.github/workflows/upgrade-k8s.yml
@@ -29,7 +29,6 @@ on:
 env:
   NODE_IPS: '{"shanghai-1":"192.168.10.102","shanghai-2":"192.168.10.103","shanghai-3":"192.168.10.104"}'
   ALL_NODE_IPS: "192.168.10.102 192.168.10.103 192.168.10.104"
-  ETCD_SNAPSHOT_S3_URI: s3://arch-etcd-snapshots/kubernetes-upgrade
 
 jobs:
   extract-versions:
@@ -191,8 +190,6 @@ jobs:
         shell: bash
         run: |
           set -eo pipefail
-          ETCD_SNAPSHOT_ARTIFACT_DIR="/tmp/etcd-snapshots"
-          mkdir -p "${ETCD_SNAPSHOT_ARTIFACT_DIR}"
           cd ansible
           source .venv/bin/activate
           echo "Running pre-upgrade checks with K8S_VERSION=${K8S_VERSION}"
@@ -204,7 +201,6 @@ jobs:
             -e "kubernetes_upgrade_version=${K8S_VERSION}" \
             -e "kubernetes_upgrade_package=${K8S_PACKAGE}" \
             -e "crio_upgrade_version=${CRIO_VERSION}" \
-            -e "etcd_snapshot_local_dir=${ETCD_SNAPSHOT_ARTIFACT_DIR}" \
             -e "etcd_snapshot_force=true" \
             -vv 2>&1 | tee /tmp/pre-check-ansible.log
 
@@ -216,25 +212,6 @@ jobs:
           path: /tmp/pre-check-ansible.log
           retention-days: 14
 
-      - name: Upload etcd snapshot to S3
-        if: always()
-        shell: bash
-        run: |
-          set -euo pipefail
-          shopt -s nullglob
-
-          snapshots=(/tmp/etcd-snapshots/*.db)
-          if [ "${#snapshots[@]}" -eq 0 ]; then
-            echo "No etcd snapshots found to upload"
-            exit 0
-          fi
-
-          for snapshot in "${snapshots[@]}"; do
-            aws s3 cp "${snapshot}" \
-              "${ETCD_SNAPSHOT_S3_URI}/run-${GITHUB_RUN_ID}/$(basename "${snapshot}")" \
-              --sse AES256
-          done
-
   upgrade-cp-1:
     name: Upgrade shanghai-1 (first CP)
     needs: [extract-versions, pre-check]
diff --git a/ansible/roles/kubernetes_upgrade/defaults/main.yml b/ansible/roles/kubernetes_upgrade/defaults/main.yml
index 7aea0bf949..e2e1f259d4 100644
--- a/ansible/roles/kubernetes_upgrade/defaults/main.yml
+++ b/ansible/roles/kubernetes_upgrade/defaults/main.yml
@@ -17,9 +17,13 @@ upgrade_health_check_delay: 10 # seconds
 # etcd snapshot
 etcd_snapshot_dir: /var/lib/etcd-snapshots
 etcd_snapshot_pod_dir: /var/lib/etcd
-etcd_snapshot_local_dir: ""
 etcd_snapshot_cleanup_pod_file: true
+etcd_snapshot_cleanup_host_file: false
 etcd_snapshot_force: false
+etcd_snapshot_store_namespace: etcd-snapshots
+etcd_snapshot_store_pod_selector: app.kubernetes.io/name=etcd-snapshot-store
+etcd_snapshot_store_mount_path: /snapshots
+etcd_snapshot_store_retention_count: 3
 etcd_cacert: /etc/kubernetes/pki/etcd/ca.crt
 etcd_cert: /etc/kubernetes/pki/etcd/healthcheck-client.crt
 etcd_key: /etc/kubernetes/pki/etcd/healthcheck-client.key
diff --git a/ansible/roles/kubernetes_upgrade/molecule/default/prepare.yml b/ansible/roles/kubernetes_upgrade/molecule/default/prepare.yml
index ada0b8660e..03d42a42ba 100644
--- a/ansible/roles/kubernetes_upgrade/molecule/default/prepare.yml
+++ b/ansible/roles/kubernetes_upgrade/molecule/default/prepare.yml
@@ -119,6 +119,9 @@
             *"get node"*"conditions"*)
               printf "MemoryPressure=False\nDiskPressure=False\nPIDPressure=False\nReady=True\n"
               ;;
+            *"get pod"*"app.kubernetes.io/name=etcd-snapshot-store"*"jsonpath"*)
+              echo -n "etcd-snapshot-store-test"
+              ;;
             *"drain"*)
               echo "node/control-plane-test cordoned"
               echo "node/control-plane-test drained"
@@ -140,13 +143,25 @@
               printf "mock snapshot\n" > "$snapshot_path"
               echo "Snapshot saved at $snapshot_path"
               ;;
-            *"exec etcd-"*"etcdctl snapshot status"*)
+            *"exec etcd-"*"etcdutl snapshot status"*)
               echo "+----------+----------+------------+------------+"
               echo "| HASH | REVISION | TOTAL KEYS | TOTAL SIZE |"
               echo "+----------+----------+------------+------------+"
               echo "| abcd1234 | 12345 | 1000 | 5.0 MB |"
               echo "+----------+----------+------------+------------+"
               ;;
+            *"exec -i "*" sh -c "*"cat > "*)
+              shell_command="${@: -1}"
+              snapshot_path="${shell_command#cat > }"
+              snapshot_path="${snapshot_path%\'}"
+              snapshot_path="${snapshot_path#\'}"
+              mkdir -p "$(dirname "$snapshot_path")"
+              cat > "$snapshot_path"
+              ;;
+            *"exec "*" test -s "*)
+              snapshot_path="${@: -1}"
+              test -s "$snapshot_path"
+              ;;
             *"exec etcd-"*" cat "*)
               snapshot_path="${@: -1}"
               cat "$snapshot_path"
@@ -190,6 +205,26 @@
         mode: '0755'
       become: true
 
+    - name: Create mock etcdutl binary
+      ansible.builtin.copy:
+        dest: /usr/local/bin/etcdutl
+        content: |
+          #!/bin/bash
+          case "$*" in
+            "snapshot status "*)
+              echo "+----------+----------+------------+------------+"
+              echo "| HASH | REVISION | TOTAL KEYS | TOTAL SIZE |"
+              echo "+----------+----------+------------+------------+"
+              echo "| abcd1234 | 12345 | 1000 | 5.0 MB |"
+              echo "+----------+----------+------------+------------+"
+              ;;
+            *)
+              echo "mock etcdutl: $*"
+              ;;
+          esac
+        mode: '0755'
+      become: true
+
     - name: Create mock kubeadm binary
       ansible.builtin.copy:
         dest: /usr/local/bin/kubeadm
diff --git a/ansible/roles/kubernetes_upgrade/tasks/etcd_snapshot.yml b/ansible/roles/kubernetes_upgrade/tasks/etcd_snapshot.yml
index 2f00cb96fa..00cfa43c91 100644
--- a/ansible/roles/kubernetes_upgrade/tasks/etcd_snapshot.yml
+++ b/ansible/roles/kubernetes_upgrade/tasks/etcd_snapshot.yml
@@ -15,20 +15,71 @@
   register: existing_snapshots
   tags: [etcd_snapshot, pre_checks]
 
+- name: Set valid existing etcd snapshots
+  ansible.builtin.set_fact:
+    _valid_existing_snapshots: "{{ existing_snapshots.files | selectattr('size', 'gt', 0) | list }}"
+  tags: [etcd_snapshot, pre_checks]
+
 - name: Set etcd snapshot paths
   ansible.builtin.set_fact:
     _etcd_snapshot_filename: "pre-upgrade-{{ ansible_facts['date_time']['iso8601_basic_short'] }}.db"
     _etcd_snapshot_pod_path: "{{ etcd_snapshot_pod_dir }}/pre-upgrade-{{ ansible_facts['date_time']['iso8601_basic_short'] }}.db"
     _etcd_snapshot_remote_path: "{{ etcd_snapshot_dir }}/pre-upgrade-{{ ansible_facts['date_time']['iso8601_basic_short'] }}.db"
+    _etcd_snapshot_needs_create: "{{ (etcd_snapshot_force | bool) or (_valid_existing_snapshots | length == 0) }}"
   tags: [etcd_snapshot, pre_checks]
 
 - name: Reuse existing etcd snapshot for today
   ansible.builtin.set_fact:
-    _etcd_snapshot_remote_path: "{{ (existing_snapshots.files | sort(attribute='mtime') | last).path }}"
-    _etcd_snapshot_filename: "{{ (existing_snapshots.files | sort(attribute='mtime') | last).path | basename }}"
+    _etcd_snapshot_remote_path: "{{ (_valid_existing_snapshots | sort(attribute='mtime') | last).path }}"
+    _etcd_snapshot_filename: "{{ (_valid_existing_snapshots | sort(attribute='mtime') | last).path | basename }}"
   when:
     - not (etcd_snapshot_force | bool)
-    - existing_snapshots.matched > 0
+    - _valid_existing_snapshots | length > 0
+  tags: [etcd_snapshot, pre_checks]
+
+# Store-pod discovery is only needed when a new snapshot will be streamed;
+# a reused same-day snapshot is not re-uploaded, so skip it in that case.
+- name: Wait for etcd snapshot store pod
+  ansible.builtin.command:
+    argv:
+      - kubectl
+      - "--kubeconfig={{ kubeconfig_path }}"
+      - -n
+      - "{{ etcd_snapshot_store_namespace }}"
+      - wait
+      - "--for=condition=Ready"
+      - pod
+      - "-l"
+      - "{{ etcd_snapshot_store_pod_selector }}"
+      - --timeout=180s
+  become: true
+  changed_when: false
+  when: _etcd_snapshot_needs_create | bool
+  tags: [etcd_snapshot, pre_checks]
+
+- name: Get ready etcd snapshot store pod
+  ansible.builtin.command:
+    argv:
+      - kubectl
+      - "--kubeconfig={{ kubeconfig_path }}"
+      - -n
+      - "{{ etcd_snapshot_store_namespace }}"
+      - get
+      - pod
+      - "-l"
+      - "{{ etcd_snapshot_store_pod_selector }}"
+      - -o
+      - "jsonpath={.items[0].metadata.name}"
+  become: true
+  changed_when: false
+  register: etcd_snapshot_store_pod_result
+  when: _etcd_snapshot_needs_create | bool
+  tags: [etcd_snapshot, pre_checks]
+
+- name: Set etcd snapshot store pod name
+  ansible.builtin.set_fact:
+    _etcd_snapshot_store_pod: "{{ etcd_snapshot_store_pod_result.stdout }}"
+  when: _etcd_snapshot_needs_create | bool
   tags: [etcd_snapshot, pre_checks]
 
 - name: Take etcd snapshot inside etcd pod
@@ -50,7 +101,7 @@
       - "--key={{ etcd_key }}"
   become: true
   changed_when: true
-  when: (etcd_snapshot_force | bool) or existing_snapshots.matched == 0
+  when: _etcd_snapshot_needs_create | bool
   tags: [etcd_snapshot, pre_checks]
 
 - name: Verify etcd snapshot integrity inside etcd pod
@@ -63,14 +114,14 @@
       - exec
       - "etcd-{{ inventory_hostname }}"
       - --
-      - etcdctl
+      - etcdutl
       - snapshot
       - status
       - "{{ _etcd_snapshot_pod_path }}"
       - --write-out=table
   become: true
   changed_when: false
-  when: (etcd_snapshot_force | bool) or existing_snapshots.matched == 0
+  when: _etcd_snapshot_needs_create | bool
   tags: [etcd_snapshot, pre_checks]
 
 - name: Move etcd snapshot from host-mounted etcd data directory
@@ -81,26 +132,72 @@
       - "{{ _etcd_snapshot_remote_path }}"
   become: true
   changed_when: true
-  when: (etcd_snapshot_force | bool) or existing_snapshots.matched == 0
+  when: _etcd_snapshot_needs_create | bool
   tags: [etcd_snapshot, pre_checks]
 
-- name: Create local etcd snapshot artifact directory
-  ansible.builtin.file:
-    path: "{{ etcd_snapshot_local_dir }}"
-    state: directory
-    mode: '0700'
-  delegate_to: localhost
-  become: false
-  when: etcd_snapshot_local_dir | length > 0
+- name: Stream etcd snapshot into Longhorn PVC
+  ansible.builtin.shell: |
+    set -o pipefail
+    cat {{ _etcd_snapshot_remote_path | quote }} | \
+      kubectl --kubeconfig={{ kubeconfig_path | quote }} \
+        -n {{ etcd_snapshot_store_namespace | quote }} \
+        exec -i {{ _etcd_snapshot_store_pod | quote }} -- \
+        sh -c "cat > {{ (etcd_snapshot_store_mount_path ~ "/" ~ _etcd_snapshot_filename) | quote }}"
+  args:
+    executable: /bin/bash
+  become: true
+  changed_when: true
+  when: _etcd_snapshot_needs_create | bool
   tags: [etcd_snapshot, pre_checks]
 
-- name: Fetch etcd snapshot to local artifact directory
-  ansible.builtin.fetch:
-    src: "{{ _etcd_snapshot_remote_path }}"
-    dest: "{{ etcd_snapshot_local_dir }}/{{ _etcd_snapshot_filename }}"
-    flat: true
+- name: Verify etcd snapshot exists in Longhorn PVC
+  ansible.builtin.command:
+    argv:
+      - kubectl
+      - "--kubeconfig={{ kubeconfig_path }}"
+      - -n
+      - "{{ etcd_snapshot_store_namespace }}"
+      - exec
+      - "{{ _etcd_snapshot_store_pod }}"
+      - --
+      - test
+      - -s
+      - "{{ etcd_snapshot_store_mount_path }}/{{ _etcd_snapshot_filename }}"
   become: true
-  when: etcd_snapshot_local_dir | length > 0
+  changed_when: false
+  when: _etcd_snapshot_needs_create | bool
+  tags: [etcd_snapshot, pre_checks]
+
+- name: Prune old etcd snapshots from Longhorn PVC
+  ansible.builtin.command:
+    argv:
+      - kubectl
+      - "--kubeconfig={{ kubeconfig_path }}"
+      - -n
+      - "{{ etcd_snapshot_store_namespace }}"
+      - exec
+      - "{{ _etcd_snapshot_store_pod }}"
+      - --
+      - sh
+      - -c
+      - |
+        set -eu
+        cd {{ etcd_snapshot_store_mount_path | quote }}
+        retention={{ etcd_snapshot_store_retention_count | int }}
+        index=0
+        for snapshot in $(ls -1t pre-upgrade-*.db 2>/dev/null || true); do
+          index=$((index + 1))
+          if [ "$index" -gt "$retention" ]; then
+            rm -f -- "$snapshot"
+            echo "deleted $snapshot"
+          fi
+        done
+  become: true
+  changed_when: "'deleted ' in prune_etcd_snapshots.stdout"
+  register: prune_etcd_snapshots
+  when:
+    - _etcd_snapshot_needs_create | bool
+    - (etcd_snapshot_store_retention_count | int) > 0
   tags: [etcd_snapshot, pre_checks]
 
 - name: Remove temporary etcd snapshot from host-mounted etcd data directory
@@ -113,5 +210,18 @@
   changed_when: false
   when:
     - etcd_snapshot_cleanup_pod_file | bool
-    - (etcd_snapshot_force | bool) or existing_snapshots.matched == 0
+    - _etcd_snapshot_needs_create | bool
+  tags: [etcd_snapshot, pre_checks]
+
+- name: Remove temporary etcd snapshot from control plane host
+  ansible.builtin.command:
+    argv:
+      - rm
+      - -f
+      - "{{ _etcd_snapshot_remote_path }}"
+  become: true
+  changed_when: false
+  when:
+    - etcd_snapshot_cleanup_host_file | bool
+    - _etcd_snapshot_needs_create | bool
   tags: [etcd_snapshot, pre_checks]
diff --git a/ansible/roles/kubernetes_upgrade/tasks/upgrade_apt_source.yml b/ansible/roles/kubernetes_upgrade/tasks/upgrade_apt_source.yml
index c84e1740de..2e76337c32 100644
--- a/ansible/roles/kubernetes_upgrade/tasks/upgrade_apt_source.yml
+++ b/ansible/roles/kubernetes_upgrade/tasks/upgrade_apt_source.yml
@@ -62,6 +62,7 @@
   ansible.builtin.apt:
     update_cache: true
   become: true
+  changed_when: false
   tags: [upgrade]
 
 - name: Upgrade CRI-O
diff --git a/docs/project_docs/BOXP-2-etcd-snapshot-longhorn-pvc/plan.md b/docs/project_docs/BOXP-2-etcd-snapshot-longhorn-pvc/plan.md
new file mode 100644
index 0000000000..af419c143d
--- /dev/null
+++ b/docs/project_docs/BOXP-2-etcd-snapshot-longhorn-pvc/plan.md
@@ -0,0 +1,23 @@
+# BOXP-2 etcd snapshot Longhorn PVC plan
+
+## Context
+
+The Kubernetes upgrade pre-check successfully creates an etcd snapshot on the control-plane node, but fetching a 600MB+ snapshot back to the GitHub Actions runner can be killed by SSH/module memory limits. Storing the file in a dedicated Longhorn PVC keeps it inside the cluster and lets the existing Longhorn backup target handle durable backup.
+
+## Plan
+
+- Remove the GitHub Actions local artifact fetch and dedicated S3 upload path for etcd snapshots.
+- Keep the previous S3 Terraform resources managed for now so an apply does not try to delete a non-empty bucket as part of this migration.
+- Manage the dedicated namespace, Longhorn PVC, and storage Deployment from the lolice GitOps manifests.
+- In the upgrade pre-check, wait for the GitOps-managed storage pod by label and stream the snapshot into it.
+- Create the snapshot inside the etcd pod, verify it with `etcdutl snapshot status`, move it out of the host-mounted etcd data directory into `etcd_snapshot_dir` on the control-plane host, and stream it into the PVC.
+- Ignore zero-byte same-day snapshot files when deciding whether a reusable snapshot exists.
+- Retain only the newest `etcd_snapshot_store_retention_count` snapshots (default 3) in the PVC so repeated dry-runs do not fill the volume.
+- Keep the control-plane host copy by default (`etcd_snapshot_cleanup_host_file: false`) because the current rollback task restores from `etcd_snapshot_dir`.
+- Make the Plan Ansible CI workflow print the captured stderr and the last 200 lines of the JSON output when a playbook fails, and upload plan outputs even when the job fails.
+
+## Validation
+
+- Run Ansible syntax and lint checks for the upgrade playbook.
+- Run the `kubernetes_upgrade` Molecule scenario if local container support allows it.
+- Run a GitHub Actions dry-run after the PR is merged to confirm that the pre-check stores the snapshot in the Longhorn PVC.