Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .github/workflows/plan-ansible.yml
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,12 @@ jobs:
set -e

if [ "$ansible_exit" -ne 0 ] && [ "$ansible_exit" -ne 2 ]; then
echo "::group::Ansible stderr for ${playbook_name} on ${node}"
cat "plan_outputs/${node}-${playbook_name}.stderr" || true
echo "::endgroup::"
echo "::group::Ansible JSON tail for ${playbook_name} on ${node}"
tail -n 200 "$json_file" || true
echo "::endgroup::"
echo "::error::Ansible playbook $playbook failed with exit code $ansible_exit"
return "$ansible_exit"
fi
Expand Down Expand Up @@ -182,6 +188,7 @@ jobs:
fi

- name: Upload plan output
if: always()
uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
with:
name: plan-${{ matrix.node }}
Expand Down
23 changes: 0 additions & 23 deletions .github/workflows/upgrade-k8s.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ on:
env:
NODE_IPS: '{"shanghai-1":"192.168.10.102","shanghai-2":"192.168.10.103","shanghai-3":"192.168.10.104"}'
ALL_NODE_IPS: "192.168.10.102 192.168.10.103 192.168.10.104"
ETCD_SNAPSHOT_S3_URI: s3://arch-etcd-snapshots/kubernetes-upgrade

jobs:
extract-versions:
Expand Down Expand Up @@ -191,8 +190,6 @@ jobs:
shell: bash
run: |
set -eo pipefail
ETCD_SNAPSHOT_ARTIFACT_DIR="/tmp/etcd-snapshots"
mkdir -p "${ETCD_SNAPSHOT_ARTIFACT_DIR}"
cd ansible
source .venv/bin/activate
echo "Running pre-upgrade checks with K8S_VERSION=${K8S_VERSION}"
Expand All @@ -204,7 +201,6 @@ jobs:
-e "kubernetes_upgrade_version=${K8S_VERSION}" \
-e "kubernetes_upgrade_package=${K8S_PACKAGE}" \
-e "crio_upgrade_version=${CRIO_VERSION}" \
-e "etcd_snapshot_local_dir=${ETCD_SNAPSHOT_ARTIFACT_DIR}" \
-e "etcd_snapshot_force=true" \
-vv 2>&1 | tee /tmp/pre-check-ansible.log

Expand All @@ -216,25 +212,6 @@ jobs:
path: /tmp/pre-check-ansible.log
retention-days: 14

- name: Upload etcd snapshot to S3
if: always()
shell: bash
run: |
set -euo pipefail
shopt -s nullglob

snapshots=(/tmp/etcd-snapshots/*.db)
if [ "${#snapshots[@]}" -eq 0 ]; then
echo "No etcd snapshots found to upload"
exit 0
fi

for snapshot in "${snapshots[@]}"; do
aws s3 cp "${snapshot}" \
"${ETCD_SNAPSHOT_S3_URI}/run-${GITHUB_RUN_ID}/$(basename "${snapshot}")" \
--sse AES256
done

upgrade-cp-1:
name: Upgrade shanghai-1 (first CP)
needs: [extract-versions, pre-check]
Expand Down
6 changes: 5 additions & 1 deletion ansible/roles/kubernetes_upgrade/defaults/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,13 @@ upgrade_health_check_delay: 10 # seconds
# etcd snapshot
etcd_snapshot_dir: /var/lib/etcd-snapshots
etcd_snapshot_pod_dir: /var/lib/etcd
etcd_snapshot_local_dir: ""
etcd_snapshot_cleanup_pod_file: true
etcd_snapshot_cleanup_host_file: false
etcd_snapshot_force: false
etcd_snapshot_store_namespace: etcd-snapshots
etcd_snapshot_store_pod_selector: app.kubernetes.io/name=etcd-snapshot-store
etcd_snapshot_store_mount_path: /snapshots
etcd_snapshot_store_retention_count: 3
etcd_cacert: /etc/kubernetes/pki/etcd/ca.crt
etcd_cert: /etc/kubernetes/pki/etcd/healthcheck-client.crt
etcd_key: /etc/kubernetes/pki/etcd/healthcheck-client.key
Expand Down
37 changes: 36 additions & 1 deletion ansible/roles/kubernetes_upgrade/molecule/default/prepare.yml
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,9 @@
*"get node"*"conditions"*)
printf "MemoryPressure=False\nDiskPressure=False\nPIDPressure=False\nReady=True\n"
;;
*"get pod"*"app.kubernetes.io/name=etcd-snapshot-store"*"jsonpath"*)
echo -n "etcd-snapshot-store-test"
;;
*"drain"*)
echo "node/control-plane-test cordoned"
echo "node/control-plane-test drained"
Expand All @@ -140,13 +143,25 @@
printf "mock snapshot\n" > "$snapshot_path"
echo "Snapshot saved at $snapshot_path"
;;
*"exec etcd-"*"etcdctl snapshot status"*)
*"exec etcd-"*"etcdutl snapshot status"*)
echo "+----------+----------+------------+------------+"
echo "| HASH | REVISION | TOTAL KEYS | TOTAL SIZE |"
echo "+----------+----------+------------+------------+"
echo "| abcd1234 | 12345 | 1000 | 5.0 MB |"
echo "+----------+----------+------------+------------+"
;;
*"exec -i "*" sh -c "*"cat > "*)
shell_command="${@: -1}"
snapshot_path="${shell_command#cat > }"
snapshot_path="${snapshot_path%\'}"
snapshot_path="${snapshot_path#\'}"
mkdir -p "$(dirname "$snapshot_path")"
cat > "$snapshot_path"
;;
*"exec "*" test -s "*)
snapshot_path="${@: -1}"
test -s "$snapshot_path"
;;
*"exec etcd-"*" cat "*)
snapshot_path="${@: -1}"
cat "$snapshot_path"
Expand Down Expand Up @@ -190,6 +205,26 @@
mode: '0755'
become: true

# Installs a stand-in etcdutl executable for the Molecule scenario.
# "snapshot status <args>" prints a fixed status table; every other
# invocation just echoes the arguments it was given.
- name: Create mock etcdutl binary
  become: true
  ansible.builtin.copy:
    dest: /usr/local/bin/etcdutl
    mode: '0755'
    content: |
      #!/bin/bash
      case "$*" in
        "snapshot status "*)
          echo "+----------+----------+------------+------------+"
          echo "| HASH | REVISION | TOTAL KEYS | TOTAL SIZE |"
          echo "+----------+----------+------------+------------+"
          echo "| abcd1234 | 12345 | 1000 | 5.0 MB |"
          echo "+----------+----------+------------+------------+"
          ;;
        *)
          echo "mock etcdutl: $*"
          ;;
      esac

- name: Create mock kubeadm binary
ansible.builtin.copy:
dest: /usr/local/bin/kubeadm
Expand Down
149 changes: 127 additions & 22 deletions ansible/roles/kubernetes_upgrade/tasks/etcd_snapshot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,66 @@
register: existing_snapshots
tags: [etcd_snapshot, pre_checks]

# Keep only the entries of existing_snapshots.files whose size is greater
# than zero, so empty files are not treated as usable snapshots.
- name: Set valid existing etcd snapshots
  tags: [etcd_snapshot, pre_checks]
  ansible.builtin.set_fact:
    _valid_existing_snapshots: >-
      {{ existing_snapshots.files | selectattr('size', '>', 0) | list }}

# Derives the snapshot file name from the gathered date_time fact and builds
# the in-pod path and the host path from it. A new snapshot is needed when
# etcd_snapshot_force is set or no non-empty existing snapshot was found.
- name: Set etcd snapshot paths
  vars:
    # Task-scoped helper so the timestamped name is written only once.
    _pre_upgrade_snapshot_name: "pre-upgrade-{{ ansible_facts['date_time']['iso8601_basic_short'] }}.db"
  ansible.builtin.set_fact:
    _etcd_snapshot_filename: "{{ _pre_upgrade_snapshot_name }}"
    _etcd_snapshot_pod_path: "{{ etcd_snapshot_pod_dir }}/{{ _pre_upgrade_snapshot_name }}"
    _etcd_snapshot_remote_path: "{{ etcd_snapshot_dir }}/{{ _pre_upgrade_snapshot_name }}"
    _etcd_snapshot_needs_create: "{{ (etcd_snapshot_force | bool) or (_valid_existing_snapshots | length == 0) }}"
  tags: [etcd_snapshot, pre_checks]

- name: Reuse existing etcd snapshot for today
ansible.builtin.set_fact:
_etcd_snapshot_remote_path: "{{ (existing_snapshots.files | sort(attribute='mtime') | last).path }}"
_etcd_snapshot_filename: "{{ (existing_snapshots.files | sort(attribute='mtime') | last).path | basename }}"
_etcd_snapshot_remote_path: "{{ (_valid_existing_snapshots | sort(attribute='mtime') | last).path }}"
_etcd_snapshot_filename: "{{ (_valid_existing_snapshots | sort(attribute='mtime') | last).path | basename }}"
when:
- not (etcd_snapshot_force | bool)
- existing_snapshots.matched > 0
- _valid_existing_snapshots | length > 0
tags: [etcd_snapshot, pre_checks]

# Blocks for up to 180s until the pods matching
# etcd_snapshot_store_pod_selector in etcd_snapshot_store_namespace report
# condition Ready. Read-only, so it never reports a change.
- name: Wait for etcd snapshot store pod
  become: true
  tags: [etcd_snapshot, pre_checks]
  changed_when: false
  ansible.builtin.command:
    argv:
      - kubectl
      - "--kubeconfig={{ kubeconfig_path }}"
      - -n
      - "{{ etcd_snapshot_store_namespace }}"
      - wait
      - "--for=condition=Ready"
      - pod
      - "-l"
      - "{{ etcd_snapshot_store_pod_selector }}"
      - --timeout=180s

# Looks up the name of the first pod returned for
# etcd_snapshot_store_pod_selector and registers the command result as
# etcd_snapshot_store_pod_result (the pod name is its stdout).
# NOTE(review): the jsonpath takes .items[0] without filtering on readiness;
# this task itself does not check that the selected pod is Ready — confirm
# that is acceptable when more than one pod can match the selector.
- name: Get ready etcd snapshot store pod
  become: true
  tags: [etcd_snapshot, pre_checks]
  register: etcd_snapshot_store_pod_result
  changed_when: false
  ansible.builtin.command:
    argv:
      - kubectl
      - "--kubeconfig={{ kubeconfig_path }}"
      - -n
      - "{{ etcd_snapshot_store_namespace }}"
      - get
      - pod
      - "-l"
      - "{{ etcd_snapshot_store_pod_selector }}"
      - -o
      - "jsonpath={.items[0].metadata.name}"

# Exposes the stdout of the pod lookup as _etcd_snapshot_store_pod.
- name: Set etcd snapshot store pod name
  tags: [etcd_snapshot, pre_checks]
  ansible.builtin.set_fact:
    _etcd_snapshot_store_pod: "{{ etcd_snapshot_store_pod_result['stdout'] }}"

- name: Take etcd snapshot inside etcd pod
Expand All @@ -50,7 +96,7 @@
- "--key={{ etcd_key }}"
become: true
changed_when: true
when: (etcd_snapshot_force | bool) or existing_snapshots.matched == 0
when: _etcd_snapshot_needs_create | bool
tags: [etcd_snapshot, pre_checks]

- name: Verify etcd snapshot integrity inside etcd pod
Expand All @@ -63,14 +109,14 @@
- exec
- "etcd-{{ inventory_hostname }}"
- --
- etcdctl
- etcdutl
- snapshot
- status
- "{{ _etcd_snapshot_pod_path }}"
- --write-out=table
become: true
changed_when: false
when: (etcd_snapshot_force | bool) or existing_snapshots.matched == 0
when: _etcd_snapshot_needs_create | bool
tags: [etcd_snapshot, pre_checks]

- name: Move etcd snapshot from host-mounted etcd data directory
Expand All @@ -81,26 +127,72 @@
- "{{ _etcd_snapshot_remote_path }}"
become: true
changed_when: true
when: (etcd_snapshot_force | bool) or existing_snapshots.matched == 0
when: _etcd_snapshot_needs_create | bool
tags: [etcd_snapshot, pre_checks]

- name: Create local etcd snapshot artifact directory
ansible.builtin.file:
path: "{{ etcd_snapshot_local_dir }}"
state: directory
mode: '0700'
delegate_to: localhost
become: false
when: etcd_snapshot_local_dir | length > 0
# Copies the snapshot at _etcd_snapshot_remote_path into the snapshot store
# pod by feeding it to `kubectl exec -i ... sh -c "cat > <target>"`, where
# <target> is etcd_snapshot_store_mount_path/_etcd_snapshot_filename.
#
# The file is attached with an input redirection rather than `cat file |`:
# if the source file cannot be opened, bash fails before kubectl is started,
# so no empty target file is created inside the pod. The kubectl arguments
# are unchanged.
- name: Stream etcd snapshot into Longhorn PVC
  ansible.builtin.shell: |
    set -euo pipefail
    kubectl --kubeconfig={{ kubeconfig_path | quote }} \
      -n {{ etcd_snapshot_store_namespace | quote }} \
      exec -i {{ _etcd_snapshot_store_pod | quote }} -- \
      sh -c "cat > {{ (etcd_snapshot_store_mount_path ~ '/' ~ _etcd_snapshot_filename) | quote }}" \
      < {{ _etcd_snapshot_remote_path | quote }}
  args:
    executable: /bin/bash
  become: true
  changed_when: true
  # Only runs when a new snapshot was requested or none could be reused.
  when: _etcd_snapshot_needs_create | bool
  tags: [etcd_snapshot, pre_checks]

- name: Fetch etcd snapshot to local artifact directory
ansible.builtin.fetch:
src: "{{ _etcd_snapshot_remote_path }}"
dest: "{{ etcd_snapshot_local_dir }}/{{ _etcd_snapshot_filename }}"
flat: true
- name: Verify etcd snapshot exists in Longhorn PVC
ansible.builtin.command:
argv:
- kubectl
- "--kubeconfig={{ kubeconfig_path }}"
- -n
- "{{ etcd_snapshot_store_namespace }}"
- exec
- "{{ _etcd_snapshot_store_pod }}"
- --
- test
- -s
- "{{ etcd_snapshot_store_mount_path }}/{{ _etcd_snapshot_filename }}"
become: true
when: etcd_snapshot_local_dir | length > 0
changed_when: false
when: _etcd_snapshot_needs_create | bool
tags: [etcd_snapshot, pre_checks]

# Deletes all but the newest etcd_snapshot_store_retention_count files
# matching pre-upgrade-*.db in etcd_snapshot_store_mount_path inside the
# snapshot store pod. Each removal prints "deleted <file>", which is what
# changed_when looks for.
#
# `ls -1t` lists newest first and `tail -n +K` passes on everything from
# line K, i.e. the entries beyond the retention count. Names are read line
# by line with `read -r`, so they are not subject to the word splitting and
# pathname expansion that an unquoted `$(ls ...)` in a for loop applies.
# If no file matches, ls prints nothing (its error is discarded) and the
# loop body never runs.
- name: Prune old etcd snapshots from Longhorn PVC
  ansible.builtin.command:
    argv:
      - kubectl
      - "--kubeconfig={{ kubeconfig_path }}"
      - -n
      - "{{ etcd_snapshot_store_namespace }}"
      - exec
      - "{{ _etcd_snapshot_store_pod }}"
      - --
      - sh
      - -c
      - |
        set -eu
        cd {{ etcd_snapshot_store_mount_path | quote }}
        retention={{ etcd_snapshot_store_retention_count | int }}
        ls -1t pre-upgrade-*.db 2>/dev/null \
          | tail -n "+$((retention + 1))" \
          | while IFS= read -r snapshot; do
              rm -f -- "$snapshot"
              echo "deleted $snapshot"
            done
  become: true
  register: prune_etcd_snapshots
  changed_when: "'deleted ' in prune_etcd_snapshots.stdout"
  when:
    - _etcd_snapshot_needs_create | bool
    # A retention count of 0 or less skips the task, so nothing is deleted.
    - (etcd_snapshot_store_retention_count | int) > 0
  tags: [etcd_snapshot, pre_checks]

- name: Remove temporary etcd snapshot from host-mounted etcd data directory
Expand All @@ -113,5 +205,18 @@
changed_when: false
when:
- etcd_snapshot_cleanup_pod_file | bool
- (etcd_snapshot_force | bool) or existing_snapshots.matched == 0
- _etcd_snapshot_needs_create | bool
tags: [etcd_snapshot, pre_checks]

# Removes the snapshot at _etcd_snapshot_remote_path from the host when
# etcd_snapshot_cleanup_host_file is enabled and a new snapshot was created
# in this run. Uses the file module instead of shelling out to `rm -f`:
# a missing file is still not an error, and the module can evaluate the
# removal in check mode. changed_when stays false so the task keeps
# reporting "ok", as before.
- name: Remove temporary etcd snapshot from control plane host
  ansible.builtin.file:
    path: "{{ _etcd_snapshot_remote_path }}"
    state: absent
  become: true
  changed_when: false
  when:
    - etcd_snapshot_cleanup_host_file | bool
    - _etcd_snapshot_needs_create | bool
  tags: [etcd_snapshot, pre_checks]
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
ansible.builtin.apt:
update_cache: true
become: true
changed_when: false
tags: [upgrade]

- name: Upgrade CRI-O
Expand Down
23 changes: 23 additions & 0 deletions docs/project_docs/BOXP-2-etcd-snapshot-longhorn-pvc/plan.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# BOXP-2 etcd snapshot Longhorn PVC plan

## Context

The Kubernetes upgrade pre-check successfully creates an etcd snapshot on the control-plane node, but fetching a snapshot of 600 MB or more back to the GitHub Actions runner can fail because the transfer is killed by SSH/module memory limits. Storing the file in a dedicated Longhorn PVC keeps it inside the cluster and lets the existing Longhorn backup target handle durable backup.

## Plan

- Remove the GitHub Actions local artifact fetch and dedicated S3 upload path for etcd snapshots.
- Keep the previous S3 Terraform resources managed for now so an apply does not try to delete a non-empty bucket as part of this migration.
- Manage the dedicated namespace, Longhorn PVC, and storage Deployment from the lolice GitOps manifests.
- In the upgrade pre-check, wait for the GitOps-managed storage pod by label and stream the snapshot into it.
- Create the snapshot inside the etcd pod, verify it with `etcdutl snapshot status`, move it to the control-plane host, and stream it into the PVC.
- Ignore zero-byte same-day snapshot files when deciding whether a reusable snapshot exists.
- Retain only the newest snapshots in the PVC, limited by `etcd_snapshot_store_retention_count` (default 3), so repeated dry-runs do not fill the volume.
- Keep the control-plane host copy by default (`etcd_snapshot_cleanup_host_file: false`) because the current rollback task restores from `etcd_snapshot_dir`.
- Make Plan Ansible CI print captured stderr and JSON tail on playbook failure, and upload plan outputs even when the job fails.

## Validation

- Run Ansible syntax and lint checks for the upgrade playbook.
- Run the kubernetes_upgrade Molecule scenario if local container support allows it.
- Run a GitHub Actions dry-run after the PR is merged to confirm the pre-check stores the snapshot in the Longhorn PVC.
Loading