From 7c604e29a86b7a3352715a0d5c3ca8e50350810e Mon Sep 17 00:00:00 2001 From: Wenting Wu Date: Thu, 26 Feb 2026 14:09:27 -0500 Subject: [PATCH 1/7] feat: enable CNPG in-place updates and add CI verification step - Set ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES=true in cloudnative-pg Helm values to enable instance manager in-place updates - Add CI step in setup-test-environment to verify the env var is correctly propagated to the CNPG operator deployment Signed-off-by: Wenting Wu --- .../actions/setup-test-environment/action.yml | 28 +++++++++++++++++++ operator/documentdb-helm-chart/values.yaml | 3 ++ 2 files changed, 31 insertions(+) diff --git a/.github/actions/setup-test-environment/action.yml b/.github/actions/setup-test-environment/action.yml index 956c5974..52a11f26 100644 --- a/.github/actions/setup-test-environment/action.yml +++ b/.github/actions/setup-test-environment/action.yml @@ -551,6 +551,34 @@ runs: echo "✓ DocumentDB Operator installation completed on ${{ inputs.architecture }}" + - name: Verify CNPG ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES env is set + shell: bash + run: | + echo "Verifying ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES is set on the CNPG operator deployment..." + + # Find the CNPG controller deployment in cnpg-system namespace + CNPG_DEPLOY=$(kubectl get deployments -n cnpg-system -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) + if [[ -z "$CNPG_DEPLOY" ]]; then + echo "❌ No deployment found in cnpg-system namespace" + kubectl get all -n cnpg-system + exit 1 + fi + echo "Found CNPG deployment: $CNPG_DEPLOY" + + # Extract the value of ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES from the deployment + ENV_VALUE=$(kubectl get deployment "$CNPG_DEPLOY" -n cnpg-system \ + -o jsonpath='{.spec.template.spec.containers[0].env[?(@.name=="ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES")].value}') + + if [[ "$ENV_VALUE" == "true" ]]; then + echo "✅ ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES is correctly set to 'true'" + else + echo "❌ ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES is not set or has unexpected value: '$ENV_VALUE'" + echo "Deployment env vars:" + kubectl get deployment "$CNPG_DEPLOY" -n cnpg-system \ + -o jsonpath='{.spec.template.spec.containers[0].env}' | jq . || true + exit 1 + fi + - name: Create DocumentDB credentials secret shell: bash run: | diff --git a/operator/documentdb-helm-chart/values.yaml b/operator/documentdb-helm-chart/values.yaml index 65c3628b..d08dde48 100644 --- a/operator/documentdb-helm-chart/values.yaml +++ b/operator/documentdb-helm-chart/values.yaml @@ -27,3 +27,6 @@ image: pullPolicy: Always cloudnative-pg: namespaceOverride: cnpg-system + additionalEnv: + - name: ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES + value: "true" From b27f5c85dbf07fab307e95e26b6af65b6a84cfdc Mon Sep 17 00:00:00 2001 From: Wenting Wu Date: Fri, 27 Feb 2026 09:39:46 -0500 Subject: [PATCH 2/7] refactor: move operator upgrade to first test step Signed-off-by: Wenting Wu --- .../actions/setup-test-environment/action.yml | 112 ++++------ .../workflows/test-upgrade-and-rollback.yml | 207 +++++++++++++++++- 2 files changed, 245 insertions(+), 74 deletions(-) diff --git a/.github/actions/setup-test-environment/action.yml b/.github/actions/setup-test-environment/action.yml index 52a11f26..1d9ab40f 100644 --- a/.github/actions/setup-test-environment/action.yml +++ b/.github/actions/setup-test-environment/action.yml @@ -73,6 +73,10 @@ inputs: description: 'Kubernetes version for Kind cluster (e.g., v1.35.0, v1.34.0)' required: false default: 'v1.35.0' + released-chart-version: + description: 'Install operator from public Helm repo instead of built artifacts. Use "latest" for latest version or a specific version string.' + required: false + default: '' runs: using: 'composite' @@ -377,7 +381,7 @@ runs: fi - name: Install DocumentDB Operator (local chart) - if: inputs.use-external-images == 'false' + if: inputs.use-external-images == 'false' && inputs.released-chart-version == '' shell: bash run: | echo "Installing DocumentDB Operator on ${{ inputs.architecture }} using local chart version: ${{ inputs.chart-version }}" @@ -439,36 +443,9 @@ runs: ls -la ./artifacts/ || echo "No artifacts directory found" exit 1 fi - - # Verify operator installation - echo "Verifying DocumentDB operator installation..." - kubectl wait --for=condition=Available deployment/documentdb-operator -n ${{ inputs.operator-namespace }} --timeout=300s - - # Verify that our newly built images are being used - echo "Verifying operator deployment uses our newly built images on ${{ inputs.architecture }}..." - echo "Operator image:" - kubectl get deployment documentdb-operator -n ${{ inputs.operator-namespace }} -o jsonpath='{.spec.template.spec.containers[0].image}' - echo "" - echo "Sidecar injector image (if present):" - kubectl get deployment documentdb-operator -n ${{ inputs.operator-namespace }} -o jsonpath='{.spec.template.spec.containers[1].image}' || echo "No sidecar container found" - echo "" - - # Additional verification - check that operator is actually running - echo "Checking operator pod status..." - kubectl get pods -n ${{ inputs.operator-namespace }} -l app.kubernetes.io/name=documentdb-operator - - # Verify operator logs for any immediate issues - echo "Checking operator logs for any startup issues..." - kubectl logs -n ${{ inputs.operator-namespace }} deployment/documentdb-operator --tail=20 || echo "Could not retrieve operator logs" - - # Check for CRDs installation - echo "Verifying DocumentDB CRDs are installed..." - kubectl get crd db.documentdb.io || echo "DocumentDB CRD not found" - - echo "✓ DocumentDB Operator installation completed on ${{ inputs.architecture }}" - name: Install DocumentDB Operator (external images) - if: inputs.use-external-images == 'true' + if: inputs.use-external-images == 'true' && inputs.released-chart-version == '' shell: bash run: | echo "Installing DocumentDB Operator on ${{ inputs.architecture }} using external images with tag: ${{ inputs.image-tag }}" @@ -523,13 +500,49 @@ runs: --values /tmp/values-override.yaml \ --wait --timeout=15m fi + + - name: Install DocumentDB Operator (released chart) + if: inputs.released-chart-version != '' + shell: bash + run: | + echo "Installing DocumentDB Operator from public Helm repo..." + echo "Requested chart version: ${{ inputs.released-chart-version }}" + + # Add the public DocumentDB Helm repository + helm repo add documentdb https://documentdb.github.io/documentdb-kubernetes-operator + helm repo update + + # Install the released chart + # If version is 'latest', omit --version to get the latest available + CHART_VERSION="${{ inputs.released-chart-version }}" + if [[ "$CHART_VERSION" == "latest" ]]; then + echo "Installing latest released version..." + helm install documentdb-operator documentdb/documentdb-operator \ + --namespace ${{ inputs.operator-namespace }} \ + --create-namespace \ + --wait --timeout=15m + else + echo "Installing version $CHART_VERSION..." + helm install documentdb-operator documentdb/documentdb-operator \ + --namespace ${{ inputs.operator-namespace }} \ + --create-namespace \ + --version "$CHART_VERSION" \ + --wait --timeout=15m + fi - # Verify operator installation - echo "Verifying DocumentDB operator installation..." + # Log resolved version + echo "Installed Helm releases:" + helm list -n ${{ inputs.operator-namespace }} + + - name: Verify operator installation + shell: bash + run: | + echo "Verifying DocumentDB operator installation on ${{ inputs.architecture }}..." kubectl wait --for=condition=Available deployment/documentdb-operator -n ${{ inputs.operator-namespace }} --timeout=300s - # Verify that the external images are being used with chart defaults - echo "Verifying operator deployment uses external images with chart default version..." + echo "Installed Helm releases:" + helm list -n ${{ inputs.operator-namespace }} + echo "Operator image:" kubectl get deployment documentdb-operator -n ${{ inputs.operator-namespace }} -o jsonpath='{.spec.template.spec.containers[0].image}' echo "" @@ -537,47 +550,16 @@ runs: kubectl get deployment documentdb-operator -n ${{ inputs.operator-namespace }} -o jsonpath='{.spec.template.spec.containers[1].image}' || echo "No sidecar container found" echo "" - # Additional verification - check that operator is actually running echo "Checking operator pod status..." kubectl get pods -n ${{ inputs.operator-namespace }} -l app.kubernetes.io/name=documentdb-operator - # Verify operator logs for any immediate issues echo "Checking operator logs for any startup issues..." kubectl logs -n ${{ inputs.operator-namespace }} deployment/documentdb-operator --tail=20 || echo "Could not retrieve operator logs" - # Check for CRDs installation echo "Verifying DocumentDB CRDs are installed..." kubectl get crd db.documentdb.io || echo "DocumentDB CRD not found" - echo "✓ DocumentDB Operator installation completed on ${{ inputs.architecture }}" - - - name: Verify CNPG ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES env is set - shell: bash - run: | - echo "Verifying ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES is set on the CNPG operator deployment..." - - # Find the CNPG controller deployment in cnpg-system namespace - CNPG_DEPLOY=$(kubectl get deployments -n cnpg-system -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) - if [[ -z "$CNPG_DEPLOY" ]]; then - echo "❌ No deployment found in cnpg-system namespace" - kubectl get all -n cnpg-system - exit 1 - fi - echo "Found CNPG deployment: $CNPG_DEPLOY" - - # Extract the value of ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES from the deployment - ENV_VALUE=$(kubectl get deployment "$CNPG_DEPLOY" -n cnpg-system \ - -o jsonpath='{.spec.template.spec.containers[0].env[?(@.name=="ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES")].value}') - - if [[ "$ENV_VALUE" == "true" ]]; then - echo "✅ ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES is correctly set to 'true'" - else - echo "❌ ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES is not set or has unexpected value: '$ENV_VALUE'" - echo "Deployment env vars:" - kubectl get deployment "$CNPG_DEPLOY" -n cnpg-system \ - -o jsonpath='{.spec.template.spec.containers[0].env}' | jq . || true - exit 1 - fi + echo "✓ DocumentDB Operator installation verified on ${{ inputs.architecture }}" - name: Create DocumentDB credentials secret shell: bash diff --git a/.github/workflows/test-upgrade-and-rollback.yml b/.github/workflows/test-upgrade-and-rollback.yml index 2b5f7785..5aa5fab8 100644 --- a/.github/workflows/test-upgrade-and-rollback.yml +++ b/.github/workflows/test-upgrade-and-rollback.yml @@ -71,6 +71,7 @@ jobs: DOCUMENTDB_OLD_IMAGE: ghcr.io/guanzhousongmicrosoft/documentdb-pg18:0.109.0 GATEWAY_IMAGE: ghcr.io/microsoft/documentdb/documentdb-local:16 GATEWAY_OLD_IMAGE: ghcr.io/microsoft/documentdb/documentdb-local:16 # TODO: update to actual old gateway image when available + RELEASED_CHART_VERSION: 'latest' steps: - name: Checkout repository @@ -121,6 +122,7 @@ jobs: documentdb-image: ${{ env.DOCUMENTDB_OLD_IMAGE }} gateway-image: ${{ env.GATEWAY_OLD_IMAGE }} use-external-images: ${{ github.event_name != 'pull_request' && inputs.image_tag != '' && inputs.image_tag != null }} + released-chart-version: ${{ env.RELEASED_CHART_VERSION }} github-token: ${{ secrets.GITHUB_TOKEN }} repository-owner: ${{ github.repository_owner }} @@ -162,9 +164,189 @@ jobs: fi rm -f /tmp/pf_output.log - - name: "Step 1: Upgrade Both Extension and Gateway Images" + - name: "Step 1: Operator Control Plane Upgrade (released → built)" run: | - echo "=== Step 1: Upgrade Both Extension and Gateway Images ===" + echo "=== Step 1: Operator Control Plane Upgrade ===" + echo "Upgrading operator from released chart to locally built version on ${{ matrix.architecture }}..." + + ARCH="${{ matrix.architecture }}" + + # --- Baseline from Released Operator --- + echo "" + echo "--- Baseline (Released Operator) ---" + echo "Helm release info:" + helm list -n $OPERATOR_NS + + RELEASED_OPERATOR_IMAGE=$(kubectl get deployment documentdb-operator -n $OPERATOR_NS -o jsonpath='{.spec.template.spec.containers[0].image}') + echo "Released operator image: $RELEASED_OPERATOR_IMAGE" + + # Check CNPG ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES is NOT set on released version + CNPG_DEPLOY=$(kubectl get deployments -n cnpg-system -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) + echo "CNPG deployment: $CNPG_DEPLOY" + + PRE_UPGRADE_ENV=$(kubectl get deployment "$CNPG_DEPLOY" -n cnpg-system \ + -o jsonpath='{.spec.template.spec.containers[0].env[?(@.name=="ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES")].value}' 2>/dev/null || echo "") + echo "ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES before upgrade: '${PRE_UPGRADE_ENV:-}'" + + if [[ -n "$PRE_UPGRADE_ENV" ]]; then + echo "⚠️ ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES is already set on released version (value: $PRE_UPGRADE_ENV)" + else + echo "✓ ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES is not set on released version (expected)" + fi + + # Record DB pod state before operator upgrade + echo "" + echo "DB pods before operator upgrade:" + kubectl get pods -n $DB_NS -l cnpg.io/cluster=$DB_NAME -o wide + PRE_UPGRADE_UIDS=$(kubectl get pods -n $DB_NS -l cnpg.io/cluster=$DB_NAME -o jsonpath='{.items[*].metadata.uid}') + echo "Pod UIDs: $PRE_UPGRADE_UIDS" + + # --- Prepare Built Chart --- + echo "" + echo "--- Preparing Built Chart ---" + CHART_ARTIFACT_DIR="./artifacts/build-helm-chart-${ARCH}" + EXPECTED_CHART_FILE="$CHART_ARTIFACT_DIR/documentdb-chart-${{ env.CHART_VERSION }}-${ARCH}.tgz" + + if [ ! -f "$EXPECTED_CHART_FILE" ]; then + echo "❌ Built Helm chart not found: $EXPECTED_CHART_FILE" + ls -la "$CHART_ARTIFACT_DIR/" || echo "Chart artifact directory not found" + exit 1 + fi + + echo "Extracting built chart: $EXPECTED_CHART_FILE" + rm -rf ./documentdb-chart + tar -xzf "$EXPECTED_CHART_FILE" + + echo "Built chart version:" + cat ./documentdb-chart/Chart.yaml | grep -E "^(version|appVersion):" + + # --- Perform Helm Upgrade --- + echo "" + echo "--- Performing Helm Upgrade ---" + LOCAL_IMAGE_TAG="${{ env.IMAGE_TAG }}-${ARCH}" + echo "Upgrading with image tag: $LOCAL_IMAGE_TAG" + + helm upgrade documentdb-operator ./documentdb-chart \ + --namespace $OPERATOR_NS \ + --set documentDbVersion="$LOCAL_IMAGE_TAG" \ + --set image.documentdbk8soperator.tag="$LOCAL_IMAGE_TAG" \ + --set image.sidecarinjector.tag="$LOCAL_IMAGE_TAG" \ + --wait --timeout=15m + + echo "Helm upgrade completed. Release info:" + helm list -n $OPERATOR_NS + + # --- Verify Upgraded Operator --- + echo "" + echo "--- Verifying Upgraded Operator ---" + kubectl wait --for=condition=Available deployment/documentdb-operator -n $OPERATOR_NS --timeout=300s + + UPGRADED_OPERATOR_IMAGE=$(kubectl get deployment documentdb-operator -n $OPERATOR_NS -o jsonpath='{.spec.template.spec.containers[0].image}') + echo "Upgraded operator image: $UPGRADED_OPERATOR_IMAGE" + + if [[ "$UPGRADED_OPERATOR_IMAGE" == "$RELEASED_OPERATOR_IMAGE" ]]; then + echo "❌ Operator image did not change after upgrade" + exit 1 + fi + echo "✓ Operator image changed: $RELEASED_OPERATOR_IMAGE → $UPGRADED_OPERATOR_IMAGE" + + # --- Verify ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES --- + echo "" + echo "--- Verifying ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES ---" + + # Wait for CNPG deployment to be updated + echo "Waiting for CNPG deployment to roll out..." + kubectl rollout status deployment "$CNPG_DEPLOY" -n cnpg-system --timeout=300s + + POST_UPGRADE_ENV=$(kubectl get deployment "$CNPG_DEPLOY" -n cnpg-system \ + -o jsonpath='{.spec.template.spec.containers[0].env[?(@.name=="ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES")].value}' 2>/dev/null || echo "") + echo "ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES after upgrade: '${POST_UPGRADE_ENV:-}'" + + if [[ "$POST_UPGRADE_ENV" == "true" ]]; then + echo "✅ ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES correctly set to 'true' after upgrade" + else + echo "❌ ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES not set to 'true' after upgrade (value: '${POST_UPGRADE_ENV:-}')" + echo "CNPG deployment env vars:" + kubectl get deployment "$CNPG_DEPLOY" -n cnpg-system \ + -o jsonpath='{.spec.template.spec.containers[0].env}' | jq . || true + exit 1 + fi + + # --- Verify DB Pod Stability --- + echo "" + echo "--- Verifying DB Pod Stability ---" + kubectl get pods -n $DB_NS -l cnpg.io/cluster=$DB_NAME -o wide + POST_UPGRADE_UIDS=$(kubectl get pods -n $DB_NS -l cnpg.io/cluster=$DB_NAME -o jsonpath='{.items[*].metadata.uid}') + echo "Pod UIDs after upgrade: $POST_UPGRADE_UIDS" + + if [[ "$PRE_UPGRADE_UIDS" == "$POST_UPGRADE_UIDS" ]]; then + echo "✓ DB pod UIDs unchanged — operator upgrade did not restart DB pods" + else + echo "⚠️ DB pod UIDs changed — pods may have been restarted during operator upgrade" + echo " Before: $PRE_UPGRADE_UIDS" + echo " After: $POST_UPGRADE_UIDS" + fi + + # --- Verify Cluster Health --- + echo "" + echo "--- Verifying Cluster Health ---" + timeout 300 bash -c ' + while true; do + DB_STATUS=$(kubectl get documentdb '$DB_NAME' -n '$DB_NS' -o jsonpath="{.status.status}" 2>/dev/null) + CLUSTER_STATUS=$(kubectl get cluster '$DB_NAME' -n '$DB_NS' -o jsonpath="{.status.phase}" 2>/dev/null) + echo "DocumentDB status: $DB_STATUS, CNPG phase: $CLUSTER_STATUS" + if [[ "$DB_STATUS" == "Cluster in healthy state" && "$CLUSTER_STATUS" == "Cluster in healthy state" ]]; then + echo "✓ Cluster is healthy after operator upgrade" + break + fi + sleep 10 + done + ' + + echo "" + echo "✅ Step 1 passed: Operator control plane upgraded successfully" + echo " Operator: $RELEASED_OPERATOR_IMAGE → $UPGRADED_OPERATOR_IMAGE" + echo " ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES: '${PRE_UPGRADE_ENV:-}' → '$POST_UPGRADE_ENV'" + + - name: Setup port forwarding for operator upgrade verification + uses: ./.github/actions/setup-port-forwarding + with: + namespace: ${{ env.DB_NS }} + cluster-name: ${{ env.DB_NAME }} + port: ${{ env.DB_PORT }} + architecture: ${{ matrix.architecture }} + test-type: 'comprehensive' + + - name: Verify data persistence after operator upgrade + run: | + echo "=== Data Persistence: Verifying after operator upgrade ===" + mongosh 127.0.0.1:$DB_PORT \ + -u $DB_USERNAME \ + -p $DB_PASSWORD \ + --authenticationMechanism SCRAM-SHA-256 \ + --tls \ + --tlsAllowInvalidCertificates \ + --eval ' + db = db.getSiblingDB("upgrade_test_db"); + var count = db.test_collection.countDocuments(); + assert(count === 2, "Expected 2 documents but found " + count + " after operator upgrade"); + print("✓ All " + count + " documents persisted through operator upgrade"); + ' + echo "✓ Data persistence verified after operator upgrade" + + - name: Cleanup port forwarding after operator upgrade verification + if: always() + run: | + if [ -f /tmp/pf_pid ]; then + PF_PID=$(cat /tmp/pf_pid) + kill $PF_PID 2>/dev/null || true + rm -f /tmp/pf_pid + fi + rm -f /tmp/pf_output.log + + - name: "Step 2: Upgrade Both Extension and Gateway Images" + run: | + echo "=== Step 2: Upgrade Both Extension and Gateway Images ===" echo "Testing simultaneous extension + gateway upgrade on ${{ matrix.architecture }}..." OLD_EXTENSION="${{ env.DOCUMENTDB_OLD_IMAGE }}" @@ -288,7 +470,7 @@ jobs: fi echo "" - echo "✅ Step 1 passed: Both images upgraded successfully" + echo "✅ Step 2 passed: Both images upgraded successfully" echo " Extension: $OLD_EXTENSION → $NEW_EXTENSION" echo " Gateway: $OLD_GATEWAY → $NEW_GATEWAY" @@ -328,9 +510,9 @@ jobs: fi rm -f /tmp/pf_output.log - - name: "Step 2: Rollback Extension Image (gateway stays at new version)" + - name: "Step 3: Rollback Extension Image (gateway stays at new version)" run: | - echo "=== Step 2: Rollback Extension Image ===" + echo "=== Step 3: Rollback Extension Image ===" echo "Rolling back extension image while keeping gateway at new version..." OLD_EXTENSION="${{ env.DOCUMENTDB_OLD_IMAGE }}" @@ -444,7 +626,7 @@ jobs: fi echo "" - echo "✅ Step 2 passed: Extension rolled back, gateway unchanged" + echo "✅ Step 3 passed: Extension rolled back, gateway unchanged" echo " Extension: $NEW_EXTENSION → $OLD_EXTENSION (rolled back)" echo " Gateway: $NEW_GATEWAY (unchanged)" @@ -484,9 +666,9 @@ jobs: fi rm -f /tmp/pf_output.log - - name: "Step 3: Rollback Gateway Image (extension stays at old version)" + - name: "Step 4: Rollback Gateway Image (extension stays at old version)" run: | - echo "=== Step 3: Rollback Gateway Image ===" + echo "=== Step 4: Rollback Gateway Image ===" echo "Rolling back gateway image while keeping extension at old version..." OLD_EXTENSION="${{ env.DOCUMENTDB_OLD_IMAGE }}" @@ -576,7 +758,7 @@ jobs: fi echo "" - echo "✅ Step 3 passed: Gateway rolled back, extension unchanged" + echo "✅ Step 4 passed: Gateway rolled back, extension unchanged" echo " Extension: $OLD_EXTENSION (unchanged)" echo " Gateway: $NEW_GATEWAY → $OLD_GATEWAY (rolled back)" @@ -637,6 +819,13 @@ jobs: echo "- **New Gateway Image**: ${{ env.GATEWAY_IMAGE }}" >> $GITHUB_STEP_SUMMARY echo "- **Image Tag**: ${{ env.IMAGE_TAG }}" >> $GITHUB_STEP_SUMMARY echo "- **Chart Version**: ${{ env.CHART_VERSION }}" >> $GITHUB_STEP_SUMMARY + echo "- **Released Chart Version**: ${{ env.RELEASED_CHART_VERSION }}" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "### Test Steps:" >> $GITHUB_STEP_SUMMARY + echo "- Step 1: Operator control plane upgrade (released → built)" >> $GITHUB_STEP_SUMMARY + echo "- Step 2: Upgrade both extension and gateway images" >> $GITHUB_STEP_SUMMARY + echo "- Step 3: Rollback extension image" >> $GITHUB_STEP_SUMMARY + echo "- Step 4: Rollback gateway image" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY if [[ "${{ job.status }}" == "success" ]]; then From 8d7715e66f2c2205de279373803c79820f1fa032 Mon Sep 17 00:00:00 2001 From: Wenting Wu Date: Fri, 27 Feb 2026 10:27:22 -0500 Subject: [PATCH 3/7] feat: dynamically determine initial DocumentDB image for upgrade test - Add DOCUMENTDB_COMBINED_IMAGE env var for released operator (no ImageVolume) - Add 'Determine initial DocumentDB image' step that checks released chart CRD for postgresImage field to decide between combined vs extension image - Update setup-test-environment to use dynamically resolved DOCUMENTDB_INITIAL_IMAGE - Fix Step 2 baseline checks to use INITIAL_IMAGE and flexible schema version check - Fix sidecar-injector verification to query cnpg-system namespace - Fix CRD name from db.documentdb.io to dbs.documentdb.io - Add design docs for dual-mode deployment, rollback support, and unified upgrade Signed-off-by: Wenting Wu --- .../actions/setup-test-environment/action.yml | 4 +- .../workflows/test-upgrade-and-rollback.yml | 76 ++++++++++++++----- 2 files changed, 60 insertions(+), 20 deletions(-) diff --git a/.github/actions/setup-test-environment/action.yml b/.github/actions/setup-test-environment/action.yml index 1d9ab40f..a838b348 100644 --- a/.github/actions/setup-test-environment/action.yml +++ b/.github/actions/setup-test-environment/action.yml @@ -547,7 +547,7 @@ runs: kubectl get deployment documentdb-operator -n ${{ inputs.operator-namespace }} -o jsonpath='{.spec.template.spec.containers[0].image}' echo "" echo "Sidecar injector image (if present):" - kubectl get deployment documentdb-operator -n ${{ inputs.operator-namespace }} -o jsonpath='{.spec.template.spec.containers[1].image}' || echo "No sidecar container found" + kubectl get deployment sidecar-injector -n cnpg-system -o jsonpath='{.spec.template.spec.containers[0].image}' 2>/dev/null || echo "No sidecar injector deployment found" echo "" echo "Checking operator pod status..." @@ -557,7 +557,7 @@ runs: kubectl logs -n ${{ inputs.operator-namespace }} deployment/documentdb-operator --tail=20 || echo "Could not retrieve operator logs" echo "Verifying DocumentDB CRDs are installed..." - kubectl get crd db.documentdb.io || echo "DocumentDB CRD not found" + kubectl get crd dbs.documentdb.io || echo "DocumentDB CRD not found" echo "✓ DocumentDB Operator installation verified on ${{ inputs.architecture }}" diff --git a/.github/workflows/test-upgrade-and-rollback.yml b/.github/workflows/test-upgrade-and-rollback.yml index 5aa5fab8..8c61fb30 100644 --- a/.github/workflows/test-upgrade-and-rollback.yml +++ b/.github/workflows/test-upgrade-and-rollback.yml @@ -69,6 +69,7 @@ jobs: CHART_VERSION: ${{ needs.build.outputs.chart_version || '0.1.0' }} DOCUMENTDB_IMAGE: ghcr.io/guanzhousongmicrosoft/documentdb-pg18:0.110.0 DOCUMENTDB_OLD_IMAGE: ghcr.io/guanzhousongmicrosoft/documentdb-pg18:0.109.0 + DOCUMENTDB_COMBINED_IMAGE: ghcr.io/microsoft/documentdb/documentdb-local:16 GATEWAY_IMAGE: ghcr.io/microsoft/documentdb/documentdb-local:16 GATEWAY_OLD_IMAGE: ghcr.io/microsoft/documentdb/documentdb-local:16 # TODO: update to actual old gateway image when available RELEASED_CHART_VERSION: 'latest' @@ -98,9 +99,53 @@ jobs: echo "- **Architecture**: \`${{ matrix.architecture }}\`" >> $GITHUB_STEP_SUMMARY echo "- **Old Extension Image**: \`${{ env.DOCUMENTDB_OLD_IMAGE }}\`" >> $GITHUB_STEP_SUMMARY echo "- **New Extension Image**: \`${{ env.DOCUMENTDB_IMAGE }}\`" >> $GITHUB_STEP_SUMMARY + echo "- **Combined Image**: \`${{ env.DOCUMENTDB_COMBINED_IMAGE }}\`" >> $GITHUB_STEP_SUMMARY echo "- **Old Gateway Image**: \`${{ env.GATEWAY_OLD_IMAGE }}\`" >> $GITHUB_STEP_SUMMARY echo "- **New Gateway Image**: \`${{ env.GATEWAY_IMAGE }}\`" >> $GITHUB_STEP_SUMMARY + - name: Determine initial DocumentDB image + run: | + echo "=== Determining DocumentDB image for initial deployment ===" + + # Add the public DocumentDB Helm repository + helm repo add documentdb https://documentdb.github.io/documentdb-kubernetes-operator 2>/dev/null || true + helm repo update + + # Resolve the released chart version + CHART_VERSION="${{ env.RELEASED_CHART_VERSION }}" + if [[ "$CHART_VERSION" == "latest" ]]; then + RESOLVED_VERSION=$(helm search repo documentdb/documentdb-operator -o json | jq -r '.[0].version') + else + RESOLVED_VERSION="$CHART_VERSION" + fi + echo "Resolved released chart version: $RESOLVED_VERSION" + + # Check if the CRD in the released chart has postgresImage field + # This is the definitive check — postgresImage means the operator supports ImageVolume mode + CHART_VERSION_FLAG="" + if [[ "$CHART_VERSION" != "latest" ]]; then + CHART_VERSION_FLAG="--version $CHART_VERSION" + fi + CRD_OUTPUT=$(helm show crds documentdb/documentdb-operator $CHART_VERSION_FLAG 2>/dev/null || echo "") + if echo "$CRD_OUTPUT" | grep -q "postgresImage"; then + echo "CRD contains postgresImage field → operator supports ImageVolume mode" + USE_COMBINED=false + else + echo "CRD does not contain postgresImage field → combined image required" + USE_COMBINED=true + fi + + # Set the initial image based on determination + COMBINED_IMAGE="${{ env.DOCUMENTDB_COMBINED_IMAGE }}" + EXTENSION_IMAGE="${{ env.DOCUMENTDB_OLD_IMAGE }}" + if [[ "$USE_COMBINED" == "true" ]]; then + echo "DOCUMENTDB_INITIAL_IMAGE=$COMBINED_IMAGE" >> $GITHUB_ENV + echo "✓ Using combined image for initial deployment: $COMBINED_IMAGE" + else + echo "DOCUMENTDB_INITIAL_IMAGE=$EXTENSION_IMAGE" >> $GITHUB_ENV + echo "✓ Using extension image for initial deployment: $EXTENSION_IMAGE" + fi + - name: Setup test environment uses: ./.github/actions/setup-test-environment with: @@ -119,7 +164,7 @@ jobs: db-port: ${{ env.DB_PORT }} image-tag: ${{ env.IMAGE_TAG }} chart-version: ${{ env.CHART_VERSION }} - documentdb-image: ${{ env.DOCUMENTDB_OLD_IMAGE }} + documentdb-image: ${{ env.DOCUMENTDB_INITIAL_IMAGE }} gateway-image: ${{ env.GATEWAY_OLD_IMAGE }} use-external-images: ${{ github.event_name != 'pull_request' && inputs.image_tag != '' && inputs.image_tag != null }} released-chart-version: ${{ env.RELEASED_CHART_VERSION }} @@ -349,16 +394,16 @@ jobs: echo "=== Step 2: Upgrade Both Extension and Gateway Images ===" echo "Testing simultaneous extension + gateway upgrade on ${{ matrix.architecture }}..." - OLD_EXTENSION="${{ env.DOCUMENTDB_OLD_IMAGE }}" + INITIAL_IMAGE="${{ env.DOCUMENTDB_INITIAL_IMAGE }}" NEW_EXTENSION="${{ env.DOCUMENTDB_IMAGE }}" OLD_GATEWAY="${{ env.GATEWAY_OLD_IMAGE }}" NEW_GATEWAY="${{ env.GATEWAY_IMAGE }}" - # Verify baseline: cluster deployed with old images + # Verify baseline: cluster deployed with initial images CURRENT_EXTENSION=$(kubectl get documentdb $DB_NAME -n $DB_NS -o jsonpath='{.spec.documentDBImage}') echo "Current extension image: $CURRENT_EXTENSION" - if [[ "$CURRENT_EXTENSION" != "$OLD_EXTENSION" ]]; then - echo "❌ Expected old extension image $OLD_EXTENSION but found $CURRENT_EXTENSION" + if [[ "$CURRENT_EXTENSION" != "$INITIAL_IMAGE" ]]; then + echo "❌ Expected initial image $INITIAL_IMAGE but found $CURRENT_EXTENSION" exit 1 fi @@ -368,29 +413,23 @@ jobs: echo "❌ Expected old gateway image $OLD_GATEWAY but found $CURRENT_GATEWAY" exit 1 fi - echo "✓ Cluster deployed with old images" + echo "✓ Cluster deployed with initial images" - # Record and verify version before upgrade + # Record version before upgrade + # Note: When using combined image (tag like :16), schemaVersion won't match the image tag. + # Just verify it's set and record it. VERSION_BEFORE=$(kubectl get documentdb $DB_NAME -n $DB_NS -o jsonpath='{.status.schemaVersion}') echo "DocumentDB schema version before upgrade: $VERSION_BEFORE" - - # The image tag (e.g., "0.110.0") now matches status.schemaVersion directly - OLD_VERSION_TAG="${OLD_EXTENSION##*:}" - EXPECTED_VERSION_BEFORE="$OLD_VERSION_TAG" if [[ -z "$VERSION_BEFORE" ]]; then echo "❌ status.schemaVersion is empty before upgrade" exit 1 fi - if [[ "$VERSION_BEFORE" != "$EXPECTED_VERSION_BEFORE" ]]; then - echo "❌ Expected version $EXPECTED_VERSION_BEFORE before upgrade but found $VERSION_BEFORE" - exit 1 - fi - echo "✓ DocumentDB version matches expected $EXPECTED_VERSION_BEFORE before upgrade" + echo "✓ DocumentDB schema version set before upgrade: $VERSION_BEFORE" # Patch both images simultaneously echo "" echo "Upgrading both images..." - echo " Extension: $OLD_EXTENSION → $NEW_EXTENSION" + echo " Extension: $INITIAL_IMAGE → $NEW_EXTENSION" echo " Gateway: $OLD_GATEWAY → $NEW_GATEWAY" kubectl patch documentdb $DB_NAME -n $DB_NS --type='merge' \ -p "{\"spec\":{\"documentDBImage\":\"$NEW_EXTENSION\",\"gatewayImage\":\"$NEW_GATEWAY\"}}" @@ -471,7 +510,7 @@ jobs: echo "" echo "✅ Step 2 passed: Both images upgraded successfully" - echo " Extension: $OLD_EXTENSION → $NEW_EXTENSION" + echo " Extension: $INITIAL_IMAGE → $NEW_EXTENSION" echo " Gateway: $OLD_GATEWAY → $NEW_GATEWAY" - name: Setup port forwarding for upgrade verification @@ -813,6 +852,7 @@ jobs: echo "## Upgrade & Rollback Test Summary for ${{ matrix.architecture }}" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY echo "- **Architecture**: ${{ matrix.architecture }}" >> $GITHUB_STEP_SUMMARY + echo "- **Initial Image**: ${{ env.DOCUMENTDB_INITIAL_IMAGE }}" >> $GITHUB_STEP_SUMMARY echo "- **Old Extension Image**: ${{ env.DOCUMENTDB_OLD_IMAGE }}" >> $GITHUB_STEP_SUMMARY echo "- **New Extension Image**: ${{ env.DOCUMENTDB_IMAGE }}" >> $GITHUB_STEP_SUMMARY echo "- **Old Gateway Image**: ${{ env.GATEWAY_OLD_IMAGE }}" >> $GITHUB_STEP_SUMMARY From 2df8d336b64e622900cd5934666e436b1305f790 Mon Sep 17 00:00:00 2001 From: Wenting Wu Date: Fri, 27 Feb 2026 11:32:15 -0500 Subject: [PATCH 4/7] refactor: use release version comparison to determine combined image Replace CRD postgresImage field check with simple semver comparison. Versions <= 0.1.3 use combined image, versions > 0.1.3 use extension image. Signed-off-by: Wenting Wu --- .../workflows/test-upgrade-and-rollback.yml | 21 ++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/.github/workflows/test-upgrade-and-rollback.yml b/.github/workflows/test-upgrade-and-rollback.yml index 8c61fb30..722ce9e7 100644 --- a/.github/workflows/test-upgrade-and-rollback.yml +++ b/.github/workflows/test-upgrade-and-rollback.yml @@ -120,19 +120,16 @@ jobs: fi echo "Resolved released chart version: $RESOLVED_VERSION" - # Check if the CRD in the released chart has postgresImage field - # This is the definitive check — postgresImage means the operator supports ImageVolume mode - CHART_VERSION_FLAG="" - if [[ "$CHART_VERSION" != "latest" ]]; then - CHART_VERSION_FLAG="--version $CHART_VERSION" - fi - CRD_OUTPUT=$(helm show crds documentdb/documentdb-operator $CHART_VERSION_FLAG 2>/dev/null || echo "") - if echo "$CRD_OUTPUT" | grep -q "postgresImage"; then - echo "CRD contains postgresImage field → operator supports ImageVolume mode" - USE_COMBINED=false - else - echo "CRD does not contain postgresImage field → combined image required" + # Determine image mode based on release version + # Versions <= 0.1.3 use combined image (no ImageVolume support) + # Versions > 0.1.3 use extension image (ImageVolume mode) + THRESHOLD="0.1.3" + if [[ "$(printf '%s\n' "$THRESHOLD" "$RESOLVED_VERSION" | sort -V | head -n1)" == "$RESOLVED_VERSION" ]]; then + echo "Released version $RESOLVED_VERSION <= $THRESHOLD → combined image required" USE_COMBINED=true + else + echo "Released version $RESOLVED_VERSION > $THRESHOLD → extension image supported" + USE_COMBINED=false fi # Set the initial image based on determination From 0bf77fc76d54bfe2e694d2b4b5e2f10cd65b442d Mon Sep 17 00:00:00 2001 From: Wenting Wu Date: Fri, 27 Feb 2026 11:57:49 -0500 Subject: [PATCH 5/7] fix: allow empty schemaVersion before upgrade When using a combined image with older released operator versions, status.schemaVersion may not be populated. Remove the hard failure and just log the value. Signed-off-by: Wenting Wu --- .github/workflows/test-upgrade-and-rollback.yml | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/.github/workflows/test-upgrade-and-rollback.yml b/.github/workflows/test-upgrade-and-rollback.yml index 722ce9e7..6c3b708c 100644 --- a/.github/workflows/test-upgrade-and-rollback.yml +++ b/.github/workflows/test-upgrade-and-rollback.yml @@ -413,15 +413,9 @@ jobs: echo "✓ Cluster deployed with initial images" # Record version before upgrade - # Note: When using combined image (tag like :16), schemaVersion won't match the image tag. - # Just verify it's set and record it. + # Note: schemaVersion may be empty when using combined image with older operator versions. VERSION_BEFORE=$(kubectl get documentdb $DB_NAME -n $DB_NS -o jsonpath='{.status.schemaVersion}') - echo "DocumentDB schema version before upgrade: $VERSION_BEFORE" - if [[ -z "$VERSION_BEFORE" ]]; then - echo "❌ status.schemaVersion is empty before upgrade" - exit 1 - fi - echo "✓ DocumentDB schema version set before upgrade: $VERSION_BEFORE" + echo "DocumentDB schema version before upgrade: ${VERSION_BEFORE:-}" # Patch both images simultaneously echo "" From 89d12caa226a564267f0590b46974a371863dbbe Mon Sep 17 00:00:00 2001 From: Wenting Wu Date: Fri, 27 Feb 2026 12:51:55 -0500 Subject: [PATCH 6/7] feat: recreate cluster for ImageVolume mode when released operator used combined mode When the released operator version <= 0.1.3 deploys in combined mode, the cluster must be recreated under the upgraded operator to switch to ImageVolume mode before testing extension/gateway upgrades. - Persist USE_COMBINED to GITHUB_ENV for later step branching - Add 4 conditional steps (gated by USE_COMBINED == true): 1. Recreate cluster: delete combined-mode CR, create fresh with extension image 2. Setup port forwarding for re-seeding 3. Re-seed test data (2 documents) 4. Cleanup port forwarding - Update DOCUMENTDB_INITIAL_IMAGE after recreation so Step 2 baseline passes - All new steps marked with TODO: Remove once released version > 0.1.3 Signed-off-by: Wenting Wu --- .../workflows/test-upgrade-and-rollback.yml | 138 ++++++++++++++++++ 1 file changed, 138 insertions(+) diff --git a/.github/workflows/test-upgrade-and-rollback.yml b/.github/workflows/test-upgrade-and-rollback.yml index 6c3b708c..d564f35e 100644 --- a/.github/workflows/test-upgrade-and-rollback.yml +++ b/.github/workflows/test-upgrade-and-rollback.yml @@ -132,6 +132,10 @@ jobs: USE_COMBINED=false fi + # Persist USE_COMBINED for later steps + # TODO: Remove once released version > 0.1.3 + echo "USE_COMBINED=$USE_COMBINED" >> $GITHUB_ENV + # Set the initial image based on determination COMBINED_IMAGE="${{ env.DOCUMENTDB_COMBINED_IMAGE }}" EXTENSION_IMAGE="${{ env.DOCUMENTDB_OLD_IMAGE }}" @@ -386,6 +390,140 @@ jobs: fi rm -f /tmp/pf_output.log + # ============================================================ + # TODO: Remove the following 4 steps once released version > 0.1.3 + # When the released operator uses combined mode, the cluster must be + # recreated under the upgraded operator to switch to ImageVolume mode. + # ============================================================ + + - name: "Recreate cluster for ImageVolume mode (combined → extension)" + if: env.USE_COMBINED == 'true' + run: | + echo "=== Recreating cluster: combined mode → ImageVolume mode ===" + echo "The released operator deployed in combined mode. After operator upgrade," + echo "we must recreate the cluster so the new operator deploys it in ImageVolume mode." + + # Delete the combined-mode cluster + echo "" + echo "Deleting combined-mode cluster..." + kubectl delete documentdb $DB_NAME -n $DB_NS --wait=false + + echo "Waiting for DocumentDB to be deleted..." + timeout 300 bash -c ' + while true; do + db_exists=$(kubectl -n '$DB_NS' get documentdb '$DB_NAME' --ignore-not-found -o name) + if [[ -z "$db_exists" ]]; then + echo "✓ DocumentDB deleted successfully." + break + fi + echo "DocumentDB still exists. Waiting..." + sleep 10 + done + ' + + echo "Waiting for cluster pods to be cleaned up..." + timeout 120 bash -c ' + while true; do + pod_count=$(kubectl get pods -n '$DB_NS' -l cnpg.io/cluster='$DB_NAME' --no-headers 2>/dev/null | wc -l) + if [[ "$pod_count" -eq 0 ]]; then + echo "✓ All cluster pods cleaned up." + break + fi + echo "Still $pod_count pods remaining. Waiting..." + sleep 5 + done + ' + + # Create a fresh cluster with extension image under the upgraded operator + OLD_EXTENSION="${{ env.DOCUMENTDB_OLD_IMAGE }}" + OLD_GATEWAY="${{ env.GATEWAY_OLD_IMAGE }}" + echo "" + echo "Creating new cluster with ImageVolume mode..." + echo " Extension image: $OLD_EXTENSION" + echo " Gateway image: $OLD_GATEWAY" + cat </dev/null) + CLUSTER_STATUS=$(kubectl get cluster '$DB_NAME' -n '$DB_NS' -o jsonpath="{.status.phase}" 2>/dev/null) + echo "DocumentDB status: $DB_STATUS, CNPG phase: $CLUSTER_STATUS" + if [[ "$DB_STATUS" == "Cluster in healthy state" && "$CLUSTER_STATUS" == "Cluster in healthy state" ]]; then + echo "✓ Recreated cluster is healthy" + break + fi + sleep 10 + done + ' + + # Update DOCUMENTDB_INITIAL_IMAGE so Step 2 baseline check uses the correct image + echo "DOCUMENTDB_INITIAL_IMAGE=$OLD_EXTENSION" >> $GITHUB_ENV + echo "" + echo "✅ Cluster recreated in ImageVolume mode" + echo " DOCUMENTDB_INITIAL_IMAGE updated to: $OLD_EXTENSION" + + - name: Setup port forwarding for re-seeding after recreation + if: env.USE_COMBINED == 'true' + uses: ./.github/actions/setup-port-forwarding + with: + namespace: ${{ env.DB_NS }} + cluster-name: ${{ env.DB_NAME }} + port: ${{ env.DB_PORT }} + architecture: ${{ matrix.architecture }} + test-type: 'comprehensive' + + - name: Re-seed test data after cluster recreation + if: env.USE_COMBINED == 'true' + run: | + echo "=== Re-seeding test data after cluster recreation ===" + mongosh 127.0.0.1:$DB_PORT \ + -u $DB_USERNAME \ + -p $DB_PASSWORD \ + --authenticationMechanism SCRAM-SHA-256 \ + --tls \ + --tlsAllowInvalidCertificates \ + --eval ' + db = db.getSiblingDB("upgrade_test_db"); + db.test_collection.insertOne({ _id: "upgrade_marker", step: "pre-upgrade", timestamp: new Date().toISOString() }); + db.test_collection.insertOne({ _id: "persistence_check", data: "this_must_survive_rollback", count: 42 }); + var count = db.test_collection.countDocuments(); + print("✓ Seed data written: " + count + " documents"); + assert(count === 2, "Expected 2 documents but found " + count); + ' + echo "✓ Seed data re-written after cluster recreation" + + - name: Cleanup port forwarding after re-seeding + if: ${{ always() && env.USE_COMBINED == 'true' }} + run: | + if [ -f /tmp/pf_pid ]; then + PF_PID=$(cat /tmp/pf_pid) + kill $PF_PID 2>/dev/null || true + rm -f /tmp/pf_pid + fi + rm -f /tmp/pf_output.log + + # ============================================================ + # END TODO: Remove the above 4 steps once released version > 0.1.3 + # ============================================================ + - name: "Step 2: Upgrade Both Extension and Gateway Images" run: | echo "=== Step 2: Upgrade Both Extension and Gateway Images ===" From 1c65ed9dcbf5cf0cd644278d2b2cda36094c25a8 Mon Sep 17 00:00:00 2001 From: Wenting Wu Date: Fri, 27 Feb 2026 13:34:51 -0500 Subject: [PATCH 7/7] refactor: use OLD_EXTENSION for Step 2 baseline and add strict schemaVersion validation Signed-off-by: Wenting Wu --- .../workflows/test-upgrade-and-rollback.yml | 78 +++++++------------ 1 file changed, 29 insertions(+), 49 deletions(-) diff --git a/.github/workflows/test-upgrade-and-rollback.yml b/.github/workflows/test-upgrade-and-rollback.yml index d564f35e..76827a46 100644 --- a/.github/workflows/test-upgrade-and-rollback.yml +++ b/.github/workflows/test-upgrade-and-rollback.yml @@ -103,6 +103,7 @@ jobs: echo "- **Old Gateway Image**: \`${{ env.GATEWAY_OLD_IMAGE }}\`" >> $GITHUB_STEP_SUMMARY echo "- **New Gateway Image**: \`${{ env.GATEWAY_IMAGE }}\`" >> $GITHUB_STEP_SUMMARY + # TODO: Remove this step once release versions > 0.1.3 - name: Determine initial DocumentDB image run: | echo "=== Determining DocumentDB image for initial deployment ===" @@ -114,7 +115,11 @@ jobs: # Resolve the released chart version CHART_VERSION="${{ env.RELEASED_CHART_VERSION }}" if [[ "$CHART_VERSION" == "latest" ]]; then - RESOLVED_VERSION=$(helm search repo documentdb/documentdb-operator -o json | jq -r '.[0].version') + RESOLVED_VERSION=$(helm search repo documentdb/documentdb-operator -o json | jq -r '.[0].version' 2>/dev/null || echo "") + if [[ -z "$RESOLVED_VERSION" || "$RESOLVED_VERSION" == "null" ]]; then + echo "⚠️ Failed to resolve chart version from Helm repo, defaulting to threshold" + RESOLVED_VERSION="0.1.3" + fi else RESOLVED_VERSION="$CHART_VERSION" fi @@ -133,7 +138,7 @@ jobs: fi # Persist USE_COMBINED for later steps - # TODO: Remove once released version > 0.1.3 + # TODO: Remove once we deprecate combined mode echo "USE_COMBINED=$USE_COMBINED" >> $GITHUB_ENV # Set the initial image based on determination @@ -226,20 +231,6 @@ jobs: RELEASED_OPERATOR_IMAGE=$(kubectl get deployment documentdb-operator -n $OPERATOR_NS -o jsonpath='{.spec.template.spec.containers[0].image}') echo "Released operator image: $RELEASED_OPERATOR_IMAGE" - # Check CNPG ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES is NOT set on released version - CNPG_DEPLOY=$(kubectl get deployments -n cnpg-system -o jsonpath='{.items[0].metadata.name}' 2>/dev/null) - echo "CNPG deployment: $CNPG_DEPLOY" - - PRE_UPGRADE_ENV=$(kubectl get deployment "$CNPG_DEPLOY" -n cnpg-system \ - -o jsonpath='{.spec.template.spec.containers[0].env[?(@.name=="ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES")].value}' 2>/dev/null || echo "") - echo "ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES before upgrade: '${PRE_UPGRADE_ENV:-}'" - - if [[ -n "$PRE_UPGRADE_ENV" ]]; then - echo "⚠️ ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES is already set on released version (value: $PRE_UPGRADE_ENV)" - else - echo "✓ ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES is not set on released version (expected)" - fi - # Record DB pod state before operator upgrade echo "" echo "DB pods before operator upgrade:" @@ -296,28 +287,6 @@ jobs: fi echo "✓ Operator image changed: $RELEASED_OPERATOR_IMAGE → $UPGRADED_OPERATOR_IMAGE" - # --- Verify ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES --- - echo "" - echo "--- Verifying ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES ---" - - # Wait for CNPG deployment to be updated - echo "Waiting for CNPG deployment to roll out..." - kubectl rollout status deployment "$CNPG_DEPLOY" -n cnpg-system --timeout=300s - - POST_UPGRADE_ENV=$(kubectl get deployment "$CNPG_DEPLOY" -n cnpg-system \ - -o jsonpath='{.spec.template.spec.containers[0].env[?(@.name=="ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES")].value}' 2>/dev/null || echo "") - echo "ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES after upgrade: '${POST_UPGRADE_ENV:-}'" - - if [[ "$POST_UPGRADE_ENV" == "true" ]]; then - echo "✅ ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES correctly set to 'true' after upgrade" - else - echo "❌ ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES not set to 'true' after upgrade (value: '${POST_UPGRADE_ENV:-}')" - echo "CNPG deployment env vars:" - kubectl get deployment "$CNPG_DEPLOY" -n cnpg-system \ - -o jsonpath='{.spec.template.spec.containers[0].env}' | jq . || true - exit 1 - fi - # --- Verify DB Pod Stability --- echo "" echo "--- Verifying DB Pod Stability ---" @@ -352,7 +321,6 @@ jobs: echo "" echo "✅ Step 1 passed: Operator control plane upgraded successfully" echo " Operator: $RELEASED_OPERATOR_IMAGE → $UPGRADED_OPERATOR_IMAGE" - echo " ENABLE_INSTANCE_MANAGER_INPLACE_UPDATES: '${PRE_UPGRADE_ENV:-}' → '$POST_UPGRADE_ENV'" - name: Setup port forwarding for operator upgrade verification uses: ./.github/actions/setup-port-forwarding @@ -529,16 +497,16 @@ jobs: echo "=== Step 2: Upgrade Both Extension and Gateway Images ===" echo "Testing simultaneous extension + gateway upgrade on ${{ matrix.architecture }}..." - INITIAL_IMAGE="${{ env.DOCUMENTDB_INITIAL_IMAGE }}" + OLD_EXTENSION="${{ env.DOCUMENTDB_OLD_IMAGE }}" NEW_EXTENSION="${{ env.DOCUMENTDB_IMAGE }}" OLD_GATEWAY="${{ env.GATEWAY_OLD_IMAGE }}" NEW_GATEWAY="${{ env.GATEWAY_IMAGE }}" - # Verify baseline: cluster deployed with initial images + # Verify baseline: cluster deployed with old images CURRENT_EXTENSION=$(kubectl get documentdb $DB_NAME -n $DB_NS -o jsonpath='{.spec.documentDBImage}') echo "Current extension image: $CURRENT_EXTENSION" - if [[ "$CURRENT_EXTENSION" != "$INITIAL_IMAGE" ]]; then - echo "❌ Expected initial image $INITIAL_IMAGE but found $CURRENT_EXTENSION" + if [[ "$CURRENT_EXTENSION" != "$OLD_EXTENSION" ]]; then + echo "❌ Expected old extension image $OLD_EXTENSION but found $CURRENT_EXTENSION" exit 1 fi @@ -548,17 +516,29 @@ jobs: echo "❌ Expected old gateway image $OLD_GATEWAY but found $CURRENT_GATEWAY" exit 1 fi - echo "✓ Cluster deployed with initial images" + echo "✓ Cluster deployed with old images" - # Record version before upgrade - # Note: schemaVersion may be empty when using combined image with older operator versions. + # Record and verify version before upgrade VERSION_BEFORE=$(kubectl get documentdb $DB_NAME -n $DB_NS -o jsonpath='{.status.schemaVersion}') - echo "DocumentDB schema version before upgrade: ${VERSION_BEFORE:-}" + echo "DocumentDB schema version before upgrade: $VERSION_BEFORE" + + # The image tag (e.g., "0.110.0") now matches status.schemaVersion directly + OLD_VERSION_TAG="${OLD_EXTENSION##*:}" + EXPECTED_VERSION_BEFORE="$OLD_VERSION_TAG" + if [[ -z "$VERSION_BEFORE" ]]; then + echo "❌ status.schemaVersion is empty before upgrade" + exit 1 + fi + if [[ "$VERSION_BEFORE" != "$EXPECTED_VERSION_BEFORE" ]]; then + echo "❌ Expected version $EXPECTED_VERSION_BEFORE before upgrade but found $VERSION_BEFORE" + exit 1 + fi + echo "✓ DocumentDB version matches expected $EXPECTED_VERSION_BEFORE before upgrade" # Patch both images simultaneously echo "" echo "Upgrading both images..." - echo " Extension: $INITIAL_IMAGE → $NEW_EXTENSION" + echo " Extension: $OLD_EXTENSION → $NEW_EXTENSION" echo " Gateway: $OLD_GATEWAY → $NEW_GATEWAY" kubectl patch documentdb $DB_NAME -n $DB_NS --type='merge' \ -p "{\"spec\":{\"documentDBImage\":\"$NEW_EXTENSION\",\"gatewayImage\":\"$NEW_GATEWAY\"}}" @@ -639,7 +619,7 @@ jobs: echo "" echo "✅ Step 2 passed: Both images upgraded successfully" - echo " Extension: $INITIAL_IMAGE → $NEW_EXTENSION" + echo " Extension: $OLD_EXTENSION → $NEW_EXTENSION" echo " Gateway: $OLD_GATEWAY → $NEW_GATEWAY" - name: Setup port forwarding for upgrade verification