From 41b696d34bfe6308be334168d0143dfb0bb40eab Mon Sep 17 00:00:00 2001 From: Dan Fuchs Date: Wed, 5 Nov 2025 15:30:55 -0600 Subject: [PATCH] Add backup plans and restore plans for RSP GKE clusters Add a backup plan and a restore plan for a complete backup and restore of the RSP and Roundtable clusters. These are compatible with the instructions in the [cluster rebuild runbook](https://sqr-108.lsst.io/). --- .../deployments/roundtable/env/dev-gke.tfvars | 7 +- .../roundtable/env/production-gke.tfvars | 7 +- .../deployments/roundtable/gke/main.tf | 11 +- .../env/integration-gke.tfvars | 7 +- .../env/production-gke.tfvars | 7 +- .../deployments/science-platform/main.tf | 163 ++++++++++++++++++ 6 files changed, 190 insertions(+), 12 deletions(-) diff --git a/environment/deployments/roundtable/env/dev-gke.tfvars b/environment/deployments/roundtable/env/dev-gke.tfvars index 19e89757..5746c163 100644 --- a/environment/deployments/roundtable/env/dev-gke.tfvars +++ b/environment/deployments/roundtable/env/dev-gke.tfvars @@ -67,5 +67,8 @@ gke_backup_agent_config = true enable_dataplane_v2 = true -# Increase this number to force Terraform to update the dev environment. -# Serial: 14 +# If you didn't make any other changes to this file, increase this number to +# force Terraform to update this environment. You may need to do this if you +# changed .tf files in this environment, or if you changed any modules that +# this environment uses, but you didn't change any variables in this file. +# Serial: 15 diff --git a/environment/deployments/roundtable/env/production-gke.tfvars b/environment/deployments/roundtable/env/production-gke.tfvars index 1e1e8075..36c247d6 100644 --- a/environment/deployments/roundtable/env/production-gke.tfvars +++ b/environment/deployments/roundtable/env/production-gke.tfvars @@ -68,5 +68,8 @@ gke_backup_agent_config = true enable_dataplane_v2 = false -# Increase this number to force Terraform to update the production environment. 
-# Serial: 5 +# If you didn't make any other changes to this file, increase this number to +# force Terraform to update this environment. You may need to do this if you +# changed .tf files in this environment, or if you changed any modules that +# this environment uses, but you didn't change any variables in this file. +# Serial: 6 diff --git a/environment/deployments/roundtable/gke/main.tf b/environment/deployments/roundtable/gke/main.tf index 207666ec..68472826 100644 --- a/environment/deployments/roundtable/gke/main.tf +++ b/environment/deployments/roundtable/gke/main.tf @@ -125,10 +125,13 @@ resource "google_gke_backup_restore_plan" "complete" { cluster_resource_restore_scope { # If we're restoring to a DataplaneV2 cluster from a non-DataplaneV2 - # backup, we don't want to restore these resources, since the Calico - # CRDs won't exist. If we're restoring from DataplaneV2 to DataplaneV2, - # then we shouldn't have any of these resources in the backup anyway - # and this won't matter. + # backup, we don't want to restore these resources, since the Calico CRDs + # won't exist. If we're restoring from DataplaneV2 to DataplaneV2, then + # we shouldn't have any of these resources in the backup anyway and this + # won't matter. If we're restoring from non-DataplaneV2 to + # non-DataplaneV2, then these resources get installed by the addon + # anyway, and will not be restored from the backup due to the + # USE_EXISTING_VERSION cluster_resource_conflict_policy. 
excluded_group_kinds { resource_group = "crd.projectcalico.org" diff --git a/environment/deployments/science-platform/env/integration-gke.tfvars b/environment/deployments/science-platform/env/integration-gke.tfvars index 97241f78..f5dfb056 100644 --- a/environment/deployments/science-platform/env/integration-gke.tfvars +++ b/environment/deployments/science-platform/env/integration-gke.tfvars @@ -97,5 +97,8 @@ gke_backup_agent_config = true enable_dataplane_v2 = false -# Increase this number to force Terraform to update the int environment. -# Serial: 10 +# If you didn't make any other changes to this file, increase this number to +# force Terraform to update this environment. You may need to do this if you +# changed .tf files in this environment, or if you changed any modules that +# this environment uses, but you didn't change any variables in this file. +# Serial: 11 diff --git a/environment/deployments/science-platform/env/production-gke.tfvars b/environment/deployments/science-platform/env/production-gke.tfvars index 840c4672..e50bc9b9 100644 --- a/environment/deployments/science-platform/env/production-gke.tfvars +++ b/environment/deployments/science-platform/env/production-gke.tfvars @@ -106,5 +106,8 @@ enable_dataplane_v2 = false # bucket = "lsst-terraform-state" # prefix = "qserv/stable/gke" -# Increase this number to force Terraform to update the prod environment. -# Serial: 4 +# If you didn't make any other changes to this file, increase this number to +# force Terraform to update this environment. You may need to do this if you +# changed .tf files in this environment, or if you changed any modules that +# this environment uses, but you didn't change any variables in this file. 
+# Serial: 5
diff --git a/environment/deployments/science-platform/main.tf b/environment/deployments/science-platform/main.tf
index 6454d54d..34ddec53 100644
--- a/environment/deployments/science-platform/main.tf
+++ b/environment/deployments/science-platform/main.tf
@@ -205,3 +205,166 @@ module "firewall_cert_manager" {
   network      = module.project_factory.network_name
   custom_rules = var.custom_rules
 }
+
+resource "google_gke_backup_backup_plan" "complete" {
+  count = var.cluster_backup_plan != null ? 1 : 0
+
+  name     = module.gke.name
+  cluster  = module.gke.id
+  project  = local.project_id
+  location = "us-central1"
+
+  backup_config {
+    include_volume_data = true
+    include_secrets     = true
+    all_namespaces      = true
+  }
+
+  # If you destroy the associated cluster, Terraform will try to destroy and
+  # recreate this backup plan, which will also try to destroy all of the
+  # backups associated with the plan. If we are trying to intentionally rebuild
+  # a cluster, we will need to destroy it first, and we don't want this backup
+  # plan destroyed.
+  lifecycle {
+    ignore_changes = [
+      cluster,
+      name,
+    ]
+  }
+}
+
+resource "google_gke_backup_restore_plan" "complete" {
+  count = var.cluster_backup_plan != null ? 1 : 0
+
+  name        = module.gke.name
+  project     = local.project_id
+  location    = "us-central1"
+  backup_plan = google_gke_backup_backup_plan.complete[0].id
+  cluster     = module.gke.id
+
+  restore_config {
+    all_namespaces = true
+
+    # We're assuming this restore plan is intended to run on a completely empty
+    # new cluster.
+    namespaced_resource_restore_mode = "FAIL_ON_CONFLICT"
+
+    # We're assuming this restore plan is intended to run on a completely empty
+    # new cluster, so REUSE_VOLUME_HANDLE_FROM_BACKUP won't work.
+    volume_data_restore_policy = "RESTORE_VOLUME_DATA_FROM_BACKUP"
+
+    cluster_resource_conflict_policy = "USE_EXISTING_VERSION"
+
+    cluster_resource_restore_scope {
+      # If we're restoring to a DataplaneV2 cluster from a non-DataplaneV2
+      # backup, we don't want to restore these resources, since the Calico CRDs
+      # won't exist. If we're restoring from DataplaneV2 to DataplaneV2, then
+      # we shouldn't have any of these resources in the backup anyway and this
+      # won't matter. If we're restoring from non-DataplaneV2 to
+      # non-DataplaneV2, then these resources get installed by the addon
+      # anyway, and will not be restored from the backup due to the
+      # USE_EXISTING_VERSION cluster_resource_conflict_policy.
+
+      excluded_group_kinds {
+        resource_group = "crd.projectcalico.org"
+        resource_kind  = "BGPConfiguration"
+      }
+
+      excluded_group_kinds {
+        resource_group = "crd.projectcalico.org"
+        resource_kind  = "BGPFilter"
+      }
+
+      excluded_group_kinds {
+        resource_group = "crd.projectcalico.org"
+        resource_kind  = "BGPPeer"
+      }
+
+      excluded_group_kinds {
+        resource_group = "crd.projectcalico.org"
+        resource_kind  = "BlockAffinity"
+      }
+
+      excluded_group_kinds {
+        resource_group = "crd.projectcalico.org"
+        resource_kind  = "CalicoNodeStatus"
+      }
+
+      excluded_group_kinds {
+        resource_group = "crd.projectcalico.org"
+        resource_kind  = "ClusterInformation"
+      }
+
+      excluded_group_kinds {
+        resource_group = "crd.projectcalico.org"
+        resource_kind  = "FelixConfiguration"
+      }
+
+      excluded_group_kinds {
+        resource_group = "crd.projectcalico.org"
+        resource_kind  = "GlobalBGPConfig"
+      }
+
+      excluded_group_kinds {
+        resource_group = "crd.projectcalico.org"
+        resource_kind  = "GlobalFelixConfig"
+      }
+
+      excluded_group_kinds {
+        resource_group = "crd.projectcalico.org"
+        resource_kind  = "GlobalNetworkPolicy"
+      }
+
+      excluded_group_kinds {
+        resource_group = "crd.projectcalico.org"
+        resource_kind  = "GlobalNetworkSet"
+      }
+
+      excluded_group_kinds {
+        resource_group = "crd.projectcalico.org"
+        resource_kind  = "HostEndpoint"
+      }
+
+      excluded_group_kinds {
+        resource_group = "crd.projectcalico.org"
+        resource_kind  = "IPAMBlock"
+      }
+
+      excluded_group_kinds {
+        resource_group = "crd.projectcalico.org"
+        resource_kind  = "IPAMConfig"
+      }
+
+      excluded_group_kinds {
+        resource_group = "crd.projectcalico.org"
+        resource_kind  = "IPAMHandle"
+      }
+
+      excluded_group_kinds {
+        resource_group = "crd.projectcalico.org"
+        resource_kind  = "IPPool"
+      }
+
+      excluded_group_kinds {
+        resource_group = "crd.projectcalico.org"
+        resource_kind  = "IPReservation"
+      }
+
+      excluded_group_kinds {
+        resource_group = "crd.projectcalico.org"
+        resource_kind  = "KubeControllersConfiguration"
+      }
+    }
+  }
+
+  # If you destroy the associated cluster, Terraform will try to destroy and
+  # recreate this restore plan. If we are trying to intentionally rebuild a
+  # cluster, we will need to destroy it first, and we don't want this restore
+  # plan destroyed.
+  lifecycle {
+    ignore_changes = [
+      cluster,
+      name,
+    ]
+  }
+}