From a39837cc87a6dcdb0722e32e7e5054cd0fd338fa Mon Sep 17 00:00:00 2001
From: Flegma
Date: Wed, 8 Apr 2026 14:21:36 +0200
Subject: [PATCH 1/3] fix: add resource limits, PodDisruptionBudgets, and backup error handling

---
Review notes (free-form text here is ignored by git am):
- This series was restored from a whitespace-collapsed copy. Line breaks,
  YAML indentation and blank context lines are reconstructed from the hunk
  counts and conventional manifest layout; if a hunk does not match, apply
  with `git apply --ignore-whitespace` or regenerate with git format-patch.
- hasura/ and timescaledb/ kustomization.yaml now end with a newline.
- Backup script error messages are written to stderr.
- The index blob ids of those three files are therefore stale (only
  relevant for `git am -3`).

 base/api/deployment.yaml                  |  7 +++++++
 base/api/kustomization.yaml               |  1 +
 base/api/pdb.yaml                         | 10 ++++++++++
 base/backups/postgres-backup-cronjob.yaml | 14 +++++++++++---
 base/hasura/deployment.yaml               |  7 +++++++
 base/hasura/kustomization.yaml            |  3 ++-
 base/hasura/pdb.yaml                      | 10 ++++++++++
 base/minio/stateful-set.yaml              |  7 +++++++
 base/redis/stateful-set.yaml              |  7 +++++++
 base/timescaledb/kustomization.yaml       |  3 ++-
 base/timescaledb/pdb.yaml                 | 10 ++++++++++
 base/timescaledb/stateful-set.yaml        |  7 +++++++
 base/typesense/stateful-set.yaml          |  7 +++++++
 base/web/deployment.yaml                  |  7 +++++++
 14 files changed, 95 insertions(+), 5 deletions(-)
 create mode 100644 base/api/pdb.yaml
 create mode 100644 base/hasura/pdb.yaml
 create mode 100644 base/timescaledb/pdb.yaml

diff --git a/base/api/deployment.yaml b/base/api/deployment.yaml
index 5803d31..d581260 100644
--- a/base/api/deployment.yaml
+++ b/base/api/deployment.yaml
@@ -37,6 +37,13 @@ spec:
       containers:
         - image: ghcr.io/5stackgg/api:latest
           name: api
+          resources:
+            requests:
+              memory: "256Mi"
+              cpu: "250m"
+            limits:
+              memory: "512Mi"
+              cpu: "500m"
           ports:
             - containerPort: 5585
           startupProbe:
diff --git a/base/api/kustomization.yaml b/base/api/kustomization.yaml
index 9d3a970..72ebfd2 100644
--- a/base/api/kustomization.yaml
+++ b/base/api/kustomization.yaml
@@ -4,6 +4,7 @@ kind: Kustomization
 resources:
   - deployment.yaml
   - service.yaml
+  - pdb.yaml
   - ingress.yaml
   - ingress-ws.yaml
   - ingress-relay.yaml
diff --git a/base/api/pdb.yaml b/base/api/pdb.yaml
new file mode 100644
index 0000000..f463d85
--- /dev/null
+++ b/base/api/pdb.yaml
@@ -0,0 +1,10 @@
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  name: api-pdb
+  namespace: 5stack
+spec:
+  minAvailable: 1
+  selector:
+    matchLabels:
+      app: api
diff --git a/base/backups/postgres-backup-cronjob.yaml b/base/backups/postgres-backup-cronjob.yaml
index d8d46a9..a2a9102 100644
--- a/base/backups/postgres-backup-cronjob.yaml
+++ b/base/backups/postgres-backup-cronjob.yaml
@@ -21,7 +21,7 @@ spec:
                 - |
                   set -euo pipefail
 
-                  apk add --no-cache zip curl ca-certificates bash aws-cli || true
+                  apk add --no-cache zip curl ca-certificates bash aws-cli
 
                   TS=$(date -u +"%Y%m%d%H%M%S")
                   DUMP_FILE="/tmp/backup-$TS.dump"
@@ -30,7 +30,12 @@ spec:
                   export PGPASSWORD="$POSTGRES_PASSWORD"
 
                   pg_dump -h "$POSTGRES_HOST" -p "$POSTGRES_PORT" -U "$POSTGRES_USER" -d "$POSTGRES_DB" -F c -Z 0 --no-owner --no-privileges -f "$DUMP_FILE"
-                  
+
+                  if [ ! -f "$DUMP_FILE" ] || [ ! -s "$DUMP_FILE" ]; then
+                    echo "ERROR: pg_dump failed or produced empty file" >&2
+                    exit 1
+                  fi
+
                   zip -j "$ZIP_FILE" "$DUMP_FILE"
 
                   ENDPOINT="${S3_ENDPOINT:-minio}"
@@ -49,7 +54,10 @@ spec:
 
                   echo "Uploading to $ENDPOINT : $S3_DB_BACKUP_BUCKET/backup-$TS.zip"
 
-                  aws s3 cp "$ZIP_FILE" "s3://$S3_DB_BACKUP_BUCKET/backup-$TS.zip" --endpoint-url "$ENDPOINT"
+                  if ! aws s3 cp "$ZIP_FILE" "s3://$S3_DB_BACKUP_BUCKET/backup-$TS.zip" --endpoint-url "$ENDPOINT"; then
+                    echo "ERROR: S3 upload failed" >&2
+                    exit 1
+                  fi
 
                   echo "Backup complete: $ZIP_FILE"
 
diff --git a/base/hasura/deployment.yaml b/base/hasura/deployment.yaml
index 6d41f5d..fb61a9b 100644
--- a/base/hasura/deployment.yaml
+++ b/base/hasura/deployment.yaml
@@ -36,6 +36,13 @@ spec:
       containers:
         - image: hasura/graphql-engine:v2.48.9-ce.cli-migrations-v3
           name: hasura
+          resources:
+            requests:
+              memory: "256Mi"
+              cpu: "250m"
+            limits:
+              memory: "512Mi"
+              cpu: "500m"
           ports:
             - containerPort: 8080
           startupProbe:
diff --git a/base/hasura/kustomization.yaml b/base/hasura/kustomization.yaml
index 350ec31..78e44d0 100644
--- a/base/hasura/kustomization.yaml
+++ b/base/hasura/kustomization.yaml
@@ -4,4 +4,5 @@ kind: Kustomization
 resources:
   - deployment.yaml
   - service.yaml
-  - ingress.yaml
\ No newline at end of file
+  - ingress.yaml
+  - pdb.yaml
diff --git a/base/hasura/pdb.yaml b/base/hasura/pdb.yaml
new file mode 100644
index 0000000..c020e63
--- /dev/null
+++ b/base/hasura/pdb.yaml
@@ -0,0 +1,10 @@
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  name: hasura-pdb
+  namespace: 5stack
+spec:
+  minAvailable: 1
+  selector:
+    matchLabels:
+      app: hasura
diff --git a/base/minio/stateful-set.yaml b/base/minio/stateful-set.yaml
index 7c591b0..7fe77f8 100644
--- a/base/minio/stateful-set.yaml
+++ b/base/minio/stateful-set.yaml
@@ -31,6 +31,13 @@ spec:
       containers:
         - name: minio
           image: quay.io/minio/minio:latest
+          resources:
+            requests:
+              memory: "256Mi"
+              cpu: "250m"
+            limits:
+              memory: "512Mi"
+              cpu: "500m"
           ports:
             - containerPort: 9000
             - containerPort: 9090
diff --git a/base/redis/stateful-set.yaml b/base/redis/stateful-set.yaml
index fdc45c4..9cc7d3c 100644
--- a/base/redis/stateful-set.yaml
+++ b/base/redis/stateful-set.yaml
@@ -30,6 +30,13 @@ spec:
       containers:
         - name: redis
           image: redis:8.4-alpine
+          resources:
+            requests:
+              memory: "128Mi"
+              cpu: "100m"
+            limits:
+              memory: "256Mi"
+              cpu: "250m"
           ports:
             - containerPort: 6379
               protocol: TCP
diff --git a/base/timescaledb/kustomization.yaml b/base/timescaledb/kustomization.yaml
index 2bab9aa..5d3fc18 100644
--- a/base/timescaledb/kustomization.yaml
+++ b/base/timescaledb/kustomization.yaml
@@ -3,4 +3,5 @@ kind: Kustomization
 
 resources:
   - stateful-set.yaml
-  - service.yaml
\ No newline at end of file
+  - service.yaml
+  - pdb.yaml
diff --git a/base/timescaledb/pdb.yaml b/base/timescaledb/pdb.yaml
new file mode 100644
index 0000000..e073c9b
--- /dev/null
+++ b/base/timescaledb/pdb.yaml
@@ -0,0 +1,10 @@
+apiVersion: policy/v1
+kind: PodDisruptionBudget
+metadata:
+  name: timescaledb-pdb
+  namespace: 5stack
+spec:
+  minAvailable: 1
+  selector:
+    matchLabels:
+      app: timescaledb
diff --git a/base/timescaledb/stateful-set.yaml b/base/timescaledb/stateful-set.yaml
index 6af54eb..15a88d4 100644
--- a/base/timescaledb/stateful-set.yaml
+++ b/base/timescaledb/stateful-set.yaml
@@ -29,6 +29,13 @@ spec:
       containers:
         - name: timescaledb
           image: timescale/timescaledb:latest-pg17
+          resources:
+            requests:
+              memory: "512Mi"
+              cpu: "500m"
+            limits:
+              memory: "1Gi"
+              cpu: "1000m"
           args:
             - postgres
             - '-c'
diff --git a/base/typesense/stateful-set.yaml b/base/typesense/stateful-set.yaml
index f8076fb..ba54344 100644
--- a/base/typesense/stateful-set.yaml
+++ b/base/typesense/stateful-set.yaml
@@ -31,6 +31,13 @@ spec:
       containers:
         - name: typesense
           image: typesense/typesense:29.0
+          resources:
+            requests:
+              memory: "256Mi"
+              cpu: "250m"
+            limits:
+              memory: "512Mi"
+              cpu: "500m"
           ports:
             - containerPort: 8108
           startupProbe:
diff --git a/base/web/deployment.yaml b/base/web/deployment.yaml
index b1aedbe..3062d87 100644
--- a/base/web/deployment.yaml
+++ b/base/web/deployment.yaml
@@ -36,6 +36,13 @@ spec:
       containers:
         - image: ghcr.io/5stackgg/web:latest
           name: web
+          resources:
+            requests:
+              memory: "128Mi"
+              cpu: "100m"
+            limits:
+              memory: "256Mi"
+              cpu: "250m"
           ports:
             - containerPort: 3000
           startupProbe:

From d26d09ed342ea3862ff8746cb3182db8eaef6acd Mon Sep 17 00:00:00 2001
From: Flegma
Date: Wed, 8 Apr 2026 14:50:04 +0200
Subject: [PATCH 2/3] fix: use maxUnavailable PDBs and increase API/Hasura resource limits

Change PDBs from minAvailable:1 to maxUnavailable:1 so single-replica
workloads don't block node drains and cluster upgrades. Bump API and
Hasura memory limits from 512Mi to 1Gi and CPU from 500m to 1000m to
handle NestJS+BullMQ+WebSocket and Hasura subscription load.
---
Review note (ignored by git am): line structure and indentation were
reconstructed from a whitespace-collapsed copy; verify context before use.

 base/api/deployment.yaml    | 4 ++--
 base/api/pdb.yaml           | 2 +-
 base/hasura/deployment.yaml | 4 ++--
 base/hasura/pdb.yaml        | 2 +-
 base/timescaledb/pdb.yaml   | 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/base/api/deployment.yaml b/base/api/deployment.yaml
index d581260..d199953 100644
--- a/base/api/deployment.yaml
+++ b/base/api/deployment.yaml
@@ -42,8 +42,8 @@ spec:
               memory: "256Mi"
               cpu: "250m"
             limits:
-              memory: "512Mi"
-              cpu: "500m"
+              memory: "1Gi"
+              cpu: "1000m"
           ports:
             - containerPort: 5585
           startupProbe:
diff --git a/base/api/pdb.yaml b/base/api/pdb.yaml
index f463d85..df12532 100644
--- a/base/api/pdb.yaml
+++ b/base/api/pdb.yaml
@@ -4,7 +4,7 @@ metadata:
   name: api-pdb
   namespace: 5stack
 spec:
-  minAvailable: 1
+  maxUnavailable: 1
   selector:
     matchLabels:
       app: api
diff --git a/base/hasura/deployment.yaml b/base/hasura/deployment.yaml
index fb61a9b..c5fe5f6 100644
--- a/base/hasura/deployment.yaml
+++ b/base/hasura/deployment.yaml
@@ -41,8 +41,8 @@ spec:
               memory: "256Mi"
               cpu: "250m"
             limits:
-              memory: "512Mi"
-              cpu: "500m"
+              memory: "1Gi"
+              cpu: "1000m"
           ports:
             - containerPort: 8080
           startupProbe:
diff --git a/base/hasura/pdb.yaml b/base/hasura/pdb.yaml
index c020e63..7e2e8e7 100644
--- a/base/hasura/pdb.yaml
+++ b/base/hasura/pdb.yaml
@@ -4,7 +4,7 @@ metadata:
   name: hasura-pdb
   namespace: 5stack
 spec:
-  minAvailable: 1
+  maxUnavailable: 1
   selector:
     matchLabels:
       app: hasura
diff --git a/base/timescaledb/pdb.yaml b/base/timescaledb/pdb.yaml
index e073c9b..5417f88 100644
--- a/base/timescaledb/pdb.yaml
+++ b/base/timescaledb/pdb.yaml
@@ -4,7 +4,7 @@ metadata:
   name: timescaledb-pdb
   namespace: 5stack
 spec:
-  minAvailable: 1
+  maxUnavailable: 1
   selector:
     matchLabels:
       app: timescaledb

From ffa968e4d723a5cb3add27166309a5a27e06f4f2 Mon Sep 17 00:00:00 2001
From: Flegma
Date: Fri, 10 Apr 2026 11:42:10 +0200
Subject: [PATCH 3/3] fix: increase API and Hasura memory limits to 4Gi

---
Review note (ignored by git am): line structure and indentation were
reconstructed from a whitespace-collapsed copy; verify context before use.

 base/api/deployment.yaml    | 2 +-
 base/hasura/deployment.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/base/api/deployment.yaml b/base/api/deployment.yaml
index d199953..2569f22 100644
--- a/base/api/deployment.yaml
+++ b/base/api/deployment.yaml
@@ -42,7 +42,7 @@ spec:
               memory: "256Mi"
               cpu: "250m"
             limits:
-              memory: "1Gi"
+              memory: "4Gi"
               cpu: "1000m"
           ports:
             - containerPort: 5585
diff --git a/base/hasura/deployment.yaml b/base/hasura/deployment.yaml
index c5fe5f6..1591778 100644
--- a/base/hasura/deployment.yaml
+++ b/base/hasura/deployment.yaml
@@ -41,7 +41,7 @@ spec:
               memory: "256Mi"
               cpu: "250m"
             limits:
-              memory: "1Gi"
+              memory: "4Gi"
               cpu: "1000m"
           ports:
             - containerPort: 8080