diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml new file mode 100644 index 0000000..f4aa404 --- /dev/null +++ b/.github/workflows/docker-build.yml @@ -0,0 +1,59 @@ +name: Build and Push CI Deployment Cleanup Docker Image + +on: + push: + tags: [ 'ci-deployment-cleanup-v*' ] + workflow_dispatch: + +env: + REGISTRY: ghcr.io + IMAGE_NAME: swissdatasciencecenter/renku-ci-cleanup + +jobs: + build: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + id-token: write + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Log in to Container Registry + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Extract version from tag + id: version + run: | + VERSION=$(echo "${{ github.ref_name }}" | sed 's/ci-deployment-cleanup-v//') + echo "version=$VERSION" >> $GITHUB_OUTPUT + + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + tags: | + type=semver,pattern={{version}},value=${{ steps.version.outputs.version }} + type=sha + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build and push Docker image + uses: docker/build-push-action@v5 + with: + context: . 
+ file: ci-deployment-cleanup/Dockerfile + platforms: linux/amd64,linux/arm64 + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max diff --git a/.github/workflows/helm-chart.yml b/.github/workflows/helm-chart.yml new file mode 100644 index 0000000..86bc77a --- /dev/null +++ b/.github/workflows/helm-chart.yml @@ -0,0 +1,56 @@ +name: Package and Push Helm Chart + +on: + push: + tags: [ 'ci-deployment-cleanup-v*' ] + workflow_dispatch: + +env: + REGISTRY: ghcr.io + +jobs: + helm-chart: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + id-token: write + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install Helm + uses: azure/setup-helm@v4 + with: + version: '3.14.0' + + - name: Log in to Container Registry + if: github.event_name != 'pull_request' + uses: docker/login-action@v3 + with: + registry: ${{ env.REGISTRY }} + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Determine app version + id: version + run: | + VERSION=$(echo "${{ github.ref_name }}" | sed 's/ci-deployment-cleanup-v//') + echo "appVersion=$VERSION" >> $GITHUB_OUTPUT + + - name: Update Chart.yaml with app version + run: | + cd ci-deployment-cleanup/helm-chart + sed -i "s/appVersion: .*/appVersion: \"${{ steps.version.outputs.appVersion }}\"/" Chart.yaml + + - name: Lint Helm chart + run: | + cd ci-deployment-cleanup + helm lint helm-chart/ + + - name: Package and push Helm chart + run: | + cd ci-deployment-cleanup + helm package helm-chart/ + helm push *.tgz oci://${{ env.REGISTRY }}/swissdatasciencecenter/helm-charts diff --git a/.github/workflows/lint-cleanup-script.yml b/.github/workflows/lint-cleanup-script.yml new file mode 100644 index 0000000..cf78754 --- /dev/null +++ b/.github/workflows/lint-cleanup-script.yml @@ -0,0 +1,33 @@ +name: Lint cleanup.py script + +on: + push: + paths: + - 
'ci-deployment-cleanup/helm-chart/cleanup.py' + pull_request: + paths: + - 'ci-deployment-cleanup/helm-chart/cleanup.py' + +permissions: + contents: read + +jobs: + ruff: + runs-on: ubuntu-24.04 + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install Ruff + run: pip install ruff + + - name: Run Ruff format check + run: ruff format --check ci-deployment-cleanup/helm-chart/cleanup.py + + - name: Run Ruff linting + run: ruff check ci-deployment-cleanup/helm-chart/cleanup.py \ No newline at end of file diff --git a/ci-deployment-cleanup/Dockerfile b/ci-deployment-cleanup/Dockerfile new file mode 100644 index 0000000..b9222c9 --- /dev/null +++ b/ci-deployment-cleanup/Dockerfile @@ -0,0 +1,60 @@ +FROM golang:1.24-alpine AS builder + +RUN apk add --no-cache --no-scripts make bash git || apk fix + +WORKDIR /app + +# Copy renku-dev-utils files +COPY . . + +# Build the rdu binary +RUN make rdu + +FROM alpine:3.18 + +RUN apk add --no-cache \ + bash \ + curl \ + ca-certificates \ + jq \ + openssl \ + python3 \ + py3-pip \ + && ARCH=$(case $(uname -m) in x86_64) echo amd64;; aarch64) echo arm64;; *) echo amd64;; esac) \ + && curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/${ARCH}/kubectl" \ + && chmod +x kubectl \ + && mv kubectl /usr/local/bin/ + +# Install Python dependencies +RUN pip3 install --no-cache-dir \ + python-dateutil \ + PyGithub \ + kubernetes + +RUN curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 \ + && chmod 700 get_helm.sh \ + && ./get_helm.sh \ + && rm get_helm.sh + +# Copy the rdu binary from builder stage +COPY --from=builder /app/build/renku-dev-utils /usr/local/bin/rdu + +# Make rdu executable +RUN chmod +x /usr/local/bin/rdu + +# Create a non-root user +RUN addgroup -g 1000 appuser && \ + adduser -u 1000 -G appuser -s /bin/bash -D appuser + +# Switch to 
non-root user +USER appuser + +# Set working directory +WORKDIR /home/appuser + +# Verify installations +RUN rdu version || echo "rdu installed" && \ + kubectl version --client && \ + helm version + +CMD ["/bin/bash"] diff --git a/ci-deployment-cleanup/README.md b/ci-deployment-cleanup/README.md new file mode 100644 index 0000000..5a245b0 --- /dev/null +++ b/ci-deployment-cleanup/README.md @@ -0,0 +1,51 @@ +# Renku CI Deployment Cleanup + +A Kubernetes-based CI deployment cleanup system that uses a Helm chart to deploy automated cleanup of old Renku CI deployments. This system runs as a CronJob that leverages the `rdu` tool for comprehensive cleanup. + +## Installation + +Install the Helm chart: +```bash +helm install renku-ci-cleanup ./helm-chart +``` + +## Exemption + +Namespaces can be exempted from cleanup by adding the label `renku.io/cleanup-exempt: "true"` to the namespace. + +## How It Works + +1. The CronJob runs on the specified schedule (default: every 6 hours) +2. It queries Kubernetes for ALL namespaces in the cluster +3. For each namespace found: + - Checks if the namespace has the exemption label (if so, skips it) + - Checks if the namespace name matches any of the configured patterns (if enforcement is enabled) + - Calculates the age based on the namespace creation timestamp + - Checks GitHub PR status for PR-based cleanup (if enabled) + - If the namespace is older than the configured threshold AND matches the naming patterns AND is not exempt, it uses `rdu cleanup-deployment` to: + - Delete all sessions + - Uninstall all Helm releases + - Delete all jobs and PVCs + - Delete the entire namespace +4. 
Logging shows what actions were taken, including exemption and pattern matching results + +## Key Configuration + +The main configuration options in `values.yaml`: + +- `cleanup.maxAge`: Maximum age in hours before cleanup (default: 720 hours / 30 days) +- `cleanup.dryRun`: Enable dry-run mode (default: false) +- `cleanup.namespacePatterns`: List of regex patterns for namespace names +- `cleanup.enforceNamePatterns`: Enable strict pattern matching (default: true) +- `cleanup.prCleanup.enabled`: Enable GitHub PR-based cleanup (default: false) +- `cronJob.schedule`: Cron schedule (default: "0 */6 * * *" - every 6 hours) + +## PR-Based Cleanup + +The system supports GitHub PR-based cleanup that can automatically clean up namespaces when their associated pull requests are closed or merged. This feature requires: + +- `cleanup.prCleanup.enabled: true` +- A GitHub API token set in `cleanup.prCleanup.githubToken` +- Repository mappings in `cleanup.prCleanup.repositories` + +Each entry in `cleanup.prCleanup.repositories` maps a namespace regex (`namespacePattern`) to a GitHub repository (`repo`). The PR number is read from the trailing numeric segment of the namespace name, e.g. `ci-renku-1234` maps to PR #1234 in `SwissDataScienceCenter/renku`. 
diff --git a/ci-deployment-cleanup/helm-chart/Chart.yaml b/ci-deployment-cleanup/helm-chart/Chart.yaml new file mode 100644 index 0000000..b2930d1 --- /dev/null +++ b/ci-deployment-cleanup/helm-chart/Chart.yaml @@ -0,0 +1,18 @@ +apiVersion: v2 +name: renku-ci-cleanup +description: A Helm chart for cleaning up old Renku CI deployments +type: application +version: 1.0.0 +appVersion: "1.0.0" +keywords: + - renku + - ci + - cleanup + - deployment +home: https://github.com/SwissDataScienceCenter/renku-dev-utils +sources: + - https://github.com/SwissDataScienceCenter/renku-dev-utils +maintainers: + - name: Renku Team + email: hello@renku.io + url: https://renkulab.io diff --git a/ci-deployment-cleanup/helm-chart/cleanup.py b/ci-deployment-cleanup/helm-chart/cleanup.py new file mode 100755 index 0000000..9562c2f --- /dev/null +++ b/ci-deployment-cleanup/helm-chart/cleanup.py @@ -0,0 +1,343 @@ +import subprocess +import shlex +import shutil +import json +import re +import os +from dateutil import parser +from datetime import datetime, timedelta +from github import Github, Auth +from kubernetes import client, config +import logging + +namespace_patterns_str = os.environ.get("NAMESPACE_PATTERNS", "") +NAMESPACE_REGEXES = namespace_patterns_str.split() if namespace_patterns_str else [] + +GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "") +MAX_AGE_HOURS = int(os.environ.get("MAX_AGE_HOURS", "720")) +DRY_RUN = os.environ.get("DRY_RUN", "false").lower() == "true" + +exemption_label_str = os.environ.get("EXEMPTION_LABEL", "") +if exemption_label_str and "=" in exemption_label_str: + EXEMPTION_ANNOTATION = exemption_label_str.split("=", 1)[0] +else: + EXEMPTION_ANNOTATION = "renku.io/cleanup-exempt" + + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +console_handler = logging.StreamHandler() +console_handler.setLevel(logging.INFO) +formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") +console_handler.setFormatter(formatter) 
class CIDeployment:
    """One Helm release found in a CI namespace, plus PR metadata filled in later.

    The constructor arguments mirror the fields read from ``helm list -o json``.
    ``repo``, ``pr_number`` and ``pr_is_open`` start as ``None`` and are assigned
    by the manager once the namespace has been matched to a repository.

    Two instances are equal (and hash alike) when they describe the same release,
    i.e. the same ``(namespace, name)`` pair. This makes set-based de-duplication
    work when one namespace matches several configured patterns.
    """

    def __init__(self, name, namespace, revision, updated, status, chart, app_version):
        self.name = name
        self.namespace = namespace
        self.revision = revision
        self.updated = updated
        self.status = status
        self.chart = chart
        self.app_version = app_version
        self.repo = None
        self.pr_number = None
        self.pr_is_open = None

    def _identity(self):
        # A Helm release is uniquely identified by its namespace and name.
        return (self.namespace, self.name)

    def __eq__(self, other):
        if not isinstance(other, CIDeployment):
            return NotImplemented
        return self._identity() == other._identity()

    def __hash__(self):
        return hash(self._identity())

    def __repr__(self):
        return f"CIDeployment(namespace={self.namespace!r}, name={self.name!r})"


class NamespaceChecker:
    """Looks up namespaces through the Kubernetes API to see if they opted out of cleanup."""

    def __init__(self):
        # Prefer the in-cluster service account; fall back to a local kubeconfig.
        try:
            config.load_incluster_config()
        except config.ConfigException:
            config.load_kube_config()
        self.v1 = client.CoreV1Api()

    @staticmethod
    def _has_exemption_marker(metadata, key):
        """Return True when ``key`` is set to ``"true"`` in the labels or annotations.

        Labels are checked as well as annotations because the chart and README
        describe the exemption marker as a label, while earlier versions of this
        script only looked at annotations. Accepting both keeps old setups working.
        """
        for mapping in (
            getattr(metadata, "labels", None),
            getattr(metadata, "annotations", None),
        ):
            if mapping and mapping.get(key) == "true":
                return True
        return False

    def is_namespace_exempt(self, namespace_name):
        """Return True when the namespace must be skipped.

        Any error while reading the namespace is logged and treated as "exempt",
        so that a failed lookup never results in a deletion.
        """
        try:
            namespace = self.v1.read_namespace(namespace_name)
            return self._has_exemption_marker(namespace.metadata, EXEMPTION_ANNOTATION)
        except Exception as e:
            logger.error(
                f"Error checking namespace annotations for {namespace_name}: {e}"
            )
            return True


class GithubPRChecker:
    """Thin wrapper around PyGithub for asking whether a pull request is still open."""

    def __init__(self, github_token):
        self.g = Github(auth=Auth.Token(github_token))

    def is_pr_open(self, repo_name, pr_number):
        """Return True when the PR state is ``"open"``.

        Lookup failures are logged and reported as "open", so that an API error
        never causes a deployment to be considered for deletion.
        """
        try:
            repo = self.g.get_repo(repo_name)
            pr = repo.get_pull(pr_number)
            return pr.state == "open"
        except Exception as e:
            logger.error(f"Error checking PR status for {repo_name}#{pr_number}: {e}")
            return True
def _pr_cleanup_enabled():
    """Return True when PR-based cleanup is switched on.

    Reads ``PR_CLEANUP_ENABLED``, which the Helm chart always sets. When the
    variable is absent the feature stays on, matching how the script behaved
    before the flag was honoured.
    """
    return os.environ.get("PR_CLEANUP_ENABLED", "true").strip().lower() == "true"


class CIDeploymentsManager:
    """Finds CI Helm releases and removes those that are stale or whose PR is closed."""

    def __init__(self):
        self.deployments = []

    def get_deployments(self):
        """Populate ``self.deployments`` from ``helm list --all-namespaces``.

        Only releases whose namespace matches one of ``NAMESPACE_REGEXES`` are
        kept, and each (namespace, name) pair is kept once even if several
        patterns match it.

        Raises:
            RuntimeError: if helm exits non-zero or prints nothing.
        """
        command = "helm list --all-namespaces -o json"
        stdout, stderr, returncode = ShellExecution(command).execute(dry_run=False)

        if returncode != 0:
            raise RuntimeError(
                f"helm command failed with return code {returncode}: {stderr}"
            )

        if not stdout:
            raise RuntimeError(f"helm command returned empty output. stderr: {stderr}")

        releases = json.loads(stdout)
        compiled = [re.compile(pattern) for pattern in NAMESPACE_REGEXES]

        found = {}
        for release in releases:
            namespace = release["namespace"]
            if not any(regex.match(namespace) for regex in compiled):
                continue
            key = (namespace, release["name"])
            if key in found:
                continue
            # Only the first 19 characters (date and time to the second) are
            # parsed, so fractional seconds and any timezone suffix are dropped.
            # NOTE(review): the result is later compared with a naive
            # datetime.now(); this presumably relies on helm and this process
            # sharing a timezone - confirm.
            last_activity = parser.parse(release["updated"][:19])
            found[key] = CIDeployment(
                name=release["name"],
                namespace=namespace,
                revision=release["revision"],
                updated=last_activity,
                status=release["status"],
                chart=release["chart"],
                app_version=release["app_version"],
            )
        self.deployments = list(found.values())

    def filter_by_age(self, deployments, hours):
        """Return the deployments last updated more than ``hours`` hours ago."""
        threshold_time = datetime.now() - timedelta(hours=hours)
        return [dep for dep in deployments if dep.updated < threshold_time]

    def filter_by_closed_prs(self, deployments):
        """Return only the deployments whose associated pull request is closed.

        Deployments without a repository or PR number are left out: they have no
        PR to check, so only the age rule may select them. ``pr_is_open`` is
        recorded on every deployment that was actually checked. The GitHub client
        is created lazily, so no token is needed when nothing has a PR mapping.
        """
        pr_checker = None
        closed = []
        for dep in deployments:
            if not (dep.repo and dep.pr_number):
                continue
            if pr_checker is None:
                pr_checker = GithubPRChecker(GITHUB_TOKEN)
            dep.pr_is_open = pr_checker.is_pr_open(dep.repo, int(dep.pr_number))
            if not dep.pr_is_open:
                closed.append(dep)
        return closed

    def filter_exempt_namespaces(self, deployments):
        """Drop deployments that live in a namespace marked as exempt."""
        ns_checker = NamespaceChecker()
        filtered = []
        for dep in deployments:
            if ns_checker.is_namespace_exempt(dep.namespace):
                logger.info(f"Skipping exempt namespace: {dep.namespace}")
            else:
                filtered.append(dep)
        return filtered

    def get_deletable_deployments(self, max_age_hours):
        """Return non-exempt deployments that are too old or whose PR is closed."""
        old = self.filter_by_age(self.deployments, max_age_hours)
        closed_pr = (
            self.filter_by_closed_prs(self.deployments)
            if _pr_cleanup_enabled()
            else []
        )
        # Both lists hold objects taken from self.deployments, so identity is
        # enough to merge them while keeping a stable order.
        candidates = list({id(dep): dep for dep in old + closed_pr}.values())
        return self.filter_exempt_namespaces(candidates)

    def print_deployments(self, deployments):
        """Log the details of each deployment at debug level."""
        for dep in deployments:
            logger.debug(f"\nName: {dep.name}")
            logger.debug(f" Namespace: {dep.namespace}")
            logger.debug(f" Updated: {dep.updated}")
            logger.debug(f" Repo: {dep.repo}")
            logger.debug(f" PR: {dep.pr_number}")
            logger.debug(f" PR Open: {dep.pr_is_open}")

    def exclude_deployments(self, names_to_exclude):
        """Remove deployments whose release name is in ``names_to_exclude``."""
        self.deployments = [
            dep for dep in self.deployments if dep.name not in names_to_exclude
        ]

    def match_namespaces_to_repos(self):
        """Set ``repo`` on each deployment from the first matching namespace pattern."""
        for dep in self.deployments:
            for pattern, repo in NAMESPACE_PATTERN_TO_REPO_MAP.items():
                if re.match(pattern, dep.namespace):
                    dep.repo = repo
                    break

    def assign_pr_numbers(self):
        """Derive ``pr_number`` from the last ``-``-separated part of the namespace."""
        for dep in self.deployments:
            potential_pr = dep.namespace.split("-")[-1]
            try:
                dep.pr_number = int(potential_pr)
            except ValueError:
                logger.info(
                    f"Warning: Could not parse PR number from namespace {dep.namespace}, skipping PR assignment"
                )
                dep.pr_number = None

    def run_cleanup(self, max_age_hours=None, dry_run=None):
        """Run one full cleanup pass.

        Args:
            max_age_hours: age threshold; defaults to ``MAX_AGE_HOURS``.
            dry_run: when true nothing is deleted; defaults to ``DRY_RUN``.

        Returns:
            ``(successful, failed)`` where ``successful`` is a list of namespace
            names and ``failed`` a list of ``(namespace, returncode, stderr)``.
        """
        if max_age_hours is None:
            max_age_hours = MAX_AGE_HOURS
        if dry_run is None:
            dry_run = DRY_RUN

        logger.debug(
            f"Starting cleanup with max_age_hours={max_age_hours}, dry_run={dry_run}"
        )
        if dry_run:
            logger.info("DRY RUN MODE: No actual deletions will be performed")

        logger.debug("Getting CI deployments")
        self.get_deployments()
        logger.debug(f"Found {len(self.deployments)} CI deployments")
        self.match_namespaces_to_repos()
        self.assign_pr_numbers()

        logger.debug("Determining deletable CI deployments")
        deployments_to_delete = self.get_deletable_deployments(max_age_hours)

        logger.info(f"Total CI deployments to delete: {len(deployments_to_delete)}")
        self.print_deployments(deployments=deployments_to_delete)

        successful_deletions = []
        failed_deletions = []
        handled_namespaces = set()

        for deployment in deployments_to_delete:
            # The removal command wipes the whole namespace, so a namespace that
            # holds several releases only needs to be processed once.
            if deployment.namespace in handled_namespaces:
                continue
            handled_namespaces.add(deployment.namespace)

            remover = CIDeploymentRemover(deployment, dry_run=dry_run)
            stdout, stderr, returncode = remover.remove_with_rdu()

            if returncode == 0:
                successful_deletions.append(deployment.namespace)
            else:
                failed_deletions.append((deployment.namespace, returncode, stderr))

        self.print_summary(
            deployments_to_delete, successful_deletions, failed_deletions
        )

        return successful_deletions, failed_deletions

    def print_summary(self, all_deployments, successful, failed):
        """Log totals and, for each failure, the exit code and start of stderr."""
        logger.info("=" * 80)
        logger.info("CLEANUP SUMMARY")
        logger.info("=" * 80)
        logger.info(f"Total CI deployments processed: {len(all_deployments)}")
        logger.info(f"Successful deletions: {len(successful)}")
        logger.info(f"Failed deletions: {len(failed)}")

        if failed:
            logger.error("Failed namespaces:")
            for namespace, returncode, stderr in failed:
                logger.error(f" - {namespace} (exit code: {returncode})")
                if stderr:
                    logger.error(f" Error: {stderr[:200]}")


class CIDeploymentRemover:
    """Removes a single CI deployment by running ``rdu cleanup-deployment``."""

    def __init__(self, deployment, dry_run=True):
        self.deployment = deployment
        self.dry_run = dry_run

    def remove(self):
        """Remove the deployment; returns ``(stdout, stderr, returncode)``."""
        return self.remove_with_rdu()

    def remove_with_rdu(self):
        """Run (or, in dry-run mode, only log) the rdu cleanup command.

        Returns:
            ``(stdout, stderr, returncode)``; a dry run reports return code 0.
        """
        namespace = self.deployment.namespace
        # Quote the namespace so the later shlex.split keeps it as one argument.
        command = f"rdu cleanup-deployment --namespace {shlex.quote(namespace)} --delete-namespace --yes"
        logger.info(
            f"\n{'[DRY RUN] ' if self.dry_run else ''}Deleting namespace: {self.deployment.namespace}"
        )
        logger.debug(f" Updated: {self.deployment.updated}")
        logger.debug(f" Repo: {self.deployment.repo}")
        logger.debug(
            f" PR: {self.deployment.pr_number} (Open: {self.deployment.pr_is_open})"
        )

        if self.dry_run:
            logger.info(f" Command: {command}")
            return "Dry run enabled. No action taken.", "", 0

        logger.debug(f" Executing: {command}")
        stdout, stderr, returncode = ShellExecution(command).execute(dry_run=False)

        if returncode == 0:
            logger.info(
                f" ✓ Successfully deleted namespace: {self.deployment.namespace}"
            )
        else:
            logger.error(
                f" ✗ Failed to delete namespace: {self.deployment.namespace}"
            )
            logger.debug(f" Return code: {returncode}")
            if stderr:
                logger.error(f" Error output: {stderr}")
            if stdout:
                logger.debug(f" Standard output: {stdout}")

        return stdout, stderr, returncode


if __name__ == "__main__":
    # The chart sets DEBUG_MODE; raise both logger and handler levels so that
    # the logger.debug calls above actually reach the console.
    if os.environ.get("DEBUG_MODE", "").strip().lower() == "true":
        logger.setLevel(logging.DEBUG)
        for handler in logger.handlers:
            handler.setLevel(logging.DEBUG)

    # A token is only needed when PR status is going to be queried.
    if _pr_cleanup_enabled() and not GITHUB_TOKEN:
        logger.error("ERROR: GITHUB_TOKEN environment variable is required but not set")
        raise SystemExit(1)

    logger.info(f"Environment: MAX_AGE_HOURS={MAX_AGE_HOURS}, DRY_RUN={DRY_RUN}")

    manager = CIDeploymentsManager()
    manager.run_cleanup()
+*/}} +{{- define "renku-ci-cleanup.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "renku-ci-cleanup.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "renku-ci-cleanup.labels" -}} +helm.sh/chart: {{ include "renku-ci-cleanup.chart" . }} +{{ include "renku-ci-cleanup.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "renku-ci-cleanup.selectorLabels" -}} +app.kubernetes.io/name: {{ include "renku-ci-cleanup.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "renku-ci-cleanup.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "renku-ci-cleanup.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} \ No newline at end of file diff --git a/ci-deployment-cleanup/helm-chart/templates/configmap.yaml b/ci-deployment-cleanup/helm-chart/templates/configmap.yaml new file mode 100644 index 0000000..4b44677 --- /dev/null +++ b/ci-deployment-cleanup/helm-chart/templates/configmap.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "renku-ci-cleanup.fullname" . }}-script + labels: + {{- include "renku-ci-cleanup.labels" . 
{{- /*
CronJob that runs /scripts/cleanup.py (mounted from the "-script" ConfigMap)
on the configured schedule. All script settings are passed as environment variables.
*/}}
apiVersion: batch/v1
kind: CronJob
metadata:
  name: {{ include "renku-ci-cleanup.fullname" . }}
  labels:
    {{- include "renku-ci-cleanup.labels" . | nindent 4 }}
spec:
  schedule: {{ .Values.cronJob.schedule | quote }}
  concurrencyPolicy: {{ .Values.cronJob.concurrencyPolicy }}
  failedJobsHistoryLimit: {{ .Values.cronJob.failedJobsHistoryLimit }}
  successfulJobsHistoryLimit: {{ .Values.cronJob.successfulJobsHistoryLimit }}
  jobTemplate:
    spec:
      template:
        metadata:
          labels:
            {{- include "renku-ci-cleanup.selectorLabels" . | nindent 12 }}
        spec:
          restartPolicy: {{ .Values.cronJob.restartPolicy }}
          serviceAccountName: {{ include "renku-ci-cleanup.serviceAccountName" . }}
          containers:
          - name: cleanup
            image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
            imagePullPolicy: {{ .Values.image.pullPolicy }}
            command:
            - python3
            - /scripts/cleanup.py
            env:
            - name: MAX_AGE_HOURS
              value: {{ .Values.cleanup.maxAge | quote }}
            - name: EXEMPTION_LABEL
              value: {{ .Values.cleanup.exemptionLabel | quote }}
            - name: ENFORCE_NAME_PATTERNS
              value: {{ .Values.cleanup.enforceNamePatterns | quote }}
            {{- if .Values.cleanup.enforceNamePatterns }}
            - name: NAMESPACE_PATTERNS
              value: {{ join " " .Values.cleanup.namespacePatterns | quote }}
            {{- end }}
            - name: PR_CLEANUP_ENABLED
              value: {{ .Values.cleanup.prCleanup.enabled | quote }}
            {{- if .Values.cleanup.prCleanup.enabled }}
            - name: PR_REPOSITORIES
              value: "{{- range $i, $repo := .Values.cleanup.prCleanup.repositories }}{{- if $i }} {{ end }}{{ $repo.namespacePattern }}:{{ $repo.repo }}{{- end }}"
            - name: GITHUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: {{ include "renku-ci-cleanup.fullname" . }}-github-token
                  key: token
                  optional: true
            {{- end }}
            {{- if .Values.cleanup.dryRun }}
            - name: DRY_RUN
              value: "true"
            {{- end }}
            {{- /*
            values.yaml declares the debug switch inside the cleanup section, so it
            is read from there; a top-level "debug.enabled" is accepted as well.
            "default dict" keeps the lookup safe when either key is absent.
            */}}
            {{- if or (default dict .Values.cleanup.debug).enabled (default dict .Values.debug).enabled }}
            - name: DEBUG_MODE
              value: "true"
            {{- end }}
            volumeMounts:
            - name: cleanup-script
              mountPath: /scripts
              readOnly: true
            - name: service-account-token
              mountPath: /var/run/secrets/kubernetes.io/serviceaccount
              readOnly: true
            resources:
              {{- toYaml .Values.resources | nindent 14 }}
          volumes:
          - name: cleanup-script
            configMap:
              name: {{ include "renku-ci-cleanup.fullname" . }}-script
              defaultMode: 0755
          - name: service-account-token
            projected:
              sources:
              - serviceAccountToken:
                  path: token
              - configMap:
                  name: kube-root-ca.crt
                  items:
                  - key: ca.crt
                    path: ca.crt
              - downwardAPI:
                  items:
                  - path: namespace
                    fieldRef:
                      fieldPath: metadata.namespace
          {{- with .Values.nodeSelector }}
          nodeSelector:
            {{- toYaml . | nindent 12 }}
          {{- end }}
          {{- with .Values.affinity }}
          affinity:
            {{- toYaml . | nindent 12 }}
          {{- end }}
          {{- with .Values.tolerations }}
          tolerations:
            {{- toYaml . | nindent 12 }}
          {{- end }}
| nindent 4 }} +rules: +- apiGroups: [""] + resources: ["namespaces"] + verbs: ["get", "list", "delete", "watch"] +- apiGroups: [""] + resources: ["pods", "services", "configmaps", "secrets", "persistentvolumeclaims"] + verbs: ["get", "list", "delete", "deletecollection"] +- apiGroups: ["apps"] + resources: ["deployments", "replicasets", "statefulsets"] + verbs: ["get", "list", "delete"] +- apiGroups: ["batch"] + resources: ["jobs", "cronjobs"] + verbs: ["get", "list", "delete", "deletecollection"] +- apiGroups: ["extensions", "networking.k8s.io"] + resources: ["ingresses"] + verbs: ["get", "list", "delete"] +- apiGroups: ["rbac.authorization.k8s.io"] + resources: ["roles", "rolebindings"] + verbs: ["get", "list", "delete"] +- apiGroups: ["amalthea.dev"] + resources: ["amaltheasessions", "jupyterservers"] + verbs: ["get", "list", "delete", "deletecollection", "update", "patch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "renku-ci-cleanup.fullname" . }} + labels: + {{- include "renku-ci-cleanup.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ include "renku-ci-cleanup.fullname" . }} +subjects: +- kind: ServiceAccount + name: {{ include "renku-ci-cleanup.serviceAccountName" . }} + namespace: {{ .Release.Namespace }} +{{- end }} \ No newline at end of file diff --git a/ci-deployment-cleanup/helm-chart/templates/secret.yaml b/ci-deployment-cleanup/helm-chart/templates/secret.yaml new file mode 100644 index 0000000..1143cd4 --- /dev/null +++ b/ci-deployment-cleanup/helm-chart/templates/secret.yaml @@ -0,0 +1,13 @@ +{{- if .Values.cleanup.prCleanup.enabled }} +{{- if .Values.cleanup.prCleanup.githubToken }} +apiVersion: v1 +kind: Secret +metadata: + name: {{ include "renku-ci-cleanup.fullname" . }}-github-token + labels: + {{- include "renku-ci-cleanup.labels" . 
| nindent 4 }} +type: Opaque +data: + token: {{ .Values.cleanup.prCleanup.githubToken | b64enc }} +{{- end }} +{{- end }} \ No newline at end of file diff --git a/ci-deployment-cleanup/helm-chart/templates/serviceaccount.yaml b/ci-deployment-cleanup/helm-chart/templates/serviceaccount.yaml new file mode 100644 index 0000000..1378a1f --- /dev/null +++ b/ci-deployment-cleanup/helm-chart/templates/serviceaccount.yaml @@ -0,0 +1,12 @@ +{{- if .Values.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "renku-ci-cleanup.serviceAccountName" . }} + labels: + {{- include "renku-ci-cleanup.labels" . | nindent 4 }} + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +{{- end }} \ No newline at end of file diff --git a/ci-deployment-cleanup/helm-chart/values.yaml b/ci-deployment-cleanup/helm-chart/values.yaml new file mode 100644 index 0000000..8112f24 --- /dev/null +++ b/ci-deployment-cleanup/helm-chart/values.yaml @@ -0,0 +1,109 @@ +# Default values for renku-ci-cleanup +# This is a YAML-formatted file + +# Container image configuration +image: + repository: ghcr.io/swissdatasciencecenter/renku-ci-cleanup + pullPolicy: IfNotPresent + # tag defaults to appVersion from Chart.yaml if not specified + tag: "" + +# CronJob configuration +cronJob: + # Cron schedule (every 6 hours by default) + schedule: "0 */6 * * *" + + # Concurrency policy for the cronjob + concurrencyPolicy: Forbid + + # Number of failed jobs to keep + failedJobsHistoryLimit: 5 + + # Number of successful jobs to keep + successfulJobsHistoryLimit: 3 + + # Restart policy for the job pods + restartPolicy: OnFailure + +# Cleanup configuration +cleanup: + # Maximum age in hours for CI deployments before cleanup + maxAge: 720 + + # Label used to exempt namespaces from cleanup + # Namespaces with this label will be skipped regardless of age + exemptionLabel: "renku.io/cleanup-exempt=true" + + # Namespace name patterns to 
match (regex patterns) + # Only namespaces matching these patterns will be considered for cleanup + namespacePatterns: + - "^ci-renku-.*" + - "^renku-blog-ci-.*" + - "^renku-ci-.*" + + # Enable strict name pattern matching (default: true) + # When true, namespaces must match at least one pattern to be cleaned up + enforceNamePatterns: true + + # Dry run mode - set to true to only log what would be deleted + dryRun: false + + # Debug mode + debug: + enabled: false + + # GitHub PR-based cleanup configuration + # Maps namespace patterns to GitHub repositories for PR status checking + # Format: namespace regex pattern -> {repo: "owner/repo", suffixPattern: "regex"} + prCleanup: + enabled: false + # GitHub API token for accessing PR status (required if prCleanup.enabled is true) + # Should be provided via secret or environment variable + githubToken: "" + # Repository mappings + repositories: + - namespacePattern: "^ci-renku-(.+)$" + repo: "SwissDataScienceCenter/renku" + suffixPattern: "(.+)" + - namespacePattern: "^renku-blog-ci-(.+)$" + repo: "SwissDataScienceCenter/renku-blog" + suffixPattern: "(.+)" + # Example: if namespace is "ci-renku-1234", it maps to PR #1234 in SwissDataScienceCenter/renku + +# Service account configuration +serviceAccount: + # Specifies whether a service account should be created + create: true + + # Annotations to add to the service account + annotations: {} + + # The name of the service account to use + name: "" + +# RBAC configuration +rbac: + # Specifies whether RBAC resources should be created + create: true + +# Resource limits and requests +resources: + limits: + cpu: 500m + memory: 512Mi + requests: + cpu: 100m + memory: 128Mi + +# Node selector for pod assignment +nodeSelector: {} + +# Tolerations for pod assignment +tolerations: [] + +# Affinity for pod assignment +affinity: {} + +# Name overrides +nameOverride: "" +fullnameOverride: "" diff --git a/pkg/cmd/cleanupdeployment.go b/pkg/cmd/cleanupdeployment.go index 584a170..f60eecf 
100644 --- a/pkg/cmd/cleanupdeployment.go +++ b/pkg/cmd/cleanupdeployment.go @@ -13,6 +13,7 @@ import ( ns "github.com/SwissDataScienceCenter/renku-dev-utils/pkg/namespace" "github.com/spf13/cobra" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/api/errors" ) var cleanupDeploymentCmd = &cobra.Command{ @@ -49,29 +50,31 @@ func cleanupDeployment(cmd *cobra.Command, args []string) { os.Exit(1) } - // Ask for confirmation - fmt.Printf("This command will perform the following actions in the namespace '%s':\n", namespace) - fmt.Println(" 1. Delete all sessions") - fmt.Println(" 2. Uninstall all helm releases") - fmt.Println(" 3. Delete all jobs") - fmt.Println(" 4. Delete all PVCs") - fmt.Println(" 5. Forcibly delete all sessions") - if deleteNamespace { - fmt.Printf(" 6. Delete the namespace '%s'\n", namespace) - } - proceed, err := askForConfirmation("Proceed?") - if err != nil { - fmt.Println(err) - os.Exit(1) - } - if !proceed { - os.Exit(0) + // Ask for confirmation unless --yes flag is set + if !yes { + fmt.Printf("This command will perform the following actions in the namespace '%s':\n", namespace) + fmt.Println(" 1. Delete all sessions") + fmt.Println(" 2. Uninstall all helm releases") + fmt.Println(" 3. Delete all jobs") + fmt.Println(" 4. Delete all PVCs") + fmt.Println(" 5. Forcibly delete all sessions") + if deleteNamespace { + fmt.Printf(" 6. Delete the namespace '%s'\n", namespace) + } + proceed, err := askForConfirmation("Proceed?") + if err != nil { + fmt.Println(err) + os.Exit(1) + } + if !proceed { + os.Exit(0) + } } // 1. Delete all sessions fmt.Println("1. Delete all sessions") err = k8s.DeleteAllSessions(ctx, client, namespace, k8s.DeleteAllSessionsOptions{}) - if err != nil { + if err != nil && !errors.IsNotFound(err) { fmt.Println(err) os.Exit(1) } @@ -109,7 +112,7 @@ func cleanupDeployment(cmd *cobra.Command, args []string) { // 5. Forcibly delete all sessions fmt.Println("5. 
Forcibly delete all sessions") err = k8s.ForciblyDeleteAllSessions(ctx, client, namespace, k8s.DeleteAllSessionsOptions{}) - if err != nil { + if err != nil && !errors.IsNotFound(err) { fmt.Println(err) os.Exit(1) } @@ -128,6 +131,7 @@ func cleanupDeployment(cmd *cobra.Command, args []string) { func init() { cleanupDeploymentCmd.Flags().StringVarP(&namespace, "namespace", "n", "", "k8s namespace") cleanupDeploymentCmd.Flags().BoolVar(&deleteNamespace, "delete-namespace", false, "if set, the namespace will be deleted") + cleanupDeploymentCmd.Flags().BoolVarP(&yes, "yes", "y", false, "skip confirmation prompt") } func askForConfirmation(question string) (response bool, err error) { diff --git a/pkg/cmd/root.go b/pkg/cmd/root.go index 495a9b6..e23b45a 100644 --- a/pkg/cmd/root.go +++ b/pkg/cmd/root.go @@ -15,6 +15,7 @@ var secretKey string var secretKeyUsername string var secretName string var userEmail string +var yes bool var rootCmd = &cobra.Command{ Use: "rdu", diff --git a/pkg/k8s/client.go b/pkg/k8s/client.go index 6122682..d790b93 100644 --- a/pkg/k8s/client.go +++ b/pkg/k8s/client.go @@ -2,22 +2,32 @@ package k8s import ( "fmt" + "os" "path/filepath" "k8s.io/client-go/dynamic" "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" "k8s.io/client-go/tools/clientcmd" "k8s.io/client-go/util/homedir" ) func GetClientset() (*kubernetes.Clientset, error) { - home := homedir.HomeDir() - if home == "" { - return nil, fmt.Errorf("could not determine home directory") + config, err := rest.InClusterConfig() + if err == nil { + return kubernetes.NewForConfig(config) } - kubeconfig := filepath.Join(home, ".kube", "config") - config, err := clientcmd.BuildConfigFromFlags("", kubeconfig) + kubeconfig := os.Getenv("KUBECONFIG") + if kubeconfig == "" { + home := homedir.HomeDir() + if home == "" { + return nil, fmt.Errorf("could not determine home directory") + } + kubeconfig = filepath.Join(home, ".kube", "config") + } + + config, err = clientcmd.BuildConfigFromFlags("", 
kubeconfig) if err != nil { return nil, err } @@ -26,13 +36,21 @@ func GetClientset() (*kubernetes.Clientset, error) { } func GetDynamicClient() (client *dynamic.DynamicClient, err error) { - home := homedir.HomeDir() - if home == "" { - return nil, fmt.Errorf("could not determine home directory") + config, err := rest.InClusterConfig() + if err == nil { + return dynamic.NewForConfig(config) + } + + kubeconfig := os.Getenv("KUBECONFIG") + if kubeconfig == "" { + home := homedir.HomeDir() + if home == "" { + return nil, fmt.Errorf("could not determine home directory") + } + kubeconfig = filepath.Join(home, ".kube", "config") } - kubeconfig := filepath.Join(home, ".kube", "config") - config, err := clientcmd.BuildConfigFromFlags("", kubeconfig) + config, err = clientcmd.BuildConfigFromFlags("", kubeconfig) if err != nil { return nil, err }