From 5b2259e3d6e8d9d25b61937fad0cb7df587bd9e4 Mon Sep 17 00:00:00 2001 From: Davide Principi Date: Thu, 19 Feb 2026 18:11:41 +0100 Subject: [PATCH 01/27] chore(deps): rebase on Alpine 3.23.4 --- core/build-image.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/core/build-image.sh b/core/build-image.sh index 2304afe7a..dbda27ea6 100755 --- a/core/build-image.sh +++ b/core/build-image.sh @@ -125,7 +125,7 @@ buildah rm "${container}" images+=("${repobase}/${reponame}") echo "Building the restic/rclone image..." -container=$(buildah from docker.io/library/alpine:3.22.4) +container=$(buildah from docker.io/library/alpine:3.23.4) reponame="restic" buildah add "${container}" restic/ / buildah run ${container} sh <<'EOF' @@ -146,7 +146,7 @@ buildah rm "${container}" images+=("${repobase}/${reponame}") echo "Building the rsync image..." -container=$(buildah from docker.io/library/alpine:3.22.4) +container=$(buildah from docker.io/library/alpine:3.23.4) reponame="rsync" buildah run ${container} -- apk add --no-cache rsync buildah add "${container}" rsync/entrypoint.sh /entrypoint.sh @@ -159,7 +159,7 @@ buildah rm "${container}" images+=("${repobase}/${reponame}") echo "Building the support image..." -container=$(buildah from docker.io/library/alpine:3.22.4) +container=$(buildah from docker.io/library/alpine:3.23.4) reponame="support" buildah run ${container} -- sh <<'EOF' apk add --no-cache openvpn gettext-envsubst From 5338ee283b31bcda796d1256d5b1dec6e1521089 Mon Sep 17 00:00:00 2001 From: Davide Principi Date: Thu, 12 Feb 2026 18:14:57 +0100 Subject: [PATCH 02/27] feat(rclone-gateway): Restic Rest & WebDAV server - Two separate images: 1) RCLONE_IMAGE for rclone-gateway.service (HAProxy frontend + Rclone rest/webdav backends), and 2) RESTIC_IMAGE for restic app-level backup clients. - Access to WebDAV and Restic on HTTP port 4694 requires authentication. 
Cluster nodes have unlimited access, whilst applications can only add new backup data with Rest protocol. Authentication and authorization layers are implemented with HAProxy. - During restoration, the application is granted access to the source repository through the Redis HASH "private/nodes/restore_uuid". - The rclonegwctl write-configuration command generates service configuration from Redis DB acls and keys. Output files: rclone.conf, modrepo.map, auth.map, haproxy.cfg, rclone-webdav.env. - Additional "combined" Rclone remote presents all remotes under the same tree with a uniform three level structure: 1. repo uuid 2. module image name, e.g. "traefik" 3. module uuid - The old rclone-webdav.service is replaced by rclone-gateway.service. The local repository originally served by rclone-webdav.service is now accessible through rclone-gateway on the node that hosts it. Assisted-By: copilot:claude-sonnet-4.6 --- core/build-image.sh | 36 ++- .../etc/systemd/system/rclone-gateway.service | 50 ++++ .../etc/systemd/system/rclone-webdav.service | 33 --- .../create-cluster/70start_rclone_gateway | 11 + .../actions/join-node/70start_rclone_gateway | 1 + .../actions/restore-module/50restore_module | 22 +- .../var/lib/nethserver/node/bin/rclonegwctl | 249 ++++++++++++++++++ .../nethserver/node/etc/haproxy/haproxy.cfg | 131 +++++++++ .../nethserver/node/events/acl-changed/50acl | 4 + .../lib/nethserver/node/install-finalize.sh | 6 +- .../var/lib/nethserver/node/uninstall.sh | 2 +- .../node/update-core.d/20restart_webdav | 12 +- .../local/bin/rclone-gateway-entrypoint.sh | 27 ++ .../rclone/usr/local/bin/rclone-serve-restart | 138 ++++++++++ .../usr/local/bin/rclone-wrapper | 0 core/rclone/usr/local/bin/reload-config | 16 ++ docs/core/backup_restore.md | 225 +++++----------- docs/core/database.md | 20 ++ docs/modules/backup_restore.md | 149 +++++++++++ 19 files changed, 928 insertions(+), 204 deletions(-) create mode 100644 
core/imageroot/etc/systemd/system/rclone-gateway.service delete mode 100644 core/imageroot/etc/systemd/system/rclone-webdav.service create mode 100755 core/imageroot/var/lib/nethserver/cluster/actions/create-cluster/70start_rclone_gateway create mode 120000 core/imageroot/var/lib/nethserver/cluster/actions/join-node/70start_rclone_gateway create mode 100755 core/imageroot/var/lib/nethserver/node/bin/rclonegwctl create mode 100644 core/imageroot/var/lib/nethserver/node/etc/haproxy/haproxy.cfg create mode 100755 core/rclone/usr/local/bin/rclone-gateway-entrypoint.sh create mode 100755 core/rclone/usr/local/bin/rclone-serve-restart rename core/{restic => rclone}/usr/local/bin/rclone-wrapper (100%) create mode 100755 core/rclone/usr/local/bin/reload-config create mode 100644 docs/modules/backup_restore.md diff --git a/core/build-image.sh b/core/build-image.sh index dbda27ea6..df1528932 100755 --- a/core/build-image.sh +++ b/core/build-image.sh @@ -76,6 +76,7 @@ printf "CORE_IMAGE=${repobase}/core:%s\n" "${IMAGETAG:-latest}" >> "${core_env_f printf "REDIS_IMAGE=${repobase}/redis:%s\n" "${IMAGETAG:-latest}" >> "${core_env_file}" printf "RSYNC_IMAGE=${repobase}/rsync:%s\n" "${IMAGETAG:-latest}" >> "${core_env_file}" printf "RESTIC_IMAGE=${repobase}/restic:%s\n" "${IMAGETAG:-latest}" >> "${core_env_file}" +printf "RCLONE_IMAGE=${repobase}/rclone:%s\n" "${IMAGETAG:-latest}" >> "${core_env_file}" printf "SUPPORT_IMAGE=${repobase}/support:%s\n" "${IMAGETAG:-latest}" >> "${core_env_file}" printf "PROMTAIL_IMAGE=docker.io/grafana/alloy:v1.11.3\n" >> "${core_env_file}" printf "NODE_EXPORTER_IMAGE=quay.io/prometheus/node-exporter:v1.10.2\n" >> "${core_env_file}" @@ -83,7 +84,7 @@ chmod -c 644 "${core_env_file}" source "${core_env_file}" buildah add "${container}" ${core_env_file} /etc/nethserver/core.env buildah config \ - --label="org.nethserver.images=${REDIS_IMAGE} ${RSYNC_IMAGE} ${RESTIC_IMAGE} ${PROMTAIL_IMAGE} ${SUPPORT_IMAGE} ${NODE_EXPORTER_IMAGE}" \ + 
--label="org.nethserver.images=${REDIS_IMAGE} ${RSYNC_IMAGE} ${RESTIC_IMAGE} ${RCLONE_IMAGE} ${PROMTAIL_IMAGE} ${SUPPORT_IMAGE} ${NODE_EXPORTER_IMAGE}" \ --label="org.nethserver.flags=core_module" \ --entrypoint=/ "${container}" buildah commit "${container}" "${repobase}/${reponame}" @@ -124,22 +125,37 @@ buildah commit "${container}" "${repobase}/${reponame}" buildah rm "${container}" images+=("${repobase}/${reponame}") -echo "Building the restic/rclone image..." +echo "Building the restic image..." container=$(buildah from docker.io/library/alpine:3.23.4) reponame="restic" -buildah add "${container}" restic/ / buildah run ${container} sh <<'EOF' -apk add --no-cache restic rclone -addgroup -S restic -adduser -S -D -H -h /dev/null -s /sbin/nologin -G restic restic -mkdir -v -p -m 0750 /srv/repo -chown -c restic:restic /srv/repo +apk add --no-cache restic EOF buildah config \ --cmd='[]' \ --entrypoint='["/usr/bin/restic"]' \ - --env='RCLONE_CONFIG=/dev/null' \ - --volume=/srv/repo \ + ${container} +buildah commit "${container}" "${repobase}/${reponame}" +buildah rm "${container}" +images+=("${repobase}/${reponame}") + +echo "Building the rclone image..." 
+container=$(buildah from docker.io/library/alpine:3.23.4) +reponame="rclone" +buildah add "${container}" rclone/ / +buildah run ${container} sh <<'EOF' +addgroup -S rclone -g 101 +adduser -u 100 -S -D -h /var/lib/rclone -s /sbin/nologin -G rclone rclone +apk add --no-cache rclone haproxy python3 +EOF +buildah config \ + --user=rclone:rclone \ + --cmd='[]' \ + --entrypoint='["/usr/bin/rclone"]' \ + --env=RCLONE_CACHE_DIR=/var/cache/rclone \ + --env=RCLONE_CONFIG=/etc/rclone/rclone.conf \ + --env=RCLONE_UNIX_SOCKET=/var/lib/rclone/rclone.sock \ + --env=RCLONE_LOG_SYSTEMD=1 \ ${container} buildah commit "${container}" "${repobase}/${reponame}" buildah rm "${container}" diff --git a/core/imageroot/etc/systemd/system/rclone-gateway.service b/core/imageroot/etc/systemd/system/rclone-gateway.service new file mode 100644 index 000000000..71153c131 --- /dev/null +++ b/core/imageroot/etc/systemd/system/rclone-gateway.service @@ -0,0 +1,50 @@ +[Unit] +Description=Rclone Gateway server +After=redis.service +Wants=redis.service +StartLimitIntervalSec=10s +StartLimitBurst=3 +ConditionPathExists=/etc/wireguard/wg0.conf + +[Service] +Type=forking +WorkingDirectory=/var/lib/nethserver/node/state +PIDFile=%t/%N.pid +Environment=PODMAN_SYSTEMD_UNIT=%n +Environment=BACKUP_VOLUME=rclone-webdav +EnvironmentFile=/etc/nethserver/core.env +EnvironmentFile=-/var/lib/nethserver/node/state/rclone-webdav.env +Restart=always +TimeoutStopSec=120 +TimeoutStartSec=120 +SuccessExitStatus=143 +ExecStartPre=/bin/rm -f %t/%N.pid %t/%N.cid +ExecStartPre=mkdir -vp rclone haproxy +ExecStartPre=-runagent -m node rclonegwctl write-configuration --rclonedir=rclone --haproxydir=haproxy +ExecStart=/usr/bin/podman run \ + --conmon-pidfile=%t/%N.pid \ + --cidfile=%t/%N.cid \ + --cgroups=no-conmon \ + --detach \ + --init \ + --log-opt=tag=%N \ + --replace --name=%N \ + --network=host \ + --volume=./rclone:/etc/rclone:ro,Z \ + --volume=./haproxy:/etc/haproxy:ro,Z \ + --volume=${BACKUP_VOLUME}:/srv/repo:z \ + 
--mount=type=tmpfs,tmpfs-size=10M,destination=/var/lib/rclone,chown=true \ + --volume=/dev/log:/dev/log \ + --volume=rclone-cache:/var/cache/rclone:Z \ + --entrypoint=rclone-gateway-entrypoint.sh \ + --env-file=rclone-webdav.env \ + ${RCLONE_IMAGE} +ExecStartPost=bash -c '{ while ! exec 3<>/dev/tcp/127.0.0.1/4694; do sleep 5 ; done } &>/dev/null' +ExecReload=runagent -m node rclonegwctl write-configuration --rclonedir=rclone --haproxydir=haproxy +ExecReload=runagent -m node podman exec %N reload-config +ExecStop=/usr/bin/podman stop --ignore --cidfile %t/%N.cid -t 115 +ExecStopPost=/usr/bin/podman rm --ignore -f --cidfile %t/%N.cid + +[Install] +WantedBy=default.target +Alias=rclone-webdav.service diff --git a/core/imageroot/etc/systemd/system/rclone-webdav.service b/core/imageroot/etc/systemd/system/rclone-webdav.service deleted file mode 100644 index f1dd8c16a..000000000 --- a/core/imageroot/etc/systemd/system/rclone-webdav.service +++ /dev/null @@ -1,33 +0,0 @@ -[Unit] -Description=Rclone WebDAV server - -[Service] -Environment=PODMAN_SYSTEMD_UNIT=%n -Environment=OPTIONS= -Environment=BACKUP_VOLUME=rclone-webdav -EnvironmentFile=/etc/nethserver/core.env -EnvironmentFile=-/var/lib/nethserver/node/state/rclone-webdav.env -Restart=always -TimeoutStopSec=120 -ExecStartPre=/bin/rm -f %t/%N.pid %t/%N.cid -ExecStart=/usr/bin/podman run \ - --conmon-pidfile=%t/%N.pid \ - --cidfile=%t/%N.cid \ - --cgroups=no-conmon \ - --detach \ - --log-opt=tag=%N \ - --replace --name=%N \ - --network=host \ - --user=restic:restic \ - --volume=${BACKUP_VOLUME}:/srv/repo:z \ - --entrypoint=[] \ - ${RESTIC_IMAGE} rclone serve webdav /srv/repo --addr :4694 $OPTIONS - -ExecStop=/usr/bin/podman stop --ignore --cidfile %t/%N.cid -t 90 -ExecStopPost=/usr/bin/podman rm --ignore -f --cidfile %t/%N.cid -PIDFile=%t/%N.pid -Type=forking -SuccessExitStatus=143 - -[Install] -WantedBy=default.target diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/create-cluster/70start_rclone_gateway 
b/core/imageroot/var/lib/nethserver/cluster/actions/create-cluster/70start_rclone_gateway new file mode 100755 index 000000000..1156e5b1f --- /dev/null +++ b/core/imageroot/var/lib/nethserver/cluster/actions/create-cluster/70start_rclone_gateway @@ -0,0 +1,11 @@ +#!/bin/bash + +# +# Copyright (C) 2026 Nethesis S.r.l. +# SPDX-License-Identifier: GPL-3.0-or-later +# + +set -e +exec 1>&2 + +systemctl enable --now rclone-gateway.service diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/join-node/70start_rclone_gateway b/core/imageroot/var/lib/nethserver/cluster/actions/join-node/70start_rclone_gateway new file mode 120000 index 000000000..b3723c0e8 --- /dev/null +++ b/core/imageroot/var/lib/nethserver/cluster/actions/join-node/70start_rclone_gateway @@ -0,0 +1 @@ +../create-cluster/70start_rclone_gateway \ No newline at end of file diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/restore-module/50restore_module b/core/imageroot/var/lib/nethserver/cluster/actions/restore-module/50restore_module index 9806e0077..d92cdb202 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/restore-module/50restore_module +++ b/core/imageroot/var/lib/nethserver/cluster/actions/restore-module/50restore_module @@ -26,6 +26,7 @@ import os import agent.tasks import json import tempfile +import uuid request = json.load(sys.stdin) @@ -66,27 +67,40 @@ if image_name == 'traefik': # merge-like strategy, implemented by the module as wanted. # - Build fake add-module result to maintain consistent action output format. # - Never schedule Traefik for removal to avoid breaking cluster connectivity. + # - Authorize access to source module's repository. 
module_id = agent.resolve_agent_id("traefik@node", node_id=str(node_id)).removeprefix("module/") + new_module_uuid = rdb.hget("cluster/module_uuid", module_id) + agent.assert_exp(new_module_uuid) # Cannot restore without destination UUID add_module_result = { "output": { "module_id": module_id, "image_name": image_name, "image_url": image_url, - "module_uuid": rdb.hget("cluster/module_uuid", module_id), + "module_uuid": new_module_uuid, }, "error": "", "exit_code": 0, } remove_modules = set() + trx = rdb.pipeline() + trx.hset('private/nodes/restore_uuid', new_module_uuid, source_module_uuid) + trx.publish('cluster/event/backup-destination-changed', json.dumps({"destination_ids":[repository]})) + trx.execute() else: - force_uuid = source_module_uuid and (replace_requested or len(remove_modules) == 0) + if source_module_uuid and (replace_requested or len(remove_modules) == 0): + # Force assignment of UUID from the restored environment: + new_module_uuid = source_module_uuid + else: + # Generate a new random UUID: + new_module_uuid = str(uuid.uuid4()) + # Grant access to source module's repository for restoration: + rdb.hset('private/nodes/restore_uuid', new_module_uuid, source_module_uuid) add_module_result = agent.tasks.run("cluster", "add-module", data={ "image": image_url, "node": node_id, **({"volumes": mvolumes} if mvolumes else {}), - # Force assignment of UUID from the restored environment: - **({"module_uuid": source_module_uuid} if force_uuid else {}), + "module_uuid": new_module_uuid, }, endpoint="redis://cluster-leader", progress_callback=agent.get_progress_callback(2, 15) diff --git a/core/imageroot/var/lib/nethserver/node/bin/rclonegwctl b/core/imageroot/var/lib/nethserver/node/bin/rclonegwctl new file mode 100755 index 000000000..58c096f0b --- /dev/null +++ b/core/imageroot/var/lib/nethserver/node/bin/rclonegwctl @@ -0,0 +1,249 @@ +#!/usr/bin/env python3 + +# +# Copyright (C) 2026 Nethesis S.r.l. 
+# SPDX-License-Identifier: GPL-3.0-or-later +# + +import agent +import sys +import os +import argparse +import cluster.backup +import configparser +import time + + +def main(): + rootp = argparse.ArgumentParser() + subp = rootp.add_subparsers(dest="cmd", required=True) + cfgp = subp.add_parser('write-configuration', help="Write configuration files") + cfgp.add_argument("--rclonedir", required=True, help="Write rclone configuration file to given directory") + cfgp.add_argument("--haproxydir", required=True, help="Write HAProxy configuration files to given directory") + waitp = subp.add_parser('wait-webdav', help="Check if WebDAV service is up") + waitp.add_argument("--attempts", required=False, type=int, default=5) + args = rootp.parse_args() + rdb = agent.redis_connect(privileged=True, use_replica=True) + if args.cmd == 'write-configuration': + write_config_env("rclone-webdav.env", rdb) + if args.rclonedir: + rclone_conf = args.rclonedir + '/rclone.conf' + write_rclonecfg(rclone_conf, rdb) + if args.haproxydir: + write_haproxy_cfg(args.haproxydir + '/haproxy.cfg', rdb) + write_auth_map(args.haproxydir + '/auth.map', rdb) + write_modrepomap(args.haproxydir + "/modrepo.map", rdb) + elif args.cmd == 'wait-webdav': + retry_webdav_startup(args.attempts) + +def retry_webdav_startup(attempts): + check_url = cluster.backup.GATEWAY_URL + with cluster.backup.TimeoutSession(timeout=(3, 10)) as rses: + for attempt in range(1, attempts): + req = rses.options(check_url) + if req.status_code == 200: + break + time.sleep(1) + else: + print("[ERROR] Retry attempts exhausted!", file=sys.stderr) + sys.exit(1) + +def write_haproxy_cfg(file_path, rdb): + with open(os.environ['AGENT_INSTALL_DIR'] + '/etc/haproxy/haproxy.cfg', 'r') as fp: + cfg_template = fp.read() + + frontend_data = "\n" + backends_data = "\n" + + try: + rclone_conf_map = rdb.hgetall('private/nodes/backup_destination/rclone_conf') + except Exception as ex: + rclone_conf_map = {} + + mnode_addr = {} + for node_id in 
set(rdb.hvals("cluster/module_node")): + node_addr = rdb.hget(f"node/{node_id}/vpn", "ip_address") + mnode_addr[node_id] = node_addr + + for dest_uuid, rclone_conf in rclone_conf_map.items(): + try: + oconf = cluster.backup.parse_rclone_params(rclone_conf) + backup_conf = rdb.hgetall('cluster/backup_repository/' + dest_uuid) + except Exception as ex: + print(f"[WARNING] Failed to obtain configuration of {dest_uuid}!", ex, file=sys.stderr) + continue + + dtype = oconf.get('type', 'ty') + durl = backup_conf.get('url', '') + + if backup_conf['provider'] == 'cluster': + dest_fe_cfg, dest_be_cfg = generate_haproxy_internal_cfg(dtype, dest_uuid, oconf, durl, mnode_addr) + else: + dest_fe_cfg, dest_be_cfg = generate_haproxy_remote_cfg(rdb, dtype, dest_uuid, mnode_addr) + + frontend_data += dest_fe_cfg + backends_data += dest_be_cfg + + with agent.safe_open(file_path, "w") as fileo: + cfg = cfg_template + cfg = cfg.replace(r"#{{INCLUDE-FRONTEND-SECTION}}#", frontend_data) + cfg = cfg.replace(r"#{{INCLUDE-BACKENDS-SECTION}}#", backends_data) + fileo.write(cfg) + +def generate_haproxy_internal_cfg(dtype, dest_uuid, oconf, durl, mnode_addr): + suuid = dest_uuid[0:8] + + for node_id, node_addr in mnode_addr.items(): + if durl == f"webdav:http://{node_addr}:4694": + serve_on_node = node_id + break + else: + serve_on_node = '' + + if serve_on_node == os.environ["NODE_ID"]: + frontend_data = f" # {dest_uuid} is served by local rclone\n" + backends_data = "" + else: + frontend_data = ( + f" acl route_{suuid} path_beg /{dest_uuid}/\n" + f" use_backend gw_{dtype}_{suuid} if is_loopback route_{suuid}\n\n" + ) + backends_data = ( + f"backend gw_{dtype}_{suuid}\n" + f" option httpchk\n" + f" http-check send meth GET uri /{dest_uuid}/_probe_rest_404 hdr Accept application/vnd.x.restic.rest.v2\n" + f" http-check expect status 404\n" + f" server gw{node_id} {node_addr}:4694 check observe layer7 on-error mark-down\n\n" + ) + return frontend_data, backends_data + +def 
generate_haproxy_remote_cfg(rdb, dtype, dest_uuid, mnode_addr): + suuid = dest_uuid[0:8] + frontend_data = ( + f" acl route_{suuid} path_beg /{dest_uuid}/\n" + f" acl forward_{suuid} nbsrv(rest_{dtype}_{suuid}) eq 0\n" + f" use_backend gw_{dtype}_{suuid} if is_loopback route_{suuid} forward_{suuid}\n\n" + ) + + backends_data = ( + f"backend rest_{dtype}_{suuid}\n" + f" option httpchk GET /{dest_uuid}/_probe_rest_404\n" + f" http-check expect status 404\n" + f" server rest0 unix@/var/lib/rclone/backend-restic.sock check observe layer7 on-error mark-down\n" + "\n" + f"backend gw_{dtype}_{suuid}\n" + f" option httpchk\n" + f" http-check send meth GET uri /{dest_uuid}/_probe_rest_404 hdr Accept application/vnd.x.restic.rest.v2\n" + f" http-check expect status 404\n" + ) + + leader_id = rdb.hget('cluster/environment', 'NODE_ID') + for node_id, node_addr in mnode_addr.items(): + if node_id == os.environ["NODE_ID"]: + continue # skip local node + backends_data += " " + backends_data += f"server gw{node_id} {node_addr}:4694 " + if node_id != leader_id and leader_id != os.environ["NODE_ID"]: + backends_data += "backup " + backends_data += "check observe layer7 on-error mark-down\n" + else: + backends_data += "\n" + + return frontend_data, backends_data + +def write_config_env(file_path, rdb): + with agent.safe_open(file_path, "w") as fileo: + cluster_network = rdb.get("cluster/network") or "0.0.0.0/32" + print("CLUSTER_NETWORK=" + cluster_network, file=fileo) + print("NODE_ID=" + os.environ["NODE_ID"], file=fileo) + +def write_auth_map(file_path, rdb): + with agent.safe_open(file_path, "w") as fileo: + try: + redis_acl_list = rdb.smembers('cluster/acls') + except Exception as ex: + print(ex, file=sys.stderr) + redis_acl_list = set() + for acl_entry in redis_acl_list: + acl_fields = acl_entry.split() + user = acl_fields[1] + if user.startswith('node/'): + group = "nodes" + elif user.startswith('module/'): + group = "modules" + else: + continue # skip + secret = 
next(filter(lambda x: x.startswith("#"), acl_fields)) + secret = secret.lstrip("#").lower() + if user and secret: + creds = user + ':' + secret + print(f"{creds:<88} {group}", file=fileo) + +def write_modrepomap(file_path, rdb): + with agent.safe_open(file_path, "w") as fileo: + try: + module_uuid_map = rdb.hgetall('cluster/module_uuid') + except Exception: + module_uuid_map = dict() + try: + restore_map = rdb.hgetall('private/nodes/restore_uuid') + except Exception: + restore_map = dict() + for mid, muuid in module_uuid_map.items(): + # Strip trailing digits to derive the app type + app_type = mid.rstrip("0123456789") + repopath = f"{app_type}/{muuid}" + print(f"module/{mid:<16} {repopath}", file=fileo) + if muuid in restore_map: + # Enable access to source app's repository for restoration + source_uuid = restore_map[muuid] + print(f" {repopath} {app_type}/{source_uuid}", file=fileo) + +def write_rclonecfg(file_path, rdb): + with agent.safe_open(file_path, "w") as fileo: + destinations = [] + print(";\n; generated automatically from Redis DB contents\n;\n", file=fileo) + try: + rclone_conf_map = rdb.hgetall('private/nodes/backup_destination/rclone_conf') + node_vpn_ip = rdb.hget(f"node/{os.environ['NODE_ID']}/vpn", "ip_address") + node_url = f"webdav:http://{node_vpn_ip}:4694" + except Exception as ex: + rclone_conf_map = dict() + for key, conf in rclone_conf_map.items(): + conf = conf.strip() + "\n" + if conf: + try: + backup_conf = rdb.hgetall('cluster/backup_repository/' + key) + if backup_conf['provider'] == 'rclone': + destination_basepath = backup_conf['basepath'] + else: + destination_basepath = cluster.backup.extract_rclone_basepath(backup_conf['url']) + except Exception as ex: + print(agent.SD_WARNING + f"Backup destination {key} config parse failed:", ex, file=sys.stderr) + continue + # Override configuration for the local node repository: + if backup_conf['provider'] == 'cluster': + if backup_conf['url'] == node_url: + conf = f"[{key}]\ntype = local\n" 
+ destination_basepath = "/srv/repo" + else: + continue + destination_basepath = destination_basepath.rstrip('/') + # Double-quote escape + # - https://rclone.org/docs/#connection-strings + # - https://rclone.org/combine/#combine-upstreams + destination_basepath = destination_basepath.replace('"', '""') + destinations.append(f'"{key}={key}:{destination_basepath}"') + print(conf, file=fileo) + # The "combined" remote merges all other destinations in a unique + # logical tree, where root-level elements correspond to destination + # UUIDs. + if destinations: + print(( + "[combined]\n" + "type = combine\n" + "upstreams = " + ' '.join(destinations) + "\n" + ), file=fileo) + +if __name__ == '__main__': + main() diff --git a/core/imageroot/var/lib/nethserver/node/etc/haproxy/haproxy.cfg b/core/imageroot/var/lib/nethserver/node/etc/haproxy/haproxy.cfg new file mode 100644 index 000000000..34923fa21 --- /dev/null +++ b/core/imageroot/var/lib/nethserver/node/etc/haproxy/haproxy.cfg @@ -0,0 +1,131 @@ +# +# HAProxy configuration for NS8 rclone-gateway service +# + +global + maxconn 2048 + presetenv CLUSTER_NETWORK 0.0.0.0/32 # fallback if not set + presetenv NODE_ID 0 + log /dev/log local0 err err + log-tag rclone-gateway + +defaults + log global + mode http + timeout connect 5s + timeout client 5s + timeout server 5s + +# ========================================================== +# GATEWAY FRONTEND (Port 4694) +# Routes to REST backend when the client sends +# "Accept: application/vnd.x.restic.rest.v2"; WebDAV otherwise. 
+# ========================================================== +frontend gateway_frontend + bind :4694 + http-after-response set-header X-HAProxy-Backend "%[be_name]@node%[env(NODE_ID)]" + + # + # Client IP-based authorization + # + http-request set-var(txn.accept_hdr) req.hdr(Accept) + acl is_loopback src 127.0.0.1 ::1 + acl is_trusted src "${CLUSTER_NETWORK}" + http-request deny unless is_loopback or is_trusted + + # + # Health-check authorization + # + acl health_request path_end _probe_rest_404 + http-request allow if health_request + + # + # User authentication + # + + # Make sure credentials are given: + acl has_user http_auth_user -m found + acl has_pass http_auth_pass -m found + http-request auth realm ns8-rclone-gateway unless has_user has_pass + # Build lookup key: "username:sha256hex" -> group + http-request set-var(txn.pass_sha256) http_auth_pass,digest(sha256),hex,lower + http-request set-var(txn.auth_key) http_auth_user,concat(:,txn.pass_sha256) + http-request set-var(txn.auth_group) var(txn.auth_key),map(/etc/haproxy/auth.map) + acl is_node var(txn.auth_group) -m str "nodes" + acl is_module var(txn.auth_group) -m str "modules" + # Credentials check: + http-request auth realm ns8-rclone-gateway unless is_node or is_module + + # Protocol detection: REST client sends this Accept header + acl is_rest hdr(Accept) -m sub "application/vnd.x.restic.rest.v2" + # Workaround for POST {path}?create=true that have no Accept header + acl is_rest url_param(create) -m str "true" + + # + # User authorization + # + + # nodes: always pass through + http-request allow if is_node + + # Common path-based vars for module authorization + http-request set-var(txn.modrepo) http_auth_user,map(/etc/haproxy/modrepo.map) + http-request set-var(txn.restore) var(txn.modrepo),map(/etc/haproxy/modrepo.map) + http-request set-var(txn.appuuid) path,field(4,/),regsub(\.json$,) + http-request set-var(txn.vpath) path,field(3,/),concat(/,txn.appuuid,) + + acl is_root_path path / + acl 
is_read_method method GET HEAD OPTIONS PROPFIND + acl is_repo_match var(txn.vpath),strcmp(txn.modrepo) -m int 0 + acl is_repo_match var(txn.vpath),strcmp(txn.restore) -m int 0 + acl delete_method method DELETE + + # REST-specific ACLs + http-request set-var(txn.rest_type) path,field(5,/) + acl is_lock_type var(txn.rest_type) -m str locks + + # modules (WebDAV): allow PROPFIND / + http-request allow if is_module !is_rest is_read_method is_root_path + # modules (WebDAV): allow read methods on their allowed repo paths + http-request allow if is_module !is_rest is_read_method is_repo_match + + # modules (REST): check repo, allow DELETE on locks only + http-request allow if is_module is_rest is_repo_match is_lock_type delete_method + # modules (REST): check repo and permit non-DELETE queries + http-request allow if is_module is_rest is_repo_match !delete_method + + # deny everything else: + http-request deny + + # runtime-generated config: + #{{INCLUDE-FRONTEND-SECTION}}# + + # default rclone_serve backends: + use_backend default_rest if is_rest + default_backend default_webdav + +# ========================================================== +# DEFAULT RCLONE BACKENDS +# ========================================================== + +backend default_webdav + server default_webdav unix@/var/lib/rclone/backend-webdav.sock + +backend default_rest + server default_rest unix@/var/lib/rclone/backend-restic.sock + +# ========================================================== +# FAILOVER BACKENDS +# For local clients only, provide failover configuration to +# backends that are not reachable from the local node +# ========================================================== + +defaults + log global + mode http + timeout connect 5s + timeout client 5s + timeout server 5s + default-server inter 60s fastinter 3s rise 2 fall 2 + + #{{INCLUDE-BACKENDS-SECTION}}# diff --git a/core/imageroot/var/lib/nethserver/node/events/acl-changed/50acl 
b/core/imageroot/var/lib/nethserver/node/events/acl-changed/50acl index bc4d9f168..fadf97e73 100755 --- a/core/imageroot/var/lib/nethserver/node/events/acl-changed/50acl +++ b/core/imageroot/var/lib/nethserver/node/events/acl-changed/50acl @@ -23,6 +23,10 @@ set -e exec 1>&2 +if [[ ${AGENT_EVENT_SOURCE} != cluster ]] ; then + exit 0 # invalid source, skip restart +fi + # # Reload ACLs # diff --git a/core/imageroot/var/lib/nethserver/node/install-finalize.sh b/core/imageroot/var/lib/nethserver/node/install-finalize.sh index 8d00d3431..cde44dc01 100755 --- a/core/imageroot/var/lib/nethserver/node/install-finalize.sh +++ b/core/imageroot/var/lib/nethserver/node/install-finalize.sh @@ -97,7 +97,11 @@ EOF ) | redis-cli echo "Start API server and core agents:" -systemctl enable --now api-server.service agent@cluster.service agent@node.service rclone-webdav.service +systemctl enable --now \ + api-server.service \ + agent@cluster.service \ + agent@node.service \ + # end of service list echo "Start node timers" systemctl enable --now password-warning.timer diff --git a/core/imageroot/var/lib/nethserver/node/uninstall.sh b/core/imageroot/var/lib/nethserver/node/uninstall.sh index cc37dd7e8..ca5bb0dfd 100644 --- a/core/imageroot/var/lib/nethserver/node/uninstall.sh +++ b/core/imageroot/var/lib/nethserver/node/uninstall.sh @@ -75,7 +75,7 @@ systemctl disable --now \ redis.service \ wg-quick@wg0.service \ phonehome.timer \ - rclone-webdav.service \ + rclone-gateway.service \ promtail.service \ node_exporter.service \ send-heartbeat.service \ diff --git a/core/imageroot/var/lib/nethserver/node/update-core.d/20restart_webdav b/core/imageroot/var/lib/nethserver/node/update-core.d/20restart_webdav index 80124a2dd..2f78c3af2 100755 --- a/core/imageroot/var/lib/nethserver/node/update-core.d/20restart_webdav +++ b/core/imageroot/var/lib/nethserver/node/update-core.d/20restart_webdav @@ -6,7 +6,13 @@ # exec 1>&2 -set -e -# Restart the local backup destination WebDAV server: -systemctl 
restart rclone-webdav.service +if [[ ! -d rclone ]] ; then + # Replace rclone-webdav with the gateway service: + systemctl disable --now rclone-webdav.service + rm -vf /etc/systemd/system/rclone-webdav.service \ + /etc/systemd/system/default.target.wants/rclone-webdav.service + systemctl enable rclone-gateway.service +fi + +systemctl restart rclone-gateway.service diff --git a/core/rclone/usr/local/bin/rclone-gateway-entrypoint.sh b/core/rclone/usr/local/bin/rclone-gateway-entrypoint.sh new file mode 100755 index 000000000..efad71eb0 --- /dev/null +++ b/core/rclone/usr/local/bin/rclone-gateway-entrypoint.sh @@ -0,0 +1,27 @@ +#!/bin/ash + +# +# Copyright (C) 2026 Nethesis S.r.l. +# SPDX-License-Identifier: GPL-3.0-or-later +# + +# shellcheck shell=dash disable=SC3045 + +set -e + +rclone ${DEBUG:+-vvvv} \ + rcd \ + --rc-no-auth \ + --rc-addr "${RCLONE_UNIX_SOCKET}" \ + & +echo "Initialize backends..." +rclone-serve-restart +echo "Initialize frontend..." +haproxy ${DEBUG:+-V} \ + -W \ + -db \ + -p /var/lib/rclone/haproxy.pid \ + -f /etc/haproxy/haproxy.cfg \ + 2>/dev/null & + +wait -n diff --git a/core/rclone/usr/local/bin/rclone-serve-restart b/core/rclone/usr/local/bin/rclone-serve-restart new file mode 100755 index 000000000..56e8ecbe8 --- /dev/null +++ b/core/rclone/usr/local/bin/rclone-serve-restart @@ -0,0 +1,138 @@ +#!/usr/bin/python3 + +# +# Copyright (C) 2026 Nethesis S.r.l. +# SPDX-License-Identifier: GPL-3.0-or-later +# + +import sys +import json +import re +import subprocess + +RCLONE_LOW_LEVEL_RETRIES = 1 +RCLONE_CONNECT_TIMEOUT = 5 * 1_000_000_000 # = 5 seconds + +def serve_combined_remote(): + """Some rclone remote types, like sftp, fail to start on connection + errors. When they are combined with other remotes they block the whole + server. This function monitors the "combined:" remote startup, and + discards failing remotes. 
It ensures a minimal set of remotes is + served properly.""" + + try: + joutput = subprocess.check_output(['rclone', 'rc', 'config/get', 'name=combined']) + upstreams = json.loads(joutput)["upstreams"] + except Exception: + log("No upstream defined, server not started") + return + + while upstreams: + start_webdav_input = { + "type": "webdav", + "addr": "unix:///var/lib/rclone/backend-webdav.sock", + "vfs_cache_mode": "off", + "fs-cache-expire-duration": "0", + "fs": f":combine,upstreams='{upstreams.replace("'","''")}':", + "_config": { + "LowLevelRetries": RCLONE_LOW_LEVEL_RETRIES, + "ConnectTimeout": RCLONE_CONNECT_TIMEOUT, + }, + } + proc_start = subprocess.run(['rclone', 'rc', 'serve/start', '--json', json.dumps(start_webdav_input)], stdout=subprocess.PIPE, text=True) + if proc_start.returncode == 0: + break + else: + joutput = proc_start.stdout + oerror = json.loads(joutput) + failed_upstream = extract_failed_upstream_remote(oerror) + if not failed_upstream: + raise Exception(joutput["error"]) + upstreams = remove_upstream(upstreams, failed_upstream) + log(f"Remote {failed_upstream} has been discarded. 
Reason:", oerror["error"]) + else: + log("No upstream left, server not started") + return + + # Start the rest backend with the same upstreams: + start_rest_input = { + "type": "restic", + "addr": "unix:///var/lib/rclone/backend-restic.sock", + "vfs_cache_mode": "off", + "fs_cache_expire_duration": 0, + "cache_objects": "false", + "fs": f":combine,upstreams='{upstreams.replace("'","''")}':", + "_config": { + "LowLevelRetries": RCLONE_LOW_LEVEL_RETRIES, + "ConnectTimeout": RCLONE_CONNECT_TIMEOUT, + }, + } + subprocess.run(['rclone', 'rc', 'serve/start', '--json', json.dumps(start_rest_input)], stdout=subprocess.DEVNULL, text=True, check=True) + +def log(*args): + print("[rclone-serve-start]", *args, file=sys.stderr) + +def extract_failed_upstream_remote(data: dict) -> str | None: + if data.get("status") != 500: + return None + error = data.get("error", "") + if not error.startswith("failed to create upstream "): + return None + m = re.search(r'([0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})', error) + return m.group(1) if m else None + +def parse_upstreams(upstreams: str) -> list[str]: + """Parse the rclone combine upstreams string into a list of raw upstream tokens.""" + tokens = [] + i = 0 + s = upstreams + while i < len(s): + if s[i] == ' ': + i += 1 + continue + if s[i] == '"': + i += 1 # skip opening quote + token = [] + while i < len(s): + if s[i] == '"': + if i + 1 < len(s) and s[i + 1] == '"': + token.append('"') # escaped quote + i += 2 + else: + i += 1 # skip closing quote + break + else: + token.append(s[i]) + i += 1 + tokens.append(''.join(token)) + else: + raise ValueError(f"Unexpected character at position {i}: {s[i]!r}") + return tokens + +def format_upstreams(tokens: list[str]) -> str: + """Serialize a list of upstream tokens back to the rclone combine upstreams string.""" + parts = [] + for token in tokens: + escaped = token.replace('"', '""') + parts.append(f'"{escaped}"') + return ' '.join(parts) + +def remove_upstream(upstreams: 
str, remote_uuid: str) -> str: + """Remove the upstream entry matching the given UUID from the upstreams string.""" + tokens = parse_upstreams(upstreams) + filtered = [t for t in tokens if not t.startswith(f"{remote_uuid}=")] + return format_upstreams(filtered) + +def stop_all_remotes(): + subprocess.check_output(['rclone', 'rc', 'serve/stopall']) + +def main(): + try: + stop_all_remotes() + serve_combined_remote() + except Exception as ex: + log(ex) + sys.exit(1) + +if __name__ == '__main__': + main() diff --git a/core/restic/usr/local/bin/rclone-wrapper b/core/rclone/usr/local/bin/rclone-wrapper similarity index 100% rename from core/restic/usr/local/bin/rclone-wrapper rename to core/rclone/usr/local/bin/rclone-wrapper diff --git a/core/rclone/usr/local/bin/reload-config b/core/rclone/usr/local/bin/reload-config new file mode 100755 index 000000000..34b2da397 --- /dev/null +++ b/core/rclone/usr/local/bin/reload-config @@ -0,0 +1,16 @@ +#!/bin/ash + +# +# Copyright (C) 2026 Nethesis S.r.l. +# SPDX-License-Identifier: GPL-3.0-or-later +# + +# shellcheck shell=dash + +echo "Restarting backends..." +rclone-serve-restart +rclone_rc=$? +echo "Reloading frontend..." +# shellcheck disable=SC2046 +[ -f /var/lib/rclone/haproxy.pid ] && kill -HUP $(cat /var/lib/rclone/haproxy.pid) +exit "${rclone_rc}" diff --git a/docs/core/backup_restore.md b/docs/core/backup_restore.md index 69a234ec0..5aaa70b4e 100644 --- a/docs/core/backup_restore.md +++ b/docs/core/backup_restore.md @@ -1,134 +1,60 @@ --- layout: default -title: Backup & Restore +title: Backup and Restore nav_order: 10 parent: Core --- -# Backup & Restore +# Backup and Restore -The backup and restore is a procedure for disaster-recovery scenario: it can be used to save the data of an installed -module and restore it to a different node or cluster. +NS8 core provides a backup and restore infrastructure for applications, +and a procedure tailored for single node cluster disaster recovery. 
* TOC {:toc} -## Design +## Core architecture + +- A backup **destination** is a remote filesystem accessible with Rclone, + where backup data is saved. It's a cluster object persisted in Redis, + under the restricted `private/nodes/*` key space: only nodes can read + its contents. Secrets and configurations are stored in Rclone-format in + the HASH key `private/nodes/backup_destination/rclone_conf`, indexed by + destination ID. Other HASH keys are `cluster/backup_repository/`, with public backup destination attributes, and the HASH key + `private/agents/backup_destination/restic_password`, indexed by + destination ID: it contains the encryption password of all Restic + repositories in the backup destination, shared among the applications + that use it. + +- A **backup** is the relation between a periodic schedule of the + backup, a backup destination, and a set of applications to backup. It + also has auxiliary attributes, like the backup snapshot retention. It's + a cluster object persisted in Redis in HASH keys in the form + `cluster/backup/`. The backup ID is generated from a + sequence: `cluster/backup_sequence`. + +- Every cluster node manages a pool of Systemd timers with the + `backup-timers.service` unit, to start the backups at the scheduled + times. The node `run-backup` helper command iterates over the configured + applications and executes their `module-backup` procedure, by `runagent` + impersonation. The `run-backup` command also updates the backup status + of every application in Redis key `node//backup_status/`, and writes an overall backup status into + `/run/node_exporter/backup.prom` files. It also uploads + `.json` files with backup repositories metadata. When executed on the + leader node, uploads a copy of the encrypted cluster backup. + +- Every cluster node runs a `rclone-gateway` service, an authenticated + HTTP proxy server. 
Access is granted to nodes and cluster applications + providing their Redis credentials (environment variables `REDIS_USER`, + `REDIS_PASSWORD`). The HTTP proxy server is a frontend to two internal + `rclone serve` backends: `webdav` and `rest`. Nodes can access both, + whilst applications can access a restricted set of operations of the + Restic Rest server, and only on the data they own. Application backup + jobs (Restic) access their repository through the REST endpoint. See + also the [module backup documentation]({{site.baseurl}}/modules/backup_restore). -The backup engine is [Restic](https://restic.net/) which runs inside a -container along with [Rclone](https://rclone.org/) used to inspect the -backup repository contents. - -Backups are saved inside *backup destinations*, remote spaces where data -are stored. A backup destination can contain multiple backup instances, -each module instance has its own sub-directory to avoid conflicts. This -sub-directory is the root of the module instance Restic repository. - -The system implements the common logic for backup inside the agent with `module-backup` command. -Each module can implement `module-dump-state` and `module-cleanup-state` to prepare/cleanup the data that has to be included in the backup. -The `state/environment` file is always included inside the backup, as it is required by the `cluster/restore-module` action. -The restore is implemented using a `restore-module` action inside the module agent, each module can extend it to implement specific restore steps. -The basic `10restore` step actually runs the Restic restore procedure in a temporary container. - -All backups are scheduled by systemd timers. Given a backup with id `1`, it is possible to retrieve the time status with: -- rootless containers, eg. `dokuwiki1`, executed by `dokuwiki1` user: `systemctl --user status backup1.timer` -- rootfull containers, eg. 
`dnsmasq1`, executed by `root` user: `systemctl status backup1-dnsmasq1.timer` - -## Include and exclude files - -Whenever possible, containers should use volumes to avoid UID/GID -namespace mappings and SELinux issues during backup an restore. - -Includes can be added to the `state-include.conf` file saved inside `AGENT_INSTALL_DIR/etc/`. -In the [source tree](modules/images/#source-tree), the file should be placed under `/imageroot/etc/state-include.conf`. -On installed modules, the file will appear on different paths: -- rootless containers, eg. `dokuwiki1`, full path will be `/home/dokuwiki1/.config/etc/state-include.conf` -- rootfull containers, eg. `dnsmasq1`, full path will be `/var/lib/nethserver/dnsmasq1/etc/state-include.conf` - -Lines are interpreted as path patterns. Only patterns referring to -volumes and the agent `state/` directory are considered. - -Lines starting with `state/` refer to `AGENT_STATE_DIR` contents. Eg. to -include `mykey.dump` under the `AGENT_STATE_DIR` add - - state/mykey.dump - -Lines starting with `volumes/` will be mapped to a volume name. Eg. to -include the whole `dokuwiki-data` volume add - - volumes/dokuwiki-data - -Internally, volumes will be mapped as: - -- `` (1-1) for rootless containers; eg. for module - `dokuwiki1`, line prefix `volumes/dokuwiki-data` maps to volume name - `dokuwiki-data` - -- `-` for rootfull containers; eg. for module - `dnsmasq1`, line prefix `volumes/ data` maps to volume name `dnsmasq1-data` - -Volumes listed in `state-include.conf` are automatically mounted (and -created if necessary) by the basic `10restore` step of the -`restore-module` action. - -Excludes can be added to `state-exclude.conf` file saved inside the `AGENT_INSTALL_DIR`. 
- -For a complete explanation of the patterns, like wildcard characters, see -the official Restic documentation to -[include](https://restic.readthedocs.io/en/stable/040_backup.html#including-files) -and -[exclude](https://restic.readthedocs.io/en/stable/040_backup.html#excluding-files) -files. Note that include and exclude patterns have a slight different -syntax. - -## Save and restore Redis keys - -### Dump key and state - -To save a Redis key, you should: -- dump the key inside the `module-dump-state` command -- include the dump inside the backup - -Given a module named `mymodule`, create the file `mymodule/imageroot/bin/module-dump-state` inside the module source tree: -``` -#!/bin/bash -redis-dump module/mymodule1/mykey > mykey.dump -``` - -Make sure also `module-dump-state` is executable: -``` -chmod a+x mymodule/imageroot/bin/module-dump-state -``` - -Then, add the key dump path to `mymodule/imageroot/etc/state-include.conf`: -``` -state/mykey.dump -``` - -### Cleanup state - -As best practice, the dump should be removed when the backup has completed. - -Given a module named `mymodule`, create the file `mymodule/imageroot/bin/module-cleanup-state` inside the module source tree: -``` -#!/bin/bash -rm -f mykey.dump -``` - -Make sure also `module-cleanup-state` is executable: -``` -chmod a+x mymodule/imageroot/bin/module-cleanup-state -``` - -### Restore key - -To restore a Redis key, you should add a step inside the `restore-module` action, after index 10. - -Given a module named `mymodule`, create a file named `mymodule/imageroot/actions/restore-module/20loadkey` inside the module source tree: -``` -#!/bin/bash -redis-restore mymodule1/mykey < mykey.dump -``` ## Execute a backup @@ -141,55 +67,49 @@ The flow to execute a backup will be something like: - execute the backup 1. 
Create the repository: -``` -api-cli run add-backup-repository --data '{"name":"BackBlaze repo1","url":"b2:backupns8","parameters":{"b2_account_id":"xxxxxxxxxxxxxxxxxxxxxxxxx","b2_account_key":"XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"},"provider":"backblaze","password":"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"}' -``` + + api-cli run add-backup-repository --data '{"name":"BackBlaze repo1","url":"b2:backupns8","parameters":{"b2_account_id":"xxxxxxxxxxxxxxxxxxxxxxxxx","b2_account_key":"XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"},"provider":"backblaze","password":"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"}' 2. The output will be something like, please note the `id` field: -```json -{"password": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", "id": "48ce000a-79b7-5fe6-8558-177fd70c27b4"} -``` -3. Create a new daily backup named `mybackup` with a retention of 3 snapshots (3 days) which includes `dokuwiki1` and `dnsmasq1` instances: -``` -api-cli run add-backup --data '{"repository":"48ce000a-79b7-5fe6-8558-177fd70c27b4","schedule":"daily","retention":3,"instances":["dokuwiki1","dnsmasq1"],"enabled":true, "name":"mybackup"}' -``` + {"password": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", "id": "48ce000a-79b7-5fe6-8558-177fd70c27b4"} + +3. Create a new daily backup named `mybackup` with a retention of 3 + snapshots (3 days) which includes `dokuwiki1` and `dnsmasq1` instances: + + api-cli run add-backup --data '{"repository":"48ce000a-79b7-5fe6-8558-177fd70c27b4","schedule":"daily","retention":3,"instances":["dokuwiki1","dnsmasq1"],"enabled":true, "name":"mybackup"}' -4. The output will the id of the backup: -``` -1 -``` + +4. The output will the id of the backup, e.g. `1` 5. Run the backup with id `1`: -``` -api-cli run run-backup --data '{"id":1}' -``` -For debugging purposes, you can also launch systemd units: -- rootless container, eg. 
`dokuwiki1`: `runagent -m dokuwiki1 systemctl --user start backup1.service` -- rootfull container, eg. `dnsmasq1`: systemctl start backup1-dnsmasq1.service + runagent -m node run-backup --backup 1 To remove the backup use: -``` -api-cli run remove-backup --data '{"id":1}' -``` + + api-cli run remove-backup --data '{"id":1}' + To remove the backup repository: -``` -api-cli run remove-backup-repository --data '{"id":"c7a9cfea-303c-5104-8ab7-39ac9f9842bd"}' -``` + + api-cli run remove-backup-repository --data '{"id":"c7a9cfea-303c-5104-8ab7-39ac9f9842bd"}' ## Execute a restore Before executing a restore, the backup repository should already be configured. -Restore the `dokuwiki1` instance at node `1` from repository `48ce000a-79b7-5fe6-8558-177fd70c27b4`: -``` -api-cli run cluster/restore-module --data '{"node":1, "repository":"48ce000a-79b7-5fe6-8558-177fd70c27b4", "path":"dokuwiki/dokuwiki1@3792c7db-9450-4bd3-84a3-034cd0087839","snapshot":""}' -``` +Restore latest snapshot of the `dokuwiki1` module on node `1` from +repository `48ce000a-79b7-5fe6-8558-177fd70c27b4`: + + api-cli run cluster/restore-module --data '{"node":1, "repository":"48ce000a-79b7-5fe6-8558-177fd70c27b4", "path":"dokuwiki/3792c7db-9450-4bd3-84a3-034cd0087839","snapshot":""}' ## Cluster configuration backup +The `run-backup` command on the leader node automatically uploads an +encrypted copy of the cluster backup to each backup destination, +alongside the application backups. + The `cluster/download-cluster-backup` API returns a random URL path where an encrypted archive is available for download. @@ -209,9 +129,10 @@ installation: upload the file and specify the password from the UI. ## The `restic-wrapper` command -The Restic binary is not installed in the host system. NS8 runs Restic -within a core container, preparing environment variables with values read -from the Redis DB and properly mounting the application Podman volumes. +The Restic binary is not provided by the Linux distribution. 
Instead, NS8 +runs Restic within a core container, preparing environment variables with +values read from the Redis DB and properly mounting the application Podman +volumes. The `restic-wrapper` command is designed to manually run Restic from the command line. It can help to restore individual files and directories, or diff --git a/docs/core/database.md b/docs/core/database.md index bc5ef5ab5..4f271f0df 100644 --- a/docs/core/database.md +++ b/docs/core/database.md @@ -384,3 +384,23 @@ For instance "data": ["cancel-task", "list-actions"] } ``` + +### private/agents/ + +Access to keys under the `private/agents/` prefix is restricted to cluster +agent (read + write) and other authenticated agents (read-only). + +|key|type|description| +|---|----|-----------| +|private/agents/backup_destination/restic_password | HASH | Maps a backup destination ID to its restic password | + + +### private/nodes/ + +Access to keys under the `private/nodes/` prefix is restricted to cluster +agent (read + write) and node agents (read-only). + +|key|type|description| +|---|----|-----------| +|private/nodes/backup_destination/rclone_conf | HASH | Maps a backup destination ID to its raw rclone configuration, that typically contains secret credentials | +|private/nodes/restore_uuid | HASH | Maps a destination module UUID to a source module UUID. 
During module restoration the destination module must read the source module backup repository and this map establishes the required trust between them, which is enforced by rclone-gateway.|
diff --git a/docs/modules/backup_restore.md b/docs/modules/backup_restore.md
new file mode 100644
index 000000000..7a9860563
--- /dev/null
+++ b/docs/modules/backup_restore.md
@@ -0,0 +1,149 @@
+---
+layout: default
+title: Backup and Restore
+nav_order: 60
+parent: Modules
+---
+
+# Backup and Restore
+
+* TOC
+{:toc}
+
+## General concepts
+
+The module backup engine is [Restic](https://restic.net/) which runs
+inside a container (see `RESTIC_IMAGE` in `/etc/nethserver/core.env`).
+
+A module backup is saved under a reserved Restic repository inside one or
+more *backup destinations*. A module has no direct access to destinations
+and their secret configuration: it must access them through the local node
+`rclone-gateway` HTTP proxy service, providing the module's Redis
+credentials. The URL to access its Restic repository has this form:
+
+    http://127.0.0.1:4694///
+
+The URL path has three levels:
+
+1. The destination ID, in UUID syntax.
+2. The application type, the module ID without the trailing number, e.g. `traefik`.
+3. The module UUID
+
+## Entry points
+
+The `module-backup` command is provided by NS8 core and is executed with
+module's privileges to backup the module.
+
+The module itself can implement two commands: `module-dump-state` and
+`module-cleanup-state` to prepare/cleanup the data that has to be included
+in the backup.
+
+The module `state/environment` file is always included inside the backup,
+as it is required by the cluster during restoration
+(`cluster/restore-module` action).
+
+The restore is implemented using the module `restore-module` action. The
+core provides a basic implementation, and the module can extend it to
+implement specific restore steps.
The basic `10restore` step actually runs +the Restic restore procedure in a temporary container and extracts the +backup data to the `state/` directory and to Podman named volumes. + +## Include and exclude files + +Whenever possible, containers should use volumes to avoid UID/GID +namespace mappings and SELinux issues during backup an restore. + +Includes can be added to the `state-include.conf` file saved inside `AGENT_INSTALL_DIR/etc/`. + +In the [source tree](modules/images/#source-tree), the file should be +placed under `/imageroot/etc/state-include.conf`. On installed +modules, the file will appear on different paths: + +- rootless containers, eg. `dokuwiki1`, full path will be + `/home/dokuwiki1/.config/etc/state-include.conf` + +- rootfull containers, eg. `dnsmasq1`, full path will be + `/var/lib/nethserver/dnsmasq1/etc/state-include.conf` + +Lines are interpreted as path patterns. Only patterns referring to volumes +and the agent `state/` directory are considered. + +Lines starting with `state/` refer to `AGENT_STATE_DIR` contents. Eg. to +include `mykey.dump` under the `AGENT_STATE_DIR` add + + state/mykey.dump + +Lines starting with `volumes/` will be mapped to a volume name. Eg. to +include the whole `dokuwiki-data` volume add + + volumes/dokuwiki-data + +Internally, volumes will be mapped as: + +- `` (1-1) for rootless containers; eg. for module + `dokuwiki1`, line prefix `volumes/dokuwiki-data` maps to volume name + `dokuwiki-data` + +- `-` for rootfull containers; eg. for module + `dnsmasq1`, line prefix `volumes/ data` maps to volume name `dnsmasq1-data` + +Volumes listed in `state-include.conf` are automatically mounted (and +created if necessary) by the basic `10restore` step of the +`restore-module` action. + +Excludes can be added to `state-exclude.conf` file saved inside the `AGENT_INSTALL_DIR`. 
+ +For a complete explanation of the patterns, like wildcard characters, see +the official Restic documentation to +[include](https://restic.readthedocs.io/en/stable/040_backup.html#including-files) +and +[exclude](https://restic.readthedocs.io/en/stable/040_backup.html#excluding-files) +files. Note that include and exclude patterns have a slight different +syntax. + +## Save Redis keys + +To save a Redis key, you should: + +- dump the key inside the `module-dump-state` command +- include the dump inside the backup + +Given a module named `mymodule`, create the file +`mymodule/imageroot/bin/module-dump-state` inside the module source tree: + + #!/bin/bash + redis-dump module/mymodule1/mykey > mykey.dump + + +Make sure also `module-dump-state` is executable: + + chmod a+x mymodule/imageroot/bin/module-dump-state + + +Then, add the key dump path to `mymodule/imageroot/etc/state-include.conf`: + + state/mykey.dump + + +## Cleanup state + +As best practice, the dump should be removed when the backup has completed. + +Given a module named `mymodule`, create the file `mymodule/imageroot/bin/module-cleanup-state` inside the module source tree: + + #!/bin/bash + rm -f mykey.dump + +Make sure also `module-cleanup-state` is executable: + + chmod a+x mymodule/imageroot/bin/module-cleanup-state + + +## Restore Redis keys + +To restore a Redis key, you should add a step inside the `restore-module` action, after index 10. + +Given a module named `mymodule`, create a file named `mymodule/imageroot/actions/restore-module/20loadkey` inside the module source tree: + + #!/bin/bash + redis-restore mymodule1/mykey < mykey.dump From 614594eeae0722bcecfd3b7ac5313ea324aab5b0 Mon Sep 17 00:00:00 2001 From: Davide Principi Date: Fri, 13 Feb 2026 19:11:28 +0100 Subject: [PATCH 03/27] feat(backup): store rclone config in private Redis - Add per-node destination validation. - Use stable UUIDs, generate rclone.conf and store it in new Redis HASH key private/nodes/backup_destination/rclone_conf. 
Hash keys are the UUIDs of destinations. - Added cluster.backup.run_rclone() helper. - Update add/alter actions to write public data to cluster/backup_repository/ and secrets to private/nodes/backup_destination/parameters/ with parallel node validation via validate-backup-destination action. - Remove action cleans private keys. - Replaced add-backup-repository/10validate with non-executable placeholder to work around legacy update bug. --- .../usr/local/agent/bin/rclone-wrapper | 111 +-------- .../usr/local/agent/pypkg/cluster/backup.py | 211 ++++++++++++++++++ .../actions/add-backup-repository/10validate | 59 +---- .../actions/add-backup-repository/50update | 127 +++++++---- .../actions/alter-backup-repository/50update | 95 +++++--- .../actions/list-backup-repositories/50list | 14 +- .../validate-output.json | 8 +- .../50remove_backup_repository | 16 +- .../50validate_backup_destination | 51 +++++ .../validate-input.json | 30 +++ 10 files changed, 456 insertions(+), 266 deletions(-) mode change 100755 => 100644 core/imageroot/var/lib/nethserver/cluster/actions/add-backup-repository/10validate create mode 100755 core/imageroot/var/lib/nethserver/node/actions/validate-backup-destination/50validate_backup_destination create mode 100644 core/imageroot/var/lib/nethserver/node/actions/validate-backup-destination/validate-input.json diff --git a/core/imageroot/usr/local/agent/bin/rclone-wrapper b/core/imageroot/usr/local/agent/bin/rclone-wrapper index b94c1e321..737136081 100755 --- a/core/imageroot/usr/local/agent/bin/rclone-wrapper +++ b/core/imageroot/usr/local/agent/bin/rclone-wrapper @@ -8,18 +8,6 @@ import sys import agent import os -from urllib.parse import urlparse - -def extract_region_code(hostname, position, default=""): - """Obtain the region code for S3 backends by looking at the domain - parts of the endpoint FQDN.""" - try: - return hostname.split('.')[position] - except ValueError: - return default - -core_env = 
agent.read_envfile("/etc/nethserver/core.env") -rclone_image = core_env["RESTIC_IMAGE"] rdb = agent.redis_connect(privileged=False, host="127.0.0.1") try: @@ -32,110 +20,19 @@ except IndexError: print("e.g.: rclone-wrapper 1 ls REMOTE_PATH/dokuwiki", file=sys.stderr) sys.exit(33) -if repository_id == "0": - # Read orepo attributes directly from the environment. - orepo = os.environ -elif repository_id.isnumeric(): +if repository_id.isnumeric(): # Assume a backup ID has been issued. Translate it to a repository ID. repository_id = rdb.hget(f"cluster/backup/{repository_id}", "repository") - orepo = rdb.hgetall(f"cluster/backup_repository/{repository_id}") -else: - # Read orepo from Redis. - orepo = rdb.hgetall(f"cluster/backup_repository/{repository_id}") -if not orepo: +if not rdb.exists(f"cluster/backup_repository/{repository_id}"): print(f"Could not find any repo with {sys.argv[1]}", file=sys.stderr) sys.exit(34) -# Parse URL and prepare RCLONE_* environment variables -uscheme, upath = orepo['url'].split(':', 1) - -if uscheme == "b2": - rclone_path = ':b2:' + upath - rclone_env = { - 'RCLONE_B2_ACCOUNT': orepo["b2_account_id"], - 'RCLONE_B2_KEY': orepo["b2_account_key"], - } -elif uscheme == "s3": - s3_endpoint, s3_path = upath.split('/', 1) - rclone_path = ':s3:' + s3_path - rclone_env = { - 'RCLONE_S3_ENV_AUTH': 'true', - 'RCLONE_S3_ACCESS_KEY_ID': orepo["aws_access_key_id"], - 'RCLONE_S3_SECRET_ACCESS_KEY': orepo["aws_secret_access_key"], - 'RCLONE_S3_ENDPOINT': s3_endpoint - } - if orepo['provider'] == 'aws': - rclone_env['RCLONE_S3_PROVIDER'] = 'AWS' - rclone_env['RCLONE_S3_REGION'] = orepo.get("aws_default_region", "") - elif orepo['provider'] == 'generic-s3' and 'digitalocean' in s3_endpoint: - rclone_env['RCLONE_S3_PROVIDER'] = 'DigitalOcean' - elif orepo['provider'] == 'generic-s3' and 'ovh.net' in s3_endpoint: - rclone_env['RCLONE_S3_PROVIDER'] = 'Other' - rclone_env['RCLONE_S3_REGION'] = orepo.get("aws_default_region", extract_region_code(s3_endpoint, 
1)) - rclone_env['RCLONE_S3_LOCATION_CONSTRAINT'] = rclone_env['RCLONE_S3_REGION'] - elif orepo['provider'] == 'generic-s3' and 'wasabi' in s3_endpoint: - rclone_env['RCLONE_S3_PROVIDER'] = 'Wasabi' - rclone_env['RCLONE_S3_REGION'] = orepo.get("aws_default_region", extract_region_code(s3_endpoint, 1)) - elif orepo['provider'] == 'generic-s3' and 'synology' in s3_endpoint: - rclone_env['RCLONE_S3_PROVIDER'] = 'Synology' - rclone_env['RCLONE_S3_REGION'] = orepo.get("aws_default_region", extract_region_code(s3_endpoint, 0)) - rclone_env['RCLONE_S3_NO_CHECK_BUCKET'] = "1" - elif orepo['provider'] == 'generic-s3' and 'your-objectstorage.com' in s3_endpoint: - rclone_env['RCLONE_S3_PROVIDER'] = 'Other' # Hetzner - rclone_env['RCLONE_S3_REGION'] = orepo.get("aws_default_region", extract_region_code(s3_endpoint, 0)) - elif orepo['provider'] == 'generic-s3' and 'idrivee2' in s3_endpoint: - rclone_env['RCLONE_S3_PROVIDER'] = 'IDrive' - rclone_env['RCLONE_S3_NO_CHECK_BUCKET'] = "1" - elif orepo['provider'] == 'generic-s3' and 'cubbit.eu' in s3_endpoint: - rclone_env['RCLONE_S3_PROVIDER'] = 'Other' - rclone_env['RCLONE_S3_REGION'] = 'eu-west-1' - elif orepo['provider'] == 'generic-s3' and 'scalablestorage.it' in s3_endpoint: - rclone_env['RCLONE_S3_PROVIDER'] = 'Other' # Cloudfire - rclone_env['RCLONE_S3_NO_CHECK_BUCKET'] = "1" - else: - rclone_env['RCLONE_S3_PROVIDER'] = 'Other' - -elif uscheme == "azure": - rclone_path = ':azureblob:' + upath.rstrip(":") - rclone_env = { - 'RCLONE_AZUREBLOB_ACCOUNT': orepo["azure_account_name"], - 'RCLONE_AZUREBLOB_KEY': orepo["azure_account_key"], - } -elif uscheme == "smb": - rclone_path = ':smb:' + upath.rstrip(":") - rclone_env = { - "RCLONE_SMB_HOST": orepo["smb_host"], - "RCLONE_SMB_USER": orepo["smb_user"], - "RCLONE_SMB_PASS": orepo["smb_pass"], - "RCLONE_SMB_DOMAIN": orepo["smb_domain"], - } -elif uscheme == "webdav": - ourl = urlparse(upath) - rclone_path = ':webdav:' + ourl.path.rstrip("/") - rclone_env = { - "RCLONE_WEBDAV_URL": 
ourl.scheme + '://' + ourl.netloc - } -else: - raise Exception(f"Scheme {uscheme} not supported") - # Build the Podman+Rclone command line -container_name = "rclone-wrapper-" + os.environ.get('MODULE_ID', os.environ["AGENT_ID"]) + "-" + str(os.getpid()) -exec_args = [ - "podman", "run", - "-i", "--attach=stdin", "--attach=stdout", "--attach=stderr", - "--log-driver=none", - "--env=RCLONE*", "--network=host", "--rm", - "--name=" + container_name, - '--entrypoint=["/usr/local/bin/rclone-wrapper"]', - rclone_image, - ] + sys.argv[2:] +exec_args = ["podman", "exec", "-i", "rclone-gateway", "rclone"] + sys.argv[2:] # Substitute REMOTE_PATH placeholder in Rclone args +rclone_path = f'combined:{repository_id}' exec_args = [rarg.replace('REMOTE_PATH', rclone_path) for rarg in exec_args] -if os.getenv('DEBUG', 0): - print(*([f"{k}={v}" for k,v in rclone_env.items()] + exec_args), file=sys.stderr) - -os.environ.update(rclone_env) os.execvp("podman", exec_args) diff --git a/core/imageroot/usr/local/agent/pypkg/cluster/backup.py b/core/imageroot/usr/local/agent/pypkg/cluster/backup.py index ebc920789..97490c3c2 100644 --- a/core/imageroot/usr/local/agent/pypkg/cluster/backup.py +++ b/core/imageroot/usr/local/agent/pypkg/cluster/backup.py @@ -20,6 +20,16 @@ import sys import subprocess +import urllib.parse +import json +import uuid +import os +import time +import agent +import configparser +import tempfile +import base64 +from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes def get_default_backup_repository_name(provider, url, rid=""): """Suggest a default name for a backup repository""" @@ -40,3 +50,204 @@ def get_default_backup_repository_name(provider, url, rid=""): def validate_schedule(schedule): """Check the given schedule string is good for Systemd timers""" return subprocess.run(['/usr/bin/systemd-analyze', 'calendar', schedule], stdout=subprocess.DEVNULL).returncode != 0 + +def run_rclone(rclone_args, temp_config=None, podman_args=[], 
**runargs):
+    with tempfile.NamedTemporaryFile(suffix='.conf', prefix="tmp", dir='rclone') as tfile:
+        if temp_config:
+            tfile_name = os.path.basename(tfile.name)
+            tfile.write(temp_config.encode('utf-8'))
+            tfile.flush()
+            os.fchmod(tfile.fileno(), 0o644)
+            podman_args = podman_args + [
+                f"--env=RCLONE_CONFIG=/etc/rclone/{tfile_name}",
+            ]
+        if 'input' in runargs:
+            podman_args = podman_args + ["-i"]
+        run_command_args = \
+            ["podman", "exec"] + podman_args + \
+            ["rclone-gateway", "rclone"] + rclone_args
+        return subprocess.run(run_command_args, **runargs)
+
+def extract_region_code(hostname, position, default=""):
+    """Obtain the region code for S3 backends by looking at the domain
+    parts of the endpoint FQDN."""
+    try:
+        return hostname.split('.')[position]
+    except (ValueError, IndexError):
+        return default
+
+def extract_rclone_basepath(url):
+    """Extract the destination path component from a backup URL"""
+    uschema, upath = url.split(':', 1)
+    if uschema == 's3':
+        if '/' in upath:
+            _, upath = upath.split('/', 1)
+    elif uschema == 'webdav':
+        upath = urllib.parse.urlparse(upath).path
+    return upath
+
+def generate_rclone_conf(dest_uuid, url, provider, params):
+    """Translate the input arguments in a rclone.conf-compatible string"""
+    uschema, upath = url.split(':', 1)
+    if uschema == 'b2':
+        rclone_conf = (
+            f"type = b2\n"
+            f"account = {params['b2_account_id']}\n"
+            f"key = {params['b2_account_key']}\n"
+        )
+    elif uschema == 's3':
+        rclone_conf = (
+            f"type = s3\n"
+            f"env_auth = true\n"
+            f"access_key_id = {params['aws_access_key_id']}\n"
+            f"secret_access_key = {params['aws_secret_access_key']}\n"
+        )
+        s3_endpoint = ""
+        if '/' in upath:
+            s3_endpoint, _ = upath.split('/', 1)
+        rclone_conf += f"endpoint = {s3_endpoint}\n"
+        if provider == 'aws':
+            rclone_conf += f"provider = AWS\n"
+            rclone_conf += f"region = {params.get('aws_default_region', '')}\n"
+        elif provider == 'generic-s3' and 'digitalocean' in s3_endpoint:
+            rclone_conf += f"provider = DigitalOcean\n"
+        elif
provider == 'generic-s3' and 'ovh.net' in s3_endpoint: + region = params.get("aws_default_region", extract_region_code(s3_endpoint, 1)) + rclone_conf += f"provider = Other\n" + rclone_conf += f"region = {region}\n" + rclone_conf += f"location_constraint = {region}\n" + elif provider == 'generic-s3' and 'wasabi' in s3_endpoint: + rclone_conf += f"provider = Wasabi\n" + rclone_conf += f"region = {params.get('aws_default_region', extract_region_code(s3_endpoint, 1))}\n" + elif provider == 'generic-s3' and 'synology' in s3_endpoint: + rclone_conf += f"provider = Synology\n" + rclone_conf += f"region = {params.get('aws_default_region', extract_region_code(s3_endpoint, 0))}\n" + rclone_conf += f"no_check_bucket = true\n" + elif provider == 'generic-s3' and 'your-objectstorage.com' in s3_endpoint: + rclone_conf += f"provider = Other\n" + rclone_conf += f"region = {params.get('aws_default_region', extract_region_code(s3_endpoint, 0))}\n" + elif provider == 'generic-s3' and 'idrivee2' in s3_endpoint: + rclone_conf += f"provider = IDrive\n" + rclone_conf += f"no_check_bucket = true\n" + elif provider == 'generic-s3' and 'cubbit.eu' in s3_endpoint: + rclone_conf += f"provider = Other\n" + rclone_conf += f"region = eu-west-1\n" + elif provider == 'generic-s3' and 'scalablestorage.it' in s3_endpoint: + rclone_conf += f"provider = Other\n" + rclone_conf += f"no_check_bucket = true\n" + else: + rclone_conf += f"provider = Other\n" + elif uschema == 'azure': + rclone_conf = ( + f"type = azureblob\n" + f"account = {params['azure_account_name']}\n" + f"key = {params['azure_account_key']}\n" + ) + elif uschema == 'smb': + obscured_pass = rclone_obscure(params['smb_pass']) + rclone_conf = ( + f"type = smb\n" + f"host = {params['smb_host']}\n" + f"user = {params['smb_user']}\n" + f"pass = {obscured_pass}\n" + f"domain = {params['smb_domain']}\n" + ) + elif uschema == 'webdav': + ourl = urllib.parse.urlparse(upath) + rclone_conf = ( + f"type = webdav\n" + f"url = 
{ourl.scheme}://{ourl.netloc}\n" + ) + else: + raise Exception(f"Schema {uschema} not supported") + return f"[{dest_uuid}]\n" + rclone_conf + +def stable_uuid_v5(data: dict) -> str: + payload = json.dumps( + data, + sort_keys=True, # stable order of keys + separators=(",", ":"), # remove whitespace for stability + ensure_ascii=False, # stable UTF-8 bytes for non-ascii + ).encode("utf-8") + return str(uuid.uuid5(uuid.NAMESPACE_URL, payload.decode("utf-8"))) + +def parse_rclone_params(rclone_conf): + """Parse a rclone remote configuration string and return a flat dictionary. + + The returned dictionary contains the raw rclone keys from the first + section of rclone_conf, plus aliases that map rclone-specific key names + back to the original param names expected by generate_rclone_conf(). + This ensures that parse_rclone_params(generate_rclone_conf(...)) is + a superset of the original params dict. + """ + cp = configparser.ConfigParser() + cp.read_string(rclone_conf) + section = cp.sections()[0] + result = dict(cp[section]) + + rtype = result.get('type', '') + if rtype == 'b2': + result['b2_account_id'] = result.get('account', '') + result['b2_account_key'] = result.pop('key', '') + elif rtype == 's3': + result['aws_access_key_id'] = result.get('access_key_id', '') + result['aws_secret_access_key'] = result.get('secret_access_key', '') + if 'region' in result: + result['aws_default_region'] = result['region'] + elif rtype == 'azureblob': + result['azure_account_name'] = result.get('account', '') + result['azure_account_key'] = result.pop('key', '') + elif rtype == 'smb': + result['smb_host'] = result.get('host', '') + result['smb_user'] = result.get('user', '') + result['smb_pass'] = rclone_reveal(result.pop('pass')) + result['smb_domain'] = result.get('domain', '') + + return result + +def lookup_node_from_webdav_url(rdb, url): + """Look up the node ID from its internal Rclone WebDAV URL, that + contains its VPN IP address.""" + # Parse a url like 
"webdav:http://10.5.4.1:4694" + try: + node_ip = url.split(":")[2].strip('/') + except IndexError: + return None + for node_id in set(rdb.hvals("cluster/module_node")): + if node_ip == rdb.hget(f"node/{node_id}/vpn", "ip_address"): + return node_id + return None + + +# Same key hardcoded in rclone source +_CRYPT_KEY = bytes([ + 0x9c, 0x93, 0x5b, 0x48, 0x73, 0x0a, 0x55, 0x4d, + 0x6b, 0xfd, 0x7c, 0x63, 0xc8, 0x86, 0xa9, 0x2b, + 0xd3, 0x90, 0x19, 0x8e, 0xb8, 0x12, 0x8a, 0xfb, + 0xf4, 0xde, 0x16, 0x2b, 0x8b, 0x95, 0xf6, 0x38, +]) + +_AES_BLOCK_SIZE = 16 # aes.BlockSize in Go + + +def rclone_obscure(plaintext: str) -> str: + """Obscure a value using AES-CTR, compatible with rclone's obscure format.""" + data = plaintext.encode() + iv = os.urandom(_AES_BLOCK_SIZE) + cipher = Cipher(algorithms.AES(_CRYPT_KEY), modes.CTR(iv)) + encryptor = cipher.encryptor() + ciphertext = encryptor.update(data) + encryptor.finalize() + return base64.urlsafe_b64encode(iv + ciphertext).rstrip(b"=").decode() + + +def rclone_reveal(obscured: str) -> str: + """Reveal a value obscured by rclone (or obscure() above).""" + # Restore padding stripped by rclone's RawURLEncoding + padded = obscured + "=" * (-len(obscured) % 4) + raw = base64.urlsafe_b64decode(padded) + if len(raw) < _AES_BLOCK_SIZE: + raise ValueError("input too short — is it actually obscured?") + iv, ciphertext = raw[:_AES_BLOCK_SIZE], raw[_AES_BLOCK_SIZE:] + cipher = Cipher(algorithms.AES(_CRYPT_KEY), modes.CTR(iv)) + decryptor = cipher.decryptor() + return (decryptor.update(ciphertext) + decryptor.finalize()).decode() diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/add-backup-repository/10validate b/core/imageroot/var/lib/nethserver/cluster/actions/add-backup-repository/10validate old mode 100755 new mode 100644 index 712d943ae..ec2400b2e --- a/core/imageroot/var/lib/nethserver/cluster/actions/add-backup-repository/10validate +++ b/core/imageroot/var/lib/nethserver/cluster/actions/add-backup-repository/10validate 
@@ -1,60 +1,3 @@ -#!/usr/bin/env python3 - # -# Copyright (C) 2021 Nethesis S.r.l. -# http://www.nethesis.it - nethserver@nethesis.it +# Non-executable file. Placeholder, see bug NethServer/dev#7058 # -# This script is part of NethServer. -# -# NethServer is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, -# or any later version. -# -# NethServer is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with NethServer. If not, see COPYING. -# - -import os -import sys -import json -import agent -import uuid -import subprocess - -errors = [] - -def exit_errors(errors): - agent.set_status('validation-failed') - json.dump(errors, fp=sys.stdout) - sys.exit(2) - -request = json.load(sys.stdin) -rdb = agent.redis_connect(privileged=False) - -agent.set_weight(os.path.basename(__file__), 0) # Validation step, no task progress at all - -rid = str(uuid.uuid5(uuid.NAMESPACE_URL, request['url'])) - -# check for duplicate names -if rdb.exists(f'cluster/backup_repository/{rid}'): - errors.append({'field':'url','parameter':'url','value':request['url'],'error':'backup_repository_exists'}) - exit_errors(errors) - -# validate repository credentials -renv = dict(os.environ) # copy the environment -renv.update(request['parameters']) # merge parameters -renv['url'] = request['url'] # add url -renv['provider'] = request['provider'] # add provider -pvalidate = subprocess.run(['rclone-wrapper', '0', '--low-level-retries', '1', '--contimeout', '10s', 'size', '-q', '--json', 'REMOTE_PATH'], text=True, capture_output=True, env=renv) -if pvalidate.returncode != 0: - print(agent.SD_DEBUG, "The 
rclone-wrapper probe command has failed:", pvalidate.stderr, file=sys.stderr) - errors.append({'field':'parameters','parameter':'parameters','value':pvalidate.stderr,'error':'backup_repository_not_accessible'}) - -if errors: - exit_errors(errors) diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/add-backup-repository/50update b/core/imageroot/var/lib/nethserver/cluster/actions/add-backup-repository/50update index 307da48b3..d3861a3e3 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/add-backup-repository/50update +++ b/core/imageroot/var/lib/nethserver/cluster/actions/add-backup-repository/50update @@ -1,23 +1,8 @@ #!/usr/bin/env python3 # -# Copyright (C) 2021 Nethesis S.r.l. -# http://www.nethesis.it - nethserver@nethesis.it -# -# This script is part of NethServer. -# -# NethServer is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, -# or any later version. -# -# NethServer is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with NethServer. If not, see COPYING. +# Copyright (C) 2026 Nethesis S.r.l. 
+# SPDX-License-Identifier: GPL-3.0-or-later # import os @@ -27,35 +12,83 @@ import uuid import agent import hashlib import cluster.backup +import agent.tasks -request = json.load(sys.stdin) -rdb = agent.redis_connect(privileged=True) - -url = request['url'] -provider = request['provider'] -rid = str(uuid.uuid5(uuid.NAMESPACE_URL, url)) - -if request['password']: - password = request['password'] -else: - # Generate random password for backup encryption - m = hashlib.sha256() - m.update(uuid.uuid4().bytes) - password = m.hexdigest() - -if request['name']: - name = request['name'] -else: - name = cluster.backup.get_default_backup_repository_name(provider, url, rid) - -data = {'url': url, 'password': password, 'name': name, 'provider': provider} - -# Setup extra parameters for repository access -for k in request['parameters']: - data[k] = request['parameters'][k] - -# Add the repository key -if not rdb.hset(f'cluster/backup_repository/{rid}', mapping=data): - sys.exit(1) +def main(): + request = json.load(sys.stdin) + rdb = agent.redis_connect(privileged=True) + # Calculate the repository (destination) UUID from a reduced version + # of the full request payload. This is a small protection against + # request replay: we want to avoid multiple repos with different + # Restic passwords. + reduced_request = request.copy() + reduced_request['password'] = "" + reduced_request['name'] = "" + destination_uuid = cluster.backup.stable_uuid_v5(reduced_request) + if rdb.exists(f'cluster/backup_repository/{destination_uuid}') \ + or rdb.hexists('private/nodes/backup_destination/rclone_conf', destination_uuid): + # This backup repository (destination) already exists. 
+ agent.set_status('validation-failed') + json.dump([{'field':'','parameter':'','value':'','error':'backup_repository_exists'}], fp=sys.stdout) + sys.exit(2) + url = request['url'] + provider = request['provider'] + if request['password']: + password = request['password'] + else: + # Generate random password for backup encryption + m = hashlib.sha256() + m.update(uuid.uuid4().bytes) + password = m.hexdigest() + if request['name']: + name = request['name'] + else: + name = cluster.backup.get_default_backup_repository_name(provider, url, destination_uuid) + public_data = {'url': url, 'name': name, 'provider': provider} + if provider == 'cluster': + rclone_conf = f"[{destination_uuid}]\ntype=local\n" + destination_basepath = '/srv/repo' + validation_nodes = set(cluster.backup.lookup_node_from_webdav_url(rdb, url) or '0') + else: + rclone_conf = cluster.backup.generate_rclone_conf(destination_uuid, url, provider, request['parameters']) + destination_basepath = cluster.backup.extract_rclone_basepath(url) + validation_nodes = set(rdb.hvals('cluster/module_node')) + # Prepare a list of tasks, one for each node, to run in parallel: + validation_tasks = [] + for node_id in validation_nodes: + validation_tasks.append({ + "action": "validate-backup-destination", + "agent_id": f"node/{node_id}", + "extra": {'isNotificationHidden': True}, + "data": { + "id": destination_uuid, + "basepath": destination_basepath, + "rclone_conf": rclone_conf, + }, + }) + node_task_results = agent.tasks.runp( + validation_tasks, + endpoint="redis://cluster-leader", + ) + # Fail if all nodes cannot reach the backup repository: + if all(ntr["exit_code"] != 0 for ntr in node_task_results): + print(*(ntr['error'] for ntr in node_task_results), file=sys.stderr, sep='\n') + agent.set_status('validation-failed') + json.dump([{'field':'parameters','parameter':'parameters','value':{},'error':'backup_repository_not_accessible'}], fp=sys.stdout) + sys.exit(2) + # Save backup destinations in Redis keys with 
access limited to node + # agents: + trx = rdb.pipeline() + trx.hdel('private/nodes/backup_destination/rclone_conf', destination_uuid) + trx.hset('private/nodes/backup_destination/rclone_conf', destination_uuid, rclone_conf) + trx.hdel('private/agents/backup_destination/restic_password', destination_uuid) + trx.hset('private/agents/backup_destination/restic_password', destination_uuid, password) + trx.delete('cluster/backup_repository/' + destination_uuid) + trx.hset('cluster/backup_repository/' + destination_uuid, mapping=public_data) + # XXX: publish event + if not trx.execute(): + sys.exit(1) + json.dump({'password': password, 'id': destination_uuid}, fp=sys.stdout) -json.dump({'password': password, 'id': rid}, fp=sys.stdout) +if __name__ == "__main__": + main() diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/alter-backup-repository/50update b/core/imageroot/var/lib/nethserver/cluster/actions/alter-backup-repository/50update index 186ba5a96..fd84e66a2 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/alter-backup-repository/50update +++ b/core/imageroot/var/lib/nethserver/cluster/actions/alter-backup-repository/50update @@ -27,43 +27,74 @@ import agent import cluster.backup import subprocess -request = json.load(sys.stdin) -rdb = agent.redis_connect(privileged=True) +def main(): + request = json.load(sys.stdin) + rdb = agent.redis_connect(privileged=True) -rprovider = request['provider'] -rid = request['id'] -rparameters = request.get('parameters', {}) + provider = request['provider'] + destination_uuid = request['id'] + parameters = request.get('parameters', {}) -url = rdb.hget(f'cluster/backup_repository/{rid}', 'url') -if not url: - agent.set_status('validation-failed') - json.dump([{'field':'id','parameter':'id','value': rid,'error':'backup_repository_not_found'}], fp=sys.stdout) - sys.exit(2) + if not rdb.hget(f'private/nodes/backup_destination/rclone_conf', destination_uuid): + agent.set_status('validation-failed') + 
json.dump([{'field':'id','parameter':'id','value': destination_uuid,'error':'backup_repository_not_found'}], fp=sys.stdout) + sys.exit(2) -if rprovider != rdb.hget(f'cluster/backup_repository/{rid}', 'provider'): - agent.set_status('validation-failed') - json.dump([{'field':'provider','parameter':'provider','value': rprovider,'error':'backup_provider_mismatch'}], fp=sys.stdout) - sys.exit(3) + if provider != rdb.hget(f'cluster/backup_repository/{destination_uuid}', 'provider'): + agent.set_status('validation-failed') + json.dump([{'field':'provider','parameter':'provider','value': provider,'error':'backup_provider_mismatch'}], fp=sys.stdout) + sys.exit(3) -# validate repository credentials -renv = dict(os.environ) # copy the environment -renv.update(request['parameters']) # merge parameters -renv['url'] = url # add url -renv['provider'] = request['provider'] # add provider -pvalidate = subprocess.run(['rclone-wrapper', '0', '--low-level-retries', '1', '--contimeout', '10s', 'size', '-q', '--json', 'REMOTE_PATH'], text=True, capture_output=True, env=renv) -if pvalidate.returncode != 0: - agent.set_status('validation-failed') - json.dump([{'field':'parameters','parameter':'parameters','value':pvalidate.stderr,'error':'backup_repository_not_accessible'}], fp=sys.stdout) - sys.exit(4) + # The url parameter is optional, if it's missing fetch the value from Redis: + url = request.get('url') or rdb.hget(f'cluster/backup_repository/{destination_uuid}', 'url') + if provider == 'cluster': + rclone_conf = f"[{destination_uuid}]\ntype=local\n" + destination_basepath = '/srv/repo' + validation_nodes = set(cluster.backup.lookup_node_from_webdav_url(rdb, url) or '0') + else: + rclone_conf = cluster.backup.generate_rclone_conf(destination_uuid, url, provider, parameters) + destination_basepath = cluster.backup.extract_rclone_basepath(url) + validation_nodes = set(rdb.hvals('cluster/module_node')) + # Prepare a list of tasks, one for each node, to run in parallel: + 
validation_tasks = [] + for node_id in validation_nodes: + validation_tasks.append({ + "action": "validate-backup-destination", + "agent_id": f"node/{node_id}", + "extra": {'isNotificationHidden': True}, + "data": { + "id": destination_uuid, + "basepath": destination_basepath, + "rclone_conf": rclone_conf, + }, + }) + node_task_results = agent.tasks.runp( + validation_tasks, + endpoint="redis://cluster-leader", + ) + # Fail if all nodes cannot reach the backup repository: + if all(ntr["exit_code"] != 0 for ntr in node_task_results): + agent.set_status('validation-failed') + json.dump([{'field':'parameters','parameter':'parameters','value':'','error':'backup_repository_not_accessible'}], fp=sys.stdout) + sys.exit(4) -if 'name' in request: - rname = request['name'] -else: - rname = rdb.hget(f'cluster/backup_repository/{rid}', 'name') or "" + if 'name' in request: + rname = request['name'] + else: + rname = rdb.hget(f'cluster/backup_repository/{destination_uuid}', 'name') or "" -if not rname: - rname = cluster.backup.get_default_backup_repository_name(rprovider, url, rid) + if not rname: + rname = cluster.backup.get_default_backup_repository_name(provider, url, destination_uuid) -data = dict(rparameters, name=rname) # add the name field to rparameters + public_data = {'url': url, 'name': rname, 'provider': request['provider']} + trx = rdb.pipeline() + trx.hdel('private/nodes/backup_destination/rclone_conf', destination_uuid) + trx.hset('private/nodes/backup_destination/rclone_conf', destination_uuid, rclone_conf) + trx.delete(f'cluster/backup_repository/{destination_uuid}') + trx.hset(f'cluster/backup_repository/{destination_uuid}', mapping=public_data) + # XXX: publish event + if not trx.execute(): + sys.exit(1) -rdb.hset(f'cluster/backup_repository/{rid}', mapping=data) +if __name__ == "__main__": + main() diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/list-backup-repositories/50list 
b/core/imageroot/var/lib/nethserver/cluster/actions/list-backup-repositories/50list index 5539769d8..e7f080c58 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/list-backup-repositories/50list +++ b/core/imageroot/var/lib/nethserver/cluster/actions/list-backup-repositories/50list @@ -24,21 +24,23 @@ import os import json import sys import agent +import cluster.backup -rdb = agent.redis_connect() +rdb = agent.redis_connect(privileged=True) brepos = [] password_exists = os.path.isfile('backup/passphrase') and os.stat('backup/passphrase').st_size != 0 -for krepo in rdb.scan_iter('cluster/backup_repository/*'): - parameters = rdb.hgetall(krepo) +for repoid, rclone_conf in rdb.hgetall('private/nodes/backup_destination/rclone_conf').items(): + parameters = rdb.hgetall(f'cluster/backup_repository/{repoid}') + restic_password = rdb.hget(f'private/agents/backup_destination/restic_password', repoid) or "" orepo = { - "id": krepo.removeprefix('cluster/backup_repository/'), + "id": repoid, "provider": parameters.pop('provider'), "name": parameters.pop('name', ''), "url": parameters.pop('url'), - "password": parameters.pop('password'), - "parameters": parameters + "password": restic_password, + "parameters": cluster.backup.parse_rclone_params(rclone_conf), } brepos.append(orepo) diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/list-cluster-backup-endpoints/validate-output.json b/core/imageroot/var/lib/nethserver/cluster/actions/list-cluster-backup-endpoints/validate-output.json index d5405a13f..a1196cc63 100644 --- a/core/imageroot/var/lib/nethserver/cluster/actions/list-cluster-backup-endpoints/validate-output.json +++ b/core/imageroot/var/lib/nethserver/cluster/actions/list-cluster-backup-endpoints/validate-output.json @@ -1,8 +1,8 @@ { "$schema": "http://json-schema.org/draft-07/schema#", - "title": "list-cluster-backup-repositories output", - "$id": "http://schema.nethserver.org/cluster/list-cluster-backup-repositories-output.json", - 
"description": "Get the list backup repository endpoints provided by cluster nodes", + "title": "list-cluster-backup-endpoints output", + "$id": "http://schema.nethserver.org/cluster/list-cluster-backup-endpoints-output.json", + "description": "Get the list of WebDAV backup endpoints provided by cluster nodes", "examples": [ { "endpoints": [ @@ -33,7 +33,7 @@ }, "url": { "type": "string", - "description": "rclone-like URL of the repository. See rclone-wrapper for details. The format must be valid input for the add-backup-repository action" + "description": "Base URL of rclone-gateway WebDAV service" } } } diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/remove-backup-repository/50remove_backup_repository b/core/imageroot/var/lib/nethserver/cluster/actions/remove-backup-repository/50remove_backup_repository index cb0e2a976..eeb3ea323 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/remove-backup-repository/50remove_backup_repository +++ b/core/imageroot/var/lib/nethserver/cluster/actions/remove-backup-repository/50remove_backup_repository @@ -43,25 +43,17 @@ trx = rdb.pipeline() # Drop related backup objects # ptasks = [] +# XXX use an index instead of scan for kbid in rdb.scan_iter('cluster/backup/*'): repo_id = rdb.hget(kbid, 'repository') if repo_id != rid: continue # backup not related, skip - trx.delete(kbid) - for kbackups in rdb.scan_iter('module/*/backups'): - mbid = kbid.removeprefix('cluster/backup/') - if rdb.srem(kbackups, mbid) > 0: - ptasks.append({ - "agent_id": kbackups.removesuffix('/backups'), - "action": "configure-backup", - "data": { - "id": int(mbid), - "optype": "remove", - } - }) trx.delete(f"cluster/backup_repository/{rid}") +trx.hdel('private/nodes/backup_destination/rclone_conf', rid) +trx.hdel('private/agents/backup_destination/restic_password', rid) +# XXX: publish event trx.execute() terrors = agent.tasks.runp_brief(ptasks, diff --git 
a/core/imageroot/var/lib/nethserver/node/actions/validate-backup-destination/50validate_backup_destination b/core/imageroot/var/lib/nethserver/node/actions/validate-backup-destination/50validate_backup_destination new file mode 100755 index 000000000..74a41ada2 --- /dev/null +++ b/core/imageroot/var/lib/nethserver/node/actions/validate-backup-destination/50validate_backup_destination @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 + +# +# Copyright (C) 2026 Nethesis S.r.l. +# SPDX-License-Identifier: GPL-3.0-or-later +# + +import os +import sys +import json +import cluster.backup +import subprocess + +def main(): + request = json.load(sys.stdin) + try: + run_validation(request) + except subprocess.CalledProcessError as ex: + if 'lsd' in ex.cmd: + reason = "no_read_permission" + elif 'mkdir' in ex.cmd: + reason = "no_write_permission" + elif 'rmdir' in ex.cmd: + reason = "no_delete_permission" + else: + reason = 'unknown' + print(f"node {os.environ['NODE_ID']}: {reason} on destination {request['id']}", file=sys.stderr) + sys.exit(ex.returncode) + +def run_validation(request): + rclone_args = [ + '--timeout=5s', + '--contimeout=5s', + '--retries=1', + '--low-level-retries=1', + '--retries-sleep=100ms', + ] + if os.getenv("DEBUG") == '1': + rclone_args.append("-vvvv") + rpath = request['id'] + ':' + request['basepath'] + # Test if connection and read-access is successful + cluster.backup.run_rclone(['lsd', rpath] + rclone_args, temp_config=request['rclone_conf'], stdout=subprocess.DEVNULL, check=True) + + # Test write access, by creating and deleting a directory + remote_test_dir = rpath + '/' + os.environ["AGENT_TASK_ID"] + '.tmp' + cluster.backup.run_rclone(['mkdir', remote_test_dir] + rclone_args, temp_config=request['rclone_conf'], stdout=subprocess.DEVNULL, check=True) + cluster.backup.run_rclone(['rmdir', remote_test_dir] + rclone_args, temp_config=request['rclone_conf'], stdout=subprocess.DEVNULL, check=True) + json.dump({"node_id":int(os.environ["NODE_ID"]),"id":
request["id"]}, fp=sys.stdout) + +if __name__ == "__main__": + main() diff --git a/core/imageroot/var/lib/nethserver/node/actions/validate-backup-destination/validate-input.json b/core/imageroot/var/lib/nethserver/node/actions/validate-backup-destination/validate-input.json new file mode 100644 index 000000000..1b384a378 --- /dev/null +++ b/core/imageroot/var/lib/nethserver/node/actions/validate-backup-destination/validate-input.json @@ -0,0 +1,30 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "validate-backup-destination input", + "$id": "http://schema.nethserver.org/node/validate-backup-destination-input.json", + "description": "Input schema of the validate-backup-destination action", + "examples": [ + { + "id": "8d3a9df9-f20a-4d4c-9280-718126191d8f", + "basepath": "mybucket", + "rclone_conf": "[8d3a9df9-f20a-4d4c-9280-718126191d8f]\n..." + } + ], + "type": "object", + "required": [ + "id", + "basepath", + "rclone_conf" + ], + "properties": { + "id": { + "type": "string" + }, + "basepath": { + "type": "string" + }, + "rclone_conf": { + "type": "string" + } + } +} From 307d0d47714228e4d695dc88c967e4433a73012e Mon Sep 17 00:00:00 2001 From: Davide Principi Date: Wed, 18 Feb 2026 18:14:16 +0100 Subject: [PATCH 04/27] feat(backup): event backup-destination-changed Define a new core event to synchronize rclone-gateway.service configuration and reload it every time a backup destination changes.
--- .../actions/add-backup-repository/50update | 4 +++- .../actions/alter-backup-repository/50update | 23 ++++--------------- .../50remove_backup_repository | 4 +++- .../10reload_rclonegateway | 12 ++++++++++ docs/core/events.md | 3 +++ 5 files changed, 26 insertions(+), 20 deletions(-) create mode 100755 core/imageroot/var/lib/nethserver/node/events/backup-destination-changed/10reload_rclonegateway diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/add-backup-repository/50update b/core/imageroot/var/lib/nethserver/cluster/actions/add-backup-repository/50update index d3861a3e3..d9ea319ff 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/add-backup-repository/50update +++ b/core/imageroot/var/lib/nethserver/cluster/actions/add-backup-repository/50update @@ -85,7 +85,9 @@ def main(): trx.hset('private/agents/backup_destination/restic_password', destination_uuid, password) trx.delete('cluster/backup_repository/' + destination_uuid) trx.hset('cluster/backup_repository/' + destination_uuid, mapping=public_data) - # XXX: publish event + trx.publish(f"cluster/event/backup-destination-changed", json.dumps({ + "destination_id": destination_uuid, + })) if not trx.execute(): sys.exit(1) json.dump({'password': password, 'id': destination_uuid}, fp=sys.stdout) diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/alter-backup-repository/50update b/core/imageroot/var/lib/nethserver/cluster/actions/alter-backup-repository/50update index fd84e66a2..c6f07b767 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/alter-backup-repository/50update +++ b/core/imageroot/var/lib/nethserver/cluster/actions/alter-backup-repository/50update @@ -1,23 +1,8 @@ #!/usr/bin/env python3 # -# Copyright (C) 2023 Nethesis S.r.l. -# http://www.nethesis.it - nethserver@nethesis.it -# -# This script is part of NethServer. 
-# -# NethServer is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, -# or any later version. -# -# NethServer is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with NethServer. If not, see COPYING. +# Copyright (C) 2026 Nethesis S.r.l. +# SPDX-License-Identifier: GPL-3.0-or-later # import os @@ -92,7 +77,9 @@ def main(): trx.hset('private/nodes/backup_destination/rclone_conf', destination_uuid, rclone_conf) trx.delete(f'cluster/backup_repository/{destination_uuid}') trx.hset(f'cluster/backup_repository/{destination_uuid}', mapping=public_data) - # XXX: publish event + trx.publish(f"cluster/event/backup-destination-changed", json.dumps({ + "destination_id": destination_uuid, + })) if not trx.execute(): sys.exit(1) diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/remove-backup-repository/50remove_backup_repository b/core/imageroot/var/lib/nethserver/cluster/actions/remove-backup-repository/50remove_backup_repository index eeb3ea323..0263aed20 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/remove-backup-repository/50remove_backup_repository +++ b/core/imageroot/var/lib/nethserver/cluster/actions/remove-backup-repository/50remove_backup_repository @@ -53,7 +53,9 @@ for kbid in rdb.scan_iter('cluster/backup/*'): trx.delete(f"cluster/backup_repository/{rid}") trx.hdel('private/nodes/backup_destination/rclone_conf', rid) trx.hdel('private/agents/backup_destination/restic_password', rid) -# XXX: publish event +trx.publish(f"cluster/event/backup-destination-changed", json.dumps({ + "destination_id": rid, +})) trx.execute() terrors = 
agent.tasks.runp_brief(ptasks, diff --git a/core/imageroot/var/lib/nethserver/node/events/backup-destination-changed/10reload_rclonegateway b/core/imageroot/var/lib/nethserver/node/events/backup-destination-changed/10reload_rclonegateway new file mode 100755 index 000000000..72dc8775f --- /dev/null +++ b/core/imageroot/var/lib/nethserver/node/events/backup-destination-changed/10reload_rclonegateway @@ -0,0 +1,12 @@ +#!/bin/bash + +# +# Copyright (C) 2026 Nethesis S.r.l. +# SPDX-License-Identifier: GPL-3.0-or-later +# + +if [[ ${AGENT_EVENT_SOURCE} != cluster ]] ; then + exit 0 # invalid source, skip restart +fi + +systemctl reload-or-restart rclone-gateway.service diff --git a/docs/core/events.md b/docs/core/events.md index 932474191..ee947eedd 100644 --- a/docs/core/events.md +++ b/docs/core/events.md @@ -42,6 +42,9 @@ Events fired by the `cluster` agent (i.e. channel is `cluster/event/ - `module-removed`: the event is fired at the end of the remove-module process to inform other modules that a module has been removed on the cluster - `leader-changed`: a node was promoted to leader. The `node_id` attribute indicates the new leader, and `endpoint` its public Wireguard VPN endpoint address +- `backup-destination-changed`: a backup destination was + added/altered/removed. See `destination_id` event argument to identify + it. ## Node events From 9e41872c9ac55bce2a6488fe656e5e900a2aa21f Mon Sep 17 00:00:00 2001 From: Davide Principi Date: Thu, 19 Feb 2026 19:21:50 +0100 Subject: [PATCH 05/27] feat: migration of node Redis ACLs and backup keys - Grant node and module agents read-only access to cluster/*, node/*, module/*, and private/agents/* key spaces. - Convert existing backup destination keys to new format. - Trigger reload of rclone-gateway configuration on all nodes. Add acl-changed event handler to reload rclone-gateway. - Update module database documentation with use_replica connection example. 
--- .../cluster/actions/add-module/50update | 2 + .../cluster/actions/add-node/50update | 2 + .../update-core-pre-modules.d/50update_grants | 70 +++++++++++++++++++ .../events/acl-changed/80reload_rclonegateway | 12 ++++ .../lib/nethserver/node/install-finalize.sh | 2 +- docs/modules/database.md | 13 +++- 6 files changed, 98 insertions(+), 3 deletions(-) create mode 100755 core/imageroot/var/lib/nethserver/node/events/acl-changed/80reload_rclonegateway diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/add-module/50update b/core/imageroot/var/lib/nethserver/cluster/actions/add-module/50update index 4cf73109e..552107d63 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/add-module/50update +++ b/core/imageroot/var/lib/nethserver/cluster/actions/add-module/50update @@ -192,6 +192,8 @@ agent.assert_exp(rdb.execute_command('ACL', 'SETUSER', f'module/{module_id}', 'ON', '#' + outobj['redis_sha256'], 'resetkeys', f'~module/{module_id}/*', f'~task/module/{module_id}/*', + '%R~cluster/*', '%R~node/*', '%R~module/*', + '%R~private/agents/*', 'resetchannels', f'&progress/module/{module_id}/*', f'&module/{module_id}/event/*', '+@read', '+@write', '+@transaction', '+@connection', '+publish', '-@admin') == 'OK') diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/add-node/50update b/core/imageroot/var/lib/nethserver/cluster/actions/add-node/50update index a779ef23b..28909aa78 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/add-node/50update +++ b/core/imageroot/var/lib/nethserver/cluster/actions/add-node/50update @@ -124,6 +124,8 @@ for flag in flags: agent.assert_exp(rdb.execute_command('ACL', 'SETUSER', f'node/{node_id}', 'ON', '#' + node_pwh, 'resetkeys', f'~node/{node_id}/*', f'~task/node/{node_id}/*', + '%R~cluster/*', '%R~node/*', '%R~module/*', + '%R~private/agents/*', '%R~private/nodes/*', 'resetchannels', f'&progress/node/{node_id}/*', f'&node/{node_id}/event/*', 'nocommands', '+@read', '+@write', '+@transaction', 
'+@connection', '+publish', '-@admin', '+psync', '+replconf', # commands for replication diff --git a/core/imageroot/var/lib/nethserver/cluster/update-core-pre-modules.d/50update_grants b/core/imageroot/var/lib/nethserver/cluster/update-core-pre-modules.d/50update_grants index 82b6118b9..d6ee0e3ec 100755 --- a/core/imageroot/var/lib/nethserver/cluster/update-core-pre-modules.d/50update_grants +++ b/core/imageroot/var/lib/nethserver/cluster/update-core-pre-modules.d/50update_grants @@ -9,6 +9,7 @@ import os import sys import agent import cluster.grants +import cluster.backup import uuid rdb = agent.redis_connect(privileged=True) @@ -53,3 +54,72 @@ if not rdb.exists('cluster/module_uuid'): module_uuid = str(uuid.uuid4()) uix.hset('cluster/module_uuid', mid, module_uuid) uix.execute() + +# +# Grant nodes access to private/ key namespace to share backup secrets +# +trx = rdb.pipeline() +for acl_entry in rdb.acl_list(): + acl_fields = acl_entry.split() + agent_id = acl_fields[1] + # Upgrade guard, run only once: + if "%R~private/agents/*" in acl_fields: + continue # skip entry + # Upgrade basic ACLs for authenticated agents, to include read-only + # access on public namespaces and private/agents/*: + rules = [ + "%R~cluster/*", + "%R~node/*", + "%R~module/*", + "%R~private/agents/*", + ] + if acl_entry.startswith("user node/"): + # Add also read-only access on private/nodes/* namespace: + rules.append("%R~private/nodes/*") + elif acl_entry.startswith("user module/"): + pass # only basic ACLs for applications + else: + continue # NOTE: skip non-agent users! + if rules: + # NOTE: The same ACL SETUSER command can be safely repeated as + # many times as wanted. 
+ trx.execute_command('ACL', 'SETUSER', agent_id, *rules) +trx_changes = trx.execute() +destination_uuid = None # Set by the migration loop below; stays None if nothing is migrated +if len(trx_changes) > 0: + # Migrate backup destinations, one by one: + for krepo in list(rdb.scan_iter('cluster/backup_repository/*')): + try: + # + # Collect existing backup repository attributes, and migrate to new DB keys: + # + duuid = krepo.removeprefix('cluster/backup_repository/') + if rdb.hexists('private/nodes/backup_destination/rclone_conf', duuid): + continue # skip if already migrated + mgx = rdb.pipeline() # Accumulate all repo changes in one command pipeline + odest = rdb.hgetall(krepo) + rclone_conf = cluster.backup.generate_rclone_conf(duuid, odest['url'], odest['provider'], odest) + # Migrate parameters to private key space: + mgx.hset('private/nodes/backup_destination/rclone_conf', duuid, rclone_conf) + mgx.hset('private/agents/backup_destination/restic_password', duuid, odest['password']) + # Rebuild the public Redis key to clear its private + # parameters: + mgx.delete(krepo) + mgx.hset(krepo, mapping={ + "name": odest.get('name', duuid), + "url": odest['url'], + "provider": odest['provider'], + }) + mgx.execute() + destination_uuid = duuid # Record the migration only after the pipeline succeeds + except Exception as ex: + print(agent.SD_WARNING + f"Failed migration of {krepo}", ex, file=sys.stderr) + if destination_uuid: + # At least one destination was migrated: trigger rclone-webdav + # restart on all nodes. Publish directly on rdb: the trx pipeline + # above was already executed and would silently drop the event. + import json # json is not imported at module level + rdb.publish("cluster/event/backup-destination-changed", + json.dumps({"destination_id": destination_uuid})) + # The acl-changed event handler of each node will trigger backup + # credentials generation and rclone-webdav reload: + cluster.grants.save_acls(rdb) diff --git a/core/imageroot/var/lib/nethserver/node/events/acl-changed/80reload_rclonegateway new file mode 100755 index 000000000..72dc8775f --- /dev/null +++
b/core/imageroot/var/lib/nethserver/node/events/acl-changed/80reload_rclonegateway @@ -0,0 +1,12 @@ +#!/bin/bash + +# +# Copyright (C) 2026 Nethesis S.r.l. +# SPDX-License-Identifier: GPL-3.0-or-later +# + +if [[ ${AGENT_EVENT_SOURCE} != cluster ]] ; then + exit 0 # invalid source, skip restart +fi + +systemctl reload-or-restart rclone-gateway.service diff --git a/core/imageroot/var/lib/nethserver/node/install-finalize.sh b/core/imageroot/var/lib/nethserver/node/install-finalize.sh index cde44dc01..a635ade3d 100755 --- a/core/imageroot/var/lib/nethserver/node/install-finalize.sh +++ b/core/imageroot/var/lib/nethserver/node/install-finalize.sh @@ -84,7 +84,7 @@ ACL SETUSER cluster ON #${cluster_pwhash} ~* &* +@all AUTH cluster "${cluster_password}" ACL SETUSER default ON nopass resetkeys ~cluster/* ~node/* ~module/* resetchannels &* nocommands +@read +@connection +subscribe +psubscribe -@admin ACL SETUSER api-server ON #${apiserver_pwhash} ~* &* nocommands +@read +@pubsub +lpush +@transaction +@connection +role +hset -ACL SETUSER node/1 ON #${node_pwhash} resetkeys ~node/1/* ~task/node/1/* resetchannels &progress/node/1/* &node/1/event/* nocommands +@read +@write +@transaction +@connection +publish -@admin +psync +replconf +ACL SETUSER node/1 ON #${node_pwhash} resetkeys ~node/1/* ~task/node/1/* %R~private/nodes/* %R~private/agents/* %R~cluster/* %R~node/* %R~module/* resetchannels &progress/node/1/* &node/1/event/* nocommands +@read +@write +@transaction +@connection +publish -@admin +psync +replconf ACL SAVE SAVE CONFIG SET masteruser node/1 diff --git a/docs/modules/database.md b/docs/modules/database.md index 6a098d8b0..7afeaa0f6 100644 --- a/docs/modules/database.md +++ b/docs/modules/database.md @@ -14,8 +14,8 @@ purpose. To get write access a module must provide the Redis credentials stored in its `agent.env` file. The complete path is `~/.config/state/agent.env`. Write access is restricted to Redis keys and channels with prefix -`module/{module_id}/*`. 
The same credentials allow read access of keys -with the same prefix. +`module/{module_id}/*`. The same credentials allow broad read access, also +on `private/agents/*` namespace. The above rules are already implemented by the Python `agent` module. @@ -34,3 +34,12 @@ import agent rdb = agent.redis_connect(privileged=True) rdb.hset('module/myapp1/myhash', mapping={'myvar': 'myvalue'}) ``` + +If Redis connection is required at service boot time, prefer connecting to +the local replica. This avoids issues if the leader node is unreachable. +```python +import agent + +rdb = agent.redis_connect(use_replica=True) +cluster_network = rdb.get('cluster/network') +``` From a307f4b1092174f72f22130d24822c882de53d07 Mon Sep 17 00:00:00 2001 From: Davide Principi Date: Fri, 3 Apr 2026 16:37:17 +0200 Subject: [PATCH 06/27] feat(api-server,agent): obfuscate *key/*pass Add "key" and "pass" to the list of attribute name suffixes that are considered as sensitive and therefore trigger the value obfuscation. 
--- core/agent/htask.go | 2 +- core/api-server/configuration/configuration.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/core/agent/htask.go b/core/agent/htask.go index 0ed6472a6..e7a4c8aec 100644 --- a/core/agent/htask.go +++ b/core/agent/htask.go @@ -464,7 +464,7 @@ func obscureTaskInput(jsonStr string) string { } func isSensitive(target string) bool { - sensitiveList := []string{"password", "secret", "token"} + sensitiveList := []string{"password", "secret", "token", "key", "pass"} ltarget := strings.ToLower(target) for _, sensitive := range sensitiveList { if strings.HasSuffix(ltarget, sensitive) { diff --git a/core/api-server/configuration/configuration.go b/core/api-server/configuration/configuration.go index 65fb83a49..2e12451c7 100644 --- a/core/api-server/configuration/configuration.go +++ b/core/api-server/configuration/configuration.go @@ -93,6 +93,6 @@ func Init() { if os.Getenv("SENSITIVE_LIST") != "" { Config.SensitiveList = strings.Split(os.Getenv("SENSITIVE_LIST"), ",") } else { - Config.SensitiveList = []string{"password", "secret", "token", "jwt", "admpass", "adminpass"} + Config.SensitiveList = []string{"password", "secret", "token", "jwt", "admpass", "adminpass", "key", "pass"} } } From 56ad90d05583bc1d7b9b00da1e995d6f766b71f7 Mon Sep 17 00:00:00 2001 From: Davide Principi Date: Fri, 10 Apr 2026 18:28:47 +0200 Subject: [PATCH 07/27] feat(backup): add schedule-backup command Introduce schedule-backup command with start-timers, stop-timers, and list-timers subcommands. Timers are created as systemd transient units from backup schedules stored in Redis. Add backup-timers.service to manage timer lifecycle with redis.service. 
Assisted-by: copilot:claude-sonnet-4.6 --- .../redis.service.wants/backup-timers.service | 9 +++ .../lib/nethserver/node/bin/schedule-backup | 70 +++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 core/imageroot/etc/systemd/system/redis.service.wants/backup-timers.service create mode 100755 core/imageroot/var/lib/nethserver/node/bin/schedule-backup diff --git a/core/imageroot/etc/systemd/system/redis.service.wants/backup-timers.service b/core/imageroot/etc/systemd/system/redis.service.wants/backup-timers.service new file mode 100644 index 000000000..3c451cce9 --- /dev/null +++ b/core/imageroot/etc/systemd/system/redis.service.wants/backup-timers.service @@ -0,0 +1,9 @@ +[Unit] +Description=Backup timers (from Redis state) +After=redis.service + +[Service] +Type=oneshot +RemainAfterExit=yes +ExecStart=runagent -m node schedule-backup start-timers +ExecStop=runagent -m node schedule-backup stop-timers diff --git a/core/imageroot/var/lib/nethserver/node/bin/schedule-backup b/core/imageroot/var/lib/nethserver/node/bin/schedule-backup new file mode 100755 index 000000000..23b5e2dc9 --- /dev/null +++ b/core/imageroot/var/lib/nethserver/node/bin/schedule-backup @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 + +# +# Copyright (C) 2026 Nethesis S.r.l. 
+# SPDX-License-Identifier: GPL-3.0-or-later +# + +import argparse +import subprocess +import sys + +import agent + +TIMER_PREFIX = "backup" + +def start_timers(rdb): + """Create a transient systemd timer for each backup defined in Redis.""" + for kbackup in rdb.scan_iter("cluster/backup/*"): + bid = kbackup.removeprefix("cluster/backup/") + battrs = rdb.hgetall(kbackup) + schedule = battrs.get("schedule", "") + enabled = battrs.get("enabled", "") # returns "1" or "" + unit_name = f"{TIMER_PREFIX}{bid}" + if not schedule or not enabled: + print(f"Skipping {unit_name} timer creation: schedule is not defined or not enabled", file=sys.stderr) + continue + sd_run = subprocess.run( + [ + "systemd-run", + "--unit", unit_name, + "--on-calendar", schedule, + "--description", f"Backup {bid} schedule", + "--", "runagent", "-m", "node", + "run-backup", f"--backup={bid}", + ], + ) + if sd_run.returncode == 0: + print(f"Started timer {unit_name} with schedule: {schedule}", file=sys.stderr) + +def stop_timers(rdb): + """Stop all transient backup timers.""" + subprocess.run(["systemctl", "stop", f"{TIMER_PREFIX}*.timer"]) + subprocess.run(["systemctl", "reset-failed", f"{TIMER_PREFIX}*.service"]) + +def list_timers(): + """List active backup timers and their status.""" + subprocess.run( + ["systemctl", "list-timers", f"{TIMER_PREFIX}*"], + ) + +def main(): + parser = argparse.ArgumentParser(description="Manage backup schedule timers") + parser.add_argument( + "command", + choices=["start-timers", "stop-timers", "list-timers"], + help="Start, stop, or list backup schedule timers", + ) + args = parser.parse_args() + if args.command == "list-timers": + list_timers() + return + rdb = agent.redis_connect() + if args.command == "start-timers": + stop_timers(rdb) # cleanup of previous state + start_timers(rdb) + elif args.command == "stop-timers": + stop_timers(rdb) + +if __name__ == "__main__": + main() From fbdffcb084ac7f8723c3e0fedbbd76e795234622 Mon Sep 17 00:00:00 2001 From: Davide 
Principi Date: Fri, 10 Apr 2026 18:29:04 +0200 Subject: [PATCH 08/27] refactor(backup): module-backup output and locking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add exclusive flock to prevent concurrent runs (exit 3 if locked). Remove retention policy, rclone upload, and atexit status handler — these move to run-backup. Assisted-by: copilot:claude-sonnet-4.6 --- .../usr/local/agent/bin/module-backup | 102 +++++------------- 1 file changed, 25 insertions(+), 77 deletions(-) diff --git a/core/imageroot/usr/local/agent/bin/module-backup b/core/imageroot/usr/local/agent/bin/module-backup index 71d8a99a8..98d51eb6a 100755 --- a/core/imageroot/usr/local/agent/bin/module-backup +++ b/core/imageroot/usr/local/agent/bin/module-backup @@ -21,49 +21,34 @@ # import agent -import json +import fcntl import sys import os import os.path import subprocess -import time -import atexit -backup_status = {'total_size': 0, 'total_file_count': 0, 'snapshots_count': 0, 'errors': 0, 'start': int(time.time())} +EXIT_ALREADY_RUNNING = 3 backup_id = int(sys.argv[1]) -rdb = agent.redis_connect(host='127.0.0.1') # Connect to local replica + +lock_file = open(os.environ['AGENT_STATE_DIR'] + f"/.module-backup.lock", "w") +try: + fcntl.flock(lock_file, fcntl.LOCK_EX | fcntl.LOCK_NB) +except OSError: + print(f"Another module-backup is already running", file=sys.stderr) + sys.exit(EXIT_ALREADY_RUNNING) +rdb = agent.redis_connect(privileged=True, use_replica=True) # Connect to local replica obackup = rdb.hgetall(f'cluster/backup/{backup_id}') agent.assert_exp(len(obackup) > 0) # The backup object must exist repository = obackup['repository'] module_id = os.environ['MODULE_ID'] +agent.assert_exp(module_id in obackup['instances'].split()) # The module must belong to the backup schedule install_dir = os.environ['AGENT_INSTALL_DIR'] state_dir = os.environ['AGENT_STATE_DIR'] module_is_rootfull = os.geteuid() == 0 repopath = 
agent.get_image_name_from_url(os.environ['IMAGE_URL']) + "/" + os.environ['MODULE_UUID'] -def exit_handler(): - global module_id, backup_status, backup_id - wrdb = agent.redis_connect(privileged = True) - trx = wrdb.pipeline() - backup_status['end'] = int(time.time()) - trx.delete(f"module/{module_id}/backup_status/{backup_id}") - trx.hset(f"module/{module_id}/backup_status/{backup_id}", mapping=backup_status) - trx.publish(f"module/{module_id}/event/backup-status-changed", json.dumps({ - "node_id": int(os.environ["NODE_ID"]), - "module_id": module_id, - "backup_id": backup_id, - })) - trx.execute() - wrdb.close() - -if obackup['enabled'] != '1': - print(f"Backup {backup_id} is not enabled, exit now.", file=sys.stderr) - sys.exit(0) - -atexit.register(exit_handler) - # This file is always required by the restore procedure # See the cluster/restore-module action for details backup_args = ["state/environment"] @@ -93,7 +78,6 @@ except PermissionError: except Exception as ex: print("[ERROR] module-dump-state failed.", ex, file=sys.stderr) print("[ERROR] module-backup aborted.", file=sys.stderr) - backup_status['errors'] += 1 sys.exit(1) podman_args = [ @@ -128,16 +112,16 @@ if os.path.isfile(install_dir + "/etc/state-exclude.conf"): backup_args.append("--exclude-file=/etc/state-exclude.conf") # Ensure the restic repository has been initialized -if agent.run_restic(rdb, repository, repopath, [], ["snapshots"], stdout=subprocess.DEVNULL).returncode == 0: - print(f"Repository {repository} is present at path {repopath}", file=sys.stderr) -else: - try: - print(f"Initializing repository {repository} at path {repopath}", file=sys.stderr) - agent.run_restic(rdb, repository, repopath, [], ["init"]).check_returncode() - except subprocess.CalledProcessError as ex: - print("[ERROR] restic init failed.", ex, file=sys.stderr) - backup_status['errors'] += 1 - sys.exit(1) +try: + proc_snapshots = agent.run_restic(rdb, repository, repopath, [], ["snapshots"], 
stdout=subprocess.DEVNULL) + if proc_snapshots.returncode == 0: + print(f"Repository is present at path {repository}/{repopath}", file=sys.stderr) + else: + print(f"Initializing repository at path {repository}/{repopath}", file=sys.stderr) + agent.run_restic(rdb, repository, repopath, [], ["init"], check=True) +except Exception as ex: + print(f"[ERROR] restic init failed: {ex}", file=sys.stderr) + sys.exit(1) agent_progress_callback = agent.get_progress_callback(1, 95) def backup_progress_callback(omessage): @@ -149,45 +133,12 @@ def backup_progress_callback(omessage): try: # Run the backup if os.getenv('AGENT_TASK_ID'): - pbackup = agent.run_restic(rdb, repository, repopath, podman_args, ["backup", "--json"] + backup_args, progress_callback=backup_progress_callback) - if pbackup.returncode != 0: - print(agent.SD_ERR + f"Restic restore command failed with exit code {pbackup.returncode}.", file=sys.stderr) - sys.exit(1) + agent.run_restic(rdb, repository, repopath, podman_args, ["backup", "--json"] + backup_args, progress_callback=backup_progress_callback, check=True) else: agent.run_restic(rdb, repository, repopath, podman_args, ["backup", "--no-scan"] + backup_args).check_returncode() - - # Apply retention policy - agent.run_restic(rdb, repository, repopath, [], ["forget", "--prune", "--keep-last=" + obackup['retention']]).check_returncode() - -except subprocess.CalledProcessError as ex: - print("[ERROR] restic backup failed.", ex, file=sys.stderr) - backup_status['errors'] += 1 - -# Advertise the backup status -stats_proc = agent.run_restic(rdb, repository, repopath, [], ["stats", "--json", "latest"], text=True, stdout=subprocess.PIPE) -if stats_proc.returncode == 0: - backup_status.update(json.loads(stats_proc.stdout)) -else: - print("[ERROR] restic stats failed.", file=sys.stderr) - backup_status['errors'] += 1 - -try: - ometa = {} - ometa["module_id"] = module_id - ometa["module_ui_name"] = rdb.get(f'module/{module_id}/ui_name') or "" - ometa["node_fqdn"] = 
agent.get_hostname() - ometa["cluster_uuid"] = rdb.get("cluster/uuid") or "" - ometa["uuid"] = os.environ["MODULE_UUID"] - ometa["timestamp"] = int(time.time()) - ometa["success"] = bool(backup_status['errors'] == 0) - subprocess.run(["rclone-wrapper", str(backup_id), "rcat", f"REMOTE_PATH/{repopath}.json"], - stdout=sys.stderr, - input='\n' + json.dumps(ometa, separators=(',', ':')) + '\n', - text=True, - check=True, - ) -except subprocess.CalledProcessError as ex: - backup_status['errors'] += 1 +except Exception as ex: + print(f"[ERROR] restic backup failed: {ex}", file=sys.stderr) + sys.exit(1) try: subprocess.run(["module-cleanup-state"] + sys.argv[1:], check=True) @@ -197,7 +148,4 @@ except PermissionError: print("[WARNING] module-cleanup-state is not executable: ignored.", file=sys.stderr) except Exception as ex: print("[ERROR] module-cleanup-state failed.", ex, file=sys.stderr) - backup_status['errors'] += 1 - -if backup_status['errors'] > 0: sys.exit(1) From 2c323b6e5027604caf49356eedfaa1eed99bfa2b Mon Sep 17 00:00:00 2001 From: Davide Principi Date: Fri, 10 Apr 2026 18:29:51 +0200 Subject: [PATCH 09/27] feat(backup): add run-backup node orchestrator Orchestrate per-module backups at the node level. Capture module-backup JSON stdout, store status in Redis under node/{nodeID}/backup_status/{backupID}, run retention policy on success, and upload repopath.json metadata. Retry modules returning exit code 3 (already running). 
Assisted-by: copilot:claude-sonnet-4.6 --- .../cluster/actions/run-backup/50run_backup | 7 +- .../node/actions/run-backup/50run_backup | 18 ++ .../actions/run-backup/validate-input.json | 22 ++ .../var/lib/nethserver/node/bin/run-backup | 206 ++++++++++++++++++ 4 files changed, 250 insertions(+), 3 deletions(-) create mode 100755 core/imageroot/var/lib/nethserver/node/actions/run-backup/50run_backup create mode 100644 core/imageroot/var/lib/nethserver/node/actions/run-backup/validate-input.json create mode 100755 core/imageroot/var/lib/nethserver/node/bin/run-backup diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/run-backup/50run_backup b/core/imageroot/var/lib/nethserver/cluster/actions/run-backup/50run_backup index bccd8bd48..f3427a908 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/run-backup/50run_backup +++ b/core/imageroot/var/lib/nethserver/cluster/actions/run-backup/50run_backup @@ -54,10 +54,11 @@ for kbackups in rdb.scan_iter('module/*/backups'): } }) -terrors = agent.tasks.runp_brief(ptasks, +results = agent.tasks.runp(ptasks, endpoint="redis://cluster-leader", progress_callback=agent.get_progress_callback(2,98) ) -if terrors: - print(terrors, file=sys.stderr) +berrors = len([r for r in results if r['exit_code'] != 0]) +if berrors > 0: + print(agent.SD_ERR + f"run-backup {bid} failed on {berrors} node(s)", file=sys.stderr) sys.exit(1) diff --git a/core/imageroot/var/lib/nethserver/node/actions/run-backup/50run_backup b/core/imageroot/var/lib/nethserver/node/actions/run-backup/50run_backup new file mode 100755 index 000000000..8ef0ada6b --- /dev/null +++ b/core/imageroot/var/lib/nethserver/node/actions/run-backup/50run_backup @@ -0,0 +1,18 @@ +#!/usr/bin/env python3 + +# +# Copyright (C) 2026 Nethesis S.r.l.
+# SPDX-License-Identifier: GPL-3.0-or-later +# + + +import sys +import json +import agent + +request = json.load(sys.stdin) +bid = request['id'] + +proc = agent.run_helper("run-backup", f"--backup={bid}") +if proc.returncode != 0: + sys.exit(1) diff --git a/core/imageroot/var/lib/nethserver/node/actions/run-backup/validate-input.json b/core/imageroot/var/lib/nethserver/node/actions/run-backup/validate-input.json new file mode 100644 index 000000000..64b7596bd --- /dev/null +++ b/core/imageroot/var/lib/nethserver/node/actions/run-backup/validate-input.json @@ -0,0 +1,22 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "run-backup input", + "$id": "http://schema.nethserver.org/node/run-backup-input.json", + "description": "Run the given backup immediately", + "examples": [ + { + "id": 2 + } + ], + "type": "object", + "required": [ + "id" + ], + "properties": { + "id": { + "title": "Backup ID", + "type": "integer", + "minimum": 1 + } + } +} diff --git a/core/imageroot/var/lib/nethserver/node/bin/run-backup b/core/imageroot/var/lib/nethserver/node/bin/run-backup new file mode 100755 index 000000000..ecbe23198 --- /dev/null +++ b/core/imageroot/var/lib/nethserver/node/bin/run-backup @@ -0,0 +1,206 @@ +#!/usr/bin/env python3 + +# +# Copyright (C) 2026 Nethesis S.r.l. 
+# SPDX-License-Identifier: GPL-3.0-or-later +# + +import redis +import argparse +import json +import os +import subprocess +import sys +import time + +import agent + +EXIT_ALREADY_RUNNING = 3 +RETRY_INTERVAL = 60 # seconds between retries +RETRY_TIMEOUT = 3600 # total retry window in seconds + +def get_module_repopath(rdb, module_id): + """Compute the restic repository path for a module.""" + env = rdb.hgetall(f"module/{module_id}/environment") + image_url = env.get("IMAGE_URL", "") + module_uuid = rdb.hget("cluster/module_uuid", module_id) or "" + return agent.get_image_name_from_url(image_url) + "/" + module_uuid + +def run_module_backup(module_id, backup_id): + """Run backup for a single module. Returns the exit code.""" + print(f"{module_id}: starting backup {backup_id} at {time.asctime()}", file=sys.stderr) + result = subprocess.run( + ["runagent", "-m", module_id, "module-backup", backup_id], + ) + return result.returncode + +def collect_backup_stats(rdb, repository, repopath, start_time): + """Collect restic stats for a module's backup. 
Returns status dict.""" + status = {"start": start_time, "total_size": 0, "total_file_count": 0, "snapshots_count": 0, "errors": 0} + stats_proc = agent.run_restic(rdb, repository, repopath, [], ["stats", "--json", "latest"], text=True, stdout=subprocess.PIPE) + if stats_proc.returncode == 0: + try: + status.update(json.loads(stats_proc.stdout)) + except json.JSONDecodeError: + status["errors"] += 1 + else: + print(f"[ERROR] restic stats failed for {repopath}", file=sys.stderr) + status["errors"] += 1 + status["end"] = int(time.time()) + return status + +def store_backup_status(node_id, backup_id, module_id, status): + """Store module backup status in Redis under the node key and publish backup-status-changed event.""" + rdb = agent.redis_connect(privileged=True) # Connect to leader node + trx = rdb.pipeline() + agent_id = os.environ["AGENT_ID"] + try: + event_payload = { + "node_id": int(node_id), + "backup_id": int(backup_id), + "module_id": module_id, + } + trx.hset( + f"node/{node_id}/backup_status/{backup_id}", + module_id, + json.dumps(status), + ) + trx.publish(f"{agent_id}/event/backup-status-changed", json.dumps(event_payload)) + trx.execute() + except (ValueError, redis.exceptions.RedisError) as ex: + print(agent.SD_WARNING + "Cannot store backup status!", ex, file=sys.stderr) + +def run_retention(rdb, repository, repopath, retention): + """Apply retention policy after a successful backup.""" + try: + agent.run_restic(rdb, repository, repopath, [], [ + "forget", "--prune", f"--keep-last={retention}", + ]).check_returncode() + except subprocess.CalledProcessError as ex: + print(f"[ERROR] retention policy failed for {repopath}: {ex}", file=sys.stderr) + +def update_repopath_json(rdb, backup_id, module_id, repopath, success): + """Upload {repopath}.json metadata to the remote destination.""" + ometa = { + "module_id": module_id, + "module_ui_name": rdb.get(f"module/{module_id}/ui_name") or "", + "node_fqdn": agent.get_hostname(), + "cluster_uuid": 
rdb.get("cluster/uuid") or "", + "uuid": rdb.hget("cluster/module_uuid", module_id) or "", + "timestamp": int(time.time()), + "success": success, + } + try: + subprocess.run( + ["rclone-wrapper", str(backup_id), "rcat", f"REMOTE_PATH/{repopath}.json"], + stdout=sys.stderr, + input="\n" + json.dumps(ometa, separators=(",", ":")) + "\n", + text=True, + check=True, + ) + except subprocess.CalledProcessError as ex: + print(f"[ERROR] rclone-wrapper failed for {repopath}.json: {ex}", file=sys.stderr) + + +def process_module(rdb, module_id, backup_id, node_id, repository, retention): + """Run backup for one module, handle post-backup tasks. Returns exit code.""" + start_time = int(time.time()) + rc = run_module_backup(module_id, backup_id) + if rc == EXIT_ALREADY_RUNNING: + return rc + + repopath = get_module_repopath(rdb, module_id) + success = (rc == 0) + + # Collect stats and store backup status + status = collect_backup_stats(rdb, repository, repopath, start_time) + if not success: + status["errors"] += 1 + + store_backup_status(node_id, backup_id, module_id, status) + + if success: + run_retention(rdb, repository, repopath, retention) + + update_repopath_json(rdb, backup_id, module_id, repopath, success) + return rc + +def main(): + parser = argparse.ArgumentParser(description="Run a backup by ID") + parser.add_argument( + "--backup", + required=True, + help="Backup ID to run", + ) + args = parser.parse_args() + backup_id = args.backup + + rdb = agent.redis_connect(use_replica=True, privileged=True) + node_id = os.environ["NODE_ID"] + + battrs = rdb.hgetall(f"cluster/backup/{backup_id}") + if not battrs: + print(f"Backup {backup_id} not found in Redis", file=sys.stderr) + sys.exit(1) + + if battrs.get("enabled") != "1": + print(f"Backup {backup_id} is not enabled", file=sys.stderr) + return + + repository = battrs.get("repository", "") + retention = battrs.get("retention", "3") + instances = battrs.get("instances", "").split() + module_node = 
rdb.hgetall("cluster/module_node") + + # Keep only modules that belong to this node + local_modules = [m for m in instances if module_node.get(m) == node_id] + if not local_modules: + print(f"No local modules for backup {backup_id}", file=sys.stderr) + return + + failed = [] + retry_queue = [] + + # First pass: run all local modules sequentially + for module_id in local_modules: + rc = process_module(rdb, module_id, backup_id, node_id, repository, retention) + if rc == 0: + print(f"{module_id}: backup {backup_id} completed", file=sys.stderr) + elif rc == EXIT_ALREADY_RUNNING: + print(f"{module_id}: already running, queued for retry", file=sys.stderr) + retry_queue.append(module_id) + else: + print(f"{module_id}: backup {backup_id} failed (exit code {rc})", file=sys.stderr) + failed.append(module_id) + + # Retry loop for modules that were already running + deadline = time.monotonic() + RETRY_TIMEOUT + while retry_queue and time.monotonic() < deadline: + print(f"Waiting {RETRY_INTERVAL}s before retrying {len(retry_queue)} module(s)...", file=sys.stderr) + time.sleep(RETRY_INTERVAL) + still_busy = [] + for module_id in retry_queue: + rc = process_module(rdb, module_id, backup_id, node_id, repository, retention) + if rc == 0: + print(f"{module_id}: backup {backup_id} completed (retry)", file=sys.stderr) + elif rc == EXIT_ALREADY_RUNNING: + print(f"{module_id}: still running, will retry", file=sys.stderr) + still_busy.append(module_id) + else: + print(f"{module_id}: backup {backup_id} failed on retry (exit code {rc})", file=sys.stderr) + failed.append(module_id) + retry_queue = still_busy + + # Modules that never became available within the timeout + for module_id in retry_queue: + print(f"{module_id}: gave up retrying after {RETRY_TIMEOUT}s", file=sys.stderr) + failed.append(module_id) + + if failed: + print(f"Backup {backup_id} finished with failures: {', '.join(failed)}", file=sys.stderr) + sys.exit(1) + + print(f"Backup {backup_id} completed successfully", 
file=sys.stderr) + +if __name__ == "__main__": + main() From 1aeb3b6776e50dd7f42f274fa9e385bcb1c8527e Mon Sep 17 00:00:00 2001 From: Davide Principi Date: Fri, 10 Apr 2026 18:30:03 +0200 Subject: [PATCH 10/27] refactor(list-backups): read status from node keys Update list-backups to read from node/{nodeID}/backup_status/{backupID} instead of the old module/{mid}/backup_status keys. Derive instance lists from the cluster/backup/{id} instances field. Assisted-by: copilot:claude-sonnet-4.6 --- .../cluster/actions/list-backups/50list | 48 +++++++++++-------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/list-backups/50list b/core/imageroot/var/lib/nethserver/cluster/actions/list-backups/50list index 3a940cf4e..2bbcfd102 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/list-backups/50list +++ b/core/imageroot/var/lib/nethserver/cluster/actions/list-backups/50list @@ -50,30 +50,38 @@ for kbackup in rdb.scan_iter('cluster/backup/*'): configured_instances = set() -for mbackup in rdb.scan_iter('module/*/backups'): - for bid in rdb.sscan_iter(mbackup): - if bid in backups: - mid = mbackup.removeprefix('module/').removesuffix('/backups') - module_uuid = rdb.hget("cluster/module_uuid", mid) - status = rdb.hgetall(f"module/{mid}/backup_status/{bid}") or None - if not status is None: - for k in status.keys(): - status[k] = int(status[k]) - status["success"] = status["errors"] == 0 - del(status["errors"]) - backups[bid]['instances'].append({ - "module_id": mid, - "ui_name": rdb.get(f"module/{mid}/ui_name") or "", - "repository_path": module_uuid, - "status": status, - }) - configured_instances.add(mid) +# Collect backup status from all nodes +node_ids = set(rdb.hvals("cluster/module_node")) +backup_statuses = {} # {(backup_id, module_id): status_dict} +for node_id in node_ids: + for bid in backups: + raw = rdb.hgetall(f"node/{node_id}/backup_status/{bid}") + for mid, status_json in raw.items(): + 
try: + backup_statuses[(bid, mid)] = json.loads(status_json) + except (json.JSONDecodeError, TypeError): + pass + +for bid, backup in backups.items(): + instances = rdb.hget(f"cluster/backup/{bid}", "instances") or "" + for mid in instances.split(): + module_uuid = rdb.hget("cluster/module_uuid", mid) + status = backup_statuses.get((bid, mid)) + if status is not None: + status = dict(status) + status["success"] = status.pop("errors", 0) == 0 + backup['instances'].append({ + "module_id": mid, + "ui_name": rdb.get(f"module/{mid}/ui_name") or "", + "repository_path": module_uuid, + "status": status, + }) + configured_instances.add(mid) unconfigured_instances = [] # The user will not be warned if the following modules have the `no_data_backup` flag. -for xins in rdb.scan_iter('module/*/environment'): - mid = xins.removesuffix('/environment').removeprefix('module/') +for mid in rdb.hkeys('cluster/module_node'): flags = rdb.smembers(f'module/{mid}/flags') if 'no_data_backup' in flags: continue From 6acb611aee9a9a20365ffb08a1206ac74652b26d Mon Sep 17 00:00:00 2001 From: Davide Principi Date: Thu, 23 Apr 2026 13:43:13 +0200 Subject: [PATCH 11/27] feat(backup): write per-backup prom files Generate backup{id}.prom in /run/node_exporter from run-backup with UNKNOWN=-1 on start, CONFLICT=2 in case of concurrent runs of the same module, SUCCESS=1 on completion, FAILED=0 if any module fails. Remove the old 10node_monitor event handler. 
--- .../backup-status-changed/10node_monitor | 66 ------------------- .../var/lib/nethserver/node/bin/run-backup | 59 +++++++++++++---- 2 files changed, 48 insertions(+), 77 deletions(-) delete mode 100755 core/imageroot/var/lib/nethserver/cluster/events/backup-status-changed/10node_monitor diff --git a/core/imageroot/var/lib/nethserver/cluster/events/backup-status-changed/10node_monitor b/core/imageroot/var/lib/nethserver/cluster/events/backup-status-changed/10node_monitor deleted file mode 100755 index cf3c657db..000000000 --- a/core/imageroot/var/lib/nethserver/cluster/events/backup-status-changed/10node_monitor +++ /dev/null @@ -1,66 +0,0 @@ -#!/usr/bin/env python3 - -# -# Copyright (C) 2024 Nethesis S.r.l. -# SPDX-License-Identifier: GPL-3.0-or-later -# - -import agent -import json -import sys -import os -import tempfile - -FAILED = 0 -SUCCESS = 1 -UNKNOWN = -1 -OUTPUT_FILE = "/run/node_exporter/backup.prom" - -rdb = agent.redis_connect() -leader_id = int(rdb.hget('cluster/environment', 'NODE_ID')) -self_id = int(os.environ['NODE_ID']) - -if self_id != leader_id: - # Remove backup status from the worker node: - # avoid false alarms after a switch-leader - try: - os.remove(OUTPUT_FILE) - except: - pass - sys.exit(0) # LEADER ONLY! Do not run this procedure in worker nodes. 
- -# Ensure the output directory exists -os.makedirs(os.path.dirname(OUTPUT_FILE), exist_ok=True) - -modules = set(rdb.hkeys("cluster/module_node")) - -backups = {} -for module_id in modules: - for backup_id in rdb.smembers(f"module/{module_id}/backups"): - if not backup_id in backups: - name = rdb.hget(f"cluster/backup/{backup_id}", "name") - backups[backup_id] = {"name": name, "status": UNKNOWN} - nerrors = rdb.hget(f"module/{module_id}/backup_status/{backup_id}", "errors") or "0" - if not nerrors.isnumeric(): - continue - if int(nerrors) > 0: - backups[backup_id]["status"] = FAILED - elif int(nerrors) == 0 and backups[backup_id]["status"] != FAILED: - backups[backup_id]["status"] = SUCCESS - -# Create the content to be written in node_exporter format -content = f"""# HELP node_backup_status Status of the backup (0 = failure, 1 = success, -1 = unknown) -# TYPE node_backup_status gauge -""" -for backup_id in backups: - backup = backups[backup_id] - content += 'node_backup_status{id="%s",name="%s"} %i\n' % (backup_id, backup.get('name', '_'), backup.get('status')) - -# Write the content to the output file atomically -with tempfile.NamedTemporaryFile('w', delete=False, dir=os.path.dirname(OUTPUT_FILE)) as temp_file: - temp_file.write(content) - temp_filename = temp_file.name - -os.replace(temp_filename, OUTPUT_FILE) -# make OUTPUT_FILE readable by node_exporter -os.chmod(OUTPUT_FILE, 0o644) diff --git a/core/imageroot/var/lib/nethserver/node/bin/run-backup b/core/imageroot/var/lib/nethserver/node/bin/run-backup index ecbe23198..3ad62a3f7 100755 --- a/core/imageroot/var/lib/nethserver/node/bin/run-backup +++ b/core/imageroot/var/lib/nethserver/node/bin/run-backup @@ -19,6 +19,34 @@ EXIT_ALREADY_RUNNING = 3 RETRY_INTERVAL = 60 # seconds between retries RETRY_TIMEOUT = 3600 # total retry window in seconds +PROM_DIR = "/run/node_exporter" +PROM_UNKNOWN = -1 +PROM_FAILED = 0 +PROM_SUCCESS = 1 +PROM_CONFLICT = 2 + +def prometheus_escape_label_value(value: str) -> str: + 
return ( + value + .replace("\\", "\\\\") + .replace("\n", "\\n") + .replace('"', '\\"') + ) + +def write_backup_prom(backup_id, backup_name, status): + """Write a Prometheus .prom file for this backup's status.""" + output_file = os.path.join(PROM_DIR, f"backup{backup_id}.prom") + content = ( + '# HELP node_backup_status Status of the backup (0 = failure, 1 = success, 2 = conflict, -1 = unknown)\n' + '# TYPE node_backup_status gauge\n' + f'node_backup_status{{id="{backup_id}",name="{prometheus_escape_label_value(backup_name)}"}} {status}\n' + ) + os.makedirs(PROM_DIR, exist_ok=True) + tmp_file = f"{output_file}-{os.getpid()}.tmp" + with open(tmp_file, "w") as f: + f.write(content) + os.rename(tmp_file, output_file) + def get_module_repopath(rdb, module_id): """Compute the restic repository path for a module.""" env = rdb.hgetall(f"module/{module_id}/environment") @@ -37,14 +65,11 @@ def run_module_backup(module_id, backup_id): def collect_backup_stats(rdb, repository, repopath, start_time): """Collect restic stats for a module's backup. 
Returns status dict.""" status = {"start": start_time, "total_size": 0, "total_file_count": 0, "snapshots_count": 0, "errors": 0} - stats_proc = agent.run_restic(rdb, repository, repopath, [], ["stats", "--json", "latest"], text=True, stdout=subprocess.PIPE) - if stats_proc.returncode == 0: - try: - status.update(json.loads(stats_proc.stdout)) - except json.JSONDecodeError: - status["errors"] += 1 - else: - print(f"[ERROR] restic stats failed for {repopath}", file=sys.stderr) + try: + stats_proc = agent.run_restic(rdb, repository, repopath, [], ["stats", "--json", "latest"], text=True, stdout=subprocess.PIPE, check=True) + status.update(json.loads(stats_proc.stdout)) + except Exception as ex: + print(f"[ERROR] restic stats failed for {repopath}: {ex}", file=sys.stderr) status["errors"] += 1 status["end"] = int(time.time()) return status @@ -75,9 +100,11 @@ def run_retention(rdb, repository, repopath, retention): try: agent.run_restic(rdb, repository, repopath, [], [ "forget", "--prune", f"--keep-last={retention}", - ]).check_returncode() - except subprocess.CalledProcessError as ex: - print(f"[ERROR] retention policy failed for {repopath}: {ex}", file=sys.stderr) + ], check=True) + except Exception as ex: + # Backup is OK, but retention failed. We assume the error + # condition is temporary and will be recovered by next backup run. 
+ print(f"[WARNING] retention policy failed for {repopath}: {ex}", file=sys.stderr) def update_repopath_json(rdb, backup_id, module_id, repopath, success): """Upload {repopath}.json metadata to the remote destination.""" @@ -149,6 +176,7 @@ def main(): repository = battrs.get("repository", "") retention = battrs.get("retention", "3") + backup_name = battrs.get("name", "") instances = battrs.get("instances", "").split() module_node = rdb.hgetall("cluster/module_node") @@ -158,6 +186,9 @@ def main(): print(f"No local modules for backup {backup_id}", file=sys.stderr) return + # Initialize prom file to UNKNOWN + write_backup_prom(backup_id, backup_name, PROM_UNKNOWN) + failed = [] retry_queue = [] @@ -167,9 +198,12 @@ def main(): if rc == 0: print(f"{module_id}: backup {backup_id} completed", file=sys.stderr) elif rc == EXIT_ALREADY_RUNNING: + if len(failed) == len(retry_queue) == 0: + write_backup_prom(backup_id, backup_name, PROM_CONFLICT) print(f"{module_id}: already running, queued for retry", file=sys.stderr) retry_queue.append(module_id) else: + write_backup_prom(backup_id, backup_name, PROM_FAILED) print(f"{module_id}: backup {backup_id} failed (exit code {rc})", file=sys.stderr) failed.append(module_id) @@ -187,6 +221,8 @@ def main(): print(f"{module_id}: still running, will retry", file=sys.stderr) still_busy.append(module_id) else: + # A run failure must be recorded in prom file immediately + write_backup_prom(backup_id, backup_name, PROM_FAILED) print(f"{module_id}: backup {backup_id} failed on retry (exit code {rc})", file=sys.stderr) failed.append(module_id) retry_queue = still_busy @@ -200,6 +236,7 @@ def main(): print(f"Backup {backup_id} finished with failures: {', '.join(failed)}", file=sys.stderr) sys.exit(1) + write_backup_prom(backup_id, backup_name, PROM_SUCCESS) print(f"Backup {backup_id} completed successfully", file=sys.stderr) if __name__ == "__main__": From 48e35d3ff87b34e2fde28abe8261d2ba8c8dbb52 Mon Sep 17 00:00:00 2001 From: Davide Principi 
Date: Fri, 10 Apr 2026 18:30:19 +0200 Subject: [PATCH 12/27] refactor(backup): remove configure-backup action Remove the configure-backup action that created per-module systemd timer/service units. Add cleanup in 20restart_webdav to remove leftover backup*.timer and backup*.service files for both rootfull and rootless modules. Move backup-timers.service to top-level systemd dir with Requires=redis.service and ConditionPathExists guard. Wire it as Wants= dependency of rclone-gateway.service. Assisted-by: copilot:claude-sonnet-4.6 --- .../backup-timers.service | 2 + .../etc/systemd/system/rclone-gateway.service | 2 +- .../configure-backup/50write_systemd_units | 79 ------------------- .../configure-backup/validate-input.json | 69 ---------------- .../node/update-core.d/20restart_webdav | 22 +++++- 5 files changed, 22 insertions(+), 152 deletions(-) rename core/imageroot/etc/systemd/system/{redis.service.wants => }/backup-timers.service (73%) delete mode 100755 core/imageroot/usr/local/agent/actions/configure-backup/50write_systemd_units delete mode 100644 core/imageroot/usr/local/agent/actions/configure-backup/validate-input.json diff --git a/core/imageroot/etc/systemd/system/redis.service.wants/backup-timers.service b/core/imageroot/etc/systemd/system/backup-timers.service similarity index 73% rename from core/imageroot/etc/systemd/system/redis.service.wants/backup-timers.service rename to core/imageroot/etc/systemd/system/backup-timers.service index 3c451cce9..4a4fb84fb 100644 --- a/core/imageroot/etc/systemd/system/redis.service.wants/backup-timers.service +++ b/core/imageroot/etc/systemd/system/backup-timers.service @@ -1,6 +1,8 @@ [Unit] Description=Backup timers (from Redis state) After=redis.service +Requires=redis.service +ConditionPathExists=/var/lib/nethserver/node/state/rclone [Service] Type=oneshot diff --git a/core/imageroot/etc/systemd/system/rclone-gateway.service b/core/imageroot/etc/systemd/system/rclone-gateway.service index 71153c131..d7185347c 
100644 --- a/core/imageroot/etc/systemd/system/rclone-gateway.service +++ b/core/imageroot/etc/systemd/system/rclone-gateway.service @@ -1,7 +1,7 @@ [Unit] Description=Rclone Gateway server After=redis.service -Wants=redis.service +Wants=redis.service backup-timers.service StartLimitIntervalSec=10s StartLimitBurst=3 ConditionPathExists=/etc/wireguard/wg0.conf diff --git a/core/imageroot/usr/local/agent/actions/configure-backup/50write_systemd_units b/core/imageroot/usr/local/agent/actions/configure-backup/50write_systemd_units deleted file mode 100755 index 873d32431..000000000 --- a/core/imageroot/usr/local/agent/actions/configure-backup/50write_systemd_units +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env python3 - -# -# Copyright (C) 2021 Nethesis S.r.l. -# http://www.nethesis.it - nethserver@nethesis.it -# -# This script is part of NethServer. -# -# NethServer is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, -# or any later version. -# -# NethServer is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with NethServer. If not, see COPYING. 
-# - -import os, os.path -import sys -import json -import agent -import subprocess - -request = json.load(sys.stdin) - -backup_name = "backup" + str(request["id"]) -backup_description = request.get("name", "") or backup_name - -is_rootfull = (os.geteuid() == 0) -if is_rootfull: - timer_path = f'/etc/systemd/system/{backup_name}-{os.environ["MODULE_ID"]}.timer' - service_path = f'/etc/systemd/system/{backup_name}-{os.environ["MODULE_ID"]}.service' - description_text = backup_description + " " + os.environ["MODULE_ID"] - command = f'/usr/local/bin/runagent -m {os.environ["MODULE_ID"]} module-backup {request["id"]}' - systemctl_options = [] -else: - timer_path = f"{os.environ['AGENT_INSTALL_DIR']}/systemd/user/{backup_name}.timer" - service_path = f"{os.environ['AGENT_INSTALL_DIR']}/systemd/user/{backup_name}.service" - description_text = backup_description - command = f'/usr/local/bin/runagent module-backup {request["id"]}' - systemctl_options = ["--user"] - -timer_unit = f"""[Unit] -Description={description_text} - -[Timer] -OnCalendar={request.get("schedule", "")} - -[Install] -WantedBy=timers.target -""" - -service_unit = f"""[Unit] -Description={description_text} - -[Service] -Type=oneshot -ExecStart={command} -SyslogIdentifier=%N -""" - -if request['optype'] == "remove": - os.unlink(timer_path) - os.unlink(service_path) -else: - with open(timer_path, 'w', encoding='utf-8') as timer: - timer.write(timer_unit) - - with open(service_path, 'w', encoding='utf-8') as service: - service.write(service_unit) - - subprocess.run(["systemctl", "enable" if request["enabled"] else "disable", "--now", os.path.basename(timer_path)] + systemctl_options).check_returncode() - -subprocess.run(["systemctl", "daemon-reload"] + systemctl_options).check_returncode() diff --git a/core/imageroot/usr/local/agent/actions/configure-backup/validate-input.json b/core/imageroot/usr/local/agent/actions/configure-backup/validate-input.json deleted file mode 100644 index 531206744..000000000 --- 
a/core/imageroot/usr/local/agent/actions/configure-backup/validate-input.json +++ /dev/null @@ -1,69 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "title": "configure-backup input", - "$id": "http://schema.nethserver.org/agent/configure-backup-input.json", - "description": "Input schema of the basic configure-backup action", - "examples": [ - { - "optype": "add", - "id": 5, - "name": "BB daily", - "schedule": "daily", - "enabled": true - } - ], - "type": "object", - "anyOf": [ - { - "title": "Add or alter the backup configuration", - "required": [ - "optype", - "id", - "name", - "schedule", - "enabled" - ], - "properties": { - "optype": { - "enum": [ - "add", - "alter" - ] - } - } - }, - { - "title": "Remove the backup configuration", - "required": [ - "optype", - "id" - ], - "properties": { - "optype": { - "const": "remove" - } - } - } - ], - "properties": { - "optype": { - "type": "string", - "title": "Operation type" - }, - "id": { - "title": "Backup ID", - "type": "integer", - "minimum": 1 - }, - "name": { - "title": "Backup name", - "type": "string" - }, - "schedule": { - "type": "string" - }, - "enabled": { - "type": "boolean" - } - } -} diff --git a/core/imageroot/var/lib/nethserver/node/update-core.d/20restart_webdav b/core/imageroot/var/lib/nethserver/node/update-core.d/20restart_webdav index 2f78c3af2..119ec6f20 100755 --- a/core/imageroot/var/lib/nethserver/node/update-core.d/20restart_webdav +++ b/core/imageroot/var/lib/nethserver/node/update-core.d/20restart_webdav @@ -7,12 +7,28 @@ exec 1>&2 -if [[ ! 
-d rclone ]] ; then +if [[ -d rclone ]] ; then + # On normal update, restart the gateway: + systemctl restart rclone-gateway.service +else + # Run once this procedure: + mkdir -vp rclone # Replace rclone-webdav with the gateway service: systemctl disable --now rclone-webdav.service rm -vf /etc/systemd/system/rclone-webdav.service \ /etc/systemd/system/default.target.wants/rclone-webdav.service systemctl enable rclone-gateway.service -fi -systemctl restart rclone-gateway.service + # Remove old per-module backup timer/service units (rootfull) + rm -vf /etc/systemd/system/backup[0-9]*-*.timer /etc/systemd/system/backup[0-9]*-*.service + systemctl daemon-reload + + # Remove old per-module backup timer/service units (rootless) + while IFS=: read -r username _ uid _ _ homedir _; do + [[ "$uid" -ge 1000 ]] || continue + [[ -f "${homedir}/.config/state/agent.env" ]] || continue + rm -vf "${homedir}/.config/systemd/user"/backup[0-9]*.timer \ + "${homedir}/.config/systemd/user"/backup[0-9]*.service + systemctl --user -M "${username}@" daemon-reload + done < /etc/passwd +fi From 2bd582d4d1d379a3d55c5ea7fdbd49c7a7517470 Mon Sep 17 00:00:00 2001 From: Davide Principi Date: Fri, 10 Apr 2026 18:30:36 +0200 Subject: [PATCH 13/27] refactor(backup): simplify cluster backup actions Update add-backup, alter-backup, remove-backup, and run-backup cluster actions for the new backup model. Remove the module-level run-backup action. Update database documentation. Define and document "backup-schedule-changed" event. Migrate module/{mid}/backups SET keys into cluster/backup/{bid} instances field. Migrate module/{mid}/backup_status/{bid} keys to node/{nodeID}/backup_status/{bid}. 
Assisted-by: copilot:claude-sonnet-4.6 --- .../agent/actions/run-backup/50run_backup | 9 --- .../actions/run-backup/validate-input.json | 20 ------ .../cluster/actions/add-backup/10validate | 2 +- .../cluster/actions/add-backup/50add_backup | 26 +------- .../cluster/actions/alter-backup/50update | 61 +------------------ .../50remove_backup_repository | 20 +++--- .../actions/remove-backup/50remove_backup | 22 +------ .../cluster/actions/run-backup/50run_backup | 14 ++--- .../update-core-pre-modules.d/50update_grants | 35 ++++++++++- .../10restart_backuptimers | 12 ++++ .../20purge_backup_prom | 33 ++++++++++ docs/core/database.md | 4 +- docs/core/events.md | 2 + 13 files changed, 104 insertions(+), 156 deletions(-) delete mode 100755 core/imageroot/usr/local/agent/actions/run-backup/50run_backup delete mode 100644 core/imageroot/usr/local/agent/actions/run-backup/validate-input.json create mode 100755 core/imageroot/var/lib/nethserver/node/events/backup-schedule-changed/10restart_backuptimers create mode 100755 core/imageroot/var/lib/nethserver/node/events/backup-schedule-changed/20purge_backup_prom diff --git a/core/imageroot/usr/local/agent/actions/run-backup/50run_backup b/core/imageroot/usr/local/agent/actions/run-backup/50run_backup deleted file mode 100755 index ff59e4fdb..000000000 --- a/core/imageroot/usr/local/agent/actions/run-backup/50run_backup +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -# -# Copyright (C) 2024 Nethesis S.r.l. 
-# SPDX-License-Identifier: GPL-3.0-or-later -# - -BACKUP_ID=$(jq -r .id) -exec module-backup "${BACKUP_ID}" diff --git a/core/imageroot/usr/local/agent/actions/run-backup/validate-input.json b/core/imageroot/usr/local/agent/actions/run-backup/validate-input.json deleted file mode 100644 index 448903312..000000000 --- a/core/imageroot/usr/local/agent/actions/run-backup/validate-input.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "title": "run-backup input", - "$id": "http://schema.nethserver.org/agent/run-backup-input.json", - "description": "Run the given backup immediately", - "examples": [ - { - "id": 5 - } - ], - "type": "object", - "required": ["id"], - "properties": { - "id": { - "title": "Backup ID", - "type": "integer", - "minimum": 1 - } - } -} diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/add-backup/10validate b/core/imageroot/var/lib/nethserver/cluster/actions/add-backup/10validate index ab7f165d9..e8f5a4e12 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/add-backup/10validate +++ b/core/imageroot/var/lib/nethserver/cluster/actions/add-backup/10validate @@ -44,7 +44,7 @@ if cluster.backup.validate_schedule(request['schedule']): errors.append({'field':'schedule','parameter':'schedule','value': request['schedule'],'error':'bad_schedule_format'}) for mid in request['instances']: - if not rdb.exists(f'module/{mid}/environment'): + if not rdb.exists(f'module/{mid}/environment') or not mid in rdb.hkeys("cluster/module_node"): errors.append({'field':'instances','parameter':'instances','value': mid,'error':'module_not_found'}) if errors: diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/add-backup/50add_backup b/core/imageroot/var/lib/nethserver/cluster/actions/add-backup/50add_backup index 33cdba6f0..50e8010f5 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/add-backup/50add_backup +++ 
b/core/imageroot/var/lib/nethserver/cluster/actions/add-backup/50add_backup @@ -35,34 +35,12 @@ data = { "schedule": request['schedule'], "schedule_hint": json.dumps(request.get("schedule_hint", {})), "enabled": "1" if request['enabled'] else "", + "instances": ' '.join(request["instances"]), } backup_id = int(rdb.incr('cluster/backup_sequence')) if not data['name']: data['name'] = "Backup " + str(backup_id) rdb.hset(f'cluster/backup/{backup_id}', mapping=data) - -ptasks = [] -for target in request['instances']: - rdb.sadd(f'module/{target}/backups', backup_id) - ptasks.append({ - "action": "configure-backup", - "agent_id": f"module/{target}", - "data": { - "optype": "add", - "id": backup_id, - "name": request["name"], - "schedule": request["schedule"], - "enabled": request["enabled"], - }, - }) - -terrors = agent.tasks.runp_brief(ptasks, - endpoint="redis://cluster-leader", - progress_callback=agent.get_progress_callback(10,98) -) -if terrors: - print(terrors, file=sys.stderr) - sys.exit(1) - +rdb.publish('cluster/event/backup-schedule-changed', json.dumps({"backup_ids":[str(backup_id)]})) print(backup_id, file=sys.stdout) diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/alter-backup/50update b/core/imageroot/var/lib/nethserver/cluster/actions/alter-backup/50update index ea2546184..c7b54aaf2 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/alter-backup/50update +++ b/core/imageroot/var/lib/nethserver/cluster/actions/alter-backup/50update @@ -47,68 +47,11 @@ data = { "schedule": request['schedule'], "schedule_hint": json.dumps(request.get("schedule_hint", {})), "enabled": "1" if request['enabled'] else "", + "instances": ' '.join(request["instances"]), } if not data['name']: data['name'] = "Backup " + str(backup_id) # Save the object attribute new values: rdb.hset(f'cluster/backup/{backup_id}', mapping=data) - -currinst = set() # current set of modules using backup_id -for mkey in rdb.scan_iter('module/*/backups'): - mbackups = 
set(rdb.smembers(mkey)) - if str(backup_id) in mbackups: - currinst.add(mkey.removeprefix('module/').removesuffix('/backups')) - -ptasks = [] - -# 1/3 Iterate over the modules we want to REMOVE -for mid in currinst - set(request['instances']): - rdb.srem(f'module/{mid}/backups', backup_id) - ptasks.append({ - "action": "configure-backup", - "agent_id": f"module/{mid}", - "data": { - "optype": "remove", - "id": backup_id, - }, - }) - -# 2/3 Iterate over the modules we want to ADD -for mid in set(request['instances']) - currinst: - rdb.sadd(f'module/{mid}/backups', backup_id) - ptasks.append({ - "action": "configure-backup", - "agent_id": f"module/{mid}", - "data": { - "optype": "add", - "id": backup_id, - "name": request["name"], - "schedule": request["schedule"], - "enabled": request["enabled"], - }, - }) - -# 3/3 Iterate over the modules we want to ALTER -for mid in set(request['instances']) & currinst: - ptasks.append({ - "action": "configure-backup", - "agent_id": f"module/{mid}", - "data": { - "optype": "alter", - "id": backup_id, - "name": request["name"], - "schedule": request["schedule"], - "enabled": request["enabled"], - }, - }) - -print(ptasks, file=sys.stderr) - -terrors = agent.tasks.runp_brief(ptasks, - endpoint="redis://cluster-leader", - progress_callback=agent.get_progress_callback(10,98) -) -if terrors: - print(terrors, file=sys.stderr) - sys.exit(1) +rdb.publish('cluster/event/backup-schedule-changed', json.dumps({"backup_ids":[str(backup_id)]})) diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/remove-backup-repository/50remove_backup_repository b/core/imageroot/var/lib/nethserver/cluster/actions/remove-backup-repository/50remove_backup_repository index 0263aed20..604c12bc3 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/remove-backup-repository/50remove_backup_repository +++ b/core/imageroot/var/lib/nethserver/cluster/actions/remove-backup-repository/50remove_backup_repository @@ -42,14 +42,22 @@ trx = rdb.pipeline() 
# # Drop related backup objects # -ptasks = [] +related_backups = [] # XXX use an index instead of scan for kbid in rdb.scan_iter('cluster/backup/*'): repo_id = rdb.hget(kbid, 'repository') if repo_id != rid: continue # backup not related, skip trx.delete(kbid) - + backup_id = kbid.removeprefix('cluster/backup/') + related_backups.append(backup_id) +if related_backups: + trx.publish(f"cluster/event/backup-schedule-changed", json.dumps({ + "backup_ids": related_backups, + })) +# +# Delete the backup destination +# trx.delete(f"cluster/backup_repository/{rid}") trx.hdel('private/nodes/backup_destination/rclone_conf', rid) trx.hdel('private/agents/backup_destination/restic_password', rid) @@ -57,11 +65,3 @@ trx.publish(f"cluster/event/backup-destination-changed", json.dumps({ "destination_id": rid, })) trx.execute() - -terrors = agent.tasks.runp_brief(ptasks, - endpoint="redis://cluster-leader", - progress_callback=agent.get_progress_callback(25,98) -) -if terrors: - print(terrors, file=sys.stderr) - sys.exit(1) \ No newline at end of file diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/remove-backup/50remove_backup b/core/imageroot/var/lib/nethserver/cluster/actions/remove-backup/50remove_backup index 8b225fbcc..dc1b029b8 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/remove-backup/50remove_backup +++ b/core/imageroot/var/lib/nethserver/cluster/actions/remove-backup/50remove_backup @@ -36,25 +36,7 @@ if not rdb.exists(f"cluster/backup/{bid}"): sys.exit(2) # -# Remove the backup and all its backward references in module keys +# Remove the backup # -ptasks = [] rdb.delete(f"cluster/backup/{bid}") -for kbackups in rdb.scan_iter('module/*/backups'): - if rdb.srem(kbackups, bid) > 0: - ptasks.append({ - "agent_id": kbackups.removesuffix('/backups'), - "action": "configure-backup", - "data": { - "id": bid, - "optype": "remove", - } - }) - -terrors = agent.tasks.runp_brief(ptasks, - endpoint="redis://cluster-leader", - 
progress_callback=agent.get_progress_callback(2,98) -) -if terrors: - print(terrors, file=sys.stderr) - sys.exit(1) +rdb.publish('cluster/event/backup-schedule-changed', json.dumps({"backup_ids":[str(bid)]})) diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/run-backup/50run_backup b/core/imageroot/var/lib/nethserver/cluster/actions/run-backup/50run_backup index f3427a908..b9fb2c672 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/run-backup/50run_backup +++ b/core/imageroot/var/lib/nethserver/cluster/actions/run-backup/50run_backup @@ -35,20 +35,18 @@ if not rdb.exists(f"cluster/backup/{bid}"): json.dump([{'field':'id', 'parameter':'id','value': bid, 'error':'backup_not_found'}], fp=sys.stdout) sys.exit(2) -# -# Remove the backup and all its backward references in module keys -# +cluster_nodes = set(rdb.hvals("cluster/module_node")) + ptasks = [] -for kbackups in rdb.scan_iter('module/*/backups'): - if rdb.sismember(kbackups, bid) > 0: +for node_id in cluster_nodes: ptasks.append({ - "agent_id": kbackups.removesuffix('/backups'), + "agent_id": f"node/{node_id}", "action": "run-backup", "data": { "id": bid, }, "extra": { - 'title': kbackups.removesuffix('/backups') + "/run-backup", + 'title': f"node/{node_id}/run-backup", 'description': "run-backup agent action", 'isNotificationHidden': True, } @@ -58,7 +56,7 @@ results = agent.tasks.runp(ptasks, endpoint="redis://cluster-leader", progress_callback=agent.get_progress_callback(2,98) ) -berrors = len([r['exit_code'] != 0 for r in results]) +berrors = len([r for r in results if isinstance(r, Exception) or r['exit_code'] != 0]) if berrors > 0: print(agent.SD_ERR + f"run-backup {bid} failed on {berrors} node(s)", file=sys.stderr) sys.exit(1) diff --git a/core/imageroot/var/lib/nethserver/cluster/update-core-pre-modules.d/50update_grants b/core/imageroot/var/lib/nethserver/cluster/update-core-pre-modules.d/50update_grants index d6ee0e3ec..d379f280d 100755 --- 
a/core/imageroot/var/lib/nethserver/cluster/update-core-pre-modules.d/50update_grants +++ b/core/imageroot/var/lib/nethserver/cluster/update-core-pre-modules.d/50update_grants @@ -7,6 +7,7 @@ import os import sys +import json import agent import cluster.grants import cluster.backup @@ -117,9 +118,37 @@ if len(trx_changes) > 0: if destination_uuid: # At least one destination was migrated: trigger rclone-webdav # restart on all nodes: - trx.publish(f"cluster/event/backup-destination-changed", json.dumps({ + rdb.publish(f"cluster/event/backup-destination-changed", json.dumps({ "destination_id": destination_uuid, })) - # The acl-changed event handler of each node will trigger backup - # credentials generation and rclone-webdav reload: + # Migrate backup schedules: populate instances field from module/*/backups SET keys + module_node = rdb.hgetall("cluster/module_node") + schedule_instances = {} # {backup_id: set of module_ids} + sgx = rdb.pipeline() + for mid in module_node: + for bid in rdb.smembers(f"module/{mid}/backups"): + schedule_instances.setdefault(bid, set()).add(mid) + sgx.delete(f"module/{mid}/backups") + for bid, mids in schedule_instances.items(): + if rdb.exists(f"cluster/backup/{bid}"): + sgx.hset(f"cluster/backup/{bid}", "instances", " ".join(sorted(mids))) + sgx.execute() + + # Migrate backup_status: convert module/{mid}/backup_status/{bid} + # HASH to node/{nodeID}/backup_status/{bid} with module_id as field + bgx = rdb.pipeline() + for mid, node_id in module_node.items(): + for kstatus in rdb.scan_iter(f"module/{mid}/backup_status/*"): + bid = kstatus.removeprefix(f"module/{mid}/backup_status/") + status_data = rdb.hgetall(kstatus) + if status_data: + # Convert string values to integers to match the new format + for k in status_data: + try: + status_data[k] = int(status_data[k]) + except (ValueError, TypeError): + pass + bgx.hset(f"node/{node_id}/backup_status/{bid}", mid, json.dumps(status_data)) + bgx.execute() + # rclone-gateway.service should be 
stopped at this point and this will start it: cluster.grants.save_acls(rdb) diff --git a/core/imageroot/var/lib/nethserver/node/events/backup-schedule-changed/10restart_backuptimers b/core/imageroot/var/lib/nethserver/node/events/backup-schedule-changed/10restart_backuptimers new file mode 100755 index 000000000..751609749 --- /dev/null +++ b/core/imageroot/var/lib/nethserver/node/events/backup-schedule-changed/10restart_backuptimers @@ -0,0 +1,12 @@ +#!/bin/bash + +# +# Copyright (C) 2026 Nethesis S.r.l. +# SPDX-License-Identifier: GPL-3.0-or-later +# + +if [[ ${AGENT_EVENT_SOURCE} != cluster ]] ; then + exit 0 # invalid source, skip restart +fi + +systemctl restart backup-timers.service diff --git a/core/imageroot/var/lib/nethserver/node/events/backup-schedule-changed/20purge_backup_prom b/core/imageroot/var/lib/nethserver/node/events/backup-schedule-changed/20purge_backup_prom new file mode 100755 index 000000000..343891950 --- /dev/null +++ b/core/imageroot/var/lib/nethserver/node/events/backup-schedule-changed/20purge_backup_prom @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 + +# +# Copyright (C) 2026 Nethesis S.r.l. 
+# SPDX-License-Identifier: AGPL-3.0-or-later +# + +import os +import agent +import glob + +def purge_backup_prom_files(keep_bids): + """Remove stale backup*.prom files.""" + keep_files = [f'backup{bid}.prom' for bid in keep_bids] + for prom_file in glob.glob('/run/node_exporter/backup*.prom'): + if os.path.basename(prom_file) not in keep_files: + os.unlink(prom_file) + print("Removed", prom_file) + +def main(): + keep_bids = [] + if os.environ["AGENT_EVENT_SOURCE"] != "cluster": + return + rdb = agent.redis_connect(use_replica=True) + for kbk in rdb.scan_iter('cluster/backup/*'): + bid = kbk.removeprefix('cluster/backup/') + benabled = rdb.hget(kbk, 'enabled') # returns "1" or "" + if benabled: + keep_bids.append(bid) + purge_backup_prom_files(keep_bids) + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/docs/core/database.md b/docs/core/database.md index 4f271f0df..2b80e0724 100644 --- a/docs/core/database.md +++ b/docs/core/database.md @@ -246,6 +246,7 @@ The attribute of a backup schedule are stored in an HASH key under the |cluster/backup/{id} retention |INTEGER |Number of snapshots to keep| |cluster/backup/{id} schedule |STRING |Schedule time using `onCalendar` systemd syntax| |cluster/backup/{id} schedule_hint |STRING |Schedule in JSON format for the UI| +|cluster/backup/{id} instances |STRING |Space-separated list of module IDs| #### cluster/user_domain/ @@ -293,6 +294,7 @@ of the node (e.g. `node/1/...`). 
|node/{id}/default_instance/{image} |STRING |A module ID for the given image ID| |node/{id}/environment |HASH |Node environment variables| |node/{id}/ui_name |STRING |UI label for the node| +|node/{id}/backup_status/{bkid} |HASH |Backup status information with Restic output for each module | ### module/ @@ -305,10 +307,8 @@ access to its keys under `module/{id}/`, where `{id}` is the module |---|----|-----------| |module/{id}/environment |HASH| |module/{id}/roles/{role} |SET |glob patterns matching the actions that {role} can run. {role} is one of "owner", "reader"...| -|module/{id}/backups |SET |List of backup numeric IDs| |module/{id}/flags |SET |Images flags copied from `org.nethserver.flags` image label| |module/{id}/srv/{transport}/{service} | HASH |Service discovery information for other modules. See [Service providers]({{site.baseurl}}/modules/service_providers) | -|module/{id}/backup_status/{backup_id} | HASH |Backup status information with Restic output | ### task/ diff --git a/docs/core/events.md b/docs/core/events.md index ee947eedd..ec4a6cd1e 100644 --- a/docs/core/events.md +++ b/docs/core/events.md @@ -31,6 +31,8 @@ Well known events: has been changed. The JSON parameter format is `{"domains":[DOMAIN1, DOMAIN2 ...], "modules":[MODULE_ID1, MODULE_ID2...]}` and reflects the domains and modules affected by the latest change. +- `backup-schedule-changed`: some backup schedules were changed, JSON parameter + format is `{"backup_ids": LIST[STRING]}`. - `backup-status-changed`: the HASH key containing backup status was updated. JSON parameter format is `{"node_id":INT, "module_id":STRING, "backup_id":INT}` From 44a518be5b6aab24ae558675b40af78b7a677126 Mon Sep 17 00:00:00 2001 From: Davide Principi Date: Fri, 10 Apr 2026 18:30:52 +0200 Subject: [PATCH 14/27] feat(backup): upload cluster backup from run-backup Move cluster backup upload from the cluster action to run-backup. 
On the leader node, generate and upload cluster-backup-{uuid}.json.gz.gpg before starting module backups. Stub out 80upload_cluster_backup. Assisted-by: copilot:claude-sonnet-4.6 --- .../run-backup/80upload_cluster_backup | 20 +------------- .../var/lib/nethserver/node/bin/run-backup | 27 +++++++++++++++++++ 2 files changed, 28 insertions(+), 19 deletions(-) mode change 100755 => 100644 core/imageroot/var/lib/nethserver/cluster/actions/run-backup/80upload_cluster_backup diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/run-backup/80upload_cluster_backup b/core/imageroot/var/lib/nethserver/cluster/actions/run-backup/80upload_cluster_backup old mode 100755 new mode 100644 index 557ed9d45..ec2400b2e --- a/core/imageroot/var/lib/nethserver/cluster/actions/run-backup/80upload_cluster_backup +++ b/core/imageroot/var/lib/nethserver/cluster/actions/run-backup/80upload_cluster_backup @@ -1,21 +1,3 @@ -#!/bin/bash - # -# Copyright (C) 2023 Nethesis S.r.l. -# SPDX-License-Identifier: GPL-3.0-or-later +# Non-executable file. Placeholder, see bug NethServer/dev#7058 # - -set -e - -config_dump="./backup/dump.json.gz.gpg" -id=$(cat /dev/stdin | jq -r '.id // empty') - -if [ "x$id" == "x" ]; then - exit 1 -fi - -if [ ! 
-f "$config_dump" ]; then - cluster-backup -fi - -cat "$config_dump" | /usr/local/agent/bin/rclone-wrapper "$id" rcat REMOTE_PATH/$(basename "$config_dump") diff --git a/core/imageroot/var/lib/nethserver/node/bin/run-backup b/core/imageroot/var/lib/nethserver/node/bin/run-backup index 3ad62a3f7..ef3900986 100755 --- a/core/imageroot/var/lib/nethserver/node/bin/run-backup +++ b/core/imageroot/var/lib/nethserver/node/bin/run-backup @@ -128,6 +128,27 @@ def update_repopath_json(rdb, backup_id, module_id, repopath, success): except subprocess.CalledProcessError as ex: print(f"[ERROR] rclone-wrapper failed for {repopath}.json: {ex}", file=sys.stderr) +def upload_cluster_backup(backup_id, cluster_uuid): + """Generate and upload the cluster backup to the remote destination.""" + dump_path = "/var/lib/nethserver/cluster/state/backup/dump.json.gz.gpg" + if not os.path.isfile(dump_path): + try: + subprocess.run(["runagent", "-m", "cluster", "cluster-backup"], check=True) + except subprocess.CalledProcessError as ex: + print(f"[ERROR] cluster-backup failed: {ex}", file=sys.stderr) + return + remote_name = f"cluster-backup-{cluster_uuid}.json.gz.gpg" + try: + with open(dump_path, "rb") as f: + subprocess.run( + ["rclone-wrapper", str(backup_id), "rcat", f"REMOTE_PATH/{remote_name}"], + stdin=f, + stdout=sys.stderr, + check=True, + ) + print(f"Cluster backup uploaded as {remote_name}", file=sys.stderr) + except subprocess.CalledProcessError as ex: + print(f"[ERROR] rclone-wrapper failed for cluster backup: {ex}", file=sys.stderr) def process_module(rdb, module_id, backup_id, node_id, repository, retention): """Run backup for one module, handle post-backup tasks. 
Returns exit code.""" @@ -180,6 +201,12 @@ def main(): instances = battrs.get("instances", "").split() module_node = rdb.hgetall("cluster/module_node") + # Upload cluster backup before module backups (leader node only) + leader_id = rdb.hget("cluster/environment", "NODE_ID") + if node_id == leader_id: + cluster_uuid = rdb.get("cluster/uuid") or "" + upload_cluster_backup(backup_id, cluster_uuid) + # Keep only modules that belong to this node local_modules = [m for m in instances if module_node.get(m) == node_id] if not local_modules: From d056da927c476dcf8921828539c44c9f8ed71cec Mon Sep 17 00:00:00 2001 From: Davide Principi Date: Tue, 14 Apr 2026 11:20:02 +0200 Subject: [PATCH 15/27] fix(list-backups): repository_path value - The image name was not prepended to the repository path. The value was incomplete/incorrect. - Change has no impact, since the repository_path value is actually not used by UI. - Fixed related UI stories, still referencing an old value format that was never implemented.
--- .../lib/nethserver/cluster/actions/list-backups/50list | 3 ++- .../cluster/actions/list-backups/validate-output.json | 2 +- core/ui/src/stories/NsBackupCard.stories.js | 8 ++++---- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/list-backups/50list b/core/imageroot/var/lib/nethserver/cluster/actions/list-backups/50list index 2bbcfd102..70553e7d9 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/list-backups/50list +++ b/core/imageroot/var/lib/nethserver/cluster/actions/list-backups/50list @@ -66,6 +66,7 @@ for bid, backup in backups.items(): instances = rdb.hget(f"cluster/backup/{bid}", "instances") or "" for mid in instances.split(): module_uuid = rdb.hget("cluster/module_uuid", mid) + path_prefix = agent.get_image_name_from_url(rdb.hget(f"module/{mid}/environment", "IMAGE_URL") or "/:") status = backup_statuses.get((bid, mid)) if status is not None: status = dict(status) @@ -73,7 +74,7 @@ for bid, backup in backups.items(): backup['instances'].append({ "module_id": mid, "ui_name": rdb.get(f"module/{mid}/ui_name") or "", - "repository_path": module_uuid, + "repository_path": f"{path_prefix}/{module_uuid}", "status": status, }) configured_instances.add(mid) diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/list-backups/validate-output.json b/core/imageroot/var/lib/nethserver/cluster/actions/list-backups/validate-output.json index 8f6cd3b35..3a3b0c9a1 100644 --- a/core/imageroot/var/lib/nethserver/cluster/actions/list-backups/validate-output.json +++ b/core/imageroot/var/lib/nethserver/cluster/actions/list-backups/validate-output.json @@ -17,7 +17,7 @@ { "module_id": "dokuwiki1", "ui_name": "", - "repository_path": "dokuwiki1@2f72561e-89b2-4cdc-b4e4-425ca23bbec9", + "repository_path": "dokuwiki/2f72561e-89b2-4cdc-b4e4-425ca23bbec9", "status": { "total_size": 4053660, "total_file_count": 21744, diff --git a/core/ui/src/stories/NsBackupCard.stories.js 
b/core/ui/src/stories/NsBackupCard.stories.js index c4fbf3f31..d95193ce0 100644 --- a/core/ui/src/stories/NsBackupCard.stories.js +++ b/core/ui/src/stories/NsBackupCard.stories.js @@ -58,7 +58,7 @@ const sharedBackups = [ instances: [ { module_id: "dokuwiki1", - repository_path: "dokuwiki1@fb745f96-65dc-4a2d-ba4f-712e8c049227", + repository_path: "dokuwiki/fb745f96-65dc-4a2d-ba4f-712e8c049227", status: { success: true, end: 1641832747, @@ -148,7 +148,7 @@ BackupDisabled.args = { instances: [ { module_id: "dokuwiki1", - repository_path: "dokuwiki1@fb745f96-65dc-4a2d-ba4f-712e8c049227", + repository_path: "dokuwiki/fb745f96-65dc-4a2d-ba4f-712e8c049227", status: { success: true, end: 1641832747, @@ -183,7 +183,7 @@ MultipleBackup.args = { instances: [ { module_id: "dokuwiki1", - repository_path: "dokuwiki1@fb745f96-65dc-4a2d-ba4f-712e8c049227", + repository_path: "dokuwiki/fb745f96-65dc-4a2d-ba4f-712e8c049227", status: null, ui_name: "", }, @@ -200,7 +200,7 @@ MultipleBackup.args = { instances: [ { module_id: "dokuwiki1", - repository_path: "dokuwiki1@th745f96-65dc-4a2d-ba4f-712e8c049245", + repository_path: "dokuwiki/f2afdd11-5b4e-4850-8e5d-a05541050ef7", status: { success: true, end: 1641832747, From 87da3baa4b6bde319825ca36f844fa55be66b866 Mon Sep 17 00:00:00 2001 From: Davide Principi Date: Tue, 14 Apr 2026 17:04:17 +0200 Subject: [PATCH 16/27] chore: fix example of backup repo path The "dokuwiki1@" prefix was never implemented. 
--- .../usr/local/agent/actions/restore-module/validate-input.json | 2 +- .../actions/determine-restore-eligibility/validate-input.json | 2 +- .../cluster/actions/read-backup-snapshots/validate-input.json | 2 +- .../cluster/actions/restore-module/validate-input.json | 2 +- .../cluster/actions/restore-modules/validate-input.json | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/core/imageroot/usr/local/agent/actions/restore-module/validate-input.json b/core/imageroot/usr/local/agent/actions/restore-module/validate-input.json index ac1a35754..2777c8dfc 100644 --- a/core/imageroot/usr/local/agent/actions/restore-module/validate-input.json +++ b/core/imageroot/usr/local/agent/actions/restore-module/validate-input.json @@ -6,7 +6,7 @@ "examples": [ { "repository": "48ce000a-79b7-5fe6-8558-177fd70c27b4", - "path": "dokuwiki/dokuwiki1@f5d24fcd-819c-4b1d-98ad-a1b2ebcee8cf", + "path": "dokuwiki/f5d24fcd-819c-4b1d-98ad-a1b2ebcee8cf", "snapshot": "", "environment": { "IMAGE_URL": "ghcr.io/nethserver/dokuwiki:latest", diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/determine-restore-eligibility/validate-input.json b/core/imageroot/var/lib/nethserver/cluster/actions/determine-restore-eligibility/validate-input.json index 703668fdc..27f15e5d3 100644 --- a/core/imageroot/var/lib/nethserver/cluster/actions/determine-restore-eligibility/validate-input.json +++ b/core/imageroot/var/lib/nethserver/cluster/actions/determine-restore-eligibility/validate-input.json @@ -6,7 +6,7 @@ "examples": [ { "repository": "48ce000a-79b7-5fe6-8558-177fd70c27b4", - "path": "dokuwiki/dokuwiki1@f5d24fcd-819c-4b1d-98ad-a1b2ebcee8cf", + "path": "dokuwiki/f5d24fcd-819c-4b1d-98ad-a1b2ebcee8cf", "snapshot": "a6b8317eef" } ], diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/read-backup-snapshots/validate-input.json b/core/imageroot/var/lib/nethserver/cluster/actions/read-backup-snapshots/validate-input.json index c681be785..ed44ca2f3 100644 --- 
a/core/imageroot/var/lib/nethserver/cluster/actions/read-backup-snapshots/validate-input.json +++ b/core/imageroot/var/lib/nethserver/cluster/actions/read-backup-snapshots/validate-input.json @@ -6,7 +6,7 @@ "examples": [ { "repository": "48ce000a-79b7-5fe6-8558-177fd70c27b4", - "path": "dokuwiki/dokuwiki1@f5d24fcd-819c-4b1d-98ad-a1b2ebcee8cf" + "path": "dokuwiki/f5d24fcd-819c-4b1d-98ad-a1b2ebcee8cf" } ], "type": "object", diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/restore-module/validate-input.json b/core/imageroot/var/lib/nethserver/cluster/actions/restore-module/validate-input.json index 1c1cee227..32189f998 100644 --- a/core/imageroot/var/lib/nethserver/cluster/actions/restore-module/validate-input.json +++ b/core/imageroot/var/lib/nethserver/cluster/actions/restore-module/validate-input.json @@ -6,7 +6,7 @@ "examples": [ { "repository": "48ce000a-79b7-5fe6-8558-177fd70c27b4", - "path": "dokuwiki/dokuwiki1@f5d24fcd-819c-4b1d-98ad-a1b2ebcee8cf", + "path": "dokuwiki/f5d24fcd-819c-4b1d-98ad-a1b2ebcee8cf", "snapshot": "", "node": 1, "replace": false diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/restore-modules/validate-input.json b/core/imageroot/var/lib/nethserver/cluster/actions/restore-modules/validate-input.json index ee9a6bf51..9c9af10a0 100644 --- a/core/imageroot/var/lib/nethserver/cluster/actions/restore-modules/validate-input.json +++ b/core/imageroot/var/lib/nethserver/cluster/actions/restore-modules/validate-input.json @@ -7,7 +7,7 @@ [ { "repository": "48ce000a-79b7-5fe6-8558-177fd70c27b4", - "path": "dokuwiki/dokuwiki1@f5d24fcd-819c-4b1d-98ad-a1b2ebcee8cf", + "path": "dokuwiki/f5d24fcd-819c-4b1d-98ad-a1b2ebcee8cf", "snapshot": "", "node": 1 } From 6f9cbd461c1cdc4d7f1607e83161681e35e82852 Mon Sep 17 00:00:00 2001 From: Davide Principi Date: Tue, 14 Apr 2026 18:08:53 +0200 Subject: [PATCH 17/27] feat(backup): persist secrets and UUIDs in dump Bump cluster-backup dump version to 4. 
Include rclone_conf and restic_password in backup_repository export. Convert module IDs to UUIDs in backup schedules for portability. On restore-cluster, handle v3-to-v4 conversion for backup destinations and restart backup-timers. On restore-module, resolve UUIDs back to module IDs in backup schedules. Switch cluster-backup to privileged Redis connection. Reload rclone-gateway at end of restore-cluster. Skip schedules with broken destination references. Assisted-by: copilot:claude-sonnet-4.6 --- .../cluster/actions/restore-cluster/30load | 35 ++++++++- .../actions/restore-module/50restore_module | 25 ++++++- .../lib/nethserver/cluster/bin/cluster-backup | 74 ++++++++++++++----- docs/core/database.md | 1 + 4 files changed, 113 insertions(+), 22 deletions(-) diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/restore-cluster/30load b/core/imageroot/var/lib/nethserver/cluster/actions/restore-cluster/30load index e9ab11e1b..27209252c 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/restore-cluster/30load +++ b/core/imageroot/var/lib/nethserver/cluster/actions/restore-cluster/30load @@ -26,6 +26,8 @@ import sys import json import gzip import agent +import cluster.backup +import time dump_file = './backup/dump.json.gz' @@ -34,6 +36,11 @@ fgz = gzip.open(dump_file, mode='r') dump = json.loads(fgz.read().decode('utf-8')) fgz.close() +try: + dump_version = int(dump["version"]) +except ValueError: + dump_version = 3 + rdb = agent.redis_connect(privileged=True) # Restore simple keys @@ -67,14 +74,34 @@ del(dump['cluster']['user_domain']) # Restore backup repositories for r in dump['cluster']['backup_repository'].keys(): - rdb.hset(f'cluster/backup_repository/{r}', mapping=dump['cluster']['backup_repository'][r]) + odest = dump['cluster']['backup_repository'][r] + if dump_version > 3: + rclone_conf = odest['rclone_conf'] + restic_password = odest['restic_password'] + else: + # Convert backup destination data model from old format: + rclone_conf = 
cluster.backup.generate_rclone_conf(r, odest['url'], odest['provider'], odest) + restic_password = odest['password'] + rdb.hset(f'cluster/backup_repository/{r}', mapping={ + "provider": odest['provider'], + "url": odest['url'], + "name": odest['name'], + }) + # Secrets are stored under a private namespace: + rdb.hset('private/agents/backup_destination/restic_password', r, restic_password) + rdb.hset('private/nodes/backup_destination/rclone_conf', r, rclone_conf) # Restore backups, make sure backup_sequence is consistent max_backup = -1 for b in dump['cluster']['backup'].keys(): max_backup = max(max_backup, int(b)) rdb.hset(f'cluster/backup/{b}', mapping=dump['cluster']['backup'][b]) -rdb.set('cluster/backup_sequence', max_backup) +if max_backup > 0: + # If backup schedules were restored, set the id sequence to protect + # against id collisions. + rdb.set('cluster/backup_sequence', max_backup) + # Restart scheduled backup timers: + agent.run_helper("systemctl", "restart", "backup-timers.service") # Restore cluster override for modules if 'override' in dump['cluster'] and dump['cluster']['override']['modules']: @@ -87,3 +114,7 @@ if 'subscription' in dump['cluster'] and dump['cluster']['subscription']: # Restore update policy, if present if 'apply_updates' in dump['cluster'] and dump['cluster']['apply_updates']: rdb.hset('cluster/apply_updates', mapping=dump['cluster']['apply_updates']) + +# NOTE: reload of rclone-gateway is implicitly blocking, to avoid races +# with read-backup-repositories +agent.run_helper("systemctl", "reload-or-restart", "rclone-gateway.service") diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/restore-module/50restore_module b/core/imageroot/var/lib/nethserver/cluster/actions/restore-module/50restore_module index d92cdb202..2e109d19d 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/restore-module/50restore_module +++ b/core/imageroot/var/lib/nethserver/cluster/actions/restore-module/50restore_module @@ -40,7 +40,7 
@@ replace_requested = request.get('replace', False) node_id = int(request['node']) mvolumes = request.get('volumes', {}) -rdb = agent.redis_connect() +rdb = agent.redis_connect(privileged=True) # Write the output to a named temporary file, # to parse it with the existing read_envfile() function @@ -156,4 +156,27 @@ if replace_requested and remove_modules: ) agent.assert_exp(remove_modules_errors == 0) +# Check if the new_module_uuid was previously referenced by some scheduled +# backup. If so, rebuild the reference in the backup schedule data model. +new_module_uuid = add_module_result['output']['module_uuid'] +changed_backup_ids = [] +trx = rdb.pipeline() +for kbackup in rdb.scan_iter('cluster/backup/*'): + obackup = rdb.hgetall(kbackup) + uinstances = set(obackup.get("uinstances", "").split()) + if new_module_uuid in uinstances: + # The UUID is known, update the references for the new module + # instance: + minstances = set(obackup.get("instances", "").split()) + minstances.add(module_id) + uinstances.remove(new_module_uuid) + trx.hset(kbackup, mapping={ + "instances": " ".join(sorted(minstances)), + "uinstances": " ".join(sorted(uinstances)), + }) + changed_backup_ids.append(kbackup.removeprefix("cluster/backup/")) +if changed_backup_ids: + trx.publish('cluster/event/backup-schedule-changed', json.dumps({"backup_ids":changed_backup_ids})) + trx.execute() + json.dump(add_module_result['output'], fp=sys.stdout) diff --git a/core/imageroot/var/lib/nethserver/cluster/bin/cluster-backup b/core/imageroot/var/lib/nethserver/cluster/bin/cluster-backup index 735e669b4..063b7688c 100755 --- a/core/imageroot/var/lib/nethserver/cluster/bin/cluster-backup +++ b/core/imageroot/var/lib/nethserver/cluster/bin/cluster-backup @@ -37,11 +37,30 @@ from glob import glob output_dir = f'{os.environ["AGENT_STATE_DIR"]}/backup' -rdb = agent.redis_connect(host='127.0.0.1') - -VERSION = "3" - -dump = { 'version': VERSION, 'modules': {}, 'vpn': {}, 'cluster': { 'node_count': 0, 
'subscription': {}, 'apply_updates': {}, 'repository': {}, 'backup': {}, 'backup_repository': {}, 'user_domain': { 'ldap': {} },'override': {'modules': {} } } } +rdb = agent.redis_connect(privileged=True) +module_uuid_map = rdb.hgetall("cluster/module_uuid") + +DUMP_VERSION = 4 + +dump = { + 'version': DUMP_VERSION, + 'modules': {}, + 'vpn': {}, + 'cluster': { + 'node_count': 0, # added by DUMP_VERSION 3 + 'subscription': {}, # added by DUMP_VERSION 2 + 'apply_updates': {}, + 'repository': {}, + 'backup': {}, # alias backup_schedules + 'backup_repository': {}, # alias backup_destinations + 'user_domain': { + 'ldap': {}, + }, + 'override': { + 'modules': {}, + } + } +} # cluster: number of nodes dump['cluster']['node_count'] = len(set(rdb.hvals('cluster/module_node'))) @@ -57,15 +76,39 @@ for r in rdb.scan_iter('cluster/repository/*'): k = r.removeprefix('cluster/repository/') dump['cluster']['repository'][k] = rdb.hgetall(r) -# cluster: backup repositories -for r in rdb.scan_iter('cluster/backup_repository/*'): - k = r.removeprefix('cluster/backup_repository/') - dump['cluster']['backup_repository'][k] = rdb.hgetall(r) - -# cluster: backups +# cluster: backup repositories (destinations) +for dest_id, rclone_conf in rdb.hgetall('private/nodes/backup_destination/rclone_conf').items(): + restic_password = rdb.hget('private/agents/backup_destination/restic_password', dest_id) + odest = rdb.hgetall(f'cluster/backup_repository/{dest_id}') + # If secrets cannot be retrieved the whole cluster-backup is useless. 
+ # Make sure the required information is present with some assertions: + agent.assert_exp(odest, f'Backup destination {dest_id} is empty!') + agent.assert_exp(restic_password, f'Restic password of backup destination {dest_id} is empty!') + agent.assert_exp(rclone_conf, f'Rclone configuration of backup destination {dest_id} is empty!') + dump['cluster']['backup_repository'][dest_id] = odest + dump['cluster']['backup_repository'][dest_id]['rclone_conf'] = rclone_conf + dump['cluster']['backup_repository'][dest_id]['restic_password'] = restic_password + +# cluster: backup (schedules) for r in rdb.scan_iter('cluster/backup/*'): k = r.removeprefix('cluster/backup/') - dump['cluster']['backup'][k] = rdb.hgetall(r) + obackup = rdb.hgetall(r) + if obackup["repository"] not in dump['cluster']['backup_repository']: + # Destination reference is broken, skip. + continue + uinstances = set() + if 'instances' in obackup: + # Convert module IDs to UUIDs + for mid in obackup['instances'].split(): + if mid in module_uuid_map: + uinstances.add(module_uuid_map[mid]) + # Clear module internal references: + obackup['instances'] = "" + if 'uinstances' in obackup: + # Keep previously loaded references, that were never resolved: + uinstances.update(obackup['uinstances'].split()) + obackup['uinstances'] = " ".join(sorted(uinstances)) + dump['cluster']['backup'][k] = obackup # cluster ldap user domains for d in rdb.scan_iter('cluster/user_domain/ldap/*/conf'): @@ -87,13 +130,6 @@ dump['cluster']['favorites'] = sorted(rdb.smembers(f'cluster/favorites')) # cluster: module URL override dump['cluster']['override']['modules'] = rdb.hgetall('cluster/override/modules') -# modules: backup schedules -for m in rdb.scan_iter('module/*/backups'): - k = m.removeprefix('module/').removesuffix('/backups') - uuid = rdb.hget('cluster/module_uuid', k) - dump['modules'][uuid] = {'backups': []} - dump['modules'][uuid]['backups'] = sorted(rdb.smembers(m)) - leader_id = rdb.hget("cluster/environment", 
"NODE_ID") # leader vpn endpoint, required to invoke the create-cluster diff --git a/docs/core/database.md b/docs/core/database.md index 2b80e0724..b4f2ecb63 100644 --- a/docs/core/database.md +++ b/docs/core/database.md @@ -247,6 +247,7 @@ The attribute of a backup schedule are stored in an HASH key under the |cluster/backup/{id} schedule |STRING |Schedule time using `onCalendar` systemd syntax| |cluster/backup/{id} schedule_hint |STRING |Schedule in JSON format for the UI| |cluster/backup/{id} instances |STRING |Space-separated list of module IDs| +|cluster/backup/{id} uinstances |STRING |Space-separated list of UUIDs used to rebuild the `instances` field reference after module restoration| #### cluster/user_domain/ From 99246891378b6024f9c606dacd92866183cdd727 Mon Sep 17 00:00:00 2001 From: Davide Principi Date: Fri, 3 Apr 2026 16:55:46 +0200 Subject: [PATCH 18/27] feat: route restic through rclone-gateway REST endpoint Refactor prepare_restic_command() to use a local REST server (rclone-gateway on port 4694) for all backup destinations, replacing per-backend credential handling (S3, B2, Azure, SMB, WebDAV). Authentication now uses REDIS_USER/REDIS_PASSWORD credentials; cluster agents fall back to node credentials since rclone-gateway doesn't know cluster creds. Restic password is now fetched from Redis private key private/agents/backup_destination/restic_password/ with a graceful fallback and deprecation warning when the caller passes an unprivileged connection. Refactor list-backup-repositories and read-backup-repositories to use the new cluster.backup shared library. Parallelize gateway probing, rclone lsjson, and metadata fetches with ThreadPoolExecutor. 
Assisted-by: copilot:claude-sonnet-4.6 --- .../50list_backup_repositories | 140 ++++++++---------- .../agent/actions/restore-module/10restore | 2 +- .../usr/local/agent/bin/restic-wrapper | 2 +- .../usr/local/agent/pypkg/agent/__init__.py | 92 ++++++------ .../usr/local/agent/pypkg/cluster/backup.py | 61 ++++++++ .../actions/read-backup-repositories/50read | 139 +++++++++++------ 6 files changed, 264 insertions(+), 172 deletions(-) diff --git a/core/imageroot/usr/local/agent/actions/list-backup-repositories/50list_backup_repositories b/core/imageroot/usr/local/agent/actions/list-backup-repositories/50list_backup_repositories index 145bb28c4..d8b63bdf1 100755 --- a/core/imageroot/usr/local/agent/actions/list-backup-repositories/50list_backup_repositories +++ b/core/imageroot/usr/local/agent/actions/list-backup-repositories/50list_backup_repositories @@ -1,111 +1,93 @@ #!/usr/bin/env python3 # -# Copyright (C) 2024 Nethesis S.r.l. +# Copyright (C) 2026 Nethesis S.r.l. # SPDX-License-Identifier: GPL-3.0-or-later # import sys import json import agent -import asyncio +import requests +import cluster.backup import os -import time -from datetime import datetime, timezone - -rdb = agent.redis_connect(privileged=False) -module_id = os.environ['MODULE_ID'] -module_uuid = os.environ['MODULE_UUID'] -module_ui_name = rdb.get(f'module/{module_id}/ui_name') or "" -image_name = agent.get_image_name_from_url(os.environ["IMAGE_URL"]) -cluster_uuid = rdb.get("cluster/uuid") or "" -odests = {} -for krepo in rdb.scan_iter('cluster/backup_repository/*'): - dest_uuid = krepo.removeprefix('cluster/backup_repository/') - odests[dest_uuid] = rdb.hgetall(krepo) -rdb.close() +from concurrent.futures import ThreadPoolExecutor, as_completed # -# Fetch data from all backup destinations +# Return a list of Restic repositories from available backup destinations # -async def read_destination_repo(dest_uuid, dest_path): - proc = await asyncio.create_subprocess_exec('rclone-wrapper', dest_uuid, 
'lsjson', f'REMOTE_PATH/{dest_path}/config', stdout=asyncio.subprocess.PIPE) - # Return the first and only element of the expected JSON array - out, _ = await proc.communicate() - if out == b'[\n]\n' or not out: - data = {} - else: +def probe_destination(rdb, dest_id, config_path): + """Probe gateway, check for restic repo, and fetch metadata. + Returns a dict with repo info or None.""" + webdav_url = cluster.backup.GATEWAY_URL + '/' + dest_id + with cluster.backup.TimeoutSession(timeout=(3, 10)) as s: try: - data = json.loads(out)[0] + mtime = cluster.backup.webdav_get_mtime(s, webdav_url, config_path) except Exception as ex: - print(agent.SD_DEBUG + f"Ignored output from rclone-wrapper. Does the Restic repository configuration file, {dest_path}/config, exist in destination {dest_uuid}?", repr(ex), 'Data read:', out, file=sys.stderr) - data = {} - return data - -async def read_destination_meta(dest_uuid, dest_path): - proc = await asyncio.create_subprocess_exec('rclone-wrapper', dest_uuid, 'cat', f'REMOTE_PATH/{dest_path}.json', stdout=asyncio.subprocess.PIPE) - out, _ = await proc.communicate() - if out: + print('webdav_get_mtime:', ex, file=sys.stderr) + return None + if mtime is None: + return None + result = {"mtime": mtime, "webdav_url": webdav_url} try: - data = json.loads(out) + result["meta"] = cluster.backup.webdav_read_json(s, webdav_url, config_path.removesuffix("/config") + ".json") except Exception as ex: - print(agent.SD_DEBUG + f"Ignored output from rclone-wrapper. 
Does {dest_path}.json file exist in destination {dest_uuid}?", repr(ex), 'Data read:', out, file=sys.stderr) - data = {} - else: - data = {} - return data - - -async def get_destination_info(dest_uuid, odest): - global cluster_uuid, module_id, module_uuid, module_ui_name, image_name - - dest_path = f"{image_name}/{module_uuid}" - - async with asyncio.TaskGroup() as tg: - task_repo = tg.create_task(read_destination_repo(dest_uuid, dest_path)) - task_meta = tg.create_task(read_destination_meta(dest_uuid, dest_path)) - - info = { + print("webdav_read_json:", ex, file=sys.stderr) + result["meta"] = {} + return result + +def main(): + rdb = agent.redis_connect() + module_id = os.environ['MODULE_ID'] + module_uuid = os.environ['MODULE_UUID'] + module_ui_name = rdb.get(f'module/{module_id}/ui_name') or "" + image_name = agent.get_image_name_from_url(os.environ["IMAGE_URL"]) + cluster_uuid = rdb.get("cluster/uuid") or "" + config_path = f"{image_name}/{module_uuid}/config" + + base_info = { "module_id": module_id, "module_ui_name": module_ui_name, "node_fqdn": "", - "path": dest_path, + "path": "", "name": image_name, "uuid": module_uuid, "timestamp": 0, - "repository_id" : dest_uuid, - "repository_name": odest["name"], - "repository_provider": odest["provider"], - "repository_url": odest["url"], + "repository_id" : "", + "repository_name": "", + "repository_provider": "", + "repository_url": "", "installed_instance": module_id, "installed_instance_ui_name": module_ui_name, "is_generated_locally": False, } - result_repo = task_repo.result() - if not result_repo: - return None - - try: - # Obtain from lsjson the repository creation timestamp - info['timestamp'] = int(time.mktime(datetime.fromisoformat(result_repo["ModTime"]).timetuple())) - except: - info['timestamp'] = int(time.time()) - - result_meta = task_meta.result() - if "cluster_uuid" in result_meta and result_meta["cluster_uuid"] == cluster_uuid: - info['is_generated_locally'] = True - info.update(result_meta) # 
merge two dictionaries - - return info + destination_id_list = [r.removeprefix("cluster/backup_repository/") for r in rdb.keys("cluster/backup_repository/*")] + + destinations = [] + with ThreadPoolExecutor() as pool: + futures = {pool.submit(probe_destination, rdb, dest_id, config_path): dest_id for dest_id in destination_id_list} + for future in as_completed(futures): + dest_id = futures[future] + result = future.result() + if result is None: + continue + info = base_info.copy() + info["path"] = f"{image_name}/{module_uuid}" + info["timestamp"] = int(result["mtime"].timestamp()) + orepo = rdb.hgetall("cluster/backup_repository/" + dest_id) + info["repository_id"] = dest_id + info["repository_name"] = orepo["name"] + info["repository_provider"] = orepo["provider"] + info["repository_url"] = orepo["url"] + result_meta = result["meta"] + if "cluster_uuid" in result_meta and result_meta["cluster_uuid"] == cluster_uuid: + info['is_generated_locally'] = True + info.update(result_meta) + destinations.append(info) -async def print_destinations(odests): - tasks = [] - async with asyncio.TaskGroup() as tg: - for dest_uuid, odest in odests.items(): - tasks.append(tg.create_task(get_destination_info(dest_uuid, odest))) - destinations = list(filter(lambda r: r, [task.result() for task in tasks])) json.dump(destinations, fp=sys.stdout) -asyncio.run(print_destinations(odests)) +if __name__ == '__main__': + main() diff --git a/core/imageroot/usr/local/agent/actions/restore-module/10restore b/core/imageroot/usr/local/agent/actions/restore-module/10restore index cc81b2c2f..0f7efc2a1 100755 --- a/core/imageroot/usr/local/agent/actions/restore-module/10restore +++ b/core/imageroot/usr/local/agent/actions/restore-module/10restore @@ -32,7 +32,7 @@ repopath = request['path'] snapshot = request['snapshot'] or "latest" original_environment = request['environment'] -rdb = agent.redis_connect(host='127.0.0.1') # Connect to local replica +rdb = agent.redis_connect(privileged=True, 
use_replica=True) podman_args = ["--workdir=/srv"] podman_args.extend(agent.get_state_volume_args()) # get volumes from state-include.conf diff --git a/core/imageroot/usr/local/agent/bin/restic-wrapper b/core/imageroot/usr/local/agent/bin/restic-wrapper index 4657aa401..88ec98512 100755 --- a/core/imageroot/usr/local/agent/bin/restic-wrapper +++ b/core/imageroot/usr/local/agent/bin/restic-wrapper @@ -54,7 +54,7 @@ if '--help' in rargs: # Print the wrapper helper, then continue, to forward the help flag to Restic parser.print_help() -rdb = agent.redis_connect(use_replica=True) # Connect to local replica +rdb = agent.redis_connect(use_replica=True, privileged=True) # Connect to local replica if wargs.show: header="No backup destination found.\n" diff --git a/core/imageroot/usr/local/agent/pypkg/agent/__init__.py b/core/imageroot/usr/local/agent/pypkg/agent/__init__.py index 62793efab..f6447c6c4 100644 --- a/core/imageroot/usr/local/agent/pypkg/agent/__init__.py +++ b/core/imageroot/usr/local/agent/pypkg/agent/__init__.py @@ -212,58 +212,62 @@ def run_helper(*args, log_command=True, **kwargs): return subprocess.CompletedProcess(args, proc.returncode) def prepare_restic_command(rdb, repository, repo_path, podman_args, restic_args): + """Return a full podman command line with arguments to run Restic in a + temporary container, and a dictionary of expected environment + variables for it. Parameters for Podman ("podman_args") and Restic + ("restic_args") are accepted with separate argument lists. The + "repository" parameter must be a remote backup destination ID, and + "repo_path" corresponds to the path of Restic repository, relative to + the destination remote filesystem. 
+ """ core_env = read_envfile('/etc/nethserver/core.env') # Import URLs of core images orepo = rdb.hgetall(f"cluster/backup_repository/{repository}") assert_exp(len(orepo) > 0) # Check the repository exists - # Build the environment to run Restic against the given repository+repo_path - restic_env = {} - restic_env["RESTIC_PASSWORD"] = orepo['password'] - restic_env["RESTIC_CACHE_DIR"] = '/var/cache/restic' - - uschema, upath = orepo['url'].split(':', 1) - if uschema == 's3': - restic_env["RESTIC_REPOSITORY"] = orepo['url'] + "/" + repo_path - restic_env["AWS_ACCESS_KEY_ID"] = orepo['aws_access_key_id'] - restic_env["AWS_SECRET_ACCESS_KEY"] = orepo['aws_secret_access_key'] - if orepo['provider'] == 'aws': - restic_env['AWS_DEFAULT_REGION'] = orepo.get('aws_default_region', '') - elif uschema == 'b2': - restic_env["RESTIC_REPOSITORY"] = orepo['url'] + ":" + repo_path - restic_env["B2_ACCOUNT_ID"] = orepo['b2_account_id'] - restic_env["B2_ACCOUNT_KEY"] = orepo['b2_account_key'] - elif uschema == 'azure': - restic_env["RESTIC_REPOSITORY"] = orepo['url'] + ":" + repo_path - restic_env["AZURE_ACCOUNT_NAME"] = orepo['azure_account_name'] - restic_env["AZURE_ACCOUNT_KEY"] = orepo['azure_account_key'] - elif uschema == 'smb': - restic_env["RESTIC_REPOSITORY"] = 'rclone::' + orepo['url'].rstrip("/") + "/" + repo_path - restic_env["RCLONE_SMB_HOST"] = orepo["smb_host"] - restic_env["RCLONE_SMB_USER"] = orepo["smb_user"] - restic_env["RCLONE_SMB_PASS"] = orepo["smb_pass"] - restic_env["RCLONE_SMB_DOMAIN"] = orepo["smb_domain"] - restic_args.insert(0, "--option=rclone.program=/usr/local/bin/rclone-wrapper") - elif uschema == 'webdav' and orepo['provider'] == 'cluster': - ourl = urlparse(upath) - restic_env["RESTIC_REPOSITORY"] = 'rclone::webdav:' + ourl.path.rstrip("/") + "/" + repo_path - restic_env["RCLONE_WEBDAV_URL"] = ourl.scheme + '://' + ourl.netloc + if os.environ['REDIS_USER'] == 'cluster': + # cluster credentials are not known to rclone-gateway, use node + # creds 
instead. + node_env = read_envfile('/var/lib/nethserver/node/state/agent.env') + http_auth = (node_env['REDIS_USER'], node_env['REDIS_PASSWORD']) + restic_rest_username = node_env['REDIS_USER'] + restic_rest_password = node_env['REDIS_PASSWORD'] else: - raise Exception(f"Schema {uschema} not supported") + # default authentication is based on agent environment and suits + # node and modules as-is. + http_auth = None + restic_rest_username = os.environ['REDIS_USER'] + restic_rest_password = os.environ['REDIS_PASSWORD'] + + try: + restic_password = rdb.hget('private/agents/backup_destination/restic_password', repository) + except redis.exceptions.NoPermissionError: + # Warn app devs to fix the connection privileges of rdb argument: + warnings.warn("Fix your agent.prepare_restic_command() or agent.run_restic() invocation by passing a privileged Redis connection (e.g.: \"redis_connect(privileged=True)\").", stacklevel=3) + rdbrw = redis_connect(privileged=True) + restic_password = rdbrw.hget('private/agents/backup_destination/restic_password', repository) + + # Build the environment to run Restic against the given repository+repo_path + restic_env = { + "RESTIC_PASSWORD": restic_password, + "RESTIC_CACHE_DIR": "/var/cache/restic", + "RESTIC_REPOSITORY": f"rest:http://127.0.0.1:4694/{repository}/{repo_path}", + "RESTIC_REST_USERNAME": restic_rest_username, + "RESTIC_REST_PASSWORD": restic_rest_password, + } # Build the Podman command line to run Restic - container_name = "restic-" + os.environ.get('MODULE_ID', os.environ["AGENT_ID"]) + "-" + str(os.getpid()) - podman_cmd = ['podman', 'run', '-i', '--rm', f'--name={container_name}', '--privileged', '--network=host', + container_name = "restic-" + os.environ.get('MODULE_ID', os.environ["AGENT_ID"].replace("/", "")) + "-" + str(os.getpid()) + podman_cmd = ['podman', 'run', + '--interactive', '--rm', '--replace', f'--name={container_name}', + '--privileged', '--network=host', + '--env=RESTIC_*', 
'--volume=restic-cache:/var/cache/restic', - "--log-driver=none", + '--log-driver=none', + *podman_args, + core_env["RESTIC_IMAGE"], + *restic_args, ] - for envvar in restic_env: - podman_cmd.extend(['-e', envvar]) # Import Restic environment variables - - podman_cmd.extend(podman_args) # Any argument is appended to podman invocation - podman_cmd.append(core_env["RESTIC_IMAGE"]) - podman_cmd.extend(restic_args) - return (podman_cmd, restic_env) def run_restic(rdb, repository, repo_path, podman_args, restic_args, progress_callback=None, **kwargs): @@ -271,9 +275,7 @@ def run_restic(rdb, repository, repo_path, podman_args, restic_args, progress_ca penv = os.environ.copy() penv.update(restic_env) - if os.getenv('DEBUG', False): - print(*([f"{k}={v}" for k,v in restic_env.items()] + podman_cmd), file=sys.stderr) - else: + if os.getenv('DEBUG') == "1": print("restic", *restic_args, file=sys.stderr) kwargs.setdefault('encoding', 'utf-8') diff --git a/core/imageroot/usr/local/agent/pypkg/cluster/backup.py b/core/imageroot/usr/local/agent/pypkg/cluster/backup.py index 97490c3c2..7cce67ba1 100644 --- a/core/imageroot/usr/local/agent/pypkg/cluster/backup.py +++ b/core/imageroot/usr/local/agent/pypkg/cluster/backup.py @@ -30,6 +30,9 @@ import tempfile import base64 from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes +import requests +from xml.etree import ElementTree as ET +from email.utils import parsedate_to_datetime def get_default_backup_repository_name(provider, url, rid=""): """Suggest a default name for a backup repository""" @@ -251,3 +254,61 @@ def rclone_reveal(obscured: str) -> str: cipher = Cipher(algorithms.AES(_CRYPT_KEY), modes.CTR(iv)) decryptor = cipher.decryptor() return (decryptor.update(ciphertext) + decryptor.finalize()).decode() + +GATEWAY_URL = "http://127.0.0.1:4694" + +class TimeoutSession(requests.Session): + def __init__(self, timeout=10, auth=None): + super().__init__() + self.timeout = timeout # (connect, read) or single 
float + if auth == None: + self.auth = (os.environ["REDIS_USER"], os.environ["REDIS_PASSWORD"]) + else: + self.auth = auth + def request(self, method, url, **kwargs): + kwargs.setdefault("timeout", self.timeout) + return super().request(method, url, **kwargs) + +def webdav_propfind(session, base_url, path, depth="0"): + url = f"{base_url.rstrip('/')}/{path.lstrip('/')}" + body = """ + + + + + + + """ + r = session.request("PROPFIND", url, + data=body, + headers={"Depth": depth, + "Content-Type": "application/xml"}) + if r.status_code == 404: + return None + r.raise_for_status() + return r + +def webdav_get_mtime(session, base_url, path): + r = webdav_propfind(session, base_url, path) + if r is None: + return None # file does not exist + + # Parse DAV getlastmodified + ns = {"d": "DAV:"} + root = ET.fromstring(r.content) + lm = root.findtext(".//d:getlastmodified", namespaces={"d": "DAV:"}) + if lm: + return parsedate_to_datetime(lm) + + # Last resort: HEAD + Last-Modified header + url = f"{base_url.rstrip('/')}/{path.lstrip('/')}" + head = session.head(url) + head.raise_for_status() + lm_header = head.headers.get("Last-Modified") + return parsedate_to_datetime(lm_header) if lm_header else None + +def webdav_read_json(session, base_url, path): + url = f"{base_url.rstrip('/')}/{path.lstrip('/')}" + r = session.get(url) + r.raise_for_status() + return r.json() diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/read-backup-repositories/50read b/core/imageroot/var/lib/nethserver/cluster/actions/read-backup-repositories/50read index c2d4123cb..ff7e5340f 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/read-backup-repositories/50read +++ b/core/imageroot/var/lib/nethserver/cluster/actions/read-backup-repositories/50read @@ -27,31 +27,82 @@ import agent import subprocess import os import time +from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import datetime, timezone +import cluster.backup +import requests request = 
json.load(sys.stdin) popen_args={"encoding": 'utf-8', "stdout": subprocess.PIPE, "stderr": sys.stderr, "text": True} -rdb = agent.redis_connect(privileged=False) +# Read node credentials for rclone-gateway WebDAV authentication: +node_env = agent.read_envfile('/var/lib/nethserver/node/state/agent.env') +os.environ["RCLONE_WEBDAV_USER"] = node_env["REDIS_USER"] +os.environ["RCLONE_WEBDAV_PASS"] = cluster.backup.rclone_obscure(node_env["REDIS_PASSWORD"]) +os.environ.setdefault("RCLONE_LOG_FORMAT", "") +os.environ.setdefault("RCLONE_LOG_LEVEL", "DEBUG" if os.getenv('DEBUG') == "1" else "ERROR") +node_http_auth=(node_env["REDIS_USER"], node_env["REDIS_PASSWORD"]) +# Privileged connection needed to read private backup destination secrets: +rdb = agent.redis_connect(privileged=True) cluster_uuid = rdb.get("cluster/uuid") or "" -repositories = {} -backups = {} -for krepo in rdb.scan_iter('cluster/backup_repository/*'): - repo_uuid = krepo.removeprefix('cluster/backup_repository/') - rclone_lsjson_cmd = ['rclone-wrapper', repo_uuid, +# Generate a module_uuid -> module_id dict exchanging key-value pairs: +module_uuids = {v: k for k,v in rdb.hgetall("cluster/module_uuid").items()} + +def lsjson_repo(repo_uuid, orepo): + """Probe gateway and launch rclone lsjson for one repository. 
+ Returns (repo_uuid, orepo, webdav_url, lsjson_data).""" + webdav_url = cluster.backup.GATEWAY_URL + '/' + repo_uuid + rclone_lsjson_cmd = ['podman', 'exec', + '--env=RCLONE_WEBDAV_USER', + '--env=RCLONE_WEBDAV_PASS', + '--env=RCLONE_LOG_FORMAT', + '--env=RCLONE_LOG_LEVEL', + 'rclone-gateway', 'rclone', + '--webdav-url=' + webdav_url, + '--webdav-vendor=rclone', '--max-depth=3', '--include=config', 'lsjson', '--files-only', - 'REMOTE_PATH', + ':webdav:', ] - orepo = rdb.hgetall(krepo) - repositories[repo_uuid] = orepo + if os.getenv('DEBUG') == "1": + print(' '.join(rclone_lsjson_cmd), file=sys.stderr) try: proot = subprocess.Popen(rclone_lsjson_cmd, **popen_args) lsjson_data = json.load(proot.stdout) + proot.wait() except Exception as ex: lsjson_data = [] - print(agent.SD_WARNING, 'Failed invocation of', *rclone_lsjson_cmd, ':', ex, file=sys.stderr) + print(f'rclone lsjson failed for backup destination {repo_uuid}', file=sys.stderr) + return repo_uuid, orepo, webdav_url, lsjson_data + +def fetch_meta(session, webdav_url, json_path): + """Fetch a .json metadata file from WebDAV. 
Returns parsed dict or None.""" + try: + return cluster.backup.webdav_read_json(session, webdav_url, json_path) + except requests.RequestException as ex: + print(ex, file=sys.stderr) + return None + +# Collect repository info +repo_list = [] +for krepo in rdb.scan_iter('cluster/backup_repository/*'): + repo_uuid = krepo.removeprefix('cluster/backup_repository/') + orepo = rdb.hgetall(krepo) + repo_list.append((repo_uuid, orepo)) + +# First level: parallelize rclone lsjson across repositories +backups = {} +repo_results = [] +with ThreadPoolExecutor() as pool: + futures = {pool.submit(lsjson_repo, repo_uuid, orepo): repo_uuid for repo_uuid, orepo in repo_list} + for future in as_completed(futures): + repo_uuid, orepo, webdav_url, lsjson_data = future.result() + repo_results.append((repo_uuid, orepo, webdav_url, lsjson_data)) + +# Build backup entries and collect metadata fetch tasks +meta_tasks = [] # (restic_uuid, repo_uuid, webdav_url, json_path, unix_timestamp) +for repo_uuid, orepo, webdav_url, lsjson_data in repo_results: for oroot in lsjson_data: try: restic_prefix, restic_uuid, _ = oroot["Path"].split("/") @@ -59,7 +110,6 @@ for krepo in rdb.scan_iter('cluster/backup_repository/*'): print('Ignored Restic repository with unexpected path:', oroot["Path"], file=sys.stderr) continue try: - # Obtain from lsjson the repository creation timestamp unix_timestamp = int(time.mktime(datetime.fromisoformat(oroot["ModTime"]).timetuple())) except: unix_timestamp = int(time.time()) @@ -69,10 +119,10 @@ for krepo in rdb.scan_iter('cluster/backup_repository/*'): "module_ui_name": "", "node_fqdn": "", "path": restic_prefix + '/' + restic_uuid, - "name": restic_prefix, # keep "name" attribute for historical reason + "name": restic_prefix, "uuid": restic_uuid, "timestamp": unix_timestamp, - "repository_id" : repo_uuid, + "repository_id": repo_uuid, "repository_name": orepo["name"], "repository_provider": orepo["provider"], "repository_url": orepo["url"], @@ -80,40 +130,37 @@ for 
krepo in rdb.scan_iter('cluster/backup_repository/*'): "installed_instance_ui_name": "", "is_generated_locally": None, } + if webdav_url: + json_path = restic_prefix + '/' + restic_uuid + '.json' + meta_tasks.append((restic_uuid, repo_uuid, webdav_url, json_path, unix_timestamp)) -# Fetch module UUIDs to search destination matches: -for module_id in rdb.hkeys("cluster/module_node"): - module_uuid = rdb.hget("cluster/module_uuid", module_id) - if not module_uuid or module_uuid not in backups: - continue - for repo_uuid in backups[module_uuid]: - backups[module_uuid][repo_uuid]["is_generated_locally"] = True - backups[module_uuid][repo_uuid]["installed_instance"] = module_id - backups[module_uuid][repo_uuid]["installed_instance_ui_name"] = rdb.get(f"module/{module_id}/ui_name") or "" - backups[module_uuid][repo_uuid]["module_id"] = module_id - backups[module_uuid][repo_uuid]["module_ui_name"] = backups[module_uuid][repo_uuid]["installed_instance_ui_name"] - -for repo_uuid in repositories: - rclone_cat_cmd = ['rclone-wrapper', repo_uuid, - '--include=*.json', - '--max-depth=2', - 'cat', - 'REMOTE_PATH', - ] - proc_cat = subprocess.Popen(rclone_cat_cmd, **popen_args) - for cat_slice in proc_cat.stdout.readlines(): - try: - ometa = json.loads(cat_slice.strip()) - except: - ometa = {} +# Second level: parallelize webdav_read_json requests +with cluster.backup.TimeoutSession(timeout=(3, 10), auth=node_http_auth) as rses: + with ThreadPoolExecutor() as pool: + futures = {} + for restic_uuid, repo_uuid, webdav_url, json_path, unix_timestamp in meta_tasks: + f = pool.submit(fetch_meta, rses, webdav_url, json_path) + futures[f] = (restic_uuid, repo_uuid, unix_timestamp) + for future in as_completed(futures): + restic_uuid, repo_uuid, unix_timestamp = futures[future] + ometa = future.result() + if ometa: + backups[restic_uuid][repo_uuid]["module_id"] = ometa.get("module_id", "") + backups[restic_uuid][repo_uuid]["module_ui_name"] = ometa.get("module_ui_name", "") + 
backups[restic_uuid][repo_uuid]["node_fqdn"] = ometa.get("node_fqdn", "") + backups[restic_uuid][repo_uuid]["timestamp"] = ometa.get("timestamp", unix_timestamp) + backups[restic_uuid][repo_uuid]["is_generated_locally"] = cluster_uuid == ometa.get("cluster_uuid") - if ometa.get('uuid') in backups: - module_uuid = ometa["uuid"] - backups[module_uuid][repo_uuid]["module_id"] = ometa.get("module_id", "") - backups[module_uuid][repo_uuid]["module_ui_name"] = ometa.get("module_ui_name", "") - backups[module_uuid][repo_uuid]["node_fqdn"] = ometa.get("node_fqdn", "") - backups[module_uuid][repo_uuid]["timestamp"] = ometa.get("timestamp", unix_timestamp) - if "cluster_uuid" in ometa and not backups[module_uuid][repo_uuid].get("is_generated_locally"): - backups[module_uuid][repo_uuid]["is_generated_locally"] = cluster_uuid == ometa["cluster_uuid"] +# Enrich with locally installed instance info +for restic_uuid in backups: + if restic_uuid not in module_uuids: + continue + module_id = module_uuids[restic_uuid] + for repo_uuid in backups[restic_uuid]: + backups[restic_uuid][repo_uuid]["is_generated_locally"] = True + backups[restic_uuid][repo_uuid]["installed_instance"] = module_id + backups[restic_uuid][repo_uuid]["installed_instance_ui_name"] = rdb.get(f"module/{module_id}/ui_name") or "" + backups[restic_uuid][repo_uuid].setdefault("module_id", module_id) + backups[restic_uuid][repo_uuid].setdefault("module_ui_name", backups[restic_uuid][repo_uuid]["installed_instance_ui_name"]) json.dump([restic_repo for xrepo in backups.values() for restic_repo in xrepo.values()], fp=sys.stdout) From a66aa85275e82f0628bc7207f84b82c4ea1fb08e Mon Sep 17 00:00:00 2001 From: Davide Principi Date: Thu, 16 Apr 2026 17:02:26 +0200 Subject: [PATCH 19/27] fix(backup): import destinations properly Store rclone_conf and restic_password in private keys instead of saving the raw destination object. Handle dump version to generate rclone_conf from legacy v3 format. 
Use a pipeline for atomic writes and publish backup-destination-changed event. Assisted-by: copilot:claude-sonnet-4.6 --- .../10import_backup_destinations | 32 ++++++++++++++++--- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/import-backup-destinations/10import_backup_destinations b/core/imageroot/var/lib/nethserver/cluster/actions/import-backup-destinations/10import_backup_destinations index f2cdf7a43..e6224e9b4 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/import-backup-destinations/10import_backup_destinations +++ b/core/imageroot/var/lib/nethserver/cluster/actions/import-backup-destinations/10import_backup_destinations @@ -12,14 +12,20 @@ import gzip import subprocess import sys import agent +import cluster.backup def import_destinations(cluster_backup): """Import backup destination configurations into Redis""" rdb = agent.redis_connect(privileged=True) - imported_count = 0 + imported_ids = [] skipped_count = 0 local_cluster_uuid = rdb.get("cluster/uuid") or "unknown_local" import_cluster_uuid = cluster_backup["cluster"].get("uuid", "unknown_import") + try: + dump_version = int(cluster_backup["version"]) + except (KeyError, ValueError): + dump_version = 3 + trx = rdb.pipeline() for dkey, odest in cluster_backup["cluster"]["backup_repository"].items(): # Check if the destination UUID key exists. Historically, a backup # destination was named "backup repository". 
@@ -30,10 +36,26 @@ def import_destinations(cluster_backup): skipped_count += 1 continue print("Importing backup destination", dkey, odest["name"], file=sys.stderr) - # FIXME: validate configuration before storing odest in Redis - rdb.hset(f"cluster/backup_repository/{dkey}", mapping=odest) - imported_count += 1 - return imported_count, skipped_count + if dump_version > 3: + rclone_conf = odest["rclone_conf"] + restic_password = odest["restic_password"] + else: + rclone_conf = cluster.backup.generate_rclone_conf(dkey, odest["url"], odest["provider"], odest) + restic_password = odest["password"] + trx.hset(f"cluster/backup_repository/{dkey}", mapping={ + "provider": odest["provider"], + "url": odest["url"], + "name": odest["name"], + }) + trx.hset("private/agents/backup_destination/restic_password", dkey, restic_password) + trx.hset("private/nodes/backup_destination/rclone_conf", dkey, rclone_conf) + imported_ids.append(dkey) + if imported_ids: + trx.publish("cluster/event/backup-destination-changed", json.dumps({ + "destination_ids": imported_ids, + })) + trx.execute() + return len(imported_ids), skipped_count def main(): request = json.load(sys.stdin) From e5eb815b9927973fc054daa409ee1391bb3af25b Mon Sep 17 00:00:00 2001 From: Davide Principi Date: Thu, 16 Apr 2026 17:02:36 +0200 Subject: [PATCH 20/27] refactor(backup): use destination_ids list in event Change backup-destination-changed event payload from destination_id (string) to destination_ids (list) to support bulk imports. 
Assisted-by: copilot:claude-sonnet-4.6 --- .../nethserver/cluster/actions/add-backup-repository/50update | 2 +- .../cluster/actions/alter-backup-repository/50update | 2 +- .../remove-backup-repository/50remove_backup_repository | 2 +- .../cluster/update-core-pre-modules.d/50update_grants | 2 +- docs/core/events.md | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/add-backup-repository/50update b/core/imageroot/var/lib/nethserver/cluster/actions/add-backup-repository/50update index d9ea319ff..97592e9c4 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/add-backup-repository/50update +++ b/core/imageroot/var/lib/nethserver/cluster/actions/add-backup-repository/50update @@ -86,7 +86,7 @@ def main(): trx.delete('cluster/backup_repository/' + destination_uuid) trx.hset('cluster/backup_repository/' + destination_uuid, mapping=public_data) trx.publish(f"cluster/event/backup-destination-changed", json.dumps({ - "destination_id": destination_uuid, + "destination_ids": [destination_uuid], })) if not trx.execute(): sys.exit(1) diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/alter-backup-repository/50update b/core/imageroot/var/lib/nethserver/cluster/actions/alter-backup-repository/50update index c6f07b767..9d1eaa138 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/alter-backup-repository/50update +++ b/core/imageroot/var/lib/nethserver/cluster/actions/alter-backup-repository/50update @@ -78,7 +78,7 @@ def main(): trx.delete(f'cluster/backup_repository/{destination_uuid}') trx.hset(f'cluster/backup_repository/{destination_uuid}', mapping=public_data) trx.publish(f"cluster/event/backup-destination-changed", json.dumps({ - "destination_id": destination_uuid, + "destination_ids": [destination_uuid], })) if not trx.execute(): sys.exit(1) diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/remove-backup-repository/50remove_backup_repository 
b/core/imageroot/var/lib/nethserver/cluster/actions/remove-backup-repository/50remove_backup_repository index 604c12bc3..0dc6ac5f8 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/remove-backup-repository/50remove_backup_repository +++ b/core/imageroot/var/lib/nethserver/cluster/actions/remove-backup-repository/50remove_backup_repository @@ -62,6 +62,6 @@ trx.delete(f"cluster/backup_repository/{rid}") trx.hdel('private/nodes/backup_destination/rclone_conf', rid) trx.hdel('private/agents/backup_destination/restic_password', rid) trx.publish(f"cluster/event/backup-destination-changed", json.dumps({ - "destination_id": rid, + "destination_ids": [rid], })) trx.execute() diff --git a/core/imageroot/var/lib/nethserver/cluster/update-core-pre-modules.d/50update_grants b/core/imageroot/var/lib/nethserver/cluster/update-core-pre-modules.d/50update_grants index d379f280d..5969889f4 100755 --- a/core/imageroot/var/lib/nethserver/cluster/update-core-pre-modules.d/50update_grants +++ b/core/imageroot/var/lib/nethserver/cluster/update-core-pre-modules.d/50update_grants @@ -119,7 +119,7 @@ if len(trx_changes) > 0: # At least one destination was migrated: trigger rclone-webdav # restart on all nodes: rdb.publish(f"cluster/event/backup-destination-changed", json.dumps({ - "destination_id": destination_uuid, + "destination_ids": [destination_uuid], })) # Migrate backup schedules: populate instances field from module/*/backups SET keys module_node = rdb.hgetall("cluster/module_node") diff --git a/docs/core/events.md b/docs/core/events.md index ec4a6cd1e..44a206db5 100644 --- a/docs/core/events.md +++ b/docs/core/events.md @@ -45,8 +45,8 @@ Events fired by the `cluster` agent (i.e. channel is `cluster/event/ - `leader-changed`: a node was promoted to leader. The `node_id` attribute indicates the new leader, and `endpoint` its public Wireguard VPN endpoint address - `backup-destination-changed`: a backup destination was - added/altered/removed. 
See `destination_id` event argument to identify - it. + added/altered/removed. JSON parameter format is + `{"destination_ids": LIST[STRING]}`. ## Node events From a92b33a94c2854069ce226c38493a56601654183 Mon Sep 17 00:00:00 2001 From: Davide Principi Date: Thu, 16 Apr 2026 17:39:26 +0200 Subject: [PATCH 21/27] refactor(backup): clean up read-backup-snapshots Use fromisoformat for timestamp parsing. Switch to returncode check to avoid stacktrace noise in logs. Use privileged Redis connection for private/* keys. Assisted-by: copilot:claude-sonnet-4.6 --- .../actions/read-backup-snapshots/50read | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/read-backup-snapshots/50read b/core/imageroot/var/lib/nethserver/cluster/actions/read-backup-snapshots/50read index 7c0df3bf4..bac9c1b5b 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/read-backup-snapshots/50read +++ b/core/imageroot/var/lib/nethserver/cluster/actions/read-backup-snapshots/50read @@ -25,7 +25,7 @@ import sys import json import agent import subprocess -from datetime import datetime, timezone +from datetime import datetime request = json.load(sys.stdin) @@ -33,16 +33,16 @@ request = json.load(sys.stdin) repository = request['repository'] path = request['path'] -rdb = agent.redis_connect() +# Privileged connection for access to private/* keys: +rdb = agent.redis_connect(privileged=True) -snapshots = [] -pret = agent.run_restic(rdb, repository, path, ["--workdir=/srv"], ["snapshots", "--json"], text=True, encoding='utf-8', stdout=subprocess.PIPE, check=True) -for el in json.loads(pret.stdout): - # transform time like "2022-02-05T05:00:13.028068843Z" to a UTC timestamp - sec_index = el['time'].rfind('.') - # remove seconds and timezone before parsing - time = datetime.strptime(el['time'][0:sec_index],'%Y-%m-%dT%H:%M:%S') - time = int(time.replace(tzinfo=timezone.utc).timestamp()) - snapshots.append({'timestamp': 
time, 'id': el['id']}) - -print(json.dumps(snapshots)) +pret = agent.run_restic(rdb, repository, path, ["--workdir=/srv"], ["snapshots", "--json"], text=True, encoding='utf-8', stdout=subprocess.PIPE) +if pret.returncode == 0: + snapshots = [] + for el in json.loads(pret.stdout): + time = int(datetime.fromisoformat(el['time']).timestamp()) + snapshots.append({'timestamp': time, 'id': el['id']}) + print(json.dumps(snapshots)) +else: + # Restic error message is sent through stderr as-is (in JSON format). + sys.exit(1) From 345f4ca90b327aaaa034fc9f99b0b7c39d4e9db5 Mon Sep 17 00:00:00 2001 From: Davide Principi Date: Fri, 17 Apr 2026 15:46:45 +0200 Subject: [PATCH 22/27] fix(rclone-gateway): missing haproxy dir on first setup Alias unit rclone-webdav.service is considered running during core update, hence service startup fails for a missing haproxy/ dir. Create haproxy/ dir and ensure rclone-webdav.service is properly stopped, waiting next acl-changed event handler to start it again. --- .../lib/nethserver/node/update-core.d/20restart_webdav | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/core/imageroot/var/lib/nethserver/node/update-core.d/20restart_webdav b/core/imageroot/var/lib/nethserver/node/update-core.d/20restart_webdav index 119ec6f20..a80ca4f31 100755 --- a/core/imageroot/var/lib/nethserver/node/update-core.d/20restart_webdav +++ b/core/imageroot/var/lib/nethserver/node/update-core.d/20restart_webdav @@ -12,12 +12,11 @@ if [[ -d rclone ]] ; then systemctl restart rclone-gateway.service else # Run once this procedure: - mkdir -vp rclone + mkdir -vp rclone haproxy # Replace rclone-webdav with the gateway service: - systemctl disable --now rclone-webdav.service + systemctl stop rclone-webdav.service rm -vf /etc/systemd/system/rclone-webdav.service \ /etc/systemd/system/default.target.wants/rclone-webdav.service - systemctl enable rclone-gateway.service # Remove old per-module backup timer/service units (rootfull) rm -vf 
/etc/systemd/system/backup[0-9]*-*.timer /etc/systemd/system/backup[0-9]*-*.service @@ -31,4 +30,8 @@ else "${homedir}/.config/systemd/user"/backup[0-9]*.service systemctl --user -M "${username}@" daemon-reload done < /etc/passwd + + # Enable the service, but start it later by acl-changed event handler, + # ensuring that new Redis ACLs are present. + systemctl enable rclone-gateway.service fi From 4f9a86285310911809c9105937f2a2332a970464 Mon Sep 17 00:00:00 2001 From: Davide Principi Date: Mon, 20 Apr 2026 18:52:45 +0200 Subject: [PATCH 23/27] feat(backup): add rclone provider, hide secrets Add a new "rclone" backup provider type that accepts raw rclone.conf content, either as plain text or base64-encoded. The configuration is normalized with a canonical section header and validated through configparser before use. Both add-backup-repository and alter-backup-repository actions handle the new provider with proper error handling and URL generation compatible with extract_rclone_basepath(). Hide secrets from list-backup-repositories output by adding a hide_secrets flag to parse_rclone_params(). The restic password is no longer returned either. Rename the rclone_conf field to rclone_conf_secret in the validate-backup-destination action input schema to follow the naming convention for sensitive data. Allow alter-backup-repository to preserve existing secrets when the client sends an empty string: the action reads the current value from Redis as fallback, for all provider types. Store a basepath field in each backup repository Redis HASH to support rclone provider paths that are not encoded in the url. 
Assisted-by: copilot:claude-sonnet-4.6 --- .../usr/local/agent/pypkg/cluster/backup.py | 141 +++++++++++++++--- .../actions/add-backup-repository/50update | 17 ++- .../add-backup-repository/validate-input.json | 50 +++++++ .../actions/alter-backup-repository/50update | 26 +++- .../validate-input.json | 54 ++++++- .../10import_backup_destinations | 3 + .../actions/list-backup-repositories/50list | 5 +- .../validate-output.json | 4 +- .../update-core-pre-modules.d/50update_grants | 1 + .../50validate_backup_destination | 6 +- .../validate-input.json | 6 +- 11 files changed, 276 insertions(+), 37 deletions(-) diff --git a/core/imageroot/usr/local/agent/pypkg/cluster/backup.py b/core/imageroot/usr/local/agent/pypkg/cluster/backup.py index 7cce67ba1..c09bdedf1 100644 --- a/core/imageroot/usr/local/agent/pypkg/cluster/backup.py +++ b/core/imageroot/usr/local/agent/pypkg/cluster/backup.py @@ -27,6 +27,7 @@ import time import agent import configparser +import io import tempfile import base64 from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes @@ -89,21 +90,94 @@ def extract_rclone_basepath(url): upath = urllib.parse.urlparse(upath).path return upath -def generate_rclone_conf(dest_uuid, url, provider, params): +def sanitize_rclone_conf(destination_uuid, params): + """Return the rclone_conf string and its UI-url value from the given + input parameters. 
This function may raise decode and parse errors.""" + + if params['rclone_conf_secret'] is None: + b64payload = params['rclone_conf_b64_secret'] + rclone_conf = base64.b64decode(b64payload).decode('utf-8') + else: + rclone_conf = params['rclone_conf_secret'] + + rclone_conf, cfg = _normalize_rclone_conf(destination_uuid, rclone_conf) + + if not cfg: + raise Exception("Rclone configuration is empty") + + path = params.get('basepath', '').strip('/') + for host_field in ['host', 'endpoint']: + if host_field in cfg: + try: + parsed = urllib.parse.urlparse(cfg[host_field]) + hostname = (parsed.hostname or parsed.path).rstrip('/') + break + except Exception as ex: + print(ex, file=sys.stderr) + else: + # Fallback to a configuration hash + hostname = stable_uuid_v5(cfg)[0:5] + + if 'type' not in cfg: + raise Exception("Rclone configuration is missing the 'type' field") + + # Build the url as a rclone_conf human-readable identifier + url = f"rclone:{cfg['type']}:{hostname}:{path}" + return rclone_conf, url + +def _normalize_rclone_conf(destination_uuid, rclone_conf): + """Parse an rclone.conf string, ensure it has a single section header + named after destination_uuid, and return the normalized config text along + with a dict of its key-value pairs.""" + + cp = configparser.ConfigParser() + + # Make sure a section header exists and is set to destination_uuid: + try: + cp.read_string(rclone_conf) + old_section = cp.sections()[0] + if old_section != destination_uuid: + cp[destination_uuid] = cp[old_section] + cp.remove_section(old_section) + except configparser.MissingSectionHeaderError: + cp.read_string(f"[{destination_uuid}]\n" + rclone_conf) + + cfg = dict(cp[destination_uuid]) + buf = io.StringIO() + cp.write(buf) + rclone_conf = buf.getvalue() + + return rclone_conf, cfg + +def generate_rclone_conf(dest_uuid, url, provider, params, fallback_conf=None): """Translate the input arguments in a rclone.conf-compatible string""" + + if fallback_conf: + _, fbcfg = 
_normalize_rclone_conf(dest_uuid, fallback_conf) + else: + fbcfg = {} + uschema, upath = url.split(':', 1) if uschema == 'b2': + if params['b2_account_key'] == "": + secret = fbcfg.get('key', '') + else: + secret = params['b2_account_key'] rclone_conf = ( f"type = b2\n" f"account = {params['b2_account_id']}\n" - f"key = {params['b2_account_key']}\n" + f"key = {secret}\n" ) elif uschema == 's3': + if params['aws_secret_access_key'] == "": + secret = fbcfg.get('secret_access_key', '') + else: + secret = params['aws_secret_access_key'] rclone_conf = ( f"type = s3\n" f"env_auth = true\n" f"access_key_id = {params['aws_access_key_id']}\n" - f"secret_access_key = {params['aws_secret_access_key']}\n" + f"secret_access_key = {secret}\n" ) s3_endpoint = "" if '/' in upath: @@ -141,13 +215,20 @@ def generate_rclone_conf(dest_uuid, url, provider, params): else: rclone_conf += f"provider = Other\n" elif uschema == 'azure': + if params['azure_account_key'] == "": + secret = fbcfg.get('key', '') + else: + secret = params['azure_account_key'] rclone_conf = ( f"type = azureblob\n" f"account = {params['azure_account_name']}\n" - f"key = {params['azure_account_key']}\n" + f"key = {secret}\n" ) elif uschema == 'smb': - obscured_pass = rclone_obscure(params['smb_pass']) + if params['smb_pass'] == "": + obscured_pass = fbcfg.get('pass', '') + else: + obscured_pass = rclone_obscure(params['smb_pass']) rclone_conf = ( f"type = smb\n" f"host = {params['smb_host']}\n" @@ -174,7 +255,7 @@ def stable_uuid_v5(data: dict) -> str: ).encode("utf-8") return str(uuid.uuid5(uuid.NAMESPACE_URL, payload.decode("utf-8"))) -def parse_rclone_params(rclone_conf): +def parse_rclone_params(rclone_conf, hide_secrets=False): """Parse a rclone remote configuration string and return a flat dictionary.
The returned dictionary contains the raw rclone keys from the first @@ -186,25 +267,45 @@ def parse_rclone_params(rclone_conf): cp = configparser.ConfigParser() cp.read_string(rclone_conf) section = cp.sections()[0] - result = dict(cp[section]) + dsection = dict(cp[section]) - rtype = result.get('type', '') + rtype = dsection.get('type', '') + result = { + "type": rtype, + } if rtype == 'b2': - result['b2_account_id'] = result.get('account', '') - result['b2_account_key'] = result.pop('key', '') + result['b2_account_id'] = dsection.get('account', '') + if hide_secrets: + result['b2_account_key'] = "" + else: + result['b2_account_key'] = dsection.get('key', '') elif rtype == 's3': - result['aws_access_key_id'] = result.get('access_key_id', '') - result['aws_secret_access_key'] = result.get('secret_access_key', '') - if 'region' in result: - result['aws_default_region'] = result['region'] + result['aws_access_key_id'] = dsection.get('access_key_id', '') + if hide_secrets: + result['aws_secret_access_key'] = "" + else: + result['aws_secret_access_key'] = dsection.get('secret_access_key', '') + if 'region' in dsection: + result['aws_default_region'] = dsection['region'] elif rtype == 'azureblob': - result['azure_account_name'] = result.get('account', '') - result['azure_account_key'] = result.pop('key', '') + result['azure_account_name'] = dsection.get('account', '') + if hide_secrets: + result['azure_account_key'] = "" + else: + result['azure_account_key'] = dsection.get('key', '') elif rtype == 'smb': - result['smb_host'] = result.get('host', '') - result['smb_user'] = result.get('user', '') - result['smb_pass'] = rclone_reveal(result.pop('pass')) - result['smb_domain'] = result.get('domain', '') + result['smb_host'] = dsection.get('host', '') + result['smb_user'] = dsection.get('user', '') + if hide_secrets: + result['smb_pass'] = "" + else: + result['smb_pass'] = rclone_reveal(dsection.get('pass')) + result['smb_domain'] = dsection.get('domain', '') + else: + if 
hide_secrets: + result = {k: v for k,v in dsection.items() if k in ['type']} + else: + result = dsection return result diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/add-backup-repository/50update b/core/imageroot/var/lib/nethserver/cluster/actions/add-backup-repository/50update index 97592e9c4..3f4c912ef 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/add-backup-repository/50update +++ b/core/imageroot/var/lib/nethserver/cluster/actions/add-backup-repository/50update @@ -44,11 +44,24 @@ def main(): name = request['name'] else: name = cluster.backup.get_default_backup_repository_name(provider, url, destination_uuid) - public_data = {'url': url, 'name': name, 'provider': provider} + public_data = {'url': url, 'name': name, 'provider': provider, 'basepath': ''} if provider == 'cluster': rclone_conf = f"[{destination_uuid}]\ntype=local\n" destination_basepath = '/srv/repo' validation_nodes = set(cluster.backup.lookup_node_from_webdav_url(rdb, url) or '0') + elif provider == 'rclone': + # Ignore url from input, and derive its value from request parameters: + try: + rclone_conf, url = cluster.backup.sanitize_rclone_conf(destination_uuid, request['parameters']) + except Exception as ex: + print(ex, file=sys.stderr) + agent.set_status('validation-failed') + json.dump([{'field':'parameters','parameter':'parameters','value':'***','error':'rclone_conf_error'}], fp=sys.stdout) + sys.exit(2) + destination_basepath = request['parameters'].get('basepath', '') + public_data['basepath'] = destination_basepath + public_data['url'] = url + validation_nodes = set(rdb.hvals('cluster/module_node')) else: rclone_conf = cluster.backup.generate_rclone_conf(destination_uuid, url, provider, request['parameters']) destination_basepath = cluster.backup.extract_rclone_basepath(url) @@ -63,7 +76,7 @@ def main(): "data": { "id": destination_uuid, "basepath": destination_basepath, - "rclone_conf": rclone_conf, + "rclone_conf_secret": rclone_conf, }, }) 
node_task_results = agent.tasks.runp( diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/add-backup-repository/validate-input.json b/core/imageroot/var/lib/nethserver/cluster/actions/add-backup-repository/validate-input.json index c35cde7fa..051107994 100644 --- a/core/imageroot/var/lib/nethserver/cluster/actions/add-backup-repository/validate-input.json +++ b/core/imageroot/var/lib/nethserver/cluster/actions/add-backup-repository/validate-input.json @@ -53,6 +53,16 @@ "password": "", "url": "webdav:http://10.5.4.2:4694", "parameters": {} + }, + { + "name": "destination 7", + "provider": "rclone", + "password": "", + "url": "", + "parameters": { + "rclone_conf_secret": "type = s3\naccess_key_id = 45dc5e09346648fa9fed2a0267f58708\nsecret_access_key = db6b04c924f249fdbd86938b19959be0\nendpoint = s3.example.org\nprovider = Other\n", + "basepath": "mybucket/subfolder" + } } ], "type": "object", @@ -89,6 +99,22 @@ "parameters" ], "anyOf": [ + { + "allOf": [ + { + "title": "Rclone schema", + "properties": { + "provider": { + "title": "Raw Rclone Config provider", + "const": "rclone" + }, + "parameters": { + "$ref": "#/$defs/rclone_parameters" + } + } + } + ] + }, { "allOf": [ { @@ -180,6 +206,30 @@ } ], "$defs": { + "rclone_parameters": { + "title": "Rclone raw configuration", + "type": "object", + "required": [ + "rclone_conf_secret" + ], + "additionalProperties": false, + "properties": { + "rclone_conf_secret": { + "type": ["null", "string"], + "maxLength": 100000, + "description": "Basic INI-like string validation", + "pattern": "^(\\[[^\\]]+\\]\\n)?(\\s*([A-Za-z_][A-Za-z0-9_]*)\\s*=\\s*[^\\n]*(\\n|$))*$" + }, + "rclone_conf_b64_secret": { + "type": "string", + "maxLength": 100000 + }, + "basepath": { + "type": "string", + "maxLength": 512 + } + } + }, "b2_parameters": { "title": "B2 (Backblaze) protocol parameters", "type": "object", diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/alter-backup-repository/50update 
b/core/imageroot/var/lib/nethserver/cluster/actions/alter-backup-repository/50update index 9d1eaa138..a22f1d352 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/alter-backup-repository/50update +++ b/core/imageroot/var/lib/nethserver/cluster/actions/alter-backup-repository/50update @@ -36,8 +36,22 @@ def main(): rclone_conf = f"[{destination_uuid}]\ntype=local\n" destination_basepath = '/srv/repo' validation_nodes = set(cluster.backup.lookup_node_from_webdav_url(rdb, url) or '0') + elif provider == 'rclone': + if parameters.get('rclone_conf_secret') == "": + # Configuration Unchanged, replace empty string with the full value read from DB: + parameters['rclone_conf_secret'] = rdb.hget('private/nodes/backup_destination/rclone_conf', destination_uuid) + try: + rclone_conf, url = cluster.backup.sanitize_rclone_conf(destination_uuid, parameters) + except Exception as ex: + print(ex, file=sys.stderr) + agent.set_status('validation-failed') + json.dump([{'field':'parameters','parameter':'parameters','value':'***','error':'rclone_conf_error'}], fp=sys.stdout) + sys.exit(2) + destination_basepath = parameters.get('basepath', rdb.hget(f'cluster/backup_repository/{destination_uuid}', 'basepath') or '') + validation_nodes = set(rdb.hvals('cluster/module_node')) else: - rclone_conf = cluster.backup.generate_rclone_conf(destination_uuid, url, provider, parameters) + fallback_conf = rdb.hget('private/nodes/backup_destination/rclone_conf', destination_uuid) + rclone_conf = cluster.backup.generate_rclone_conf(destination_uuid, url, provider, parameters, fallback_conf=fallback_conf) destination_basepath = cluster.backup.extract_rclone_basepath(url) validation_nodes = set(rdb.hvals('cluster/module_node')) # Prepare a list of tasks, one for each node, to run in parallel: @@ -50,7 +64,7 @@ def main(): "data": { "id": destination_uuid, "basepath": destination_basepath, - "rclone_conf": rclone_conf, + "rclone_conf_secret": rclone_conf, }, }) node_task_results = 
agent.tasks.runp( @@ -71,7 +85,13 @@ def main(): if not rname: rname = cluster.backup.get_default_backup_repository_name(provider, url, destination_uuid) - public_data = {'url': url, 'name': rname, 'provider': request['provider']} + if provider != 'rclone': + # Only rclone provider stores basepath in Redis HASH. Other + # providers derive its value from the url field and the Redis + # field is kept as an empty string. + destination_basepath = '' + + public_data = {'url': url, 'name': rname, 'provider': provider, 'basepath': destination_basepath} trx = rdb.pipeline() trx.hdel('private/nodes/backup_destination/rclone_conf', destination_uuid) trx.hset('private/nodes/backup_destination/rclone_conf', destination_uuid, rclone_conf) diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/alter-backup-repository/validate-input.json b/core/imageroot/var/lib/nethserver/cluster/actions/alter-backup-repository/validate-input.json index ca52d6973..1b2fb94ef 100644 --- a/core/imageroot/var/lib/nethserver/cluster/actions/alter-backup-repository/validate-input.json +++ b/core/imageroot/var/lib/nethserver/cluster/actions/alter-backup-repository/validate-input.json @@ -16,9 +16,24 @@ { "name": "repository 6", "provider": "cluster", - "password": "", "url": "webdav:http://10.5.4.2:4694", "parameters": {} + }, + { + "name": "destination 7", + "provider": "rclone", + "parameters": { + "rclone_conf_secret": "type = s3\naccess_key_id = 45dc5e09346648fa9fed2a0267f58708\nsecret_access_key = db6b04c924f249fdbd86938b19959be0\nendpoint = s3.example.org\nprovider = Other\n", + "basepath": "mybucket/subfolder" + } + }, + { + "name": "destination 7 - change path", + "provider": "rclone", + "parameters": { + "rclone_conf_secret": "", + "basepath": "mybucket/subfolder2" + } } ], "type": "object", @@ -47,6 +62,22 @@ "provider" ], "anyOf": [ + { + "allOf": [ + { + "title": "Rclone schema", + "properties": { + "provider": { + "title": "Raw Rclone Config provider", + "const": "rclone" + }, + 
"parameters": { + "$ref": "#/$defs/rclone_parameters" + } + } + } + ] + }, { "allOf": [ { @@ -135,6 +166,27 @@ } ], "$defs": { + "rclone_parameters": { + "title": "Rclone raw configuration", + "type": "object", + "additionalProperties": false, + "properties": { + "rclone_conf_secret": { + "type": ["null", "string"], + "maxLength": 100000, + "description": "Basic INI-like string validation", + "pattern": "^(\\[[^\\]]+\\]\\n)?(\\s*([A-Za-z_][A-Za-z0-9_]*)\\s*=\\s*[^\\n]*\\n)*$" + }, + "rclone_conf_b64_secret": { + "type": "string", + "maxLength": 100000 + }, + "basepath": { + "type": "string", + "maxLength": 512 + } + } + }, "b2_parameters": { "title": "B2 (Backblaze) protocol parameters", "type": "object", diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/import-backup-destinations/10import_backup_destinations b/core/imageroot/var/lib/nethserver/cluster/actions/import-backup-destinations/10import_backup_destinations index e6224e9b4..23fee2685 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/import-backup-destinations/10import_backup_destinations +++ b/core/imageroot/var/lib/nethserver/cluster/actions/import-backup-destinations/10import_backup_destinations @@ -39,13 +39,16 @@ def import_destinations(cluster_backup): if dump_version > 3: rclone_conf = odest["rclone_conf"] restic_password = odest["restic_password"] + destination_basepath = odest["basepath"] else: rclone_conf = cluster.backup.generate_rclone_conf(dkey, odest["url"], odest["provider"], odest) restic_password = odest["password"] + destination_basepath = "" # value ignored, it's derived from url at runtime trx.hset(f"cluster/backup_repository/{dkey}", mapping={ "provider": odest["provider"], "url": odest["url"], "name": odest["name"], + "basepath": destination_basepath, }) trx.hset("private/agents/backup_destination/restic_password", dkey, restic_password) trx.hset("private/nodes/backup_destination/rclone_conf", dkey, rclone_conf) diff --git 
a/core/imageroot/var/lib/nethserver/cluster/actions/list-backup-repositories/50list b/core/imageroot/var/lib/nethserver/cluster/actions/list-backup-repositories/50list index e7f080c58..39954f282 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/list-backup-repositories/50list +++ b/core/imageroot/var/lib/nethserver/cluster/actions/list-backup-repositories/50list @@ -33,14 +33,13 @@ password_exists = os.path.isfile('backup/passphrase') and os.stat('backup/passph for repoid, rclone_conf in rdb.hgetall('private/nodes/backup_destination/rclone_conf').items(): parameters = rdb.hgetall(f'cluster/backup_repository/{repoid}') - restic_password = rdb.hget(f'private/agents/backup_destination/restic_password', repoid) or "" orepo = { "id": repoid, "provider": parameters.pop('provider'), "name": parameters.pop('name', ''), "url": parameters.pop('url'), - "password": restic_password, - "parameters": cluster.backup.parse_rclone_params(rclone_conf), + "password": "", # restic password is hidden + "parameters": cluster.backup.parse_rclone_params(rclone_conf, hide_secrets=True), } brepos.append(orepo) diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/list-backup-repositories/validate-output.json b/core/imageroot/var/lib/nethserver/cluster/actions/list-backup-repositories/validate-output.json index c716c5fcf..b40feabe7 100644 --- a/core/imageroot/var/lib/nethserver/cluster/actions/list-backup-repositories/validate-output.json +++ b/core/imageroot/var/lib/nethserver/cluster/actions/list-backup-repositories/validate-output.json @@ -11,10 +11,10 @@ "provider": "backblaze", "name": "BackBlaze repo1", "url": "b2:backupex1", - "password": "d59a90ec7ad2b2967257a7a308c82c96ac006efd138254bc1e58c8ea07c18400", + "password": "", "parameters": { "b2_account_id": "xxxxxxxxxxxxxx", - "b2_account_key": "yyyyyyyyyyyyyyyyyyyyyy" + "b2_account_key": "" } } ], diff --git a/core/imageroot/var/lib/nethserver/cluster/update-core-pre-modules.d/50update_grants 
b/core/imageroot/var/lib/nethserver/cluster/update-core-pre-modules.d/50update_grants index 5969889f4..f65360c38 100755 --- a/core/imageroot/var/lib/nethserver/cluster/update-core-pre-modules.d/50update_grants +++ b/core/imageroot/var/lib/nethserver/cluster/update-core-pre-modules.d/50update_grants @@ -109,6 +109,7 @@ if len(trx_changes) > 0: "name": odest.get('name', destination_uuid), "url": odest['url'], "provider": odest['provider'], + "basepath": "", }) mgx.execute() except Exception as ex: diff --git a/core/imageroot/var/lib/nethserver/node/actions/validate-backup-destination/50validate_backup_destination b/core/imageroot/var/lib/nethserver/node/actions/validate-backup-destination/50validate_backup_destination index 74a41ada2..5752f3d31 100755 --- a/core/imageroot/var/lib/nethserver/node/actions/validate-backup-destination/50validate_backup_destination +++ b/core/imageroot/var/lib/nethserver/node/actions/validate-backup-destination/50validate_backup_destination @@ -39,12 +39,12 @@ def run_validation(request): rclone_args.append("-vvvv") rpath = request['id'] + ':' + request['basepath'] # Test if connection and read-access is successfull - cluster.backup.run_rclone(['lsd', rpath] + rclone_args, temp_config=request['rclone_conf'], stdout=subprocess.DEVNULL, check=True) + cluster.backup.run_rclone(['lsd', rpath] + rclone_args, temp_config=request['rclone_conf_secret'], stdout=subprocess.DEVNULL, check=True) # Test write access, by creating and deleting a directory remote_test_dir = rpath + '/' + os.environ["AGENT_TASK_ID"] + '.tmp' - cluster.backup.run_rclone(['mkdir', remote_test_dir] + rclone_args, temp_config=request['rclone_conf'], stdout=subprocess.DEVNULL, check=True) - cluster.backup.run_rclone(['rmdir', remote_test_dir] + rclone_args, temp_config=request['rclone_conf'], stdout=subprocess.DEVNULL, check=True) + cluster.backup.run_rclone(['mkdir', remote_test_dir] + rclone_args, temp_config=request['rclone_conf_secret'], stdout=subprocess.DEVNULL, 
check=True) + cluster.backup.run_rclone(['rmdir', remote_test_dir] + rclone_args, temp_config=request['rclone_conf_secret'], stdout=subprocess.DEVNULL, check=True) json.dump({"node_id":int(os.environ["NODE_ID"]),"id": request["id"]}, fp=sys.stdout) if __name__ == "__main__": diff --git a/core/imageroot/var/lib/nethserver/node/actions/validate-backup-destination/validate-input.json b/core/imageroot/var/lib/nethserver/node/actions/validate-backup-destination/validate-input.json index 1b384a378..2c996d253 100644 --- a/core/imageroot/var/lib/nethserver/node/actions/validate-backup-destination/validate-input.json +++ b/core/imageroot/var/lib/nethserver/node/actions/validate-backup-destination/validate-input.json @@ -7,14 +7,14 @@ { "id": "8d3a9df9-f20a-4d4c-9280-718126191d8f", "basepath": "mybucket", - "rclone_conf": "[8d3a9df9-f20a-4d4c-9280-718126191d8f]\n..." + "rclone_conf_secret": "[8d3a9df9-f20a-4d4c-9280-718126191d8f]\n..." } ], "type": "object", "required": [ "id", "basepath", - "rclone_conf" + "rclone_conf_secret" ], "properties": { "id": { @@ -23,7 +23,7 @@ "basepath": { "type": "string" }, - "rclone_conf": { + "rclone_conf_secret": { "type": "string" } } From 8c191b27bccdd952297c06bc036633bf2f658144 Mon Sep 17 00:00:00 2001 From: Davide Principi Date: Tue, 21 Apr 2026 19:24:32 +0200 Subject: [PATCH 24/27] feat(backup): use WebDAV for metadata uploads Replace rclone-wrapper subprocess calls with direct WebDAV HTTP requests for uploading repopath JSON manifests and cluster backup GPG files. Add webdav_write_file() helper to cluster.backup module. Remove both rclone-wrapper scripts (agent bin/ and container bin/), now fully replaced by WebDAV calls. 
Assisted-by: copilot:claude-sonnet-4.6 --- .../usr/local/agent/bin/rclone-wrapper | 38 ----------------- .../usr/local/agent/pypkg/cluster/backup.py | 9 ++++ .../var/lib/nethserver/node/bin/run-backup | 42 ++++++++----------- core/rclone/usr/local/bin/rclone-wrapper | 15 ------- 4 files changed, 27 insertions(+), 77 deletions(-) delete mode 100755 core/imageroot/usr/local/agent/bin/rclone-wrapper delete mode 100755 core/rclone/usr/local/bin/rclone-wrapper diff --git a/core/imageroot/usr/local/agent/bin/rclone-wrapper b/core/imageroot/usr/local/agent/bin/rclone-wrapper deleted file mode 100755 index 737136081..000000000 --- a/core/imageroot/usr/local/agent/bin/rclone-wrapper +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env python3 - -# -# Copyright (C) 2023 Nethesis S.r.l. -# SPDX-License-Identifier: GPL-3.0-or-later -# - -import sys -import agent -import os - -rdb = agent.redis_connect(privileged=False, host="127.0.0.1") -try: - repository_id = sys.argv[1] -except IndexError: - print("Usage:", file=sys.stderr) - print(" rclone-wrapper {REPOSITORY_ID|BACKUP_ID} [rclone args...]", file=sys.stderr) - print("", file=sys.stderr) - print("The string \"REMOTE_PATH\" is replaced with the value defined by REPOSITORY_ID", file=sys.stderr) - print("e.g.: rclone-wrapper 1 ls REMOTE_PATH/dokuwiki", file=sys.stderr) - sys.exit(33) - -if repository_id.isnumeric(): - # Assume a backup ID has been issued. Translate it to a repository ID. 
- repository_id = rdb.hget(f"cluster/backup/{repository_id}", "repository") - -if not rdb.exists(f"cluster/backup_repository/{repository_id}"): - print(f"Could not find any repo with {sys.argv[1]}", file=sys.stderr) - sys.exit(34) - -# Build the Podman+Rclone command line -exec_args = ["podman", "exec", "-i", "rclone-gateway", "rclone"] + sys.argv[2:] - -# Substitute REMOTE_PATH placeholder in Rclone args -rclone_path = f'combined:{repository_id}' -exec_args = [rarg.replace('REMOTE_PATH', rclone_path) for rarg in exec_args] - -os.execvp("podman", exec_args) diff --git a/core/imageroot/usr/local/agent/pypkg/cluster/backup.py b/core/imageroot/usr/local/agent/pypkg/cluster/backup.py index c09bdedf1..db8693749 100644 --- a/core/imageroot/usr/local/agent/pypkg/cluster/backup.py +++ b/core/imageroot/usr/local/agent/pypkg/cluster/backup.py @@ -413,3 +413,12 @@ def webdav_read_json(session, base_url, path): r = session.get(url) r.raise_for_status() return r.json() + +def webdav_write_file(session, base_url, path, data: bytes, content_type="application/octet-stream"): + url = f"{base_url.rstrip('/')}/{path.lstrip('/')}" + # Make sure parent exists: + parent = '/'.join(url.split('/')[:-1]) + session.request("MKCOL", parent) + r = session.put(url, data=data, headers={"Content-Type": content_type}) + r.raise_for_status() + return r.status_code # 201 Created or 204 No Content diff --git a/core/imageroot/var/lib/nethserver/node/bin/run-backup b/core/imageroot/var/lib/nethserver/node/bin/run-backup index ef3900986..d531a1086 100755 --- a/core/imageroot/var/lib/nethserver/node/bin/run-backup +++ b/core/imageroot/var/lib/nethserver/node/bin/run-backup @@ -14,6 +14,7 @@ import sys import time import agent +import cluster.backup EXIT_ALREADY_RUNNING = 3 RETRY_INTERVAL = 60 # seconds between retries @@ -69,7 +70,7 @@ def collect_backup_stats(rdb, repository, repopath, start_time): stats_proc = agent.run_restic(rdb, repository, repopath, [], ["stats", "--json", "latest"], text=True, 
stdout=subprocess.PIPE, check=True) status.update(json.loads(stats_proc.stdout)) except Exception as ex: - print(f"[ERROR] restic stats failed for {repopath}: {ex}", file=sys.stderr) + print(f"[ERROR] restic stats failed for {repository}/{repopath}: {ex}", file=sys.stderr) status["errors"] += 1 status["end"] = int(time.time()) return status @@ -104,9 +105,9 @@ def run_retention(rdb, repository, repopath, retention): except Exception as ex: # Backup is OK, but retention failed. We assume the error # condition is temporary and will be recovered by next backup run. - print(f"[WARNING] retention policy failed for {repopath}: {ex}", file=sys.stderr) + print(f"[WARNING] retention policy failed for {repository}/{repopath}: {ex}", file=sys.stderr) -def update_repopath_json(rdb, backup_id, module_id, repopath, success): +def update_repopath_json(rdb, repository, backup_id, module_id, repopath, success): """Upload {repopath}.json metadata to the remote destination.""" ometa = { "module_id": module_id, @@ -117,18 +118,14 @@ def update_repopath_json(rdb, backup_id, module_id, repopath, success): "timestamp": int(time.time()), "success": success, } + json_path = f"{repository}/{repopath}.json" try: - subprocess.run( - ["rclone-wrapper", str(backup_id), "rcat", f"REMOTE_PATH/{repopath}.json"], - stdout=sys.stderr, - input="\n" + json.dumps(ometa, separators=(",", ":")) + "\n", - text=True, - check=True, - ) - except subprocess.CalledProcessError as ex: - print(f"[ERROR] rclone-wrapper failed for {repopath}.json: {ex}", file=sys.stderr) + with cluster.backup.TimeoutSession(timeout=(3, 10)) as ses: + cluster.backup.webdav_write_file(ses, cluster.backup.GATEWAY_URL, json_path, json.dumps(ometa).encode('utf-8'), 'application/json') + except Exception as ex: + print(f"[ERROR] upload of JSON manifest failed: {ex}", file=sys.stderr) -def upload_cluster_backup(backup_id, cluster_uuid): +def upload_cluster_backup(rdb, repository, backup_id, cluster_uuid): """Generate and upload the 
cluster backup to the remote destination.""" dump_path = "/var/lib/nethserver/cluster/state/backup/dump.json.gz.gpg" if not os.path.isfile(dump_path): @@ -137,18 +134,15 @@ def upload_cluster_backup(backup_id, cluster_uuid): except subprocess.CalledProcessError as ex: print(f"[ERROR] cluster-backup failed: {ex}", file=sys.stderr) return - remote_name = f"cluster-backup-{cluster_uuid}.json.gz.gpg" + remote_name = f"{repository}/cluster-backup-{cluster_uuid}.json.gz.gpg" try: with open(dump_path, "rb") as f: - subprocess.run( - ["rclone-wrapper", str(backup_id), "rcat", f"REMOTE_PATH/{remote_name}"], - stdin=f, - stdout=sys.stderr, - check=True, - ) + gpg_data = f.read() + with cluster.backup.TimeoutSession(timeout=(3, 10)) as ses: + cluster.backup.webdav_write_file(ses, cluster.backup.GATEWAY_URL, remote_name, gpg_data) print(f"Cluster backup uploaded as {remote_name}", file=sys.stderr) - except subprocess.CalledProcessError as ex: - print(f"[ERROR] rclone-wrapper failed for cluster backup: {ex}", file=sys.stderr) + except Exception as ex: + print(f"[ERROR] upload of cluster backup GPG file failed: {ex}", file=sys.stderr) def process_module(rdb, module_id, backup_id, node_id, repository, retention): """Run backup for one module, handle post-backup tasks. 
Returns exit code.""" @@ -170,7 +164,7 @@ def process_module(rdb, module_id, backup_id, node_id, repository, retention): if success: run_retention(rdb, repository, repopath, retention) - update_repopath_json(rdb, backup_id, module_id, repopath, success) + update_repopath_json(rdb, repository, backup_id, module_id, repopath, success) return rc def main(): @@ -205,7 +199,7 @@ def main(): leader_id = rdb.hget("cluster/environment", "NODE_ID") if node_id == leader_id: cluster_uuid = rdb.get("cluster/uuid") or "" - upload_cluster_backup(backup_id, cluster_uuid) + upload_cluster_backup(rdb, repository, backup_id, cluster_uuid) # Keep only modules that belong to this node local_modules = [m for m in instances if module_node.get(m) == node_id] diff --git a/core/rclone/usr/local/bin/rclone-wrapper b/core/rclone/usr/local/bin/rclone-wrapper deleted file mode 100755 index 43478497f..000000000 --- a/core/rclone/usr/local/bin/rclone-wrapper +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/sh - -# -# Copyright (C) 2023 Nethesis S.r.l. -# SPDX-License-Identifier: GPL-3.0-or-later -# - -# Rclone expects the password is in obscured format - -if [ -n "${RCLONE_SMB_PASS}" ]; then - RCLONE_SMB_PASS="$(/usr/bin/rclone obscure "${RCLONE_SMB_PASS}")" - export RCLONE_SMB_PASS -fi - -exec /usr/bin/rclone "${@}" From 1082e2af5e16f853f7c6180f35356b95d43bee05 Mon Sep 17 00:00:00 2001 From: Davide Principi Date: Wed, 6 May 2026 18:13:48 +0200 Subject: [PATCH 25/27] fix(restore-module): never remove Traefik During Traefik restore, the sigterm handler must not trigger the module removal. 
--- .../nethserver/cluster/actions/restore-module/50restore_module | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/restore-module/50restore_module b/core/imageroot/var/lib/nethserver/cluster/actions/restore-module/50restore_module index 2e109d19d..1f32b242d 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/restore-module/50restore_module +++ b/core/imageroot/var/lib/nethserver/cluster/actions/restore-module/50restore_module @@ -129,7 +129,7 @@ restore_task_result = agent.tasks.run("module/" + module_id, "restore-module", "replace": replace_requested, }, endpoint="redis://cluster-leader", - on_sigterm_tasks=[remove_module_on_sigterm], + on_sigterm_tasks=[remove_module_on_sigterm] if image_name != "traefik" else [], progress_callback=agent.get_progress_callback(16, 99 if len(remove_modules) == 0 else 94), ) From 1e7c66cdcca0b62c630ac658fd416f718fc51247 Mon Sep 17 00:00:00 2001 From: Davide Principi Date: Tue, 12 May 2026 13:35:19 +0200 Subject: [PATCH 26/27] feat(read-backup-snapshots): return size and dates Return the snapshot backup size and start/end timestamps. 
--- .../cluster/actions/read-backup-snapshots/50read | 8 +++++++- .../read-backup-snapshots/validate-output.json | 16 +++++++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/read-backup-snapshots/50read b/core/imageroot/var/lib/nethserver/cluster/actions/read-backup-snapshots/50read index bac9c1b5b..2618beaa6 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/read-backup-snapshots/50read +++ b/core/imageroot/var/lib/nethserver/cluster/actions/read-backup-snapshots/50read @@ -41,7 +41,13 @@ if pret.returncode == 0: snapshots = [] for el in json.loads(pret.stdout): time = int(datetime.fromisoformat(el['time']).timestamp()) - snapshots.append({'timestamp': time, 'id': el['id']}) + snapshots.append({ + 'timestamp': time, + 'id': el['id'], + 'total_bytes': el.get('summary', {}).get('total_bytes_processed', -1), + 'backup_start': el.get('summary', {}).get('backup_start', ""), + 'backup_end': el.get('summary', {}).get('backup_end', "") + }) print(json.dumps(snapshots)) else: # Restic error message is sent through stderr as-is (in JSON format). 
diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/read-backup-snapshots/validate-output.json b/core/imageroot/var/lib/nethserver/cluster/actions/read-backup-snapshots/validate-output.json index 271dfe407..10c23baa9 100644 --- a/core/imageroot/var/lib/nethserver/cluster/actions/read-backup-snapshots/validate-output.json +++ b/core/imageroot/var/lib/nethserver/cluster/actions/read-backup-snapshots/validate-output.json @@ -6,26 +6,41 @@ "examples": [ [ { + "total_bytes": -1, + "backup_start": "", + "backup_end": "", "timestamp": 1643964838, "id": "a2a4cb238e4bd428900756376d4ff94009a8f487effe25c145bfaffa72406693" }, { + "total_bytes": -1, + "backup_start": "", + "backup_end": "", "timestamp": 1644037213, "id": "e993a67283eefa7a5d148284790e526f70f360788ed71e17e18406bc5a8a1185" }, { + "total_bytes": -1, + "backup_start": "", + "backup_end": "", "timestamp": 1644123618, "id": "4ea007b7770cfab125ab6035ae5371fe0e87464b8caa1607b2ced8c7e8732b4a" }, { + "total_bytes": -1, + "backup_start": "", + "backup_end": "", "timestamp": 1644210013, "id": "72a3cecc8b6acbd1610ebabd9aae1ac2b0b96864fa986a7437188036378e61ad" }, { + "backup_start": "2026-05-09T23:00:31.429599171Z", + "backup_end": "2026-05-09T23:06:25.901411958Z", + "total_bytes": 482687369942, "timestamp": 1644296416, "id": "ebd99130107f282e47e7eb161de8fe14e0b832b7eacb04c9bf60761407e6081a" } ] ] } From 2acb109a06487a4fb2d14aec21c79db8d83b0cce Mon Sep 17 00:00:00 2001 From: Davide Principi Date: Thu, 14 May 2026 19:27:09 +0200 Subject: [PATCH 27/27] fix: code review - remove redundant tmpfs volume - fix copyright year - remove development comment --- core/imageroot/etc/systemd/system/rclone-gateway.service | 3 +-- .../remove-backup-repository/50remove_backup_repository | 1 - .../node/events/backup-schedule-changed/20purge_backup_prom | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/core/imageroot/etc/systemd/system/rclone-gateway.service 
b/core/imageroot/etc/systemd/system/rclone-gateway.service index d7185347c..02d1d13e8 100644 --- a/core/imageroot/etc/systemd/system/rclone-gateway.service +++ b/core/imageroot/etc/systemd/system/rclone-gateway.service @@ -33,9 +33,8 @@ ExecStart=/usr/bin/podman run \ --volume=./rclone:/etc/rclone:ro,Z \ --volume=./haproxy:/etc/haproxy:ro,Z \ --volume=${BACKUP_VOLUME}:/srv/repo:z \ - --mount=type=tmpfs,tmpfs-size=10M,destination=/var/lib/rclone,chown=true \ --volume=/dev/log:/dev/log \ - --volume=rclone-cache:/var/cache/rclone:Z \ + --volume=rclone-cache:/var/cache/rclone:z \ --entrypoint=rclone-gateway-entrypoint.sh \ --env-file=rclone-webdav.env \ ${RCLONE_IMAGE} diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/remove-backup-repository/50remove_backup_repository b/core/imageroot/var/lib/nethserver/cluster/actions/remove-backup-repository/50remove_backup_repository index 0dc6ac5f8..7f640c168 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/remove-backup-repository/50remove_backup_repository +++ b/core/imageroot/var/lib/nethserver/cluster/actions/remove-backup-repository/50remove_backup_repository @@ -43,7 +43,6 @@ trx = rdb.pipeline() # Drop related backup objects # related_backups = [] -# XXX use an index instead of scan for kbid in rdb.scan_iter('cluster/backup/*'): repo_id = rdb.hget(kbid, 'repository') if repo_id != rid: diff --git a/core/imageroot/var/lib/nethserver/node/events/backup-schedule-changed/20purge_backup_prom b/core/imageroot/var/lib/nethserver/node/events/backup-schedule-changed/20purge_backup_prom index 343891950..98f5d6cf1 100755 --- a/core/imageroot/var/lib/nethserver/node/events/backup-schedule-changed/20purge_backup_prom +++ b/core/imageroot/var/lib/nethserver/node/events/backup-schedule-changed/20purge_backup_prom @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -# Copyright (C) 2024 Nethesis S.r.l. +# Copyright (C) 2026 Nethesis S.r.l. # SPDX-License-Identifier: AGPL-3.0-or-later #