From 45dc4d2430abcdc2976e2ce1fb88e81c8b7e96de Mon Sep 17 00:00:00 2001 From: GitHub CI Date: Mon, 30 Jun 2025 14:34:02 +0200 Subject: [PATCH 001/337] Revert "chore: update apt versions based on rebuild" This reverts commit da2bea54 --- docker_config/Dockerfile_ODELIA | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index d0917b89..145a82d3 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -12,13 +12,13 @@ ENV PYTHON_VERSION=3.10.14 # Install updates of installed packages RUN apt update -RUN apt install -y apt=2.4.14 apt-utils=2.4.14 libapt-pkg6.0=2.4.14 +RUN apt install -y apt apt-utils libapt-pkg6.0 # Update versions of installed packages -RUN apt install -y base-files=12ubuntu4.7 bash=5.1-6ubuntu1.1 bsdutils=1:2.37.2-4ubuntu3.4 ca-certificates=20240203~22.04.1 coreutils=8.32-4.1ubuntu1.2 dpkg=1.21.1ubuntu2.3 e2fsprogs=1.46.5-2ubuntu1.2 gpgv=2.2.27-3ubuntu2.3 libblkid1=2.37.2-4ubuntu3.4 libc-bin=2.35-0ubuntu3.10 libc-dev-bin=2.35-0ubuntu3.10 libc6-dev=2.35-0ubuntu3.10 libc6=2.35-0ubuntu3.10 libcap2=1:2.44-1ubuntu0.22.04.2 libcom-err2=1.46.5-2ubuntu1.2 libext2fs2=1.46.5-2ubuntu1.2 libgnutls30=3.7.3-4ubuntu1.6 libgssapi-krb5-2=1.19.2-2ubuntu0.7 libk5crypto3=1.19.2-2ubuntu0.7 libkrb5-3=1.19.2-2ubuntu0.7 libkrb5support0=1.19.2-2ubuntu0.7 libmount1=2.37.2-4ubuntu3.4 libpam-modules-bin=1.4.0-11ubuntu2.5 libpam-modules=1.4.0-11ubuntu2.5 libpam-runtime=1.4.0-11ubuntu2.5 libpam0g=1.4.0-11ubuntu2.5 libseccomp2=2.5.3-2ubuntu3~22.04.1 libsmartcols1=2.37.2-4ubuntu3.4 libss2=1.46.5-2ubuntu1.2 libssl3=3.0.2-0ubuntu1.19 libsystemd0=249.11-0ubuntu3.16 libtasn1-6=4.18.0-4ubuntu0.1 libudev1=249.11-0ubuntu3.16 libuuid1=2.37.2-4ubuntu3.4 linux-libc-dev=5.15.0-141.151 logsave=1.46.5-2ubuntu1.2 mount=2.37.2-4ubuntu3.4 openssl=3.0.2-0ubuntu1.19 util-linux=2.37.2-4ubuntu3.4 +RUN apt install -y base-files bash bsdutils ca-certificates coreutils dpkg e2fsprogs 
gpgv libblkid1 libc-bin libc-dev-bin libc6-dev libc6 libcap2 libcom-err2 libext2fs2 libgnutls30 libgssapi-krb5-2 libk5crypto3 libkrb5-3 libkrb5support0 libmount1 libpam-modules-bin libpam-modules libpam-runtime libpam0g libseccomp2 libsmartcols1 libss2 libssl3 libsystemd0 libtasn1-6 libudev1 libuuid1 linux-libc-dev logsave mount openssl util-linux # Install apt-transport-https curl gnupg lsb-release zip and dependencies at defined versions -RUN apt install -y apt-transport-https=2.4.14 curl=7.81.0-1ubuntu1.20 dirmngr=2.2.27-3ubuntu2.3 distro-info-data=0.52ubuntu0.9 gnupg-l10n=2.2.27-3ubuntu2.3 gnupg-utils=2.2.27-3ubuntu2.3 gnupg=2.2.27-3ubuntu2.3 gpg-agent=2.2.27-3ubuntu2.3 gpg-wks-client=2.2.27-3ubuntu2.3 gpg-wks-server=2.2.27-3ubuntu2.3 gpg=2.2.27-3ubuntu2.3 gpgconf=2.2.27-3ubuntu2.3 gpgsm=2.2.27-3ubuntu2.3 libassuan0=2.5.5-1build1 libbrotli1=1.0.9-2build6 libcurl4=7.81.0-1ubuntu1.20 libexpat1=2.4.7-1ubuntu0.6 libksba8=1.6.0-2ubuntu0.2 libldap-2.5-0=2.5.19+dfsg-0ubuntu0.22.04.1 libldap-common=2.5.19+dfsg-0ubuntu0.22.04.1 libmpdec3=2.5.1-2build2 libnghttp2-14=1.43.0-1ubuntu0.2 libnpth0=1.6-3build2 libpsl5=0.21.0-1.2build2 libpython3-stdlib=3.10.6-1~22.04.1 libpython3.10-minimal=3.10.12-1~22.04.9 libpython3.10-stdlib=3.10.12-1~22.04.9 libreadline8=8.1.2-1 librtmp1=2.4+20151223.gitfa8646d.1-2build4 libsasl2-2=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules-db=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules=2.1.27+dfsg2-3ubuntu1.2 libsqlite3-0=3.37.2-2ubuntu0.4 libssh-4=0.9.6-2ubuntu0.22.04.3 lsb-release=11.1.0ubuntu4 media-types=7.0.0 pinentry-curses=1.1.1-1build2 publicsuffix=20211207.1025-1 python3-minimal=3.10.6-1~22.04.1 python3.10-minimal=3.10.12-1~22.04.9 python3.10=3.10.12-1~22.04.9 python3=3.10.6-1~22.04.1 readline-common=8.1.2-1 unzip=6.0-26ubuntu3.2 zip=3.0-12build2 +RUN apt install -y apt-transport-https curl dirmngr distro-info-data gnupg-l10n gnupg-utils gnupg gpg-agent gpg-wks-client gpg-wks-server gpg gpgconf gpgsm libassuan0 libbrotli1 libcurl4 libexpat1 libksba8 
libldap-2.5-0 libldap-common libmpdec3 libnghttp2-14 libnpth0 libpsl5 libpython3-stdlib libpython3.10-minimal libpython3.10-stdlib libreadline8 librtmp1 libsasl2-2 libsasl2-modules-db libsasl2-modules libsqlite3-0 libssh-4 lsb-release media-types pinentry-curses publicsuffix python3-minimal python3.10-minimal python3.10 python3 readline-common unzip zip # Prepare Docker installation @@ -28,7 +28,7 @@ RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings && apt update # Install docker-ce docker-ce-cli containerd.io and dependencies at fixed versions -RUN apt install -y apparmor=3.0.4-2ubuntu2.4 containerd.io=1.7.27-1 dbus-user-session=1.12.20-2ubuntu4.1 dbus=1.12.20-2ubuntu4.1 dmsetup=2:1.02.175-2.1ubuntu5 docker-buildx-plugin=0.24.0-1~ubuntu.22.04~jammy docker-ce-cli=5:28.2.2-1~ubuntu.22.04~jammy docker-ce-rootless-extras=5:28.2.2-1~ubuntu.22.04~jammy docker-ce=5:28.2.2-1~ubuntu.22.04~jammy docker-compose-plugin=2.36.2-1~ubuntu.22.04~jammy gir1.2-glib-2.0=1.72.0-1 git-man=1:2.34.1-1ubuntu1.12 git=1:2.34.1-1ubuntu1.12 iptables=1.8.7-1ubuntu5.2 less=590-1ubuntu0.22.04.3 libapparmor1=3.0.4-2ubuntu2.4 libargon2-1=0~20171227-0.3 libbsd0=0.11.5-1 libcbor0.8=0.8.0-2ubuntu1 libcryptsetup12=2:2.4.3-1ubuntu1.3 libcurl3-gnutls=7.81.0-1ubuntu1.20 libdbus-1-3=1.12.20-2ubuntu4.1 libdevmapper1.02.1=2:1.02.175-2.1ubuntu5 libedit2=3.1-20210910-1build1 liberror-perl=0.17029-1 libfido2-1=1.10.0-1 libgdbm-compat4=1.23-1 libgdbm6=1.23-1 libgirepository-1.0-1=1.72.0-1 libglib2.0-0=2.72.4-0ubuntu2.5 libglib2.0-data=2.72.4-0ubuntu2.5 libicu70=70.1-2 libip4tc2=1.8.7-1ubuntu5.2 libip6tc2=1.8.7-1ubuntu5.2 libjson-c5=0.15-3~ubuntu1.22.04.2 libkmod2=29-1ubuntu1 libltdl7=2.4.6-15build2 libmd0=1.0.4-1build1 libmnl0=1.0.4-3build2 libnetfilter-conntrack3=1.0.9-1 libnfnetlink0=1.0.1-3build3 libnftnl11=1.2.1-1build1 libnss-systemd=249.11-0ubuntu3.16 libpam-systemd=249.11-0ubuntu3.16 libperl5.34=5.34.0-3ubuntu1.4 libslirp0=4.6.1-1build1 libx11-6=2:1.7.5-1ubuntu0.3 
libx11-data=2:1.7.5-1ubuntu0.3 libxau6=1:1.0.9-1build5 libxcb1=1.14-3ubuntu3 libxdmcp6=1:1.1.3-0ubuntu5 libxext6=2:1.3.4-1build1 libxml2=2.9.13+dfsg-1ubuntu0.7 libxmuu1=2:1.1.3-3 libxtables12=1.8.7-1ubuntu5.2 netbase=6.3 networkd-dispatcher=2.1-2ubuntu0.22.04.2 openssh-client=1:8.9p1-3ubuntu0.13 patch=2.7.6-7build2 perl-base=5.34.0-3ubuntu1.4 perl-modules-5.34=5.34.0-3ubuntu1.4 perl=5.34.0-3ubuntu1.4 pigz=2.6-1 python3-dbus=1.2.18-3build1 python3-gi=3.42.1-0ubuntu1 shared-mime-info=2.1-2 slirp4netns=1.0.1-2 systemd-sysv=249.11-0ubuntu3.16 systemd-timesyncd=249.11-0ubuntu3.16 systemd=249.11-0ubuntu3.16 xauth=1:1.1-1build2 xdg-user-dirs=0.17-2ubuntu4 xz-utils=5.2.5-2ubuntu1 +RUN apt install -y apparmor containerd.io dbus-user-session dbus dmsetup docker-buildx-plugin docker-ce-cli docker-ce-rootless-extras docker-ce docker-compose-plugin gir1.2-glib-2.0 git-man git iptables less libapparmor1 libargon2-1 libbsd0 libcbor0.8 libcryptsetup12 libcurl3-gnutls libdbus-1-3 libdevmapper1.02.1 libedit2 liberror-perl libfido2-1 libgdbm-compat4 libgdbm6 libgirepository-1.0-1 libglib2.0-0 libglib2.0-data libicu70 libip4tc2 libip6tc2 libjson-c5 libkmod2 libltdl7 libmd0 libmnl0 libnetfilter-conntrack3 libnfnetlink0 libnftnl11 libnss-systemd libpam-systemd libperl5.34 libslirp0 libx11-6 libx11-data libxau6 libxcb1 libxdmcp6 libxext6 libxml2 libxmuu1 libxtables12 netbase networkd-dispatcher openssh-client patch perl-base perl-modules-5.34 perl pigz python3-dbus python3-gi shared-mime-info slirp4netns systemd-sysv systemd-timesyncd systemd xauth xdg-user-dirs xz-utils # Clean up apt cache RUN rm -rf /var/lib/apt/lists/* @@ -69,4 +69,4 @@ RUN rm -rf /workspace/controller # Copy the source code for local training and deploying to the swarm COPY . 
/MediSwarm RUN mkdir -p /fl_admin/transfer -RUN ln -s /MediSwarm /fl_admin/transfer/MediSwarm +RUN ln -s /MediSwarm /fl_admin/transfer/MediSwarm \ No newline at end of file From a7f10c3c5636824bfdb7d07b39d835f6d44baf30 Mon Sep 17 00:00:00 2001 From: GitHub CI Date: Mon, 30 Jun 2025 14:35:21 +0200 Subject: [PATCH 002/337] Revert "WIP: remove apt versions for rebuild" This reverts commit b89263947111dfa63b04e0c34fbd21759ed37013. --- docker_config/Dockerfile_ODELIA | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 145a82d3..d0917b89 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -12,13 +12,13 @@ ENV PYTHON_VERSION=3.10.14 # Install updates of installed packages RUN apt update -RUN apt install -y apt apt-utils libapt-pkg6.0 +RUN apt install -y apt=2.4.14 apt-utils=2.4.14 libapt-pkg6.0=2.4.14 # Update versions of installed packages -RUN apt install -y base-files bash bsdutils ca-certificates coreutils dpkg e2fsprogs gpgv libblkid1 libc-bin libc-dev-bin libc6-dev libc6 libcap2 libcom-err2 libext2fs2 libgnutls30 libgssapi-krb5-2 libk5crypto3 libkrb5-3 libkrb5support0 libmount1 libpam-modules-bin libpam-modules libpam-runtime libpam0g libseccomp2 libsmartcols1 libss2 libssl3 libsystemd0 libtasn1-6 libudev1 libuuid1 linux-libc-dev logsave mount openssl util-linux +RUN apt install -y base-files=12ubuntu4.7 bash=5.1-6ubuntu1.1 bsdutils=1:2.37.2-4ubuntu3.4 ca-certificates=20240203~22.04.1 coreutils=8.32-4.1ubuntu1.2 dpkg=1.21.1ubuntu2.3 e2fsprogs=1.46.5-2ubuntu1.2 gpgv=2.2.27-3ubuntu2.3 libblkid1=2.37.2-4ubuntu3.4 libc-bin=2.35-0ubuntu3.10 libc-dev-bin=2.35-0ubuntu3.10 libc6-dev=2.35-0ubuntu3.10 libc6=2.35-0ubuntu3.10 libcap2=1:2.44-1ubuntu0.22.04.2 libcom-err2=1.46.5-2ubuntu1.2 libext2fs2=1.46.5-2ubuntu1.2 libgnutls30=3.7.3-4ubuntu1.6 libgssapi-krb5-2=1.19.2-2ubuntu0.7 libk5crypto3=1.19.2-2ubuntu0.7 libkrb5-3=1.19.2-2ubuntu0.7 
libkrb5support0=1.19.2-2ubuntu0.7 libmount1=2.37.2-4ubuntu3.4 libpam-modules-bin=1.4.0-11ubuntu2.5 libpam-modules=1.4.0-11ubuntu2.5 libpam-runtime=1.4.0-11ubuntu2.5 libpam0g=1.4.0-11ubuntu2.5 libseccomp2=2.5.3-2ubuntu3~22.04.1 libsmartcols1=2.37.2-4ubuntu3.4 libss2=1.46.5-2ubuntu1.2 libssl3=3.0.2-0ubuntu1.19 libsystemd0=249.11-0ubuntu3.16 libtasn1-6=4.18.0-4ubuntu0.1 libudev1=249.11-0ubuntu3.16 libuuid1=2.37.2-4ubuntu3.4 linux-libc-dev=5.15.0-141.151 logsave=1.46.5-2ubuntu1.2 mount=2.37.2-4ubuntu3.4 openssl=3.0.2-0ubuntu1.19 util-linux=2.37.2-4ubuntu3.4 # Install apt-transport-https curl gnupg lsb-release zip and dependencies at defined versions -RUN apt install -y apt-transport-https curl dirmngr distro-info-data gnupg-l10n gnupg-utils gnupg gpg-agent gpg-wks-client gpg-wks-server gpg gpgconf gpgsm libassuan0 libbrotli1 libcurl4 libexpat1 libksba8 libldap-2.5-0 libldap-common libmpdec3 libnghttp2-14 libnpth0 libpsl5 libpython3-stdlib libpython3.10-minimal libpython3.10-stdlib libreadline8 librtmp1 libsasl2-2 libsasl2-modules-db libsasl2-modules libsqlite3-0 libssh-4 lsb-release media-types pinentry-curses publicsuffix python3-minimal python3.10-minimal python3.10 python3 readline-common unzip zip +RUN apt install -y apt-transport-https=2.4.14 curl=7.81.0-1ubuntu1.20 dirmngr=2.2.27-3ubuntu2.3 distro-info-data=0.52ubuntu0.9 gnupg-l10n=2.2.27-3ubuntu2.3 gnupg-utils=2.2.27-3ubuntu2.3 gnupg=2.2.27-3ubuntu2.3 gpg-agent=2.2.27-3ubuntu2.3 gpg-wks-client=2.2.27-3ubuntu2.3 gpg-wks-server=2.2.27-3ubuntu2.3 gpg=2.2.27-3ubuntu2.3 gpgconf=2.2.27-3ubuntu2.3 gpgsm=2.2.27-3ubuntu2.3 libassuan0=2.5.5-1build1 libbrotli1=1.0.9-2build6 libcurl4=7.81.0-1ubuntu1.20 libexpat1=2.4.7-1ubuntu0.6 libksba8=1.6.0-2ubuntu0.2 libldap-2.5-0=2.5.19+dfsg-0ubuntu0.22.04.1 libldap-common=2.5.19+dfsg-0ubuntu0.22.04.1 libmpdec3=2.5.1-2build2 libnghttp2-14=1.43.0-1ubuntu0.2 libnpth0=1.6-3build2 libpsl5=0.21.0-1.2build2 libpython3-stdlib=3.10.6-1~22.04.1 libpython3.10-minimal=3.10.12-1~22.04.9 
libpython3.10-stdlib=3.10.12-1~22.04.9 libreadline8=8.1.2-1 librtmp1=2.4+20151223.gitfa8646d.1-2build4 libsasl2-2=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules-db=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules=2.1.27+dfsg2-3ubuntu1.2 libsqlite3-0=3.37.2-2ubuntu0.4 libssh-4=0.9.6-2ubuntu0.22.04.3 lsb-release=11.1.0ubuntu4 media-types=7.0.0 pinentry-curses=1.1.1-1build2 publicsuffix=20211207.1025-1 python3-minimal=3.10.6-1~22.04.1 python3.10-minimal=3.10.12-1~22.04.9 python3.10=3.10.12-1~22.04.9 python3=3.10.6-1~22.04.1 readline-common=8.1.2-1 unzip=6.0-26ubuntu3.2 zip=3.0-12build2 # Prepare Docker installation @@ -28,7 +28,7 @@ RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings && apt update # Install docker-ce docker-ce-cli containerd.io and dependencies at fixed versions -RUN apt install -y apparmor containerd.io dbus-user-session dbus dmsetup docker-buildx-plugin docker-ce-cli docker-ce-rootless-extras docker-ce docker-compose-plugin gir1.2-glib-2.0 git-man git iptables less libapparmor1 libargon2-1 libbsd0 libcbor0.8 libcryptsetup12 libcurl3-gnutls libdbus-1-3 libdevmapper1.02.1 libedit2 liberror-perl libfido2-1 libgdbm-compat4 libgdbm6 libgirepository-1.0-1 libglib2.0-0 libglib2.0-data libicu70 libip4tc2 libip6tc2 libjson-c5 libkmod2 libltdl7 libmd0 libmnl0 libnetfilter-conntrack3 libnfnetlink0 libnftnl11 libnss-systemd libpam-systemd libperl5.34 libslirp0 libx11-6 libx11-data libxau6 libxcb1 libxdmcp6 libxext6 libxml2 libxmuu1 libxtables12 netbase networkd-dispatcher openssh-client patch perl-base perl-modules-5.34 perl pigz python3-dbus python3-gi shared-mime-info slirp4netns systemd-sysv systemd-timesyncd systemd xauth xdg-user-dirs xz-utils +RUN apt install -y apparmor=3.0.4-2ubuntu2.4 containerd.io=1.7.27-1 dbus-user-session=1.12.20-2ubuntu4.1 dbus=1.12.20-2ubuntu4.1 dmsetup=2:1.02.175-2.1ubuntu5 docker-buildx-plugin=0.24.0-1~ubuntu.22.04~jammy docker-ce-cli=5:28.2.2-1~ubuntu.22.04~jammy 
docker-ce-rootless-extras=5:28.2.2-1~ubuntu.22.04~jammy docker-ce=5:28.2.2-1~ubuntu.22.04~jammy docker-compose-plugin=2.36.2-1~ubuntu.22.04~jammy gir1.2-glib-2.0=1.72.0-1 git-man=1:2.34.1-1ubuntu1.12 git=1:2.34.1-1ubuntu1.12 iptables=1.8.7-1ubuntu5.2 less=590-1ubuntu0.22.04.3 libapparmor1=3.0.4-2ubuntu2.4 libargon2-1=0~20171227-0.3 libbsd0=0.11.5-1 libcbor0.8=0.8.0-2ubuntu1 libcryptsetup12=2:2.4.3-1ubuntu1.3 libcurl3-gnutls=7.81.0-1ubuntu1.20 libdbus-1-3=1.12.20-2ubuntu4.1 libdevmapper1.02.1=2:1.02.175-2.1ubuntu5 libedit2=3.1-20210910-1build1 liberror-perl=0.17029-1 libfido2-1=1.10.0-1 libgdbm-compat4=1.23-1 libgdbm6=1.23-1 libgirepository-1.0-1=1.72.0-1 libglib2.0-0=2.72.4-0ubuntu2.5 libglib2.0-data=2.72.4-0ubuntu2.5 libicu70=70.1-2 libip4tc2=1.8.7-1ubuntu5.2 libip6tc2=1.8.7-1ubuntu5.2 libjson-c5=0.15-3~ubuntu1.22.04.2 libkmod2=29-1ubuntu1 libltdl7=2.4.6-15build2 libmd0=1.0.4-1build1 libmnl0=1.0.4-3build2 libnetfilter-conntrack3=1.0.9-1 libnfnetlink0=1.0.1-3build3 libnftnl11=1.2.1-1build1 libnss-systemd=249.11-0ubuntu3.16 libpam-systemd=249.11-0ubuntu3.16 libperl5.34=5.34.0-3ubuntu1.4 libslirp0=4.6.1-1build1 libx11-6=2:1.7.5-1ubuntu0.3 libx11-data=2:1.7.5-1ubuntu0.3 libxau6=1:1.0.9-1build5 libxcb1=1.14-3ubuntu3 libxdmcp6=1:1.1.3-0ubuntu5 libxext6=2:1.3.4-1build1 libxml2=2.9.13+dfsg-1ubuntu0.7 libxmuu1=2:1.1.3-3 libxtables12=1.8.7-1ubuntu5.2 netbase=6.3 networkd-dispatcher=2.1-2ubuntu0.22.04.2 openssh-client=1:8.9p1-3ubuntu0.13 patch=2.7.6-7build2 perl-base=5.34.0-3ubuntu1.4 perl-modules-5.34=5.34.0-3ubuntu1.4 perl=5.34.0-3ubuntu1.4 pigz=2.6-1 python3-dbus=1.2.18-3build1 python3-gi=3.42.1-0ubuntu1 shared-mime-info=2.1-2 slirp4netns=1.0.1-2 systemd-sysv=249.11-0ubuntu3.16 systemd-timesyncd=249.11-0ubuntu3.16 systemd=249.11-0ubuntu3.16 xauth=1:1.1-1build2 xdg-user-dirs=0.17-2ubuntu4 xz-utils=5.2.5-2ubuntu1 # Clean up apt cache RUN rm -rf /var/lib/apt/lists/* @@ -69,4 +69,4 @@ RUN rm -rf /workspace/controller # Copy the source code for local training and deploying to 
the swarm COPY . /MediSwarm RUN mkdir -p /fl_admin/transfer -RUN ln -s /MediSwarm /fl_admin/transfer/MediSwarm \ No newline at end of file +RUN ln -s /MediSwarm /fl_admin/transfer/MediSwarm From f9be4ee1ddd163aeb068270730af9f26097ec50e Mon Sep 17 00:00:00 2001 From: GitHub CI Date: Mon, 30 Jun 2025 14:36:18 +0200 Subject: [PATCH 003/337] Revert "chore: update apt versions based on rebuild" This reverts commit da2bea54 --- docker_config/Dockerfile_ODELIA | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index d0917b89..145a82d3 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -12,13 +12,13 @@ ENV PYTHON_VERSION=3.10.14 # Install updates of installed packages RUN apt update -RUN apt install -y apt=2.4.14 apt-utils=2.4.14 libapt-pkg6.0=2.4.14 +RUN apt install -y apt apt-utils libapt-pkg6.0 # Update versions of installed packages -RUN apt install -y base-files=12ubuntu4.7 bash=5.1-6ubuntu1.1 bsdutils=1:2.37.2-4ubuntu3.4 ca-certificates=20240203~22.04.1 coreutils=8.32-4.1ubuntu1.2 dpkg=1.21.1ubuntu2.3 e2fsprogs=1.46.5-2ubuntu1.2 gpgv=2.2.27-3ubuntu2.3 libblkid1=2.37.2-4ubuntu3.4 libc-bin=2.35-0ubuntu3.10 libc-dev-bin=2.35-0ubuntu3.10 libc6-dev=2.35-0ubuntu3.10 libc6=2.35-0ubuntu3.10 libcap2=1:2.44-1ubuntu0.22.04.2 libcom-err2=1.46.5-2ubuntu1.2 libext2fs2=1.46.5-2ubuntu1.2 libgnutls30=3.7.3-4ubuntu1.6 libgssapi-krb5-2=1.19.2-2ubuntu0.7 libk5crypto3=1.19.2-2ubuntu0.7 libkrb5-3=1.19.2-2ubuntu0.7 libkrb5support0=1.19.2-2ubuntu0.7 libmount1=2.37.2-4ubuntu3.4 libpam-modules-bin=1.4.0-11ubuntu2.5 libpam-modules=1.4.0-11ubuntu2.5 libpam-runtime=1.4.0-11ubuntu2.5 libpam0g=1.4.0-11ubuntu2.5 libseccomp2=2.5.3-2ubuntu3~22.04.1 libsmartcols1=2.37.2-4ubuntu3.4 libss2=1.46.5-2ubuntu1.2 libssl3=3.0.2-0ubuntu1.19 libsystemd0=249.11-0ubuntu3.16 libtasn1-6=4.18.0-4ubuntu0.1 libudev1=249.11-0ubuntu3.16 libuuid1=2.37.2-4ubuntu3.4 linux-libc-dev=5.15.0-141.151 
logsave=1.46.5-2ubuntu1.2 mount=2.37.2-4ubuntu3.4 openssl=3.0.2-0ubuntu1.19 util-linux=2.37.2-4ubuntu3.4 +RUN apt install -y base-files bash bsdutils ca-certificates coreutils dpkg e2fsprogs gpgv libblkid1 libc-bin libc-dev-bin libc6-dev libc6 libcap2 libcom-err2 libext2fs2 libgnutls30 libgssapi-krb5-2 libk5crypto3 libkrb5-3 libkrb5support0 libmount1 libpam-modules-bin libpam-modules libpam-runtime libpam0g libseccomp2 libsmartcols1 libss2 libssl3 libsystemd0 libtasn1-6 libudev1 libuuid1 linux-libc-dev logsave mount openssl util-linux # Install apt-transport-https curl gnupg lsb-release zip and dependencies at defined versions -RUN apt install -y apt-transport-https=2.4.14 curl=7.81.0-1ubuntu1.20 dirmngr=2.2.27-3ubuntu2.3 distro-info-data=0.52ubuntu0.9 gnupg-l10n=2.2.27-3ubuntu2.3 gnupg-utils=2.2.27-3ubuntu2.3 gnupg=2.2.27-3ubuntu2.3 gpg-agent=2.2.27-3ubuntu2.3 gpg-wks-client=2.2.27-3ubuntu2.3 gpg-wks-server=2.2.27-3ubuntu2.3 gpg=2.2.27-3ubuntu2.3 gpgconf=2.2.27-3ubuntu2.3 gpgsm=2.2.27-3ubuntu2.3 libassuan0=2.5.5-1build1 libbrotli1=1.0.9-2build6 libcurl4=7.81.0-1ubuntu1.20 libexpat1=2.4.7-1ubuntu0.6 libksba8=1.6.0-2ubuntu0.2 libldap-2.5-0=2.5.19+dfsg-0ubuntu0.22.04.1 libldap-common=2.5.19+dfsg-0ubuntu0.22.04.1 libmpdec3=2.5.1-2build2 libnghttp2-14=1.43.0-1ubuntu0.2 libnpth0=1.6-3build2 libpsl5=0.21.0-1.2build2 libpython3-stdlib=3.10.6-1~22.04.1 libpython3.10-minimal=3.10.12-1~22.04.9 libpython3.10-stdlib=3.10.12-1~22.04.9 libreadline8=8.1.2-1 librtmp1=2.4+20151223.gitfa8646d.1-2build4 libsasl2-2=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules-db=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules=2.1.27+dfsg2-3ubuntu1.2 libsqlite3-0=3.37.2-2ubuntu0.4 libssh-4=0.9.6-2ubuntu0.22.04.3 lsb-release=11.1.0ubuntu4 media-types=7.0.0 pinentry-curses=1.1.1-1build2 publicsuffix=20211207.1025-1 python3-minimal=3.10.6-1~22.04.1 python3.10-minimal=3.10.12-1~22.04.9 python3.10=3.10.12-1~22.04.9 python3=3.10.6-1~22.04.1 readline-common=8.1.2-1 unzip=6.0-26ubuntu3.2 zip=3.0-12build2 +RUN apt install 
-y apt-transport-https curl dirmngr distro-info-data gnupg-l10n gnupg-utils gnupg gpg-agent gpg-wks-client gpg-wks-server gpg gpgconf gpgsm libassuan0 libbrotli1 libcurl4 libexpat1 libksba8 libldap-2.5-0 libldap-common libmpdec3 libnghttp2-14 libnpth0 libpsl5 libpython3-stdlib libpython3.10-minimal libpython3.10-stdlib libreadline8 librtmp1 libsasl2-2 libsasl2-modules-db libsasl2-modules libsqlite3-0 libssh-4 lsb-release media-types pinentry-curses publicsuffix python3-minimal python3.10-minimal python3.10 python3 readline-common unzip zip # Prepare Docker installation @@ -28,7 +28,7 @@ RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings && apt update # Install docker-ce docker-ce-cli containerd.io and dependencies at fixed versions -RUN apt install -y apparmor=3.0.4-2ubuntu2.4 containerd.io=1.7.27-1 dbus-user-session=1.12.20-2ubuntu4.1 dbus=1.12.20-2ubuntu4.1 dmsetup=2:1.02.175-2.1ubuntu5 docker-buildx-plugin=0.24.0-1~ubuntu.22.04~jammy docker-ce-cli=5:28.2.2-1~ubuntu.22.04~jammy docker-ce-rootless-extras=5:28.2.2-1~ubuntu.22.04~jammy docker-ce=5:28.2.2-1~ubuntu.22.04~jammy docker-compose-plugin=2.36.2-1~ubuntu.22.04~jammy gir1.2-glib-2.0=1.72.0-1 git-man=1:2.34.1-1ubuntu1.12 git=1:2.34.1-1ubuntu1.12 iptables=1.8.7-1ubuntu5.2 less=590-1ubuntu0.22.04.3 libapparmor1=3.0.4-2ubuntu2.4 libargon2-1=0~20171227-0.3 libbsd0=0.11.5-1 libcbor0.8=0.8.0-2ubuntu1 libcryptsetup12=2:2.4.3-1ubuntu1.3 libcurl3-gnutls=7.81.0-1ubuntu1.20 libdbus-1-3=1.12.20-2ubuntu4.1 libdevmapper1.02.1=2:1.02.175-2.1ubuntu5 libedit2=3.1-20210910-1build1 liberror-perl=0.17029-1 libfido2-1=1.10.0-1 libgdbm-compat4=1.23-1 libgdbm6=1.23-1 libgirepository-1.0-1=1.72.0-1 libglib2.0-0=2.72.4-0ubuntu2.5 libglib2.0-data=2.72.4-0ubuntu2.5 libicu70=70.1-2 libip4tc2=1.8.7-1ubuntu5.2 libip6tc2=1.8.7-1ubuntu5.2 libjson-c5=0.15-3~ubuntu1.22.04.2 libkmod2=29-1ubuntu1 libltdl7=2.4.6-15build2 libmd0=1.0.4-1build1 libmnl0=1.0.4-3build2 libnetfilter-conntrack3=1.0.9-1 
libnfnetlink0=1.0.1-3build3 libnftnl11=1.2.1-1build1 libnss-systemd=249.11-0ubuntu3.16 libpam-systemd=249.11-0ubuntu3.16 libperl5.34=5.34.0-3ubuntu1.4 libslirp0=4.6.1-1build1 libx11-6=2:1.7.5-1ubuntu0.3 libx11-data=2:1.7.5-1ubuntu0.3 libxau6=1:1.0.9-1build5 libxcb1=1.14-3ubuntu3 libxdmcp6=1:1.1.3-0ubuntu5 libxext6=2:1.3.4-1build1 libxml2=2.9.13+dfsg-1ubuntu0.7 libxmuu1=2:1.1.3-3 libxtables12=1.8.7-1ubuntu5.2 netbase=6.3 networkd-dispatcher=2.1-2ubuntu0.22.04.2 openssh-client=1:8.9p1-3ubuntu0.13 patch=2.7.6-7build2 perl-base=5.34.0-3ubuntu1.4 perl-modules-5.34=5.34.0-3ubuntu1.4 perl=5.34.0-3ubuntu1.4 pigz=2.6-1 python3-dbus=1.2.18-3build1 python3-gi=3.42.1-0ubuntu1 shared-mime-info=2.1-2 slirp4netns=1.0.1-2 systemd-sysv=249.11-0ubuntu3.16 systemd-timesyncd=249.11-0ubuntu3.16 systemd=249.11-0ubuntu3.16 xauth=1:1.1-1build2 xdg-user-dirs=0.17-2ubuntu4 xz-utils=5.2.5-2ubuntu1 +RUN apt install -y apparmor containerd.io dbus-user-session dbus dmsetup docker-buildx-plugin docker-ce-cli docker-ce-rootless-extras docker-ce docker-compose-plugin gir1.2-glib-2.0 git-man git iptables less libapparmor1 libargon2-1 libbsd0 libcbor0.8 libcryptsetup12 libcurl3-gnutls libdbus-1-3 libdevmapper1.02.1 libedit2 liberror-perl libfido2-1 libgdbm-compat4 libgdbm6 libgirepository-1.0-1 libglib2.0-0 libglib2.0-data libicu70 libip4tc2 libip6tc2 libjson-c5 libkmod2 libltdl7 libmd0 libmnl0 libnetfilter-conntrack3 libnfnetlink0 libnftnl11 libnss-systemd libpam-systemd libperl5.34 libslirp0 libx11-6 libx11-data libxau6 libxcb1 libxdmcp6 libxext6 libxml2 libxmuu1 libxtables12 netbase networkd-dispatcher openssh-client patch perl-base perl-modules-5.34 perl pigz python3-dbus python3-gi shared-mime-info slirp4netns systemd-sysv systemd-timesyncd systemd xauth xdg-user-dirs xz-utils # Clean up apt cache RUN rm -rf /var/lib/apt/lists/* @@ -69,4 +69,4 @@ RUN rm -rf /workspace/controller # Copy the source code for local training and deploying to the swarm COPY . 
/MediSwarm RUN mkdir -p /fl_admin/transfer -RUN ln -s /MediSwarm /fl_admin/transfer/MediSwarm +RUN ln -s /MediSwarm /fl_admin/transfer/MediSwarm \ No newline at end of file From efa15b209232ca414ba55b8d0fb3fb74a5ea1e1f Mon Sep 17 00:00:00 2001 From: GitHub CI Date: Mon, 30 Jun 2025 14:38:23 +0200 Subject: [PATCH 004/337] revert --- docker_config/Dockerfile_ODELIA | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 145a82d3..ac3a61f0 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -15,11 +15,10 @@ RUN apt update RUN apt install -y apt apt-utils libapt-pkg6.0 # Update versions of installed packages -RUN apt install -y base-files bash bsdutils ca-certificates coreutils dpkg e2fsprogs gpgv libblkid1 libc-bin libc-dev-bin libc6-dev libc6 libcap2 libcom-err2 libext2fs2 libgnutls30 libgssapi-krb5-2 libk5crypto3 libkrb5-3 libkrb5support0 libmount1 libpam-modules-bin libpam-modules libpam-runtime libpam0g libseccomp2 libsmartcols1 libss2 libssl3 libsystemd0 libtasn1-6 libudev1 libuuid1 linux-libc-dev logsave mount openssl util-linux +RUN apt install -y base-files=12ubuntu4.7 bash=5.1-6ubuntu1.1 bsdutils=1:2.37.2-4ubuntu3.4 ca-certificates=20240203~22.04.1 coreutils=8.32-4.1ubuntu1.2 dpkg=1.21.1ubuntu2.3 e2fsprogs=1.46.5-2ubuntu1.2 gpgv=2.2.27-3ubuntu2.3 libblkid1=2.37.2-4ubuntu3.4 libc-bin=2.35-0ubuntu3.10 libc-dev-bin=2.35-0ubuntu3.10 libc6-dev=2.35-0ubuntu3.10 libc6=2.35-0ubuntu3.10 libcap2=1:2.44-1ubuntu0.22.04.2 libcom-err2=1.46.5-2ubuntu1.2 libext2fs2=1.46.5-2ubuntu1.2 libgnutls30=3.7.3-4ubuntu1.6 libgssapi-krb5-2=1.19.2-2ubuntu0.7 libk5crypto3=1.19.2-2ubuntu0.7 libkrb5-3=1.19.2-2ubuntu0.7 libkrb5support0=1.19.2-2ubuntu0.7 libmount1=2.37.2-4ubuntu3.4 libpam-modules-bin=1.4.0-11ubuntu2.5 libpam-modules=1.4.0-11ubuntu2.5 libpam-runtime=1.4.0-11ubuntu2.5 libpam0g=1.4.0-11ubuntu2.5 libseccomp2=2.5.3-2ubuntu3~22.04.1 
libsmartcols1=2.37.2-4ubuntu3.4 libss2=1.46.5-2ubuntu1.2 libssl3=3.0.2-0ubuntu1.19 libsystemd0=249.11-0ubuntu3.16 libtasn1-6=4.18.0-4ubuntu0.1 libudev1=249.11-0ubuntu3.16 libuuid1=2.37.2-4ubuntu3.4 linux-libc-dev=5.15.0-141.151 logsave=1.46.5-2ubuntu1.2 mount=2.37.2-4ubuntu3.4 openssl=3.0.2-0ubuntu1.19 util-linux=2.37.2-4ubuntu3.4 # Install apt-transport-https curl gnupg lsb-release zip and dependencies at defined versions -RUN apt install -y apt-transport-https curl dirmngr distro-info-data gnupg-l10n gnupg-utils gnupg gpg-agent gpg-wks-client gpg-wks-server gpg gpgconf gpgsm libassuan0 libbrotli1 libcurl4 libexpat1 libksba8 libldap-2.5-0 libldap-common libmpdec3 libnghttp2-14 libnpth0 libpsl5 libpython3-stdlib libpython3.10-minimal libpython3.10-stdlib libreadline8 librtmp1 libsasl2-2 libsasl2-modules-db libsasl2-modules libsqlite3-0 libssh-4 lsb-release media-types pinentry-curses publicsuffix python3-minimal python3.10-minimal python3.10 python3 readline-common unzip zip - +RUN apt install -y apt-transport-https=2.4.14 curl=7.81.0-1ubuntu1.20 dirmngr=2.2.27-3ubuntu2.3 distro-info-data=0.52ubuntu0.9 gnupg-l10n=2.2.27-3ubuntu2.3 gnupg-utils=2.2.27-3ubuntu2.3 gnupg=2.2.27-3ubuntu2.3 gpg-agent=2.2.27-3ubuntu2.3 gpg-wks-client=2.2.27-3ubuntu2.3 gpg-wks-server=2.2.27-3ubuntu2.3 gpg=2.2.27-3ubuntu2.3 gpgconf=2.2.27-3ubuntu2.3 gpgsm=2.2.27-3ubuntu2.3 libassuan0=2.5.5-1build1 libbrotli1=1.0.9-2build6 libcurl4=7.81.0-1ubuntu1.20 libexpat1=2.4.7-1ubuntu0.6 libksba8=1.6.0-2ubuntu0.2 libldap-2.5-0=2.5.19+dfsg-0ubuntu0.22.04.1 libldap-common=2.5.19+dfsg-0ubuntu0.22.04.1 libmpdec3=2.5.1-2build2 libnghttp2-14=1.43.0-1ubuntu0.2 libnpth0=1.6-3build2 libpsl5=0.21.0-1.2build2 libpython3-stdlib=3.10.6-1~22.04.1 libpython3.10-minimal=3.10.12-1~22.04.9 libpython3.10-stdlib=3.10.12-1~22.04.9 libreadline8=8.1.2-1 librtmp1=2.4+20151223.gitfa8646d.1-2build4 libsasl2-2=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules-db=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules=2.1.27+dfsg2-3ubuntu1.2 
libsqlite3-0=3.37.2-2ubuntu0.4 libssh-4=0.9.6-2ubuntu0.22.04.3 lsb-release=11.1.0ubuntu4 media-types=7.0.0 pinentry-curses=1.1.1-1build2 publicsuffix=20211207.1025-1 python3-minimal=3.10.6-1~22.04.1 python3.10-minimal=3.10.12-1~22.04.9 python3.10=3.10.12-1~22.04.9 python3=3.10.6-1~22.04.1 readline-common=8.1.2-1 unzip=6.0-26ubuntu3.2 zip=3.0-12build2 # Prepare Docker installation RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc \ From 0c917e09e7d5109bd0bd020888e24c799ec5f0dc Mon Sep 17 00:00:00 2001 From: JieFu Zhu Date: Mon, 7 Jul 2025 14:58:56 +0200 Subject: [PATCH 005/337] Potential fix for code scanning alert no. 3: Workflow does not contain permissions Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> --- .github/workflows/pr-test.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/pr-test.yaml b/.github/workflows/pr-test.yaml index ccaf7a5c..eb9d0a2b 100644 --- a/.github/workflows/pr-test.yaml +++ b/.github/workflows/pr-test.yaml @@ -8,6 +8,9 @@ on: - main - dev +permissions: + contents: read + jobs: validate-swarm: runs-on: self-hosted From ee7f57d2881d6cb15f9a54865ec71f8fc4317a3a Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 8 Jul 2025 10:29:30 +0200 Subject: [PATCH 006/337] updated apt package versions --- docker_config/Dockerfile_ODELIA | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index c6b4f894..26348579 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -15,10 +15,10 @@ RUN apt update RUN apt install -y apt=2.4.14 apt-utils=2.4.14 libapt-pkg6.0=2.4.14 # Update versions of installed packages -RUN apt install -y base-files=12ubuntu4.7 bash=5.1-6ubuntu1.1 bsdutils=1:2.37.2-4ubuntu3.4 ca-certificates=20240203~22.04.1 coreutils=8.32-4.1ubuntu1.2 dpkg=1.21.1ubuntu2.3 e2fsprogs=1.46.5-2ubuntu1.2 
gpgv=2.2.27-3ubuntu2.3 libblkid1=2.37.2-4ubuntu3.4 libc-bin=2.35-0ubuntu3.10 libc-dev-bin=2.35-0ubuntu3.10 libc6-dev=2.35-0ubuntu3.10 libc6=2.35-0ubuntu3.10 libcap2=1:2.44-1ubuntu0.22.04.2 libcom-err2=1.46.5-2ubuntu1.2 libext2fs2=1.46.5-2ubuntu1.2 libgnutls30=3.7.3-4ubuntu1.6 libgssapi-krb5-2=1.19.2-2ubuntu0.7 libk5crypto3=1.19.2-2ubuntu0.7 libkrb5-3=1.19.2-2ubuntu0.7 libkrb5support0=1.19.2-2ubuntu0.7 libmount1=2.37.2-4ubuntu3.4 libpam-modules-bin=1.4.0-11ubuntu2.6 libpam-modules=1.4.0-11ubuntu2.6 libpam-runtime=1.4.0-11ubuntu2.6 libpam0g=1.4.0-11ubuntu2.6 libseccomp2=2.5.3-2ubuntu3~22.04.1 libsmartcols1=2.37.2-4ubuntu3.4 libss2=1.46.5-2ubuntu1.2 libssl3=3.0.2-0ubuntu1.19 libsystemd0=249.11-0ubuntu3.16 libtasn1-6=4.18.0-4ubuntu0.1 libudev1=249.11-0ubuntu3.16 libuuid1=2.37.2-4ubuntu3.4 linux-libc-dev=5.15.0-143.153 login=1:4.8.1-2ubuntu2.2 logsave=1.46.5-2ubuntu1.2 mount=2.37.2-4ubuntu3.4 openssl=3.0.2-0ubuntu1.19 passwd=1:4.8.1-2ubuntu2.2 util-linux=2.37.2-4ubuntu3.4 +RUN apt install -y base-files=12ubuntu4.7 bash=5.1-6ubuntu1.1 bsdutils=1:2.37.2-4ubuntu3.4 ca-certificates=20240203~22.04.1 coreutils=8.32-4.1ubuntu1.2 dpkg=1.21.1ubuntu2.3 e2fsprogs=1.46.5-2ubuntu1.2 gpgv=2.2.27-3ubuntu2.4 libblkid1=2.37.2-4ubuntu3.4 libc-bin=2.35-0ubuntu3.10 libc-dev-bin=2.35-0ubuntu3.10 libc6-dev=2.35-0ubuntu3.10 libc6=2.35-0ubuntu3.10 libcap2=1:2.44-1ubuntu0.22.04.2 libcom-err2=1.46.5-2ubuntu1.2 libext2fs2=1.46.5-2ubuntu1.2 libgnutls30=3.7.3-4ubuntu1.6 libgssapi-krb5-2=1.19.2-2ubuntu0.7 libk5crypto3=1.19.2-2ubuntu0.7 libkrb5-3=1.19.2-2ubuntu0.7 libkrb5support0=1.19.2-2ubuntu0.7 libmount1=2.37.2-4ubuntu3.4 libpam-modules-bin=1.4.0-11ubuntu2.6 libpam-modules=1.4.0-11ubuntu2.6 libpam-runtime=1.4.0-11ubuntu2.6 libpam0g=1.4.0-11ubuntu2.6 libseccomp2=2.5.3-2ubuntu3~22.04.1 libsmartcols1=2.37.2-4ubuntu3.4 libss2=1.46.5-2ubuntu1.2 libssl3=3.0.2-0ubuntu1.19 libsystemd0=249.11-0ubuntu3.16 libtasn1-6=4.18.0-4ubuntu0.1 libudev1=249.11-0ubuntu3.16 libuuid1=2.37.2-4ubuntu3.4 
linux-libc-dev=5.15.0-143.153 login=1:4.8.1-2ubuntu2.2 logsave=1.46.5-2ubuntu1.2 mount=2.37.2-4ubuntu3.4 openssl=3.0.2-0ubuntu1.19 passwd=1:4.8.1-2ubuntu2.2 util-linux=2.37.2-4ubuntu3.4 # Install apt-transport-https curl gnupg lsb-release zip and dependencies at defined versions -RUN apt install -y apt-transport-https=2.4.14 curl=7.81.0-1ubuntu1.20 dirmngr=2.2.27-3ubuntu2.3 distro-info-data=0.52ubuntu0.9 gnupg-l10n=2.2.27-3ubuntu2.3 gnupg-utils=2.2.27-3ubuntu2.3 gnupg=2.2.27-3ubuntu2.3 gpg-agent=2.2.27-3ubuntu2.3 gpg-wks-client=2.2.27-3ubuntu2.3 gpg-wks-server=2.2.27-3ubuntu2.3 gpg=2.2.27-3ubuntu2.3 gpgconf=2.2.27-3ubuntu2.3 gpgsm=2.2.27-3ubuntu2.3 libassuan0=2.5.5-1build1 libbrotli1=1.0.9-2build6 libcurl4=7.81.0-1ubuntu1.20 libexpat1=2.4.7-1ubuntu0.6 libksba8=1.6.0-2ubuntu0.2 libldap-2.5-0=2.5.19+dfsg-0ubuntu0.22.04.1 libldap-common=2.5.19+dfsg-0ubuntu0.22.04.1 libmpdec3=2.5.1-2build2 libnghttp2-14=1.43.0-1ubuntu0.2 libnpth0=1.6-3build2 libpsl5=0.21.0-1.2build2 libpython3-stdlib=3.10.6-1~22.04.1 libpython3.10-minimal=3.10.12-1~22.04.10 libpython3.10-stdlib=3.10.12-1~22.04.10 libreadline8=8.1.2-1 librtmp1=2.4+20151223.gitfa8646d.1-2build4 libsasl2-2=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules-db=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules=2.1.27+dfsg2-3ubuntu1.2 libsqlite3-0=3.37.2-2ubuntu0.4 libssh-4=0.9.6-2ubuntu0.22.04.3 lsb-release=11.1.0ubuntu4 media-types=7.0.0 pinentry-curses=1.1.1-1build2 publicsuffix=20211207.1025-1 python3-minimal=3.10.6-1~22.04.1 python3.10-minimal=3.10.12-1~22.04.10 python3.10=3.10.12-1~22.04.10 python3=3.10.6-1~22.04.1 readline-common=8.1.2-1 unzip=6.0-26ubuntu3.2 zip=3.0-12build2 +RUN apt install -y apt-transport-https=2.4.14 curl=7.81.0-1ubuntu1.20 dirmngr=2.2.27-3ubuntu2.4 distro-info-data=0.52ubuntu0.9 gnupg-l10n=2.2.27-3ubuntu2.4 gnupg-utils=2.2.27-3ubuntu2.4 gnupg=2.2.27-3ubuntu2.4 gpg-agent=2.2.27-3ubuntu2.4 gpg-wks-client=2.2.27-3ubuntu2.4 gpg-wks-server=2.2.27-3ubuntu2.4 gpg=2.2.27-3ubuntu2.4 gpgconf=2.2.27-3ubuntu2.4 
gpgsm=2.2.27-3ubuntu2.4 libassuan0=2.5.5-1build1 libbrotli1=1.0.9-2build6 libcurl4=7.81.0-1ubuntu1.20 libexpat1=2.4.7-1ubuntu0.6 libksba8=1.6.0-2ubuntu0.2 libldap-2.5-0=2.5.19+dfsg-0ubuntu0.22.04.1 libldap-common=2.5.19+dfsg-0ubuntu0.22.04.1 libmpdec3=2.5.1-2build2 libnghttp2-14=1.43.0-1ubuntu0.2 libnpth0=1.6-3build2 libpsl5=0.21.0-1.2build2 libpython3-stdlib=3.10.6-1~22.04.1 libpython3.10-minimal=3.10.12-1~22.04.10 libpython3.10-stdlib=3.10.12-1~22.04.10 libreadline8=8.1.2-1 librtmp1=2.4+20151223.gitfa8646d.1-2build4 libsasl2-2=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules-db=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules=2.1.27+dfsg2-3ubuntu1.2 libsqlite3-0=3.37.2-2ubuntu0.4 libssh-4=0.9.6-2ubuntu0.22.04.4 lsb-release=11.1.0ubuntu4 media-types=7.0.0 pinentry-curses=1.1.1-1build2 publicsuffix=20211207.1025-1 python3-minimal=3.10.6-1~22.04.1 python3.10-minimal=3.10.12-1~22.04.10 python3.10=3.10.12-1~22.04.10 python3=3.10.6-1~22.04.1 readline-common=8.1.2-1 unzip=6.0-26ubuntu3.2 zip=3.0-12build2 # Prepare Docker installation RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc \ @@ -68,4 +68,4 @@ RUN rm -rf /workspace/controller # Copy the source code for local training and deploying to the swarm COPY . 
/MediSwarm RUN mkdir -p /fl_admin/transfer -RUN ln -s /MediSwarm /fl_admin/transfer/MediSwarm \ No newline at end of file +RUN ln -s /MediSwarm /fl_admin/transfer/MediSwarm From f7c9afcbe8ccf94fae15e2b4b38845af3775d046 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 8 Jul 2025 13:32:33 +0200 Subject: [PATCH 007/337] install einops and x-transformers which apparently is no longer available --- docker_config/Dockerfile_ODELIA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 26348579..57da5891 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -43,7 +43,7 @@ RUN python3 -m pip install --upgrade psutil==7.0.0 RUN python3 -m pip install Flask==3.0.2 Flask-JWT-Extended==4.6.0 Flask-SQLAlchemy==3.1.1 PyJWT==2.10.1 SQLAlchemy==2.0.16 Werkzeug==3.0.1 blinker==1.9.0 docker==7.1.0 greenlet==3.1.1 grpcio==1.62.1 gunicorn==23.0.0 itsdangerous==2.2.0 msgpack==1.1.0 protobuf==4.24.4 pyhocon==0.3.61 pyparsing==3.0.9 websockets==15.0 # Install additional Python packages for swarm training at defined versions -RUN python3 -m pip install Deprecated==1.2.14 SimpleITK==2.2.1 absl-py==2.1.0 aiohttp==3.9.5 aiosignal==1.3.1 async-timeout==4.0.3 cachetools==5.3.3 contourpy==1.2.1 cycler==0.12.1 et-xmlfile==1.1.0 fonttools==4.53.1 frozenlist==1.4.1 google-auth-oauthlib==1.0.0 google-auth==2.31.0 huggingface_hub==0.23.4 humanize==4.9.0 joblib==1.4.2 kiwisolver==1.4.5 lightning-utilities==0.11.3.post0 markdown-it-py==3.0.0 markdown==3.6 matplotlib==3.7.2 mdurl==0.1.2 monai==1.3.0 multidict==6.0.5 nibabel==5.2.1 oauthlib==3.2.2 openpyxl==3.1.0 pandas==2.2.2 pyasn1-modules==0.4.0 pyasn1==0.6.0 pydicom==2.4.4 python-dateutil==2.9.0.post0 pytorch-lightning==1.9.0 requests-oauthlib==2.0.0 rich==13.7.1 rsa==4.9 safetensors==0.4.3 scikit-learn==1.3.0 scipy==1.14.0 seaborn==0.12.2 shellingham==1.5.4 tensorboard-data-server==0.7.2 tensorboard-plugin-wit==1.8.1 
tensorboard==2.12.1 threadpoolctl==3.5.0 timm==0.9.16 torchio==0.19.6 torchmetrics==1.4.0.post0 torchvision==0.17.0 tqdm==4.65.0 typer==0.12.3 tzdata==2024.1 wrapt==1.16.0 yarl==1.9.4 +RUN python3 -m pip install Deprecated==1.2.14 SimpleITK==2.2.1 absl-py==2.1.0 aiohttp==3.9.5 aiosignal==1.3.1 async-timeout==4.0.3 cachetools==5.3.3 contourpy==1.2.1 cycler==0.12.1 einops==0.8.1 et-xmlfile==1.1.0 fonttools==4.53.1 frozenlist==1.4.1 google-auth-oauthlib==1.0.0 google-auth==2.31.0 huggingface_hub==0.23.4 humanize==4.9.0 joblib==1.4.2 kiwisolver==1.4.5 lightning-utilities==0.11.3.post0 markdown-it-py==3.0.0 markdown==3.6 matplotlib==3.7.2 mdurl==0.1.2 monai==1.3.0 multidict==6.0.5 nibabel==5.2.1 oauthlib==3.2.2 openpyxl==3.1.0 pandas==2.2.2 pyasn1-modules==0.4.0 pyasn1==0.6.0 pydicom==2.4.4 python-dateutil==2.9.0.post0 pytorch-lightning==1.9.0 requests-oauthlib==2.0.0 rich==13.7.1 rsa==4.9 safetensors==0.4.3 scikit-learn==1.3.0 scipy==1.14.0 seaborn==0.12.2 shellingham==1.5.4 tensorboard-data-server==0.7.2 tensorboard-plugin-wit==1.8.1 tensorboard==2.12.1 threadpoolctl==3.5.0 timm==0.9.16 torchio==0.19.6 torchmetrics==1.4.0.post0 torchvision==0.17.0 tqdm==4.65.0 typer==0.12.3 tzdata==2024.1 wrapt==1.16.0 x-transformers==2.4.9 yarl==1.9.4 # Install packages needed for testing and for listing licenses of installed packages RUN python3 -m pip install coverage==7.5.4 mock==5.1.0 From f68b582d24a7064edbfc4dee3bb6e850bf94dcc5 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 8 Jul 2025 13:46:17 +0200 Subject: [PATCH 008/337] removed precision specification that apparently does not work any more --- .../ODELIA_ternary_classification/app/custom/threedcnn_ptl.py | 1 - 1 file changed, 1 deletion(-) diff --git a/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py b/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py index b24d1281..ce7f3740 100644 --- a/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py +++ 
b/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py @@ -122,7 +122,6 @@ def prepare_training(logger, max_epochs: int, site_name: str): trainer = Trainer( accelerator='gpu', accumulate_grad_batches=1, - precision='16-mixed', default_root_dir=str(path_run_dir), callbacks=[checkpointing], enable_checkpointing=True, From fce123d353835104285f5d1590b1a5918eb55ccf Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 8 Jul 2025 10:18:37 +0200 Subject: [PATCH 009/337] add pre-trained model to docker image --- .gitignore | 3 +++ buildDockerImageAndStartupKits.sh | 8 ++++++-- docker_config/Dockerfile_ODELIA | 11 +++++++---- docker_config/master_template.yml | 2 +- 4 files changed, 17 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 57af2b10..86efbffa 100644 --- a/.gitignore +++ b/.gitignore @@ -180,3 +180,6 @@ provision # Ignore provisioned files /workspace/ + +# Ignore directory for caching pre-trained models +docker_config/torch_home_cache diff --git a/buildDockerImageAndStartupKits.sh b/buildDockerImageAndStartupKits.sh index dac8be1f..b39e92c2 100755 --- a/buildDockerImageAndStartupKits.sh +++ b/buildDockerImageAndStartupKits.sh @@ -30,8 +30,9 @@ DOCKER_IMAGE=jefftud/odelia:$VERSION # prepare clean version of source code repository clone for building Docker image CWD=`pwd` CLEAN_SOURCE_DIR=`mktemp -d` -cp -r . $CLEAN_SOURCE_DIR/ -cd $CLEAN_SOURCE_DIR +mkdir $CLEAN_SOURCE_DIR/MediSwarm +cp -r . $CLEAN_SOURCE_DIR/MediSwarm/ +cd $CLEAN_SOURCE_DIR/MediSwarm git clean -x -q -f . cd docker_config/NVFlare git clean -x -q -f . @@ -40,6 +41,9 @@ rm .git -rf chmod a+rX . 
-R cd $CWD +cp -r ./docker_config/torch_home_cache $CLEAN_SOURCE_DIR/torch_home_cache +chmod a+rX $CLEAN_SOURCE_DIR/torch_home_cache -R + docker build $DOCKER_BUILD_ARGS -t $DOCKER_IMAGE $CLEAN_SOURCE_DIR -f docker_config/Dockerfile_ODELIA echo "Docker image $DOCKER_IMAGE built successfully" diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 57da5891..a57e5b5d 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -54,18 +54,21 @@ RUN python3 -m pip cache purge # install ODELIA fork of NVFlare from local source WORKDIR /workspace/ -COPY ./docker_config/NVFlare /workspace/nvflare +COPY ./MediSwarm/docker_config/NVFlare /workspace/nvflare ## use startup kit template in the dashboard -COPY ./docker_config/master_template.yml /workspace/nvflare/nvflare/lighter/impl/ +COPY ./MediSwarm/docker_config/master_template.yml /workspace/nvflare/nvflare/lighter/impl/ RUN python3 -m pip install /workspace/nvflare RUN rm -rf /workspace/nvflare # Install the ODELIA controller package from local source -COPY ./controller /workspace/controller +COPY ./MediSwarm/controller /workspace/controller RUN python3 -m pip install /workspace/controller RUN rm -rf /workspace/controller # Copy the source code for local training and deploying to the swarm -COPY . 
/MediSwarm +COPY ./MediSwarm /MediSwarm RUN mkdir -p /fl_admin/transfer RUN ln -s /MediSwarm /fl_admin/transfer/MediSwarm + +# Copy pre-trained model weights to image +COPY ./torch_home_cache /torch_home diff --git a/docker_config/master_template.yml b/docker_config/master_template.yml index 968bffe7..e9cfba12 100644 --- a/docker_config/master_template.yml +++ b/docker_config/master_template.yml @@ -701,7 +701,7 @@ docker_cln_sh: | ENV_VARS="--env SITE_NAME={~~client_name~~} \ --env DATA_DIR=/data \ --env SCRATCH_DIR=/scratch \ - --env TORCH_HOME=/scratch \ + --env TORCH_HOME=/torch_home \ --env GPU_DEVICE=$GPU2USE \ --env MODEL_NAME=MST \ --env CONFIG=unilateral" From 95e589f3af26468d3605be124964a6e53b08d835 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 8 Jul 2025 14:15:51 +0200 Subject: [PATCH 010/337] check that the correct version of the pretrained weights is available and the code license has not changed --- buildDockerImageAndStartupKits.sh | 32 +++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/buildDockerImageAndStartupKits.sh b/buildDockerImageAndStartupKits.sh index b39e92c2..8ae98191 100755 --- a/buildDockerImageAndStartupKits.sh +++ b/buildDockerImageAndStartupKits.sh @@ -27,7 +27,9 @@ fi VERSION=`./getVersionNumber.sh` DOCKER_IMAGE=jefftud/odelia:$VERSION + # prepare clean version of source code repository clone for building Docker image + CWD=`pwd` CLEAN_SOURCE_DIR=`mktemp -d` mkdir $CLEAN_SOURCE_DIR/MediSwarm @@ -41,9 +43,31 @@ rm .git -rf chmod a+rX . -R cd $CWD -cp -r ./docker_config/torch_home_cache $CLEAN_SOURCE_DIR/torch_home_cache + +# prepare pre-trained model weights for being included in Docker image + +MODEL_WEIGHTS_FILE='docker_config/torch_home_cache/hub/checkpoints/dinov2_vits14_pretrain.pth' +MODEL_LICENSE_FILE='docker_config/torch_home_cache/hub/facebookresearch_dinov2_main/LICENSE' +if [[ ! -f $MODEL_WEIGHTS_FILE || ! 
-f $MODEL_LICENSE_FILE ]]; then + read -p "Pre-trained model not available. Build the image without them? " -n 1 -r + if [[ ! $REPLY = ^[Yy]$ ]]; then + BUILT_WITHOUT_PRETRAINED_WEIGHTS=1 + mkdir $CLEAN_SOURCE_DIR/torch_home_cache + else + exit 1 + fi +else + if echo 2e405cee1bad14912278296d4f42e993 $MODEL_WEIGHTS_FILE | md5sum --check - && echo 153d2db1c329326a2d9f881317ea942e $MODEL_LICENSE_FILE | md5sum --check -; then + cp -r ./docker_config/torch_home_cache $CLEAN_SOURCE_DIR/torch_home_cache + else + exit 1 + fi +fi chmod a+rX $CLEAN_SOURCE_DIR/torch_home_cache -R + +# build and print follow-up steps + docker build $DOCKER_BUILD_ARGS -t $DOCKER_IMAGE $CLEAN_SOURCE_DIR -f docker_config/Dockerfile_ODELIA echo "Docker image $DOCKER_IMAGE built successfully" @@ -53,4 +77,8 @@ echo "Startup kits built successfully" rm -rf $CLEAN_SOURCE_DIR -echo "If you wish, manually push $DOCKER_IMAGE now" +if [ -z BUILT_WITHOUT_PRETRAINED_WEIGHTS ]; then + echo "If you wish, manually push $DOCKER_IMAGE now" +else + echo "Now run a dummy training to download the pretrained model weights, export them to docker_config/torch_home_cache/hub, and re-build the image" +fi From 953857d5d27fa3194d6b6bb94b7cb39421045a67 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 8 Jul 2025 15:02:04 +0200 Subject: [PATCH 011/337] download pretrained model weights if not already available --- buildDockerImageAndStartupKits.sh | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/buildDockerImageAndStartupKits.sh b/buildDockerImageAndStartupKits.sh index 8ae98191..e1d582c8 100755 --- a/buildDockerImageAndStartupKits.sh +++ b/buildDockerImageAndStartupKits.sh @@ -49,19 +49,20 @@ cd $CWD MODEL_WEIGHTS_FILE='docker_config/torch_home_cache/hub/checkpoints/dinov2_vits14_pretrain.pth' MODEL_LICENSE_FILE='docker_config/torch_home_cache/hub/facebookresearch_dinov2_main/LICENSE' if [[ ! -f $MODEL_WEIGHTS_FILE || ! 
-f $MODEL_LICENSE_FILE ]]; then - read -p "Pre-trained model not available. Build the image without them? " -n 1 -r - if [[ ! $REPLY = ^[Yy]$ ]]; then - BUILT_WITHOUT_PRETRAINED_WEIGHTS=1 - mkdir $CLEAN_SOURCE_DIR/torch_home_cache - else - exit 1 - fi + echo "Pre-trained model not available. Attempting download" + HUBDIR=$(dirname $(dirname $MODEL_LICENSE_FILE)) + mkdir -p $(dirname $MODEL_WEIGHTS_FILE) + wget https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_pretrain.pth -O $MODEL_WEIGHTS_FILE + wget https://github.com/facebookresearch/dinov2/archive/refs/heads/main.zip -O /tmp/dinov2.zip + unzip /tmp/dinov2.zip -d $HUBDIR + mv $HUBDIR/dinov2-main $HUBDIR/$(basename $(dirname $MODEL_LICENSE_FILE)) + touch $HUBDIR/trusted_list +fi + +if echo 2e405cee1bad14912278296d4f42e993 $MODEL_WEIGHTS_FILE | md5sum --check - && echo 153d2db1c329326a2d9f881317ea942e $MODEL_LICENSE_FILE | md5sum --check -; then + cp -r ./docker_config/torch_home_cache $CLEAN_SOURCE_DIR/torch_home_cache else - if echo 2e405cee1bad14912278296d4f42e993 $MODEL_WEIGHTS_FILE | md5sum --check - && echo 153d2db1c329326a2d9f881317ea942e $MODEL_LICENSE_FILE | md5sum --check -; then - cp -r ./docker_config/torch_home_cache $CLEAN_SOURCE_DIR/torch_home_cache - else - exit 1 - fi + exit 1 fi chmod a+rX $CLEAN_SOURCE_DIR/torch_home_cache -R @@ -77,8 +78,4 @@ echo "Startup kits built successfully" rm -rf $CLEAN_SOURCE_DIR -if [ -z BUILT_WITHOUT_PRETRAINED_WEIGHTS ]; then - echo "If you wish, manually push $DOCKER_IMAGE now" -else - echo "Now run a dummy training to download the pretrained model weights, export them to docker_config/torch_home_cache/hub, and re-build the image" -fi +echo "If you wish, manually push $DOCKER_IMAGE now" From 7ad01789a5d4f3383ecdcfd1fd3cb8a548070c44 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 8 Jul 2025 15:32:20 +0200 Subject: [PATCH 012/337] quoted, as suggested by copilot --- scripts/dev_utils/remove_old_odelia_docker_images.sh | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/scripts/dev_utils/remove_old_odelia_docker_images.sh b/scripts/dev_utils/remove_old_odelia_docker_images.sh index c24081fd..7da4ee25 100755 --- a/scripts/dev_utils/remove_old_odelia_docker_images.sh +++ b/scripts/dev_utils/remove_old_odelia_docker_images.sh @@ -8,7 +8,7 @@ docker image list echo "The following Docker images are old ODELIA docker images:" -echo $OLD_ODELIA_DOCKER_IMAGES +echo "$OLD_ODELIA_DOCKER_IMAGES" read -p "Delete these Docker images, unless they have additional tags? (y/n): " answer From ee26af0d1ee16f2867039d0a053f73155daa89f1 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 8 Jul 2025 15:34:11 +0200 Subject: [PATCH 013/337] adapted to changed argument in script --- .github/workflows/pr-test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr-test.yaml b/.github/workflows/pr-test.yaml index eb9d0a2b..a9cd48a1 100644 --- a/.github/workflows/pr-test.yaml +++ b/.github/workflows/pr-test.yaml @@ -67,7 +67,7 @@ jobs: - name: Run 3D CNN preflight check continue-on-error: false run: | - ./runTestsInDocker.sh preflight_check + ./runTestsInDocker.sh run_3dcnn_tests echo "Preflight check finished" echo "=== Checking synthetic log output ===" ls -lh workspace/*/prod_00/client_A/logs || echo "No logs found for preflight" From 4860629db55090522d595d80f7d1e66f39d96048 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 8 Jul 2025 17:05:28 +0200 Subject: [PATCH 014/337] restored base image and pip package installation/versions that were accidentally downdated in a previous merge --- docker_config/Dockerfile_ODELIA | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index a57e5b5d..bd53ebbc 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -1,5 +1,5 @@ # Use the specified PyTorch image as the base -ARG 
PYTORCH_IMAGE=pytorch/pytorch:2.2.0-cuda12.1-cudnn8-runtime +ARG PYTORCH_IMAGE=pytorch/pytorch:2.2.2-cuda12.1-cudnn8-runtime FROM ${PYTORCH_IMAGE} # Specify the NVFlare version @@ -36,18 +36,18 @@ RUN rm -rf /var/lib/apt/lists/* RUN python3 -m pip uninstall -y conda conda-package-handling conda_index # Install specific versions of pip and setuptools -RUN python3 -m pip install -U pip==23.3.1 setuptools==75.8.2 +RUN python3 -m pip install -U pip==25.1.1 setuptools==80.8.0 # Install dependencies of NVFlare at fixed versions RUN python3 -m pip install --upgrade psutil==7.0.0 -RUN python3 -m pip install Flask==3.0.2 Flask-JWT-Extended==4.6.0 Flask-SQLAlchemy==3.1.1 PyJWT==2.10.1 SQLAlchemy==2.0.16 Werkzeug==3.0.1 blinker==1.9.0 docker==7.1.0 greenlet==3.1.1 grpcio==1.62.1 gunicorn==23.0.0 itsdangerous==2.2.0 msgpack==1.1.0 protobuf==4.24.4 pyhocon==0.3.61 pyparsing==3.0.9 websockets==15.0 +RUN python3 -m pip install Flask==3.0.2 Flask-JWT-Extended==4.6.0 Flask-SQLAlchemy==3.1.1 PyJWT==2.10.1 SQLAlchemy==2.0.16 Werkzeug==3.0.1 blinker==1.9.0 docker==7.1.0 greenlet==3.2.2 grpcio==1.62.1 gunicorn==23.0.0 itsdangerous==2.2.0 msgpack==1.1.0 protobuf==4.24.4 pyhocon==0.3.61 pyparsing==3.2.3 websockets==15.0.1 -# Install additional Python packages for swarm training at defined versions -RUN python3 -m pip install Deprecated==1.2.14 SimpleITK==2.2.1 absl-py==2.1.0 aiohttp==3.9.5 aiosignal==1.3.1 async-timeout==4.0.3 cachetools==5.3.3 contourpy==1.2.1 cycler==0.12.1 einops==0.8.1 et-xmlfile==1.1.0 fonttools==4.53.1 frozenlist==1.4.1 google-auth-oauthlib==1.0.0 google-auth==2.31.0 huggingface_hub==0.23.4 humanize==4.9.0 joblib==1.4.2 kiwisolver==1.4.5 lightning-utilities==0.11.3.post0 markdown-it-py==3.0.0 markdown==3.6 matplotlib==3.7.2 mdurl==0.1.2 monai==1.3.0 multidict==6.0.5 nibabel==5.2.1 oauthlib==3.2.2 openpyxl==3.1.0 pandas==2.2.2 pyasn1-modules==0.4.0 pyasn1==0.6.0 pydicom==2.4.4 python-dateutil==2.9.0.post0 pytorch-lightning==1.9.0 requests-oauthlib==2.0.0 
rich==13.7.1 rsa==4.9 safetensors==0.4.3 scikit-learn==1.3.0 scipy==1.14.0 seaborn==0.12.2 shellingham==1.5.4 tensorboard-data-server==0.7.2 tensorboard-plugin-wit==1.8.1 tensorboard==2.12.1 threadpoolctl==3.5.0 timm==0.9.16 torchio==0.19.6 torchmetrics==1.4.0.post0 torchvision==0.17.0 tqdm==4.65.0 typer==0.12.3 tzdata==2024.1 wrapt==1.16.0 x-transformers==2.4.9 yarl==1.9.4 +# Install additional Python packages for application code at defined versions +RUN python3 -m pip install Deprecated==1.2.18 SimpleITK==2.5.0 absl-py==2.2.2 aiohttp==3.11.18 aiosignal==1.3.2 async-timeout==5.0.1 cachetools==5.5.2 contourpy==1.3.2 cycler==0.12.1 et-xmlfile==2.0.0 fonttools==4.58.0 frozenlist==1.6.0 google-auth-oauthlib==1.2.2 google-auth==2.40.2 huggingface_hub==0.29.3 datasets==3.4.1 coral_pytorch==1.4.0 humanize==4.12.3 joblib==1.5.1 kiwisolver==1.4.8 lightning-utilities==0.14.3 markdown-it-py==3.0.0 markdown==3.8 matplotlib==3.9.2 mdurl==0.1.2 monai==1.4.0 multidict==6.4.4 nibabel==5.3.2 oauthlib==3.2.2 openpyxl==3.1.5 pandas==2.2.3 numpy==1.26.4 pyasn1-modules==0.4.2 pyasn1==0.6.1 pydicom==3.0.1 python-dateutil==2.9.0.post0 x-transformers==2.3.5 pytorch-lightning==2.4.0 requests==2.32.3 requests-oauthlib==2.0.0 rich==14.0.0 rsa==4.9.1 safetensors==0.5.3 scikit-learn==1.5.2 scipy==1.15.3 seaborn==0.13.2 wandb==0.18.6 einops==0.8.0 shellingham==1.5.4 tensorboard-data-server==0.7.2 tensorboard-plugin-wit==1.8.1 tensorboard==2.19.0 threadpoolctl==3.6.0 timm==1.0.15 torchio==0.20.1 torchmetrics==1.7.1 torchvision==0.17.2 torchaudio==2.2.2 tqdm==4.67.0 typer==0.15.4 tzdata==2025.2 wrapt==1.17.2 yarl==1.20.0 aiohappyeyeballs==2.6.1 annotated-types==0.7.0 dill==0.3.8 docker-pycreds==0.4.0 einx==0.3.0 frozendict==2.4.6 gitdb==4.0.12 gitpython==3.1.44 hf-xet==1.1.2 importlib-resources==6.5.2 loguru==0.7.3 multiprocess==0.70.16 propcache==0.3.1 pyarrow==20.0.0 pydantic==2.11.5 pydantic-core==2.33.2 sentry-sdk==2.29.1 setproctitle==1.3.6 smmap==5.0.2 typing-extensions==4.13.2 
typing-inspection==0.4.1 xxhash==3.5.0 # Install packages needed for testing and for listing licenses of installed packages -RUN python3 -m pip install coverage==7.5.4 mock==5.1.0 -RUN python3 -m pip install pip-licenses==5.0.0 prettytable==3.14.0 +RUN python3 -m pip install coverage==7.8.2 mock==5.2.0 +RUN python3 -m pip install pip-licenses==5.0.0 prettytable==3.16.0 # Clean up pip cache RUN python3 -m pip cache purge From 0d00d52a9e8ced4cd33696c444b71ff113d4e33b Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 8 Jul 2025 17:05:56 +0200 Subject: [PATCH 015/337] =?UTF-8?q?Revert=20"removed=20precision=20specifi?= =?UTF-8?q?cation=20that=20apparently=20does=20not=20work=20any=20more"=20?= =?UTF-8?q?=E2=80=93=20should=20work=20again=20with=20intended=20pytorch?= =?UTF-8?q?=20lightning=20version?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit f68b582d24a7064edbfc4dee3bb6e850bf94dcc5. --- .../ODELIA_ternary_classification/app/custom/threedcnn_ptl.py | 1 + 1 file changed, 1 insertion(+) diff --git a/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py b/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py index ce7f3740..b24d1281 100644 --- a/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py +++ b/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py @@ -122,6 +122,7 @@ def prepare_training(logger, max_epochs: int, site_name: str): trainer = Trainer( accelerator='gpu', accumulate_grad_batches=1, + precision='16-mixed', default_root_dir=str(path_run_dir), callbacks=[checkpointing], enable_checkpointing=True, From 6de586b303aa0e16bd3093cb47d48e4574cfc957 Mon Sep 17 00:00:00 2001 From: GitHub CI Date: Wed, 9 Jul 2025 11:02:19 +0200 Subject: [PATCH 016/337] chore: simplify APT package installation in Dockerfile and update build script for better error handling Signed-off-by: GitHub CI --- 
.github/workflows/pr-test.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/pr-test.yaml b/.github/workflows/pr-test.yaml index a9cd48a1..a75c14a5 100644 --- a/.github/workflows/pr-test.yaml +++ b/.github/workflows/pr-test.yaml @@ -64,13 +64,13 @@ jobs: echo "=== Checking log output ===" ls -lh workspace/*/prod_00/client_A/logs || echo "No logs found for dummy training" - - name: Run 3D CNN preflight check + - name: Run 3D CNN tests continue-on-error: false run: | ./runTestsInDocker.sh run_3dcnn_tests - echo "Preflight check finished" + echo "3D CNN tests check finished" echo "=== Checking synthetic log output ===" - ls -lh workspace/*/prod_00/client_A/logs || echo "No logs found for preflight" + ls -lh workspace/*/prod_00/client_A/logs || echo "No logs found for 3D CNN tests" - name: Run Unit Tests inside Docker continue-on-error: true From 8232bc2c872fa7fbaff2e5edcd8321d34321b2cb Mon Sep 17 00:00:00 2001 From: GitHub CI Date: Wed, 9 Jul 2025 11:16:27 +0200 Subject: [PATCH 017/337] WIP: remove apt versions for rebuild --- docker_config/Dockerfile_ODELIA | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index c6b4f894..c32087b5 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -12,13 +12,13 @@ ENV PYTHON_VERSION=3.10.14 # Install updates of installed packages RUN apt update -RUN apt install -y apt=2.4.14 apt-utils=2.4.14 libapt-pkg6.0=2.4.14 +RUN apt install -y apt apt-utils libapt-pkg6.0 # Update versions of installed packages -RUN apt install -y base-files=12ubuntu4.7 bash=5.1-6ubuntu1.1 bsdutils=1:2.37.2-4ubuntu3.4 ca-certificates=20240203~22.04.1 coreutils=8.32-4.1ubuntu1.2 dpkg=1.21.1ubuntu2.3 e2fsprogs=1.46.5-2ubuntu1.2 gpgv=2.2.27-3ubuntu2.3 libblkid1=2.37.2-4ubuntu3.4 libc-bin=2.35-0ubuntu3.10 libc-dev-bin=2.35-0ubuntu3.10 libc6-dev=2.35-0ubuntu3.10 libc6=2.35-0ubuntu3.10 
libcap2=1:2.44-1ubuntu0.22.04.2 libcom-err2=1.46.5-2ubuntu1.2 libext2fs2=1.46.5-2ubuntu1.2 libgnutls30=3.7.3-4ubuntu1.6 libgssapi-krb5-2=1.19.2-2ubuntu0.7 libk5crypto3=1.19.2-2ubuntu0.7 libkrb5-3=1.19.2-2ubuntu0.7 libkrb5support0=1.19.2-2ubuntu0.7 libmount1=2.37.2-4ubuntu3.4 libpam-modules-bin=1.4.0-11ubuntu2.6 libpam-modules=1.4.0-11ubuntu2.6 libpam-runtime=1.4.0-11ubuntu2.6 libpam0g=1.4.0-11ubuntu2.6 libseccomp2=2.5.3-2ubuntu3~22.04.1 libsmartcols1=2.37.2-4ubuntu3.4 libss2=1.46.5-2ubuntu1.2 libssl3=3.0.2-0ubuntu1.19 libsystemd0=249.11-0ubuntu3.16 libtasn1-6=4.18.0-4ubuntu0.1 libudev1=249.11-0ubuntu3.16 libuuid1=2.37.2-4ubuntu3.4 linux-libc-dev=5.15.0-143.153 login=1:4.8.1-2ubuntu2.2 logsave=1.46.5-2ubuntu1.2 mount=2.37.2-4ubuntu3.4 openssl=3.0.2-0ubuntu1.19 passwd=1:4.8.1-2ubuntu2.2 util-linux=2.37.2-4ubuntu3.4 +RUN apt install -y base-files bash bsdutils ca-certificates coreutils dpkg e2fsprogs gpgv libblkid1 libc-bin libc-dev-bin libc6-dev libc6 libcap2 libcom-err2 libext2fs2 libgnutls30 libgssapi-krb5-2 libk5crypto3 libkrb5-3 libkrb5support0 libmount1 libpam-modules-bin libpam-modules libpam-runtime libpam0g libseccomp2 libsmartcols1 libss2 libssl3 libsystemd0 libtasn1-6 libudev1 libuuid1 linux-libc-dev login logsave mount openssl passwd util-linux # Install apt-transport-https curl gnupg lsb-release zip and dependencies at defined versions -RUN apt install -y apt-transport-https=2.4.14 curl=7.81.0-1ubuntu1.20 dirmngr=2.2.27-3ubuntu2.3 distro-info-data=0.52ubuntu0.9 gnupg-l10n=2.2.27-3ubuntu2.3 gnupg-utils=2.2.27-3ubuntu2.3 gnupg=2.2.27-3ubuntu2.3 gpg-agent=2.2.27-3ubuntu2.3 gpg-wks-client=2.2.27-3ubuntu2.3 gpg-wks-server=2.2.27-3ubuntu2.3 gpg=2.2.27-3ubuntu2.3 gpgconf=2.2.27-3ubuntu2.3 gpgsm=2.2.27-3ubuntu2.3 libassuan0=2.5.5-1build1 libbrotli1=1.0.9-2build6 libcurl4=7.81.0-1ubuntu1.20 libexpat1=2.4.7-1ubuntu0.6 libksba8=1.6.0-2ubuntu0.2 libldap-2.5-0=2.5.19+dfsg-0ubuntu0.22.04.1 libldap-common=2.5.19+dfsg-0ubuntu0.22.04.1 libmpdec3=2.5.1-2build2 
libnghttp2-14=1.43.0-1ubuntu0.2 libnpth0=1.6-3build2 libpsl5=0.21.0-1.2build2 libpython3-stdlib=3.10.6-1~22.04.1 libpython3.10-minimal=3.10.12-1~22.04.10 libpython3.10-stdlib=3.10.12-1~22.04.10 libreadline8=8.1.2-1 librtmp1=2.4+20151223.gitfa8646d.1-2build4 libsasl2-2=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules-db=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules=2.1.27+dfsg2-3ubuntu1.2 libsqlite3-0=3.37.2-2ubuntu0.4 libssh-4=0.9.6-2ubuntu0.22.04.3 lsb-release=11.1.0ubuntu4 media-types=7.0.0 pinentry-curses=1.1.1-1build2 publicsuffix=20211207.1025-1 python3-minimal=3.10.6-1~22.04.1 python3.10-minimal=3.10.12-1~22.04.10 python3.10=3.10.12-1~22.04.10 python3=3.10.6-1~22.04.1 readline-common=8.1.2-1 unzip=6.0-26ubuntu3.2 zip=3.0-12build2 +RUN apt install -y apt-transport-https curl dirmngr distro-info-data gnupg-l10n gnupg-utils gnupg gpg-agent gpg-wks-client gpg-wks-server gpg gpgconf gpgsm libassuan0 libbrotli1 libcurl4 libexpat1 libksba8 libldap-2.5-0 libldap-common libmpdec3 libnghttp2-14 libnpth0 libpsl5 libpython3-stdlib libpython3.10-minimal libpython3.10-stdlib libreadline8 librtmp1 libsasl2-2 libsasl2-modules-db libsasl2-modules libsqlite3-0 libssh-4 lsb-release media-types pinentry-curses publicsuffix python3-minimal python3.10-minimal python3.10 python3 readline-common unzip zip # Prepare Docker installation RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc \ @@ -27,7 +27,7 @@ RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings && apt update # Install docker-ce docker-ce-cli containerd.io and dependencies at fixed versions -RUN apt install -y apparmor=3.0.4-2ubuntu2.4 containerd.io=1.7.27-1 dbus-user-session=1.12.20-2ubuntu4.1 dbus=1.12.20-2ubuntu4.1 dmsetup=2:1.02.175-2.1ubuntu5 docker-buildx-plugin=0.25.0-1~ubuntu.22.04~jammy docker-ce-cli=5:28.3.1-1~ubuntu.22.04~jammy docker-ce-rootless-extras=5:28.3.1-1~ubuntu.22.04~jammy docker-ce=5:28.3.1-1~ubuntu.22.04~jammy 
docker-compose-plugin=2.38.1-1~ubuntu.22.04~jammy gir1.2-glib-2.0=1.72.0-1 git-man=1:2.34.1-1ubuntu1.12 git=1:2.34.1-1ubuntu1.12 iptables=1.8.7-1ubuntu5.2 less=590-1ubuntu0.22.04.3 libapparmor1=3.0.4-2ubuntu2.4 libargon2-1=0~20171227-0.3 libbsd0=0.11.5-1 libcbor0.8=0.8.0-2ubuntu1 libcryptsetup12=2:2.4.3-1ubuntu1.3 libcurl3-gnutls=7.81.0-1ubuntu1.20 libdbus-1-3=1.12.20-2ubuntu4.1 libdevmapper1.02.1=2:1.02.175-2.1ubuntu5 libedit2=3.1-20210910-1build1 liberror-perl=0.17029-1 libfido2-1=1.10.0-1 libgdbm-compat4=1.23-1 libgdbm6=1.23-1 libgirepository-1.0-1=1.72.0-1 libglib2.0-0=2.72.4-0ubuntu2.5 libglib2.0-data=2.72.4-0ubuntu2.5 libicu70=70.1-2 libip4tc2=1.8.7-1ubuntu5.2 libip6tc2=1.8.7-1ubuntu5.2 libjson-c5=0.15-3~ubuntu1.22.04.2 libkmod2=29-1ubuntu1 libltdl7=2.4.6-15build2 libmd0=1.0.4-1build1 libmnl0=1.0.4-3build2 libnetfilter-conntrack3=1.0.9-1 libnfnetlink0=1.0.1-3build3 libnftnl11=1.2.1-1build1 libnss-systemd=249.11-0ubuntu3.16 libpam-systemd=249.11-0ubuntu3.16 libperl5.34=5.34.0-3ubuntu1.4 libslirp0=4.6.1-1build1 libx11-6=2:1.7.5-1ubuntu0.3 libx11-data=2:1.7.5-1ubuntu0.3 libxau6=1:1.0.9-1build5 libxcb1=1.14-3ubuntu3 libxdmcp6=1:1.1.3-0ubuntu5 libxext6=2:1.3.4-1build1 libxml2=2.9.13+dfsg-1ubuntu0.7 libxmuu1=2:1.1.3-3 libxtables12=1.8.7-1ubuntu5.2 netbase=6.3 networkd-dispatcher=2.1-2ubuntu0.22.04.2 openssh-client=1:8.9p1-3ubuntu0.13 patch=2.7.6-7build2 perl-base=5.34.0-3ubuntu1.4 perl-modules-5.34=5.34.0-3ubuntu1.4 perl=5.34.0-3ubuntu1.4 pigz=2.6-1 python3-dbus=1.2.18-3build1 python3-gi=3.42.1-0ubuntu1 shared-mime-info=2.1-2 slirp4netns=1.0.1-2 systemd-sysv=249.11-0ubuntu3.16 systemd-timesyncd=249.11-0ubuntu3.16 systemd=249.11-0ubuntu3.16 xauth=1:1.1-1build2 xdg-user-dirs=0.17-2ubuntu4 xz-utils=5.2.5-2ubuntu1 +RUN apt install -y apparmor containerd.io dbus-user-session dbus dmsetup docker-buildx-plugin docker-ce-cli docker-ce-rootless-extras docker-ce docker-compose-plugin gir1.2-glib-2.0 git-man git iptables less libapparmor1 libargon2-1 libbsd0 libcbor0.8 
libcryptsetup12 libcurl3-gnutls libdbus-1-3 libdevmapper1.02.1 libedit2 liberror-perl libfido2-1 libgdbm-compat4 libgdbm6 libgirepository-1.0-1 libglib2.0-0 libglib2.0-data libicu70 libip4tc2 libip6tc2 libjson-c5 libkmod2 libltdl7 libmd0 libmnl0 libnetfilter-conntrack3 libnfnetlink0 libnftnl11 libnss-systemd libpam-systemd libperl5.34 libslirp0 libx11-6 libx11-data libxau6 libxcb1 libxdmcp6 libxext6 libxml2 libxmuu1 libxtables12 netbase networkd-dispatcher openssh-client patch perl-base perl-modules-5.34 perl pigz python3-dbus python3-gi shared-mime-info slirp4netns systemd-sysv systemd-timesyncd systemd xauth xdg-user-dirs xz-utils # Clean up apt cache RUN rm -rf /var/lib/apt/lists/* From 003f5fbca16e4f1aea5b8d2765fcfb7434bbe845 Mon Sep 17 00:00:00 2001 From: GitHub CI Date: Wed, 9 Jul 2025 11:19:56 +0200 Subject: [PATCH 018/337] chore: update apt versions based on rebuild --- docker_config/Dockerfile_ODELIA | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index c32087b5..95cf5035 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -12,13 +12,13 @@ ENV PYTHON_VERSION=3.10.14 # Install updates of installed packages RUN apt update -RUN apt install -y apt apt-utils libapt-pkg6.0 +RUN apt install -y apt=2.4.14 apt-utils=2.4.14 libapt-pkg6.0=2.4.14 # Update versions of installed packages -RUN apt install -y base-files bash bsdutils ca-certificates coreutils dpkg e2fsprogs gpgv libblkid1 libc-bin libc-dev-bin libc6-dev libc6 libcap2 libcom-err2 libext2fs2 libgnutls30 libgssapi-krb5-2 libk5crypto3 libkrb5-3 libkrb5support0 libmount1 libpam-modules-bin libpam-modules libpam-runtime libpam0g libseccomp2 libsmartcols1 libss2 libssl3 libsystemd0 libtasn1-6 libudev1 libuuid1 linux-libc-dev login logsave mount openssl passwd util-linux +RUN apt install -y base-files=12ubuntu4.7 bash=5.1-6ubuntu1.1 bsdutils=1:2.37.2-4ubuntu3.4 
ca-certificates=20240203~22.04.1 coreutils=8.32-4.1ubuntu1.2 dpkg=1.21.1ubuntu2.3 e2fsprogs=1.46.5-2ubuntu1.2 gpgv=2.2.27-3ubuntu2.4 libblkid1=2.37.2-4ubuntu3.4 libc-bin=2.35-0ubuntu3.10 libc-dev-bin=2.35-0ubuntu3.10 libc6-dev=2.35-0ubuntu3.10 libc6=2.35-0ubuntu3.10 libcap2=1:2.44-1ubuntu0.22.04.2 libcom-err2=1.46.5-2ubuntu1.2 libext2fs2=1.46.5-2ubuntu1.2 libgnutls30=3.7.3-4ubuntu1.6 libgssapi-krb5-2=1.19.2-2ubuntu0.7 libk5crypto3=1.19.2-2ubuntu0.7 libkrb5-3=1.19.2-2ubuntu0.7 libkrb5support0=1.19.2-2ubuntu0.7 libmount1=2.37.2-4ubuntu3.4 libpam-modules-bin=1.4.0-11ubuntu2.6 libpam-modules=1.4.0-11ubuntu2.6 libpam-runtime=1.4.0-11ubuntu2.6 libpam0g=1.4.0-11ubuntu2.6 libseccomp2=2.5.3-2ubuntu3~22.04.1 libsmartcols1=2.37.2-4ubuntu3.4 libss2=1.46.5-2ubuntu1.2 libssl3=3.0.2-0ubuntu1.19 libsystemd0=249.11-0ubuntu3.16 libtasn1-6=4.18.0-4ubuntu0.1 libudev1=249.11-0ubuntu3.16 libuuid1=2.37.2-4ubuntu3.4 linux-libc-dev=5.15.0-143.153 login=1:4.8.1-2ubuntu2.2 logsave=1.46.5-2ubuntu1.2 mount=2.37.2-4ubuntu3.4 openssl=3.0.2-0ubuntu1.19 passwd=1:4.8.1-2ubuntu2.2 util-linux=2.37.2-4ubuntu3.4 # Install apt-transport-https curl gnupg lsb-release zip and dependencies at defined versions -RUN apt install -y apt-transport-https curl dirmngr distro-info-data gnupg-l10n gnupg-utils gnupg gpg-agent gpg-wks-client gpg-wks-server gpg gpgconf gpgsm libassuan0 libbrotli1 libcurl4 libexpat1 libksba8 libldap-2.5-0 libldap-common libmpdec3 libnghttp2-14 libnpth0 libpsl5 libpython3-stdlib libpython3.10-minimal libpython3.10-stdlib libreadline8 librtmp1 libsasl2-2 libsasl2-modules-db libsasl2-modules libsqlite3-0 libssh-4 lsb-release media-types pinentry-curses publicsuffix python3-minimal python3.10-minimal python3.10 python3 readline-common unzip zip +RUN apt install -y apt-transport-https=2.4.14 curl=7.81.0-1ubuntu1.20 dirmngr=2.2.27-3ubuntu2.4 distro-info-data=0.52ubuntu0.9 gnupg-l10n=2.2.27-3ubuntu2.4 gnupg-utils=2.2.27-3ubuntu2.4 gnupg=2.2.27-3ubuntu2.4 gpg-agent=2.2.27-3ubuntu2.4 
gpg-wks-client=2.2.27-3ubuntu2.4 gpg-wks-server=2.2.27-3ubuntu2.4 gpg=2.2.27-3ubuntu2.4 gpgconf=2.2.27-3ubuntu2.4 gpgsm=2.2.27-3ubuntu2.4 libassuan0=2.5.5-1build1 libbrotli1=1.0.9-2build6 libcurl4=7.81.0-1ubuntu1.20 libexpat1=2.4.7-1ubuntu0.6 libksba8=1.6.0-2ubuntu0.2 libldap-2.5-0=2.5.19+dfsg-0ubuntu0.22.04.1 libldap-common=2.5.19+dfsg-0ubuntu0.22.04.1 libmpdec3=2.5.1-2build2 libnghttp2-14=1.43.0-1ubuntu0.2 libnpth0=1.6-3build2 libpsl5=0.21.0-1.2build2 libpython3-stdlib=3.10.6-1~22.04.1 libpython3.10-minimal=3.10.12-1~22.04.10 libpython3.10-stdlib=3.10.12-1~22.04.10 libreadline8=8.1.2-1 librtmp1=2.4+20151223.gitfa8646d.1-2build4 libsasl2-2=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules-db=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules=2.1.27+dfsg2-3ubuntu1.2 libsqlite3-0=3.37.2-2ubuntu0.4 libssh-4=0.9.6-2ubuntu0.22.04.4 lsb-release=11.1.0ubuntu4 media-types=7.0.0 pinentry-curses=1.1.1-1build2 publicsuffix=20211207.1025-1 python3-minimal=3.10.6-1~22.04.1 python3.10-minimal=3.10.12-1~22.04.10 python3.10=3.10.12-1~22.04.10 python3=3.10.6-1~22.04.1 readline-common=8.1.2-1 unzip=6.0-26ubuntu3.2 zip=3.0-12build2 # Prepare Docker installation RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc \ @@ -27,7 +27,7 @@ RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings && apt update # Install docker-ce docker-ce-cli containerd.io and dependencies at fixed versions -RUN apt install -y apparmor containerd.io dbus-user-session dbus dmsetup docker-buildx-plugin docker-ce-cli docker-ce-rootless-extras docker-ce docker-compose-plugin gir1.2-glib-2.0 git-man git iptables less libapparmor1 libargon2-1 libbsd0 libcbor0.8 libcryptsetup12 libcurl3-gnutls libdbus-1-3 libdevmapper1.02.1 libedit2 liberror-perl libfido2-1 libgdbm-compat4 libgdbm6 libgirepository-1.0-1 libglib2.0-0 libglib2.0-data libicu70 libip4tc2 libip6tc2 libjson-c5 libkmod2 libltdl7 libmd0 libmnl0 libnetfilter-conntrack3 libnfnetlink0 libnftnl11 libnss-systemd 
libpam-systemd libperl5.34 libslirp0 libx11-6 libx11-data libxau6 libxcb1 libxdmcp6 libxext6 libxml2 libxmuu1 libxtables12 netbase networkd-dispatcher openssh-client patch perl-base perl-modules-5.34 perl pigz python3-dbus python3-gi shared-mime-info slirp4netns systemd-sysv systemd-timesyncd systemd xauth xdg-user-dirs xz-utils +RUN apt install -y apparmor=3.0.4-2ubuntu2.4 containerd.io=1.7.27-1 dbus-user-session=1.12.20-2ubuntu4.1 dbus=1.12.20-2ubuntu4.1 dmsetup=2:1.02.175-2.1ubuntu5 docker-buildx-plugin docker-ce-cli docker-ce-rootless-extras docker-ce docker-compose-plugin gir1.2-glib-2.0=1.72.0-1 git-man=1:2.34.1-1ubuntu1.13 git=1:2.34.1-1ubuntu1.13 iptables=1.8.7-1ubuntu5.2 less=590-1ubuntu0.22.04.3 libapparmor1=3.0.4-2ubuntu2.4 libargon2-1=0~20171227-0.3 libbsd0=0.11.5-1 libcbor0.8=0.8.0-2ubuntu1 libcryptsetup12=2:2.4.3-1ubuntu1.3 libcurl3-gnutls=7.81.0-1ubuntu1.20 libdbus-1-3=1.12.20-2ubuntu4.1 libdevmapper1.02.1=2:1.02.175-2.1ubuntu5 libedit2=3.1-20210910-1build1 liberror-perl=0.17029-1 libfido2-1=1.10.0-1 libgdbm-compat4=1.23-1 libgdbm6=1.23-1 libgirepository-1.0-1=1.72.0-1 libglib2.0-0=2.72.4-0ubuntu2.5 libglib2.0-data=2.72.4-0ubuntu2.5 libicu70=70.1-2 libip4tc2=1.8.7-1ubuntu5.2 libip6tc2=1.8.7-1ubuntu5.2 libjson-c5=0.15-3~ubuntu1.22.04.2 libkmod2=29-1ubuntu1 libltdl7=2.4.6-15build2 libmd0=1.0.4-1build1 libmnl0=1.0.4-3build2 libnetfilter-conntrack3=1.0.9-1 libnfnetlink0=1.0.1-3build3 libnftnl11=1.2.1-1build1 libnss-systemd=249.11-0ubuntu3.16 libpam-systemd=249.11-0ubuntu3.16 libperl5.34=5.34.0-3ubuntu1.4 libslirp0=4.6.1-1build1 libx11-6=2:1.7.5-1ubuntu0.3 libx11-data=2:1.7.5-1ubuntu0.3 libxau6=1:1.0.9-1build5 libxcb1=1.14-3ubuntu3 libxdmcp6=1:1.1.3-0ubuntu5 libxext6=2:1.3.4-1build1 libxml2=2.9.13+dfsg-1ubuntu0.7 libxmuu1=2:1.1.3-3 libxtables12=1.8.7-1ubuntu5.2 netbase=6.3 networkd-dispatcher=2.1-2ubuntu0.22.04.2 openssh-client=1:8.9p1-3ubuntu0.13 patch=2.7.6-7build2 perl-base=5.34.0-3ubuntu1.4 perl-modules-5.34=5.34.0-3ubuntu1.4 perl=5.34.0-3ubuntu1.4 
pigz=2.6-1 python3-dbus=1.2.18-3build1 python3-gi=3.42.1-0ubuntu1 shared-mime-info=2.1-2 slirp4netns=1.0.1-2 systemd-sysv=249.11-0ubuntu3.16 systemd-timesyncd=249.11-0ubuntu3.16 systemd=249.11-0ubuntu3.16 xauth=1:1.1-1build2 xdg-user-dirs=0.17-2ubuntu4 xz-utils=5.2.5-2ubuntu1 # Clean up apt cache RUN rm -rf /var/lib/apt/lists/* @@ -68,4 +68,4 @@ RUN rm -rf /workspace/controller # Copy the source code for local training and deploying to the swarm COPY . /MediSwarm RUN mkdir -p /fl_admin/transfer -RUN ln -s /MediSwarm /fl_admin/transfer/MediSwarm \ No newline at end of file +RUN ln -s /MediSwarm /fl_admin/transfer/MediSwarm From 40914c2eab04c1c5a64eda882289d4214bd848ac Mon Sep 17 00:00:00 2001 From: Jeff Date: Wed, 9 Jul 2025 16:09:59 +0200 Subject: [PATCH 019/337] fix echo --- .github/workflows/update-apt-versions.yml | 17 ++++++++--------- scripts/ci/update_apt_versions.sh | 10 +++------- 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/.github/workflows/update-apt-versions.yml b/.github/workflows/update-apt-versions.yml index baeeea8f..6074ff5a 100644 --- a/.github/workflows/update-apt-versions.yml +++ b/.github/workflows/update-apt-versions.yml @@ -2,7 +2,6 @@ name: Auto Update APT Versions on: schedule: - # Every day at 05:00 UTC - cron: '0 5 * * *' workflow_dispatch: @@ -30,10 +29,6 @@ jobs: git config --global user.email "ci@github.com" git config --global user.name "GitHub CI" - - name: Create and switch to apt-update branch - run: | - git checkout -b ci/apt-update || git switch ci/apt-update - - name: Make update script executable run: chmod +x scripts/ci/update_apt_versions.sh @@ -43,18 +38,22 @@ jobs: - name: Show git diff for debugging run: git diff - - name: Push ci/apt-update to origin - if: env.NO_CHANGES == 'false' - run: git push origin ci/apt-update --force - - name: Create Pull Request if: env.NO_CHANGES == 'false' + id: cpr uses: peter-evans/create-pull-request@v5 with: commit-message: "chore: update apt versions in 
Dockerfile_ODELIA" branch: ci/apt-update + branch-suffix: timestamp title: "chore: Update APT versions in Dockerfile" body: | This PR automatically updates APT package version numbers in `Dockerfile_ODELIA` based on a rebuild and inspection of installation logs. base: main + delete-branch: false + + - name: Print created PR URL + if: env.NO_CHANGES == 'false' + run: | + echo "Created PR: ${{ steps.cpr.outputs.pull-request-url }}" diff --git a/scripts/ci/update_apt_versions.sh b/scripts/ci/update_apt_versions.sh index 9c3af31b..6dfc7b6c 100755 --- a/scripts/ci/update_apt_versions.sh +++ b/scripts/ci/update_apt_versions.sh @@ -1,5 +1,4 @@ #!/usr/bin/env bash - set -e DOCKERFILE_PATH="docker_config/Dockerfile_ODELIA" @@ -22,7 +21,6 @@ if [ "$exit_code" -ne 0 ]; then exit "$exit_code" fi - echo "[INFO] Re-adding updated APT version pins to Dockerfile..." scripts/dev_utils/dockerfile_update_addAptVersionNumbers.py "$DOCKERFILE_PATH" "$LOG_PATH" rm "$LOG_PATH" @@ -42,11 +40,9 @@ while IFS= read -r match; do fi done < <(grep -oP '\b[a-z0-9\.\-]+=[a-zA-Z0-9:~.+-]+\b' "$DOCKERFILE_PATH") -if git diff --quiet; then - echo "[INFO] No changes to apt versions found. Skipping commit." +git fetch origin main +if git diff --quiet origin/main..HEAD; then echo "NO_CHANGES=true" >> "$GITHUB_ENV" else - echo "[INFO] Committing updated apt versions..." 
- git commit "$DOCKERFILE_PATH" -m "chore: update apt versions based on rebuild" echo "NO_CHANGES=false" >> "$GITHUB_ENV" -fi +fi \ No newline at end of file From 3c7e3e81bf7bf93f9ca8d0a367d7e5b13355024e Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 9 Jul 2025 16:26:08 +0200 Subject: [PATCH 020/337] slightly extended README and marked todos --- README.md | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 1defd46f..17ae0dfa 100644 --- a/README.md +++ b/README.md @@ -44,11 +44,11 @@ A VPN is necessary so that the swarm nodes can communicate with each other secur # Usage for Swarm Participants ## Setup 1. Make sure your compute node satisfies the specification and has the necessary software installed. -2. Clone the repository and connect the client node to the VPN as described above. +2. Clone the repository and connect the client node to the VPN as described above. TODO is cloning the repository necessary for swarm participants? 3. TODO anything else? ## Prepare Dataset -1. TODO which data is expected in which folder structure + table structure +1. see Step 3: Prepare Data in (this document)[application/jobs/ODELIA_ternary_classification/app/scripts/README.md] ## Prepare Training Participation 1. Extract startup kit provided by swarm operator @@ -56,7 +56,7 @@ A VPN is necessary so that the swarm nodes can communicate with each other secur ## Run Pre-Flight Check 1. Directories ```bash - export SITE_NAME= # TODO should be defined above, also needed for dataset location + export SITE_NAME= # TODO should be defined above, also needed for dataset location export DATADIR= export SCRATCHDIR= ``` @@ -76,23 +76,24 @@ A VPN is necessary so that the swarm nodes can communicate with each other secur ./docker.sh --data_dir $DATADIR --scratch_dir $SCRATCHDIR --GPU device=0 --preflight_check ``` * Training time depends on the size of the local dataset. 
- * This will download pre-trained model weights if used in the training, if not already cached locally ## Configurable Parameters for docker.sh +TODO consider what should be described and recommended as configurable here, given that the goal of the startup kits is to ensure everyone runs the same training + When launching the client using `./docker.sh`, the following environment variables are automatically passed into the container. You can override them to customize training behavior: | Environment Variable | Default | Description | |----------------------|----------------|-----------------------------------------------------------------------------| -| `SITE_NAME` | *from flag* | Name of your local site, e.g. `TUD_1`, passed via `--start_client` | -| `DATA_DIR` | *from flag* | Path to the host folder that contains your local data | -| `SCRATCH_DIR` | *from flag* | Path for saving training outputs and temporary files | -| `GPU_DEVICE` | `device=0` | GPU identifier to use inside the container (or `all`) | -| `MODEL` | `MST` | Model architecture, choices: `MST`, `ResNet` | -| `INSTITUTION` | `ODELIA` | Institution name, used to group experiment logs | -| `CONFIG` | `unilateral` | Configuration schema for dataset (e.g. label scheme) | -| `NUM_EPOCHS` | `1` (test mode)| Number of training epochs (used in preflight/local training) | -| `TRAINING_MODE` | derived | Internal use. Automatically set based on flags like `--start_client` | +| `SITE_NAME` | *from flag* | Name of your local site, e.g. 
`TUD_1`, passed via `--start_client` | +| `DATA_DIR` | *from flag* | Path to the host folder that contains your local data | +| `SCRATCH_DIR` | *from flag* | Path for saving training outputs and temporary files | +| `GPU_DEVICE` | `device=0` | GPU identifier to use inside the container (or `all`) | +| `MODEL` | `MST` | Model architecture, choices: `MST`, `ResNet` | +| `INSTITUTION` | `ODELIA` | Institution name, used to group experiment logs | +| `CONFIG` | `unilateral` | Configuration schema for dataset (e.g. label scheme) | +| `NUM_EPOCHS` | `1` (test mode)| Number of training epochs (used in preflight/local training) | +| `TRAINING_MODE` | derived | Internal use. Automatically set based on flags like `--start_client` | These are injected into the container as `--env` variables. You can modify their defaults by editing `docker.sh` or exporting before run: @@ -102,8 +103,6 @@ export CONFIG=original ./docker.sh --data_dir $DATADIR --scratch_dir $SCRATCHDIR --GPU device=1 --start_client ``` ---- - ## Start Swarm Node 1. From the directory where you unpacked the startup kit: @@ -117,7 +116,7 @@ export CONFIG=original ``` If you have multiple GPUs and 0 is busy, use a different one. -3. Console output is captured in `nohup.out`, which may have been created by the root user in the container, so make it readable: +3. Console output is captured in `nohup.out`, which may have been created with limited permissions in the container, so make it readable if necessary: ```bash sudo chmod a+r nohup.out ``` @@ -125,15 +124,17 @@ export CONFIG=original 4. 
Output files: - **Training logs and checkpoints** are saved under: ``` - $SCRATCHDIR/runs/INSTITUTION/MODEL_TASK_CONFIG_TIMESTAMP/ + $SCRATCHDIR/runs/$SITE_NAME// ``` - **Best checkpoint** usually saved as `best.ckpt` or `last.ckpt` - **Prediction results**, if enabled, will appear in subfolders of the same directory - - **TensorBoard or WandB logs**, if activated, are stored in their respective folders inside the run directory + - **TensorBoard logs**, if activated, are stored in their respective folders inside the run directory + - TODO what is enabled/activated should be hard-coded, adapt accordingly 5. (Optional) You can verify that the container is running properly: ```bash docker ps # Check if odelia_swarm_client_$SITE_NAME is listed + nvidia-smi # Check if the GPU is busy training (it will be idling while waiting for model transfer) tail -f nohup.out # Follow training log ``` @@ -177,6 +178,7 @@ You should see 3. output of a successful proof-of-concept run run with two nodes 4. output of a set of startup kits being generated 5. output of a dummy training run using one of the startup kits +6. TODO update this to what the tests output now Optionally, uncomment running NVFlare unit tests in `_runTestsInsideDocker.sh`. 
From b233434321224ef19109fd75b13945c31af88fb9 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 9 Jul 2025 16:32:48 +0200 Subject: [PATCH 021/337] updated apt package versions --- docker_config/Dockerfile_ODELIA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 26348579..901e4356 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -27,7 +27,7 @@ RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings && apt update # Install docker-ce docker-ce-cli containerd.io and dependencies at fixed versions -RUN apt install -y apparmor=3.0.4-2ubuntu2.4 containerd.io=1.7.27-1 dbus-user-session=1.12.20-2ubuntu4.1 dbus=1.12.20-2ubuntu4.1 dmsetup=2:1.02.175-2.1ubuntu5 docker-buildx-plugin=0.25.0-1~ubuntu.22.04~jammy docker-ce-cli=5:28.3.1-1~ubuntu.22.04~jammy docker-ce-rootless-extras=5:28.3.1-1~ubuntu.22.04~jammy docker-ce=5:28.3.1-1~ubuntu.22.04~jammy docker-compose-plugin=2.38.1-1~ubuntu.22.04~jammy gir1.2-glib-2.0=1.72.0-1 git-man=1:2.34.1-1ubuntu1.12 git=1:2.34.1-1ubuntu1.12 iptables=1.8.7-1ubuntu5.2 less=590-1ubuntu0.22.04.3 libapparmor1=3.0.4-2ubuntu2.4 libargon2-1=0~20171227-0.3 libbsd0=0.11.5-1 libcbor0.8=0.8.0-2ubuntu1 libcryptsetup12=2:2.4.3-1ubuntu1.3 libcurl3-gnutls=7.81.0-1ubuntu1.20 libdbus-1-3=1.12.20-2ubuntu4.1 libdevmapper1.02.1=2:1.02.175-2.1ubuntu5 libedit2=3.1-20210910-1build1 liberror-perl=0.17029-1 libfido2-1=1.10.0-1 libgdbm-compat4=1.23-1 libgdbm6=1.23-1 libgirepository-1.0-1=1.72.0-1 libglib2.0-0=2.72.4-0ubuntu2.5 libglib2.0-data=2.72.4-0ubuntu2.5 libicu70=70.1-2 libip4tc2=1.8.7-1ubuntu5.2 libip6tc2=1.8.7-1ubuntu5.2 libjson-c5=0.15-3~ubuntu1.22.04.2 libkmod2=29-1ubuntu1 libltdl7=2.4.6-15build2 libmd0=1.0.4-1build1 libmnl0=1.0.4-3build2 libnetfilter-conntrack3=1.0.9-1 libnfnetlink0=1.0.1-3build3 libnftnl11=1.2.1-1build1 libnss-systemd=249.11-0ubuntu3.16 libpam-systemd=249.11-0ubuntu3.16 
libperl5.34=5.34.0-3ubuntu1.4 libslirp0=4.6.1-1build1 libx11-6=2:1.7.5-1ubuntu0.3 libx11-data=2:1.7.5-1ubuntu0.3 libxau6=1:1.0.9-1build5 libxcb1=1.14-3ubuntu3 libxdmcp6=1:1.1.3-0ubuntu5 libxext6=2:1.3.4-1build1 libxml2=2.9.13+dfsg-1ubuntu0.7 libxmuu1=2:1.1.3-3 libxtables12=1.8.7-1ubuntu5.2 netbase=6.3 networkd-dispatcher=2.1-2ubuntu0.22.04.2 openssh-client=1:8.9p1-3ubuntu0.13 patch=2.7.6-7build2 perl-base=5.34.0-3ubuntu1.4 perl-modules-5.34=5.34.0-3ubuntu1.4 perl=5.34.0-3ubuntu1.4 pigz=2.6-1 python3-dbus=1.2.18-3build1 python3-gi=3.42.1-0ubuntu1 shared-mime-info=2.1-2 slirp4netns=1.0.1-2 systemd-sysv=249.11-0ubuntu3.16 systemd-timesyncd=249.11-0ubuntu3.16 systemd=249.11-0ubuntu3.16 xauth=1:1.1-1build2 xdg-user-dirs=0.17-2ubuntu4 xz-utils=5.2.5-2ubuntu1 +RUN apt install -y apparmor=3.0.4-2ubuntu2.4 containerd.io=1.7.27-1 dbus-user-session=1.12.20-2ubuntu4.1 dbus=1.12.20-2ubuntu4.1 dmsetup=2:1.02.175-2.1ubuntu5 docker-buildx-plugin=0.25.0-1~ubuntu.22.04~jammy docker-ce-cli=5:28.3.1-1~ubuntu.22.04~jammy docker-ce-rootless-extras=5:28.3.1-1~ubuntu.22.04~jammy docker-ce=5:28.3.1-1~ubuntu.22.04~jammy docker-compose-plugin=2.38.1-1~ubuntu.22.04~jammy gir1.2-glib-2.0=1.72.0-1 git-man=1:2.34.1-1ubuntu1.13 git=1:2.34.1-1ubuntu1.13 iptables=1.8.7-1ubuntu5.2 less=590-1ubuntu0.22.04.3 libapparmor1=3.0.4-2ubuntu2.4 libargon2-1=0~20171227-0.3 libbsd0=0.11.5-1 libcbor0.8=0.8.0-2ubuntu1 libcryptsetup12=2:2.4.3-1ubuntu1.3 libcurl3-gnutls=7.81.0-1ubuntu1.20 libdbus-1-3=1.12.20-2ubuntu4.1 libdevmapper1.02.1=2:1.02.175-2.1ubuntu5 libedit2=3.1-20210910-1build1 liberror-perl=0.17029-1 libfido2-1=1.10.0-1 libgdbm-compat4=1.23-1 libgdbm6=1.23-1 libgirepository-1.0-1=1.72.0-1 libglib2.0-0=2.72.4-0ubuntu2.5 libglib2.0-data=2.72.4-0ubuntu2.5 libicu70=70.1-2 libip4tc2=1.8.7-1ubuntu5.2 libip6tc2=1.8.7-1ubuntu5.2 libjson-c5=0.15-3~ubuntu1.22.04.2 libkmod2=29-1ubuntu1 libltdl7=2.4.6-15build2 libmd0=1.0.4-1build1 libmnl0=1.0.4-3build2 libnetfilter-conntrack3=1.0.9-1 libnfnetlink0=1.0.1-3build3 
libnftnl11=1.2.1-1build1 libnss-systemd=249.11-0ubuntu3.16 libpam-systemd=249.11-0ubuntu3.16 libperl5.34=5.34.0-3ubuntu1.4 libslirp0=4.6.1-1build1 libx11-6=2:1.7.5-1ubuntu0.3 libx11-data=2:1.7.5-1ubuntu0.3 libxau6=1:1.0.9-1build5 libxcb1=1.14-3ubuntu3 libxdmcp6=1:1.1.3-0ubuntu5 libxext6=2:1.3.4-1build1 libxml2=2.9.13+dfsg-1ubuntu0.7 libxmuu1=2:1.1.3-3 libxtables12=1.8.7-1ubuntu5.2 netbase=6.3 networkd-dispatcher=2.1-2ubuntu0.22.04.2 openssh-client=1:8.9p1-3ubuntu0.13 patch=2.7.6-7build2 perl-base=5.34.0-3ubuntu1.4 perl-modules-5.34=5.34.0-3ubuntu1.4 perl=5.34.0-3ubuntu1.4 pigz=2.6-1 python3-dbus=1.2.18-3build1 python3-gi=3.42.1-0ubuntu1 shared-mime-info=2.1-2 slirp4netns=1.0.1-2 systemd-sysv=249.11-0ubuntu3.16 systemd-timesyncd=249.11-0ubuntu3.16 systemd=249.11-0ubuntu3.16 xauth=1:1.1-1build2 xdg-user-dirs=0.17-2ubuntu4 xz-utils=5.2.5-2ubuntu1 # Clean up apt cache RUN rm -rf /var/lib/apt/lists/* From badbc6aebac4764f54865dc574a95ebff1c7a567 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 9 Jul 2025 16:49:00 +0200 Subject: [PATCH 022/337] incremented version number --- odelia_image.version | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/odelia_image.version b/odelia_image.version index ecc84fdd..4812aa29 100644 --- a/odelia_image.version +++ b/odelia_image.version @@ -1,2 +1,2 @@ # version of the ODELIA Docker image, read by different scripts -0.9 \ No newline at end of file +1.0 \ No newline at end of file From d4cc4dc39692a51ec668fd05bb05f8fe3b1172c6 Mon Sep 17 00:00:00 2001 From: JieFu Zhu Date: Thu, 10 Jul 2025 10:01:08 +0200 Subject: [PATCH 023/337] feat: add Odelia all sites configuration YAML for server and client setup Signed-off-by: GitHub CI --- .../provision/project_Odelia_allsites.yml | 95 +++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 application/provision/project_Odelia_allsites.yml diff --git a/application/provision/project_Odelia_allsites.yml b/application/provision/project_Odelia_allsites.yml new file 
mode 100644 index 00000000..4e817c29 --- /dev/null +++ b/application/provision/project_Odelia_allsites.yml @@ -0,0 +1,95 @@ +api_version: 3 +name: odelia___REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_STARTUP_KITS___allsites_test +description: Odelia TUD server all collaborators clients on Odelia challenge dataset provision http based yaml file + +participants: + # change example.com to the FQDN of the server + - name: dl3.tud.de + type: server + org: TUD + fed_learn_port: 8002 + admin_port: 8003 + - name: TUD_1 + type: client + org: TUD + - name: TUD_2 + type: client + org: TUD + # Specifying listening_host will enable the creation of one pair of + # certificate/private key for this client, allowing the client to function + # as a server for 3rd-party integration. + # The value must be a hostname that the external trainer can reach via the network. + # listening_host: site-1-lh + - name: MEVIS_1 + type: client + org: MEVIS + - name: MEVIS_2 + type: client + org: MEVIS + - name: UKA_1 + type: client + org: UKA + - name: CAM_1 + type: client + org: Cambridge + - name: VHIO_1 + type: client + org: VHIO + - name: MHA_1 + type: client + org: MHA + - name: RSH_1 + type: client + org: RSH + - name: USZ_1 + type: client + org: USZ + - name: UMCU_1 + type: client + org: UMCU + - name: RUMC_1 + type: client + org: RUMC + - name: jiefu.zhu@tu-dresden.de + type: admin + org: TUD + role: project_admin + +# The same methods in all builders are called in their order defined in builders section +builders: + - path: nvflare.lighter.impl.workspace.WorkspaceBuilder + args: + template_file: master_template.yml + - path: nvflare.lighter.impl.template.TemplateBuilder + - path: nvflare.lighter.impl.static_file.StaticFileBuilder + args: + # config_folder can be set to inform NVIDIA FLARE where to get configuration + config_folder: config + + # scheme for communication driver (currently supporting the default, grpc, only). 
+ scheme: http + + # app_validator is used to verify if uploaded app has proper structures + # if not set, no app_validator is included in fed_server.json + # app_validator: PATH_TO_YOUR_OWN_APP_VALIDATOR + + # when docker_image is set to a docker image name, docker.sh will be generated on server/client/admin + docker_image: jefftud/odelia:__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_STARTUP_KITS__ + + # download_job_url is set to http://download.server.com/ as default in fed_server.json. You can override this + # to different url. + # download_job_url: http://download.server.com/ + + overseer_agent: + path: nvflare.ha.dummy_overseer_agent.DummyOverseerAgent + # if overseer_exists is true, args here are ignored. Provisioning + # tool will fill role, name and other local parameters automatically. + # if overseer_exists is false, args in this section will be used and the sp_end_point + # must match the server defined above in the format of SERVER_NAME:FL_PORT:ADMIN_PORT + # + overseer_exists: false + args: + sp_end_point: dl3.tud.de:8002:8003 + + - path: nvflare.lighter.impl.cert.CertBuilder + - path: nvflare.lighter.impl.signature.SignatureBuilder From 5e11d99aa0323ce1ddafb4674e3e2dfcce7770cc Mon Sep 17 00:00:00 2001 From: GitHub CI Date: Thu, 10 Jul 2025 10:18:21 +0200 Subject: [PATCH 024/337] WIP: remove apt versions for rebuild --- docker_config/Dockerfile_ODELIA | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index c6b4f894..c32087b5 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -12,13 +12,13 @@ ENV PYTHON_VERSION=3.10.14 # Install updates of installed packages RUN apt update -RUN apt install -y apt=2.4.14 apt-utils=2.4.14 libapt-pkg6.0=2.4.14 +RUN apt install -y apt apt-utils libapt-pkg6.0 # Update versions of installed packages -RUN apt install -y base-files=12ubuntu4.7 bash=5.1-6ubuntu1.1 bsdutils=1:2.37.2-4ubuntu3.4 
ca-certificates=20240203~22.04.1 coreutils=8.32-4.1ubuntu1.2 dpkg=1.21.1ubuntu2.3 e2fsprogs=1.46.5-2ubuntu1.2 gpgv=2.2.27-3ubuntu2.3 libblkid1=2.37.2-4ubuntu3.4 libc-bin=2.35-0ubuntu3.10 libc-dev-bin=2.35-0ubuntu3.10 libc6-dev=2.35-0ubuntu3.10 libc6=2.35-0ubuntu3.10 libcap2=1:2.44-1ubuntu0.22.04.2 libcom-err2=1.46.5-2ubuntu1.2 libext2fs2=1.46.5-2ubuntu1.2 libgnutls30=3.7.3-4ubuntu1.6 libgssapi-krb5-2=1.19.2-2ubuntu0.7 libk5crypto3=1.19.2-2ubuntu0.7 libkrb5-3=1.19.2-2ubuntu0.7 libkrb5support0=1.19.2-2ubuntu0.7 libmount1=2.37.2-4ubuntu3.4 libpam-modules-bin=1.4.0-11ubuntu2.6 libpam-modules=1.4.0-11ubuntu2.6 libpam-runtime=1.4.0-11ubuntu2.6 libpam0g=1.4.0-11ubuntu2.6 libseccomp2=2.5.3-2ubuntu3~22.04.1 libsmartcols1=2.37.2-4ubuntu3.4 libss2=1.46.5-2ubuntu1.2 libssl3=3.0.2-0ubuntu1.19 libsystemd0=249.11-0ubuntu3.16 libtasn1-6=4.18.0-4ubuntu0.1 libudev1=249.11-0ubuntu3.16 libuuid1=2.37.2-4ubuntu3.4 linux-libc-dev=5.15.0-143.153 login=1:4.8.1-2ubuntu2.2 logsave=1.46.5-2ubuntu1.2 mount=2.37.2-4ubuntu3.4 openssl=3.0.2-0ubuntu1.19 passwd=1:4.8.1-2ubuntu2.2 util-linux=2.37.2-4ubuntu3.4 +RUN apt install -y base-files bash bsdutils ca-certificates coreutils dpkg e2fsprogs gpgv libblkid1 libc-bin libc-dev-bin libc6-dev libc6 libcap2 libcom-err2 libext2fs2 libgnutls30 libgssapi-krb5-2 libk5crypto3 libkrb5-3 libkrb5support0 libmount1 libpam-modules-bin libpam-modules libpam-runtime libpam0g libseccomp2 libsmartcols1 libss2 libssl3 libsystemd0 libtasn1-6 libudev1 libuuid1 linux-libc-dev login logsave mount openssl passwd util-linux # Install apt-transport-https curl gnupg lsb-release zip and dependencies at defined versions -RUN apt install -y apt-transport-https=2.4.14 curl=7.81.0-1ubuntu1.20 dirmngr=2.2.27-3ubuntu2.3 distro-info-data=0.52ubuntu0.9 gnupg-l10n=2.2.27-3ubuntu2.3 gnupg-utils=2.2.27-3ubuntu2.3 gnupg=2.2.27-3ubuntu2.3 gpg-agent=2.2.27-3ubuntu2.3 gpg-wks-client=2.2.27-3ubuntu2.3 gpg-wks-server=2.2.27-3ubuntu2.3 gpg=2.2.27-3ubuntu2.3 gpgconf=2.2.27-3ubuntu2.3 
gpgsm=2.2.27-3ubuntu2.3 libassuan0=2.5.5-1build1 libbrotli1=1.0.9-2build6 libcurl4=7.81.0-1ubuntu1.20 libexpat1=2.4.7-1ubuntu0.6 libksba8=1.6.0-2ubuntu0.2 libldap-2.5-0=2.5.19+dfsg-0ubuntu0.22.04.1 libldap-common=2.5.19+dfsg-0ubuntu0.22.04.1 libmpdec3=2.5.1-2build2 libnghttp2-14=1.43.0-1ubuntu0.2 libnpth0=1.6-3build2 libpsl5=0.21.0-1.2build2 libpython3-stdlib=3.10.6-1~22.04.1 libpython3.10-minimal=3.10.12-1~22.04.10 libpython3.10-stdlib=3.10.12-1~22.04.10 libreadline8=8.1.2-1 librtmp1=2.4+20151223.gitfa8646d.1-2build4 libsasl2-2=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules-db=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules=2.1.27+dfsg2-3ubuntu1.2 libsqlite3-0=3.37.2-2ubuntu0.4 libssh-4=0.9.6-2ubuntu0.22.04.3 lsb-release=11.1.0ubuntu4 media-types=7.0.0 pinentry-curses=1.1.1-1build2 publicsuffix=20211207.1025-1 python3-minimal=3.10.6-1~22.04.1 python3.10-minimal=3.10.12-1~22.04.10 python3.10=3.10.12-1~22.04.10 python3=3.10.6-1~22.04.1 readline-common=8.1.2-1 unzip=6.0-26ubuntu3.2 zip=3.0-12build2 +RUN apt install -y apt-transport-https curl dirmngr distro-info-data gnupg-l10n gnupg-utils gnupg gpg-agent gpg-wks-client gpg-wks-server gpg gpgconf gpgsm libassuan0 libbrotli1 libcurl4 libexpat1 libksba8 libldap-2.5-0 libldap-common libmpdec3 libnghttp2-14 libnpth0 libpsl5 libpython3-stdlib libpython3.10-minimal libpython3.10-stdlib libreadline8 librtmp1 libsasl2-2 libsasl2-modules-db libsasl2-modules libsqlite3-0 libssh-4 lsb-release media-types pinentry-curses publicsuffix python3-minimal python3.10-minimal python3.10 python3 readline-common unzip zip # Prepare Docker installation RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc \ @@ -27,7 +27,7 @@ RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings && apt update # Install docker-ce docker-ce-cli containerd.io and dependencies at fixed versions -RUN apt install -y apparmor=3.0.4-2ubuntu2.4 containerd.io=1.7.27-1 dbus-user-session=1.12.20-2ubuntu4.1 
dbus=1.12.20-2ubuntu4.1 dmsetup=2:1.02.175-2.1ubuntu5 docker-buildx-plugin=0.25.0-1~ubuntu.22.04~jammy docker-ce-cli=5:28.3.1-1~ubuntu.22.04~jammy docker-ce-rootless-extras=5:28.3.1-1~ubuntu.22.04~jammy docker-ce=5:28.3.1-1~ubuntu.22.04~jammy docker-compose-plugin=2.38.1-1~ubuntu.22.04~jammy gir1.2-glib-2.0=1.72.0-1 git-man=1:2.34.1-1ubuntu1.12 git=1:2.34.1-1ubuntu1.12 iptables=1.8.7-1ubuntu5.2 less=590-1ubuntu0.22.04.3 libapparmor1=3.0.4-2ubuntu2.4 libargon2-1=0~20171227-0.3 libbsd0=0.11.5-1 libcbor0.8=0.8.0-2ubuntu1 libcryptsetup12=2:2.4.3-1ubuntu1.3 libcurl3-gnutls=7.81.0-1ubuntu1.20 libdbus-1-3=1.12.20-2ubuntu4.1 libdevmapper1.02.1=2:1.02.175-2.1ubuntu5 libedit2=3.1-20210910-1build1 liberror-perl=0.17029-1 libfido2-1=1.10.0-1 libgdbm-compat4=1.23-1 libgdbm6=1.23-1 libgirepository-1.0-1=1.72.0-1 libglib2.0-0=2.72.4-0ubuntu2.5 libglib2.0-data=2.72.4-0ubuntu2.5 libicu70=70.1-2 libip4tc2=1.8.7-1ubuntu5.2 libip6tc2=1.8.7-1ubuntu5.2 libjson-c5=0.15-3~ubuntu1.22.04.2 libkmod2=29-1ubuntu1 libltdl7=2.4.6-15build2 libmd0=1.0.4-1build1 libmnl0=1.0.4-3build2 libnetfilter-conntrack3=1.0.9-1 libnfnetlink0=1.0.1-3build3 libnftnl11=1.2.1-1build1 libnss-systemd=249.11-0ubuntu3.16 libpam-systemd=249.11-0ubuntu3.16 libperl5.34=5.34.0-3ubuntu1.4 libslirp0=4.6.1-1build1 libx11-6=2:1.7.5-1ubuntu0.3 libx11-data=2:1.7.5-1ubuntu0.3 libxau6=1:1.0.9-1build5 libxcb1=1.14-3ubuntu3 libxdmcp6=1:1.1.3-0ubuntu5 libxext6=2:1.3.4-1build1 libxml2=2.9.13+dfsg-1ubuntu0.7 libxmuu1=2:1.1.3-3 libxtables12=1.8.7-1ubuntu5.2 netbase=6.3 networkd-dispatcher=2.1-2ubuntu0.22.04.2 openssh-client=1:8.9p1-3ubuntu0.13 patch=2.7.6-7build2 perl-base=5.34.0-3ubuntu1.4 perl-modules-5.34=5.34.0-3ubuntu1.4 perl=5.34.0-3ubuntu1.4 pigz=2.6-1 python3-dbus=1.2.18-3build1 python3-gi=3.42.1-0ubuntu1 shared-mime-info=2.1-2 slirp4netns=1.0.1-2 systemd-sysv=249.11-0ubuntu3.16 systemd-timesyncd=249.11-0ubuntu3.16 systemd=249.11-0ubuntu3.16 xauth=1:1.1-1build2 xdg-user-dirs=0.17-2ubuntu4 xz-utils=5.2.5-2ubuntu1 +RUN apt install 
-y apparmor containerd.io dbus-user-session dbus dmsetup docker-buildx-plugin docker-ce-cli docker-ce-rootless-extras docker-ce docker-compose-plugin gir1.2-glib-2.0 git-man git iptables less libapparmor1 libargon2-1 libbsd0 libcbor0.8 libcryptsetup12 libcurl3-gnutls libdbus-1-3 libdevmapper1.02.1 libedit2 liberror-perl libfido2-1 libgdbm-compat4 libgdbm6 libgirepository-1.0-1 libglib2.0-0 libglib2.0-data libicu70 libip4tc2 libip6tc2 libjson-c5 libkmod2 libltdl7 libmd0 libmnl0 libnetfilter-conntrack3 libnfnetlink0 libnftnl11 libnss-systemd libpam-systemd libperl5.34 libslirp0 libx11-6 libx11-data libxau6 libxcb1 libxdmcp6 libxext6 libxml2 libxmuu1 libxtables12 netbase networkd-dispatcher openssh-client patch perl-base perl-modules-5.34 perl pigz python3-dbus python3-gi shared-mime-info slirp4netns systemd-sysv systemd-timesyncd systemd xauth xdg-user-dirs xz-utils # Clean up apt cache RUN rm -rf /var/lib/apt/lists/* From 1c17c462f97e86f127f226a8bc8a9c9737adbe91 Mon Sep 17 00:00:00 2001 From: GitHub CI Date: Thu, 10 Jul 2025 10:22:38 +0200 Subject: [PATCH 025/337] chore: pin APT package versions in Dockerfile for consistency Signed-off-by: GitHub CI --- docker_config/Dockerfile_ODELIA | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index c32087b5..e6068793 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -12,13 +12,13 @@ ENV PYTHON_VERSION=3.10.14 # Install updates of installed packages RUN apt update -RUN apt install -y apt apt-utils libapt-pkg6.0 +RUN apt install -y apt=2.4.14 apt-utils=2.4.14 libapt-pkg6.0=2.4.14 # Update versions of installed packages -RUN apt install -y base-files bash bsdutils ca-certificates coreutils dpkg e2fsprogs gpgv libblkid1 libc-bin libc-dev-bin libc6-dev libc6 libcap2 libcom-err2 libext2fs2 libgnutls30 libgssapi-krb5-2 libk5crypto3 libkrb5-3 libkrb5support0 libmount1 libpam-modules-bin libpam-modules 
libpam-runtime libpam0g libseccomp2 libsmartcols1 libss2 libssl3 libsystemd0 libtasn1-6 libudev1 libuuid1 linux-libc-dev login logsave mount openssl passwd util-linux +RUN apt install -y base-files=12ubuntu4.7 bash=5.1-6ubuntu1.1 bsdutils=1:2.37.2-4ubuntu3.4 ca-certificates=20240203~22.04.1 coreutils=8.32-4.1ubuntu1.2 dpkg=1.21.1ubuntu2.3 e2fsprogs=1.46.5-2ubuntu1.2 gpgv=2.2.27-3ubuntu2.4 libblkid1=2.37.2-4ubuntu3.4 libc-bin=2.35-0ubuntu3.10 libc-dev-bin=2.35-0ubuntu3.10 libc6-dev=2.35-0ubuntu3.10 libc6=2.35-0ubuntu3.10 libcap2=1:2.44-1ubuntu0.22.04.2 libcom-err2=1.46.5-2ubuntu1.2 libext2fs2=1.46.5-2ubuntu1.2 libgnutls30=3.7.3-4ubuntu1.6 libgssapi-krb5-2=1.19.2-2ubuntu0.7 libk5crypto3=1.19.2-2ubuntu0.7 libkrb5-3=1.19.2-2ubuntu0.7 libkrb5support0=1.19.2-2ubuntu0.7 libmount1=2.37.2-4ubuntu3.4 libpam-modules-bin=1.4.0-11ubuntu2.6 libpam-modules=1.4.0-11ubuntu2.6 libpam-runtime=1.4.0-11ubuntu2.6 libpam0g=1.4.0-11ubuntu2.6 libseccomp2=2.5.3-2ubuntu3~22.04.1 libsmartcols1=2.37.2-4ubuntu3.4 libss2=1.46.5-2ubuntu1.2 libssl3=3.0.2-0ubuntu1.19 libsystemd0=249.11-0ubuntu3.16 libtasn1-6=4.18.0-4ubuntu0.1 libudev1=249.11-0ubuntu3.16 libuuid1=2.37.2-4ubuntu3.4 linux-libc-dev=5.15.0-143.153 login=1:4.8.1-2ubuntu2.2 logsave=1.46.5-2ubuntu1.2 mount=2.37.2-4ubuntu3.4 openssl=3.0.2-0ubuntu1.19 passwd=1:4.8.1-2ubuntu2.2 util-linux=2.37.2-4ubuntu3.4 # Install apt-transport-https curl gnupg lsb-release zip and dependencies at defined versions -RUN apt install -y apt-transport-https curl dirmngr distro-info-data gnupg-l10n gnupg-utils gnupg gpg-agent gpg-wks-client gpg-wks-server gpg gpgconf gpgsm libassuan0 libbrotli1 libcurl4 libexpat1 libksba8 libldap-2.5-0 libldap-common libmpdec3 libnghttp2-14 libnpth0 libpsl5 libpython3-stdlib libpython3.10-minimal libpython3.10-stdlib libreadline8 librtmp1 libsasl2-2 libsasl2-modules-db libsasl2-modules libsqlite3-0 libssh-4 lsb-release media-types pinentry-curses publicsuffix python3-minimal python3.10-minimal python3.10 python3 readline-common 
unzip zip +RUN apt install -y apt-transport-https=2.4.14 curl=7.81.0-1ubuntu1.20 dirmngr=2.2.27-3ubuntu2.4 distro-info-data=0.52ubuntu0.9 gnupg-l10n=2.2.27-3ubuntu2.4 gnupg-utils=2.2.27-3ubuntu2.4 gnupg=2.2.27-3ubuntu2.4 gpg-agent=2.2.27-3ubuntu2.4 gpg-wks-client=2.2.27-3ubuntu2.4 gpg-wks-server=2.2.27-3ubuntu2.4 gpg=2.2.27-3ubuntu2.4 gpgconf=2.2.27-3ubuntu2.4 gpgsm=2.2.27-3ubuntu2.4 libassuan0=2.5.5-1build1 libbrotli1=1.0.9-2build6 libcurl4=7.81.0-1ubuntu1.20 libexpat1=2.4.7-1ubuntu0.6 libksba8=1.6.0-2ubuntu0.2 libldap-2.5-0=2.5.19+dfsg-0ubuntu0.22.04.1 libldap-common=2.5.19+dfsg-0ubuntu0.22.04.1 libmpdec3=2.5.1-2build2 libnghttp2-14=1.43.0-1ubuntu0.2 libnpth0=1.6-3build2 libpsl5=0.21.0-1.2build2 libpython3-stdlib=3.10.6-1~22.04.1 libpython3.10-minimal=3.10.12-1~22.04.10 libpython3.10-stdlib=3.10.12-1~22.04.10 libreadline8=8.1.2-1 librtmp1=2.4+20151223.gitfa8646d.1-2build4 libsasl2-2=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules-db=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules=2.1.27+dfsg2-3ubuntu1.2 libsqlite3-0=3.37.2-2ubuntu0.4 libssh-4=0.9.6-2ubuntu0.22.04.4 lsb-release=11.1.0ubuntu4 media-types=7.0.0 pinentry-curses=1.1.1-1build2 publicsuffix=20211207.1025-1 python3-minimal=3.10.6-1~22.04.1 python3.10-minimal=3.10.12-1~22.04.10 python3.10=3.10.12-1~22.04.10 python3=3.10.6-1~22.04.1 readline-common=8.1.2-1 unzip=6.0-26ubuntu3.2 zip=3.0-12build2 # Prepare Docker installation RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc \ @@ -27,7 +27,7 @@ RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings && apt update # Install docker-ce docker-ce-cli containerd.io and dependencies at fixed versions -RUN apt install -y apparmor containerd.io dbus-user-session dbus dmsetup docker-buildx-plugin docker-ce-cli docker-ce-rootless-extras docker-ce docker-compose-plugin gir1.2-glib-2.0 git-man git iptables less libapparmor1 libargon2-1 libbsd0 libcbor0.8 libcryptsetup12 libcurl3-gnutls libdbus-1-3 libdevmapper1.02.1 
libedit2 liberror-perl libfido2-1 libgdbm-compat4 libgdbm6 libgirepository-1.0-1 libglib2.0-0 libglib2.0-data libicu70 libip4tc2 libip6tc2 libjson-c5 libkmod2 libltdl7 libmd0 libmnl0 libnetfilter-conntrack3 libnfnetlink0 libnftnl11 libnss-systemd libpam-systemd libperl5.34 libslirp0 libx11-6 libx11-data libxau6 libxcb1 libxdmcp6 libxext6 libxml2 libxmuu1 libxtables12 netbase networkd-dispatcher openssh-client patch perl-base perl-modules-5.34 perl pigz python3-dbus python3-gi shared-mime-info slirp4netns systemd-sysv systemd-timesyncd systemd xauth xdg-user-dirs xz-utils +RUN apt install -y apparmor=3.0.4-2ubuntu2.4 containerd.io=1.7.27-1 dbus-user-session=1.12.20-2ubuntu4.1 dbus=1.12.20-2ubuntu4.1 dmsetup=2:1.02.175-2.1ubuntu5 docker-buildx-plugin docker-ce-cli docker-ce-rootless-extras docker-ce docker-compose-plugin gir1.2-glib-2.0=1.72.0-1 git-man=1:2.34.1-1ubuntu1.14 git=1:2.34.1-1ubuntu1.14 iptables=1.8.7-1ubuntu5.2 less=590-1ubuntu0.22.04.3 libapparmor1=3.0.4-2ubuntu2.4 libargon2-1=0~20171227-0.3 libbsd0=0.11.5-1 libcbor0.8=0.8.0-2ubuntu1 libcryptsetup12=2:2.4.3-1ubuntu1.3 libcurl3-gnutls=7.81.0-1ubuntu1.20 libdbus-1-3=1.12.20-2ubuntu4.1 libdevmapper1.02.1=2:1.02.175-2.1ubuntu5 libedit2=3.1-20210910-1build1 liberror-perl=0.17029-1 libfido2-1=1.10.0-1 libgdbm-compat4=1.23-1 libgdbm6=1.23-1 libgirepository-1.0-1=1.72.0-1 libglib2.0-0=2.72.4-0ubuntu2.5 libglib2.0-data=2.72.4-0ubuntu2.5 libicu70=70.1-2 libip4tc2=1.8.7-1ubuntu5.2 libip6tc2=1.8.7-1ubuntu5.2 libjson-c5=0.15-3~ubuntu1.22.04.2 libkmod2=29-1ubuntu1 libltdl7=2.4.6-15build2 libmd0=1.0.4-1build1 libmnl0=1.0.4-3build2 libnetfilter-conntrack3=1.0.9-1 libnfnetlink0=1.0.1-3build3 libnftnl11=1.2.1-1build1 libnss-systemd=249.11-0ubuntu3.16 libpam-systemd=249.11-0ubuntu3.16 libperl5.34=5.34.0-3ubuntu1.4 libslirp0=4.6.1-1build1 libx11-6=2:1.7.5-1ubuntu0.3 libx11-data=2:1.7.5-1ubuntu0.3 libxau6=1:1.0.9-1build5 libxcb1=1.14-3ubuntu3 libxdmcp6=1:1.1.3-0ubuntu5 libxext6=2:1.3.4-1build1 libxml2=2.9.13+dfsg-1ubuntu0.7 
libxmuu1=2:1.1.3-3 libxtables12=1.8.7-1ubuntu5.2 netbase=6.3 networkd-dispatcher=2.1-2ubuntu0.22.04.2 openssh-client=1:8.9p1-3ubuntu0.13 patch=2.7.6-7build2 perl-base=5.34.0-3ubuntu1.4 perl-modules-5.34=5.34.0-3ubuntu1.4 perl=5.34.0-3ubuntu1.4 pigz=2.6-1 python3-dbus=1.2.18-3build1 python3-gi=3.42.1-0ubuntu1 shared-mime-info=2.1-2 slirp4netns=1.0.1-2 systemd-sysv=249.11-0ubuntu3.16 systemd-timesyncd=249.11-0ubuntu3.16 systemd=249.11-0ubuntu3.16 xauth=1:1.1-1build2 xdg-user-dirs=0.17-2ubuntu4 xz-utils=5.2.5-2ubuntu1 # Clean up apt cache RUN rm -rf /var/lib/apt/lists/* @@ -68,4 +68,4 @@ RUN rm -rf /workspace/controller # Copy the source code for local training and deploying to the swarm COPY . /MediSwarm RUN mkdir -p /fl_admin/transfer -RUN ln -s /MediSwarm /fl_admin/transfer/MediSwarm \ No newline at end of file +RUN ln -s /MediSwarm /fl_admin/transfer/MediSwarm From 5c1d0c0816fad2cf406fce6cf7422dd09d91c1ce Mon Sep 17 00:00:00 2001 From: GitHub CI Date: Thu, 10 Jul 2025 10:34:07 +0200 Subject: [PATCH 026/337] chore: enhance APT update workflow with debug logging and final Dockerfile diff Signed-off-by: GitHub CI --- .github/workflows/update-apt-versions.yml | 3 +++ scripts/ci/update_apt_versions.sh | 15 ++++++++++----- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/.github/workflows/update-apt-versions.yml b/.github/workflows/update-apt-versions.yml index 6074ff5a..da9ad308 100644 --- a/.github/workflows/update-apt-versions.yml +++ b/.github/workflows/update-apt-versions.yml @@ -57,3 +57,6 @@ jobs: if: env.NO_CHANGES == 'false' run: | echo "Created PR: ${{ steps.cpr.outputs.pull-request-url }}" + + - name: Show final Dockerfile diff + run: cat docker_config/Dockerfile_ODELIA diff --git a/scripts/ci/update_apt_versions.sh b/scripts/ci/update_apt_versions.sh index 6dfc7b6c..d86bbc6d 100755 --- a/scripts/ci/update_apt_versions.sh +++ b/scripts/ci/update_apt_versions.sh @@ -14,13 +14,18 @@ git config user.name "GitHub CI" git commit "$DOCKERFILE_PATH" -m 
"WIP: remove apt versions for rebuild" || echo "[INFO] No version pin removal change to commit." echo "[INFO] Rebuilding Docker image and capturing logs..." -./buildDockerImageAndStartupKits.sh -p "$PROJECT_YML" 2>&1 | tee "$LOG_PATH" -exit_code=${PIPESTATUS[0]} -if [ "$exit_code" -ne 0 ]; then - echo "Build failed with exit code $exit_code" - exit "$exit_code" +if ! ./buildDockerImageAndStartupKits.sh -p "$PROJECT_YML" > "$LOG_PATH" 2>&1; then + echo "Build failed. Output:" + cat "$LOG_PATH" + exit 1 fi +echo "[DEBUG] First 20 lines of build log:" +head -n 20 "$LOG_PATH" + +echo "[DEBUG] Checking for apt install commands:" +grep "apt install" "$LOG_PATH" || echo "[WARN] No apt install command found in log!" + echo "[INFO] Re-adding updated APT version pins to Dockerfile..." scripts/dev_utils/dockerfile_update_addAptVersionNumbers.py "$DOCKERFILE_PATH" "$LOG_PATH" rm "$LOG_PATH" From 8016685ba39f01f1dd1c68bc5cbf3781b4dde985 Mon Sep 17 00:00:00 2001 From: JieFu Zhu Date: Thu, 10 Jul 2025 10:42:05 +0200 Subject: [PATCH 027/337] chore: update APT version check to use dpkg for improved accuracy Signed-off-by: GitHub CI --- scripts/ci/update_apt_versions.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ci/update_apt_versions.sh b/scripts/ci/update_apt_versions.sh index d86bbc6d..fab30c03 100755 --- a/scripts/ci/update_apt_versions.sh +++ b/scripts/ci/update_apt_versions.sh @@ -36,7 +36,7 @@ while IFS= read -r match; do pkg="$(echo "$match" | cut -d= -f1)" ver="$(echo "$match" | cut -d= -f2)" echo -n "Checking $pkg=$ver... " - if ! apt-cache madison "$pkg" | grep -q "$ver"; then + if ! 
dpkg -l "$pkg" | grep -q "$ver"; then echo "NOT FOUND – removing pin" sed -i "s|\b$pkg=$ver\b|$pkg|" "$DOCKERFILE_PATH" has_invalid_versions=1 From 2ca49b332ab011906aa75d436917be84fde2c6c0 Mon Sep 17 00:00:00 2001 From: GitHub CI Date: Thu, 10 Jul 2025 10:42:32 +0200 Subject: [PATCH 028/337] WIP: remove apt versions for rebuild --- docker_config/Dockerfile_ODELIA | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index e6068793..c32087b5 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -12,13 +12,13 @@ ENV PYTHON_VERSION=3.10.14 # Install updates of installed packages RUN apt update -RUN apt install -y apt=2.4.14 apt-utils=2.4.14 libapt-pkg6.0=2.4.14 +RUN apt install -y apt apt-utils libapt-pkg6.0 # Update versions of installed packages -RUN apt install -y base-files=12ubuntu4.7 bash=5.1-6ubuntu1.1 bsdutils=1:2.37.2-4ubuntu3.4 ca-certificates=20240203~22.04.1 coreutils=8.32-4.1ubuntu1.2 dpkg=1.21.1ubuntu2.3 e2fsprogs=1.46.5-2ubuntu1.2 gpgv=2.2.27-3ubuntu2.4 libblkid1=2.37.2-4ubuntu3.4 libc-bin=2.35-0ubuntu3.10 libc-dev-bin=2.35-0ubuntu3.10 libc6-dev=2.35-0ubuntu3.10 libc6=2.35-0ubuntu3.10 libcap2=1:2.44-1ubuntu0.22.04.2 libcom-err2=1.46.5-2ubuntu1.2 libext2fs2=1.46.5-2ubuntu1.2 libgnutls30=3.7.3-4ubuntu1.6 libgssapi-krb5-2=1.19.2-2ubuntu0.7 libk5crypto3=1.19.2-2ubuntu0.7 libkrb5-3=1.19.2-2ubuntu0.7 libkrb5support0=1.19.2-2ubuntu0.7 libmount1=2.37.2-4ubuntu3.4 libpam-modules-bin=1.4.0-11ubuntu2.6 libpam-modules=1.4.0-11ubuntu2.6 libpam-runtime=1.4.0-11ubuntu2.6 libpam0g=1.4.0-11ubuntu2.6 libseccomp2=2.5.3-2ubuntu3~22.04.1 libsmartcols1=2.37.2-4ubuntu3.4 libss2=1.46.5-2ubuntu1.2 libssl3=3.0.2-0ubuntu1.19 libsystemd0=249.11-0ubuntu3.16 libtasn1-6=4.18.0-4ubuntu0.1 libudev1=249.11-0ubuntu3.16 libuuid1=2.37.2-4ubuntu3.4 linux-libc-dev=5.15.0-143.153 login=1:4.8.1-2ubuntu2.2 logsave=1.46.5-2ubuntu1.2 mount=2.37.2-4ubuntu3.4 
openssl=3.0.2-0ubuntu1.19 passwd=1:4.8.1-2ubuntu2.2 util-linux=2.37.2-4ubuntu3.4 +RUN apt install -y base-files bash bsdutils ca-certificates coreutils dpkg e2fsprogs gpgv libblkid1 libc-bin libc-dev-bin libc6-dev libc6 libcap2 libcom-err2 libext2fs2 libgnutls30 libgssapi-krb5-2 libk5crypto3 libkrb5-3 libkrb5support0 libmount1 libpam-modules-bin libpam-modules libpam-runtime libpam0g libseccomp2 libsmartcols1 libss2 libssl3 libsystemd0 libtasn1-6 libudev1 libuuid1 linux-libc-dev login logsave mount openssl passwd util-linux # Install apt-transport-https curl gnupg lsb-release zip and dependencies at defined versions -RUN apt install -y apt-transport-https=2.4.14 curl=7.81.0-1ubuntu1.20 dirmngr=2.2.27-3ubuntu2.4 distro-info-data=0.52ubuntu0.9 gnupg-l10n=2.2.27-3ubuntu2.4 gnupg-utils=2.2.27-3ubuntu2.4 gnupg=2.2.27-3ubuntu2.4 gpg-agent=2.2.27-3ubuntu2.4 gpg-wks-client=2.2.27-3ubuntu2.4 gpg-wks-server=2.2.27-3ubuntu2.4 gpg=2.2.27-3ubuntu2.4 gpgconf=2.2.27-3ubuntu2.4 gpgsm=2.2.27-3ubuntu2.4 libassuan0=2.5.5-1build1 libbrotli1=1.0.9-2build6 libcurl4=7.81.0-1ubuntu1.20 libexpat1=2.4.7-1ubuntu0.6 libksba8=1.6.0-2ubuntu0.2 libldap-2.5-0=2.5.19+dfsg-0ubuntu0.22.04.1 libldap-common=2.5.19+dfsg-0ubuntu0.22.04.1 libmpdec3=2.5.1-2build2 libnghttp2-14=1.43.0-1ubuntu0.2 libnpth0=1.6-3build2 libpsl5=0.21.0-1.2build2 libpython3-stdlib=3.10.6-1~22.04.1 libpython3.10-minimal=3.10.12-1~22.04.10 libpython3.10-stdlib=3.10.12-1~22.04.10 libreadline8=8.1.2-1 librtmp1=2.4+20151223.gitfa8646d.1-2build4 libsasl2-2=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules-db=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules=2.1.27+dfsg2-3ubuntu1.2 libsqlite3-0=3.37.2-2ubuntu0.4 libssh-4=0.9.6-2ubuntu0.22.04.4 lsb-release=11.1.0ubuntu4 media-types=7.0.0 pinentry-curses=1.1.1-1build2 publicsuffix=20211207.1025-1 python3-minimal=3.10.6-1~22.04.1 python3.10-minimal=3.10.12-1~22.04.10 python3.10=3.10.12-1~22.04.10 python3=3.10.6-1~22.04.1 readline-common=8.1.2-1 unzip=6.0-26ubuntu3.2 zip=3.0-12build2 +RUN apt install -y 
apt-transport-https curl dirmngr distro-info-data gnupg-l10n gnupg-utils gnupg gpg-agent gpg-wks-client gpg-wks-server gpg gpgconf gpgsm libassuan0 libbrotli1 libcurl4 libexpat1 libksba8 libldap-2.5-0 libldap-common libmpdec3 libnghttp2-14 libnpth0 libpsl5 libpython3-stdlib libpython3.10-minimal libpython3.10-stdlib libreadline8 librtmp1 libsasl2-2 libsasl2-modules-db libsasl2-modules libsqlite3-0 libssh-4 lsb-release media-types pinentry-curses publicsuffix python3-minimal python3.10-minimal python3.10 python3 readline-common unzip zip # Prepare Docker installation RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc \ @@ -27,7 +27,7 @@ RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings && apt update # Install docker-ce docker-ce-cli containerd.io and dependencies at fixed versions -RUN apt install -y apparmor=3.0.4-2ubuntu2.4 containerd.io=1.7.27-1 dbus-user-session=1.12.20-2ubuntu4.1 dbus=1.12.20-2ubuntu4.1 dmsetup=2:1.02.175-2.1ubuntu5 docker-buildx-plugin docker-ce-cli docker-ce-rootless-extras docker-ce docker-compose-plugin gir1.2-glib-2.0=1.72.0-1 git-man=1:2.34.1-1ubuntu1.14 git=1:2.34.1-1ubuntu1.14 iptables=1.8.7-1ubuntu5.2 less=590-1ubuntu0.22.04.3 libapparmor1=3.0.4-2ubuntu2.4 libargon2-1=0~20171227-0.3 libbsd0=0.11.5-1 libcbor0.8=0.8.0-2ubuntu1 libcryptsetup12=2:2.4.3-1ubuntu1.3 libcurl3-gnutls=7.81.0-1ubuntu1.20 libdbus-1-3=1.12.20-2ubuntu4.1 libdevmapper1.02.1=2:1.02.175-2.1ubuntu5 libedit2=3.1-20210910-1build1 liberror-perl=0.17029-1 libfido2-1=1.10.0-1 libgdbm-compat4=1.23-1 libgdbm6=1.23-1 libgirepository-1.0-1=1.72.0-1 libglib2.0-0=2.72.4-0ubuntu2.5 libglib2.0-data=2.72.4-0ubuntu2.5 libicu70=70.1-2 libip4tc2=1.8.7-1ubuntu5.2 libip6tc2=1.8.7-1ubuntu5.2 libjson-c5=0.15-3~ubuntu1.22.04.2 libkmod2=29-1ubuntu1 libltdl7=2.4.6-15build2 libmd0=1.0.4-1build1 libmnl0=1.0.4-3build2 libnetfilter-conntrack3=1.0.9-1 libnfnetlink0=1.0.1-3build3 libnftnl11=1.2.1-1build1 
libnss-systemd=249.11-0ubuntu3.16 libpam-systemd=249.11-0ubuntu3.16 libperl5.34=5.34.0-3ubuntu1.4 libslirp0=4.6.1-1build1 libx11-6=2:1.7.5-1ubuntu0.3 libx11-data=2:1.7.5-1ubuntu0.3 libxau6=1:1.0.9-1build5 libxcb1=1.14-3ubuntu3 libxdmcp6=1:1.1.3-0ubuntu5 libxext6=2:1.3.4-1build1 libxml2=2.9.13+dfsg-1ubuntu0.7 libxmuu1=2:1.1.3-3 libxtables12=1.8.7-1ubuntu5.2 netbase=6.3 networkd-dispatcher=2.1-2ubuntu0.22.04.2 openssh-client=1:8.9p1-3ubuntu0.13 patch=2.7.6-7build2 perl-base=5.34.0-3ubuntu1.4 perl-modules-5.34=5.34.0-3ubuntu1.4 perl=5.34.0-3ubuntu1.4 pigz=2.6-1 python3-dbus=1.2.18-3build1 python3-gi=3.42.1-0ubuntu1 shared-mime-info=2.1-2 slirp4netns=1.0.1-2 systemd-sysv=249.11-0ubuntu3.16 systemd-timesyncd=249.11-0ubuntu3.16 systemd=249.11-0ubuntu3.16 xauth=1:1.1-1build2 xdg-user-dirs=0.17-2ubuntu4 xz-utils=5.2.5-2ubuntu1 +RUN apt install -y apparmor containerd.io dbus-user-session dbus dmsetup docker-buildx-plugin docker-ce-cli docker-ce-rootless-extras docker-ce docker-compose-plugin gir1.2-glib-2.0 git-man git iptables less libapparmor1 libargon2-1 libbsd0 libcbor0.8 libcryptsetup12 libcurl3-gnutls libdbus-1-3 libdevmapper1.02.1 libedit2 liberror-perl libfido2-1 libgdbm-compat4 libgdbm6 libgirepository-1.0-1 libglib2.0-0 libglib2.0-data libicu70 libip4tc2 libip6tc2 libjson-c5 libkmod2 libltdl7 libmd0 libmnl0 libnetfilter-conntrack3 libnfnetlink0 libnftnl11 libnss-systemd libpam-systemd libperl5.34 libslirp0 libx11-6 libx11-data libxau6 libxcb1 libxdmcp6 libxext6 libxml2 libxmuu1 libxtables12 netbase networkd-dispatcher openssh-client patch perl-base perl-modules-5.34 perl pigz python3-dbus python3-gi shared-mime-info slirp4netns systemd-sysv systemd-timesyncd systemd xauth xdg-user-dirs xz-utils # Clean up apt cache RUN rm -rf /var/lib/apt/lists/* @@ -68,4 +68,4 @@ RUN rm -rf /workspace/controller # Copy the source code for local training and deploying to the swarm COPY . 
/MediSwarm RUN mkdir -p /fl_admin/transfer -RUN ln -s /MediSwarm /fl_admin/transfer/MediSwarm +RUN ln -s /MediSwarm /fl_admin/transfer/MediSwarm \ No newline at end of file From 7a1ff1aec648c3605c13c98315c5e4b81126b4c6 Mon Sep 17 00:00:00 2001 From: GitHub CI Date: Thu, 10 Jul 2025 10:50:21 +0200 Subject: [PATCH 029/337] chore: pin APT package versions in Dockerfile and update version check in script Signed-off-by: GitHub CI --- docker_config/Dockerfile_ODELIA | 8 ++++---- scripts/ci/update_apt_versions.sh | 3 ++- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index c32087b5..7fc502a1 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -15,10 +15,10 @@ RUN apt update RUN apt install -y apt apt-utils libapt-pkg6.0 # Update versions of installed packages -RUN apt install -y base-files bash bsdutils ca-certificates coreutils dpkg e2fsprogs gpgv libblkid1 libc-bin libc-dev-bin libc6-dev libc6 libcap2 libcom-err2 libext2fs2 libgnutls30 libgssapi-krb5-2 libk5crypto3 libkrb5-3 libkrb5support0 libmount1 libpam-modules-bin libpam-modules libpam-runtime libpam0g libseccomp2 libsmartcols1 libss2 libssl3 libsystemd0 libtasn1-6 libudev1 libuuid1 linux-libc-dev login logsave mount openssl passwd util-linux +RUN apt install -y base-files=12ubuntu4.7 bash=5.1-6ubuntu1.1 bsdutils=1:2.37.2-4ubuntu3.4 ca-certificates=20240203~22.04.1 coreutils=8.32-4.1ubuntu1.2 dpkg=1.21.1ubuntu2.3 e2fsprogs=1.46.5-2ubuntu1.2 gpgv libblkid1=2.37.2-4ubuntu3.4 libc-bin libc-dev-bin libc6-dev libc6 libcap2=1:2.44-1ubuntu0.22.04.2 libcom-err2=1.46.5-2ubuntu1.2 libext2fs2=1.46.5-2ubuntu1.2 libgnutls30=3.7.3-4ubuntu1.6 libgssapi-krb5-2 libk5crypto3 libkrb5-3 libkrb5support0 libmount1=2.37.2-4ubuntu3.4 libpam-modules-bin libpam-modules libpam-runtime libpam0g libseccomp2=2.5.3-2ubuntu3~22.04.1 libsmartcols1=2.37.2-4ubuntu3.4 libss2=1.46.5-2ubuntu1.2 libssl3=3.0.2-0ubuntu1.19 libsystemd0 
libtasn1-6=4.18.0-4ubuntu0.1 libudev1 libuuid1=2.37.2-4ubuntu3.4 linux-libc-dev login=1:4.8.1-2ubuntu2.2 logsave=1.46.5-2ubuntu1.2 mount=2.37.2-4ubuntu3.4 openssl=3.0.2-0ubuntu1.19 passwd=1:4.8.1-2ubuntu2.2 util-linux=2.37.2-4ubuntu3.4 # Install apt-transport-https curl gnupg lsb-release zip and dependencies at defined versions -RUN apt install -y apt-transport-https curl dirmngr distro-info-data gnupg-l10n gnupg-utils gnupg gpg-agent gpg-wks-client gpg-wks-server gpg gpgconf gpgsm libassuan0 libbrotli1 libcurl4 libexpat1 libksba8 libldap-2.5-0 libldap-common libmpdec3 libnghttp2-14 libnpth0 libpsl5 libpython3-stdlib libpython3.10-minimal libpython3.10-stdlib libreadline8 librtmp1 libsasl2-2 libsasl2-modules-db libsasl2-modules libsqlite3-0 libssh-4 lsb-release media-types pinentry-curses publicsuffix python3-minimal python3.10-minimal python3.10 python3 readline-common unzip zip +RUN apt install -y apt-transport-https curl=7.81.0-1ubuntu1.20 dirmngr distro-info-data=0.52ubuntu0.9 gnupg-l10n gnupg-utils gnupg gpg-agent gpg-wks-client gpg-wks-server gpg gpgconf gpgsm libassuan0=2.5.5-1build1 libbrotli1=1.0.9-2build6 libcurl4=7.81.0-1ubuntu1.20 libexpat1=2.4.7-1ubuntu0.6 libksba8=1.6.0-2ubuntu0.2 libldap-2.5-0=2.5.19+dfsg-0ubuntu0.22.04.1 libldap-common=2.5.19+dfsg-0ubuntu0.22.04.1 libmpdec3=2.5.1-2build2 libnghttp2-14=1.43.0-1ubuntu0.2 libnpth0=1.6-3build2 libpsl5=0.21.0-1.2build2 libpython3-stdlib=3.10.6-1~22.04.1 libpython3.10-minimal libpython3.10-stdlib libreadline8=8.1.2-1 librtmp1=2.4+20151223.gitfa8646d.1-2build4 libsasl2-2=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules-db=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules=2.1.27+dfsg2-3ubuntu1.2 libsqlite3-0=3.37.2-2ubuntu0.4 libssh-4 lsb-release=11.1.0ubuntu4 media-types=7.0.0 pinentry-curses=1.1.1-1build2 publicsuffix=20211207.1025-1 python3-minimal=3.10.6-1~22.04.1 python3.10-minimal python3.10 python3=3.10.6-1~22.04.1 readline-common=8.1.2-1 unzip=6.0-26ubuntu3.2 zip=3.0-12build2 # Prepare Docker installation RUN curl 
-fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc \ @@ -27,7 +27,7 @@ RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings && apt update # Install docker-ce docker-ce-cli containerd.io and dependencies at fixed versions -RUN apt install -y apparmor containerd.io dbus-user-session dbus dmsetup docker-buildx-plugin docker-ce-cli docker-ce-rootless-extras docker-ce docker-compose-plugin gir1.2-glib-2.0 git-man git iptables less libapparmor1 libargon2-1 libbsd0 libcbor0.8 libcryptsetup12 libcurl3-gnutls libdbus-1-3 libdevmapper1.02.1 libedit2 liberror-perl libfido2-1 libgdbm-compat4 libgdbm6 libgirepository-1.0-1 libglib2.0-0 libglib2.0-data libicu70 libip4tc2 libip6tc2 libjson-c5 libkmod2 libltdl7 libmd0 libmnl0 libnetfilter-conntrack3 libnfnetlink0 libnftnl11 libnss-systemd libpam-systemd libperl5.34 libslirp0 libx11-6 libx11-data libxau6 libxcb1 libxdmcp6 libxext6 libxml2 libxmuu1 libxtables12 netbase networkd-dispatcher openssh-client patch perl-base perl-modules-5.34 perl pigz python3-dbus python3-gi shared-mime-info slirp4netns systemd-sysv systemd-timesyncd systemd xauth xdg-user-dirs xz-utils +RUN apt install -y apparmor=3.0.4-2ubuntu2.4 containerd.io=1.7.27-1 dbus-user-session=1.12.20-2ubuntu4.1 dbus=1.12.20-2ubuntu4.1 dmsetup=2:1.02.175-2.1ubuntu5 docker-buildx-plugin docker-ce-cli docker-ce-rootless-extras docker-ce docker-compose-plugin gir1.2-glib-2.0=1.72.0-1 git-man git iptables=1.8.7-1ubuntu5.2 less=590-1ubuntu0.22.04.3 libapparmor1=3.0.4-2ubuntu2.4 libargon2-1=0~20171227-0.3 libbsd0=0.11.5-1 libcbor0.8=0.8.0-2ubuntu1 libcryptsetup12=2:2.4.3-1ubuntu1.3 libcurl3-gnutls=7.81.0-1ubuntu1.20 libdbus-1-3=1.12.20-2ubuntu4.1 libdevmapper1.02.1=2:1.02.175-2.1ubuntu5 libedit2=3.1-20210910-1build1 liberror-perl=0.17029-1 libfido2-1=1.10.0-1 libgdbm-compat4=1.23-1 libgdbm6=1.23-1 libgirepository-1.0-1=1.72.0-1 libglib2.0-0 libglib2.0-data libicu70=70.1-2 libip4tc2=1.8.7-1ubuntu5.2 
libip6tc2=1.8.7-1ubuntu5.2 libjson-c5=0.15-3~ubuntu1.22.04.2 libkmod2=29-1ubuntu1 libltdl7=2.4.6-15build2 libmd0=1.0.4-1build1 libmnl0=1.0.4-3build2 libnetfilter-conntrack3=1.0.9-1 libnfnetlink0=1.0.1-3build3 libnftnl11=1.2.1-1build1 libnss-systemd libpam-systemd libperl5.34=5.34.0-3ubuntu1.4 libslirp0=4.6.1-1build1 libx11-6=2:1.7.5-1ubuntu0.3 libx11-data=2:1.7.5-1ubuntu0.3 libxau6=1:1.0.9-1build5 libxcb1=1.14-3ubuntu3 libxdmcp6=1:1.1.3-0ubuntu5 libxext6=2:1.3.4-1build1 libxml2=2.9.13+dfsg-1ubuntu0.7 libxmuu1=2:1.1.3-3 libxtables12=1.8.7-1ubuntu5.2 netbase=6.3 networkd-dispatcher=2.1-2ubuntu0.22.04.2 openssh-client=1:8.9p1-3ubuntu0.13 patch=2.7.6-7build2 perl-base=5.34.0-3ubuntu1.4 perl-modules-5.34=5.34.0-3ubuntu1.4 perl=5.34.0-3ubuntu1.4 pigz=2.6-1 python3-dbus=1.2.18-3build1 python3-gi=3.42.1-0ubuntu1 shared-mime-info=2.1-2 slirp4netns=1.0.1-2 systemd-sysv systemd-timesyncd systemd xauth=1:1.1-1build2 xdg-user-dirs=0.17-2ubuntu4 xz-utils=5.2.5-2ubuntu1 # Clean up apt cache RUN rm -rf /var/lib/apt/lists/* @@ -68,4 +68,4 @@ RUN rm -rf /workspace/controller # Copy the source code for local training and deploying to the swarm COPY . /MediSwarm RUN mkdir -p /fl_admin/transfer -RUN ln -s /MediSwarm /fl_admin/transfer/MediSwarm \ No newline at end of file +RUN ln -s /MediSwarm /fl_admin/transfer/MediSwarm diff --git a/scripts/ci/update_apt_versions.sh b/scripts/ci/update_apt_versions.sh index fab30c03..1a3a267d 100755 --- a/scripts/ci/update_apt_versions.sh +++ b/scripts/ci/update_apt_versions.sh @@ -36,13 +36,14 @@ while IFS= read -r match; do pkg="$(echo "$match" | cut -d= -f1)" ver="$(echo "$match" | cut -d= -f2)" echo -n "Checking $pkg=$ver... " - if ! dpkg -l "$pkg" | grep -q "$ver"; then + if ! 
dpkg-query -W -f='${Version}' "$pkg" 2>/dev/null | grep -q "$ver"; then echo "NOT FOUND – removing pin" sed -i "s|\b$pkg=$ver\b|$pkg|" "$DOCKERFILE_PATH" has_invalid_versions=1 else echo "OK" fi + done < <(grep -oP '\b[a-z0-9\.\-]+=[a-zA-Z0-9:~.+-]+\b' "$DOCKERFILE_PATH") git fetch origin main From 242a4832bf27c7a9fc76d9a445b4b61b92b625c5 Mon Sep 17 00:00:00 2001 From: GitHub CI Date: Thu, 10 Jul 2025 10:50:54 +0200 Subject: [PATCH 030/337] WIP: remove apt versions for rebuild --- docker_config/Dockerfile_ODELIA | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 7fc502a1..c32087b5 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -15,10 +15,10 @@ RUN apt update RUN apt install -y apt apt-utils libapt-pkg6.0 # Update versions of installed packages -RUN apt install -y base-files=12ubuntu4.7 bash=5.1-6ubuntu1.1 bsdutils=1:2.37.2-4ubuntu3.4 ca-certificates=20240203~22.04.1 coreutils=8.32-4.1ubuntu1.2 dpkg=1.21.1ubuntu2.3 e2fsprogs=1.46.5-2ubuntu1.2 gpgv libblkid1=2.37.2-4ubuntu3.4 libc-bin libc-dev-bin libc6-dev libc6 libcap2=1:2.44-1ubuntu0.22.04.2 libcom-err2=1.46.5-2ubuntu1.2 libext2fs2=1.46.5-2ubuntu1.2 libgnutls30=3.7.3-4ubuntu1.6 libgssapi-krb5-2 libk5crypto3 libkrb5-3 libkrb5support0 libmount1=2.37.2-4ubuntu3.4 libpam-modules-bin libpam-modules libpam-runtime libpam0g libseccomp2=2.5.3-2ubuntu3~22.04.1 libsmartcols1=2.37.2-4ubuntu3.4 libss2=1.46.5-2ubuntu1.2 libssl3=3.0.2-0ubuntu1.19 libsystemd0 libtasn1-6=4.18.0-4ubuntu0.1 libudev1 libuuid1=2.37.2-4ubuntu3.4 linux-libc-dev login=1:4.8.1-2ubuntu2.2 logsave=1.46.5-2ubuntu1.2 mount=2.37.2-4ubuntu3.4 openssl=3.0.2-0ubuntu1.19 passwd=1:4.8.1-2ubuntu2.2 util-linux=2.37.2-4ubuntu3.4 +RUN apt install -y base-files bash bsdutils ca-certificates coreutils dpkg e2fsprogs gpgv libblkid1 libc-bin libc-dev-bin libc6-dev libc6 libcap2 libcom-err2 libext2fs2 libgnutls30 libgssapi-krb5-2 
libk5crypto3 libkrb5-3 libkrb5support0 libmount1 libpam-modules-bin libpam-modules libpam-runtime libpam0g libseccomp2 libsmartcols1 libss2 libssl3 libsystemd0 libtasn1-6 libudev1 libuuid1 linux-libc-dev login logsave mount openssl passwd util-linux # Install apt-transport-https curl gnupg lsb-release zip and dependencies at defined versions -RUN apt install -y apt-transport-https curl=7.81.0-1ubuntu1.20 dirmngr distro-info-data=0.52ubuntu0.9 gnupg-l10n gnupg-utils gnupg gpg-agent gpg-wks-client gpg-wks-server gpg gpgconf gpgsm libassuan0=2.5.5-1build1 libbrotli1=1.0.9-2build6 libcurl4=7.81.0-1ubuntu1.20 libexpat1=2.4.7-1ubuntu0.6 libksba8=1.6.0-2ubuntu0.2 libldap-2.5-0=2.5.19+dfsg-0ubuntu0.22.04.1 libldap-common=2.5.19+dfsg-0ubuntu0.22.04.1 libmpdec3=2.5.1-2build2 libnghttp2-14=1.43.0-1ubuntu0.2 libnpth0=1.6-3build2 libpsl5=0.21.0-1.2build2 libpython3-stdlib=3.10.6-1~22.04.1 libpython3.10-minimal libpython3.10-stdlib libreadline8=8.1.2-1 librtmp1=2.4+20151223.gitfa8646d.1-2build4 libsasl2-2=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules-db=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules=2.1.27+dfsg2-3ubuntu1.2 libsqlite3-0=3.37.2-2ubuntu0.4 libssh-4 lsb-release=11.1.0ubuntu4 media-types=7.0.0 pinentry-curses=1.1.1-1build2 publicsuffix=20211207.1025-1 python3-minimal=3.10.6-1~22.04.1 python3.10-minimal python3.10 python3=3.10.6-1~22.04.1 readline-common=8.1.2-1 unzip=6.0-26ubuntu3.2 zip=3.0-12build2 +RUN apt install -y apt-transport-https curl dirmngr distro-info-data gnupg-l10n gnupg-utils gnupg gpg-agent gpg-wks-client gpg-wks-server gpg gpgconf gpgsm libassuan0 libbrotli1 libcurl4 libexpat1 libksba8 libldap-2.5-0 libldap-common libmpdec3 libnghttp2-14 libnpth0 libpsl5 libpython3-stdlib libpython3.10-minimal libpython3.10-stdlib libreadline8 librtmp1 libsasl2-2 libsasl2-modules-db libsasl2-modules libsqlite3-0 libssh-4 lsb-release media-types pinentry-curses publicsuffix python3-minimal python3.10-minimal python3.10 python3 readline-common unzip zip # Prepare Docker 
installation RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc \ @@ -27,7 +27,7 @@ RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings && apt update # Install docker-ce docker-ce-cli containerd.io and dependencies at fixed versions -RUN apt install -y apparmor=3.0.4-2ubuntu2.4 containerd.io=1.7.27-1 dbus-user-session=1.12.20-2ubuntu4.1 dbus=1.12.20-2ubuntu4.1 dmsetup=2:1.02.175-2.1ubuntu5 docker-buildx-plugin docker-ce-cli docker-ce-rootless-extras docker-ce docker-compose-plugin gir1.2-glib-2.0=1.72.0-1 git-man git iptables=1.8.7-1ubuntu5.2 less=590-1ubuntu0.22.04.3 libapparmor1=3.0.4-2ubuntu2.4 libargon2-1=0~20171227-0.3 libbsd0=0.11.5-1 libcbor0.8=0.8.0-2ubuntu1 libcryptsetup12=2:2.4.3-1ubuntu1.3 libcurl3-gnutls=7.81.0-1ubuntu1.20 libdbus-1-3=1.12.20-2ubuntu4.1 libdevmapper1.02.1=2:1.02.175-2.1ubuntu5 libedit2=3.1-20210910-1build1 liberror-perl=0.17029-1 libfido2-1=1.10.0-1 libgdbm-compat4=1.23-1 libgdbm6=1.23-1 libgirepository-1.0-1=1.72.0-1 libglib2.0-0 libglib2.0-data libicu70=70.1-2 libip4tc2=1.8.7-1ubuntu5.2 libip6tc2=1.8.7-1ubuntu5.2 libjson-c5=0.15-3~ubuntu1.22.04.2 libkmod2=29-1ubuntu1 libltdl7=2.4.6-15build2 libmd0=1.0.4-1build1 libmnl0=1.0.4-3build2 libnetfilter-conntrack3=1.0.9-1 libnfnetlink0=1.0.1-3build3 libnftnl11=1.2.1-1build1 libnss-systemd libpam-systemd libperl5.34=5.34.0-3ubuntu1.4 libslirp0=4.6.1-1build1 libx11-6=2:1.7.5-1ubuntu0.3 libx11-data=2:1.7.5-1ubuntu0.3 libxau6=1:1.0.9-1build5 libxcb1=1.14-3ubuntu3 libxdmcp6=1:1.1.3-0ubuntu5 libxext6=2:1.3.4-1build1 libxml2=2.9.13+dfsg-1ubuntu0.7 libxmuu1=2:1.1.3-3 libxtables12=1.8.7-1ubuntu5.2 netbase=6.3 networkd-dispatcher=2.1-2ubuntu0.22.04.2 openssh-client=1:8.9p1-3ubuntu0.13 patch=2.7.6-7build2 perl-base=5.34.0-3ubuntu1.4 perl-modules-5.34=5.34.0-3ubuntu1.4 perl=5.34.0-3ubuntu1.4 pigz=2.6-1 python3-dbus=1.2.18-3build1 python3-gi=3.42.1-0ubuntu1 shared-mime-info=2.1-2 slirp4netns=1.0.1-2 systemd-sysv systemd-timesyncd 
systemd xauth=1:1.1-1build2 xdg-user-dirs=0.17-2ubuntu4 xz-utils=5.2.5-2ubuntu1 +RUN apt install -y apparmor containerd.io dbus-user-session dbus dmsetup docker-buildx-plugin docker-ce-cli docker-ce-rootless-extras docker-ce docker-compose-plugin gir1.2-glib-2.0 git-man git iptables less libapparmor1 libargon2-1 libbsd0 libcbor0.8 libcryptsetup12 libcurl3-gnutls libdbus-1-3 libdevmapper1.02.1 libedit2 liberror-perl libfido2-1 libgdbm-compat4 libgdbm6 libgirepository-1.0-1 libglib2.0-0 libglib2.0-data libicu70 libip4tc2 libip6tc2 libjson-c5 libkmod2 libltdl7 libmd0 libmnl0 libnetfilter-conntrack3 libnfnetlink0 libnftnl11 libnss-systemd libpam-systemd libperl5.34 libslirp0 libx11-6 libx11-data libxau6 libxcb1 libxdmcp6 libxext6 libxml2 libxmuu1 libxtables12 netbase networkd-dispatcher openssh-client patch perl-base perl-modules-5.34 perl pigz python3-dbus python3-gi shared-mime-info slirp4netns systemd-sysv systemd-timesyncd systemd xauth xdg-user-dirs xz-utils # Clean up apt cache RUN rm -rf /var/lib/apt/lists/* @@ -68,4 +68,4 @@ RUN rm -rf /workspace/controller # Copy the source code for local training and deploying to the swarm COPY . 
/MediSwarm RUN mkdir -p /fl_admin/transfer -RUN ln -s /MediSwarm /fl_admin/transfer/MediSwarm +RUN ln -s /MediSwarm /fl_admin/transfer/MediSwarm \ No newline at end of file From 8dd171ac801ea9361c0a695ac8a146c86aa623fc Mon Sep 17 00:00:00 2001 From: JieFu Zhu Date: Thu, 10 Jul 2025 11:01:01 +0200 Subject: [PATCH 031/337] chore: update APT package versions in Dockerfile and adjust CI workflow Signed-off-by: GitHub CI --- .github/workflows/update-apt-versions.yml | 44 ++++++++++------------- docker_config/Dockerfile_ODELIA | 8 ++--- scripts/ci/update_apt_versions.sh | 3 +- 3 files changed, 23 insertions(+), 32 deletions(-) diff --git a/.github/workflows/update-apt-versions.yml b/.github/workflows/update-apt-versions.yml index da9ad308..4ac7ae08 100644 --- a/.github/workflows/update-apt-versions.yml +++ b/.github/workflows/update-apt-versions.yml @@ -1,46 +1,46 @@ -name: Auto Update APT Versions +name: Auto Update APT Versions (Self-hosted) on: schedule: - - cron: '0 5 * * *' + # run eveyday at 04:00 UTC + - cron: '0 4 * * *' workflow_dispatch: jobs: update-apt: - name: Update APT Package Versions in Dockerfile - runs-on: ubuntu-latest + runs-on: self-hosted + timeout-minutes: 60 steps: - name: Checkout repository (with submodules) uses: actions/checkout@v3 with: submodules: true + fetch-depth: 0 - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.x' - - - name: Install dependencies - run: sudo apt-get update && sudo apt-get install -y git apt-utils - - - name: Configure Git for CI + - name: Set up Git run: | git config --global user.email "ci@github.com" git config --global user.name "GitHub CI" - - name: Make update script executable - run: chmod +x scripts/ci/update_apt_versions.sh + - name: Create and switch to apt-update branch + run: | + git checkout -b ci/apt-update || git switch ci/apt-update - name: Run APT update script - run: scripts/ci/update_apt_versions.sh + run: | + chmod +x scripts/ci/update_apt_versions.sh + 
scripts/ci/update_apt_versions.sh - name: Show git diff for debugging - run: git diff + run: git diff || true + + - name: Push apt-update branch + if: env.NO_CHANGES == 'false' + run: git push origin ci/apt-update --force - name: Create Pull Request if: env.NO_CHANGES == 'false' - id: cpr uses: peter-evans/create-pull-request@v5 with: commit-message: "chore: update apt versions in Dockerfile_ODELIA" @@ -52,11 +52,3 @@ jobs: based on a rebuild and inspection of installation logs. base: main delete-branch: false - - - name: Print created PR URL - if: env.NO_CHANGES == 'false' - run: | - echo "Created PR: ${{ steps.cpr.outputs.pull-request-url }}" - - - name: Show final Dockerfile diff - run: cat docker_config/Dockerfile_ODELIA diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index c32087b5..7fc502a1 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -15,10 +15,10 @@ RUN apt update RUN apt install -y apt apt-utils libapt-pkg6.0 # Update versions of installed packages -RUN apt install -y base-files bash bsdutils ca-certificates coreutils dpkg e2fsprogs gpgv libblkid1 libc-bin libc-dev-bin libc6-dev libc6 libcap2 libcom-err2 libext2fs2 libgnutls30 libgssapi-krb5-2 libk5crypto3 libkrb5-3 libkrb5support0 libmount1 libpam-modules-bin libpam-modules libpam-runtime libpam0g libseccomp2 libsmartcols1 libss2 libssl3 libsystemd0 libtasn1-6 libudev1 libuuid1 linux-libc-dev login logsave mount openssl passwd util-linux +RUN apt install -y base-files=12ubuntu4.7 bash=5.1-6ubuntu1.1 bsdutils=1:2.37.2-4ubuntu3.4 ca-certificates=20240203~22.04.1 coreutils=8.32-4.1ubuntu1.2 dpkg=1.21.1ubuntu2.3 e2fsprogs=1.46.5-2ubuntu1.2 gpgv libblkid1=2.37.2-4ubuntu3.4 libc-bin libc-dev-bin libc6-dev libc6 libcap2=1:2.44-1ubuntu0.22.04.2 libcom-err2=1.46.5-2ubuntu1.2 libext2fs2=1.46.5-2ubuntu1.2 libgnutls30=3.7.3-4ubuntu1.6 libgssapi-krb5-2 libk5crypto3 libkrb5-3 libkrb5support0 libmount1=2.37.2-4ubuntu3.4 libpam-modules-bin 
libpam-modules libpam-runtime libpam0g libseccomp2=2.5.3-2ubuntu3~22.04.1 libsmartcols1=2.37.2-4ubuntu3.4 libss2=1.46.5-2ubuntu1.2 libssl3=3.0.2-0ubuntu1.19 libsystemd0 libtasn1-6=4.18.0-4ubuntu0.1 libudev1 libuuid1=2.37.2-4ubuntu3.4 linux-libc-dev login=1:4.8.1-2ubuntu2.2 logsave=1.46.5-2ubuntu1.2 mount=2.37.2-4ubuntu3.4 openssl=3.0.2-0ubuntu1.19 passwd=1:4.8.1-2ubuntu2.2 util-linux=2.37.2-4ubuntu3.4 # Install apt-transport-https curl gnupg lsb-release zip and dependencies at defined versions -RUN apt install -y apt-transport-https curl dirmngr distro-info-data gnupg-l10n gnupg-utils gnupg gpg-agent gpg-wks-client gpg-wks-server gpg gpgconf gpgsm libassuan0 libbrotli1 libcurl4 libexpat1 libksba8 libldap-2.5-0 libldap-common libmpdec3 libnghttp2-14 libnpth0 libpsl5 libpython3-stdlib libpython3.10-minimal libpython3.10-stdlib libreadline8 librtmp1 libsasl2-2 libsasl2-modules-db libsasl2-modules libsqlite3-0 libssh-4 lsb-release media-types pinentry-curses publicsuffix python3-minimal python3.10-minimal python3.10 python3 readline-common unzip zip +RUN apt install -y apt-transport-https curl=7.81.0-1ubuntu1.20 dirmngr distro-info-data=0.52ubuntu0.9 gnupg-l10n gnupg-utils gnupg gpg-agent gpg-wks-client gpg-wks-server gpg gpgconf gpgsm libassuan0=2.5.5-1build1 libbrotli1=1.0.9-2build6 libcurl4=7.81.0-1ubuntu1.20 libexpat1=2.4.7-1ubuntu0.6 libksba8=1.6.0-2ubuntu0.2 libldap-2.5-0=2.5.19+dfsg-0ubuntu0.22.04.1 libldap-common=2.5.19+dfsg-0ubuntu0.22.04.1 libmpdec3=2.5.1-2build2 libnghttp2-14=1.43.0-1ubuntu0.2 libnpth0=1.6-3build2 libpsl5=0.21.0-1.2build2 libpython3-stdlib=3.10.6-1~22.04.1 libpython3.10-minimal libpython3.10-stdlib libreadline8=8.1.2-1 librtmp1=2.4+20151223.gitfa8646d.1-2build4 libsasl2-2=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules-db=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules=2.1.27+dfsg2-3ubuntu1.2 libsqlite3-0=3.37.2-2ubuntu0.4 libssh-4 lsb-release=11.1.0ubuntu4 media-types=7.0.0 pinentry-curses=1.1.1-1build2 publicsuffix=20211207.1025-1 
python3-minimal=3.10.6-1~22.04.1 python3.10-minimal python3.10 python3=3.10.6-1~22.04.1 readline-common=8.1.2-1 unzip=6.0-26ubuntu3.2 zip=3.0-12build2 # Prepare Docker installation RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc \ @@ -27,7 +27,7 @@ RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings && apt update # Install docker-ce docker-ce-cli containerd.io and dependencies at fixed versions -RUN apt install -y apparmor containerd.io dbus-user-session dbus dmsetup docker-buildx-plugin docker-ce-cli docker-ce-rootless-extras docker-ce docker-compose-plugin gir1.2-glib-2.0 git-man git iptables less libapparmor1 libargon2-1 libbsd0 libcbor0.8 libcryptsetup12 libcurl3-gnutls libdbus-1-3 libdevmapper1.02.1 libedit2 liberror-perl libfido2-1 libgdbm-compat4 libgdbm6 libgirepository-1.0-1 libglib2.0-0 libglib2.0-data libicu70 libip4tc2 libip6tc2 libjson-c5 libkmod2 libltdl7 libmd0 libmnl0 libnetfilter-conntrack3 libnfnetlink0 libnftnl11 libnss-systemd libpam-systemd libperl5.34 libslirp0 libx11-6 libx11-data libxau6 libxcb1 libxdmcp6 libxext6 libxml2 libxmuu1 libxtables12 netbase networkd-dispatcher openssh-client patch perl-base perl-modules-5.34 perl pigz python3-dbus python3-gi shared-mime-info slirp4netns systemd-sysv systemd-timesyncd systemd xauth xdg-user-dirs xz-utils +RUN apt install -y apparmor=3.0.4-2ubuntu2.4 containerd.io=1.7.27-1 dbus-user-session=1.12.20-2ubuntu4.1 dbus=1.12.20-2ubuntu4.1 dmsetup=2:1.02.175-2.1ubuntu5 docker-buildx-plugin docker-ce-cli docker-ce-rootless-extras docker-ce docker-compose-plugin gir1.2-glib-2.0=1.72.0-1 git-man git iptables=1.8.7-1ubuntu5.2 less=590-1ubuntu0.22.04.3 libapparmor1=3.0.4-2ubuntu2.4 libargon2-1=0~20171227-0.3 libbsd0=0.11.5-1 libcbor0.8=0.8.0-2ubuntu1 libcryptsetup12=2:2.4.3-1ubuntu1.3 libcurl3-gnutls=7.81.0-1ubuntu1.20 libdbus-1-3=1.12.20-2ubuntu4.1 libdevmapper1.02.1=2:1.02.175-2.1ubuntu5 libedit2=3.1-20210910-1build1 
liberror-perl=0.17029-1 libfido2-1=1.10.0-1 libgdbm-compat4=1.23-1 libgdbm6=1.23-1 libgirepository-1.0-1=1.72.0-1 libglib2.0-0 libglib2.0-data libicu70=70.1-2 libip4tc2=1.8.7-1ubuntu5.2 libip6tc2=1.8.7-1ubuntu5.2 libjson-c5=0.15-3~ubuntu1.22.04.2 libkmod2=29-1ubuntu1 libltdl7=2.4.6-15build2 libmd0=1.0.4-1build1 libmnl0=1.0.4-3build2 libnetfilter-conntrack3=1.0.9-1 libnfnetlink0=1.0.1-3build3 libnftnl11=1.2.1-1build1 libnss-systemd libpam-systemd libperl5.34=5.34.0-3ubuntu1.4 libslirp0=4.6.1-1build1 libx11-6=2:1.7.5-1ubuntu0.3 libx11-data=2:1.7.5-1ubuntu0.3 libxau6=1:1.0.9-1build5 libxcb1=1.14-3ubuntu3 libxdmcp6=1:1.1.3-0ubuntu5 libxext6=2:1.3.4-1build1 libxml2=2.9.13+dfsg-1ubuntu0.7 libxmuu1=2:1.1.3-3 libxtables12=1.8.7-1ubuntu5.2 netbase=6.3 networkd-dispatcher=2.1-2ubuntu0.22.04.2 openssh-client=1:8.9p1-3ubuntu0.13 patch=2.7.6-7build2 perl-base=5.34.0-3ubuntu1.4 perl-modules-5.34=5.34.0-3ubuntu1.4 perl=5.34.0-3ubuntu1.4 pigz=2.6-1 python3-dbus=1.2.18-3build1 python3-gi=3.42.1-0ubuntu1 shared-mime-info=2.1-2 slirp4netns=1.0.1-2 systemd-sysv systemd-timesyncd systemd xauth=1:1.1-1build2 xdg-user-dirs=0.17-2ubuntu4 xz-utils=5.2.5-2ubuntu1 # Clean up apt cache RUN rm -rf /var/lib/apt/lists/* @@ -68,4 +68,4 @@ RUN rm -rf /workspace/controller # Copy the source code for local training and deploying to the swarm COPY . /MediSwarm RUN mkdir -p /fl_admin/transfer -RUN ln -s /MediSwarm /fl_admin/transfer/MediSwarm \ No newline at end of file +RUN ln -s /MediSwarm /fl_admin/transfer/MediSwarm diff --git a/scripts/ci/update_apt_versions.sh b/scripts/ci/update_apt_versions.sh index 1a3a267d..d86bbc6d 100755 --- a/scripts/ci/update_apt_versions.sh +++ b/scripts/ci/update_apt_versions.sh @@ -36,14 +36,13 @@ while IFS= read -r match; do pkg="$(echo "$match" | cut -d= -f1)" ver="$(echo "$match" | cut -d= -f2)" echo -n "Checking $pkg=$ver... " - if ! dpkg-query -W -f='${Version}' "$pkg" 2>/dev/null | grep -q "$ver"; then + if ! 
apt-cache madison "$pkg" | grep -q "$ver"; then echo "NOT FOUND – removing pin" sed -i "s|\b$pkg=$ver\b|$pkg|" "$DOCKERFILE_PATH" has_invalid_versions=1 else echo "OK" fi - done < <(grep -oP '\b[a-z0-9\.\-]+=[a-zA-Z0-9:~.+-]+\b' "$DOCKERFILE_PATH") git fetch origin main From a4c97b6314a937a8cb59589f0757623a8f0919bc Mon Sep 17 00:00:00 2001 From: GitHub CI Date: Thu, 10 Jul 2025 11:01:55 +0200 Subject: [PATCH 032/337] WIP: remove apt versions for rebuild --- docker_config/Dockerfile_ODELIA | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 7fc502a1..c32087b5 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -15,10 +15,10 @@ RUN apt update RUN apt install -y apt apt-utils libapt-pkg6.0 # Update versions of installed packages -RUN apt install -y base-files=12ubuntu4.7 bash=5.1-6ubuntu1.1 bsdutils=1:2.37.2-4ubuntu3.4 ca-certificates=20240203~22.04.1 coreutils=8.32-4.1ubuntu1.2 dpkg=1.21.1ubuntu2.3 e2fsprogs=1.46.5-2ubuntu1.2 gpgv libblkid1=2.37.2-4ubuntu3.4 libc-bin libc-dev-bin libc6-dev libc6 libcap2=1:2.44-1ubuntu0.22.04.2 libcom-err2=1.46.5-2ubuntu1.2 libext2fs2=1.46.5-2ubuntu1.2 libgnutls30=3.7.3-4ubuntu1.6 libgssapi-krb5-2 libk5crypto3 libkrb5-3 libkrb5support0 libmount1=2.37.2-4ubuntu3.4 libpam-modules-bin libpam-modules libpam-runtime libpam0g libseccomp2=2.5.3-2ubuntu3~22.04.1 libsmartcols1=2.37.2-4ubuntu3.4 libss2=1.46.5-2ubuntu1.2 libssl3=3.0.2-0ubuntu1.19 libsystemd0 libtasn1-6=4.18.0-4ubuntu0.1 libudev1 libuuid1=2.37.2-4ubuntu3.4 linux-libc-dev login=1:4.8.1-2ubuntu2.2 logsave=1.46.5-2ubuntu1.2 mount=2.37.2-4ubuntu3.4 openssl=3.0.2-0ubuntu1.19 passwd=1:4.8.1-2ubuntu2.2 util-linux=2.37.2-4ubuntu3.4 +RUN apt install -y base-files bash bsdutils ca-certificates coreutils dpkg e2fsprogs gpgv libblkid1 libc-bin libc-dev-bin libc6-dev libc6 libcap2 libcom-err2 libext2fs2 libgnutls30 libgssapi-krb5-2 libk5crypto3 libkrb5-3 
libkrb5support0 libmount1 libpam-modules-bin libpam-modules libpam-runtime libpam0g libseccomp2 libsmartcols1 libss2 libssl3 libsystemd0 libtasn1-6 libudev1 libuuid1 linux-libc-dev login logsave mount openssl passwd util-linux # Install apt-transport-https curl gnupg lsb-release zip and dependencies at defined versions -RUN apt install -y apt-transport-https curl=7.81.0-1ubuntu1.20 dirmngr distro-info-data=0.52ubuntu0.9 gnupg-l10n gnupg-utils gnupg gpg-agent gpg-wks-client gpg-wks-server gpg gpgconf gpgsm libassuan0=2.5.5-1build1 libbrotli1=1.0.9-2build6 libcurl4=7.81.0-1ubuntu1.20 libexpat1=2.4.7-1ubuntu0.6 libksba8=1.6.0-2ubuntu0.2 libldap-2.5-0=2.5.19+dfsg-0ubuntu0.22.04.1 libldap-common=2.5.19+dfsg-0ubuntu0.22.04.1 libmpdec3=2.5.1-2build2 libnghttp2-14=1.43.0-1ubuntu0.2 libnpth0=1.6-3build2 libpsl5=0.21.0-1.2build2 libpython3-stdlib=3.10.6-1~22.04.1 libpython3.10-minimal libpython3.10-stdlib libreadline8=8.1.2-1 librtmp1=2.4+20151223.gitfa8646d.1-2build4 libsasl2-2=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules-db=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules=2.1.27+dfsg2-3ubuntu1.2 libsqlite3-0=3.37.2-2ubuntu0.4 libssh-4 lsb-release=11.1.0ubuntu4 media-types=7.0.0 pinentry-curses=1.1.1-1build2 publicsuffix=20211207.1025-1 python3-minimal=3.10.6-1~22.04.1 python3.10-minimal python3.10 python3=3.10.6-1~22.04.1 readline-common=8.1.2-1 unzip=6.0-26ubuntu3.2 zip=3.0-12build2 +RUN apt install -y apt-transport-https curl dirmngr distro-info-data gnupg-l10n gnupg-utils gnupg gpg-agent gpg-wks-client gpg-wks-server gpg gpgconf gpgsm libassuan0 libbrotli1 libcurl4 libexpat1 libksba8 libldap-2.5-0 libldap-common libmpdec3 libnghttp2-14 libnpth0 libpsl5 libpython3-stdlib libpython3.10-minimal libpython3.10-stdlib libreadline8 librtmp1 libsasl2-2 libsasl2-modules-db libsasl2-modules libsqlite3-0 libssh-4 lsb-release media-types pinentry-curses publicsuffix python3-minimal python3.10-minimal python3.10 python3 readline-common unzip zip # Prepare Docker installation RUN curl -fsSL 
https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc \ @@ -27,7 +27,7 @@ RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings && apt update # Install docker-ce docker-ce-cli containerd.io and dependencies at fixed versions -RUN apt install -y apparmor=3.0.4-2ubuntu2.4 containerd.io=1.7.27-1 dbus-user-session=1.12.20-2ubuntu4.1 dbus=1.12.20-2ubuntu4.1 dmsetup=2:1.02.175-2.1ubuntu5 docker-buildx-plugin docker-ce-cli docker-ce-rootless-extras docker-ce docker-compose-plugin gir1.2-glib-2.0=1.72.0-1 git-man git iptables=1.8.7-1ubuntu5.2 less=590-1ubuntu0.22.04.3 libapparmor1=3.0.4-2ubuntu2.4 libargon2-1=0~20171227-0.3 libbsd0=0.11.5-1 libcbor0.8=0.8.0-2ubuntu1 libcryptsetup12=2:2.4.3-1ubuntu1.3 libcurl3-gnutls=7.81.0-1ubuntu1.20 libdbus-1-3=1.12.20-2ubuntu4.1 libdevmapper1.02.1=2:1.02.175-2.1ubuntu5 libedit2=3.1-20210910-1build1 liberror-perl=0.17029-1 libfido2-1=1.10.0-1 libgdbm-compat4=1.23-1 libgdbm6=1.23-1 libgirepository-1.0-1=1.72.0-1 libglib2.0-0 libglib2.0-data libicu70=70.1-2 libip4tc2=1.8.7-1ubuntu5.2 libip6tc2=1.8.7-1ubuntu5.2 libjson-c5=0.15-3~ubuntu1.22.04.2 libkmod2=29-1ubuntu1 libltdl7=2.4.6-15build2 libmd0=1.0.4-1build1 libmnl0=1.0.4-3build2 libnetfilter-conntrack3=1.0.9-1 libnfnetlink0=1.0.1-3build3 libnftnl11=1.2.1-1build1 libnss-systemd libpam-systemd libperl5.34=5.34.0-3ubuntu1.4 libslirp0=4.6.1-1build1 libx11-6=2:1.7.5-1ubuntu0.3 libx11-data=2:1.7.5-1ubuntu0.3 libxau6=1:1.0.9-1build5 libxcb1=1.14-3ubuntu3 libxdmcp6=1:1.1.3-0ubuntu5 libxext6=2:1.3.4-1build1 libxml2=2.9.13+dfsg-1ubuntu0.7 libxmuu1=2:1.1.3-3 libxtables12=1.8.7-1ubuntu5.2 netbase=6.3 networkd-dispatcher=2.1-2ubuntu0.22.04.2 openssh-client=1:8.9p1-3ubuntu0.13 patch=2.7.6-7build2 perl-base=5.34.0-3ubuntu1.4 perl-modules-5.34=5.34.0-3ubuntu1.4 perl=5.34.0-3ubuntu1.4 pigz=2.6-1 python3-dbus=1.2.18-3build1 python3-gi=3.42.1-0ubuntu1 shared-mime-info=2.1-2 slirp4netns=1.0.1-2 systemd-sysv systemd-timesyncd systemd xauth=1:1.1-1build2 
xdg-user-dirs=0.17-2ubuntu4 xz-utils=5.2.5-2ubuntu1 +RUN apt install -y apparmor containerd.io dbus-user-session dbus dmsetup docker-buildx-plugin docker-ce-cli docker-ce-rootless-extras docker-ce docker-compose-plugin gir1.2-glib-2.0 git-man git iptables less libapparmor1 libargon2-1 libbsd0 libcbor0.8 libcryptsetup12 libcurl3-gnutls libdbus-1-3 libdevmapper1.02.1 libedit2 liberror-perl libfido2-1 libgdbm-compat4 libgdbm6 libgirepository-1.0-1 libglib2.0-0 libglib2.0-data libicu70 libip4tc2 libip6tc2 libjson-c5 libkmod2 libltdl7 libmd0 libmnl0 libnetfilter-conntrack3 libnfnetlink0 libnftnl11 libnss-systemd libpam-systemd libperl5.34 libslirp0 libx11-6 libx11-data libxau6 libxcb1 libxdmcp6 libxext6 libxml2 libxmuu1 libxtables12 netbase networkd-dispatcher openssh-client patch perl-base perl-modules-5.34 perl pigz python3-dbus python3-gi shared-mime-info slirp4netns systemd-sysv systemd-timesyncd systemd xauth xdg-user-dirs xz-utils # Clean up apt cache RUN rm -rf /var/lib/apt/lists/* @@ -68,4 +68,4 @@ RUN rm -rf /workspace/controller # Copy the source code for local training and deploying to the swarm COPY . 
/MediSwarm RUN mkdir -p /fl_admin/transfer -RUN ln -s /MediSwarm /fl_admin/transfer/MediSwarm +RUN ln -s /MediSwarm /fl_admin/transfer/MediSwarm \ No newline at end of file From 9a8ad273c5b60221ed584bd613f372f6bbd03fb5 Mon Sep 17 00:00:00 2001 From: Ultimate-Storm Date: Thu, 10 Jul 2025 11:05:22 +0200 Subject: [PATCH 033/337] chore: update apt versions in Dockerfile_ODELIA --- docker_config/Dockerfile_ODELIA | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 5af4aff0..476a3044 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -27,7 +27,7 @@ RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings && apt update # Install docker-ce docker-ce-cli containerd.io and dependencies at fixed versions -RUN apt install -y apparmor=3.0.4-2ubuntu2.4 containerd.io=1.7.27-1 dbus-user-session=1.12.20-2ubuntu4.1 dbus=1.12.20-2ubuntu4.1 dmsetup=2:1.02.175-2.1ubuntu5 docker-buildx-plugin=0.25.0-1~ubuntu.22.04~jammy docker-ce-cli=5:28.3.1-1~ubuntu.22.04~jammy docker-ce-rootless-extras=5:28.3.1-1~ubuntu.22.04~jammy docker-ce=5:28.3.1-1~ubuntu.22.04~jammy docker-compose-plugin=2.38.1-1~ubuntu.22.04~jammy gir1.2-glib-2.0=1.72.0-1 git-man=1:2.34.1-1ubuntu1.13 git=1:2.34.1-1ubuntu1.13 iptables=1.8.7-1ubuntu5.2 less=590-1ubuntu0.22.04.3 libapparmor1=3.0.4-2ubuntu2.4 libargon2-1=0~20171227-0.3 libbsd0=0.11.5-1 libcbor0.8=0.8.0-2ubuntu1 libcryptsetup12=2:2.4.3-1ubuntu1.3 libcurl3-gnutls=7.81.0-1ubuntu1.20 libdbus-1-3=1.12.20-2ubuntu4.1 libdevmapper1.02.1=2:1.02.175-2.1ubuntu5 libedit2=3.1-20210910-1build1 liberror-perl=0.17029-1 libfido2-1=1.10.0-1 libgdbm-compat4=1.23-1 libgdbm6=1.23-1 libgirepository-1.0-1=1.72.0-1 libglib2.0-0=2.72.4-0ubuntu2.5 libglib2.0-data=2.72.4-0ubuntu2.5 libicu70=70.1-2 libip4tc2=1.8.7-1ubuntu5.2 libip6tc2=1.8.7-1ubuntu5.2 libjson-c5=0.15-3~ubuntu1.22.04.2 libkmod2=29-1ubuntu1 libltdl7=2.4.6-15build2 
libmd0=1.0.4-1build1 libmnl0=1.0.4-3build2 libnetfilter-conntrack3=1.0.9-1 libnfnetlink0=1.0.1-3build3 libnftnl11=1.2.1-1build1 libnss-systemd=249.11-0ubuntu3.16 libpam-systemd=249.11-0ubuntu3.16 libperl5.34=5.34.0-3ubuntu1.4 libslirp0=4.6.1-1build1 libx11-6=2:1.7.5-1ubuntu0.3 libx11-data=2:1.7.5-1ubuntu0.3 libxau6=1:1.0.9-1build5 libxcb1=1.14-3ubuntu3 libxdmcp6=1:1.1.3-0ubuntu5 libxext6=2:1.3.4-1build1 libxml2=2.9.13+dfsg-1ubuntu0.7 libxmuu1=2:1.1.3-3 libxtables12=1.8.7-1ubuntu5.2 netbase=6.3 networkd-dispatcher=2.1-2ubuntu0.22.04.2 openssh-client=1:8.9p1-3ubuntu0.13 patch=2.7.6-7build2 perl-base=5.34.0-3ubuntu1.4 perl-modules-5.34=5.34.0-3ubuntu1.4 perl=5.34.0-3ubuntu1.4 pigz=2.6-1 python3-dbus=1.2.18-3build1 python3-gi=3.42.1-0ubuntu1 shared-mime-info=2.1-2 slirp4netns=1.0.1-2 systemd-sysv=249.11-0ubuntu3.16 systemd-timesyncd=249.11-0ubuntu3.16 systemd=249.11-0ubuntu3.16 xauth=1:1.1-1build2 xdg-user-dirs=0.17-2ubuntu4 xz-utils=5.2.5-2ubuntu1 +RUN apt install -y apparmor=3.0.4-2ubuntu2.4 containerd.io=1.7.27-1 dbus-user-session=1.12.20-2ubuntu4.1 dbus=1.12.20-2ubuntu4.1 dmsetup=2:1.02.175-2.1ubuntu5 docker-buildx-plugin docker-ce-cli docker-ce-rootless-extras docker-ce docker-compose-plugin gir1.2-glib-2.0=1.72.0-1 git-man=1:2.34.1-1ubuntu1.14 git=1:2.34.1-1ubuntu1.14 iptables=1.8.7-1ubuntu5.2 less=590-1ubuntu0.22.04.3 libapparmor1=3.0.4-2ubuntu2.4 libargon2-1=0~20171227-0.3 libbsd0=0.11.5-1 libcbor0.8=0.8.0-2ubuntu1 libcryptsetup12=2:2.4.3-1ubuntu1.3 libcurl3-gnutls=7.81.0-1ubuntu1.20 libdbus-1-3=1.12.20-2ubuntu4.1 libdevmapper1.02.1=2:1.02.175-2.1ubuntu5 libedit2=3.1-20210910-1build1 liberror-perl=0.17029-1 libfido2-1=1.10.0-1 libgdbm-compat4=1.23-1 libgdbm6=1.23-1 libgirepository-1.0-1=1.72.0-1 libglib2.0-0=2.72.4-0ubuntu2.5 libglib2.0-data=2.72.4-0ubuntu2.5 libicu70=70.1-2 libip4tc2=1.8.7-1ubuntu5.2 libip6tc2=1.8.7-1ubuntu5.2 libjson-c5=0.15-3~ubuntu1.22.04.2 libkmod2=29-1ubuntu1 libltdl7=2.4.6-15build2 libmd0=1.0.4-1build1 libmnl0=1.0.4-3build2 
libnetfilter-conntrack3=1.0.9-1 libnfnetlink0=1.0.1-3build3 libnftnl11=1.2.1-1build1 libnss-systemd=249.11-0ubuntu3.16 libpam-systemd=249.11-0ubuntu3.16 libperl5.34=5.34.0-3ubuntu1.4 libslirp0=4.6.1-1build1 libx11-6=2:1.7.5-1ubuntu0.3 libx11-data=2:1.7.5-1ubuntu0.3 libxau6=1:1.0.9-1build5 libxcb1=1.14-3ubuntu3 libxdmcp6=1:1.1.3-0ubuntu5 libxext6=2:1.3.4-1build1 libxml2=2.9.13+dfsg-1ubuntu0.7 libxmuu1=2:1.1.3-3 libxtables12=1.8.7-1ubuntu5.2 netbase=6.3 networkd-dispatcher=2.1-2ubuntu0.22.04.2 openssh-client=1:8.9p1-3ubuntu0.13 patch=2.7.6-7build2 perl-base=5.34.0-3ubuntu1.4 perl-modules-5.34=5.34.0-3ubuntu1.4 perl=5.34.0-3ubuntu1.4 pigz=2.6-1 python3-dbus=1.2.18-3build1 python3-gi=3.42.1-0ubuntu1 shared-mime-info=2.1-2 slirp4netns=1.0.1-2 systemd-sysv=249.11-0ubuntu3.16 systemd-timesyncd=249.11-0ubuntu3.16 systemd=249.11-0ubuntu3.16 xauth=1:1.1-1build2 xdg-user-dirs=0.17-2ubuntu4 xz-utils=5.2.5-2ubuntu1 # Clean up apt cache RUN rm -rf /var/lib/apt/lists/* @@ -69,6 +69,3 @@ RUN rm -rf /workspace/controller COPY ./MediSwarm /MediSwarm RUN mkdir -p /fl_admin/transfer RUN ln -s /MediSwarm /fl_admin/transfer/MediSwarm - -# Copy pre-trained model weights to image -COPY ./torch_home_cache /torch_home From cba7221e49c343f4cd3cdd907c02563b273c4b69 Mon Sep 17 00:00:00 2001 From: JieFu Zhu Date: Thu, 10 Jul 2025 11:07:18 +0200 Subject: [PATCH 034/337] Potential fix for code scanning alert no. 
4: Workflow does not contain permissions Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> --- .github/workflows/update-apt-versions.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/update-apt-versions.yml b/.github/workflows/update-apt-versions.yml index 4ac7ae08..44d248c3 100644 --- a/.github/workflows/update-apt-versions.yml +++ b/.github/workflows/update-apt-versions.yml @@ -1,9 +1,13 @@ name: Auto Update APT Versions (Self-hosted) +permissions: + contents: read + pull-requests: write + on: schedule: # run eveyday at 04:00 UTC - - cron: '0 4 * * *' + - cron: '0 4 * * *' workflow_dispatch: jobs: From f0cf38673aa8ca1a2df246ee0862d001c3cf47f5 Mon Sep 17 00:00:00 2001 From: GitHub CI Date: Thu, 10 Jul 2025 11:11:00 +0200 Subject: [PATCH 035/337] WIP: remove apt versions for rebuild --- docker_config/Dockerfile_ODELIA | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 5af4aff0..eda17b9b 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -12,13 +12,13 @@ ENV PYTHON_VERSION=3.10.14 # Install updates of installed packages RUN apt update -RUN apt install -y apt=2.4.14 apt-utils=2.4.14 libapt-pkg6.0=2.4.14 +RUN apt install -y apt apt-utils libapt-pkg6.0 # Update versions of installed packages -RUN apt install -y base-files=12ubuntu4.7 bash=5.1-6ubuntu1.1 bsdutils=1:2.37.2-4ubuntu3.4 ca-certificates=20240203~22.04.1 coreutils=8.32-4.1ubuntu1.2 dpkg=1.21.1ubuntu2.3 e2fsprogs=1.46.5-2ubuntu1.2 gpgv=2.2.27-3ubuntu2.4 libblkid1=2.37.2-4ubuntu3.4 libc-bin=2.35-0ubuntu3.10 libc-dev-bin=2.35-0ubuntu3.10 libc6-dev=2.35-0ubuntu3.10 libc6=2.35-0ubuntu3.10 libcap2=1:2.44-1ubuntu0.22.04.2 libcom-err2=1.46.5-2ubuntu1.2 libext2fs2=1.46.5-2ubuntu1.2 libgnutls30=3.7.3-4ubuntu1.6 libgssapi-krb5-2=1.19.2-2ubuntu0.7 libk5crypto3=1.19.2-2ubuntu0.7 
libkrb5-3=1.19.2-2ubuntu0.7 libkrb5support0=1.19.2-2ubuntu0.7 libmount1=2.37.2-4ubuntu3.4 libpam-modules-bin=1.4.0-11ubuntu2.6 libpam-modules=1.4.0-11ubuntu2.6 libpam-runtime=1.4.0-11ubuntu2.6 libpam0g=1.4.0-11ubuntu2.6 libseccomp2=2.5.3-2ubuntu3~22.04.1 libsmartcols1=2.37.2-4ubuntu3.4 libss2=1.46.5-2ubuntu1.2 libssl3=3.0.2-0ubuntu1.19 libsystemd0=249.11-0ubuntu3.16 libtasn1-6=4.18.0-4ubuntu0.1 libudev1=249.11-0ubuntu3.16 libuuid1=2.37.2-4ubuntu3.4 linux-libc-dev=5.15.0-143.153 login=1:4.8.1-2ubuntu2.2 logsave=1.46.5-2ubuntu1.2 mount=2.37.2-4ubuntu3.4 openssl=3.0.2-0ubuntu1.19 passwd=1:4.8.1-2ubuntu2.2 util-linux=2.37.2-4ubuntu3.4 +RUN apt install -y base-files bash bsdutils ca-certificates coreutils dpkg e2fsprogs gpgv libblkid1 libc-bin libc-dev-bin libc6-dev libc6 libcap2 libcom-err2 libext2fs2 libgnutls30 libgssapi-krb5-2 libk5crypto3 libkrb5-3 libkrb5support0 libmount1 libpam-modules-bin libpam-modules libpam-runtime libpam0g libseccomp2 libsmartcols1 libss2 libssl3 libsystemd0 libtasn1-6 libudev1 libuuid1 linux-libc-dev login logsave mount openssl passwd util-linux # Install apt-transport-https curl gnupg lsb-release zip and dependencies at defined versions -RUN apt install -y apt-transport-https=2.4.14 curl=7.81.0-1ubuntu1.20 dirmngr=2.2.27-3ubuntu2.4 distro-info-data=0.52ubuntu0.9 gnupg-l10n=2.2.27-3ubuntu2.4 gnupg-utils=2.2.27-3ubuntu2.4 gnupg=2.2.27-3ubuntu2.4 gpg-agent=2.2.27-3ubuntu2.4 gpg-wks-client=2.2.27-3ubuntu2.4 gpg-wks-server=2.2.27-3ubuntu2.4 gpg=2.2.27-3ubuntu2.4 gpgconf=2.2.27-3ubuntu2.4 gpgsm=2.2.27-3ubuntu2.4 libassuan0=2.5.5-1build1 libbrotli1=1.0.9-2build6 libcurl4=7.81.0-1ubuntu1.20 libexpat1=2.4.7-1ubuntu0.6 libksba8=1.6.0-2ubuntu0.2 libldap-2.5-0=2.5.19+dfsg-0ubuntu0.22.04.1 libldap-common=2.5.19+dfsg-0ubuntu0.22.04.1 libmpdec3=2.5.1-2build2 libnghttp2-14=1.43.0-1ubuntu0.2 libnpth0=1.6-3build2 libpsl5=0.21.0-1.2build2 libpython3-stdlib=3.10.6-1~22.04.1 libpython3.10-minimal=3.10.12-1~22.04.10 libpython3.10-stdlib=3.10.12-1~22.04.10 
libreadline8=8.1.2-1 librtmp1=2.4+20151223.gitfa8646d.1-2build4 libsasl2-2=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules-db=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules=2.1.27+dfsg2-3ubuntu1.2 libsqlite3-0=3.37.2-2ubuntu0.4 libssh-4=0.9.6-2ubuntu0.22.04.4 lsb-release=11.1.0ubuntu4 media-types=7.0.0 pinentry-curses=1.1.1-1build2 publicsuffix=20211207.1025-1 python3-minimal=3.10.6-1~22.04.1 python3.10-minimal=3.10.12-1~22.04.10 python3.10=3.10.12-1~22.04.10 python3=3.10.6-1~22.04.1 readline-common=8.1.2-1 unzip=6.0-26ubuntu3.2 zip=3.0-12build2 +RUN apt install -y apt-transport-https curl dirmngr distro-info-data gnupg-l10n gnupg-utils gnupg gpg-agent gpg-wks-client gpg-wks-server gpg gpgconf gpgsm libassuan0 libbrotli1 libcurl4 libexpat1 libksba8 libldap-2.5-0 libldap-common libmpdec3 libnghttp2-14 libnpth0 libpsl5 libpython3-stdlib libpython3.10-minimal libpython3.10-stdlib libreadline8 librtmp1 libsasl2-2 libsasl2-modules-db libsasl2-modules libsqlite3-0 libssh-4 lsb-release media-types pinentry-curses publicsuffix python3-minimal python3.10-minimal python3.10 python3 readline-common unzip zip # Prepare Docker installation RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc \ @@ -27,7 +27,7 @@ RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings && apt update # Install docker-ce docker-ce-cli containerd.io and dependencies at fixed versions -RUN apt install -y apparmor=3.0.4-2ubuntu2.4 containerd.io=1.7.27-1 dbus-user-session=1.12.20-2ubuntu4.1 dbus=1.12.20-2ubuntu4.1 dmsetup=2:1.02.175-2.1ubuntu5 docker-buildx-plugin=0.25.0-1~ubuntu.22.04~jammy docker-ce-cli=5:28.3.1-1~ubuntu.22.04~jammy docker-ce-rootless-extras=5:28.3.1-1~ubuntu.22.04~jammy docker-ce=5:28.3.1-1~ubuntu.22.04~jammy docker-compose-plugin=2.38.1-1~ubuntu.22.04~jammy gir1.2-glib-2.0=1.72.0-1 git-man=1:2.34.1-1ubuntu1.13 git=1:2.34.1-1ubuntu1.13 iptables=1.8.7-1ubuntu5.2 less=590-1ubuntu0.22.04.3 libapparmor1=3.0.4-2ubuntu2.4 
libargon2-1=0~20171227-0.3 libbsd0=0.11.5-1 libcbor0.8=0.8.0-2ubuntu1 libcryptsetup12=2:2.4.3-1ubuntu1.3 libcurl3-gnutls=7.81.0-1ubuntu1.20 libdbus-1-3=1.12.20-2ubuntu4.1 libdevmapper1.02.1=2:1.02.175-2.1ubuntu5 libedit2=3.1-20210910-1build1 liberror-perl=0.17029-1 libfido2-1=1.10.0-1 libgdbm-compat4=1.23-1 libgdbm6=1.23-1 libgirepository-1.0-1=1.72.0-1 libglib2.0-0=2.72.4-0ubuntu2.5 libglib2.0-data=2.72.4-0ubuntu2.5 libicu70=70.1-2 libip4tc2=1.8.7-1ubuntu5.2 libip6tc2=1.8.7-1ubuntu5.2 libjson-c5=0.15-3~ubuntu1.22.04.2 libkmod2=29-1ubuntu1 libltdl7=2.4.6-15build2 libmd0=1.0.4-1build1 libmnl0=1.0.4-3build2 libnetfilter-conntrack3=1.0.9-1 libnfnetlink0=1.0.1-3build3 libnftnl11=1.2.1-1build1 libnss-systemd=249.11-0ubuntu3.16 libpam-systemd=249.11-0ubuntu3.16 libperl5.34=5.34.0-3ubuntu1.4 libslirp0=4.6.1-1build1 libx11-6=2:1.7.5-1ubuntu0.3 libx11-data=2:1.7.5-1ubuntu0.3 libxau6=1:1.0.9-1build5 libxcb1=1.14-3ubuntu3 libxdmcp6=1:1.1.3-0ubuntu5 libxext6=2:1.3.4-1build1 libxml2=2.9.13+dfsg-1ubuntu0.7 libxmuu1=2:1.1.3-3 libxtables12=1.8.7-1ubuntu5.2 netbase=6.3 networkd-dispatcher=2.1-2ubuntu0.22.04.2 openssh-client=1:8.9p1-3ubuntu0.13 patch=2.7.6-7build2 perl-base=5.34.0-3ubuntu1.4 perl-modules-5.34=5.34.0-3ubuntu1.4 perl=5.34.0-3ubuntu1.4 pigz=2.6-1 python3-dbus=1.2.18-3build1 python3-gi=3.42.1-0ubuntu1 shared-mime-info=2.1-2 slirp4netns=1.0.1-2 systemd-sysv=249.11-0ubuntu3.16 systemd-timesyncd=249.11-0ubuntu3.16 systemd=249.11-0ubuntu3.16 xauth=1:1.1-1build2 xdg-user-dirs=0.17-2ubuntu4 xz-utils=5.2.5-2ubuntu1 +RUN apt install -y apparmor containerd.io dbus-user-session dbus dmsetup docker-buildx-plugin docker-ce-cli docker-ce-rootless-extras docker-ce docker-compose-plugin gir1.2-glib-2.0 git-man git iptables less libapparmor1 libargon2-1 libbsd0 libcbor0.8 libcryptsetup12 libcurl3-gnutls libdbus-1-3 libdevmapper1.02.1 libedit2 liberror-perl libfido2-1 libgdbm-compat4 libgdbm6 libgirepository-1.0-1 libglib2.0-0 libglib2.0-data libicu70 libip4tc2 libip6tc2 libjson-c5 
libkmod2 libltdl7 libmd0 libmnl0 libnetfilter-conntrack3 libnfnetlink0 libnftnl11 libnss-systemd libpam-systemd libperl5.34 libslirp0 libx11-6 libx11-data libxau6 libxcb1 libxdmcp6 libxext6 libxml2 libxmuu1 libxtables12 netbase networkd-dispatcher openssh-client patch perl-base perl-modules-5.34 perl pigz python3-dbus python3-gi shared-mime-info slirp4netns systemd-sysv systemd-timesyncd systemd xauth xdg-user-dirs xz-utils # Clean up apt cache RUN rm -rf /var/lib/apt/lists/* @@ -71,4 +71,4 @@ RUN mkdir -p /fl_admin/transfer RUN ln -s /MediSwarm /fl_admin/transfer/MediSwarm # Copy pre-trained model weights to image -COPY ./torch_home_cache /torch_home +COPY ./torch_home_cache /torch_home \ No newline at end of file From d0f7a65d318c36cf9560e4d26b332c37f8a6cbc1 Mon Sep 17 00:00:00 2001 From: GitHub CI Date: Thu, 10 Jul 2025 11:15:36 +0200 Subject: [PATCH 036/337] fix: pin APT package versions in Dockerfile for consistent builds Signed-off-by: GitHub CI --- docker_config/Dockerfile_ODELIA | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index eda17b9b..a61e9505 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -12,13 +12,13 @@ ENV PYTHON_VERSION=3.10.14 # Install updates of installed packages RUN apt update -RUN apt install -y apt apt-utils libapt-pkg6.0 +RUN apt install -y apt=2.4.14 apt-utils=2.4.14 libapt-pkg6.0=2.4.14 # Update versions of installed packages -RUN apt install -y base-files bash bsdutils ca-certificates coreutils dpkg e2fsprogs gpgv libblkid1 libc-bin libc-dev-bin libc6-dev libc6 libcap2 libcom-err2 libext2fs2 libgnutls30 libgssapi-krb5-2 libk5crypto3 libkrb5-3 libkrb5support0 libmount1 libpam-modules-bin libpam-modules libpam-runtime libpam0g libseccomp2 libsmartcols1 libss2 libssl3 libsystemd0 libtasn1-6 libudev1 libuuid1 linux-libc-dev login logsave mount openssl passwd util-linux +RUN apt install -y 
base-files=12ubuntu4.7 bash=5.1-6ubuntu1.1 bsdutils=1:2.37.2-4ubuntu3.4 ca-certificates=20240203~22.04.1 coreutils=8.32-4.1ubuntu1.2 dpkg=1.21.1ubuntu2.3 e2fsprogs=1.46.5-2ubuntu1.2 gpgv=2.2.27-3ubuntu2.4 libblkid1=2.37.2-4ubuntu3.4 libc-bin=2.35-0ubuntu3.10 libc-dev-bin=2.35-0ubuntu3.10 libc6-dev=2.35-0ubuntu3.10 libc6=2.35-0ubuntu3.10 libcap2=1:2.44-1ubuntu0.22.04.2 libcom-err2=1.46.5-2ubuntu1.2 libext2fs2=1.46.5-2ubuntu1.2 libgnutls30=3.7.3-4ubuntu1.6 libgssapi-krb5-2=1.19.2-2ubuntu0.7 libk5crypto3=1.19.2-2ubuntu0.7 libkrb5-3=1.19.2-2ubuntu0.7 libkrb5support0=1.19.2-2ubuntu0.7 libmount1=2.37.2-4ubuntu3.4 libpam-modules-bin=1.4.0-11ubuntu2.6 libpam-modules=1.4.0-11ubuntu2.6 libpam-runtime=1.4.0-11ubuntu2.6 libpam0g=1.4.0-11ubuntu2.6 libseccomp2=2.5.3-2ubuntu3~22.04.1 libsmartcols1=2.37.2-4ubuntu3.4 libss2=1.46.5-2ubuntu1.2 libssl3=3.0.2-0ubuntu1.19 libsystemd0=249.11-0ubuntu3.16 libtasn1-6=4.18.0-4ubuntu0.1 libudev1=249.11-0ubuntu3.16 libuuid1=2.37.2-4ubuntu3.4 linux-libc-dev=5.15.0-143.153 login logsave=1.46.5-2ubuntu1.2 mount=2.37.2-4ubuntu3.4 openssl=3.0.2-0ubuntu1.19 passwd util-linux=2.37.2-4ubuntu3.4 # Install apt-transport-https curl gnupg lsb-release zip and dependencies at defined versions -RUN apt install -y apt-transport-https curl dirmngr distro-info-data gnupg-l10n gnupg-utils gnupg gpg-agent gpg-wks-client gpg-wks-server gpg gpgconf gpgsm libassuan0 libbrotli1 libcurl4 libexpat1 libksba8 libldap-2.5-0 libldap-common libmpdec3 libnghttp2-14 libnpth0 libpsl5 libpython3-stdlib libpython3.10-minimal libpython3.10-stdlib libreadline8 librtmp1 libsasl2-2 libsasl2-modules-db libsasl2-modules libsqlite3-0 libssh-4 lsb-release media-types pinentry-curses publicsuffix python3-minimal python3.10-minimal python3.10 python3 readline-common unzip zip +RUN apt install -y apt-transport-https=2.4.14 curl=7.81.0-1ubuntu1.20 dirmngr=2.2.27-3ubuntu2.4 distro-info-data=0.52ubuntu0.9 gnupg-l10n=2.2.27-3ubuntu2.4 gnupg-utils=2.2.27-3ubuntu2.4 gnupg=2.2.27-3ubuntu2.4 
gpg-agent=2.2.27-3ubuntu2.4 gpg-wks-client=2.2.27-3ubuntu2.4 gpg-wks-server=2.2.27-3ubuntu2.4 gpg=2.2.27-3ubuntu2.4 gpgconf=2.2.27-3ubuntu2.4 gpgsm=2.2.27-3ubuntu2.4 libassuan0=2.5.5-1build1 libbrotli1=1.0.9-2build6 libcurl4=7.81.0-1ubuntu1.20 libexpat1=2.4.7-1ubuntu0.6 libksba8=1.6.0-2ubuntu0.2 libldap-2.5-0=2.5.19+dfsg-0ubuntu0.22.04.1 libldap-common=2.5.19+dfsg-0ubuntu0.22.04.1 libmpdec3=2.5.1-2build2 libnghttp2-14=1.43.0-1ubuntu0.2 libnpth0=1.6-3build2 libpsl5=0.21.0-1.2build2 libpython3-stdlib=3.10.6-1~22.04.1 libpython3.10-minimal=3.10.12-1~22.04.10 libpython3.10-stdlib=3.10.12-1~22.04.10 libreadline8=8.1.2-1 librtmp1=2.4+20151223.gitfa8646d.1-2build4 libsasl2-2=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules-db=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules=2.1.27+dfsg2-3ubuntu1.2 libsqlite3-0=3.37.2-2ubuntu0.4 libssh-4=0.9.6-2ubuntu0.22.04.4 lsb-release=11.1.0ubuntu4 media-types=7.0.0 pinentry-curses=1.1.1-1build2 publicsuffix=20211207.1025-1 python3-minimal=3.10.6-1~22.04.1 python3.10-minimal=3.10.12-1~22.04.10 python3.10=3.10.12-1~22.04.10 python3=3.10.6-1~22.04.1 readline-common=8.1.2-1 unzip=6.0-26ubuntu3.2 zip=3.0-12build2 # Prepare Docker installation RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc \ @@ -27,7 +27,7 @@ RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings && apt update # Install docker-ce docker-ce-cli containerd.io and dependencies at fixed versions -RUN apt install -y apparmor containerd.io dbus-user-session dbus dmsetup docker-buildx-plugin docker-ce-cli docker-ce-rootless-extras docker-ce docker-compose-plugin gir1.2-glib-2.0 git-man git iptables less libapparmor1 libargon2-1 libbsd0 libcbor0.8 libcryptsetup12 libcurl3-gnutls libdbus-1-3 libdevmapper1.02.1 libedit2 liberror-perl libfido2-1 libgdbm-compat4 libgdbm6 libgirepository-1.0-1 libglib2.0-0 libglib2.0-data libicu70 libip4tc2 libip6tc2 libjson-c5 libkmod2 libltdl7 libmd0 libmnl0 libnetfilter-conntrack3 libnfnetlink0 
libnftnl11 libnss-systemd libpam-systemd libperl5.34 libslirp0 libx11-6 libx11-data libxau6 libxcb1 libxdmcp6 libxext6 libxml2 libxmuu1 libxtables12 netbase networkd-dispatcher openssh-client patch perl-base perl-modules-5.34 perl pigz python3-dbus python3-gi shared-mime-info slirp4netns systemd-sysv systemd-timesyncd systemd xauth xdg-user-dirs xz-utils +RUN apt install -y apparmor=3.0.4-2ubuntu2.4 containerd.io=1.7.27-1 dbus-user-session=1.12.20-2ubuntu4.1 dbus=1.12.20-2ubuntu4.1 dmsetup=2:1.02.175-2.1ubuntu5 docker-buildx-plugin docker-ce-cli docker-ce-rootless-extras docker-ce docker-compose-plugin gir1.2-glib-2.0=1.72.0-1 git-man=1:2.34.1-1ubuntu1.14 git=1:2.34.1-1ubuntu1.14 iptables=1.8.7-1ubuntu5.2 less=590-1ubuntu0.22.04.3 libapparmor1=3.0.4-2ubuntu2.4 libargon2-1=0~20171227-0.3 libbsd0=0.11.5-1 libcbor0.8=0.8.0-2ubuntu1 libcryptsetup12=2:2.4.3-1ubuntu1.3 libcurl3-gnutls=7.81.0-1ubuntu1.20 libdbus-1-3=1.12.20-2ubuntu4.1 libdevmapper1.02.1=2:1.02.175-2.1ubuntu5 libedit2=3.1-20210910-1build1 liberror-perl=0.17029-1 libfido2-1=1.10.0-1 libgdbm-compat4=1.23-1 libgdbm6=1.23-1 libgirepository-1.0-1=1.72.0-1 libglib2.0-0=2.72.4-0ubuntu2.5 libglib2.0-data=2.72.4-0ubuntu2.5 libicu70=70.1-2 libip4tc2=1.8.7-1ubuntu5.2 libip6tc2=1.8.7-1ubuntu5.2 libjson-c5=0.15-3~ubuntu1.22.04.2 libkmod2=29-1ubuntu1 libltdl7=2.4.6-15build2 libmd0=1.0.4-1build1 libmnl0=1.0.4-3build2 libnetfilter-conntrack3=1.0.9-1 libnfnetlink0=1.0.1-3build3 libnftnl11=1.2.1-1build1 libnss-systemd=249.11-0ubuntu3.16 libpam-systemd=249.11-0ubuntu3.16 libperl5.34=5.34.0-3ubuntu1.4 libslirp0=4.6.1-1build1 libx11-6=2:1.7.5-1ubuntu0.3 libx11-data=2:1.7.5-1ubuntu0.3 libxau6=1:1.0.9-1build5 libxcb1=1.14-3ubuntu3 libxdmcp6=1:1.1.3-0ubuntu5 libxext6=2:1.3.4-1build1 libxml2=2.9.13+dfsg-1ubuntu0.7 libxmuu1=2:1.1.3-3 libxtables12=1.8.7-1ubuntu5.2 netbase=6.3 networkd-dispatcher=2.1-2ubuntu0.22.04.2 openssh-client=1:8.9p1-3ubuntu0.13 patch=2.7.6-7build2 perl-base=5.34.0-3ubuntu1.4 perl-modules-5.34=5.34.0-3ubuntu1.4 
perl=5.34.0-3ubuntu1.4 pigz=2.6-1 python3-dbus=1.2.18-3build1 python3-gi=3.42.1-0ubuntu1 shared-mime-info=2.1-2 slirp4netns=1.0.1-2 systemd-sysv=249.11-0ubuntu3.16 systemd-timesyncd=249.11-0ubuntu3.16 systemd=249.11-0ubuntu3.16 xauth=1:1.1-1build2 xdg-user-dirs=0.17-2ubuntu4 xz-utils=5.2.5-2ubuntu1 # Clean up apt cache RUN rm -rf /var/lib/apt/lists/* @@ -71,4 +71,4 @@ RUN mkdir -p /fl_admin/transfer RUN ln -s /MediSwarm /fl_admin/transfer/MediSwarm # Copy pre-trained model weights to image -COPY ./torch_home_cache /torch_home \ No newline at end of file +COPY ./torch_home_cache /torch_home From 055ad7a0ad6abfc6426c980c993955da6e364fb6 Mon Sep 17 00:00:00 2001 From: GitHub CI Date: Thu, 10 Jul 2025 12:16:03 +0200 Subject: [PATCH 037/337] chore: copy pre-trained model weights to Docker image Signed-off-by: GitHub CI --- docker_config/Dockerfile_ODELIA | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 476a3044..7fbc0bc3 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -69,3 +69,6 @@ RUN rm -rf /workspace/controller COPY ./MediSwarm /MediSwarm RUN mkdir -p /fl_admin/transfer RUN ln -s /MediSwarm /fl_admin/transfer/MediSwarm + +# Copy pre-trained model weights to image +COPY ./torch_home_cache /torch_home From de3d561ea0f0300ce52d8470ab372c7b20eec04a Mon Sep 17 00:00:00 2001 From: JieFu Zhu Date: Thu, 10 Jul 2025 12:57:32 +0200 Subject: [PATCH 038/337] docs: add developer and operator usage guides to README files Signed-off-by: GitHub CI --- README.md | 306 ++--------------------- README_old.md | 362 ++++++++++++++++++++++++++++ assets/readme/README.developer.md | 65 +++++ assets/readme/README.operator.md | 85 +++++++ assets/readme/README.participant.md | 145 +++++++++++ 5 files changed, 683 insertions(+), 280 deletions(-) create mode 100644 README_old.md create mode 100644 assets/readme/README.developer.md create mode 100644 assets/readme/README.operator.md 
create mode 100644 assets/readme/README.participant.md diff --git a/README.md b/README.md index 86533caf..fecfbc6a 100644 --- a/README.md +++ b/README.md @@ -1,297 +1,43 @@ -# Introduction -MediSwarm is an open-source project dedicated to advancing medical deep learning through swarm intelligence, leveraging the NVFlare platform. Developed in collaboration with the Odelia consortium, this repository aims to create a decentralized and collaborative framework for medical research and applications. +# MediSwarm -## Key Features -- **Swarm Learning:** Utilizes swarm intelligence principles to improve model performance and adaptability. -- **NVFlare Integration:** Built on NVFlare, providing robust and scalable federated learning capabilities. -- **Data Privacy:** Ensures data security and compliance with privacy regulations by keeping data local to each institution. -- **Collaborative Research:** Facilitates collaboration among medical researchers and institutions for enhanced outcomes. -- **Extensible Framework:** Designed to support various medical applications and easily integrate with existing workflows. +An open-source platform advancing medical AI via privacy-preserving swarm learning, based on NVFlare and developed with +the ODELIA consortium. -## Prerequisites -### Hardware recommendations -* 64 GB of RAM (32 GB is the absolute minimum) -* 16 CPU cores (8 is the absolute minimum) -* an NVIDIA GPU with 48 GB of RAM (24 GB is the minimum) -* 8 TB of Storage (4 TB is the absolute minimum) +[![PR Tests]([pr-test.yaml](.github/workflows/pr-test.yaml)) +[![Docker Build]([update-apt-versions.yml](.github/workflows/update-apt-versions.yml)) -We demonstrate that the system can run on lightweight hardware like this. For less than 10k EUR, you can configure systems from suppliers like Lambda, Dell Precision, and Dell Alienware. 
+## Quick Start for Your Role -### Operating System -* Ubuntu 20.04 LTS +Choose your role and follow the instructions: -### Software -* Docker -* openvpn -* git +- [Swarm Participant (Medical Site / Data Scientist)](assets/readme/README.participant.md) +- [Developer (Docker, Code, Pipeline)](assets/readme/README.developer.md) +- [Swarm Operator (Provisioning, VPN, Server)](assets/readme/README.operator.md) -### Cloning the repository - ```bash - git clone https://github.com/KatherLab/MediSwarm.git --recurse-submodules - ``` -* The last argument is necessary because we are using a git submodule for the (ODELIA fork of NVFlare)[https://github.com/KatherLab/NVFlare_MediSwarm] -* If you have cloned it without this argument, use `git submodule update --init --recursive` +## Overview -### VPN -A VPN is necessary so that the swarm nodes can communicate with each other securely across firewalls. For that purpose, -1. Install OpenVPN - ```bash - sudo apt-get install openvpn - ``` -2. If you have a graphical user interface(GUI), follow this guide to connect to the VPN: [VPN setup guide(GUI).pdf](assets/VPN%20setup%20guide%28GUI%29.pdf) -3. If you have a command line interface(CLI), follow this guide to connect to the VPN: [VPN setup guide(CLI).md](assets/VPN%20setup%20guide%28CLI%29.md) +MediSwarm enables: -# Usage for Swarm Participants -## Setup -1. Make sure your compute node satisfies the specification and has the necessary software installed. -2. Clone the repository and connect the client node to the VPN as described above. TODO is cloning the repository - necessary for swarm participants? -3. TODO anything else? +- **Privacy-preserving training** of deep learning models on distributed medical datasets +- **Decentralized collaboration** between institutions +- **Dockerized, reproducible** experiments built on NVFlare -## Prepare Dataset +## License -1.
see Step 3: Prepare Data in (this document)[application/jobs/ODELIA_ternary_classification/app/scripts/README.md] +MIT — see [LICENSE](LICENSE). -## Prepare Training Participation -1. Extract startup kit provided by swarm operator +## Maintainers -## Run Pre-Flight Check -1. Directories - ```bash - export SITE_NAME= # TODO should be defined above, also needed for dataset location - export DATADIR= - export SCRATCHDIR= - ``` -2. From the directory where you unpacked the startup kit, - ```bash - cd $SITE_NAME/startup - ``` -3. Verify that your Docker/GPU setup is working - ```bash - ./docker.sh --scratch_dir $SCRATCHDIR --GPU device=0 --dummy_training - ``` - * This will pull the Docker image, which might take a while. - * If you have multiple GPUs and 0 is busy, use a different one. - * The “training” itself should take less than minute and does not yield a meaningful classification performance. -4. Verify that your local data can be accessed and the model can be trained locally - ```bash - ./docker.sh --data_dir $DATADIR --scratch_dir $SCRATCHDIR --GPU device=0 --preflight_check - ``` - * Training time depends on the size of the local dataset. +- [Jeff](https://github.com/Ultimate-Storm) +- [Ole Schwen](mailto:ole.schwen@mevis.fraunhofer.de) +- [Steffen Renisch](mailto:steffen.renisch@mevis.fraunhofer.de) -## Configurable Parameters for docker.sh +## Contributing -TODO consider what should be described and recommended as configurable here, given that the goal of the startup kits is -to ensure everyone runs the same training +Contributions welcome! [Open an issue](https://github.com/KatherLab/MediSwarm/issues) or submit a PR. -When launching the client using `./docker.sh`, the following environment variables are automatically passed into the -container. 
You can override them to customize training behavior: +## Credits -| Environment Variable | Default | Description | -|----------------------|-----------------|----------------------------------------------------------------------| -| `SITE_NAME` | *from flag* | Name of your local site, e.g. `TUD_1`, passed via `--start_client` | -| `DATA_DIR` | *from flag* | Path to the host folder that contains your local data | -| `SCRATCH_DIR` | *from flag* | Path for saving training outputs and temporary files | -| `GPU_DEVICE` | `device=0` | GPU identifier to use inside the container (or `all`) | -| `MODEL` | `MST` | Model architecture, choices: `MST`, `ResNet` | -| `INSTITUTION` | `ODELIA` | Institution name, used to group experiment logs | -| `CONFIG` | `unilateral` | Configuration schema for dataset (e.g. label scheme) | -| `NUM_EPOCHS` | `1` (test mode) | Number of training epochs (used in preflight/local training) | -| `TRAINING_MODE` | derived | Internal use. Automatically set based on flags like `--start_client` | +Built on: -These are injected into the container as `--env` variables. You can modify their defaults by editing `docker.sh` or -exporting before run: - -```bash -export MODEL=ResNet -export CONFIG=original -./docker.sh --data_dir $DATADIR --scratch_dir $SCRATCHDIR --GPU device=1 --start_client -``` - -## Start Swarm Node - -1. From the directory where you unpacked the startup kit: - ```bash - cd $SITE_NAME/startup # Skip this if you just ran the pre-flight check - ``` - -2. Start the client: - ```bash - ./docker.sh --data_dir $DATADIR --scratch_dir $SCRATCHDIR --GPU device=0 --start_client - ``` - If you have multiple GPUs and 0 is busy, use a different one. - -3. Console output is captured in `nohup.out`, which may have been created with limited permissions in the container, so - make it readable if necessary: - ```bash - sudo chmod a+r nohup.out - ``` - -4. 
Output files: - - **Training logs and checkpoints** are saved under: - ``` - $SCRATCHDIR/runs/$SITE_NAME// - ``` - - **Best checkpoint** usually saved as `best.ckpt` or `last.ckpt` - - **Prediction results**, if enabled, will appear in subfolders of the same directory - - **TensorBoard logs**, if activated, are stored in their respective folders inside the run directory - - TODO what is enabled/activated should be hard-coded, adapt accordingly - -5. (Optional) You can verify that the container is running properly: - ```bash - docker ps # Check if odelia_swarm_client_$SITE_NAME is listed - nvidia-smi # Check if the GPU is busy training (it will be idling while waiting for model transfer) - tail -f nohup.out # Follow training log - ``` - -## Run Local Training -1. From the directory where you unpacked the startup kit - ```bash - cd $SITE_NAME/startup - ``` -2. Start local training - ```bash - /docker.sh --data_dir $DATADIR --scratch_dir $SCRATCHDIR --GPU all --local_training - ``` - * TODO update when handling of the number of epochs has been implemented -3. Output files - * TODO describe - -# Usage for MediSwarm and Application Code Developers -## Versioning of ODELIA Docker Images -If needed, update the version number in file (odelia_image.version)[odelia_image.version]. It will be used automatically for the Docker image and startup kits. - -## Build the Docker Image and Startup Kits -The Docker image contains all dependencies for administrative purposes (dashboard, command-line provisioning, admin console, server) as well as for running the 3DCNN pipeline under the pytorch-lightning framework. -The project description specifies the swarm nodes etc. to be used for a swarm training. - ```bash - cd MediSwarm - ./buildDockerImageAndStartupKits.sh -p application/provision/ - ``` - -1. Make sure you have no uncommitted changes. -2. If package versions are still not available, you may have to check what the current version is and update the `Dockerfile` accordingly. 
Version numbers are hard-coded to avoid issues due to silently different versions being installed. -3. After successful build (and after verifying that everything works as expected, i.e., local tests, building startup kits, running local trainings in the startup kit), you can manually push the image to DockerHub, provided you have the necessary rights. Make sure you are not re-using a version number for this purpose. - -## Running Local Tests - ```bash - ./runTestsInDocker.sh - ``` - -You should see -1. several expected errors and warnings printed from unit tests that should succeed overall, and a coverage report -2. output of a successful simulation run with two nodes -3. output of a successful proof-of-concept run run with two nodes -4. output of a set of startup kits being generated -5. output of a dummy training run using one of the startup kits -6. TODO update this to what the tests output now - -Optionally, uncomment running NVFlare unit tests in `_runTestsInsideDocker.sh`. - -## Distributing Startup Kits -Distribute the startup kits to the clients. - -## Running the Application -1. **CIFAR-10 example:** - See [cifar10/README.md](application/jobs/cifar10/README.md) -2. **Minimal PyTorch CNN example:** - See [application/jobs/minimal_training_pytorch_cnn/README.md](application/jobs/minimal_training_pytorch_cnn/README.md) -3. **3D CNN for classifying breast tumors:** - See [ODELIA_ternary_classification/README.md](application/jobs/ODELIA_ternary_classification/README.md) - -## Contributing Application Code -1. Take a look at application/jobs/minimal_training_pytorch_cnn for a minimal example how pytorch code can be adapted to work with NVFlare -2. Take a look at application/jobs/ODELIA_ternary_classification for a more relastic example of pytorch code that can - run in the swarm -3. Use the local tests to check if the code is swarm-ready -4. 
TODO more detailed instructions - -# Usage for Swarm Operators -## Setting up a Swarm -Production mode is designed for secure, real-world deployments. It supports both local and remote setups, whether on-premise or in the cloud. For more details, refer to the [NVFLARE Production Mode](https://nvflare.readthedocs.io/en/2.4.1/real_world_fl.html). - -To set up production mode, follow these steps: - -## Edit `/etc/hosts` -Ensure that your `/etc/hosts` file includes the correct host mappings. All hosts need to be able to communicate to the server node. - -For example, add the following line (replace `` with the server's actual IP address): - -```plaintext - dl3.tud.de dl3 -``` - -## Create Startup Kits -### Via Script (recommended) -1. Use, e.g., the file `application/provision/project_MEVIS_test.yml`, adapt as needed (network protocol etc.) -2. Call `buildStartupKits.sh /path/to/project_configuration.yml` to build the startup kits -3. Startup kits are generated to `workspace//prod_00/` -4. Deploy startup kits to the respective server/clients - -### Via the Dashboard (not recommended) -```bash -docker run -d --rm \ - --ipc=host -p 8443:8443 \ - --name=odelia_swarm_admin \ - -v /var/run/docker.sock:/var/run/docker.sock \ - \ - /bin/bash -c "nvflare dashboard --start --local --cred :" -``` -using some credentials chosen for the swarm admin account. - -Access the dashboard in a web browser at `https://localhost:8443` log in with these credentials, and configure the project: -1. enter project short name, name, description -2. enter docker download link: jefftud/odelia: -3. if needed, enter dates -4. click save -5. Server Configuration > Server (DNS name): -6. click make project public - -#### Register client per site -Access the dashboard at `https://:8443`. - -1. register a user -2. enter organziation (corresponding to the site) -3. enter role (e.g., org admin) -4. add a site (note: must not contain spaces, best use alphanumerical name) -5. 
specify number of GPUs and their memory - -#### Approve clients and finish configuration -Access the dashboard at `https://localhost:8443` log in with the admin credentials. -1. Users Dashboard > approve client user -2. Client Sites > approve client sites -3. Project Home > freeze project - -## Download startup kits -After setting up the project admin configuration, server and clients can download their startup kits. Store the passwords somewhere, they are only displayed once (or you can download them again). - -## Starting a Swarm Training -1. Connect the *server* host to the VPN as described above. -2. Start the *server* startup kit using the respective `startup/docker.sh` script with the option to start the server -3. Provide the *client* startup kits to the swarm participants (be aware that email providers or other channels may prevent encrypted archives) -4. Make sure the participants have started their clients via the respective startup kits, see below -5. Start the *admin* startup kit using the respective `startup/docker.sh` script to start the admin console -6. Deploy a job by `submit_job ` - - -# License -This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. - -# Maintainers -[Jeff](https://github.com/Ultimate-Storm) -[Ole Schwen](mailto:ole.schwen@mevis.fraunhofer.de) -[Steffen Renisch](mailto:steffen.renisch@mevis.fraunhofer.de) - -# Contributing -Feel free to dive in! [Open an issue](https://github.com/KatherLab/MediSwarm/issues) or submit pull requests. - -# Credits -This project utilizes platforms and resources from the following repositories: - -- **[NVFLARE](https://github.com/NVIDIA/NVFlare)**: NVFLARE (NVIDIA Federated Learning Application Runtime Environment) is an open-source framework that provides a robust and scalable platform for federated learning applications. We have integrated NVFLARE to efficiently handle the federated learning aspects of our project. 
- -Special thanks to the contributors and maintainers of these repositories for their valuable work and support. - ---- - -For more details about NVFLARE and its features, please visit the [NVFLARE GitHub repository](https://github.com/NVIDIA/NVFlare). +- [NVFLARE](https://github.com/NVIDIA/NVFlare) diff --git a/README_old.md b/README_old.md new file mode 100644 index 00000000..516d4d0b --- /dev/null +++ b/README_old.md @@ -0,0 +1,362 @@ +# Introduction + +MediSwarm is an open-source project dedicated to advancing medical deep learning through swarm intelligence, leveraging +the NVFlare platform. Developed in collaboration with the Odelia consortium, this repository aims to create a +decentralized and collaborative framework for medical research and applications. + +## Key Features + +- **Swarm Learning:** Utilizes swarm intelligence principles to improve model performance and adaptability. +- **NVFlare Integration:** Built on NVFlare, providing robust and scalable federated learning capabilities. +- **Data Privacy:** Ensures data security and compliance with privacy regulations by keeping data local to each + institution. +- **Collaborative Research:** Facilitates collaboration among medical researchers and institutions for enhanced + outcomes. +- **Extensible Framework:** Designed to support various medical applications and easily integrate with existing + workflows. + +## Prerequisites + +### Hardware recommendations + +* 64 GB of RAM (32 GB is the absolute minimum) +* 16 CPU cores (8 is the absolute minimum) +* an NVIDIA GPU with 48 GB of RAM (24 GB is the minimum) +* 8 TB of Storage (4 TB is the absolute minimum) + +We demonstrate that the system can run on lightweight hardware like this. For less than 10k EUR, you can configure +systems from suppliers like Lambda, Dell Precision, and Dell Alienware. 
+ +### Operating System + +* Ubuntu 20.04 LTS + +### Software + +* Docker +* openvpn +* git + +### Cloning the repository + + ```bash + git clone https://github.com/KatherLab/MediSwarm.git --recurse-submodules + ``` + +* The last argument is necessary because we are using a git submodule for the [ODELIA fork of + NVFlare](https://github.com/KatherLab/NVFlare_MediSwarm) +* If you have cloned it without this argument, use `git submodule update --init --recursive` + +### VPN + +A VPN is necessary so that the swarm nodes can communicate with each other securely across firewalls. For that purpose, + +1. Install OpenVPN + ```bash + sudo apt-get install openvpn + ``` +2. If you have a graphical user interface(GUI), follow this guide to connect to the + VPN: [VPN setup guide(GUI).pdf](assets/VPN%20setup%20guide%28GUI%29.pdf) +3. If you have a command line interface(CLI), follow this guide to connect to the + VPN: [VPN setup guide(CLI).md](assets/VPN%20setup%20guide%28CLI%29.md) + +# Usage for Swarm Participants + +## Setup + +1. Make sure your compute node satisfies the specification and has the necessary software installed. +2. Clone the repository and connect the client node to the VPN as described above. TODO is cloning the repository + necessary for swarm participants? +3. TODO anything else? + +## Prepare Dataset + +1. see Step 3: Prepare Data in [this document](application/jobs/ODELIA_ternary_classification/app/scripts/README.md) + +## Prepare Training Participation + +1. Extract startup kit provided by swarm operator + +## Run Pre-Flight Check + +1. Directories + ```bash + export SITE_NAME= # TODO should be defined above, also needed for dataset location + export DATADIR= + export SCRATCHDIR= + ``` +2. From the directory where you unpacked the startup kit, + ```bash + cd $SITE_NAME/startup + ``` +3.
Verify that your Docker/GPU setup is working + ```bash + ./docker.sh --scratch_dir $SCRATCHDIR --GPU device=0 --dummy_training + ``` + * This will pull the Docker image, which might take a while. + * If you have multiple GPUs and 0 is busy, use a different one. + * The “training” itself should take less than a minute and does not yield a meaningful classification performance. +4. Verify that your local data can be accessed and the model can be trained locally + ```bash + ./docker.sh --data_dir $DATADIR --scratch_dir $SCRATCHDIR --GPU device=0 --preflight_check + ``` + * Training time depends on the size of the local dataset. + +## Configurable Parameters for docker.sh + +TODO consider what should be described and recommended as configurable here, given that the goal of the startup kits is +to ensure everyone runs the same training + +When launching the client using `./docker.sh`, the following environment variables are automatically passed into the +container. You can override them to customize training behavior: + +| Environment Variable | Default | Description | +|----------------------|-----------------|----------------------------------------------------------------------| +| `SITE_NAME` | *from flag* | Name of your local site, e.g. `TUD_1`, passed via `--start_client` | +| `DATA_DIR` | *from flag* | Path to the host folder that contains your local data | +| `SCRATCH_DIR` | *from flag* | Path for saving training outputs and temporary files | +| `GPU_DEVICE` | `device=0` | GPU identifier to use inside the container (or `all`) | +| `MODEL` | `MST` | Model architecture, choices: `MST`, `ResNet` | +| `INSTITUTION` | `ODELIA` | Institution name, used to group experiment logs | +| `CONFIG` | `unilateral` | Configuration schema for dataset (e.g. label scheme) | +| `NUM_EPOCHS` | `1` (test mode) | Number of training epochs (used in preflight/local training) | +| `TRAINING_MODE` | derived | Internal use.
Automatically set based on flags like `--start_client` | + +These are injected into the container as `--env` variables. You can modify their defaults by editing `docker.sh` or +exporting before run: + +```bash +export MODEL=ResNet +export CONFIG=original +./docker.sh --data_dir $DATADIR --scratch_dir $SCRATCHDIR --GPU device=1 --start_client +``` + +## Start Swarm Node + +1. From the directory where you unpacked the startup kit: + ```bash + cd $SITE_NAME/startup # Skip this if you just ran the pre-flight check + ``` + +2. Start the client: + ```bash + ./docker.sh --data_dir $DATADIR --scratch_dir $SCRATCHDIR --GPU device=0 --start_client + ``` + If you have multiple GPUs and 0 is busy, use a different one. + +3. Console output is captured in `nohup.out`, which may have been created with limited permissions in the container, so + make it readable if necessary: + ```bash + sudo chmod a+r nohup.out + ``` + +4. Output files: + - **Training logs and checkpoints** are saved under: + ``` + $SCRATCHDIR/runs/$SITE_NAME// + ``` + - **Best checkpoint** usually saved as `best.ckpt` or `last.ckpt` + - **Prediction results**, if enabled, will appear in subfolders of the same directory + - **TensorBoard logs**, if activated, are stored in their respective folders inside the run directory + - TODO what is enabled/activated should be hard-coded, adapt accordingly + +5. (Optional) You can verify that the container is running properly: + ```bash + docker ps # Check if odelia_swarm_client_$SITE_NAME is listed + nvidia-smi # Check if the GPU is busy training (it will be idling while waiting for model transfer) + tail -f nohup.out # Follow training log + ``` + +## Run Local Training + +1. From the directory where you unpacked the startup kit + ```bash + cd $SITE_NAME/startup + ``` +2. Start local training + ```bash + ./docker.sh --data_dir $DATADIR --scratch_dir $SCRATCHDIR --GPU all --local_training + ``` + * TODO update when handling of the number of epochs has been implemented +3.
Output files + * TODO describe + +# Usage for MediSwarm and Application Code Developers + +## Versioning of ODELIA Docker Images + +If needed, update the version number in file [odelia_image.version](odelia_image.version). It will be used automatically +for the Docker image and startup kits. + +## Build the Docker Image and Startup Kits + +The Docker image contains all dependencies for administrative purposes (dashboard, command-line provisioning, admin +console, server) as well as for running the 3DCNN pipeline under the pytorch-lightning framework. +The project description specifies the swarm nodes etc. to be used for a swarm training. + +```bash +cd MediSwarm +./buildDockerImageAndStartupKits.sh -p application/provision/ +``` + +1. Make sure you have no uncommitted changes. +2. If package versions are still not available, you may have to check what the current version is and update the + `Dockerfile` accordingly. Version numbers are hard-coded to avoid issues due to silently different versions being + installed. +3. After successful build (and after verifying that everything works as expected, i.e., local tests, building startup + kits, running local trainings in the startup kit), you can manually push the image to DockerHub, provided you have + the necessary rights. Make sure you are not re-using a version number for this purpose. + +## Running Local Tests + + ```bash + ./runTestsInDocker.sh + ``` + +You should see + +1. several expected errors and warnings printed from unit tests that should succeed overall, and a coverage report +2. output of a successful simulation run with two nodes +3. output of a successful proof-of-concept run with two nodes +4. output of a set of startup kits being generated +5. output of a dummy training run using one of the startup kits +6. TODO update this to what the tests output now + +Optionally, uncomment running NVFlare unit tests in `_runTestsInsideDocker.sh`.
+ +## Distributing Startup Kits + +Distribute the startup kits to the clients. + +## Running the Application + +1. **CIFAR-10 example:** + See [cifar10/README.md](application/jobs/cifar10/README.md) +2. **Minimal PyTorch CNN example:** + See [application/jobs/minimal_training_pytorch_cnn/README.md](application/jobs/minimal_training_pytorch_cnn/README.md) +3. **3D CNN for classifying breast tumors:** + See [ODELIA_ternary_classification/README.md](application/jobs/ODELIA_ternary_classification/README.md) + +## Contributing Application Code + +1. Take a look at application/jobs/minimal_training_pytorch_cnn for a minimal example of how pytorch code can be adapted to + work with NVFlare +2. Take a look at application/jobs/ODELIA_ternary_classification for a more realistic example of pytorch code that can + run in the swarm +3. Use the local tests to check if the code is swarm-ready +4. TODO more detailed instructions + +# Usage for Swarm Operators + +## Setting up a Swarm + +Production mode is designed for secure, real-world deployments. It supports both local and remote setups, whether +on-premise or in the cloud. For more details, refer to +the [NVFLARE Production Mode](https://nvflare.readthedocs.io/en/2.4.1/real_world_fl.html). + +To set up production mode, follow these steps: + +## Edit `/etc/hosts` + +Ensure that your `/etc/hosts` file includes the correct host mappings. All hosts need to be able to communicate to the +server node. + +For example, add the following line (replace `` with the server's actual IP address): + +```plaintext + dl3.tud.de dl3 +``` + +## Create Startup Kits + +### Via Script (recommended) + +1. Use, e.g., the file `application/provision/project_MEVIS_test.yml`, adapt as needed (network protocol etc.) +2. Call `buildStartupKits.sh /path/to/project_configuration.yml` to build the startup kits +3. Startup kits are generated to `workspace//prod_00/` +4.
Deploy startup kits to the respective server/clients + +### Via the Dashboard (not recommended) + +```bash +docker run -d --rm \ + --ipc=host -p 8443:8443 \ + --name=odelia_swarm_admin \ + -v /var/run/docker.sock:/var/run/docker.sock \ + \ + /bin/bash -c "nvflare dashboard --start --local --cred :" +``` + +using some credentials chosen for the swarm admin account. + +Access the dashboard in a web browser at `https://localhost:8443` log in with these credentials, and configure the +project: + +1. enter project short name, name, description +2. enter docker download link: jefftud/odelia: +3. if needed, enter dates +4. click save +5. Server Configuration > Server (DNS name): +6. click make project public + +#### Register client per site + +Access the dashboard at `https://:8443`. + +1. register a user +2. enter organziation (corresponding to the site) +3. enter role (e.g., org admin) +4. add a site (note: must not contain spaces, best use alphanumerical name) +5. specify number of GPUs and their memory + +#### Approve clients and finish configuration + +Access the dashboard at `https://localhost:8443` log in with the admin credentials. + +1. Users Dashboard > approve client user +2. Client Sites > approve client sites +3. Project Home > freeze project + +## Download startup kits + +After setting up the project admin configuration, server and clients can download their startup kits. Store the +passwords somewhere, they are only displayed once (or you can download them again). + +## Starting a Swarm Training + +1. Connect the *server* host to the VPN as described above. +2. Start the *server* startup kit using the respective `startup/docker.sh` script with the option to start the server +3. Provide the *client* startup kits to the swarm participants (be aware that email providers or other channels may + prevent encrypted archives) +4. Make sure the participants have started their clients via the respective startup kits, see below +5. 
Start the *admin* startup kit using the respective `startup/docker.sh` script to start the admin console +6. Deploy a job by `submit_job ` + +# License + +This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. + +# Maintainers + +[Jeff](https://github.com/Ultimate-Storm) +[Ole Schwen](mailto:ole.schwen@mevis.fraunhofer.de) +[Steffen Renisch](mailto:steffen.renisch@mevis.fraunhofer.de) + +# Contributing + +Feel free to dive in! [Open an issue](https://github.com/KatherLab/MediSwarm/issues) or submit pull requests. + +# Credits + +This project utilizes platforms and resources from the following repositories: + +- **[NVFLARE](https://github.com/NVIDIA/NVFlare)**: NVFLARE (NVIDIA Federated Learning Application Runtime Environment) + is an open-source framework that provides a robust and scalable platform for federated learning applications. We have + integrated NVFLARE to efficiently handle the federated learning aspects of our project. + +Special thanks to the contributors and maintainers of these repositories for their valuable work and support. + +--- + +For more details about NVFLARE and its features, please visit +the [NVFLARE GitHub repository](https://github.com/NVIDIA/NVFlare). diff --git a/assets/readme/README.developer.md b/assets/readme/README.developer.md new file mode 100644 index 00000000..a43026c7 --- /dev/null +++ b/assets/readme/README.developer.md @@ -0,0 +1,65 @@ +# Usage for MediSwarm and Application Code Developers + +## Versioning of ODELIA Docker Images + +If needed, update the version number in file [odelia_image.version](../../odelia_image.version). It will be used +automatically for the Docker image and startup kits. + +## Build the Docker Image and Startup Kits + +The Docker image contains all dependencies for administrative purposes (dashboard, command-line provisioning, admin +console, server) as well as for running the 3DCNN pipeline under the pytorch-lightning framework. 
+The project description specifies the swarm nodes etc. to be used for a swarm training. + + ```bash + cd MediSwarm + ./buildDockerImageAndStartupKits.sh -p application/provision/ + ``` + +1. Make sure you have no uncommitted changes. +2. If package versions are still not available, you may have to check what the current version is and update the + `Dockerfile` accordingly. Version numbers are hard-coded to avoid issues due to silently different versions being + installed. +3. After successful build (and after verifying that everything works as expected, i.e., local tests, building startup + kits, running local trainings in the startup kit), you can manually push the image to DockerHub, provided you have + the necessary rights. Make sure you are not re-using a version number for this purpose. + +## Running Local Tests + + ```bash + ./runTestsInDocker.sh + ``` + +You should see + +1. several expected errors and warnings printed from unit tests that should succeed overall, and a coverage report +2. output of a successful simulation run with two nodes +3. output of a successful proof-of-concept run with two nodes +4. output of a set of startup kits being generated +5. output of a dummy training run using one of the startup kits +6. TODO update this to what the tests output now + +Optionally, uncomment running NVFlare unit tests in `_runTestsInsideDocker.sh`. + +## Distributing Startup Kits + +Distribute the startup kits to the clients. + +## Running the Application + +1. **CIFAR-10 example:** + See [README.md](../../application/jobs/cifar10/README.md) +2. **Minimal PyTorch CNN example:** + See [README.md](../../application/jobs/minimal_training_pytorch_cnn/README.md) +3. **3D CNN for classifying breast tumors:** + See [README.md](../../application/jobs/ODELIA_ternary_classification/README.md) + +## Contributing Application Code + +1.
Take a look at application/jobs/minimal_training_pytorch_cnn for a minimal example how pytorch code can be adapted to + work with NVFlare +2. Take a look at application/jobs/ODELIA_ternary_classification for a more relastic example of pytorch code that can + run in the swarm +3. Use the local tests to check if the code is swarm-ready +4. TODO more detailed instructions + diff --git a/assets/readme/README.operator.md b/assets/readme/README.operator.md new file mode 100644 index 00000000..101d5266 --- /dev/null +++ b/assets/readme/README.operator.md @@ -0,0 +1,85 @@ +# Usage for Swarm Operators + +## Setting up a Swarm + +Production mode is designed for secure, real-world deployments. It supports both local and remote setups, whether +on-premise or in the cloud. For more details, refer to +the [NVFLARE Production Mode](https://nvflare.readthedocs.io/en/2.4.1/real_world_fl.html). + +To set up production mode, follow these steps: + +## Edit `/etc/hosts` + +Ensure that your `/etc/hosts` file includes the correct host mappings. All hosts need to be able to communicate to the +server node. + +For example, add the following line (replace `` with the server's actual IP address): + +```plaintext + dl3.tud.de dl3 +``` + +## Create Startup Kits + +### Via Script (recommended) + +1. Use, e.g., the file `application/provision/project_MEVIS_test.yml`, adapt as needed (network protocol etc.) +2. Call `buildStartupKits.sh /path/to/project_configuration.yml` to build the startup kits +3. Startup kits are generated to `workspace//prod_00/` +4. Deploy startup kits to the respective server/clients + +### Via the Dashboard (not recommended) + +```bash +docker run -d --rm \ + --ipc=host -p 8443:8443 \ + --name=odelia_swarm_admin \ + -v /var/run/docker.sock:/var/run/docker.sock \ + \ + /bin/bash -c "nvflare dashboard --start --local --cred :" +``` + +using some credentials chosen for the swarm admin account. 
+ +Access the dashboard in a web browser at `https://localhost:8443`, log in with these credentials, and configure the +project: + +1. enter project short name, name, description +2. enter docker download link: jefftud/odelia: +3. if needed, enter dates +4. click save +5. Server Configuration > Server (DNS name): +6. click make project public + +#### Register client per site + +Access the dashboard at `https://:8443`. + +1. register a user +2. enter organization (corresponding to the site) +3. enter role (e.g., org admin) +4. add a site (note: must not contain spaces, best use alphanumerical name) +5. specify number of GPUs and their memory + +#### Approve clients and finish configuration + +Access the dashboard at `https://localhost:8443` and log in with the admin credentials. + +1. Users Dashboard > approve client user +2. Client Sites > approve client sites +3. Project Home > freeze project + +## Download startup kits + +After setting up the project admin configuration, server and clients can download their startup kits. Store the +passwords somewhere; they are only displayed once (or you can download them again). + +## Starting a Swarm Training + +1. Connect the *server* host to the VPN as described above. +2. Start the *server* startup kit using the respective `startup/docker.sh` script with the option to start the server +3. Provide the *client* startup kits to the swarm participants (be aware that email providers or other channels may + prevent encrypted archives) +4. Make sure the participants have started their clients via the respective startup kits, see below +5. Start the *admin* startup kit using the respective `startup/docker.sh` script to start the admin console +6.
Deploy a job by `submit_job ` diff --git a/assets/readme/README.participant.md b/assets/readme/README.participant.md new file mode 100644 index 00000000..0a24120d --- /dev/null +++ b/assets/readme/README.participant.md @@ -0,0 +1,145 @@ +# MediSwarm Participant Guide + +This guide is for data scientists and medical research sites participating in a Swarm Learning project. + +## Prerequisites + +- Hardware: Min. 32GB RAM, 8 cores, NVIDIA GPU with 24GB VRAM, 4TB storage +- OS: Ubuntu 20.04 LTS +- Software: Docker, OpenVPN, Git + +## Setup + +1. Make sure your compute node satisfies the specification and has the necessary software installed. +2. Clone the repository and connect the client node to the VPN as described above. TODO is cloning the repository + necessary for swarm participants? +3. TODO anything else? + +## Prepare Dataset + +1. see Step 3: Prepare Data in [README.md](../../application/jobs/ODELIA_ternary_classification/app/scripts/README.md) + +## Prepare Training Participation + +1. Extract startup kit provided by swarm operator + +## Local Testing on Your Data + +1. Directories + ```bash + export SITE_NAME= # TODO should be defined above, also needed for dataset location + export DATADIR= + export SCRATCHDIR= + ``` +2. From the directory where you unpacked the startup kit, + ```bash + cd $SITE_NAME/startup + ``` +3. Verify that your Docker/GPU setup is working + ```bash + ./docker.sh --scratch_dir $SCRATCHDIR --GPU device=0 --dummy_training + ``` + * This will pull the Docker image, which might take a while. + * If you have multiple GPUs and 0 is busy, use a different one. + * The “training” itself should take less than a minute and does not yield a meaningful classification performance. +4. Verify that your local data can be accessed and the model can be trained locally + ```bash + ./docker.sh --data_dir $DATADIR --scratch_dir $SCRATCHDIR --GPU device=0 --preflight_check + ``` + * Training time depends on the size of the local dataset.
+ +## Configurable Parameters for docker.sh + +TODO consider what should be described and recommended as configurable here, given that the goal of the startup kits is +to ensure everyone runs the same training + +When launching the client using `./docker.sh`, the following environment variables are automatically passed into the +container. You can override them to customize training behavior: + +| Environment Variable | Default | Description | +|----------------------|-----------------|----------------------------------------------------------------------| +| `SITE_NAME` | *from flag* | Name of your local site, e.g. `TUD_1`, passed via `--start_client` | +| `DATA_DIR` | *from flag* | Path to the host folder that contains your local data | +| `SCRATCH_DIR` | *from flag* | Path for saving training outputs and temporary files | +| `GPU_DEVICE` | `device=0` | GPU identifier to use inside the container (or `all`) | +| `MODEL` | `MST` | Model architecture, choices: `MST`, `ResNet` | +| `INSTITUTION` | `ODELIA` | Institution name, used to group experiment logs | +| `CONFIG` | `unilateral` | Configuration schema for dataset (e.g. label scheme) | +| `NUM_EPOCHS` | `1` (test mode) | Number of training epochs (used in preflight/local training) | +| `TRAINING_MODE` | derived | Internal use. Automatically set based on flags like `--start_client` | + +These are injected into the container as `--env` variables. You can modify their defaults by editing `docker.sh` or +exporting before run: + +```bash +export MODEL=ResNet +export CONFIG=original +./docker.sh --data_dir $DATADIR --scratch_dir $SCRATCHDIR --GPU device=1 --start_client +``` + +## Start Swarm Node + +### VPN + +A VPN is necessary so that the swarm nodes can communicate with each other securely across firewalls. For that purpose, + +1. Install OpenVPN + ```bash + sudo apt-get install openvpn + ``` +2. 
If you have a graphical user interface(GUI), follow this guide to connect to the + VPN: [VPN setup guide(GUI).pdf](../VPN%20setup%20guide%28GUI%29.pdf) +3. If you have a command line interface(CLI), follow this guide to connect to the + VPN: [VPN setup guide(CLI).md](../VPN%20setup%20guide%28CLI%29.md) + +### Start the Client + +1. From the directory where you unpacked the startup kit: + ```bash + cd $SITE_NAME/startup # Skip this if you just ran the pre-flight check + ``` + +2. Start the client: + ```bash + ./docker.sh --data_dir $DATADIR --scratch_dir $SCRATCHDIR --GPU device=0 --start_client + ``` + If you have multiple GPUs and 0 is busy, use a different one. + +3. Console output is captured in `nohup.out`, which may have been created with limited permissions in the container, so + make it readable if necessary: + ```bash + sudo chmod a+r nohup.out + ``` + +4. Output files: + - **Training logs and checkpoints** are saved under: + ``` + $SCRATCHDIR/runs/$SITE_NAME// + ``` + - **Best checkpoint** usually saved as `best.ckpt` or `last.ckpt` + - **Prediction results**, if enabled, will appear in subfolders of the same directory + - **TensorBoard logs**, if activated, are stored in their respective folders inside the run directory + - TODO what is enabled/activated should be hard-coded, adapt accordingly + +5. (Optional) You can verify that the container is running properly: + ```bash + docker ps # Check if odelia_swarm_client_$SITE_NAME is listed + nvidia-smi # Check if the GPU is busy training (it will be idling while waiting for model transfer) + tail -f nohup.out # Follow training log + ``` + +For any issues, contact your Swarm Operator or check with `docker ps`, `nvidia-smi`, and `tail -f nohup.out`. + +## (Optional) Run Local Training + +1. From the directory where you unpacked the startup kit + ```bash + cd $SITE_NAME/startup + ``` +2. 
Start local training + ```bash + /docker.sh --data_dir $DATADIR --scratch_dir $SCRATCHDIR --GPU all --local_training + ``` + * TODO update when handling of the number of epochs has been implemented +3. Output files + * TODO describe \ No newline at end of file From d1893bab6d6a5d25485049e518d7dadd770731bd Mon Sep 17 00:00:00 2001 From: GitHub CI Date: Thu, 10 Jul 2025 13:00:48 +0200 Subject: [PATCH 039/337] docs: update README badges for PR tests and build status Signed-off-by: GitHub CI --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index fecfbc6a..0c66e8fa 100644 --- a/README.md +++ b/README.md @@ -3,8 +3,8 @@ An open-source platform advancing medical AI via privacy-preserving swarm learning, based on NVFlare and developed with the ODELIA consortium. -[![PR Tests]([pr-test.yaml](.github/workflows/pr-test.yaml)) -[![Docker Build]([update-apt-versions.yml](.github/workflows/update-apt-versions.yml)) +[![PR Tests](https://github.com/KatherLab/MediSwarm/actions/workflows/pr-test.yaml/badge.svg)](https://github.com/KatherLab/MediSwarm/actions/workflows/pr-test.yaml) +[![Build](https://github.com/KatherLab/MediSwarm/actions/workflows/update-apt-versions.yml/badge.svg)](https://github.com/KatherLab/MediSwarm/actions/workflows/update-apt-versions.yml) ## Quick Start for Your Role From d2e3b5de51781092d22fe4e80778f32930220925 Mon Sep 17 00:00:00 2001 From: GitHub CI Date: Thu, 10 Jul 2025 13:13:34 +0200 Subject: [PATCH 040/337] fix: comment out git fetch and diff check in update_apt_versions.sh Signed-off-by: GitHub CI --- scripts/ci/update_apt_versions.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/ci/update_apt_versions.sh b/scripts/ci/update_apt_versions.sh index d86bbc6d..d5a4ca13 100755 --- a/scripts/ci/update_apt_versions.sh +++ b/scripts/ci/update_apt_versions.sh @@ -45,9 +45,9 @@ while IFS= read -r match; do fi done < <(grep -oP 
'\b[a-z0-9\.\-]+=[a-zA-Z0-9:~.+-]+\b' "$DOCKERFILE_PATH") -git fetch origin main -if git diff --quiet origin/main..HEAD; then - echo "NO_CHANGES=true" >> "$GITHUB_ENV" -else - echo "NO_CHANGES=false" >> "$GITHUB_ENV" -fi \ No newline at end of file +#git fetch origin main +#if git diff --quiet origin/main..HEAD; then +# echo "NO_CHANGES=true" >> "$GITHUB_ENV" +#else +# echo "NO_CHANGES=false" >> "$GITHUB_ENV" +#fi \ No newline at end of file From 5c1a536a1e9ea2f2055b62dfc98089bc2ded426f Mon Sep 17 00:00:00 2001 From: JieFu Zhu Date: Thu, 10 Jul 2025 13:26:08 +0200 Subject: [PATCH 041/337] Update README.md Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0c66e8fa..2eb29561 100644 --- a/README.md +++ b/README.md @@ -10,9 +10,9 @@ the ODELIA consortium. Choose your role and follow the instructions: -- [Swarm Participant (Medical Site / Data Scientist)](assets/readme/README.participant.md)) +- [Swarm Participant (Medical Site / Data Scientist)](assets/readme/README.participant.md) - [Developer (Docker, Code, Pipeline)](assets/readme/README.developer.md) -- [Swarm Operator (Provisioning, VPN, Server)](assets/readme/README.operator.md)) +- [Swarm Operator (Provisioning, VPN, Server)](assets/readme/README.operator.md) ## Overview From ac86350eae021d34f527cc70a15994113ec8b698 Mon Sep 17 00:00:00 2001 From: JieFu Zhu Date: Thu, 10 Jul 2025 13:26:30 +0200 Subject: [PATCH 042/337] Update assets/readme/README.participant.md Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- assets/readme/README.participant.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/readme/README.participant.md b/assets/readme/README.participant.md index 0a24120d..5609fc72 100644 --- a/assets/readme/README.participant.md +++ b/assets/readme/README.participant.md @@ -138,7 +138,7 @@ For any issues, contact your Swarm Operator or 
check with `docker ps`, `nvidia-s ``` 2. Start local training ```bash - /docker.sh --data_dir $DATADIR --scratch_dir $SCRATCHDIR --GPU all --local_training + ./docker.sh --data_dir $DATADIR --scratch_dir $SCRATCHDIR --GPU all --local_training ``` * TODO update when handling of the number of epochs has been implemented 3. Output files From 73b685e1db31e5b39da67b6e2986a016b8090144 Mon Sep 17 00:00:00 2001 From: JieFu Zhu Date: Thu, 10 Jul 2025 13:26:43 +0200 Subject: [PATCH 043/337] Update assets/readme/README.participant.md Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- assets/readme/README.participant.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/readme/README.participant.md b/assets/readme/README.participant.md index 5609fc72..717c30d0 100644 --- a/assets/readme/README.participant.md +++ b/assets/readme/README.participant.md @@ -11,7 +11,7 @@ This guide is for data scientists and medical research sites participating in a ## Setup 1. Make sure your compute node satisfies the specification and has the necessary software installed. -2. Clone the repository and connect the client node to the VPN as described above. TODO is cloning the repository +2. Clone the repository and connect the client node to the VPN as described in the VPN setup section below. TODO is cloning the repository necessary for swarm participants? 3. TODO anything else? 
From 2405785ecfcecf5333479736b14728220dfbf697 Mon Sep 17 00:00:00 2001 From: JieFu Zhu Date: Thu, 10 Jul 2025 13:26:57 +0200 Subject: [PATCH 044/337] Update assets/readme/README.developer.md Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- assets/readme/README.developer.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/readme/README.developer.md b/assets/readme/README.developer.md index a43026c7..ac3f75a4 100644 --- a/assets/readme/README.developer.md +++ b/assets/readme/README.developer.md @@ -58,7 +58,7 @@ Distribute the startup kits to the clients. 1. Take a look at application/jobs/minimal_training_pytorch_cnn for a minimal example how pytorch code can be adapted to work with NVFlare -2. Take a look at application/jobs/ODELIA_ternary_classification for a more relastic example of pytorch code that can +2. Take a look at application/jobs/ODELIA_ternary_classification for a more realistic example of pytorch code that can run in the swarm 3. Use the local tests to check if the code is swarm-ready 4. 
TODO more detailed instructions From 77ebb12419c80f3e5de8f955d472d495dfdc27c9 Mon Sep 17 00:00:00 2001 From: GitHub CI Date: Thu, 10 Jul 2025 13:27:43 +0200 Subject: [PATCH 045/337] chore: rename README_old.md for improved organization Signed-off-by: GitHub CI --- README_old.md => assets/readme/README_old.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename README_old.md => assets/readme/README_old.md (100%) diff --git a/README_old.md b/assets/readme/README_old.md similarity index 100% rename from README_old.md rename to assets/readme/README_old.md From 40893b4e78b7251660d17fa84e51f74394d09ed5 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 10 Jul 2025 14:09:29 +0200 Subject: [PATCH 046/337] iterated README for swarm participants --- assets/readme/README.participant.md | 93 ++++++++++++++--------------- 1 file changed, 46 insertions(+), 47 deletions(-) diff --git a/assets/readme/README.participant.md b/assets/readme/README.participant.md index 717c30d0..76f56829 100644 --- a/assets/readme/README.participant.md +++ b/assets/readme/README.participant.md @@ -11,8 +11,16 @@ This guide is for data scientists and medical research sites participating in a ## Setup 1. Make sure your compute node satisfies the specification and has the necessary software installed. -2. Clone the repository and connect the client node to the VPN as described in the VPN setup section below. TODO is cloning the repository - necessary for swarm participants? +2. Set up the VPN. A VPN is necessary so that the swarm nodes can communicate with each other securely across firewalls. For that purpose, + 1. Install OpenVPN + ```bash + sudo apt-get install openvpn + ``` + 2. If you have a graphical user interface(GUI), follow this guide to connect to the + VPN: [VPN setup guide(GUI).pdf](../VPN%20setup%20guide%28GUI%29.pdf) + 3. If you have a command line interface(CLI), follow this guide to connect to the + VPN: [VPN setup guide(CLI).md](../VPN%20setup%20guide%28CLI%29.md) + 4. 
You may want to clone this repository or selectively download VPN-related scripts for this purpose. 3. TODO anything else? ## Prepare Dataset @@ -23,7 +31,7 @@ This guide is for data scientists and medical research sites participating in a 1. Extract startup kit provided by swarm operator -## Local Testing on Your Data +### Local Testing on Your Data 1. Directories ```bash @@ -48,51 +56,13 @@ This guide is for data scientists and medical research sites participating in a ``` * Training time depends on the size of the local dataset. -## Configurable Parameters for docker.sh +### Start Swarm Node -TODO consider what should be described and recommended as configurable here, given that the goal of the startup kits is -to ensure everyone runs the same training - -When launching the client using `./docker.sh`, the following environment variables are automatically passed into the -container. You can override them to customize training behavior: - -| Environment Variable | Default | Description | -|----------------------|-----------------|----------------------------------------------------------------------| -| `SITE_NAME` | *from flag* | Name of your local site, e.g. `TUD_1`, passed via `--start_client` | -| `DATA_DIR` | *from flag* | Path to the host folder that contains your local data | -| `SCRATCH_DIR` | *from flag* | Path for saving training outputs and temporary files | -| `GPU_DEVICE` | `device=0` | GPU identifier to use inside the container (or `all`) | -| `MODEL` | `MST` | Model architecture, choices: `MST`, `ResNet` | -| `INSTITUTION` | `ODELIA` | Institution name, used to group experiment logs | -| `CONFIG` | `unilateral` | Configuration schema for dataset (e.g. label scheme) | -| `NUM_EPOCHS` | `1` (test mode) | Number of training epochs (used in preflight/local training) | -| `TRAINING_MODE` | derived | Internal use. Automatically set based on flags like `--start_client` | +#### VPN -These are injected into the container as `--env` variables. 
You can modify their defaults by editing `docker.sh` or -exporting before run: +1. Connect to VPN as described in [VPN setup guide(GUI).pdf](../VPN%20setup%20guide%28GUI%29.pdf) (GUI) or [VPN setup guide(CLI).md](../VPN%20setup%20guide%28CLI%29.md) (command line). -```bash -export MODEL=ResNet -export CONFIG=original -./docker.sh --data_dir $DATADIR --scratch_dir $SCRATCHDIR --GPU device=1 --start_client -``` - -## Start Swarm Node - -### VPN - -A VPN is necessary so that the swarm nodes can communicate with each other securely across firewalls. For that purpose, - -1. Install OpenVPN - ```bash - sudo apt-get install openvpn - ``` -2. If you have a graphical user interface(GUI), follow this guide to connect to the - VPN: [VPN setup guide(GUI).pdf](../VPN%20setup%20guide%28GUI%29.pdf) -3. If you have a command line interface(CLI), follow this guide to connect to the - VPN: [VPN setup guide(CLI).md](../VPN%20setup%20guide%28CLI%29.md) - -### Start the Client +#### Start the Client 1. From the directory where you unpacked the startup kit: ```bash @@ -130,7 +100,7 @@ A VPN is necessary so that the swarm nodes can communicate with each other secur For any issues, contact your Swarm Operator or check with `docker ps`, `nvidia-smi`, and `tail -f nohup.out`. -## (Optional) Run Local Training +### (Optional) Run Local Training 1. From the directory where you unpacked the startup kit ```bash @@ -142,4 +112,33 @@ For any issues, contact your Swarm Operator or check with `docker ps`, `nvidia-s ``` * TODO update when handling of the number of epochs has been implemented 3. 
Output files - * TODO describe \ No newline at end of file + * TODO describe + +### Configurable Parameters for docker.sh + +TODO consider what should be described and recommended as configurable here, given that the goal of the startup kits is +to ensure everyone runs the same training + +When launching the client using `./docker.sh`, the following environment variables are automatically passed into the +container. You can override them to customize training behavior: + +| Environment Variable | Default | Description | +|----------------------|-----------------|----------------------------------------------------------------------| +| `SITE_NAME` | *from flag* | Name of your local site, e.g. `TUD_1`, passed via `--start_client` | +| `DATA_DIR` | *from flag* | Path to the host folder that contains your local data | +| `SCRATCH_DIR` | *from flag* | Path for saving training outputs and temporary files | +| `GPU_DEVICE` | `device=0` | GPU identifier to use inside the container (or `all`) | +| `MODEL` | `MST` | Model architecture, choices: `MST`, `ResNet` | +| `INSTITUTION` | `ODELIA` | Institution name, used to group experiment logs | +| `CONFIG` | `unilateral` | Configuration schema for dataset (e.g. label scheme) | +| `NUM_EPOCHS` | `1` (test mode) | Number of training epochs (used in preflight/local training) | +| `TRAINING_MODE` | derived | Internal use. Automatically set based on flags like `--start_client` | + +These are injected into the container as `--env` variables. 
You can modify their defaults by editing `docker.sh` or +exporting before run: + +```bash +export MODEL=ResNet +export CONFIG=original +./docker.sh --data_dir $DATADIR --scratch_dir $SCRATCHDIR --GPU device=1 --start_client +``` From 9a2b3fdda86903effc77380cba1909b4ba2249b8 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 10 Jul 2025 14:11:08 +0200 Subject: [PATCH 047/337] note on site name --- assets/readme/README.participant.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/readme/README.participant.md b/assets/readme/README.participant.md index 76f56829..6a71bc73 100644 --- a/assets/readme/README.participant.md +++ b/assets/readme/README.participant.md @@ -35,7 +35,7 @@ This guide is for data scientists and medical research sites participating in a 1. Directories ```bash - export SITE_NAME= # TODO should be defined above, also needed for dataset location + export SITE_NAME= # this should end in `_1`, e.g., `UKA_1`, unless you participate with multiple nodes export DATADIR= export SCRATCHDIR= ``` From 3b4196eb10157ecc3e395604532abf0c2c62b4c7 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 10 Jul 2025 14:13:12 +0200 Subject: [PATCH 048/337] indentation for nested list --- assets/readme/README.participant.md | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/assets/readme/README.participant.md b/assets/readme/README.participant.md index 6a71bc73..04beb0f9 100644 --- a/assets/readme/README.participant.md +++ b/assets/readme/README.participant.md @@ -12,15 +12,15 @@ This guide is for data scientists and medical research sites participating in a 1. Make sure your compute node satisfies the specification and has the necessary software installed. 2. Set up the VPN. A VPN is necessary so that the swarm nodes can communicate with each other securely across firewalls. For that purpose, - 1. Install OpenVPN - ```bash - sudo apt-get install openvpn - ``` - 2. 
If you have a graphical user interface(GUI), follow this guide to connect to the - VPN: [VPN setup guide(GUI).pdf](../VPN%20setup%20guide%28GUI%29.pdf) - 3. If you have a command line interface(CLI), follow this guide to connect to the - VPN: [VPN setup guide(CLI).md](../VPN%20setup%20guide%28CLI%29.md) - 4. You may want to clone this repository or selectively download VPN-related scripts for this purpose. + 1. Install OpenVPN + ```bash + sudo apt-get install openvpn + ``` + 2. If you have a graphical user interface(GUI), follow this guide to connect to the + VPN: [VPN setup guide(GUI).pdf](../VPN%20setup%20guide%28GUI%29.pdf) + 3. If you have a command line interface(CLI), follow this guide to connect to the + VPN: [VPN setup guide(CLI).md](../VPN%20setup%20guide%28CLI%29.md) + 4. You may want to clone this repository or selectively download VPN-related scripts for this purpose. 3. TODO anything else? ## Prepare Dataset From 85387a09de66009219300f28b317ff4d7cb4d2d4 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 10 Jul 2025 14:28:21 +0200 Subject: [PATCH 049/337] moved local training before swarm training --- assets/readme/README.participant.md | 35 ++++++++++++++--------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/assets/readme/README.participant.md b/assets/readme/README.participant.md index 04beb0f9..d889724e 100644 --- a/assets/readme/README.participant.md +++ b/assets/readme/README.participant.md @@ -56,6 +56,20 @@ This guide is for data scientists and medical research sites participating in a ``` * Training time depends on the size of the local dataset. +### (Optional) Run Local Training + +1. From the directory where you unpacked the startup kit + ```bash + cd $SITE_NAME/startup + ``` +2. Start local training + ```bash + ./docker.sh --data_dir $DATADIR --scratch_dir $SCRATCHDIR --GPU device=0 --local_training + ``` + * TODO update when handling of the number of epochs has been implemented +3. 
Output files + * TODO describe + ### Start Swarm Node #### VPN @@ -93,26 +107,11 @@ This guide is for data scientists and medical research sites participating in a 5. (Optional) You can verify that the container is running properly: ```bash - docker ps # Check if odelia_swarm_client_$SITE_NAME is listed - nvidia-smi # Check if the GPU is busy training (it will be idling while waiting for model transfer) + docker ps # Check if odelia_swarm_client_$SITE_NAME is listed + nvidia-smi # Check if the GPU is busy training (it will be idling while waiting for model transfer) tail -f nohup.out # Follow training log ``` - -For any issues, contact your Swarm Operator or check with `docker ps`, `nvidia-smi`, and `tail -f nohup.out`. - -### (Optional) Run Local Training - -1. From the directory where you unpacked the startup kit - ```bash - cd $SITE_NAME/startup - ``` -2. Start local training - ```bash - ./docker.sh --data_dir $DATADIR --scratch_dir $SCRATCHDIR --GPU all --local_training - ``` - * TODO update when handling of the number of epochs has been implemented -3. Output files - * TODO describe +For any issues, check if the commands above point to problems and contact your Swarm Operator. 
### Configurable Parameters for docker.sh From b0c78298981caa7604f8f56c7a1ab9c2e5bb9982 Mon Sep 17 00:00:00 2001 From: GitHub CI Date: Thu, 10 Jul 2025 15:59:19 +0200 Subject: [PATCH 050/337] fix: update script paths in README for improved clarity Signed-off-by: GitHub CI --- .../app/scripts/README.md | 26 +++++++++---------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/application/jobs/ODELIA_ternary_classification/app/scripts/README.md b/application/jobs/ODELIA_ternary_classification/app/scripts/README.md index 5afef8d5..caf02444 100644 --- a/application/jobs/ODELIA_ternary_classification/app/scripts/README.md +++ b/application/jobs/ODELIA_ternary_classification/app/scripts/README.md @@ -25,13 +25,13 @@ ## Step 2: Prepare Data ([DUKE](https://sites.duke.edu/mazurowski/resources/breast-cancer-mri-dataset/)) * Specify the path to the parent folder as `path_root=...` and `dataset=DUKE` in the following scripts -* Run [scripts/preprocessing/duke/step1_dicom2nifti.py](scripts/preprocessing/duke/step1_dicom2nifti.py) - It will +* Run [step1_dicom2nifti.py](preprocessing/duke/step1_dicom2nifti.py) - It will store DICOM files as NIFTI files in a new folder `data` -* Run [scripts/preprocessing/step2_compute_sub.py](scripts/preprocessing/step2_compute_sub.py) - computes the +* Run [scripts/preprocessing/step2_compute_sub.py](preprocessing/step2_compute_sub.py) - computes the subtraction image -* Run [scripts/preprocessing/step3_unilateral.py](scripts/preprocessing/step3_unilateral.py) - splits breasts into left +* Run [scripts/preprocessing/step3_unilateral.py](preprocessing/step3_unilateral.py) - splits breasts into left and right side and resamples to uniform shape. 
The result is stored in a new folder `data_unilateral` -* Run [scripts/preprocessing/duke/step4_create_split.py](scripts/preprocessing/duke/step4_create_split.py) - creates a +* Run [scripts/preprocessing/duke/step4_create_split.py](preprocessing/duke/step4_create_split.py) - creates a stratified five-fold split and stores the result in `metadata/split.csv`
@@ -43,22 +43,19 @@ * Create a folder `metadata` with the following file inside: * Challenge: `annotation.xlsx` * Local Training: `ODELIA annotation scheme-2.0.xlsx` -* Overwrite [scripts/preprocessing/odelia/step1_dicom2nifti.py](scripts/preprocessing/odelia/step1_dicom2nifti.py). It +* Overwrite [scripts/preprocessing/odelia/step1_dicom2nifti.py](preprocessing/odelia/step1_dicom2nifti.py). It should create a subfolder `data` and subfolders with files named as `T2.nii.gz`, `Pre.nii.gz`, `Post_1.nii.gz`, `Post_2.nii.gz`, etc. The subfolder should be labeled as follows: * Challenge: Folders must have the same name as the entries in the `ID` column of the `annotation.xlsx` file. * Local Training: Folders must have the same name as the entries in the `StudyInstanceUID` column of the `ODELIA annotation scheme-2.0.xlsx` file. -* Run [scripts/preprocessing/step2_compute_sub.py](scripts/preprocessing/step2_compute_sub.py) - computes the +* Run [scripts/preprocessing/step2_compute_sub.py](preprocessing/step2_compute_sub.py) - computes the subtraction image -* Run [scripts/preprocessing/step3_unilateral.py](scripts/preprocessing/step3_unilateral.py) - splits breasts into left +* Run [scripts/preprocessing/step3_unilateral.py](preprocessing/step3_unilateral.py) - splits breasts into left and right side and resamples to uniform shape. 
The result is stored in a new folder `data_unilateral` * To create a five-fold stratified split and store the result in `metadata/split.csv`, run the following script: - * - Challenge: [scripts/preprocessing/odelia/step4_create_split_challenge.py](scripts/preprocessing/odelia/step4_create_split_challenge.py) - * Local - Training: [scripts/preprocessing/odelia/step4_create_split.py](scripts/preprocessing/odelia/step4_create_split.py) + * Local Training: [scripts/preprocessing/odelia/step4_create_split.py](preprocessing/odelia/step4_create_split.py) * The final folder structure should look like: ```bash @@ -83,10 +80,11 @@ ## Step 4: Run Training -* Specify path to downloaded folder as `PATH_ROOT=` in [dataset_3d_odelia.py](odelia/data/datasets/dataset_3d_odelia.py) -* Run Script: [scripts/main_train.py --institution DUKE](scripts/main_train.py) +* Specify path to downloaded folder as `PATH_ROOT=` + in [dataset_3d_odelia.py](../custom/data/datasets/dataset_3d_odelia.py) +* Run Script: [main_train.py](main_train.py) ## Step 5: Predict & Evaluate Performance -* Run Script: [scripts/main_predict.py](scripts/main_predict.py) +* Run Script: [main_predict.py](main_predict.py) * Set `path_run` to root directory of latest model From bf21531ee8cc2fc1f2d2baef672533796aeb9a42 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 11 Jul 2025 09:05:25 +0200 Subject: [PATCH 051/337] addressed some todos and moved description of docker.sh (in startup kits) to developer README --- assets/readme/README.developer.md | 32 ++++++++++- assets/readme/README.participant.md | 88 +++++++++++++++++------------ 2 files changed, 82 insertions(+), 38 deletions(-) diff --git a/assets/readme/README.developer.md b/assets/readme/README.developer.md index ac3f75a4..fb6aafc3 100644 --- a/assets/readme/README.developer.md +++ b/assets/readme/README.developer.md @@ -45,6 +45,37 @@ Optionally, uncomment running NVFlare unit tests in `_runTestsInsideDocker.sh`. Distribute the startup kits to the clients. 
+## Running the Startup Kits + +See [README.participant.md](./README.participant.md). + +### Configurable Parameters for docker.sh + +* The `docker.sh` script run by the swarm participants passes the following environment variables into the container automatically. +* You can override them to customize training behavior. +* Only do this for testing and debugging purposes! The startup kits are designed to ensure that all sites run the same training code, manipulating `docker.sh` might break this. + +| Environment Variable | Default | Description | +|----------------------|-----------------|----------------------------------------------------------------------| +| `SITE_NAME` | *from flag* | Name of your local site, e.g. `TUD_1`, passed via `--start_client` | +| `DATA_DIR` | *from flag* | Path to the host folder that contains your local data | +| `SCRATCH_DIR` | *from flag* | Path for saving training outputs and temporary files | +| `GPU_DEVICE` | `device=0` | GPU identifier to use inside the container (or `all`) | +| `MODEL` | `MST` | Model architecture, choices: `MST`, `ResNet` | +| `INSTITUTION` | `ODELIA` | Institution name, used to group experiment logs | +| `CONFIG` | `unilateral` | Configuration schema for dataset (e.g. label scheme) | +| `NUM_EPOCHS` | `1` (test mode) | Number of training epochs (used in preflight/local training) | +| `TRAINING_MODE` | derived | Internal use. Automatically set based on flags like `--start_client` | + +These are injected into the container as `--env` variables. You can modify their defaults by editing `docker.sh` or exporting before run: + +```bash +export MODEL=ResNet +export CONFIG=original +./docker.sh --data_dir $DATADIR --scratch_dir $SCRATCHDIR --GPU device=1 --start_client +``` + + ## Running the Application 1. **CIFAR-10 example:** @@ -62,4 +93,3 @@ Distribute the startup kits to the clients. run in the swarm 3. Use the local tests to check if the code is swarm-ready 4. 
TODO more detailed instructions - diff --git a/assets/readme/README.participant.md b/assets/readme/README.participant.md index d889724e..d1dd593e 100644 --- a/assets/readme/README.participant.md +++ b/assets/readme/README.participant.md @@ -21,11 +21,55 @@ This guide is for data scientists and medical research sites participating in a 3. If you have a command line interface(CLI), follow this guide to connect to the VPN: [VPN setup guide(CLI).md](../VPN%20setup%20guide%28CLI%29.md) 4. You may want to clone this repository or selectively download VPN-related scripts for this purpose. -3. TODO anything else? ## Prepare Dataset -1. see Step 3: Prepare Data in [README.md](../../application/jobs/ODELIA_ternary_classification/app/scripts/README.md) +The dataset must be in the following format. + +### Folder Structure + + ```bash + + ├── data_unilateral + │ ├── ID_001_left + │ │ └── Sub_1.nii.gz + │ ├── ID_001_right + │ │ └── Sub_1.nii.gz + │ ├── ID_002_left + │ │ └── Sub_1.nii.gz + │ ├── ID_002_right + │ │ └── Sub_1.nii.gz + │ └── ... + └── metadata_unilateral + ├── annotation.csv + └── split.csv + ``` + +* The name of your site should usually end in `_1`, e.g., `UKA_1`, unless you participate with multiple nodes. +* `ID_001`, `ID_002` need to be unique identifiers in your dataset, not specifically of this format +* You might have additional images in the folder like `Pre.nii.gz`, `Post_1.nii.gz`, `Post_2.nii.gz`, `T2.nii.gz`, and you might have additional folders like `data_raw`, `data`, `metadata` etc. These will be ignored and should not cause problems. +* If you clone the repository, you will find a script that generates a synthetic dataset as an example. + +### Table Format + +#### Annotation + +* `annotation.csv` defines the class labels +* The file contains the columns `UID`, `PatientID`, `Age`, `Lesion` + * `UID` is the identifier used in the folder name, e.g., `ID_001_left`. + * `PatientID` is the identifier of the patient, in this case, `ID_001`.
+ * `Age` is the age of the patient at the time of the scan in days. + * `Lesion` is 0 for no lesion, 1 for benign lesion, and 2 for malignant lesion. + +#### Split + +* `split.csv` defines the training/validation/test split. +* These splits are hard-coded rather than randomized during training in order to have consistent and documented splits. +* The file contains the columns `UID`, `Split`, and `Fold`. + * `UID` is the identifier used in the folder name, e.g., `ID_001_left`. + * `Split` is either `train`, `val`, or `test`. The test set is currently ignored. + * `Fold` is the 0-based index of the fold (for a potential cross-validation). + ## Prepare Training Participation @@ -35,8 +79,8 @@ This guide is for data scientists and medical research sites participating in a 1. Directories ```bash - export SITE_NAME= # this should end in `_1`, e.g., `UKA_1`, unless you participate with multiple nodes - export DATADIR= + export SITE_NAME= + export DATADIR= export SCRATCHDIR= ``` 2. From the directory where you unpacked the startup kit, @@ -68,7 +112,7 @@ This guide is for data scientists and medical research sites participating in a ``` * TODO update when handling of the number of epochs has been implemented 3. Output files - * TODO describe + * Same as for the swarm training (see below). ### Start Swarm Node @@ -101,9 +145,8 @@ This guide is for data scientists and medical research sites participating in a $SCRATCHDIR/runs/$SITE_NAME// ``` - **Best checkpoint** usually saved as `best.ckpt` or `last.ckpt` - - **Prediction results**, if enabled, will appear in subfolders of the same directory - - **TensorBoard logs**, if activated, are stored in their respective folders inside the run directory - - TODO what is enabled/activated should be hard-coded, adapt accordingly + - TODO describe prediction results once implemented + - **TensorBoard logs** are stored in their respective folders inside the run directory 5.
(Optional) You can verify that the container is running properly: ```bash @@ -112,32 +155,3 @@ This guide is for data scientists and medical research sites participating in a tail -f nohup.out # Follow training log ``` For any issues, check if the commands above point to problems and contact your Swarm Operator. - -### Configurable Parameters for docker.sh - -TODO consider what should be described and recommended as configurable here, given that the goal of the startup kits is -to ensure everyone runs the same training - -When launching the client using `./docker.sh`, the following environment variables are automatically passed into the -container. You can override them to customize training behavior: - -| Environment Variable | Default | Description | -|----------------------|-----------------|----------------------------------------------------------------------| -| `SITE_NAME` | *from flag* | Name of your local site, e.g. `TUD_1`, passed via `--start_client` | -| `DATA_DIR` | *from flag* | Path to the host folder that contains your local data | -| `SCRATCH_DIR` | *from flag* | Path for saving training outputs and temporary files | -| `GPU_DEVICE` | `device=0` | GPU identifier to use inside the container (or `all`) | -| `MODEL` | `MST` | Model architecture, choices: `MST`, `ResNet` | -| `INSTITUTION` | `ODELIA` | Institution name, used to group experiment logs | -| `CONFIG` | `unilateral` | Configuration schema for dataset (e.g. label scheme) | -| `NUM_EPOCHS` | `1` (test mode) | Number of training epochs (used in preflight/local training) | -| `TRAINING_MODE` | derived | Internal use. Automatically set based on flags like `--start_client` | - -These are injected into the container as `--env` variables. 
You can modify their defaults by editing `docker.sh` or -exporting before run: - -```bash -export MODEL=ResNet -export CONFIG=original -./docker.sh --data_dir $DATADIR --scratch_dir $SCRATCHDIR --GPU device=1 --start_client -``` From 0304a590daf1e06d4703aa37492802e34b2bc2bc Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 11 Jul 2025 11:25:49 +0200 Subject: [PATCH 052/337] updated apt package versions --- docker_config/Dockerfile_ODELIA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index a61e9505..919ca771 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -27,7 +27,7 @@ RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings && apt update # Install docker-ce docker-ce-cli containerd.io and dependencies at fixed versions -RUN apt install -y apparmor=3.0.4-2ubuntu2.4 containerd.io=1.7.27-1 dbus-user-session=1.12.20-2ubuntu4.1 dbus=1.12.20-2ubuntu4.1 dmsetup=2:1.02.175-2.1ubuntu5 docker-buildx-plugin docker-ce-cli docker-ce-rootless-extras docker-ce docker-compose-plugin gir1.2-glib-2.0=1.72.0-1 git-man=1:2.34.1-1ubuntu1.14 git=1:2.34.1-1ubuntu1.14 iptables=1.8.7-1ubuntu5.2 less=590-1ubuntu0.22.04.3 libapparmor1=3.0.4-2ubuntu2.4 libargon2-1=0~20171227-0.3 libbsd0=0.11.5-1 libcbor0.8=0.8.0-2ubuntu1 libcryptsetup12=2:2.4.3-1ubuntu1.3 libcurl3-gnutls=7.81.0-1ubuntu1.20 libdbus-1-3=1.12.20-2ubuntu4.1 libdevmapper1.02.1=2:1.02.175-2.1ubuntu5 libedit2=3.1-20210910-1build1 liberror-perl=0.17029-1 libfido2-1=1.10.0-1 libgdbm-compat4=1.23-1 libgdbm6=1.23-1 libgirepository-1.0-1=1.72.0-1 libglib2.0-0=2.72.4-0ubuntu2.5 libglib2.0-data=2.72.4-0ubuntu2.5 libicu70=70.1-2 libip4tc2=1.8.7-1ubuntu5.2 libip6tc2=1.8.7-1ubuntu5.2 libjson-c5=0.15-3~ubuntu1.22.04.2 libkmod2=29-1ubuntu1 libltdl7=2.4.6-15build2 libmd0=1.0.4-1build1 libmnl0=1.0.4-3build2 libnetfilter-conntrack3=1.0.9-1 libnfnetlink0=1.0.1-3build3 libnftnl11=1.2.1-1build1 
libnss-systemd=249.11-0ubuntu3.16 libpam-systemd=249.11-0ubuntu3.16 libperl5.34=5.34.0-3ubuntu1.4 libslirp0=4.6.1-1build1 libx11-6=2:1.7.5-1ubuntu0.3 libx11-data=2:1.7.5-1ubuntu0.3 libxau6=1:1.0.9-1build5 libxcb1=1.14-3ubuntu3 libxdmcp6=1:1.1.3-0ubuntu5 libxext6=2:1.3.4-1build1 libxml2=2.9.13+dfsg-1ubuntu0.7 libxmuu1=2:1.1.3-3 libxtables12=1.8.7-1ubuntu5.2 netbase=6.3 networkd-dispatcher=2.1-2ubuntu0.22.04.2 openssh-client=1:8.9p1-3ubuntu0.13 patch=2.7.6-7build2 perl-base=5.34.0-3ubuntu1.4 perl-modules-5.34=5.34.0-3ubuntu1.4 perl=5.34.0-3ubuntu1.4 pigz=2.6-1 python3-dbus=1.2.18-3build1 python3-gi=3.42.1-0ubuntu1 shared-mime-info=2.1-2 slirp4netns=1.0.1-2 systemd-sysv=249.11-0ubuntu3.16 systemd-timesyncd=249.11-0ubuntu3.16 systemd=249.11-0ubuntu3.16 xauth=1:1.1-1build2 xdg-user-dirs=0.17-2ubuntu4 xz-utils=5.2.5-2ubuntu1 +RUN apt install -y apparmor=3.0.4-2ubuntu2.4 containerd.io=1.7.27-1 dbus-user-session=1.12.20-2ubuntu4.1 dbus=1.12.20-2ubuntu4.1 dmsetup=2:1.02.175-2.1ubuntu5 docker-buildx-plugin=0.25.0-1~ubuntu.22.04~jammy docker-ce-cli=5:28.3.2-1~ubuntu.22.04~jammy docker-ce-rootless-extras=5:28.3.2-1~ubuntu.22.04~jammy docker-ce=5:28.3.2-1~ubuntu.22.04~jammy docker-compose-plugin=2.38.2-1~ubuntu.22.04~jammy gir1.2-glib-2.0=1.72.0-1 git-man=1:2.34.1-1ubuntu1.15 git=1:2.34.1-1ubuntu1.15 iptables=1.8.7-1ubuntu5.2 less=590-1ubuntu0.22.04.3 libapparmor1=3.0.4-2ubuntu2.4 libargon2-1=0~20171227-0.3 libbsd0=0.11.5-1 libcbor0.8=0.8.0-2ubuntu1 libcryptsetup12=2:2.4.3-1ubuntu1.3 libcurl3-gnutls=7.81.0-1ubuntu1.20 libdbus-1-3=1.12.20-2ubuntu4.1 libdevmapper1.02.1=2:1.02.175-2.1ubuntu5 libedit2=3.1-20210910-1build1 liberror-perl=0.17029-1 libfido2-1=1.10.0-1 libgdbm-compat4=1.23-1 libgdbm6=1.23-1 libgirepository-1.0-1=1.72.0-1 libglib2.0-0=2.72.4-0ubuntu2.5 libglib2.0-data=2.72.4-0ubuntu2.5 libicu70=70.1-2 libip4tc2=1.8.7-1ubuntu5.2 libip6tc2=1.8.7-1ubuntu5.2 libjson-c5=0.15-3~ubuntu1.22.04.2 libkmod2=29-1ubuntu1 libltdl7=2.4.6-15build2 libmd0=1.0.4-1build1 
libmnl0=1.0.4-3build2 libnetfilter-conntrack3=1.0.9-1 libnfnetlink0=1.0.1-3build3 libnftnl11=1.2.1-1build1 libnss-systemd=249.11-0ubuntu3.16 libpam-systemd=249.11-0ubuntu3.16 libperl5.34=5.34.0-3ubuntu1.4 libslirp0=4.6.1-1build1 libx11-6=2:1.7.5-1ubuntu0.3 libx11-data=2:1.7.5-1ubuntu0.3 libxau6=1:1.0.9-1build5 libxcb1=1.14-3ubuntu3 libxdmcp6=1:1.1.3-0ubuntu5 libxext6=2:1.3.4-1build1 libxml2=2.9.13+dfsg-1ubuntu0.7 libxmuu1=2:1.1.3-3 libxtables12=1.8.7-1ubuntu5.2 netbase=6.3 networkd-dispatcher=2.1-2ubuntu0.22.04.2 openssh-client=1:8.9p1-3ubuntu0.13 patch=2.7.6-7build2 perl-base=5.34.0-3ubuntu1.4 perl-modules-5.34=5.34.0-3ubuntu1.4 perl=5.34.0-3ubuntu1.4 pigz=2.6-1 python3-dbus=1.2.18-3build1 python3-gi=3.42.1-0ubuntu1 shared-mime-info=2.1-2 slirp4netns=1.0.1-2 systemd-sysv=249.11-0ubuntu3.16 systemd-timesyncd=249.11-0ubuntu3.16 systemd=249.11-0ubuntu3.16 xauth=1:1.1-1build2 xdg-user-dirs=0.17-2ubuntu4 xz-utils=5.2.5-2ubuntu1 # Clean up apt cache RUN rm -rf /var/lib/apt/lists/* From 88786031ab45da101370da0914a724e196344b18 Mon Sep 17 00:00:00 2001 From: GitHub CI Date: Fri, 11 Jul 2025 11:33:15 +0200 Subject: [PATCH 053/337] fix: enable change detection in update_apt_versions.sh Signed-off-by: GitHub CI --- scripts/ci/update_apt_versions.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/ci/update_apt_versions.sh b/scripts/ci/update_apt_versions.sh index d5a4ca13..d86bbc6d 100755 --- a/scripts/ci/update_apt_versions.sh +++ b/scripts/ci/update_apt_versions.sh @@ -45,9 +45,9 @@ while IFS= read -r match; do fi done < <(grep -oP '\b[a-z0-9\.\-]+=[a-zA-Z0-9:~.+-]+\b' "$DOCKERFILE_PATH") -#git fetch origin main -#if git diff --quiet origin/main..HEAD; then -# echo "NO_CHANGES=true" >> "$GITHUB_ENV" -#else -# echo "NO_CHANGES=false" >> "$GITHUB_ENV" -#fi \ No newline at end of file +git fetch origin main +if git diff --quiet origin/main..HEAD; then + echo "NO_CHANGES=true" >> "$GITHUB_ENV" +else + echo "NO_CHANGES=false" >> 
"$GITHUB_ENV" +fi \ No newline at end of file From bc4081546554f24f8258b1c1c678e81e477f4bc3 Mon Sep 17 00:00:00 2001 From: GitHub CI Date: Fri, 11 Jul 2025 14:30:50 +0200 Subject: [PATCH 054/337] feat: add new client configuration for MEVIS_3 in project_Odelia_allsites.yml Signed-off-by: GitHub CI --- application/provision/project_Odelia_allsites.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/application/provision/project_Odelia_allsites.yml b/application/provision/project_Odelia_allsites.yml index 4e817c29..530b356d 100644 --- a/application/provision/project_Odelia_allsites.yml +++ b/application/provision/project_Odelia_allsites.yml @@ -26,6 +26,9 @@ participants: - name: MEVIS_2 type: client org: MEVIS + - name: MEVIS_3 + type: client + org: MEVIS - name: UKA_1 type: client org: UKA From 42dc4229a884414bc28ade81b51a5a921febc982 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 14 Jul 2025 11:39:57 +0200 Subject: [PATCH 055/337] added tee to capture command line output to files --- assets/readme/README.participant.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/assets/readme/README.participant.md b/assets/readme/README.participant.md index d1dd593e..1eaa0941 100644 --- a/assets/readme/README.participant.md +++ b/assets/readme/README.participant.md @@ -89,14 +89,14 @@ The dataset must be in the following format. ``` 3. Verify that your Docker/GPU setup is working ```bash - ./docker.sh --scratch_dir $SCRATCHDIR --GPU device=0 --dummy_training + ./docker.sh --scratch_dir $SCRATCHDIR --GPU device=0 --dummy_training 2>&1 | tee dummy_training_console_output.txt ``` * This will pull the Docker image, which might take a while. * If you have multiple GPUs and 0 is busy, use a different one. * The “training” itself should take less than minute and does not yield a meaningful classification performance. 4. 
Verify that your local data can be accessed and the model can be trained locally ```bash - ./docker.sh --data_dir $DATADIR --scratch_dir $SCRATCHDIR --GPU device=0 --preflight_check + ./docker.sh --data_dir $DATADIR --scratch_dir $SCRATCHDIR --GPU device=0 --preflight_check 2>&1 | tee preflight_check_console_output.txt ``` * Training time depends on the size of the local dataset. @@ -108,7 +108,7 @@ The dataset must be in the following format. ``` 2. Start local training ```bash - ./docker.sh --data_dir $DATADIR --scratch_dir $SCRATCHDIR --GPU device=0 --local_training + ./docker.sh --data_dir $DATADIR --scratch_dir $SCRATCHDIR --GPU device=0 --local_training 2>&1 | tee local_training_console_output.txt ``` * TODO update when handling of the number of epochs has been implemented 3. Output files From fce2d9afd7857dc701ad669fbf82276645fd8dd1 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 14 Jul 2025 11:40:42 +0200 Subject: [PATCH 056/337] explanation on local training, not marked as optional for now --- assets/readme/README.participant.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/assets/readme/README.participant.md b/assets/readme/README.participant.md index 1eaa0941..afc6e269 100644 --- a/assets/readme/README.participant.md +++ b/assets/readme/README.participant.md @@ -100,9 +100,11 @@ The dataset must be in the following format. ``` * Training time depends on the size of the local dataset. -### (Optional) Run Local Training +### Run Local Training -1. From the directory where you unpacked the startup kit +To have a baseline for swarm training, train the same model in a comparable way on the local data only. + +1. From the directory where you unpacked the startup kit (unless you just ran the pre-flight check) ```bash cd $SITE_NAME/startup ``` @@ -110,7 +112,7 @@ The dataset must be in the following format. 
```bash ./docker.sh --data_dir $DATADIR --scratch_dir $SCRATCHDIR --GPU device=0 --local_training 2>&1 | tee local_training_console_output.txt ``` - * TODO update when handling of the number of epochs has been implemented + * This currently runs 100 epochs (somewhat comparable to 20 rounds with 5 epochs each in the swarm case). 3. Output files * Same as for the swarm training (see below). From 9840b019f1e2a72efe23a6ca68ff63d7e0d8e8f8 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 14 Jul 2025 15:31:00 +0200 Subject: [PATCH 057/337] added output of license (and more) information of apt packages --- docker_config/Dockerfile_ODELIA | 5 ++++- docker_config/master_template.yml | 5 +++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 919ca771..5839fa4e 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -45,9 +45,12 @@ RUN python3 -m pip install Flask==3.0.2 Flask-JWT-Extended==4.6.0 Flask-SQLAlche # Install additional Python packages for application code at defined versions RUN python3 -m pip install Deprecated==1.2.18 SimpleITK==2.5.0 absl-py==2.2.2 aiohttp==3.11.18 aiosignal==1.3.2 async-timeout==5.0.1 cachetools==5.5.2 contourpy==1.3.2 cycler==0.12.1 et-xmlfile==2.0.0 fonttools==4.58.0 frozenlist==1.6.0 google-auth-oauthlib==1.2.2 google-auth==2.40.2 huggingface_hub==0.29.3 datasets==3.4.1 coral_pytorch==1.4.0 humanize==4.12.3 joblib==1.5.1 kiwisolver==1.4.8 lightning-utilities==0.14.3 markdown-it-py==3.0.0 markdown==3.8 matplotlib==3.9.2 mdurl==0.1.2 monai==1.4.0 multidict==6.4.4 nibabel==5.3.2 oauthlib==3.2.2 openpyxl==3.1.5 pandas==2.2.3 numpy==1.26.4 pyasn1-modules==0.4.2 pyasn1==0.6.1 pydicom==3.0.1 python-dateutil==2.9.0.post0 x-transformers==2.3.5 pytorch-lightning==2.4.0 requests==2.32.3 requests-oauthlib==2.0.0 rich==14.0.0 rsa==4.9.1 safetensors==0.5.3 scikit-learn==1.5.2 scipy==1.15.3 seaborn==0.13.2 wandb==0.18.6 einops==0.8.0 
shellingham==1.5.4 tensorboard-data-server==0.7.2 tensorboard-plugin-wit==1.8.1 tensorboard==2.19.0 threadpoolctl==3.6.0 timm==1.0.15 torchio==0.20.1 torchmetrics==1.7.1 torchvision==0.17.2 torchaudio==2.2.2 tqdm==4.67.0 typer==0.15.4 tzdata==2025.2 wrapt==1.17.2 yarl==1.20.0 aiohappyeyeballs==2.6.1 annotated-types==0.7.0 dill==0.3.8 docker-pycreds==0.4.0 einx==0.3.0 frozendict==2.4.6 gitdb==4.0.12 gitpython==3.1.44 hf-xet==1.1.2 importlib-resources==6.5.2 loguru==0.7.3 multiprocess==0.70.16 propcache==0.3.1 pyarrow==20.0.0 pydantic==2.11.5 pydantic-core==2.33.2 sentry-sdk==2.29.1 setproctitle==1.3.6 smmap==5.0.2 typing-extensions==4.13.2 typing-inspection==0.4.1 xxhash==3.5.0 -# Install packages needed for testing and for listing licenses of installed packages +# Install packages needed for testing RUN python3 -m pip install coverage==7.8.2 mock==5.2.0 +# Install packages needed for listing licenses of installed pip packages RUN python3 -m pip install pip-licenses==5.0.0 prettytable==3.16.0 +# Install packages needed for creating SBOM of apt packages +RUN python3 -m pip install defusedxml==0.7.1 distro2sbom==0.6.0 lib4sbom==0.8.4 semantic-version==2.10.0 # Clean up pip cache RUN python3 -m pip cache purge diff --git a/docker_config/master_template.yml b/docker_config/master_template.yml index e9cfba12..c43e8e25 100644 --- a/docker_config/master_template.yml +++ b/docker_config/master_template.yml @@ -785,7 +785,8 @@ docker_svr_sh: | elif [ ! -z "$LIST_LICENSES" ]; then docker run -it --rm --name=$CONTAINER_NAME \ $DOCKER_IMAGE \ - /bin/bash -c "pip-licenses -s -u --order=license" + /bin/bash -c "pip-licenses -s -u --order=license && distro2sbom -s --format json" + echo "TODO license for pre-trained weights" elif [ ! 
-z "$INTERACTIVE" ]; then docker run --rm -it --detach-keys="ctrl-x" --name=$CONTAINER_NAME \ -v $DIR/..:/startupkit/ -w /startupkit/startup/ \ @@ -794,7 +795,7 @@ docker_svr_sh: | else echo "One of the following options must be passed:" echo "--start_server start the swarm learning server" - echo "--list_licenses list licenses of installed python packages" + echo "--list_licenses list licenses of installed packages" echo "--interactive start the container with an interactive shell (for debugging purposes)" fi From a3ff6e843b2f507f051213285797c066f438c0b2 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 14 Jul 2025 16:55:29 +0200 Subject: [PATCH 058/337] made source more readable --- .../app/custom/data/datasets/dataset_3d_odelia.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/application/jobs/ODELIA_ternary_classification/app/custom/data/datasets/dataset_3d_odelia.py b/application/jobs/ODELIA_ternary_classification/app/custom/data/datasets/dataset_3d_odelia.py index 0b6b27c7..eba4aa12 100644 --- a/application/jobs/ODELIA_ternary_classification/app/custom/data/datasets/dataset_3d_odelia.py +++ b/application/jobs/ODELIA_ternary_classification/app/custom/data/datasets/dataset_3d_odelia.py @@ -62,8 +62,7 @@ def __init__( institutions = [institutions] self.institutions = institutions - flip_axes = (0, 1) if config == "original" else (0, 1, - 2) # Do not flip horizontal axis 2, otherwise labels incorrect + flip_axes = (0, 1) if config == "original" else (0, 1, 2) # Do not flip horizontal axis 2, otherwise labels incorrect if transform is None: self.transform = tio.Compose([ tio.ToCanonical() if config == "original" else tio.Lambda(lambda x: x), From c4829e4029ff952aae126e3be389e1e9b64180fc Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 14 Jul 2025 17:23:26 +0200 Subject: [PATCH 059/337] output what is actually used as number of classes --- .../ODELIA_ternary_classification/app/custom/threedcnn_ptl.py | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py b/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py index 616c7b8a..868cdbf9 100644 --- a/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py +++ b/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py @@ -31,10 +31,11 @@ def set_up_logging(): def set_up_data_module(logger): torch.set_float32_matmul_precision('high') ds_train, ds_val, path_run_dir, run_name = prepare_odelia_dataset() + num_classes = sum(ds_train.class_labels_num) logger.info(f"Dataset path: {ds_train}") logger.info(f"Run directory: {path_run_dir}") logger.info(f"Run name: {run_name}") - logger.info(f"Number of classes: {len(ds_train.labels)}") + logger.info(f"Number of classes: {num_classes}") logger.info(f"Length of train dataset: {len(ds_train)}") logger.info(f"Length of val dataset: {len(ds_val)}") @@ -56,7 +57,6 @@ def set_up_data_module(logger): # logger.info(f"Number of unique labels: {len(distribution['counts'])}") # ------------ Initialize Model ------------ - num_classes = sum(ds_train.class_labels_num) loss_kwargs = {} return dm, path_run_dir, run_name, num_classes, loss_kwargs From 5e4ef3306763a8131cff58296a8c83fe1b7b5785 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 14 Jul 2025 17:23:40 +0200 Subject: [PATCH 060/337] removed misleading comment --- .../ODELIA_ternary_classification/app/custom/threedcnn_ptl.py | 1 - 1 file changed, 1 deletion(-) diff --git a/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py b/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py index 868cdbf9..f41dbdbd 100644 --- a/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py +++ b/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py @@ -56,7 +56,6 @@ def set_up_data_module(logger): # logger.info(f"Label '{label}': {pct:.2f}% of 
training set, Count: {distribution['counts'][label]}") # logger.info(f"Number of unique labels: {len(distribution['counts'])}") - # ------------ Initialize Model ------------ loss_kwargs = {} return dm, path_run_dir, run_name, num_classes, loss_kwargs From 3b6d5ce6a2eff6594c55f51b826db83f94f619a5 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 14 Jul 2025 17:36:38 +0200 Subject: [PATCH 061/337] removed confusing output for now --- .../ODELIA_ternary_classification/app/custom/threedcnn_ptl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py b/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py index f41dbdbd..3b3d815e 100644 --- a/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py +++ b/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py @@ -35,7 +35,7 @@ def set_up_data_module(logger): logger.info(f"Dataset path: {ds_train}") logger.info(f"Run directory: {path_run_dir}") logger.info(f"Run name: {run_name}") - logger.info(f"Number of classes: {num_classes}") + # logger.info(f"Number of classes: {num_classes}") # number of possible classes, not number of classes present, thus misleading logger.info(f"Length of train dataset: {len(ds_train)}") logger.info(f"Length of val dataset: {len(ds_val)}") From 346d21c99a5c7b55bfcf45747fec9d136db91e0d Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 15 Jul 2025 13:49:47 +0200 Subject: [PATCH 062/337] use http for communication also in tests --- application/provision/project_MEVIS_test.yml | 2 +- tests/provision/dummy_project_for_testing.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/application/provision/project_MEVIS_test.yml b/application/provision/project_MEVIS_test.yml index 3a4d1ba7..b4e962c1 100644 --- a/application/provision/project_MEVIS_test.yml +++ b/application/provision/project_MEVIS_test.yml @@ -44,7 +44,7 @@ builders: 
config_folder: config # scheme for communication driver (currently supporting the default, grpc, only). - scheme: grpc + scheme: http # app_validator is used to verify if uploaded app has proper structures # if not set, no app_validator is included in fed_server.json diff --git a/tests/provision/dummy_project_for_testing.yml b/tests/provision/dummy_project_for_testing.yml index 1ab98c45..7e259592 100644 --- a/tests/provision/dummy_project_for_testing.yml +++ b/tests/provision/dummy_project_for_testing.yml @@ -28,7 +28,7 @@ builders: - path: nvflare.lighter.impl.static_file.StaticFileBuilder args: config_folder: config - scheme: grpc + scheme: http docker_image: jefftud/odelia:__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_STARTUP_KITS__ overseer_agent: path: nvflare.ha.dummy_overseer_agent.DummyOverseerAgent From 85bf8dd7860e4977c539503873ed49a7938531cd Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 15 Jul 2025 13:53:16 +0200 Subject: [PATCH 063/337] updated apt package versions login and passwd are not updated, removed --- docker_config/Dockerfile_ODELIA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 919ca771..0491561a 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -15,7 +15,7 @@ RUN apt update RUN apt install -y apt=2.4.14 apt-utils=2.4.14 libapt-pkg6.0=2.4.14 # Update versions of installed packages -RUN apt install -y base-files=12ubuntu4.7 bash=5.1-6ubuntu1.1 bsdutils=1:2.37.2-4ubuntu3.4 ca-certificates=20240203~22.04.1 coreutils=8.32-4.1ubuntu1.2 dpkg=1.21.1ubuntu2.3 e2fsprogs=1.46.5-2ubuntu1.2 gpgv=2.2.27-3ubuntu2.4 libblkid1=2.37.2-4ubuntu3.4 libc-bin=2.35-0ubuntu3.10 libc-dev-bin=2.35-0ubuntu3.10 libc6-dev=2.35-0ubuntu3.10 libc6=2.35-0ubuntu3.10 libcap2=1:2.44-1ubuntu0.22.04.2 libcom-err2=1.46.5-2ubuntu1.2 libext2fs2=1.46.5-2ubuntu1.2 libgnutls30=3.7.3-4ubuntu1.6 libgssapi-krb5-2=1.19.2-2ubuntu0.7 
libk5crypto3=1.19.2-2ubuntu0.7 libkrb5-3=1.19.2-2ubuntu0.7 libkrb5support0=1.19.2-2ubuntu0.7 libmount1=2.37.2-4ubuntu3.4 libpam-modules-bin=1.4.0-11ubuntu2.6 libpam-modules=1.4.0-11ubuntu2.6 libpam-runtime=1.4.0-11ubuntu2.6 libpam0g=1.4.0-11ubuntu2.6 libseccomp2=2.5.3-2ubuntu3~22.04.1 libsmartcols1=2.37.2-4ubuntu3.4 libss2=1.46.5-2ubuntu1.2 libssl3=3.0.2-0ubuntu1.19 libsystemd0=249.11-0ubuntu3.16 libtasn1-6=4.18.0-4ubuntu0.1 libudev1=249.11-0ubuntu3.16 libuuid1=2.37.2-4ubuntu3.4 linux-libc-dev=5.15.0-143.153 login logsave=1.46.5-2ubuntu1.2 mount=2.37.2-4ubuntu3.4 openssl=3.0.2-0ubuntu1.19 passwd util-linux=2.37.2-4ubuntu3.4 +RUN apt install -y base-files=12ubuntu4.7 bash=5.1-6ubuntu1.1 bsdutils=1:2.37.2-4ubuntu3.4 ca-certificates=20240203~22.04.1 coreutils=8.32-4.1ubuntu1.2 dpkg=1.21.1ubuntu2.3 e2fsprogs=1.46.5-2ubuntu1.2 gpgv=2.2.27-3ubuntu2.4 libblkid1=2.37.2-4ubuntu3.4 libc-bin=2.35-0ubuntu3.10 libc-dev-bin=2.35-0ubuntu3.10 libc6-dev=2.35-0ubuntu3.10 libc6=2.35-0ubuntu3.10 libcap2=1:2.44-1ubuntu0.22.04.2 libcom-err2=1.46.5-2ubuntu1.2 libext2fs2=1.46.5-2ubuntu1.2 libgnutls30=3.7.3-4ubuntu1.7 libgssapi-krb5-2=1.19.2-2ubuntu0.7 libk5crypto3=1.19.2-2ubuntu0.7 libkrb5-3=1.19.2-2ubuntu0.7 libkrb5support0=1.19.2-2ubuntu0.7 libmount1=2.37.2-4ubuntu3.4 libpam-modules-bin=1.4.0-11ubuntu2.6 libpam-modules=1.4.0-11ubuntu2.6 libpam-runtime=1.4.0-11ubuntu2.6 libpam0g=1.4.0-11ubuntu2.6 libseccomp2=2.5.3-2ubuntu3~22.04.1 libsmartcols1=2.37.2-4ubuntu3.4 libss2=1.46.5-2ubuntu1.2 libssl3=3.0.2-0ubuntu1.19 libsystemd0=249.11-0ubuntu3.16 libtasn1-6=4.18.0-4ubuntu0.1 libudev1=249.11-0ubuntu3.16 libuuid1=2.37.2-4ubuntu3.4 linux-libc-dev=5.15.0-143.153 logsave=1.46.5-2ubuntu1.2 mount=2.37.2-4ubuntu3.4 openssl=3.0.2-0ubuntu1.19 util-linux=2.37.2-4ubuntu3.4 # Install apt-transport-https curl gnupg lsb-release zip and dependencies at defined versions RUN apt install -y apt-transport-https=2.4.14 curl=7.81.0-1ubuntu1.20 dirmngr=2.2.27-3ubuntu2.4 distro-info-data=0.52ubuntu0.9 
gnupg-l10n=2.2.27-3ubuntu2.4 gnupg-utils=2.2.27-3ubuntu2.4 gnupg=2.2.27-3ubuntu2.4 gpg-agent=2.2.27-3ubuntu2.4 gpg-wks-client=2.2.27-3ubuntu2.4 gpg-wks-server=2.2.27-3ubuntu2.4 gpg=2.2.27-3ubuntu2.4 gpgconf=2.2.27-3ubuntu2.4 gpgsm=2.2.27-3ubuntu2.4 libassuan0=2.5.5-1build1 libbrotli1=1.0.9-2build6 libcurl4=7.81.0-1ubuntu1.20 libexpat1=2.4.7-1ubuntu0.6 libksba8=1.6.0-2ubuntu0.2 libldap-2.5-0=2.5.19+dfsg-0ubuntu0.22.04.1 libldap-common=2.5.19+dfsg-0ubuntu0.22.04.1 libmpdec3=2.5.1-2build2 libnghttp2-14=1.43.0-1ubuntu0.2 libnpth0=1.6-3build2 libpsl5=0.21.0-1.2build2 libpython3-stdlib=3.10.6-1~22.04.1 libpython3.10-minimal=3.10.12-1~22.04.10 libpython3.10-stdlib=3.10.12-1~22.04.10 libreadline8=8.1.2-1 librtmp1=2.4+20151223.gitfa8646d.1-2build4 libsasl2-2=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules-db=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules=2.1.27+dfsg2-3ubuntu1.2 libsqlite3-0=3.37.2-2ubuntu0.4 libssh-4=0.9.6-2ubuntu0.22.04.4 lsb-release=11.1.0ubuntu4 media-types=7.0.0 pinentry-curses=1.1.1-1build2 publicsuffix=20211207.1025-1 python3-minimal=3.10.6-1~22.04.1 python3.10-minimal=3.10.12-1~22.04.10 python3.10=3.10.12-1~22.04.10 python3=3.10.6-1~22.04.1 readline-common=8.1.2-1 unzip=6.0-26ubuntu3.2 zip=3.0-12build2 From 9509e132b59509c20b6f77bc00edc3350c6f263a Mon Sep 17 00:00:00 2001 From: GitHub CI Date: Wed, 16 Jul 2025 10:45:13 +0200 Subject: [PATCH 064/337] chore: update APT version update workflow with permissions and branch condition Signed-off-by: GitHub CI --- .github/workflows/update-apt-versions.yml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/workflows/update-apt-versions.yml b/.github/workflows/update-apt-versions.yml index 4ac7ae08..890a5cb1 100644 --- a/.github/workflows/update-apt-versions.yml +++ b/.github/workflows/update-apt-versions.yml @@ -1,5 +1,9 @@ name: Auto Update APT Versions (Self-hosted) +permissions: + contents: read + pull-requests: write + on: schedule: # run eveyday at 04:00 UTC @@ -8,6 +12,7 @@ on: 
jobs: update-apt: + if: github.ref == 'refs/heads/ci/apt-update' runs-on: self-hosted timeout-minutes: 60 @@ -23,10 +28,6 @@ jobs: git config --global user.email "ci@github.com" git config --global user.name "GitHub CI" - - name: Create and switch to apt-update branch - run: | - git checkout -b ci/apt-update || git switch ci/apt-update - - name: Run APT update script run: | chmod +x scripts/ci/update_apt_versions.sh @@ -35,7 +36,7 @@ jobs: - name: Show git diff for debugging run: git diff || true - - name: Push apt-update branch + - name: Push updated branch if: env.NO_CHANGES == 'false' run: git push origin ci/apt-update --force From 064e53fa218aa35ea5b9efdf01e01c5c93061f1a Mon Sep 17 00:00:00 2001 From: GitHub CI Date: Wed, 16 Jul 2025 10:55:20 +0200 Subject: [PATCH 065/337] chore: refine APT update workflow by removing branch condition and enhancing branch handling Signed-off-by: GitHub CI --- .github/workflows/update-apt-versions.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/update-apt-versions.yml b/.github/workflows/update-apt-versions.yml index 890a5cb1..6ace3792 100644 --- a/.github/workflows/update-apt-versions.yml +++ b/.github/workflows/update-apt-versions.yml @@ -12,7 +12,6 @@ on: jobs: update-apt: - if: github.ref == 'refs/heads/ci/apt-update' runs-on: self-hosted timeout-minutes: 60 @@ -27,16 +26,17 @@ jobs: run: | git config --global user.email "ci@github.com" git config --global user.name "GitHub CI" - + - name: Create and switch to apt-update branch + run: | + git checkout -b ci/apt-update || git switch ci/apt-update - name: Run APT update script run: | chmod +x scripts/ci/update_apt_versions.sh scripts/ci/update_apt_versions.sh - - name: Show git diff for debugging run: git diff || true - - name: Push updated branch + - name: Push apt-update branch if: env.NO_CHANGES == 'false' run: git push origin ci/apt-update --force @@ -52,4 +52,4 @@ jobs: This PR automatically updates APT package version 
numbers in `Dockerfile_ODELIA` based on a rebuild and inspection of installation logs. base: main - delete-branch: false + delete-branch: false \ No newline at end of file From dc40756cf8386e28ec65c25eb2bbe1c3dad2ecc1 Mon Sep 17 00:00:00 2001 From: GitHub CI Date: Wed, 16 Jul 2025 11:09:05 +0200 Subject: [PATCH 066/337] chore: update APT version update workflow with permissions and branch condition Signed-off-by: GitHub CI --- .github/workflows/update-apt-versions.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/.github/workflows/update-apt-versions.yml b/.github/workflows/update-apt-versions.yml index b9c0d79b..8e4eedc8 100644 --- a/.github/workflows/update-apt-versions.yml +++ b/.github/workflows/update-apt-versions.yml @@ -1,9 +1,5 @@ name: Auto Update APT Versions (Self-hosted) -permissions: - contents: read - pull-requests: write - on: schedule: # run eveyday at 04:00 UTC @@ -26,10 +22,12 @@ jobs: run: | git config --global user.email "ci@github.com" git config --global user.name "GitHub CI" + - name: Create and switch to apt-update branch run: | git checkout -b ci/apt-update || git switch ci/apt-update - name: Run APT update script + run: | chmod +x scripts/ci/update_apt_versions.sh scripts/ci/update_apt_versions.sh From d81e38b0fd6e04c0e4fc53c2a2f2c1911bb81515 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 16 Jul 2025 17:24:23 +0200 Subject: [PATCH 067/337] allow listing licenses for admin node --- docker_config/master_template.yml | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/docker_config/master_template.yml b/docker_config/master_template.yml index c43e8e25..17434cdd 100644 --- a/docker_config/master_template.yml +++ b/docker_config/master_template.yml @@ -804,7 +804,8 @@ docker_adm_sh: | while [[ "$#" -gt 0 ]]; do case $1 in - --no_pull) NOPULL="1";; + --no_pull) NOPULL="1";; + --list_licenses) LIST_LICENSES="1";; *) echo "Unknown parameter passed: $1"; exit 1 ;; esac shift @@ -821,6 +822,13 
@@ docker_adm_sh: | fi CONTAINER_NAME=odelia_swarm_admin + if [ ! -z "$LIST_LICENSES" ]; then + docker run -it --rm --name=$CONTAINER_NAME $DOCKER_IMAGE \ + /bin/bash -c "pip-licenses -s -u --order=license && distro2sbom -s --format json" + echo "TODO license for pre-trained weights" + exit 0 + fi + echo "Starting docker with $DOCKER_IMAGE as $CONTAINER_NAME" docker run --rm -it --name=fladmin -v $DIR/../local/:/fl_admin/local/ -v $DIR/../startup/:/fl_admin/startup/ -w /fl_admin/startup/ $NETARG $DOCKER_IMAGE /bin/bash -c "./fl_admin.sh" From ed7789e02ccc4279d8da9a94206d71d5f3e3eddd Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 16 Jul 2025 17:36:47 +0200 Subject: [PATCH 068/337] allow listing licenses for client node --- docker_config/master_template.yml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/docker_config/master_template.yml b/docker_config/master_template.yml index 17434cdd..0d884098 100644 --- a/docker_config/master_template.yml +++ b/docker_config/master_template.yml @@ -640,6 +640,7 @@ docker_cln_sh: | --preflight_check) PREFLIGHT_CHECK="1" ;; --local_training) LOCAL_TRAINING="1" ;; --start_client) START_CLIENT="1" ;; + --list_licenses) LIST_LICENSES="1";; --interactive) INTERACTIVE="1" ;; --run_script) SCRIPT_TO_RUN="$2"; shift ;; *) echo "Unknown parameter passed: $1"; exit 1 ;; @@ -648,7 +649,7 @@ docker_cln_sh: | done # Prompt for parameters if missing - if [[ -z "$DUMMY_TRAINING" && -z "$MY_DATA_DIR" ]]; then + if [[ -z "$DUMMY_TRAINING" && -z "$LIST_LICENSES" && -z "$MY_DATA_DIR" ]]; then read -p "Enter the path to your data directory (default: /home/flclient/data): " user_data_dir : ${MY_DATA_DIR:="${user_data_dir:-/home/flclient/data}"} fi @@ -723,6 +724,11 @@ docker_cln_sh: | docker run -d -t --rm $DOCKER_OPTIONS $ENV_VARS --env TRAINING_MODE=swarm $DOCKER_IMAGE \ /bin/bash -c "nohup ./start.sh >> nohup.out 2>&1 && /bin/bash" + elif [ ! 
-z "$LIST_LICENSES" ]; then + docker run -it --rm --name=$CONTAINER_NAME $DOCKER_IMAGE \ + /bin/bash -c "pip-licenses -s -u --order=license && distro2sbom -s --format json" + echo "TODO license for pre-trained weights" + elif [[ ! -z "$INTERACTIVE" ]]; then docker run --rm $TTY_OPT --detach-keys="ctrl-x" $DOCKER_OPTIONS $DOCKER_IMAGE /bin/bash @@ -736,6 +742,7 @@ docker_cln_sh: | echo "--preflight_check verify data access & local training" echo "--local_training train a local model" echo "--start_client launch FL client in swarm mode" + echo "--list_licenses list licenses of installed packages" echo "--interactive drop into interactive container (for debugging)" echo "--run_script execute script in container (for testing)" exit 1 From 5c9583e24051d891325342646b2af6f81d1c574c Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 16 Jul 2025 17:54:50 +0200 Subject: [PATCH 069/337] added test whether licenses can be listed from all types of startup kits --- runTestsInDocker.sh | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/runTestsInDocker.sh b/runTestsInDocker.sh index 329ee6cf..608fdf1c 100755 --- a/runTestsInDocker.sh +++ b/runTestsInDocker.sh @@ -67,18 +67,38 @@ cleanup_dummy_trainings () { rm -rf "$PROJECT_DIR" } +check_license_listings () { + cd "$CWD"/"$PROJECT_DIR/prod_00/admin@test.odelia/startup" + if ! $( ./docker.sh --no_pull --list_licenses 2>&1 | grep -q MIT ); then + echo "could not list licenses from admin startup kit" + exit 1 + fi + cd "$CWD"/"$PROJECT_DIR/prod_00/server.local/startup/" + if ! $( ./docker.sh --no_pull --list_licenses 2>&1 | grep -q MIT ); then + echo "could not list licenses from server startup kit" + exit 1 + fi + cd "$CWD"/"$PROJECT_DIR/prod_00/client_A/startup/" + if ! 
$( ./docker.sh --data_dir /tmp/ --scratch_dir /tmp/scratch --GPU "$GPU_FOR_TESTING" --no_pull --list_licenses 2>&1 | grep -q MIT ); then + echo "could not list licenses from client startup kit" + exit 1 + fi +} + case "$1" in run_tests) run_tests ;; prepare_dummy_trainings) prepare_dummy_trainings ;; run_dummy_training) run_dummy_training ;; run_3dcnn_tests) run_3dcnn_tests ;; cleanup) cleanup_dummy_trainings ;; + check_license_listings) check_license_listings;; all | "") run_tests prepare_dummy_trainings run_dummy_training run_3dcnn_tests cleanup_dummy_trainings + check_license_listings ;; *) echo "Unknown argument: $1"; exit 1 ;; esac From a023eb213797438939cfda5f55954287b9cf439b Mon Sep 17 00:00:00 2001 From: Ultimate-Storm Date: Mon, 21 Jul 2025 06:28:04 +0200 Subject: [PATCH 070/337] chore: update apt versions in Dockerfile_ODELIA --- docker_config/Dockerfile_ODELIA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 0491561a..47d05edf 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -15,7 +15,7 @@ RUN apt update RUN apt install -y apt=2.4.14 apt-utils=2.4.14 libapt-pkg6.0=2.4.14 # Update versions of installed packages -RUN apt install -y base-files=12ubuntu4.7 bash=5.1-6ubuntu1.1 bsdutils=1:2.37.2-4ubuntu3.4 ca-certificates=20240203~22.04.1 coreutils=8.32-4.1ubuntu1.2 dpkg=1.21.1ubuntu2.3 e2fsprogs=1.46.5-2ubuntu1.2 gpgv=2.2.27-3ubuntu2.4 libblkid1=2.37.2-4ubuntu3.4 libc-bin=2.35-0ubuntu3.10 libc-dev-bin=2.35-0ubuntu3.10 libc6-dev=2.35-0ubuntu3.10 libc6=2.35-0ubuntu3.10 libcap2=1:2.44-1ubuntu0.22.04.2 libcom-err2=1.46.5-2ubuntu1.2 libext2fs2=1.46.5-2ubuntu1.2 libgnutls30=3.7.3-4ubuntu1.7 libgssapi-krb5-2=1.19.2-2ubuntu0.7 libk5crypto3=1.19.2-2ubuntu0.7 libkrb5-3=1.19.2-2ubuntu0.7 libkrb5support0=1.19.2-2ubuntu0.7 libmount1=2.37.2-4ubuntu3.4 libpam-modules-bin=1.4.0-11ubuntu2.6 libpam-modules=1.4.0-11ubuntu2.6 
libpam-runtime=1.4.0-11ubuntu2.6 libpam0g=1.4.0-11ubuntu2.6 libseccomp2=2.5.3-2ubuntu3~22.04.1 libsmartcols1=2.37.2-4ubuntu3.4 libss2=1.46.5-2ubuntu1.2 libssl3=3.0.2-0ubuntu1.19 libsystemd0=249.11-0ubuntu3.16 libtasn1-6=4.18.0-4ubuntu0.1 libudev1=249.11-0ubuntu3.16 libuuid1=2.37.2-4ubuntu3.4 linux-libc-dev=5.15.0-143.153 logsave=1.46.5-2ubuntu1.2 mount=2.37.2-4ubuntu3.4 openssl=3.0.2-0ubuntu1.19 util-linux=2.37.2-4ubuntu3.4 +RUN apt install -y base-files=12ubuntu4.7 bash=5.1-6ubuntu1.1 bsdutils=1:2.37.2-4ubuntu3.4 ca-certificates=20240203~22.04.1 coreutils=8.32-4.1ubuntu1.2 dpkg=1.21.1ubuntu2.3 e2fsprogs=1.46.5-2ubuntu1.2 gpgv=2.2.27-3ubuntu2.4 libblkid1=2.37.2-4ubuntu3.4 libc-bin=2.35-0ubuntu3.10 libc-dev-bin=2.35-0ubuntu3.10 libc6-dev=2.35-0ubuntu3.10 libc6=2.35-0ubuntu3.10 libcap2=1:2.44-1ubuntu0.22.04.2 libcom-err2=1.46.5-2ubuntu1.2 libext2fs2=1.46.5-2ubuntu1.2 libgnutls30=3.7.3-4ubuntu1.7 libgssapi-krb5-2=1.19.2-2ubuntu0.7 libk5crypto3=1.19.2-2ubuntu0.7 libkrb5-3=1.19.2-2ubuntu0.7 libkrb5support0=1.19.2-2ubuntu0.7 libmount1=2.37.2-4ubuntu3.4 libpam-modules-bin=1.4.0-11ubuntu2.6 libpam-modules=1.4.0-11ubuntu2.6 libpam-runtime=1.4.0-11ubuntu2.6 libpam0g=1.4.0-11ubuntu2.6 libseccomp2=2.5.3-2ubuntu3~22.04.1 libsmartcols1=2.37.2-4ubuntu3.4 libss2=1.46.5-2ubuntu1.2 libssl3=3.0.2-0ubuntu1.19 libsystemd0=249.11-0ubuntu3.16 libtasn1-6=4.18.0-4ubuntu0.1 libudev1=249.11-0ubuntu3.16 libuuid1=2.37.2-4ubuntu3.4 linux-libc-dev=5.15.0-144.157 logsave=1.46.5-2ubuntu1.2 mount=2.37.2-4ubuntu3.4 openssl=3.0.2-0ubuntu1.19 util-linux=2.37.2-4ubuntu3.4 # Install apt-transport-https curl gnupg lsb-release zip and dependencies at defined versions RUN apt install -y apt-transport-https=2.4.14 curl=7.81.0-1ubuntu1.20 dirmngr=2.2.27-3ubuntu2.4 distro-info-data=0.52ubuntu0.9 gnupg-l10n=2.2.27-3ubuntu2.4 gnupg-utils=2.2.27-3ubuntu2.4 gnupg=2.2.27-3ubuntu2.4 gpg-agent=2.2.27-3ubuntu2.4 gpg-wks-client=2.2.27-3ubuntu2.4 gpg-wks-server=2.2.27-3ubuntu2.4 gpg=2.2.27-3ubuntu2.4 
gpgconf=2.2.27-3ubuntu2.4 gpgsm=2.2.27-3ubuntu2.4 libassuan0=2.5.5-1build1 libbrotli1=1.0.9-2build6 libcurl4=7.81.0-1ubuntu1.20 libexpat1=2.4.7-1ubuntu0.6 libksba8=1.6.0-2ubuntu0.2 libldap-2.5-0=2.5.19+dfsg-0ubuntu0.22.04.1 libldap-common=2.5.19+dfsg-0ubuntu0.22.04.1 libmpdec3=2.5.1-2build2 libnghttp2-14=1.43.0-1ubuntu0.2 libnpth0=1.6-3build2 libpsl5=0.21.0-1.2build2 libpython3-stdlib=3.10.6-1~22.04.1 libpython3.10-minimal=3.10.12-1~22.04.10 libpython3.10-stdlib=3.10.12-1~22.04.10 libreadline8=8.1.2-1 librtmp1=2.4+20151223.gitfa8646d.1-2build4 libsasl2-2=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules-db=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules=2.1.27+dfsg2-3ubuntu1.2 libsqlite3-0=3.37.2-2ubuntu0.4 libssh-4=0.9.6-2ubuntu0.22.04.4 lsb-release=11.1.0ubuntu4 media-types=7.0.0 pinentry-curses=1.1.1-1build2 publicsuffix=20211207.1025-1 python3-minimal=3.10.6-1~22.04.1 python3.10-minimal=3.10.12-1~22.04.10 python3.10=3.10.12-1~22.04.10 python3=3.10.6-1~22.04.1 readline-common=8.1.2-1 unzip=6.0-26ubuntu3.2 zip=3.0-12build2 From 17ffb1f2b83c636a345f8c50e69c98676eca7d1a Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 21 Jul 2025 15:27:26 +0200 Subject: [PATCH 071/337] create scratch dir only at user-writable location, avoid need for sudo password --- docker_config/master_template.yml | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/docker_config/master_template.yml b/docker_config/master_template.yml index e9cfba12..bb2f99de 100644 --- a/docker_config/master_template.yml +++ b/docker_config/master_template.yml @@ -666,16 +666,8 @@ docker_cln_sh: | # Resolve script directory DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" - if [ -t 1 ]; then - # Local only - sudo mkdir -p "$MY_SCRATCH_DIR" - sudo chown -R $(id -u):$(id -g) "$MY_SCRATCH_DIR" - sudo chmod -R 777 "$MY_SCRATCH_DIR" - else - mkdir -p "$MY_SCRATCH_DIR" - chmod -R 777 "$MY_SCRATCH_DIR" - fi - + mkdir -p "$MY_SCRATCH_DIR" + chmod -R 777 "$MY_SCRATCH_DIR" # 
Networking & Cleanup NETARG="--net=host" From a4c194d07df83e9e7e6d3dab5d7522af6d42c047 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 22 Jul 2025 10:42:07 +0200 Subject: [PATCH 072/337] ignore openvpn configuration files at recommended location --- assets/openvpn_configs/good_access/.gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 assets/openvpn_configs/good_access/.gitignore diff --git a/assets/openvpn_configs/good_access/.gitignore b/assets/openvpn_configs/good_access/.gitignore new file mode 100644 index 00000000..2e66e21c --- /dev/null +++ b/assets/openvpn_configs/good_access/.gitignore @@ -0,0 +1 @@ +*.ovpn \ No newline at end of file From f0f710176739a14183baf01165d24c1db30b994e Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 22 Jul 2025 10:48:53 +0200 Subject: [PATCH 073/337] log mediswarm version in training (passed to the Docker container as an environment variable) --- .../ODELIA_ternary_classification/app/custom/env_config.py | 3 ++- .../ODELIA_ternary_classification/app/custom/threedcnn_ptl.py | 1 + buildDockerImageAndStartupKits.sh | 1 + docker_config/master_template.yml | 3 ++- 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/application/jobs/ODELIA_ternary_classification/app/custom/env_config.py b/application/jobs/ODELIA_ternary_classification/app/custom/env_config.py index d624e144..93efb091 100755 --- a/application/jobs/ODELIA_ternary_classification/app/custom/env_config.py +++ b/application/jobs/ODELIA_ternary_classification/app/custom/env_config.py @@ -16,7 +16,8 @@ def load_environment_variables(): 'use_adaptive_sync': os.environ.get('USE_ADAPTIVE_SYNC', 'False').lower() == 'true', 'sync_frequency': int(os.environ.get('SYNC_FREQUENCY', 1024)), 'model_name': os.environ.get('MODEL_NAME', 'ResNet101'), - 'prediction_flag': os.environ.get('PREDICT_FLAG', 'ext') + 'prediction_flag': os.environ.get('PREDICT_FLAG', 'ext'), + 'mediswarm_version': os.environ.get('MEDISWARM_VERSION', 'unset'), } diff --git 
a/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py b/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py index 3b3d815e..ad291652 100644 --- a/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py +++ b/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py @@ -78,6 +78,7 @@ def prepare_training(logger, max_epochs: int, site_name: str): if not torch.cuda.is_available(): raise RuntimeError("This example requires a GPU") + logger.info(f"Running code version {env_vars['mediswarm_version']}") logger.info(f"Using GPU for training") model_name = env_vars['model_name'] diff --git a/buildDockerImageAndStartupKits.sh b/buildDockerImageAndStartupKits.sh index e1d582c8..51b07f2c 100755 --- a/buildDockerImageAndStartupKits.sh +++ b/buildDockerImageAndStartupKits.sh @@ -41,6 +41,7 @@ git clean -x -q -f . cd ../.. rm .git -rf chmod a+rX . -R +sed -i 's#__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_DOCKER_IMAGE__#'$VERSION'#' docker_config/master_template.yml cd $CWD diff --git a/docker_config/master_template.yml b/docker_config/master_template.yml index bb2f99de..0a2306db 100644 --- a/docker_config/master_template.yml +++ b/docker_config/master_template.yml @@ -696,7 +696,8 @@ docker_cln_sh: | --env TORCH_HOME=/torch_home \ --env GPU_DEVICE=$GPU2USE \ --env MODEL_NAME=MST \ - --env CONFIG=unilateral" + --env CONFIG=unilateral \ + --env MEDISWARM_VERSION=__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_DOCKER_IMAGE__" # Execution modes if [[ ! 
-z "$DUMMY_TRAINING" ]]; then From d0a893c0306e6884bbe38ec439d5d313625336e1 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 22 Jul 2025 13:04:09 +0200 Subject: [PATCH 074/337] try ResNet 10 (pretrained; downloaded from within container) --- .../app/config/config_fed_client.conf | 3 ++- docker_config/Dockerfile_ODELIA | 2 ++ docker_config/master_template.yml | 3 ++- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf b/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf index 42ac6ebb..e5da6b6c 100644 --- a/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf +++ b/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf @@ -82,11 +82,12 @@ path = "nvflare.app_opt.pt.file_model_persistor.PTFileModelPersistor" args { model { - path = "models.mst.MST" + path = "models.resnet.ResNet" args { n_input_channels = 1 num_classes = 3 spatial_dims = 3 + resnet_variant = 10 } } } diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 47d05edf..acef6840 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -72,3 +72,5 @@ RUN ln -s /MediSwarm /fl_admin/transfer/MediSwarm # Copy pre-trained model weights to image COPY ./torch_home_cache /torch_home +RUN mkdir /huggingface_home +RUN chmod a+rwx /huggingface_home diff --git a/docker_config/master_template.yml b/docker_config/master_template.yml index 0a2306db..99f06c65 100644 --- a/docker_config/master_template.yml +++ b/docker_config/master_template.yml @@ -694,8 +694,9 @@ docker_cln_sh: | --env DATA_DIR=/data \ --env SCRATCH_DIR=/scratch \ --env TORCH_HOME=/torch_home \ + --env HF_HOME=/huggingface_home \ --env GPU_DEVICE=$GPU2USE \ - --env MODEL_NAME=MST \ + --env MODEL_NAME=ResNet10 \ --env CONFIG=unilateral \ --env 
MEDISWARM_VERSION=__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_DOCKER_IMAGE__" From 0ed0b3a4946d7a0a9b50241a88d041d93d76148d Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 22 Jul 2025 13:24:27 +0200 Subject: [PATCH 075/337] try ResNet 18 (pretrained; downloaded from within container) --- .../app/config/config_fed_client.conf | 2 +- .../ODELIA_ternary_classification/app/custom/models/resnet.py | 2 +- docker_config/master_template.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf b/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf index e5da6b6c..c20582aa 100644 --- a/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf +++ b/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf @@ -87,7 +87,7 @@ n_input_channels = 1 num_classes = 3 spatial_dims = 3 - resnet_variant = 10 + resnet_variant = 18 } } } diff --git a/application/jobs/ODELIA_ternary_classification/app/custom/models/resnet.py b/application/jobs/ODELIA_ternary_classification/app/custom/models/resnet.py index dfc21a3c..d4c74a79 100644 --- a/application/jobs/ODELIA_ternary_classification/app/custom/models/resnet.py +++ b/application/jobs/ODELIA_ternary_classification/app/custom/models/resnet.py @@ -21,7 +21,7 @@ def __init__(self, n_input_channels: int, num_classes: int, spatial_dims: int, r raise ValueError(f"Unsupported ResNet model number: {resnet_variant}") self.model = Model(n_input_channels=n_input_channels, spatial_dims=spatial_dims, num_classes=num_classes, - feed_forward=False, bias_downsample=False, pretrained=True) + feed_forward=False, shortcut_type='A', bias_downsample=True, pretrained=True) self.model.fc = nn.Linear(512, num_classes) # TODO can we get the number of channels from the ResNet rather than using a hard-coded value only confirmed to work with ResNet 10 and 18? 
diff --git a/docker_config/master_template.yml b/docker_config/master_template.yml index 99f06c65..c7a46d17 100644 --- a/docker_config/master_template.yml +++ b/docker_config/master_template.yml @@ -696,7 +696,7 @@ docker_cln_sh: | --env TORCH_HOME=/torch_home \ --env HF_HOME=/huggingface_home \ --env GPU_DEVICE=$GPU2USE \ - --env MODEL_NAME=ResNet10 \ + --env MODEL_NAME=ResNet18 \ --env CONFIG=unilateral \ --env MEDISWARM_VERSION=__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_DOCKER_IMAGE__" From 7c248a37158779cd91e7e4c5c54b0622881dd0ab Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 22 Jul 2025 15:45:51 +0200 Subject: [PATCH 076/337] try ResNet 34 (pretrained; downloaded from within container) --- .../app/config/config_fed_client.conf | 2 +- docker_config/master_template.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf b/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf index c20582aa..28a9789a 100644 --- a/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf +++ b/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf @@ -87,7 +87,7 @@ n_input_channels = 1 num_classes = 3 spatial_dims = 3 - resnet_variant = 18 + resnet_variant = 34 } } } diff --git a/docker_config/master_template.yml b/docker_config/master_template.yml index c7a46d17..275442cb 100644 --- a/docker_config/master_template.yml +++ b/docker_config/master_template.yml @@ -696,7 +696,7 @@ docker_cln_sh: | --env TORCH_HOME=/torch_home \ --env HF_HOME=/huggingface_home \ --env GPU_DEVICE=$GPU2USE \ - --env MODEL_NAME=ResNet18 \ + --env MODEL_NAME=ResNet34 \ --env CONFIG=unilateral \ --env MEDISWARM_VERSION=__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_DOCKER_IMAGE__" From 1524a6af8e668c812b0a0d37bf58b45766f64a84 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 22 Jul 2025 15:47:45 +0200 Subject: 
[PATCH 077/337] try ResNet 50 (pretrained; downloaded from within container) --- .../app/config/config_fed_client.conf | 2 +- .../ODELIA_ternary_classification/app/custom/models/resnet.py | 4 ++-- docker_config/master_template.yml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf b/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf index 28a9789a..a93bce2f 100644 --- a/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf +++ b/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf @@ -87,7 +87,7 @@ n_input_channels = 1 num_classes = 3 spatial_dims = 3 - resnet_variant = 34 + resnet_variant = 50 } } } diff --git a/application/jobs/ODELIA_ternary_classification/app/custom/models/resnet.py b/application/jobs/ODELIA_ternary_classification/app/custom/models/resnet.py index d4c74a79..bfc35c86 100644 --- a/application/jobs/ODELIA_ternary_classification/app/custom/models/resnet.py +++ b/application/jobs/ODELIA_ternary_classification/app/custom/models/resnet.py @@ -21,8 +21,8 @@ def __init__(self, n_input_channels: int, num_classes: int, spatial_dims: int, r raise ValueError(f"Unsupported ResNet model number: {resnet_variant}") self.model = Model(n_input_channels=n_input_channels, spatial_dims=spatial_dims, num_classes=num_classes, - feed_forward=False, shortcut_type='A', bias_downsample=True, pretrained=True) - self.model.fc = nn.Linear(512, + feed_forward=False, shortcut_type='B', bias_downsample=False, pretrained=True) + self.model.fc = nn.Linear(2048, num_classes) # TODO can we get the number of channels from the ResNet rather than using a hard-coded value only confirmed to work with ResNet 10 and 18? 
def forward(self, x): diff --git a/docker_config/master_template.yml b/docker_config/master_template.yml index 275442cb..e0bfffab 100644 --- a/docker_config/master_template.yml +++ b/docker_config/master_template.yml @@ -696,7 +696,7 @@ docker_cln_sh: | --env TORCH_HOME=/torch_home \ --env HF_HOME=/huggingface_home \ --env GPU_DEVICE=$GPU2USE \ - --env MODEL_NAME=ResNet34 \ + --env MODEL_NAME=ResNet50 \ --env CONFIG=unilateral \ --env MEDISWARM_VERSION=__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_DOCKER_IMAGE__" From dd83ca1c86826a0c0979a784906307e8f59c28a2 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 22 Jul 2025 16:02:44 +0200 Subject: [PATCH 078/337] try ResNet 101 (pretrained; downloaded from within container) --- .../app/config/config_fed_client.conf | 2 +- docker_config/master_template.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf b/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf index a93bce2f..fccf5379 100644 --- a/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf +++ b/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf @@ -87,7 +87,7 @@ n_input_channels = 1 num_classes = 3 spatial_dims = 3 - resnet_variant = 50 + resnet_variant = 101 } } } diff --git a/docker_config/master_template.yml b/docker_config/master_template.yml index e0bfffab..fba835dd 100644 --- a/docker_config/master_template.yml +++ b/docker_config/master_template.yml @@ -696,7 +696,7 @@ docker_cln_sh: | --env TORCH_HOME=/torch_home \ --env HF_HOME=/huggingface_home \ --env GPU_DEVICE=$GPU2USE \ - --env MODEL_NAME=ResNet50 \ + --env MODEL_NAME=ResNet101 \ --env CONFIG=unilateral \ --env MEDISWARM_VERSION=__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_DOCKER_IMAGE__" From 22bbb7243c3726fd2eb5004db9cd561a2b5e5982 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 22 Jul 2025 
16:07:02 +0200 Subject: [PATCH 079/337] try ResNet 152 (pretrained; downloaded from within container) --- .../app/config/config_fed_client.conf | 2 +- docker_config/master_template.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf b/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf index fccf5379..875ff4b7 100644 --- a/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf +++ b/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf @@ -87,7 +87,7 @@ n_input_channels = 1 num_classes = 3 spatial_dims = 3 - resnet_variant = 101 + resnet_variant = 152 } } } diff --git a/docker_config/master_template.yml b/docker_config/master_template.yml index fba835dd..2c68af0c 100644 --- a/docker_config/master_template.yml +++ b/docker_config/master_template.yml @@ -696,7 +696,7 @@ docker_cln_sh: | --env TORCH_HOME=/torch_home \ --env HF_HOME=/huggingface_home \ --env GPU_DEVICE=$GPU2USE \ - --env MODEL_NAME=ResNet101 \ + --env MODEL_NAME=ResNet152 \ --env CONFIG=unilateral \ --env MEDISWARM_VERSION=__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_DOCKER_IMAGE__" From 0c6a27d98ff74e1227404b1f87745c00642911e0 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 22 Jul 2025 16:17:43 +0200 Subject: [PATCH 080/337] set parameters for pre-trained ResNet depending on variant rather than hard-code them --- .../app/custom/models/resnet.py | 33 +++++++++++++++++-- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/application/jobs/ODELIA_ternary_classification/app/custom/models/resnet.py b/application/jobs/ODELIA_ternary_classification/app/custom/models/resnet.py index bfc35c86..49503b26 100644 --- a/application/jobs/ODELIA_ternary_classification/app/custom/models/resnet.py +++ b/application/jobs/ODELIA_ternary_classification/app/custom/models/resnet.py @@ -20,10 +20,37 @@ def __init__(self, 
n_input_channels: int, num_classes: int, spatial_dims: int, r if Model is None: raise ValueError(f"Unsupported ResNet model number: {resnet_variant}") + shortcut_type = { + 10: 'B', + 18: 'A', + 34: 'A', + 50: 'B', + 101: 'B', + 152: 'B', + }.get(resnet_variant) + + bias_downsample = { + 10: False, + 18: True, + 34: True, + 50: False, + 101: False, + 152: False, + }.get(resnet_variant) + + num_channels = { + 10: 512, + 18: 512, + 34: 512, + 50: 2048, + 101: 2048, + 152: 2048, + }.get(resnet_variant) + self.model = Model(n_input_channels=n_input_channels, spatial_dims=spatial_dims, num_classes=num_classes, - feed_forward=False, shortcut_type='B', bias_downsample=False, pretrained=True) - self.model.fc = nn.Linear(2048, - num_classes) # TODO can we get the number of channels from the ResNet rather than using a hard-coded value only confirmed to work with ResNet 10 and 18? + feed_forward=False, shortcut_type=shortcut_type, bias_downsample=bias_downsample, pretrained=True) + self.model.fc = nn.Linear(num_channels, + num_classes) def forward(self, x): return self.model(x) From 97d609cb49632013fa3609253aca11d051e3a31f Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 22 Jul 2025 16:27:18 +0200 Subject: [PATCH 081/337] try ResNet 10 (pretrained; downloaded from within container) --- .../app/config/config_fed_client.conf | 2 +- docker_config/master_template.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf b/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf index 875ff4b7..e5da6b6c 100644 --- a/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf +++ b/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf @@ -87,7 +87,7 @@ n_input_channels = 1 num_classes = 3 spatial_dims = 3 - resnet_variant = 152 + resnet_variant = 10 } } } diff --git a/docker_config/master_template.yml 
b/docker_config/master_template.yml index 2c68af0c..99f06c65 100644 --- a/docker_config/master_template.yml +++ b/docker_config/master_template.yml @@ -696,7 +696,7 @@ docker_cln_sh: | --env TORCH_HOME=/torch_home \ --env HF_HOME=/huggingface_home \ --env GPU_DEVICE=$GPU2USE \ - --env MODEL_NAME=ResNet152 \ + --env MODEL_NAME=ResNet10 \ --env CONFIG=unilateral \ --env MEDISWARM_VERSION=__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_DOCKER_IMAGE__" From e5d3baa909e81150547b9247342b7ab3a84aba65 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 22 Jul 2025 16:27:32 +0200 Subject: [PATCH 082/337] try ResNet 18 (pretrained; downloaded from within container) --- .../app/config/config_fed_client.conf | 2 +- docker_config/master_template.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf b/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf index e5da6b6c..c20582aa 100644 --- a/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf +++ b/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf @@ -87,7 +87,7 @@ n_input_channels = 1 num_classes = 3 spatial_dims = 3 - resnet_variant = 10 + resnet_variant = 18 } } } diff --git a/docker_config/master_template.yml b/docker_config/master_template.yml index 99f06c65..c7a46d17 100644 --- a/docker_config/master_template.yml +++ b/docker_config/master_template.yml @@ -696,7 +696,7 @@ docker_cln_sh: | --env TORCH_HOME=/torch_home \ --env HF_HOME=/huggingface_home \ --env GPU_DEVICE=$GPU2USE \ - --env MODEL_NAME=ResNet10 \ + --env MODEL_NAME=ResNet18 \ --env CONFIG=unilateral \ --env MEDISWARM_VERSION=__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_DOCKER_IMAGE__" From 1b111e7e9e63001e44f12a7a7b09e7b491eb446e Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 22 Jul 2025 16:27:44 +0200 Subject: [PATCH 083/337] try ResNet 34 (pretrained; 
downloaded from within container) --- .../app/config/config_fed_client.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf b/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf index c20582aa..28a9789a 100644 --- a/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf +++ b/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf @@ -87,7 +87,7 @@ n_input_channels = 1 num_classes = 3 spatial_dims = 3 - resnet_variant = 18 + resnet_variant = 34 } } } From daa6e0f7ae4088c73311a88d48e0f4b8298233d1 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 22 Jul 2025 16:27:56 +0200 Subject: [PATCH 084/337] try ResNet 50 (pretrained; downloaded from within container) --- .../app/config/config_fed_client.conf | 2 +- docker_config/master_template.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf b/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf index 28a9789a..a93bce2f 100644 --- a/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf +++ b/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf @@ -87,7 +87,7 @@ n_input_channels = 1 num_classes = 3 spatial_dims = 3 - resnet_variant = 34 + resnet_variant = 50 } } } diff --git a/docker_config/master_template.yml b/docker_config/master_template.yml index c7a46d17..e0bfffab 100644 --- a/docker_config/master_template.yml +++ b/docker_config/master_template.yml @@ -696,7 +696,7 @@ docker_cln_sh: | --env TORCH_HOME=/torch_home \ --env HF_HOME=/huggingface_home \ --env GPU_DEVICE=$GPU2USE \ - --env MODEL_NAME=ResNet18 \ + --env MODEL_NAME=ResNet50 \ --env CONFIG=unilateral \ --env MEDISWARM_VERSION=__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_DOCKER_IMAGE__" From 
92e1ba959f51524773e1097345948bb15f7a7c63 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 22 Jul 2025 16:28:08 +0200 Subject: [PATCH 085/337] try ResNet 101 (pretrained; downloaded from within container) --- .../app/config/config_fed_client.conf | 2 +- docker_config/master_template.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf b/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf index a93bce2f..fccf5379 100644 --- a/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf +++ b/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf @@ -87,7 +87,7 @@ n_input_channels = 1 num_classes = 3 spatial_dims = 3 - resnet_variant = 50 + resnet_variant = 101 } } } diff --git a/docker_config/master_template.yml b/docker_config/master_template.yml index e0bfffab..fba835dd 100644 --- a/docker_config/master_template.yml +++ b/docker_config/master_template.yml @@ -696,7 +696,7 @@ docker_cln_sh: | --env TORCH_HOME=/torch_home \ --env HF_HOME=/huggingface_home \ --env GPU_DEVICE=$GPU2USE \ - --env MODEL_NAME=ResNet50 \ + --env MODEL_NAME=ResNet101 \ --env CONFIG=unilateral \ --env MEDISWARM_VERSION=__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_DOCKER_IMAGE__" From ecf04adf6f1ab31a774a2995e729061a313b9a5f Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 22 Jul 2025 16:28:21 +0200 Subject: [PATCH 086/337] try ResNet 152 (pretrained; downloaded from within container) --- .../app/config/config_fed_client.conf | 2 +- docker_config/master_template.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf b/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf index fccf5379..875ff4b7 100644 --- 
a/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf +++ b/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf @@ -87,7 +87,7 @@ n_input_channels = 1 num_classes = 3 spatial_dims = 3 - resnet_variant = 101 + resnet_variant = 152 } } } diff --git a/docker_config/master_template.yml b/docker_config/master_template.yml index fba835dd..2c68af0c 100644 --- a/docker_config/master_template.yml +++ b/docker_config/master_template.yml @@ -696,7 +696,7 @@ docker_cln_sh: | --env TORCH_HOME=/torch_home \ --env HF_HOME=/huggingface_home \ --env GPU_DEVICE=$GPU2USE \ - --env MODEL_NAME=ResNet101 \ + --env MODEL_NAME=ResNet152 \ --env CONFIG=unilateral \ --env MEDISWARM_VERSION=__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_DOCKER_IMAGE__" From b2b5b56e66865acb46301427ccc58f346eee7aba Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 22 Jul 2025 16:40:47 +0200 Subject: [PATCH 087/337] omit potentially large workspace directory from copy (from which it will be cleaned up anyway) --- buildDockerImageAndStartupKits.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/buildDockerImageAndStartupKits.sh b/buildDockerImageAndStartupKits.sh index 51b07f2c..dd457e5b 100755 --- a/buildDockerImageAndStartupKits.sh +++ b/buildDockerImageAndStartupKits.sh @@ -33,7 +33,7 @@ DOCKER_IMAGE=jefftud/odelia:$VERSION CWD=`pwd` CLEAN_SOURCE_DIR=`mktemp -d` mkdir $CLEAN_SOURCE_DIR/MediSwarm -cp -r . $CLEAN_SOURCE_DIR/MediSwarm/ +rsync -ax workspace . $CLEAN_SOURCE_DIR/MediSwarm/ cd $CLEAN_SOURCE_DIR/MediSwarm git clean -x -q -f . 
cd docker_config/NVFlare From 147173c491701fcdc2c4532dbf171e375b1f2e9a Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 22 Jul 2025 17:10:56 +0200 Subject: [PATCH 088/337] fixed rsync option to exclude directory --- buildDockerImageAndStartupKits.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/buildDockerImageAndStartupKits.sh b/buildDockerImageAndStartupKits.sh index dd457e5b..1767cfef 100755 --- a/buildDockerImageAndStartupKits.sh +++ b/buildDockerImageAndStartupKits.sh @@ -33,7 +33,7 @@ DOCKER_IMAGE=jefftud/odelia:$VERSION CWD=`pwd` CLEAN_SOURCE_DIR=`mktemp -d` mkdir $CLEAN_SOURCE_DIR/MediSwarm -rsync -ax workspace . $CLEAN_SOURCE_DIR/MediSwarm/ +rsync -ax --exclude workspace . $CLEAN_SOURCE_DIR/MediSwarm/ cd $CLEAN_SOURCE_DIR/MediSwarm git clean -x -q -f . cd docker_config/NVFlare From dc9518fd61571ecaa589a598a643f59a33e013aa Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 24 Jul 2025 16:19:01 +0200 Subject: [PATCH 089/337] back to MST --- .../app/config/config_fed_client.conf | 3 +-- docker_config/Dockerfile_ODELIA | 2 -- docker_config/master_template.yml | 3 +-- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf b/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf index 875ff4b7..42ac6ebb 100644 --- a/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf +++ b/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf @@ -82,12 +82,11 @@ path = "nvflare.app_opt.pt.file_model_persistor.PTFileModelPersistor" args { model { - path = "models.resnet.ResNet" + path = "models.mst.MST" args { n_input_channels = 1 num_classes = 3 spatial_dims = 3 - resnet_variant = 152 } } } diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index acef6840..47d05edf 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -72,5 
+72,3 @@ RUN ln -s /MediSwarm /fl_admin/transfer/MediSwarm # Copy pre-trained model weights to image COPY ./torch_home_cache /torch_home -RUN mkdir /huggingface_home -RUN chmod a+rwx /huggingface_home diff --git a/docker_config/master_template.yml b/docker_config/master_template.yml index 2c68af0c..0a2306db 100644 --- a/docker_config/master_template.yml +++ b/docker_config/master_template.yml @@ -694,9 +694,8 @@ docker_cln_sh: | --env DATA_DIR=/data \ --env SCRATCH_DIR=/scratch \ --env TORCH_HOME=/torch_home \ - --env HF_HOME=/huggingface_home \ --env GPU_DEVICE=$GPU2USE \ - --env MODEL_NAME=ResNet152 \ + --env MODEL_NAME=MST \ --env CONFIG=unilateral \ --env MEDISWARM_VERSION=__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_DOCKER_IMAGE__" From 9cc8169d320799d80f28401e3a78a20d8790d5ee Mon Sep 17 00:00:00 2001 From: GitHub CI Date: Mon, 28 Jul 2025 15:17:16 +0200 Subject: [PATCH 090/337] WIP: remove apt versions for rebuild --- docker_config/Dockerfile_ODELIA | 307 ++++++++++++++++++++++++++++++-- 1 file changed, 294 insertions(+), 13 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 47d05edf..964d188c 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -12,13 +12,106 @@ ENV PYTHON_VERSION=3.10.14 # Install updates of installed packages RUN apt update -RUN apt install -y apt=2.4.14 apt-utils=2.4.14 libapt-pkg6.0=2.4.14 +RUN apt install \ + \ + -y \ + apt=2.4.14 \ + apt-utils=2.4.14 \ + libapt-pkg6.0=2.4.14 # Update versions of installed packages -RUN apt install -y base-files=12ubuntu4.7 bash=5.1-6ubuntu1.1 bsdutils=1:2.37.2-4ubuntu3.4 ca-certificates=20240203~22.04.1 coreutils=8.32-4.1ubuntu1.2 dpkg=1.21.1ubuntu2.3 e2fsprogs=1.46.5-2ubuntu1.2 gpgv=2.2.27-3ubuntu2.4 libblkid1=2.37.2-4ubuntu3.4 libc-bin=2.35-0ubuntu3.10 libc-dev-bin=2.35-0ubuntu3.10 libc6-dev=2.35-0ubuntu3.10 libc6=2.35-0ubuntu3.10 libcap2=1:2.44-1ubuntu0.22.04.2 libcom-err2=1.46.5-2ubuntu1.2 
libext2fs2=1.46.5-2ubuntu1.2 libgnutls30=3.7.3-4ubuntu1.7 libgssapi-krb5-2=1.19.2-2ubuntu0.7 libk5crypto3=1.19.2-2ubuntu0.7 libkrb5-3=1.19.2-2ubuntu0.7 libkrb5support0=1.19.2-2ubuntu0.7 libmount1=2.37.2-4ubuntu3.4 libpam-modules-bin=1.4.0-11ubuntu2.6 libpam-modules=1.4.0-11ubuntu2.6 libpam-runtime=1.4.0-11ubuntu2.6 libpam0g=1.4.0-11ubuntu2.6 libseccomp2=2.5.3-2ubuntu3~22.04.1 libsmartcols1=2.37.2-4ubuntu3.4 libss2=1.46.5-2ubuntu1.2 libssl3=3.0.2-0ubuntu1.19 libsystemd0=249.11-0ubuntu3.16 libtasn1-6=4.18.0-4ubuntu0.1 libudev1=249.11-0ubuntu3.16 libuuid1=2.37.2-4ubuntu3.4 linux-libc-dev=5.15.0-144.157 logsave=1.46.5-2ubuntu1.2 mount=2.37.2-4ubuntu3.4 openssl=3.0.2-0ubuntu1.19 util-linux=2.37.2-4ubuntu3.4 +RUN apt install \ + \ + -y \ + base-files=12ubuntu4.7 \ + bash=5.1-6ubuntu1.1 \ + bsdutils=1:2.37.2-4ubuntu3.4 \ + ca-certificates=20240203~22.04.1 \ + coreutils=8.32-4.1ubuntu1.2 \ + dpkg=1.21.1ubuntu2.3 \ + e2fsprogs=1.46.5-2ubuntu1.2 \ + gpgv=2.2.27-3ubuntu2.4 \ + libblkid1=2.37.2-4ubuntu3.4 \ + libc-bin=2.35-0ubuntu3.10 \ + libc-dev-bin=2.35-0ubuntu3.10 \ + libc6-dev=2.35-0ubuntu3.10 \ + libc6=2.35-0ubuntu3.10 \ + libcap2=1:2.44-1ubuntu0.22.04.2 \ + libcom-err2=1.46.5-2ubuntu1.2 \ + libext2fs2=1.46.5-2ubuntu1.2 \ + libgnutls30=3.7.3-4ubuntu1.7 \ + libgssapi-krb5-2=1.19.2-2ubuntu0.7 \ + libk5crypto3=1.19.2-2ubuntu0.7 \ + libkrb5-3=1.19.2-2ubuntu0.7 \ + libkrb5support0=1.19.2-2ubuntu0.7 \ + libmount1=2.37.2-4ubuntu3.4 \ + libpam-modules-bin=1.4.0-11ubuntu2.6 \ + libpam-modules=1.4.0-11ubuntu2.6 \ + libpam-runtime=1.4.0-11ubuntu2.6 \ + libpam0g=1.4.0-11ubuntu2.6 \ + libseccomp2=2.5.3-2ubuntu3~22.04.1 \ + libsmartcols1=2.37.2-4ubuntu3.4 \ + libss2=1.46.5-2ubuntu1.2 \ + libssl3=3.0.2-0ubuntu1.19 \ + libsystemd0=249.11-0ubuntu3.16 \ + libtasn1-6=4.18.0-4ubuntu0.1 \ + libudev1=249.11-0ubuntu3.16 \ + libuuid1=2.37.2-4ubuntu3.4 \ + linux-libc-dev=5.15.0-144.157 \ + logsave=1.46.5-2ubuntu1.2 \ + mount=2.37.2-4ubuntu3.4 \ + openssl=3.0.2-0ubuntu1.19 \ + 
util-linux=2.37.2-4ubuntu3.4 # Install apt-transport-https curl gnupg lsb-release zip and dependencies at defined versions -RUN apt install -y apt-transport-https=2.4.14 curl=7.81.0-1ubuntu1.20 dirmngr=2.2.27-3ubuntu2.4 distro-info-data=0.52ubuntu0.9 gnupg-l10n=2.2.27-3ubuntu2.4 gnupg-utils=2.2.27-3ubuntu2.4 gnupg=2.2.27-3ubuntu2.4 gpg-agent=2.2.27-3ubuntu2.4 gpg-wks-client=2.2.27-3ubuntu2.4 gpg-wks-server=2.2.27-3ubuntu2.4 gpg=2.2.27-3ubuntu2.4 gpgconf=2.2.27-3ubuntu2.4 gpgsm=2.2.27-3ubuntu2.4 libassuan0=2.5.5-1build1 libbrotli1=1.0.9-2build6 libcurl4=7.81.0-1ubuntu1.20 libexpat1=2.4.7-1ubuntu0.6 libksba8=1.6.0-2ubuntu0.2 libldap-2.5-0=2.5.19+dfsg-0ubuntu0.22.04.1 libldap-common=2.5.19+dfsg-0ubuntu0.22.04.1 libmpdec3=2.5.1-2build2 libnghttp2-14=1.43.0-1ubuntu0.2 libnpth0=1.6-3build2 libpsl5=0.21.0-1.2build2 libpython3-stdlib=3.10.6-1~22.04.1 libpython3.10-minimal=3.10.12-1~22.04.10 libpython3.10-stdlib=3.10.12-1~22.04.10 libreadline8=8.1.2-1 librtmp1=2.4+20151223.gitfa8646d.1-2build4 libsasl2-2=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules-db=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules=2.1.27+dfsg2-3ubuntu1.2 libsqlite3-0=3.37.2-2ubuntu0.4 libssh-4=0.9.6-2ubuntu0.22.04.4 lsb-release=11.1.0ubuntu4 media-types=7.0.0 pinentry-curses=1.1.1-1build2 publicsuffix=20211207.1025-1 python3-minimal=3.10.6-1~22.04.1 python3.10-minimal=3.10.12-1~22.04.10 python3.10=3.10.12-1~22.04.10 python3=3.10.6-1~22.04.1 readline-common=8.1.2-1 unzip=6.0-26ubuntu3.2 zip=3.0-12build2 +RUN apt install \ + \ + -y \ + apt-transport-https=2.4.14 \ + curl=7.81.0-1ubuntu1.20 \ + dirmngr=2.2.27-3ubuntu2.4 \ + distro-info-data=0.52ubuntu0.9 \ + gnupg-l10n=2.2.27-3ubuntu2.4 \ + gnupg-utils=2.2.27-3ubuntu2.4 \ + gnupg=2.2.27-3ubuntu2.4 \ + gpg-agent=2.2.27-3ubuntu2.4 \ + gpg-wks-client=2.2.27-3ubuntu2.4 \ + gpg-wks-server=2.2.27-3ubuntu2.4 \ + gpg=2.2.27-3ubuntu2.4 \ + gpgconf=2.2.27-3ubuntu2.4 \ + gpgsm=2.2.27-3ubuntu2.4 \ + libassuan0=2.5.5-1build1 \ + libbrotli1=1.0.9-2build6 \ + 
libcurl4=7.81.0-1ubuntu1.20 \ + libexpat1=2.4.7-1ubuntu0.6 \ + libksba8=1.6.0-2ubuntu0.2 \ + libldap-2.5-0=2.5.19+dfsg-0ubuntu0.22.04.1 \ + libldap-common=2.5.19+dfsg-0ubuntu0.22.04.1 \ + libmpdec3=2.5.1-2build2 \ + libnghttp2-14=1.43.0-1ubuntu0.2 \ + libnpth0=1.6-3build2 \ + libpsl5=0.21.0-1.2build2 \ + libpython3-stdlib=3.10.6-1~22.04.1 \ + libpython3.10-minimal=3.10.12-1~22.04.10 \ + libpython3.10-stdlib=3.10.12-1~22.04.10 \ + libreadline8=8.1.2-1 \ + librtmp1=2.4+20151223.gitfa8646d.1-2build4 \ + libsasl2-2=2.1.27+dfsg2-3ubuntu1.2 \ + libsasl2-modules-db=2.1.27+dfsg2-3ubuntu1.2 \ + libsasl2-modules=2.1.27+dfsg2-3ubuntu1.2 \ + libsqlite3-0=3.37.2-2ubuntu0.4 \ + libssh-4=0.9.6-2ubuntu0.22.04.4 \ + lsb-release=11.1.0ubuntu4 \ + media-types=7.0.0 \ + pinentry-curses=1.1.1-1build2 \ + publicsuffix=20211207.1025-1 \ + python3-minimal=3.10.6-1~22.04.1 \ + python3.10-minimal=3.10.12-1~22.04.10 \ + python3.10=3.10.12-1~22.04.10 \ + python3=3.10.6-1~22.04.1 \ + readline-common=8.1.2-1 \ + unzip=6.0-26ubuntu3.2 \ + zip=3.0-12build2 # Prepare Docker installation RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc \ @@ -27,7 +120,82 @@ RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings && apt update # Install docker-ce docker-ce-cli containerd.io and dependencies at fixed versions -RUN apt install -y apparmor=3.0.4-2ubuntu2.4 containerd.io=1.7.27-1 dbus-user-session=1.12.20-2ubuntu4.1 dbus=1.12.20-2ubuntu4.1 dmsetup=2:1.02.175-2.1ubuntu5 docker-buildx-plugin=0.25.0-1~ubuntu.22.04~jammy docker-ce-cli=5:28.3.2-1~ubuntu.22.04~jammy docker-ce-rootless-extras=5:28.3.2-1~ubuntu.22.04~jammy docker-ce=5:28.3.2-1~ubuntu.22.04~jammy docker-compose-plugin=2.38.2-1~ubuntu.22.04~jammy gir1.2-glib-2.0=1.72.0-1 git-man=1:2.34.1-1ubuntu1.15 git=1:2.34.1-1ubuntu1.15 iptables=1.8.7-1ubuntu5.2 less=590-1ubuntu0.22.04.3 libapparmor1=3.0.4-2ubuntu2.4 libargon2-1=0~20171227-0.3 libbsd0=0.11.5-1 libcbor0.8=0.8.0-2ubuntu1 
libcryptsetup12=2:2.4.3-1ubuntu1.3 libcurl3-gnutls=7.81.0-1ubuntu1.20 libdbus-1-3=1.12.20-2ubuntu4.1 libdevmapper1.02.1=2:1.02.175-2.1ubuntu5 libedit2=3.1-20210910-1build1 liberror-perl=0.17029-1 libfido2-1=1.10.0-1 libgdbm-compat4=1.23-1 libgdbm6=1.23-1 libgirepository-1.0-1=1.72.0-1 libglib2.0-0=2.72.4-0ubuntu2.5 libglib2.0-data=2.72.4-0ubuntu2.5 libicu70=70.1-2 libip4tc2=1.8.7-1ubuntu5.2 libip6tc2=1.8.7-1ubuntu5.2 libjson-c5=0.15-3~ubuntu1.22.04.2 libkmod2=29-1ubuntu1 libltdl7=2.4.6-15build2 libmd0=1.0.4-1build1 libmnl0=1.0.4-3build2 libnetfilter-conntrack3=1.0.9-1 libnfnetlink0=1.0.1-3build3 libnftnl11=1.2.1-1build1 libnss-systemd=249.11-0ubuntu3.16 libpam-systemd=249.11-0ubuntu3.16 libperl5.34=5.34.0-3ubuntu1.4 libslirp0=4.6.1-1build1 libx11-6=2:1.7.5-1ubuntu0.3 libx11-data=2:1.7.5-1ubuntu0.3 libxau6=1:1.0.9-1build5 libxcb1=1.14-3ubuntu3 libxdmcp6=1:1.1.3-0ubuntu5 libxext6=2:1.3.4-1build1 libxml2=2.9.13+dfsg-1ubuntu0.7 libxmuu1=2:1.1.3-3 libxtables12=1.8.7-1ubuntu5.2 netbase=6.3 networkd-dispatcher=2.1-2ubuntu0.22.04.2 openssh-client=1:8.9p1-3ubuntu0.13 patch=2.7.6-7build2 perl-base=5.34.0-3ubuntu1.4 perl-modules-5.34=5.34.0-3ubuntu1.4 perl=5.34.0-3ubuntu1.4 pigz=2.6-1 python3-dbus=1.2.18-3build1 python3-gi=3.42.1-0ubuntu1 shared-mime-info=2.1-2 slirp4netns=1.0.1-2 systemd-sysv=249.11-0ubuntu3.16 systemd-timesyncd=249.11-0ubuntu3.16 systemd=249.11-0ubuntu3.16 xauth=1:1.1-1build2 xdg-user-dirs=0.17-2ubuntu4 xz-utils=5.2.5-2ubuntu1 +RUN apt install \ + \ + -y \ + apparmor=3.0.4-2ubuntu2.4 \ + containerd.io=1.7.27-1 \ + dbus-user-session=1.12.20-2ubuntu4.1 \ + dbus=1.12.20-2ubuntu4.1 \ + dmsetup=2:1.02.175-2.1ubuntu5 \ + docker-buildx-plugin=0.25.0-1~ubuntu.22.04~jammy \ + docker-ce-cli=5:28.3.2-1~ubuntu.22.04~jammy \ + docker-ce-rootless-extras=5:28.3.2-1~ubuntu.22.04~jammy \ + docker-ce=5:28.3.2-1~ubuntu.22.04~jammy \ + docker-compose-plugin=2.38.2-1~ubuntu.22.04~jammy \ + gir1.2-glib-2.0=1.72.0-1 \ + git-man=1:2.34.1-1ubuntu1.15 \ + git=1:2.34.1-1ubuntu1.15 \ 
+ iptables=1.8.7-1ubuntu5.2 \ + less=590-1ubuntu0.22.04.3 \ + libapparmor1=3.0.4-2ubuntu2.4 \ + libargon2-1=0~20171227-0.3 \ + libbsd0=0.11.5-1 \ + libcbor0.8=0.8.0-2ubuntu1 \ + libcryptsetup12=2:2.4.3-1ubuntu1.3 \ + libcurl3-gnutls=7.81.0-1ubuntu1.20 \ + libdbus-1-3=1.12.20-2ubuntu4.1 \ + libdevmapper1.02.1=2:1.02.175-2.1ubuntu5 \ + libedit2=3.1-20210910-1build1 \ + liberror-perl=0.17029-1 \ + libfido2-1=1.10.0-1 \ + libgdbm-compat4=1.23-1 \ + libgdbm6=1.23-1 \ + libgirepository-1.0-1=1.72.0-1 \ + libglib2.0-0=2.72.4-0ubuntu2.5 \ + libglib2.0-data=2.72.4-0ubuntu2.5 \ + libicu70=70.1-2 \ + libip4tc2=1.8.7-1ubuntu5.2 \ + libip6tc2=1.8.7-1ubuntu5.2 \ + libjson-c5=0.15-3~ubuntu1.22.04.2 \ + libkmod2=29-1ubuntu1 \ + libltdl7=2.4.6-15build2 \ + libmd0=1.0.4-1build1 \ + libmnl0=1.0.4-3build2 \ + libnetfilter-conntrack3=1.0.9-1 \ + libnfnetlink0=1.0.1-3build3 \ + libnftnl11=1.2.1-1build1 \ + libnss-systemd=249.11-0ubuntu3.16 \ + libpam-systemd=249.11-0ubuntu3.16 \ + libperl5.34=5.34.0-3ubuntu1.4 \ + libslirp0=4.6.1-1build1 \ + libx11-6=2:1.7.5-1ubuntu0.3 \ + libx11-data=2:1.7.5-1ubuntu0.3 \ + libxau6=1:1.0.9-1build5 \ + libxcb1=1.14-3ubuntu3 \ + libxdmcp6=1:1.1.3-0ubuntu5 \ + libxext6=2:1.3.4-1build1 \ + libxml2=2.9.13+dfsg-1ubuntu0.7 \ + libxmuu1=2:1.1.3-3 \ + libxtables12=1.8.7-1ubuntu5.2 \ + netbase=6.3 \ + networkd-dispatcher=2.1-2ubuntu0.22.04.2 \ + openssh-client=1:8.9p1-3ubuntu0.13 \ + patch=2.7.6-7build2 \ + perl-base=5.34.0-3ubuntu1.4 \ + perl-modules-5.34=5.34.0-3ubuntu1.4 \ + perl=5.34.0-3ubuntu1.4 \ + pigz=2.6-1 \ + python3-dbus=1.2.18-3build1 \ + python3-gi=3.42.1-0ubuntu1 \ + shared-mime-info=2.1-2 \ + slirp4netns=1.0.1-2 \ + systemd-sysv=249.11-0ubuntu3.16 \ + systemd-timesyncd=249.11-0ubuntu3.16 \ + systemd=249.11-0ubuntu3.16 \ + xauth=1:1.1-1build2 \ + xdg-user-dirs=0.17-2ubuntu4 \ + xz-utils=5.2.5-2ubuntu1 # Clean up apt cache RUN rm -rf /var/lib/apt/lists/* @@ -36,18 +204,129 @@ RUN rm -rf /var/lib/apt/lists/* RUN python3 -m pip uninstall -y conda 
conda-package-handling conda_index # Install specific versions of pip and setuptools -RUN python3 -m pip install -U pip==25.1.1 setuptools==80.8.0 +RUN python3 -m pip install \ + -U \ + pip==25.1.1 \ + setuptools==80.8.0 # Install dependencies of NVFlare at fixed versions -RUN python3 -m pip install --upgrade psutil==7.0.0 -RUN python3 -m pip install Flask==3.0.2 Flask-JWT-Extended==4.6.0 Flask-SQLAlchemy==3.1.1 PyJWT==2.10.1 SQLAlchemy==2.0.16 Werkzeug==3.0.1 blinker==1.9.0 docker==7.1.0 greenlet==3.2.2 grpcio==1.62.1 gunicorn==23.0.0 itsdangerous==2.2.0 msgpack==1.1.0 protobuf==4.24.4 pyhocon==0.3.61 pyparsing==3.2.3 websockets==15.0.1 +RUN python3 -m pip install \ + --upgrade \ + psutil==7.0.0 +RUN python3 -m pip install \ + Flask==3.0.2 \ + Flask-JWT-Extended==4.6.0 \ + Flask-SQLAlchemy==3.1.1 \ + PyJWT==2.10.1 \ + SQLAlchemy==2.0.16 \ + Werkzeug==3.0.1 \ + blinker==1.9.0 \ + docker==7.1.0 \ + greenlet==3.2.2 \ + grpcio==1.62.1 \ + gunicorn==23.0.0 \ + itsdangerous==2.2.0 \ + msgpack==1.1.0 \ + protobuf==4.24.4 \ + pyhocon==0.3.61 \ + pyparsing==3.2.3 \ + websockets==15.0.1 # Install additional Python packages for application code at defined versions -RUN python3 -m pip install Deprecated==1.2.18 SimpleITK==2.5.0 absl-py==2.2.2 aiohttp==3.11.18 aiosignal==1.3.2 async-timeout==5.0.1 cachetools==5.5.2 contourpy==1.3.2 cycler==0.12.1 et-xmlfile==2.0.0 fonttools==4.58.0 frozenlist==1.6.0 google-auth-oauthlib==1.2.2 google-auth==2.40.2 huggingface_hub==0.29.3 datasets==3.4.1 coral_pytorch==1.4.0 humanize==4.12.3 joblib==1.5.1 kiwisolver==1.4.8 lightning-utilities==0.14.3 markdown-it-py==3.0.0 markdown==3.8 matplotlib==3.9.2 mdurl==0.1.2 monai==1.4.0 multidict==6.4.4 nibabel==5.3.2 oauthlib==3.2.2 openpyxl==3.1.5 pandas==2.2.3 numpy==1.26.4 pyasn1-modules==0.4.2 pyasn1==0.6.1 pydicom==3.0.1 python-dateutil==2.9.0.post0 x-transformers==2.3.5 pytorch-lightning==2.4.0 requests==2.32.3 requests-oauthlib==2.0.0 rich==14.0.0 rsa==4.9.1 safetensors==0.5.3 
scikit-learn==1.5.2 scipy==1.15.3 seaborn==0.13.2 wandb==0.18.6 einops==0.8.0 shellingham==1.5.4 tensorboard-data-server==0.7.2 tensorboard-plugin-wit==1.8.1 tensorboard==2.19.0 threadpoolctl==3.6.0 timm==1.0.15 torchio==0.20.1 torchmetrics==1.7.1 torchvision==0.17.2 torchaudio==2.2.2 tqdm==4.67.0 typer==0.15.4 tzdata==2025.2 wrapt==1.17.2 yarl==1.20.0 aiohappyeyeballs==2.6.1 annotated-types==0.7.0 dill==0.3.8 docker-pycreds==0.4.0 einx==0.3.0 frozendict==2.4.6 gitdb==4.0.12 gitpython==3.1.44 hf-xet==1.1.2 importlib-resources==6.5.2 loguru==0.7.3 multiprocess==0.70.16 propcache==0.3.1 pyarrow==20.0.0 pydantic==2.11.5 pydantic-core==2.33.2 sentry-sdk==2.29.1 setproctitle==1.3.6 smmap==5.0.2 typing-extensions==4.13.2 typing-inspection==0.4.1 xxhash==3.5.0 +RUN python3 -m pip install \ + Deprecated==1.2.18 \ + SimpleITK==2.5.0 \ + absl-py==2.2.2 \ + aiohttp==3.11.18 \ + aiosignal==1.3.2 \ + async-timeout==5.0.1 \ + cachetools==5.5.2 \ + contourpy==1.3.2 \ + cycler==0.12.1 \ + et-xmlfile==2.0.0 \ + fonttools==4.58.0 \ + frozenlist==1.6.0 \ + google-auth-oauthlib==1.2.2 \ + google-auth==2.40.2 \ + huggingface_hub==0.29.3 \ + datasets==3.4.1 \ + coral_pytorch==1.4.0 \ + humanize==4.12.3 \ + joblib==1.5.1 \ + kiwisolver==1.4.8 \ + lightning-utilities==0.14.3 \ + markdown-it-py==3.0.0 \ + markdown==3.8 \ + matplotlib==3.9.2 \ + mdurl==0.1.2 \ + monai==1.4.0 \ + multidict==6.4.4 \ + nibabel==5.3.2 \ + oauthlib==3.2.2 \ + openpyxl==3.1.5 \ + pandas==2.2.3 \ + numpy==1.26.4 \ + pyasn1-modules==0.4.2 \ + pyasn1==0.6.1 \ + pydicom==3.0.1 \ + python-dateutil==2.9.0.post0 \ + x-transformers==2.3.5 \ + pytorch-lightning==2.4.0 \ + requests==2.32.3 \ + requests-oauthlib==2.0.0 \ + rich==14.0.0 \ + rsa==4.9.1 \ + safetensors==0.5.3 \ + scikit-learn==1.5.2 \ + scipy==1.15.3 \ + seaborn==0.13.2 \ + wandb==0.18.6 \ + einops==0.8.0 \ + shellingham==1.5.4 \ + tensorboard-data-server==0.7.2 \ + tensorboard-plugin-wit==1.8.1 \ + tensorboard==2.19.0 \ + threadpoolctl==3.6.0 \ + timm==1.0.15 
\ + torchio==0.20.1 \ + torchmetrics==1.7.1 \ + torchvision==0.17.2 \ + torchaudio==2.2.2 \ + tqdm==4.67.0 \ + typer==0.15.4 \ + tzdata==2025.2 \ + wrapt==1.17.2 \ + yarl==1.20.0 \ + aiohappyeyeballs==2.6.1 \ + annotated-types==0.7.0 \ + dill==0.3.8 \ + docker-pycreds==0.4.0 \ + einx==0.3.0 \ + frozendict==2.4.6 \ + gitdb==4.0.12 \ + gitpython==3.1.44 \ + hf-xet==1.1.2 \ + importlib-resources==6.5.2 \ + loguru==0.7.3 \ + multiprocess==0.70.16 \ + propcache==0.3.1 \ + pyarrow==20.0.0 \ + pydantic==2.11.5 \ + pydantic-core==2.33.2 \ + sentry-sdk==2.29.1 \ + setproctitle==1.3.6 \ + smmap==5.0.2 \ + typing-extensions==4.13.2 \ + typing-inspection==0.4.1 \ + xxhash==3.5.0 # Install packages needed for testing and for listing licenses of installed packages -RUN python3 -m pip install coverage==7.8.2 mock==5.2.0 -RUN python3 -m pip install pip-licenses==5.0.0 prettytable==3.16.0 +RUN python3 -m pip install \ + coverage==7.8.2 \ + mock==5.2.0 +RUN python3 -m pip install \ + pip-licenses==5.0.0 \ + prettytable==3.16.0 # Clean up pip cache RUN python3 -m pip cache purge @@ -57,12 +336,14 @@ WORKDIR /workspace/ COPY ./MediSwarm/docker_config/NVFlare /workspace/nvflare ## use startup kit template in the dashboard COPY ./MediSwarm/docker_config/master_template.yml /workspace/nvflare/nvflare/lighter/impl/ -RUN python3 -m pip install /workspace/nvflare +RUN python3 -m pip install \ + /workspace/nvflare RUN rm -rf /workspace/nvflare # Install the ODELIA controller package from local source COPY ./MediSwarm/controller /workspace/controller -RUN python3 -m pip install /workspace/controller +RUN python3 -m pip install \ + /workspace/controller RUN rm -rf /workspace/controller # Copy the source code for local training and deploying to the swarm @@ -71,4 +352,4 @@ RUN mkdir -p /fl_admin/transfer RUN ln -s /MediSwarm /fl_admin/transfer/MediSwarm # Copy pre-trained model weights to image -COPY ./torch_home_cache /torch_home +COPY ./torch_home_cache /torch_home \ No newline at end of file 
From 1ee9290ca98091437dc9b903bce161a6ab52c984 Mon Sep 17 00:00:00 2001 From: GitHub CI Date: Mon, 28 Jul 2025 15:17:26 +0200 Subject: [PATCH 091/337] chore: format APT install lines for better readability --- .../dev_utils/dockerfile_update_addAptVersionNumbers.py | 7 ++++++- scripts/dev_utils/dockerfile_update_removeVersionApt.py | 7 ++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/scripts/dev_utils/dockerfile_update_addAptVersionNumbers.py b/scripts/dev_utils/dockerfile_update_addAptVersionNumbers.py index cca37ddd..6afc4bfc 100755 --- a/scripts/dev_utils/dockerfile_update_addAptVersionNumbers.py +++ b/scripts/dev_utils/dockerfile_update_addAptVersionNumbers.py @@ -35,6 +35,11 @@ def add_apt_versions(dockerfile: str, versions: dict) -> str: for package, version in versions.items(): outline = outline.replace(f' {package} ', f' {package}={version} ') outline = re.sub(f' {package}$', f' {package}={version}', outline) + parts = outline.split() + if len(parts) > 3: + header = " ".join(parts[:3]) + pkgs = parts[3:] + outline = header + " \\\n " + " \\\n ".join(pkgs) outlines.append(outline) else: outlines.append(line) @@ -55,4 +60,4 @@ def report_non_fixed_versions(dockerfile: str, versions: dict) -> None: versions = parse_apt_versions(installlog) report_non_fixed_versions(dockerfile, versions) dockerfile = add_apt_versions(dockerfile, versions) - save_file(dockerfile, sys.argv[1]) + save_file(dockerfile, sys.argv[1]) \ No newline at end of file diff --git a/scripts/dev_utils/dockerfile_update_removeVersionApt.py b/scripts/dev_utils/dockerfile_update_removeVersionApt.py index 15055b7f..a067578d 100755 --- a/scripts/dev_utils/dockerfile_update_removeVersionApt.py +++ b/scripts/dev_utils/dockerfile_update_removeVersionApt.py @@ -17,6 +17,11 @@ def remove_apt_versions(dockerfile: str) -> str: for line in dockerfile.splitlines(): if line.startswith('RUN apt install'): out_line = re.sub('=[^ ]*', '', line) + parts = out_line.split() + if len(parts) > 
3: + header = " ".join(parts[:3]) + pkgs = parts[3:] + out_line = header + " \\\n " + " \\\n ".join(pkgs) output.append(out_line) else: output.append(line) @@ -26,4 +31,4 @@ def remove_apt_versions(dockerfile: str) -> str: if __name__ == '__main__': dockerfile = load_file(sys.argv[1]) dockerfile = remove_apt_versions(dockerfile) - save_file(dockerfile, sys.argv[1]) + save_file(dockerfile, sys.argv[1]) \ No newline at end of file From 2e3c5c4be609d4c2b6e14b576bb8cd9d867e1bdc Mon Sep 17 00:00:00 2001 From: GitHub CI Date: Mon, 28 Jul 2025 15:17:30 +0200 Subject: [PATCH 092/337] WIP: remove apt versions for rebuild --- docker_config/Dockerfile_ODELIA | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 964d188c..aacdc9c7 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -13,6 +13,7 @@ ENV PYTHON_VERSION=3.10.14 RUN apt update RUN apt install \ + \ \ -y \ apt=2.4.14 \ @@ -21,6 +22,7 @@ RUN apt install \ # Update versions of installed packages RUN apt install \ + \ \ -y \ base-files=12ubuntu4.7 \ @@ -65,6 +67,7 @@ RUN apt install \ # Install apt-transport-https curl gnupg lsb-release zip and dependencies at defined versions RUN apt install \ + \ \ -y \ apt-transport-https=2.4.14 \ @@ -121,6 +124,7 @@ RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings # Install docker-ce docker-ce-cli containerd.io and dependencies at fixed versions RUN apt install \ + \ \ -y \ apparmor=3.0.4-2ubuntu2.4 \ From b556bdd0d7b78f91573187028f3d291807334a6d Mon Sep 17 00:00:00 2001 From: GitHub CI Date: Mon, 28 Jul 2025 15:23:05 +0200 Subject: [PATCH 093/337] WIP: remove apt versions for rebuild --- docker_config/Dockerfile_ODELIA | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index aacdc9c7..5ce738e2 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ 
-13,6 +13,7 @@ ENV PYTHON_VERSION=3.10.14 RUN apt update RUN apt install \ + \ \ \ -y \ @@ -22,6 +23,7 @@ RUN apt install \ # Update versions of installed packages RUN apt install \ + \ \ \ -y \ @@ -67,6 +69,7 @@ RUN apt install \ # Install apt-transport-https curl gnupg lsb-release zip and dependencies at defined versions RUN apt install \ + \ \ \ -y \ @@ -124,6 +127,7 @@ RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings # Install docker-ce docker-ce-cli containerd.io and dependencies at fixed versions RUN apt install \ + \ \ \ -y \ From a5fb3112a1ec014486f2eb1ea3501e9c0568acd5 Mon Sep 17 00:00:00 2001 From: GitHub CI Date: Mon, 28 Jul 2025 15:23:09 +0200 Subject: [PATCH 094/337] refactor: update parse_apt_versions to return a dictionary of package versions --- .../dockerfile_update_addAptVersionNumbers.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/dev_utils/dockerfile_update_addAptVersionNumbers.py b/scripts/dev_utils/dockerfile_update_addAptVersionNumbers.py index 6afc4bfc..4ef715d3 100755 --- a/scripts/dev_utils/dockerfile_update_addAptVersionNumbers.py +++ b/scripts/dev_utils/dockerfile_update_addAptVersionNumbers.py @@ -12,14 +12,14 @@ def save_file(contents: str, filename: str) -> None: outfile.write(contents) -def parse_apt_versions(installlog: str) -> str: +def parse_apt_versions(installlog: str) -> dict: versions = {} for line in installlog.splitlines(): - if re.match('.*Get:[0-9]* http.*', line): - blocks = line.split(' ') - if len(blocks) > 9: - package = blocks[6] - version = blocks[8] + if "Get:" in line: + match = re.search(r' ([a-zA-Z0-9\-\+\.]+)[/\s]([^\s]+) ', line) + if match: + package = match.group(1) + version = match.group(2) if package in versions and versions[package] != version: print(f'Conflicting versions of {package} found: {versions[package]} and {version} found, using the latter.') versions[package] = version From 1710d4076ae505a3d9e60a79dac876bcebbaa07f Mon 
Sep 17 00:00:00 2001 From: GitHub CI Date: Mon, 28 Jul 2025 15:23:11 +0200 Subject: [PATCH 095/337] WIP: remove apt versions for rebuild --- docker_config/Dockerfile_ODELIA | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 5ce738e2..118bec0f 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -16,6 +16,7 @@ RUN apt install \ \ \ \ + \ -y \ apt=2.4.14 \ apt-utils=2.4.14 \ @@ -26,6 +27,7 @@ RUN apt install \ \ \ \ + \ -y \ base-files=12ubuntu4.7 \ bash=5.1-6ubuntu1.1 \ @@ -72,6 +74,7 @@ RUN apt install \ \ \ \ + \ -y \ apt-transport-https=2.4.14 \ curl=7.81.0-1ubuntu1.20 \ @@ -130,6 +133,7 @@ RUN apt install \ \ \ \ + \ -y \ apparmor=3.0.4-2ubuntu2.4 \ containerd.io=1.7.27-1 \ From 41ac3779dae5aede05988fb1289bc0eba6735aca Mon Sep 17 00:00:00 2001 From: GitHub CI Date: Mon, 28 Jul 2025 15:28:42 +0200 Subject: [PATCH 096/337] refactor: optimize regex pattern for parsing APT versions --- .../dockerfile_update_addAptVersionNumbers.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/scripts/dev_utils/dockerfile_update_addAptVersionNumbers.py b/scripts/dev_utils/dockerfile_update_addAptVersionNumbers.py index 4ef715d3..c174dfc3 100755 --- a/scripts/dev_utils/dockerfile_update_addAptVersionNumbers.py +++ b/scripts/dev_utils/dockerfile_update_addAptVersionNumbers.py @@ -14,18 +14,19 @@ def save_file(contents: str, filename: str) -> None: def parse_apt_versions(installlog: str) -> dict: versions = {} + pattern = re.compile(r'Get:.*? ([a-z0-9\-\+\.]+)(?:/[^ ]*)? 
([0-9a-zA-Z\:\~\.\+\-]+) ') for line in installlog.splitlines(): - if "Get:" in line: - match = re.search(r' ([a-zA-Z0-9\-\+\.]+)[/\s]([^\s]+) ', line) - if match: - package = match.group(1) - version = match.group(2) - if package in versions and versions[package] != version: - print(f'Conflicting versions of {package} found: {versions[package]} and {version} found, using the latter.') - versions[package] = version + match = pattern.search(line) + if match: + package = match.group(1) + version = match.group(2) + if package in versions and versions[package] != version: + print(f'Conflicting versions of {package} found: {versions[package]} and {version} found, using the latter.') + versions[package] = version return versions + def add_apt_versions(dockerfile: str, versions: dict) -> str: dockerfile = dockerfile.replace('RUN apt install', 'RUN_apt_install') outlines = [] From 4a28fe72a1adabd7260dacd78e86bb8e2abd3e10 Mon Sep 17 00:00:00 2001 From: Ultimate-Storm Date: Tue, 29 Jul 2025 06:27:57 +0200 Subject: [PATCH 097/337] chore: update apt versions in Dockerfile_ODELIA --- docker_config/Dockerfile_ODELIA | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 47d05edf..988928dd 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -15,10 +15,10 @@ RUN apt update RUN apt install -y apt=2.4.14 apt-utils=2.4.14 libapt-pkg6.0=2.4.14 # Update versions of installed packages -RUN apt install -y base-files=12ubuntu4.7 bash=5.1-6ubuntu1.1 bsdutils=1:2.37.2-4ubuntu3.4 ca-certificates=20240203~22.04.1 coreutils=8.32-4.1ubuntu1.2 dpkg=1.21.1ubuntu2.3 e2fsprogs=1.46.5-2ubuntu1.2 gpgv=2.2.27-3ubuntu2.4 libblkid1=2.37.2-4ubuntu3.4 libc-bin=2.35-0ubuntu3.10 libc-dev-bin=2.35-0ubuntu3.10 libc6-dev=2.35-0ubuntu3.10 libc6=2.35-0ubuntu3.10 libcap2=1:2.44-1ubuntu0.22.04.2 libcom-err2=1.46.5-2ubuntu1.2 libext2fs2=1.46.5-2ubuntu1.2 libgnutls30=3.7.3-4ubuntu1.7 
libgssapi-krb5-2=1.19.2-2ubuntu0.7 libk5crypto3=1.19.2-2ubuntu0.7 libkrb5-3=1.19.2-2ubuntu0.7 libkrb5support0=1.19.2-2ubuntu0.7 libmount1=2.37.2-4ubuntu3.4 libpam-modules-bin=1.4.0-11ubuntu2.6 libpam-modules=1.4.0-11ubuntu2.6 libpam-runtime=1.4.0-11ubuntu2.6 libpam0g=1.4.0-11ubuntu2.6 libseccomp2=2.5.3-2ubuntu3~22.04.1 libsmartcols1=2.37.2-4ubuntu3.4 libss2=1.46.5-2ubuntu1.2 libssl3=3.0.2-0ubuntu1.19 libsystemd0=249.11-0ubuntu3.16 libtasn1-6=4.18.0-4ubuntu0.1 libudev1=249.11-0ubuntu3.16 libuuid1=2.37.2-4ubuntu3.4 linux-libc-dev=5.15.0-144.157 logsave=1.46.5-2ubuntu1.2 mount=2.37.2-4ubuntu3.4 openssl=3.0.2-0ubuntu1.19 util-linux=2.37.2-4ubuntu3.4 +RUN apt install -y base-files=12ubuntu4.7 bash=5.1-6ubuntu1.1 bsdutils=1:2.37.2-4ubuntu3.4 ca-certificates=20240203~22.04.1 coreutils=8.32-4.1ubuntu1.2 dpkg=1.21.1ubuntu2.3 e2fsprogs=1.46.5-2ubuntu1.2 gpgv=2.2.27-3ubuntu2.4 libblkid1=2.37.2-4ubuntu3.4 libc-bin=2.35-0ubuntu3.10 libc-dev-bin=2.35-0ubuntu3.10 libc6-dev=2.35-0ubuntu3.10 libc6=2.35-0ubuntu3.10 libcap2=1:2.44-1ubuntu0.22.04.2 libcom-err2=1.46.5-2ubuntu1.2 libext2fs2=1.46.5-2ubuntu1.2 libgnutls30=3.7.3-4ubuntu1.7 libgssapi-krb5-2=1.19.2-2ubuntu0.7 libk5crypto3=1.19.2-2ubuntu0.7 libkrb5-3=1.19.2-2ubuntu0.7 libkrb5support0=1.19.2-2ubuntu0.7 libmount1=2.37.2-4ubuntu3.4 libpam-modules-bin=1.4.0-11ubuntu2.6 libpam-modules=1.4.0-11ubuntu2.6 libpam-runtime=1.4.0-11ubuntu2.6 libpam0g=1.4.0-11ubuntu2.6 libseccomp2=2.5.3-2ubuntu3~22.04.1 libsmartcols1=2.37.2-4ubuntu3.4 libss2=1.46.5-2ubuntu1.2 libssl3=3.0.2-0ubuntu1.19 libsystemd0=249.11-0ubuntu3.16 libtasn1-6=4.18.0-4ubuntu0.1 libudev1=249.11-0ubuntu3.16 libuuid1=2.37.2-4ubuntu3.4 linux-libc-dev logsave=1.46.5-2ubuntu1.2 mount=2.37.2-4ubuntu3.4 openssl=3.0.2-0ubuntu1.19 util-linux=2.37.2-4ubuntu3.4 # Install apt-transport-https curl gnupg lsb-release zip and dependencies at defined versions -RUN apt install -y apt-transport-https=2.4.14 curl=7.81.0-1ubuntu1.20 dirmngr=2.2.27-3ubuntu2.4 distro-info-data=0.52ubuntu0.9 
gnupg-l10n=2.2.27-3ubuntu2.4 gnupg-utils=2.2.27-3ubuntu2.4 gnupg=2.2.27-3ubuntu2.4 gpg-agent=2.2.27-3ubuntu2.4 gpg-wks-client=2.2.27-3ubuntu2.4 gpg-wks-server=2.2.27-3ubuntu2.4 gpg=2.2.27-3ubuntu2.4 gpgconf=2.2.27-3ubuntu2.4 gpgsm=2.2.27-3ubuntu2.4 libassuan0=2.5.5-1build1 libbrotli1=1.0.9-2build6 libcurl4=7.81.0-1ubuntu1.20 libexpat1=2.4.7-1ubuntu0.6 libksba8=1.6.0-2ubuntu0.2 libldap-2.5-0=2.5.19+dfsg-0ubuntu0.22.04.1 libldap-common=2.5.19+dfsg-0ubuntu0.22.04.1 libmpdec3=2.5.1-2build2 libnghttp2-14=1.43.0-1ubuntu0.2 libnpth0=1.6-3build2 libpsl5=0.21.0-1.2build2 libpython3-stdlib=3.10.6-1~22.04.1 libpython3.10-minimal=3.10.12-1~22.04.10 libpython3.10-stdlib=3.10.12-1~22.04.10 libreadline8=8.1.2-1 librtmp1=2.4+20151223.gitfa8646d.1-2build4 libsasl2-2=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules-db=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules=2.1.27+dfsg2-3ubuntu1.2 libsqlite3-0=3.37.2-2ubuntu0.4 libssh-4=0.9.6-2ubuntu0.22.04.4 lsb-release=11.1.0ubuntu4 media-types=7.0.0 pinentry-curses=1.1.1-1build2 publicsuffix=20211207.1025-1 python3-minimal=3.10.6-1~22.04.1 python3.10-minimal=3.10.12-1~22.04.10 python3.10=3.10.12-1~22.04.10 python3=3.10.6-1~22.04.1 readline-common=8.1.2-1 unzip=6.0-26ubuntu3.2 zip=3.0-12build2 +RUN apt install -y apt-transport-https=2.4.14 curl=7.81.0-1ubuntu1.20 dirmngr=2.2.27-3ubuntu2.4 distro-info-data=0.52ubuntu0.9 gnupg-l10n=2.2.27-3ubuntu2.4 gnupg-utils=2.2.27-3ubuntu2.4 gnupg=2.2.27-3ubuntu2.4 gpg-agent=2.2.27-3ubuntu2.4 gpg-wks-client=2.2.27-3ubuntu2.4 gpg-wks-server=2.2.27-3ubuntu2.4 gpg=2.2.27-3ubuntu2.4 gpgconf=2.2.27-3ubuntu2.4 gpgsm=2.2.27-3ubuntu2.4 libassuan0=2.5.5-1build1 libbrotli1=1.0.9-2build6 libcurl4=7.81.0-1ubuntu1.20 libexpat1=2.4.7-1ubuntu0.6 libksba8=1.6.0-2ubuntu0.2 libldap-2.5-0=2.5.19+dfsg-0ubuntu0.22.04.1 libldap-common=2.5.19+dfsg-0ubuntu0.22.04.1 libmpdec3=2.5.1-2build2 libnghttp2-14=1.43.0-1ubuntu0.2 libnpth0=1.6-3build2 libpsl5=0.21.0-1.2build2 libpython3-stdlib=3.10.6-1~22.04.1 libpython3.10-minimal=3.10.12-1~22.04.10 
libpython3.10-stdlib=3.10.12-1~22.04.10 libreadline8=8.1.2-1 librtmp1=2.4+20151223.gitfa8646d.1-2build4 libsasl2-2=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules-db=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules=2.1.27+dfsg2-3ubuntu1.2 libsqlite3-0=3.37.2-2ubuntu0.5 libssh-4=0.9.6-2ubuntu0.22.04.4 lsb-release=11.1.0ubuntu4 media-types=7.0.0 pinentry-curses=1.1.1-1build2 publicsuffix=20211207.1025-1 python3-minimal=3.10.6-1~22.04.1 python3.10-minimal=3.10.12-1~22.04.10 python3.10=3.10.12-1~22.04.10 python3=3.10.6-1~22.04.1 readline-common=8.1.2-1 unzip=6.0-26ubuntu3.2 zip=3.0-12build2 # Prepare Docker installation RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc \ From 25b662b905d4572a65e5966b9bff00bf475af245 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 29 Jul 2025 11:21:32 +0200 Subject: [PATCH 098/337] restored lost apt package version --- docker_config/Dockerfile_ODELIA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 988928dd..577fd58d 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -15,7 +15,7 @@ RUN apt update RUN apt install -y apt=2.4.14 apt-utils=2.4.14 libapt-pkg6.0=2.4.14 # Update versions of installed packages -RUN apt install -y base-files=12ubuntu4.7 bash=5.1-6ubuntu1.1 bsdutils=1:2.37.2-4ubuntu3.4 ca-certificates=20240203~22.04.1 coreutils=8.32-4.1ubuntu1.2 dpkg=1.21.1ubuntu2.3 e2fsprogs=1.46.5-2ubuntu1.2 gpgv=2.2.27-3ubuntu2.4 libblkid1=2.37.2-4ubuntu3.4 libc-bin=2.35-0ubuntu3.10 libc-dev-bin=2.35-0ubuntu3.10 libc6-dev=2.35-0ubuntu3.10 libc6=2.35-0ubuntu3.10 libcap2=1:2.44-1ubuntu0.22.04.2 libcom-err2=1.46.5-2ubuntu1.2 libext2fs2=1.46.5-2ubuntu1.2 libgnutls30=3.7.3-4ubuntu1.7 libgssapi-krb5-2=1.19.2-2ubuntu0.7 libk5crypto3=1.19.2-2ubuntu0.7 libkrb5-3=1.19.2-2ubuntu0.7 libkrb5support0=1.19.2-2ubuntu0.7 libmount1=2.37.2-4ubuntu3.4 libpam-modules-bin=1.4.0-11ubuntu2.6 
libpam-modules=1.4.0-11ubuntu2.6 libpam-runtime=1.4.0-11ubuntu2.6 libpam0g=1.4.0-11ubuntu2.6 libseccomp2=2.5.3-2ubuntu3~22.04.1 libsmartcols1=2.37.2-4ubuntu3.4 libss2=1.46.5-2ubuntu1.2 libssl3=3.0.2-0ubuntu1.19 libsystemd0=249.11-0ubuntu3.16 libtasn1-6=4.18.0-4ubuntu0.1 libudev1=249.11-0ubuntu3.16 libuuid1=2.37.2-4ubuntu3.4 linux-libc-dev logsave=1.46.5-2ubuntu1.2 mount=2.37.2-4ubuntu3.4 openssl=3.0.2-0ubuntu1.19 util-linux=2.37.2-4ubuntu3.4 +RUN apt install -y base-files=12ubuntu4.7 bash=5.1-6ubuntu1.1 bsdutils=1:2.37.2-4ubuntu3.4 ca-certificates=20240203~22.04.1 coreutils=8.32-4.1ubuntu1.2 dpkg=1.21.1ubuntu2.3 e2fsprogs=1.46.5-2ubuntu1.2 gpgv=2.2.27-3ubuntu2.4 libblkid1=2.37.2-4ubuntu3.4 libc-bin=2.35-0ubuntu3.10 libc-dev-bin=2.35-0ubuntu3.10 libc6-dev=2.35-0ubuntu3.10 libc6=2.35-0ubuntu3.10 libcap2=1:2.44-1ubuntu0.22.04.2 libcom-err2=1.46.5-2ubuntu1.2 libext2fs2=1.46.5-2ubuntu1.2 libgnutls30=3.7.3-4ubuntu1.7 libgssapi-krb5-2=1.19.2-2ubuntu0.7 libk5crypto3=1.19.2-2ubuntu0.7 libkrb5-3=1.19.2-2ubuntu0.7 libkrb5support0=1.19.2-2ubuntu0.7 libmount1=2.37.2-4ubuntu3.4 libpam-modules-bin=1.4.0-11ubuntu2.6 libpam-modules=1.4.0-11ubuntu2.6 libpam-runtime=1.4.0-11ubuntu2.6 libpam0g=1.4.0-11ubuntu2.6 libseccomp2=2.5.3-2ubuntu3~22.04.1 libsmartcols1=2.37.2-4ubuntu3.4 libss2=1.46.5-2ubuntu1.2 libssl3=3.0.2-0ubuntu1.19 libsystemd0=249.11-0ubuntu3.16 libtasn1-6=4.18.0-4ubuntu0.1 libudev1=249.11-0ubuntu3.16 libuuid1=2.37.2-4ubuntu3.4 linux-libc-dev=5.15.0-151.161 logsave=1.46.5-2ubuntu1.2 mount=2.37.2-4ubuntu3.4 openssl=3.0.2-0ubuntu1.19 util-linux=2.37.2-4ubuntu3.4 # Install apt-transport-https curl gnupg lsb-release zip and dependencies at defined versions RUN apt install -y apt-transport-https=2.4.14 curl=7.81.0-1ubuntu1.20 dirmngr=2.2.27-3ubuntu2.4 distro-info-data=0.52ubuntu0.9 gnupg-l10n=2.2.27-3ubuntu2.4 gnupg-utils=2.2.27-3ubuntu2.4 gnupg=2.2.27-3ubuntu2.4 gpg-agent=2.2.27-3ubuntu2.4 gpg-wks-client=2.2.27-3ubuntu2.4 gpg-wks-server=2.2.27-3ubuntu2.4 gpg=2.2.27-3ubuntu2.4 
gpgconf=2.2.27-3ubuntu2.4 gpgsm=2.2.27-3ubuntu2.4 libassuan0=2.5.5-1build1 libbrotli1=1.0.9-2build6 libcurl4=7.81.0-1ubuntu1.20 libexpat1=2.4.7-1ubuntu0.6 libksba8=1.6.0-2ubuntu0.2 libldap-2.5-0=2.5.19+dfsg-0ubuntu0.22.04.1 libldap-common=2.5.19+dfsg-0ubuntu0.22.04.1 libmpdec3=2.5.1-2build2 libnghttp2-14=1.43.0-1ubuntu0.2 libnpth0=1.6-3build2 libpsl5=0.21.0-1.2build2 libpython3-stdlib=3.10.6-1~22.04.1 libpython3.10-minimal=3.10.12-1~22.04.10 libpython3.10-stdlib=3.10.12-1~22.04.10 libreadline8=8.1.2-1 librtmp1=2.4+20151223.gitfa8646d.1-2build4 libsasl2-2=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules-db=2.1.27+dfsg2-3ubuntu1.2 libsasl2-modules=2.1.27+dfsg2-3ubuntu1.2 libsqlite3-0=3.37.2-2ubuntu0.5 libssh-4=0.9.6-2ubuntu0.22.04.4 lsb-release=11.1.0ubuntu4 media-types=7.0.0 pinentry-curses=1.1.1-1build2 publicsuffix=20211207.1025-1 python3-minimal=3.10.6-1~22.04.1 python3.10-minimal=3.10.12-1~22.04.10 python3.10=3.10.12-1~22.04.10 python3=3.10.6-1~22.04.1 readline-common=8.1.2-1 unzip=6.0-26ubuntu3.2 zip=3.0-12build2 From e005b37b03289aed07eb44b69c3493d576b06bc7 Mon Sep 17 00:00:00 2001 From: Ultimate-Storm Date: Wed, 30 Jul 2025 06:28:13 +0200 Subject: [PATCH 099/337] chore: update apt versions in Dockerfile_ODELIA --- docker_config/Dockerfile_ODELIA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 577fd58d..4c94781d 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -27,7 +27,7 @@ RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings && apt update # Install docker-ce docker-ce-cli containerd.io and dependencies at fixed versions -RUN apt install -y apparmor=3.0.4-2ubuntu2.4 containerd.io=1.7.27-1 dbus-user-session=1.12.20-2ubuntu4.1 dbus=1.12.20-2ubuntu4.1 dmsetup=2:1.02.175-2.1ubuntu5 docker-buildx-plugin=0.25.0-1~ubuntu.22.04~jammy docker-ce-cli=5:28.3.2-1~ubuntu.22.04~jammy 
docker-ce-rootless-extras=5:28.3.2-1~ubuntu.22.04~jammy docker-ce=5:28.3.2-1~ubuntu.22.04~jammy docker-compose-plugin=2.38.2-1~ubuntu.22.04~jammy gir1.2-glib-2.0=1.72.0-1 git-man=1:2.34.1-1ubuntu1.15 git=1:2.34.1-1ubuntu1.15 iptables=1.8.7-1ubuntu5.2 less=590-1ubuntu0.22.04.3 libapparmor1=3.0.4-2ubuntu2.4 libargon2-1=0~20171227-0.3 libbsd0=0.11.5-1 libcbor0.8=0.8.0-2ubuntu1 libcryptsetup12=2:2.4.3-1ubuntu1.3 libcurl3-gnutls=7.81.0-1ubuntu1.20 libdbus-1-3=1.12.20-2ubuntu4.1 libdevmapper1.02.1=2:1.02.175-2.1ubuntu5 libedit2=3.1-20210910-1build1 liberror-perl=0.17029-1 libfido2-1=1.10.0-1 libgdbm-compat4=1.23-1 libgdbm6=1.23-1 libgirepository-1.0-1=1.72.0-1 libglib2.0-0=2.72.4-0ubuntu2.5 libglib2.0-data=2.72.4-0ubuntu2.5 libicu70=70.1-2 libip4tc2=1.8.7-1ubuntu5.2 libip6tc2=1.8.7-1ubuntu5.2 libjson-c5=0.15-3~ubuntu1.22.04.2 libkmod2=29-1ubuntu1 libltdl7=2.4.6-15build2 libmd0=1.0.4-1build1 libmnl0=1.0.4-3build2 libnetfilter-conntrack3=1.0.9-1 libnfnetlink0=1.0.1-3build3 libnftnl11=1.2.1-1build1 libnss-systemd=249.11-0ubuntu3.16 libpam-systemd=249.11-0ubuntu3.16 libperl5.34=5.34.0-3ubuntu1.4 libslirp0=4.6.1-1build1 libx11-6=2:1.7.5-1ubuntu0.3 libx11-data=2:1.7.5-1ubuntu0.3 libxau6=1:1.0.9-1build5 libxcb1=1.14-3ubuntu3 libxdmcp6=1:1.1.3-0ubuntu5 libxext6=2:1.3.4-1build1 libxml2=2.9.13+dfsg-1ubuntu0.7 libxmuu1=2:1.1.3-3 libxtables12=1.8.7-1ubuntu5.2 netbase=6.3 networkd-dispatcher=2.1-2ubuntu0.22.04.2 openssh-client=1:8.9p1-3ubuntu0.13 patch=2.7.6-7build2 perl-base=5.34.0-3ubuntu1.4 perl-modules-5.34=5.34.0-3ubuntu1.4 perl=5.34.0-3ubuntu1.4 pigz=2.6-1 python3-dbus=1.2.18-3build1 python3-gi=3.42.1-0ubuntu1 shared-mime-info=2.1-2 slirp4netns=1.0.1-2 systemd-sysv=249.11-0ubuntu3.16 systemd-timesyncd=249.11-0ubuntu3.16 systemd=249.11-0ubuntu3.16 xauth=1:1.1-1build2 xdg-user-dirs=0.17-2ubuntu4 xz-utils=5.2.5-2ubuntu1 +RUN apt install -y apparmor=3.0.4-2ubuntu2.4 containerd.io=1.7.27-1 dbus-user-session=1.12.20-2ubuntu4.1 dbus=1.12.20-2ubuntu4.1 dmsetup=2:1.02.175-2.1ubuntu5 
docker-buildx-plugin=0.26.1-1~ubuntu.22.04~jammy docker-ce-cli=5:28.3.3-1~ubuntu.22.04~jammy docker-ce-rootless-extras=5:28.3.3-1~ubuntu.22.04~jammy docker-ce=5:28.3.3-1~ubuntu.22.04~jammy docker-compose-plugin=2.39.1-1~ubuntu.22.04~jammy gir1.2-glib-2.0=1.72.0-1 git-man=1:2.34.1-1ubuntu1.15 git=1:2.34.1-1ubuntu1.15 iptables=1.8.7-1ubuntu5.2 less=590-1ubuntu0.22.04.3 libapparmor1=3.0.4-2ubuntu2.4 libargon2-1=0~20171227-0.3 libbsd0=0.11.5-1 libcbor0.8=0.8.0-2ubuntu1 libcryptsetup12=2:2.4.3-1ubuntu1.3 libcurl3-gnutls=7.81.0-1ubuntu1.20 libdbus-1-3=1.12.20-2ubuntu4.1 libdevmapper1.02.1=2:1.02.175-2.1ubuntu5 libedit2=3.1-20210910-1build1 liberror-perl=0.17029-1 libfido2-1=1.10.0-1 libgdbm-compat4=1.23-1 libgdbm6=1.23-1 libgirepository-1.0-1=1.72.0-1 libglib2.0-0=2.72.4-0ubuntu2.5 libglib2.0-data=2.72.4-0ubuntu2.5 libicu70=70.1-2 libip4tc2=1.8.7-1ubuntu5.2 libip6tc2=1.8.7-1ubuntu5.2 libjson-c5=0.15-3~ubuntu1.22.04.2 libkmod2=29-1ubuntu1 libltdl7=2.4.6-15build2 libmd0=1.0.4-1build1 libmnl0=1.0.4-3build2 libnetfilter-conntrack3=1.0.9-1 libnfnetlink0=1.0.1-3build3 libnftnl11=1.2.1-1build1 libnss-systemd=249.11-0ubuntu3.16 libpam-systemd=249.11-0ubuntu3.16 libperl5.34=5.34.0-3ubuntu1.5 libslirp0=4.6.1-1build1 libx11-6=2:1.7.5-1ubuntu0.3 libx11-data=2:1.7.5-1ubuntu0.3 libxau6=1:1.0.9-1build5 libxcb1=1.14-3ubuntu3 libxdmcp6=1:1.1.3-0ubuntu5 libxext6=2:1.3.4-1build1 libxml2=2.9.13+dfsg-1ubuntu0.7 libxmuu1=2:1.1.3-3 libxtables12=1.8.7-1ubuntu5.2 netbase=6.3 networkd-dispatcher=2.1-2ubuntu0.22.04.2 openssh-client=1:8.9p1-3ubuntu0.13 patch=2.7.6-7build2 perl-base=5.34.0-3ubuntu1.5 perl-modules-5.34=5.34.0-3ubuntu1.5 perl=5.34.0-3ubuntu1.5 pigz=2.6-1 python3-dbus=1.2.18-3build1 python3-gi=3.42.1-0ubuntu1 shared-mime-info=2.1-2 slirp4netns=1.0.1-2 systemd-sysv=249.11-0ubuntu3.16 systemd-timesyncd=249.11-0ubuntu3.16 systemd=249.11-0ubuntu3.16 xauth=1:1.1-1build2 xdg-user-dirs=0.17-2ubuntu4 xz-utils=5.2.5-2ubuntu1 # Clean up apt cache RUN rm -rf /var/lib/apt/lists/* From 
618444bf7240641834d88654323d7728c08dee92 Mon Sep 17 00:00:00 2001 From: GitHub CI Date: Wed, 30 Jul 2025 11:24:35 +0200 Subject: [PATCH 100/337] fix: remove unnecessary line breaks in APT installation commands Signed-off-by: GitHub CI --- docker_config/Dockerfile_ODELIA | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 118bec0f..2d4d6ef9 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -13,10 +13,6 @@ ENV PYTHON_VERSION=3.10.14 RUN apt update RUN apt install \ - \ - \ - \ - \ -y \ apt=2.4.14 \ apt-utils=2.4.14 \ @@ -24,10 +20,6 @@ RUN apt install \ # Update versions of installed packages RUN apt install \ - \ - \ - \ - \ -y \ base-files=12ubuntu4.7 \ bash=5.1-6ubuntu1.1 \ @@ -71,10 +63,6 @@ RUN apt install \ # Install apt-transport-https curl gnupg lsb-release zip and dependencies at defined versions RUN apt install \ - \ - \ - \ - \ -y \ apt-transport-https=2.4.14 \ curl=7.81.0-1ubuntu1.20 \ @@ -130,10 +118,6 @@ RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings # Install docker-ce docker-ce-cli containerd.io and dependencies at fixed versions RUN apt install \ - \ - \ - \ - \ -y \ apparmor=3.0.4-2ubuntu2.4 \ containerd.io=1.7.27-1 \ @@ -348,14 +332,12 @@ WORKDIR /workspace/ COPY ./MediSwarm/docker_config/NVFlare /workspace/nvflare ## use startup kit template in the dashboard COPY ./MediSwarm/docker_config/master_template.yml /workspace/nvflare/nvflare/lighter/impl/ -RUN python3 -m pip install \ - /workspace/nvflare +RUN python3 -m pip install /workspace/nvflare RUN rm -rf /workspace/nvflare # Install the ODELIA controller package from local source COPY ./MediSwarm/controller /workspace/controller -RUN python3 -m pip install \ - /workspace/controller +RUN python3 -m pip install /workspace/controller RUN rm -rf /workspace/controller # Copy the source code for local training and deploying 
to the swarm From ee53d3ac87c1cd50252d68712cdb31e94a756be8 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 30 Jul 2025 13:27:08 +0200 Subject: [PATCH 101/337] keep argument -y in same line as RUN apt install adapted scripts for easier handling of 'RUN apt install -y' commands --- docker_config/Dockerfile_ODELIA | 12 ++---- .../dockerfile_update_addAptVersionNumbers.py | 40 ++++++++++--------- .../dockerfile_update_removeVersionApt.py | 28 +++++++------ 3 files changed, 41 insertions(+), 39 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 2d4d6ef9..9208e15b 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -12,15 +12,13 @@ ENV PYTHON_VERSION=3.10.14 # Install updates of installed packages RUN apt update -RUN apt install \ - -y \ +RUN apt install -y \ apt=2.4.14 \ apt-utils=2.4.14 \ libapt-pkg6.0=2.4.14 # Update versions of installed packages -RUN apt install \ - -y \ +RUN apt install -y \ base-files=12ubuntu4.7 \ bash=5.1-6ubuntu1.1 \ bsdutils=1:2.37.2-4ubuntu3.4 \ @@ -62,8 +60,7 @@ RUN apt install \ util-linux=2.37.2-4ubuntu3.4 # Install apt-transport-https curl gnupg lsb-release zip and dependencies at defined versions -RUN apt install \ - -y \ +RUN apt install -y \ apt-transport-https=2.4.14 \ curl=7.81.0-1ubuntu1.20 \ dirmngr=2.2.27-3ubuntu2.4 \ @@ -117,8 +114,7 @@ RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings && apt update # Install docker-ce docker-ce-cli containerd.io and dependencies at fixed versions -RUN apt install \ - -y \ +RUN apt install -y \ apparmor=3.0.4-2ubuntu2.4 \ containerd.io=1.7.27-1 \ dbus-user-session=1.12.20-2ubuntu4.1 \ diff --git a/scripts/dev_utils/dockerfile_update_addAptVersionNumbers.py b/scripts/dev_utils/dockerfile_update_addAptVersionNumbers.py index c174dfc3..186fc955 100755 --- a/scripts/dev_utils/dockerfile_update_addAptVersionNumbers.py +++ b/scripts/dev_utils/dockerfile_update_addAptVersionNumbers.py @@ 
-3,6 +3,11 @@ import re import sys +LINE_BREAK_IN_COMMAND = ' \\\n ' +LINE_BREAK_REPLACEMENT = ' λινε βρεακ ρεπλαζεμεντ ' +APT_INSTALL_COMMAND = 'RUN apt install -y' +APT_INSTALL_REPLACEMENT = 'ΡΥΝ απτ ινσταλλ -υ' + def load_file(filename: str) -> str: with open(filename, 'r') as infile: return infile.read() @@ -14,38 +19,34 @@ def save_file(contents: str, filename: str) -> None: def parse_apt_versions(installlog: str) -> dict: versions = {} - pattern = re.compile(r'Get:.*? ([a-z0-9\-\+\.]+)(?:/[^ ]*)? ([0-9a-zA-Z\:\~\.\+\-]+) ') for line in installlog.splitlines(): - match = pattern.search(line) - if match: - package = match.group(1) - version = match.group(2) - if package in versions and versions[package] != version: - print(f'Conflicting versions of {package} found: {versions[package]} and {version} found, using the latter.') - versions[package] = version + if re.match('.*Get:[0-9]* http.*', line): + blocks = line.split(' ') + if len(blocks) > 9: + package = blocks[6] + version = blocks[8] + if package in versions and versions[package] != version: + print(f'Conflicting versions of {package} found: {versions[package]} and {version} found, using the latter.') + versions[package] = version return versions - def add_apt_versions(dockerfile: str, versions: dict) -> str: - dockerfile = dockerfile.replace('RUN apt install', 'RUN_apt_install') + dockerfile = dockerfile.replace(LINE_BREAK_IN_COMMAND, LINE_BREAK_REPLACEMENT) + dockerfile = dockerfile.replace(APT_INSTALL_COMMAND, APT_INSTALL_REPLACEMENT) outlines = [] for line in dockerfile.splitlines(): - if line.startswith('RUN_apt_install'): + if line.startswith(APT_INSTALL_REPLACEMENT): outline = '' + line for package, version in versions.items(): outline = outline.replace(f' {package} ', f' {package}={version} ') outline = re.sub(f' {package}$', f' {package}={version}', outline) - parts = outline.split() - if len(parts) > 3: - header = " ".join(parts[:3]) - pkgs = parts[3:] - outline = header + " \\\n " + " \\\n 
".join(pkgs) outlines.append(outline) else: outlines.append(line) dockerfile = '\n'.join(outlines) + '\n' - dockerfile = dockerfile.replace('RUN_apt_install', 'RUN apt install') + dockerfile = dockerfile.replace(APT_INSTALL_REPLACEMENT, APT_INSTALL_COMMAND) + dockerfile = dockerfile.replace(LINE_BREAK_REPLACEMENT, LINE_BREAK_IN_COMMAND) return dockerfile @@ -58,7 +59,10 @@ def report_non_fixed_versions(dockerfile: str, versions: dict) -> None: if __name__ == '__main__': dockerfile = load_file(sys.argv[1]) installlog = load_file(sys.argv[2]) + if LINE_BREAK_REPLACEMENT in dockerfile or APT_INSTALL_REPLACEMENT in dockerfile: + raise Exception('Line break replacement {LINE_BREAK_REPLACEMENT} or apt command replacement {APT_INSTALL_REPLACEMENT} in Dockerfile, cannot process it.') + versions = parse_apt_versions(installlog) report_non_fixed_versions(dockerfile, versions) dockerfile = add_apt_versions(dockerfile, versions) - save_file(dockerfile, sys.argv[1]) \ No newline at end of file + save_file(dockerfile, sys.argv[1]) diff --git a/scripts/dev_utils/dockerfile_update_removeVersionApt.py b/scripts/dev_utils/dockerfile_update_removeVersionApt.py index a067578d..7f87aa21 100755 --- a/scripts/dev_utils/dockerfile_update_removeVersionApt.py +++ b/scripts/dev_utils/dockerfile_update_removeVersionApt.py @@ -3,6 +3,9 @@ import re import sys +LINE_BREAK_IN_COMMAND = ' \\\n ' +LINE_BREAK_REPLACEMENT = ' λινε βρεακ ρεπλαζεμεντ ' + def load_file(filename: str) -> str: with open(filename, 'r') as infile: return infile.read() @@ -12,23 +15,22 @@ def save_file(contents: str, filename: str) -> None: outfile.write(contents) -def remove_apt_versions(dockerfile: str) -> str: +def remove_apt_versions(contents: str) -> str: + contents = contents.replace(LINE_BREAK_IN_COMMAND, LINE_BREAK_REPLACEMENT) output = [] - for line in dockerfile.splitlines(): - if line.startswith('RUN apt install'): + for line in contents.splitlines(): + if line.startswith('RUN apt install -y'): out_line = 
re.sub('=[^ ]*', '', line) - parts = out_line.split() - if len(parts) > 3: - header = " ".join(parts[:3]) - pkgs = parts[3:] - out_line = header + " \\\n " + " \\\n ".join(pkgs) output.append(out_line) else: output.append(line) - return '\n'.join(output) - + output = '\n'.join(output) + '\n' + output = output.replace(LINE_BREAK_REPLACEMENT, LINE_BREAK_IN_COMMAND) + return output if __name__ == '__main__': - dockerfile = load_file(sys.argv[1]) - dockerfile = remove_apt_versions(dockerfile) - save_file(dockerfile, sys.argv[1]) \ No newline at end of file + contents = load_file(sys.argv[1]) + if LINE_BREAK_REPLACEMENT in contents: + raise Exception('Line break replacement {LINE_BREAK_REPLACEMENT} in Dockerfile, cannot process it.') + contents = remove_apt_versions(contents) + save_file(contents, sys.argv[1]) From 9bbb0559f01cc53dbfc9ba73acecd1a5987a977d Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 30 Jul 2025 14:33:18 +0200 Subject: [PATCH 102/337] updated apt version numbers --- docker_config/Dockerfile_ODELIA | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 9208e15b..83253588 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -53,7 +53,7 @@ RUN apt install -y \ libtasn1-6=4.18.0-4ubuntu0.1 \ libudev1=249.11-0ubuntu3.16 \ libuuid1=2.37.2-4ubuntu3.4 \ - linux-libc-dev=5.15.0-144.157 \ + linux-libc-dev=5.15.0-151.161 \ logsave=1.46.5-2ubuntu1.2 \ mount=2.37.2-4ubuntu3.4 \ openssl=3.0.2-0ubuntu1.19 \ @@ -93,7 +93,7 @@ RUN apt install -y \ libsasl2-2=2.1.27+dfsg2-3ubuntu1.2 \ libsasl2-modules-db=2.1.27+dfsg2-3ubuntu1.2 \ libsasl2-modules=2.1.27+dfsg2-3ubuntu1.2 \ - libsqlite3-0=3.37.2-2ubuntu0.4 \ + libsqlite3-0=3.37.2-2ubuntu0.5 \ libssh-4=0.9.6-2ubuntu0.22.04.4 \ lsb-release=11.1.0ubuntu4 \ media-types=7.0.0 \ @@ -120,11 +120,11 @@ RUN apt install -y \ dbus-user-session=1.12.20-2ubuntu4.1 \ 
dbus=1.12.20-2ubuntu4.1 \ dmsetup=2:1.02.175-2.1ubuntu5 \ - docker-buildx-plugin=0.25.0-1~ubuntu.22.04~jammy \ - docker-ce-cli=5:28.3.2-1~ubuntu.22.04~jammy \ - docker-ce-rootless-extras=5:28.3.2-1~ubuntu.22.04~jammy \ - docker-ce=5:28.3.2-1~ubuntu.22.04~jammy \ - docker-compose-plugin=2.38.2-1~ubuntu.22.04~jammy \ + docker-buildx-plugin=0.26.1-1~ubuntu.22.04~jammy \ + docker-ce-cli=5:28.3.3-1~ubuntu.22.04~jammy \ + docker-ce-rootless-extras=5:28.3.3-1~ubuntu.22.04~jammy \ + docker-ce=5:28.3.3-1~ubuntu.22.04~jammy \ + docker-compose-plugin=2.39.1-1~ubuntu.22.04~jammy \ gir1.2-glib-2.0=1.72.0-1 \ git-man=1:2.34.1-1ubuntu1.15 \ git=1:2.34.1-1ubuntu1.15 \ @@ -159,7 +159,7 @@ RUN apt install -y \ libnftnl11=1.2.1-1build1 \ libnss-systemd=249.11-0ubuntu3.16 \ libpam-systemd=249.11-0ubuntu3.16 \ - libperl5.34=5.34.0-3ubuntu1.4 \ + libperl5.34=5.34.0-3ubuntu1.5 \ libslirp0=4.6.1-1build1 \ libx11-6=2:1.7.5-1ubuntu0.3 \ libx11-data=2:1.7.5-1ubuntu0.3 \ @@ -174,9 +174,9 @@ RUN apt install -y \ networkd-dispatcher=2.1-2ubuntu0.22.04.2 \ openssh-client=1:8.9p1-3ubuntu0.13 \ patch=2.7.6-7build2 \ - perl-base=5.34.0-3ubuntu1.4 \ - perl-modules-5.34=5.34.0-3ubuntu1.4 \ - perl=5.34.0-3ubuntu1.4 \ + perl-base=5.34.0-3ubuntu1.5 \ + perl-modules-5.34=5.34.0-3ubuntu1.5 \ + perl=5.34.0-3ubuntu1.5 \ pigz=2.6-1 \ python3-dbus=1.2.18-3build1 \ python3-gi=3.42.1-0ubuntu1 \ @@ -342,4 +342,4 @@ RUN mkdir -p /fl_admin/transfer RUN ln -s /MediSwarm /fl_admin/transfer/MediSwarm # Copy pre-trained model weights to image -COPY ./torch_home_cache /torch_home \ No newline at end of file +COPY ./torch_home_cache /torch_home From 494f6a3c14134ec811a52f8a6109d34b38004246 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 30 Jul 2025 14:37:47 +0200 Subject: [PATCH 103/337] refactored to remove duplicate definitions --- .../dockerfile_update_addAptVersionNumbers.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git 
a/scripts/dev_utils/dockerfile_update_addAptVersionNumbers.py b/scripts/dev_utils/dockerfile_update_addAptVersionNumbers.py index 186fc955..cd9c94c7 100755 --- a/scripts/dev_utils/dockerfile_update_addAptVersionNumbers.py +++ b/scripts/dev_utils/dockerfile_update_addAptVersionNumbers.py @@ -3,20 +3,11 @@ import re import sys -LINE_BREAK_IN_COMMAND = ' \\\n ' -LINE_BREAK_REPLACEMENT = ' λινε βρεακ ρεπλαζεμεντ ' +from dockerfile_update_removeVersionApt import LINE_BREAK_IN_COMMAND, LINE_BREAK_REPLACEMENT, load_file, save_file + APT_INSTALL_COMMAND = 'RUN apt install -y' APT_INSTALL_REPLACEMENT = 'ΡΥΝ απτ ινσταλλ -υ' -def load_file(filename: str) -> str: - with open(filename, 'r') as infile: - return infile.read() - -def save_file(contents: str, filename: str) -> None: - with open(filename, 'w') as outfile: - outfile.write(contents) - - def parse_apt_versions(installlog: str) -> dict: versions = {} for line in installlog.splitlines(): From 19f6b3ceb1bb6cd2c2aaee56656fff05c089203c Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 31 Jul 2025 11:28:04 +0200 Subject: [PATCH 104/337] added potential pitfalls --- assets/readme/README.participant.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/assets/readme/README.participant.md b/assets/readme/README.participant.md index afc6e269..0ffcc77c 100644 --- a/assets/readme/README.participant.md +++ b/assets/readme/README.participant.md @@ -157,3 +157,9 @@ To have a baseline for swarm training, train the same model in a comparable way tail -f nohup.out # Follow training log ``` For any issues, check if the commands above point to problems and contact your Swarm Operator. 
+ +## Troubleshooting + +* Image files need to have the correct file name including capitalization +* The directories listed as identifiers in the tables `annotation.csv` and `split.csv` should all be present, only those directories should be present +* The tables should not have additional or duplicate columns, entries need to have the correct captitalization From 49a1a2ee984681cdbb90fd9663ae60e6ecc9d755 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 12 Aug 2025 11:52:51 +0200 Subject: [PATCH 105/337] fixed file name --- assets/readme/README.participant.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/readme/README.participant.md b/assets/readme/README.participant.md index 0ffcc77c..9a14fa7f 100644 --- a/assets/readme/README.participant.md +++ b/assets/readme/README.participant.md @@ -54,7 +54,7 @@ The dataset must be in the following format. #### Annotation -* `split.csv` defines the class labels +* `annotation.csv` defines the class labels * The file contains the columns `UID`, `PatientID`, `Age`, `Lesion` * `UID` is the identifier used in the folder name, e.g., `ID_001_left`. * `PatientID` is the identifier of the patient, in this case, `ID_001`. From 516d02a13334fc37084d838d3fcdcb371795e112 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 15 Aug 2025 10:20:39 +0200 Subject: [PATCH 106/337] note on age column --- assets/readme/README.participant.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/readme/README.participant.md b/assets/readme/README.participant.md index 9a14fa7f..06337a26 100644 --- a/assets/readme/README.participant.md +++ b/assets/readme/README.participant.md @@ -59,6 +59,7 @@ The dataset must be in the following format. * `UID` is the identifier used in the folder name, e.g., `ID_001_left`. * `PatientID` is the identifier of the patient, in this case, `ID_001`. * `Age` is the age of the patient at the time of the scan in days. 
+ This columns is ignored for our current technical tests and exists only for compatibility with the ODELIA challenge data format. Please ignore discrepancies if age is listed in other units than days. * `Lesion` is 0 for no lesion, 1 for benign lesion, and 2 for malicious lesion. #### Split @@ -70,7 +71,6 @@ The dataset must be in the following format. * `Split` is either `train`, `val`, or `test`. The test set is currently ignored. * `Fold` is the 0-based index of the fold (for a potential cross-validation). - ## Prepare Training Participation 1. Extract startup kit provided by swarm operator From e885310f5e940e961ae344e3b1636ba174be1ea6 Mon Sep 17 00:00:00 2001 From: GitHub CI Date: Fri, 15 Aug 2025 11:25:57 +0200 Subject: [PATCH 107/337] docs: enhance VPN setup guide with troubleshooting steps for existing connections --- assets/VPN setup guide(CLI).md | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/assets/VPN setup guide(CLI).md b/assets/VPN setup guide(CLI).md index d6e9bc56..627cbc5e 100644 --- a/assets/VPN setup guide(CLI).md +++ b/assets/VPN setup guide(CLI).md @@ -72,8 +72,20 @@ sh envsetup_scripts/setup_vpntunnel.sh The `.ovpn` file assigned to you by TUD is required for re-establishing the connection. -For further troubleshooting, refer to the **VPN Connect Guide**. +For further troubleshooting, refer to the VPN Connect Guide on the GoodAccess support page: +[GoodAccess VPN Connect Guide](https://support.goodaccess.com/configuration-guides/linux) ---- -This guide ensures a smooth setup and reconnection process for GoodAccess VPN via CLI. \ No newline at end of file + +## Step 6: Troubleshooting — Disconnecting Existing VPN Connections + +Some users have experienced that connecting to GoodAccess **disconnects an existing VPN connection**. +This may happen because OpenVPN is configured to redirect all network traffic through the GoodAccess tunnel, which overrides your local or other VPN routes. 
+ +If this occurs, you can prevent the redirection by starting OpenVPN with: +```sh +openvpn --config .ovpn --pull-filter ignore redirect-gateway +``` +This tells the OpenVPN client **not** to override your default gateway, allowing your other VPN connection to remain active. + +> **Note:** This behavior was reported by Aitor after certain OpenVPN updates. The above command has been effective in resolving the issue. \ No newline at end of file From 66ccb58f51dcc93ff675c334154a18e8956ef4a7 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 15 Aug 2025 14:33:11 +0200 Subject: [PATCH 108/337] extended description of VPN issue --- assets/VPN setup guide(CLI).md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/assets/VPN setup guide(CLI).md b/assets/VPN setup guide(CLI).md index 627cbc5e..e70fe26c 100644 --- a/assets/VPN setup guide(CLI).md +++ b/assets/VPN setup guide(CLI).md @@ -79,13 +79,13 @@ For further troubleshooting, refer to the VPN Connect Guide on the GoodAccess su ## Step 6: Troubleshooting — Disconnecting Existing VPN Connections -Some users have experienced that connecting to GoodAccess **disconnects an existing VPN connection**. -This may happen because OpenVPN is configured to redirect all network traffic through the GoodAccess tunnel, which overrides your local or other VPN routes. +Some users have experienced that connecting to GoodAccess **disconnects an existing VPN or ssh connection**. +This may happen because OpenVPN is configured to redirect all network traffic through the GoodAccess tunnel, which overrides your local or other VPN routes and may make the machine inaccessible in its local network. If this occurs, you can prevent the redirection by starting OpenVPN with: ```sh openvpn --config .ovpn --pull-filter ignore redirect-gateway ``` -This tells the OpenVPN client **not** to override your default gateway, allowing your other VPN connection to remain active. 
+This tells the OpenVPN client **not** to override your default gateway, allowing your other VPN or ssh connection to remain active. -> **Note:** This behavior was reported by Aitor after certain OpenVPN updates. The above command has been effective in resolving the issue. \ No newline at end of file +> **Note:** This behavior was observed by Aitor and Ole after certain OpenVPN updates. The above command has been effective in resolving the issue. \ No newline at end of file From 4439f1e9f08a2cb370bec9a5df73f3bb3ef312bb Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 15 Aug 2025 16:36:44 +0200 Subject: [PATCH 109/337] added potential pitfall --- assets/readme/README.participant.md | 1 + 1 file changed, 1 insertion(+) diff --git a/assets/readme/README.participant.md b/assets/readme/README.participant.md index 06337a26..e93eaf2a 100644 --- a/assets/readme/README.participant.md +++ b/assets/readme/README.participant.md @@ -163,3 +163,4 @@ For any issues, check if the commands above point to problems and contact your S * Image files need to have the correct file name including capitalization * The directories listed as identifiers in the tables `annotation.csv` and `split.csv` should all be present, only those directories should be present * The tables should not have additional or duplicate columns, entries need to have the correct captitalization +* Image and table folders and files need to be present in the folders specified via `--data_dir`. Symlinks to other locations do not work, they are not available in the Docker mount. 
From 70315912fcaa6cd6460d2f811ef26dbc46774b68 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 13 Aug 2025 14:53:45 +0200 Subject: [PATCH 110/337] use consistent server name --- tests/provision/dummy_project_for_testing.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/provision/dummy_project_for_testing.yml b/tests/provision/dummy_project_for_testing.yml index 7e259592..d4984d77 100644 --- a/tests/provision/dummy_project_for_testing.yml +++ b/tests/provision/dummy_project_for_testing.yml @@ -34,7 +34,7 @@ builders: path: nvflare.ha.dummy_overseer_agent.DummyOverseerAgent overseer_exists: false args: - sp_end_point: odeliatempvm.local:8002:8003 + sp_end_point: server.local:8002:8003 - path: nvflare.lighter.impl.cert.CertBuilder - path: nvflare.lighter.impl.signature.SignatureBuilder From 4097f791dca30f184ec2e259e326d9b7ff8c8e82 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 13 Aug 2025 16:04:46 +0200 Subject: [PATCH 111/337] scripts to start a dummy training from the startup kits --- _testsOutsideDocker_submitDummyTraining.exp | 15 +++++ runTestsOutsideDocker.sh | 73 +++++++++++++++++++++ 2 files changed, 88 insertions(+) create mode 100755 _testsOutsideDocker_submitDummyTraining.exp create mode 100755 runTestsOutsideDocker.sh diff --git a/_testsOutsideDocker_submitDummyTraining.exp b/_testsOutsideDocker_submitDummyTraining.exp new file mode 100755 index 00000000..7d69997c --- /dev/null +++ b/_testsOutsideDocker_submitDummyTraining.exp @@ -0,0 +1,15 @@ +#!/usr/bin/env expect + +spawn ./docker.sh --no_pull +expect "User Name: " +send "admin@test.odelia\r" +expect "> " +send "submit_job MediSwarm/application/jobs/minimal_training_pytorch_cnn\r" +expect "> " +send "sys_info client\r" +expect "> " +send "sys_info server\r" +expect "> " +send "list_jobs\r" +expect "> " +send "list_jobs\r" diff --git a/runTestsOutsideDocker.sh b/runTestsOutsideDocker.sh new file mode 100755 index 00000000..986168f1 --- /dev/null +++ 
b/runTestsOutsideDocker.sh @@ -0,0 +1,73 @@ +#!/usr/bin/env bash + +set -e + +if ! grep -q "127.0.0.1 server.local" /etc/hosts; then + echo "/etc/hosts needs to contain the following line, please add it." + echo "127.0.0.1 server.local localhost" + exit 1 +fi + +if [ -z "$GPU_FOR_TESTING" ]; then + export GPU_FOR_TESTING="all" +fi + +VERSION=$(./getVersionNumber.sh) +DOCKER_IMAGE=jefftud/odelia:$VERSION +PROJECT_DIR="workspace/odelia_${VERSION}_dummy_project_for_testing" +SYNTHETIC_DATA_DIR=$(mktemp -d) +CWD=$(pwd) + +create_synthetic_data () { + # create synthetic data + docker run --rm \ + -u $(id -u):$(id -g) \ + -v "$SYNTHETIC_DATA_DIR":/synthetic_data \ + -w /MediSwarm \ + jefftud/odelia:$VERSION \ + /bin/bash -c "python3 application/jobs/ODELIA_ternary_classification/app/scripts/create_synthetic_dataset/create_synthetic_dataset.py /synthetic_data" +} + +cleanup () { + rm -rf "$SYNTHETIC_DATA_DIR" + docker kill odelia_swarm_server_flserver odelia_swarm_client_client_A odelia_swarm_client_client_B +} + +start_server_and_clients () { + cd $PROJECT_DIR/prod_00 + cd server.local/startup + ./docker.sh --no_pull --start_server + cd ../.. + sleep 10 + + cd client_A/startup + ./docker.sh --no_pull --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir /data/MEVISTwoNodeSwarm/scratch --GPU device=$GPU_FOR_TESTING --start_client + cd ../.. 
+ cd client_B/startup + ./docker.sh --no_pull --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir /data/MEVISTwoNodeSwarm/scratch --GPU device=$GPU_FOR_TESTING --start_client + sleep 5 + + cd "$CWD" +} + +run_dummy_training () { + cd $PROJECT_DIR/prod_00 + cd admin@test.odelia/startup + ../../../../../_testsOutsideDocker_submitDummyTraining.exp + docker kill fladmin + sleep 60 +} + +check_output_of_dummy_training () { + echo "TODO check output of dummy training" +} + +run_tests () { + create_synthetic_data + start_server_and_clients + run_dummy_training + check_output_of_dummy_training + cleanup +} + +run_tests From 5b6cbfb79d2e589df5b47509086c3adeee27a3ca Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 14 Aug 2025 15:59:12 +0200 Subject: [PATCH 112/337] skeleton for further tests --- runTestsOutsideDocker.sh | 46 +++++++++++++++++++++++++++++++++++----- 1 file changed, 41 insertions(+), 5 deletions(-) diff --git a/runTestsOutsideDocker.sh b/runTestsOutsideDocker.sh index 986168f1..f82a4df7 100755 --- a/runTestsOutsideDocker.sh +++ b/runTestsOutsideDocker.sh @@ -18,6 +18,14 @@ PROJECT_DIR="workspace/odelia_${VERSION}_dummy_project_for_testing" SYNTHETIC_DATA_DIR=$(mktemp -d) CWD=$(pwd) +check_files_on_github () { + echo "TODO check files/documentation on github" +} + +check_startup_kits () { + echo "TODO check startup kits" +} + create_synthetic_data () { # create synthetic data docker run --rm \ @@ -28,9 +36,8 @@ create_synthetic_data () { /bin/bash -c "python3 application/jobs/ODELIA_ternary_classification/app/scripts/create_synthetic_dataset/create_synthetic_dataset.py /synthetic_data" } -cleanup () { +cleanup_synthetic_data () { rm -rf "$SYNTHETIC_DATA_DIR" - docker kill odelia_swarm_server_flserver odelia_swarm_client_client_A odelia_swarm_client_client_B } start_server_and_clients () { @@ -50,7 +57,23 @@ start_server_and_clients () { cd "$CWD" } -run_dummy_training () { +kill_server_and_clients () { + docker kill odelia_swarm_server_flserver 
odelia_swarm_client_client_A odelia_swarm_client_client_B +} + +run_docker_gpu_preflight_check () { + echo "TODO run dummy training locally" +} + +run_data_access_preflight_check () { + echo "TODO run data access preflight check locally" +} + +check_output_of_preflight_checks () { + echo "TODO check output of preflight checks" +} + +run_dummy_training_in_swarm () { cd $PROJECT_DIR/prod_00 cd admin@test.odelia/startup ../../../../../_testsOutsideDocker_submitDummyTraining.exp @@ -63,11 +86,24 @@ check_output_of_dummy_training () { } run_tests () { + check_files_on_github + + check_startup_kits + create_synthetic_data + + run_docker_gpu_preflight_check + run_data_access_preflight_check + check_output_of_preflight_checks + start_server_and_clients - run_dummy_training + + run_dummy_training_in_swarm check_output_of_dummy_training - cleanup + + kill_server_and_clients + + cleanup_synthetic_data } run_tests From 4c3c23d1499f1920dad18c55aecf36e405b2d988 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 15 Aug 2025 14:48:10 +0200 Subject: [PATCH 113/337] added preflight checks (without checking their output so far) --- runTestsOutsideDocker.sh | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/runTestsOutsideDocker.sh b/runTestsOutsideDocker.sh index f82a4df7..bbf450e6 100755 --- a/runTestsOutsideDocker.sh +++ b/runTestsOutsideDocker.sh @@ -16,6 +16,7 @@ VERSION=$(./getVersionNumber.sh) DOCKER_IMAGE=jefftud/odelia:$VERSION PROJECT_DIR="workspace/odelia_${VERSION}_dummy_project_for_testing" SYNTHETIC_DATA_DIR=$(mktemp -d) +SCRATCH_DIR=$(mktemp -d) CWD=$(pwd) check_files_on_github () { @@ -36,22 +37,23 @@ create_synthetic_data () { /bin/bash -c "python3 application/jobs/ODELIA_ternary_classification/app/scripts/create_synthetic_dataset/create_synthetic_dataset.py /synthetic_data" } -cleanup_synthetic_data () { +cleanup_temporary_data () { rm -rf "$SYNTHETIC_DATA_DIR" + rm -rf "$SCRATCH_DIR" } start_server_and_clients 
() { - cd $PROJECT_DIR/prod_00 + cd "$PROJECT_DIR"/prod_00 cd server.local/startup ./docker.sh --no_pull --start_server cd ../.. sleep 10 cd client_A/startup - ./docker.sh --no_pull --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir /data/MEVISTwoNodeSwarm/scratch --GPU device=$GPU_FOR_TESTING --start_client + ./docker.sh --no_pull --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir "$SCRATCH_DIR"/client_A --GPU device=$GPU_FOR_TESTING --start_client cd ../.. cd client_B/startup - ./docker.sh --no_pull --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir /data/MEVISTwoNodeSwarm/scratch --GPU device=$GPU_FOR_TESTING --start_client + ./docker.sh --no_pull --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir "$SCRATCH_DIR"/client_B --GPU device=$GPU_FOR_TESTING --start_client sleep 5 cd "$CWD" @@ -62,26 +64,29 @@ kill_server_and_clients () { } run_docker_gpu_preflight_check () { - echo "TODO run dummy training locally" + cd "$PROJECT_DIR"/prod_00 + cd client_A/startup + ./docker.sh --scratch_dir "$SCRATCH_DIR"/client_A --GPU device=$GPU_FOR_TESTING --dummy_training --no_pull 2>&1 | tee dummy_training_console_output.txt + echo "TODO check output in dummy_training_console_output.txt" + cd "$CWD" } run_data_access_preflight_check () { - echo "TODO run data access preflight check locally" -} - -check_output_of_preflight_checks () { - echo "TODO check output of preflight checks" + cd "$PROJECT_DIR"/prod_00 + cd client_A/startup + ./docker.sh --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir "$SCRATCH_DIR"/client_A --GPU device=$GPU_FOR_TESTING --preflight_check --no_pull 2>&1 | tee preflight_check_console_output.txt + echo "TODO check output in preflight_check_console_output.txt" + cd ../.. + cd ../.. 
} run_dummy_training_in_swarm () { - cd $PROJECT_DIR/prod_00 + cd "$PROJECT_DIR"/prod_00 cd admin@test.odelia/startup ../../../../../_testsOutsideDocker_submitDummyTraining.exp docker kill fladmin sleep 60 -} -check_output_of_dummy_training () { echo "TODO check output of dummy training" } @@ -94,16 +99,14 @@ run_tests () { run_docker_gpu_preflight_check run_data_access_preflight_check - check_output_of_preflight_checks start_server_and_clients run_dummy_training_in_swarm - check_output_of_dummy_training kill_server_and_clients - cleanup_synthetic_data + cleanup_temporary_data } run_tests From e6ace296b5a5db53dc1698fdd546d91026610e4e Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 18 Aug 2025 11:41:52 +0200 Subject: [PATCH 114/337] check if (source code for) license is available on github and if README contains certain keywords --- runTestsOutsideDocker.sh | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/runTestsOutsideDocker.sh b/runTestsOutsideDocker.sh index bbf450e6..32c7c3fa 100755 --- a/runTestsOutsideDocker.sh +++ b/runTestsOutsideDocker.sh @@ -20,7 +20,24 @@ SCRATCH_DIR=$(mktemp -d) CWD=$(pwd) check_files_on_github () { - echo "TODO check files/documentation on github" + CONTENT=$(curl -L https://github.com/KatherLab/MediSwarm/raw/refs/heads/main/LICENSE) + if echo "$CONTENT" | grep -q "MIT License" ; then + echo "Downloaded and verified license from github" + else + echo "Could not download and verify license" + exit 1 + fi + + CONTENT=$(curl -L https://github.com/KatherLab/MediSwarm/raw/refs/heads/main/README.md) + for ROLE in 'Swarm Participant' 'Developer' 'Swarm Operator'; + do + if echo "$CONTENT" | grep -q "$ROLE" ; then + echo "Instructions for $ROLE found" + else + echo "Instructions for role $ROLE missing" + exit 1 + fi + done } check_startup_kits () { @@ -28,7 +45,6 @@ check_startup_kits () { } create_synthetic_data () { - # create synthetic data docker run --rm \ -u $(id -u):$(id -g) \ -v 
"$SYNTHETIC_DATA_DIR":/synthetic_data \ From 62f00eebfc87f9764e69946a0dfd39aa3c7c83d7 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 18 Aug 2025 11:42:51 +0200 Subject: [PATCH 115/337] check if second startup kit can be built and contains expected files --- runTestsOutsideDocker.sh | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/runTestsOutsideDocker.sh b/runTestsOutsideDocker.sh index 32c7c3fa..3142df9c 100755 --- a/runTestsOutsideDocker.sh +++ b/runTestsOutsideDocker.sh @@ -18,6 +18,7 @@ PROJECT_DIR="workspace/odelia_${VERSION}_dummy_project_for_testing" SYNTHETIC_DATA_DIR=$(mktemp -d) SCRATCH_DIR=$(mktemp -d) CWD=$(pwd) +PROJECT_FILE="tests/provision/dummy_project_for_testing.yml" check_files_on_github () { CONTENT=$(curl -L https://github.com/KatherLab/MediSwarm/raw/refs/heads/main/LICENSE) @@ -40,8 +41,37 @@ check_files_on_github () { done } -check_startup_kits () { - echo "TODO check startup kits" +create_second_startup_kit () { + if [ ! 
-d "$PROJECT_DIR"/prod_00 ]; then + echo '"$PROJECT_DIR"/prod_00 does not exist, please generate the startup kit first' + exit 1 + fi + if [ -d "$PROJECT_DIR"/prod_01 ]; then + echo '"$PROJECT_DIR"/prod_01 exists, please remove it' + exit 1 + fi + ./_buildStartupKits.sh $PROJECT_FILE $VERSION + + for FILE in 'client.crt' 'client.key' 'docker.sh' 'rootCA.pem'; + do + if [ -f "$PROJECT_DIR/prod_01/client_A/startup/$FILE" ] ; then + echo "$FILE found" + else + echo "$FILE missing" + exit 1 + fi + done + + ZIP_CONTENT=$(unzip -tv "$PROJECT_DIR/prod_01/client_B_${VERSION}.zip") + for FILE in 'client.crt' 'client.key' 'docker.sh' 'rootCA.pem'; + do + if echo "$ZIP_CONTENT" | grep -q "$FILE" ; then + echo "$FILE found in zip" + else + echo "$FILE missing in zip" + exit 1 + fi + done } create_synthetic_data () { @@ -109,7 +139,7 @@ run_dummy_training_in_swarm () { run_tests () { check_files_on_github - check_startup_kits + create_second_startup_kit create_synthetic_data From b3911aecb30f8b9ee3ac26eb7706420fa0b85220 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 18 Aug 2025 14:47:53 +0200 Subject: [PATCH 116/337] check output of preflight checks --- runTestsOutsideDocker.sh | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/runTestsOutsideDocker.sh b/runTestsOutsideDocker.sh index 3142df9c..b27372d0 100755 --- a/runTestsOutsideDocker.sh +++ b/runTestsOutsideDocker.sh @@ -112,18 +112,33 @@ kill_server_and_clients () { run_docker_gpu_preflight_check () { cd "$PROJECT_DIR"/prod_00 cd client_A/startup - ./docker.sh --scratch_dir "$SCRATCH_DIR"/client_A --GPU device=$GPU_FOR_TESTING --dummy_training --no_pull 2>&1 | tee dummy_training_console_output.txt - echo "TODO check output in dummy_training_console_output.txt" + CONSOLE_OUTPUT=docker_gpu_preflight_check_console_output.txt + ./docker.sh --scratch_dir "$SCRATCH_DIR"/client_A --GPU device=$GPU_FOR_TESTING --dummy_training --no_pull 2>&1 | tee "$CONSOLE_OUTPUT" + + if grep -q 
"Epoch 1: 100%" "$CONSOLE_OUTPUT" && grep -q "Training completed successfully" "$CONSOLE_OUTPUT"; then + echo "Expected output of Docker/GPU preflight check found" + else + echo "Missing expected output of Docker/GPU preflight check" + exit 1 + fi + cd "$CWD" } run_data_access_preflight_check () { cd "$PROJECT_DIR"/prod_00 cd client_A/startup - ./docker.sh --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir "$SCRATCH_DIR"/client_A --GPU device=$GPU_FOR_TESTING --preflight_check --no_pull 2>&1 | tee preflight_check_console_output.txt - echo "TODO check output in preflight_check_console_output.txt" - cd ../.. - cd ../.. + CONSOLE_OUTPUT=data_access_preflight_check_console_output.txt + ./docker.sh --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir "$SCRATCH_DIR"/client_A --GPU device=$GPU_FOR_TESTING --preflight_check --no_pull 2>&1 | tee $CONSOLE_OUTPUT + + if grep -q "Train set: 18, Val set: 6" "$CONSOLE_OUTPUT" && grep -q "Epoch 0: 100%" "$CONSOLE_OUTPUT"; then + echo "Expected output of Docker/GPU preflight check found" + else + echo "Missing expected output of Docker/GPU preflight check" + exit 1 + fi + + cd "$CWD" } run_dummy_training_in_swarm () { From 0538226b85c0387fa34675011580ff28d6bce180 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 18 Aug 2025 17:32:14 +0200 Subject: [PATCH 117/337] check captured console output of swarm training and files created --- runTestsOutsideDocker.sh | 42 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/runTestsOutsideDocker.sh b/runTestsOutsideDocker.sh index b27372d0..d0b65636 100755 --- a/runTestsOutsideDocker.sh +++ b/runTestsOutsideDocker.sh @@ -147,8 +147,46 @@ run_dummy_training_in_swarm () { ../../../../../_testsOutsideDocker_submitDummyTraining.exp docker kill fladmin sleep 60 + cd "$CWD" + + cd "$PROJECT_DIR"/prod_00/server.local/startup + CONSOLE_OUTPUT=nohup.out + for EXPECTED_OUTPUT in 'Total clients: 2' 'updated status of client client_A on round 4' 'updated status 
of client client_B on round 4' 'all_done=True' 'Server runner finished.'; + do + if grep -q "$EXPECTED_OUTPUT" "$CONSOLE_OUTPUT"; then + echo "Expected output $EXPECTED_OUTPUT found" + else + echo "Expected output $EXPECTED_OUTPUT missing" + exit 1 + fi + done + cd "$CWD" - echo "TODO check output of dummy training" + cd "$PROJECT_DIR"/prod_00/client_A/startup + CONSOLE_OUTPUT=nohup.out + for EXPECTED_OUTPUT in 'Sending training result to aggregation client' 'Epoch 9: 100%' ; + do + if grep -q "$EXPECTED_OUTPUT" "$CONSOLE_OUTPUT"; then + echo "Expected output $EXPECTED_OUTPUT found" + else + echo "Expected output $EXPECTED_OUTPUT missing" + exit 1 + fi + done + cd "$CWD" + + cd "$PROJECT_DIR"/prod_00/client_A/ + FILES_PRESENT=$(find . -type f -name "*.*") + for EXPECTED_FILE in 'custom/minimal_training.py' 'best_FL_global_model.pt' 'FL_global_model.pt' ; + do + if echo "$FILES_PRESENT" | grep -q "$EXPECTED_FILE" ; then + echo "Expected file $EXPECTED_FILE found" + else + echo "Expected file $EXPECTED_FILE missing" + exit 1 + fi + done + cd "$CWD" } run_tests () { @@ -162,9 +200,7 @@ run_tests () { run_data_access_preflight_check start_server_and_clients - run_dummy_training_in_swarm - kill_server_and_clients cleanup_temporary_data From b2431e53ccd28d396c57d6802e58b4a0694c0101 Mon Sep 17 00:00:00 2001 From: Ultimate-Storm Date: Tue, 19 Aug 2025 06:11:19 +0200 Subject: [PATCH 118/337] chore: update apt versions in Dockerfile_ODELIA --- docker_config/Dockerfile_ODELIA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 83253588..7441f69c 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -53,7 +53,7 @@ RUN apt install -y \ libtasn1-6=4.18.0-4ubuntu0.1 \ libudev1=249.11-0ubuntu3.16 \ libuuid1=2.37.2-4ubuntu3.4 \ - linux-libc-dev=5.15.0-151.161 \ + linux-libc-dev=5.15.0-152.162 \ logsave=1.46.5-2ubuntu1.2 \ mount=2.37.2-4ubuntu3.4 \ 
openssl=3.0.2-0ubuntu1.19 \ From 4019b94cdeb5c693b7a236d4a28fbcee4e71190c Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 20 Aug 2025 14:22:19 +0200 Subject: [PATCH 119/337] manually updated furhter outdated package version --- docker_config/Dockerfile_ODELIA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 7441f69c..2d670c5b 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -167,7 +167,7 @@ RUN apt install -y \ libxcb1=1.14-3ubuntu3 \ libxdmcp6=1:1.1.3-0ubuntu5 \ libxext6=2:1.3.4-1build1 \ - libxml2=2.9.13+dfsg-1ubuntu0.7 \ + libxml2=2.9.13+dfsg-1ubuntu0.8 \ libxmuu1=2:1.1.3-3 \ libxtables12=1.8.7-1ubuntu5.2 \ netbase=6.3 \ From 00c07b771dc400c95d25d94cf4408c17d43d1f90 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 20 Aug 2025 15:02:39 +0200 Subject: [PATCH 120/337] test pushing and pulling image to/from local docker registry --- _testsOutsideDocker_submitDummyTraining.exp | 2 +- runTestsOutsideDocker.sh | 31 ++++++++++++++----- tests/provision/dummy_project_for_testing.yml | 2 +- 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/_testsOutsideDocker_submitDummyTraining.exp b/_testsOutsideDocker_submitDummyTraining.exp index 7d69997c..0a79ec00 100755 --- a/_testsOutsideDocker_submitDummyTraining.exp +++ b/_testsOutsideDocker_submitDummyTraining.exp @@ -1,6 +1,6 @@ #!/usr/bin/env expect -spawn ./docker.sh --no_pull +spawn ./docker.sh expect "User Name: " send "admin@test.odelia\r" expect "> " diff --git a/runTestsOutsideDocker.sh b/runTestsOutsideDocker.sh index d0b65636..7a14090f 100755 --- a/runTestsOutsideDocker.sh +++ b/runTestsOutsideDocker.sh @@ -13,7 +13,8 @@ if [ -z "$GPU_FOR_TESTING" ]; then fi VERSION=$(./getVersionNumber.sh) -DOCKER_IMAGE=jefftud/odelia:$VERSION +GENERATED_DOCKER_IMAGE=jefftud/odelia:$VERSION +EXPECTED_DOCKER_IMAGE=localhost:5000/$GENERATED_DOCKER_IMAGE # must match what is specified in the 
project.yml PROJECT_DIR="workspace/odelia_${VERSION}_dummy_project_for_testing" SYNTHETIC_DATA_DIR=$(mktemp -d) SCRATCH_DIR=$(mktemp -d) @@ -74,12 +75,20 @@ create_second_startup_kit () { done } +push_image_to_local_docker_registry () { + docker run -d -p 5000:5000 --rm --name registry registry:3 + docker tag $GENERATED_DOCKER_IMAGE $EXPECTED_DOCKER_IMAGE + docker push $EXPECTED_DOCKER_IMAGE + docker rmi $EXPECTED_DOCKER_IMAGE # so that pulling later has an effect + docker pull $EXPECTED_DOCKER_IMAGE +} + create_synthetic_data () { docker run --rm \ -u $(id -u):$(id -g) \ -v "$SYNTHETIC_DATA_DIR":/synthetic_data \ -w /MediSwarm \ - jefftud/odelia:$VERSION \ + $GENERATED_DOCKER_IMAGE \ /bin/bash -c "python3 application/jobs/ODELIA_ternary_classification/app/scripts/create_synthetic_dataset/create_synthetic_dataset.py /synthetic_data" } @@ -88,18 +97,23 @@ cleanup_temporary_data () { rm -rf "$SCRATCH_DIR" } +cleanup_local_docker_registry () { + docker rmi $EXPECTED_DOCKER_IMAGE + docker kill registry +} + start_server_and_clients () { cd "$PROJECT_DIR"/prod_00 cd server.local/startup - ./docker.sh --no_pull --start_server + ./docker.sh --start_server cd ../.. sleep 10 cd client_A/startup - ./docker.sh --no_pull --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir "$SCRATCH_DIR"/client_A --GPU device=$GPU_FOR_TESTING --start_client + ./docker.sh --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir "$SCRATCH_DIR"/client_A --GPU device=$GPU_FOR_TESTING --start_client cd ../.. 
cd client_B/startup - ./docker.sh --no_pull --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir "$SCRATCH_DIR"/client_B --GPU device=$GPU_FOR_TESTING --start_client + ./docker.sh --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir "$SCRATCH_DIR"/client_B --GPU device=$GPU_FOR_TESTING --start_client sleep 5 cd "$CWD" @@ -113,7 +127,7 @@ run_docker_gpu_preflight_check () { cd "$PROJECT_DIR"/prod_00 cd client_A/startup CONSOLE_OUTPUT=docker_gpu_preflight_check_console_output.txt - ./docker.sh --scratch_dir "$SCRATCH_DIR"/client_A --GPU device=$GPU_FOR_TESTING --dummy_training --no_pull 2>&1 | tee "$CONSOLE_OUTPUT" + ./docker.sh --scratch_dir "$SCRATCH_DIR"/client_A --GPU device=$GPU_FOR_TESTING --dummy_training 2>&1 | tee "$CONSOLE_OUTPUT" if grep -q "Epoch 1: 100%" "$CONSOLE_OUTPUT" && grep -q "Training completed successfully" "$CONSOLE_OUTPUT"; then echo "Expected output of Docker/GPU preflight check found" @@ -129,7 +143,7 @@ run_data_access_preflight_check () { cd "$PROJECT_DIR"/prod_00 cd client_A/startup CONSOLE_OUTPUT=data_access_preflight_check_console_output.txt - ./docker.sh --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir "$SCRATCH_DIR"/client_A --GPU device=$GPU_FOR_TESTING --preflight_check --no_pull 2>&1 | tee $CONSOLE_OUTPUT + ./docker.sh --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir "$SCRATCH_DIR"/client_A --GPU device=$GPU_FOR_TESTING --preflight_check 2>&1 | tee $CONSOLE_OUTPUT if grep -q "Train set: 18, Val set: 6" "$CONSOLE_OUTPUT" && grep -q "Epoch 0: 100%" "$CONSOLE_OUTPUT"; then echo "Expected output of Docker/GPU preflight check found" @@ -194,6 +208,8 @@ run_tests () { create_second_startup_kit + push_image_to_local_docker_registry + create_synthetic_data run_docker_gpu_preflight_check @@ -204,6 +220,7 @@ run_tests () { kill_server_and_clients cleanup_temporary_data + cleanup_local_docker_registry } run_tests diff --git a/tests/provision/dummy_project_for_testing.yml b/tests/provision/dummy_project_for_testing.yml index d4984d77..39a83bd0 100644 --- 
a/tests/provision/dummy_project_for_testing.yml +++ b/tests/provision/dummy_project_for_testing.yml @@ -29,7 +29,7 @@ builders: args: config_folder: config scheme: http - docker_image: jefftud/odelia:__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_STARTUP_KITS__ + docker_image: "localhost:5000/jefftud/odelia:__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_STARTUP_KITS__" overseer_agent: path: nvflare.ha.dummy_overseer_agent.DummyOverseerAgent overseer_exists: false From c7974a9b86e1e191404d37459ad4fbd05b146b09 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 20 Aug 2025 15:46:54 +0200 Subject: [PATCH 121/337] Revert "test pushing and pulling image to/from local docker registry", this is very slow This reverts commit 00c07b771dc400c95d25d94cf4408c17d43d1f90. --- _testsOutsideDocker_submitDummyTraining.exp | 2 +- runTestsOutsideDocker.sh | 31 +++++-------------- tests/provision/dummy_project_for_testing.yml | 2 +- 3 files changed, 9 insertions(+), 26 deletions(-) diff --git a/_testsOutsideDocker_submitDummyTraining.exp b/_testsOutsideDocker_submitDummyTraining.exp index 0a79ec00..7d69997c 100755 --- a/_testsOutsideDocker_submitDummyTraining.exp +++ b/_testsOutsideDocker_submitDummyTraining.exp @@ -1,6 +1,6 @@ #!/usr/bin/env expect -spawn ./docker.sh +spawn ./docker.sh --no_pull expect "User Name: " send "admin@test.odelia\r" expect "> " diff --git a/runTestsOutsideDocker.sh b/runTestsOutsideDocker.sh index 7a14090f..d0b65636 100755 --- a/runTestsOutsideDocker.sh +++ b/runTestsOutsideDocker.sh @@ -13,8 +13,7 @@ if [ -z "$GPU_FOR_TESTING" ]; then fi VERSION=$(./getVersionNumber.sh) -GENERATED_DOCKER_IMAGE=jefftud/odelia:$VERSION -EXPECTED_DOCKER_IMAGE=localhost:5000/$GENERATED_DOCKER_IMAGE # must match what is specified in the project.yml +DOCKER_IMAGE=jefftud/odelia:$VERSION PROJECT_DIR="workspace/odelia_${VERSION}_dummy_project_for_testing" SYNTHETIC_DATA_DIR=$(mktemp -d) SCRATCH_DIR=$(mktemp -d) @@ -75,20 +74,12 @@ create_second_startup_kit () { done } 
-push_image_to_local_docker_registry () { - docker run -d -p 5000:5000 --rm --name registry registry:3 - docker tag $GENERATED_DOCKER_IMAGE $EXPECTED_DOCKER_IMAGE - docker push $EXPECTED_DOCKER_IMAGE - docker rmi $EXPECTED_DOCKER_IMAGE # so that pulling later has an effect - docker pull $EXPECTED_DOCKER_IMAGE -} - create_synthetic_data () { docker run --rm \ -u $(id -u):$(id -g) \ -v "$SYNTHETIC_DATA_DIR":/synthetic_data \ -w /MediSwarm \ - $GENERATED_DOCKER_IMAGE \ + jefftud/odelia:$VERSION \ /bin/bash -c "python3 application/jobs/ODELIA_ternary_classification/app/scripts/create_synthetic_dataset/create_synthetic_dataset.py /synthetic_data" } @@ -97,23 +88,18 @@ cleanup_temporary_data () { rm -rf "$SCRATCH_DIR" } -cleanup_local_docker_registry () { - docker rmi $EXPECTED_DOCKER_IMAGE - docker kill registry -} - start_server_and_clients () { cd "$PROJECT_DIR"/prod_00 cd server.local/startup - ./docker.sh --start_server + ./docker.sh --no_pull --start_server cd ../.. sleep 10 cd client_A/startup - ./docker.sh --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir "$SCRATCH_DIR"/client_A --GPU device=$GPU_FOR_TESTING --start_client + ./docker.sh --no_pull --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir "$SCRATCH_DIR"/client_A --GPU device=$GPU_FOR_TESTING --start_client cd ../.. 
cd client_B/startup - ./docker.sh --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir "$SCRATCH_DIR"/client_B --GPU device=$GPU_FOR_TESTING --start_client + ./docker.sh --no_pull --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir "$SCRATCH_DIR"/client_B --GPU device=$GPU_FOR_TESTING --start_client sleep 5 cd "$CWD" @@ -127,7 +113,7 @@ run_docker_gpu_preflight_check () { cd "$PROJECT_DIR"/prod_00 cd client_A/startup CONSOLE_OUTPUT=docker_gpu_preflight_check_console_output.txt - ./docker.sh --scratch_dir "$SCRATCH_DIR"/client_A --GPU device=$GPU_FOR_TESTING --dummy_training 2>&1 | tee "$CONSOLE_OUTPUT" + ./docker.sh --scratch_dir "$SCRATCH_DIR"/client_A --GPU device=$GPU_FOR_TESTING --dummy_training --no_pull 2>&1 | tee "$CONSOLE_OUTPUT" if grep -q "Epoch 1: 100%" "$CONSOLE_OUTPUT" && grep -q "Training completed successfully" "$CONSOLE_OUTPUT"; then echo "Expected output of Docker/GPU preflight check found" @@ -143,7 +129,7 @@ run_data_access_preflight_check () { cd "$PROJECT_DIR"/prod_00 cd client_A/startup CONSOLE_OUTPUT=data_access_preflight_check_console_output.txt - ./docker.sh --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir "$SCRATCH_DIR"/client_A --GPU device=$GPU_FOR_TESTING --preflight_check 2>&1 | tee $CONSOLE_OUTPUT + ./docker.sh --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir "$SCRATCH_DIR"/client_A --GPU device=$GPU_FOR_TESTING --preflight_check --no_pull 2>&1 | tee $CONSOLE_OUTPUT if grep -q "Train set: 18, Val set: 6" "$CONSOLE_OUTPUT" && grep -q "Epoch 0: 100%" "$CONSOLE_OUTPUT"; then echo "Expected output of Docker/GPU preflight check found" @@ -208,8 +194,6 @@ run_tests () { create_second_startup_kit - push_image_to_local_docker_registry - create_synthetic_data run_docker_gpu_preflight_check @@ -220,7 +204,6 @@ run_tests () { kill_server_and_clients cleanup_temporary_data - cleanup_local_docker_registry } run_tests diff --git a/tests/provision/dummy_project_for_testing.yml b/tests/provision/dummy_project_for_testing.yml index 39a83bd0..d4984d77 100644 --- 
a/tests/provision/dummy_project_for_testing.yml +++ b/tests/provision/dummy_project_for_testing.yml @@ -29,7 +29,7 @@ builders: args: config_folder: config scheme: http - docker_image: "localhost:5000/jefftud/odelia:__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_STARTUP_KITS__" + docker_image: jefftud/odelia:__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_STARTUP_KITS__ overseer_agent: path: nvflare.ha.dummy_overseer_agent.DummyOverseerAgent overseer_exists: false From f8bd6f5d2e9718a4f1d33aebe3eb938e6456d59d Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 20 Aug 2025 15:48:05 +0200 Subject: [PATCH 122/337] use defined variable rather than hard-coded name --- runTestsOutsideDocker.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runTestsOutsideDocker.sh b/runTestsOutsideDocker.sh index d0b65636..4758e056 100755 --- a/runTestsOutsideDocker.sh +++ b/runTestsOutsideDocker.sh @@ -79,7 +79,7 @@ create_synthetic_data () { -u $(id -u):$(id -g) \ -v "$SYNTHETIC_DATA_DIR":/synthetic_data \ -w /MediSwarm \ - jefftud/odelia:$VERSION \ + $DOCKER_IMAGE \ /bin/bash -c "python3 application/jobs/ODELIA_ternary_classification/app/scripts/create_synthetic_dataset/create_synthetic_dataset.py /synthetic_data" } From 915e14239b67342d92aa609f590440eb823cf036 Mon Sep 17 00:00:00 2001 From: Ultimate-Storm Date: Thu, 21 Aug 2025 06:10:52 +0200 Subject: [PATCH 123/337] chore: update apt versions in Dockerfile_ODELIA --- docker_config/Dockerfile_ODELIA | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 2d670c5b..d0ba3b43 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -144,8 +144,8 @@ RUN apt install -y \ libgdbm-compat4=1.23-1 \ libgdbm6=1.23-1 \ libgirepository-1.0-1=1.72.0-1 \ - libglib2.0-0=2.72.4-0ubuntu2.5 \ - libglib2.0-data=2.72.4-0ubuntu2.5 \ + libglib2.0-0 \ + libglib2.0-data \ libicu70=70.1-2 \ libip4tc2=1.8.7-1ubuntu5.2 
\ libip6tc2=1.8.7-1ubuntu5.2 \ @@ -167,7 +167,7 @@ RUN apt install -y \ libxcb1=1.14-3ubuntu3 \ libxdmcp6=1:1.1.3-0ubuntu5 \ libxext6=2:1.3.4-1build1 \ - libxml2=2.9.13+dfsg-1ubuntu0.8 \ + libxml2 \ libxmuu1=2:1.1.3-3 \ libxtables12=1.8.7-1ubuntu5.2 \ netbase=6.3 \ From 8e18ed8e8ebd7a782326eeb76f2f071803f8aba7 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 21 Aug 2025 10:06:44 +0200 Subject: [PATCH 124/337] added missing apt package version numbers --- docker_config/Dockerfile_ODELIA | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index d0ba3b43..2edbe4d0 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -144,8 +144,8 @@ RUN apt install -y \ libgdbm-compat4=1.23-1 \ libgdbm6=1.23-1 \ libgirepository-1.0-1=1.72.0-1 \ - libglib2.0-0 \ - libglib2.0-data \ + libglib2.0-0=2.72.4-0ubuntu2.6\ + libglib2.0-data=2.72.4-0ubuntu2.6 \ libicu70=70.1-2 \ libip4tc2=1.8.7-1ubuntu5.2 \ libip6tc2=1.8.7-1ubuntu5.2 \ @@ -167,7 +167,7 @@ RUN apt install -y \ libxcb1=1.14-3ubuntu3 \ libxdmcp6=1:1.1.3-0ubuntu5 \ libxext6=2:1.3.4-1build1 \ - libxml2 \ + libxml2=2.9.13+dfsg-1ubuntu0.8 \ libxmuu1=2:1.1.3-3 \ libxtables12=1.8.7-1ubuntu5.2 \ netbase=6.3 \ From 9c1a89619316376173413fd83a74fe561fd1212a Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 21 Aug 2025 10:40:18 +0200 Subject: [PATCH 125/337] output license of pre-trained weights --- docker_config/master_template.yml | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/docker_config/master_template.yml b/docker_config/master_template.yml index e862a1aa..33a417be 100644 --- a/docker_config/master_template.yml +++ b/docker_config/master_template.yml @@ -719,8 +719,7 @@ docker_cln_sh: | elif [ ! 
-z "$LIST_LICENSES" ]; then docker run -it --rm --name=$CONTAINER_NAME $DOCKER_IMAGE \ - /bin/bash -c "pip-licenses -s -u --order=license && distro2sbom -s --format json" - echo "TODO license for pre-trained weights" + /bin/bash -c "pip-licenses -s -u --order=license && distro2sbom -s --format json && grep 'DINOv2 code and model weights are released under' /torch_home/hub/facebookresearch_dinov2_main/README.md" elif [[ ! -z "$INTERACTIVE" ]]; then docker run --rm $TTY_OPT --detach-keys="ctrl-x" $DOCKER_OPTIONS $DOCKER_IMAGE /bin/bash @@ -785,8 +784,7 @@ docker_svr_sh: | elif [ ! -z "$LIST_LICENSES" ]; then docker run -it --rm --name=$CONTAINER_NAME \ $DOCKER_IMAGE \ - /bin/bash -c "pip-licenses -s -u --order=license && distro2sbom -s --format json" - echo "TODO license for pre-trained weights" + /bin/bash -c "pip-licenses -s -u --order=license && distro2sbom -s --format json && grep 'DINOv2 code and model weights are released under' /torch_home/hub/facebookresearch_dinov2_main/README.md" elif [ ! -z "$INTERACTIVE" ]; then docker run --rm -it --detach-keys="ctrl-x" --name=$CONTAINER_NAME \ -v $DIR/..:/startupkit/ -w /startupkit/startup/ \ @@ -824,8 +822,7 @@ docker_adm_sh: | if [ ! 
-z "$LIST_LICENSES" ]; then docker run -it --rm --name=$CONTAINER_NAME $DOCKER_IMAGE \ - /bin/bash -c "pip-licenses -s -u --order=license && distro2sbom -s --format json" - echo "TODO license for pre-trained weights" + /bin/bash -c "pip-licenses -s -u --order=license && distro2sbom -s --format json && grep 'DINOv2 code and model weights are released under' /torch_home/hub/facebookresearch_dinov2_main/README.md" exit 0 fi From fe20c17be5fdc19c720f12de54ada1f3325a5b58 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 21 Aug 2025 10:44:43 +0200 Subject: [PATCH 126/337] only require necessary command line arguments for listing licenses --- docker_config/master_template.yml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/docker_config/master_template.yml b/docker_config/master_template.yml index 33a417be..fc1e1eb3 100644 --- a/docker_config/master_template.yml +++ b/docker_config/master_template.yml @@ -654,12 +654,12 @@ docker_cln_sh: | : ${MY_DATA_DIR:="${user_data_dir:-/home/flclient/data}"} fi - if [ -z "$MY_SCRATCH_DIR" ]; then + if [[ -z "$MY_SCRATCH_DIR" && -z "$LIST_LICENSES" ]]; then read -p "Enter the path to your scratch directory (default: /mnt/scratch): " user_scratch_dir : ${MY_SCRATCH_DIR:="${user_scratch_dir:-/mnt/scratch}"} fi - if [ -z "$GPU2USE" ]; then + if [[ -z "$GPU2USE" && -z "$LIST_LICENSES" ]]; then read -p "Enter the GPU index to use or 'all' (default: device=0): " user_gpu : ${GPU2USE:="${user_gpu:-device=0}"} fi @@ -667,8 +667,10 @@ docker_cln_sh: | # Resolve script directory DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" - mkdir -p "$MY_SCRATCH_DIR" - chmod -R 777 "$MY_SCRATCH_DIR" + if [ ! 
-z "$MY_SCRATCH_DIR" ]; then + mkdir -p "$MY_SCRATCH_DIR" + chmod -R 777 "$MY_SCRATCH_DIR" + fi # Networking & Cleanup NETARG="--net=host" From b0ff12cc60e2cbb895d21def7bfd24f3db61e3e1 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 21 Aug 2025 14:23:34 +0200 Subject: [PATCH 127/337] removed valiadion of pinned versions This caused problems likely due to apt cache not being in sync with what is used for the Docker build. We will rely on correctly parsed versions from the installation log and review of PRs. --- scripts/ci/update_apt_versions.sh | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/scripts/ci/update_apt_versions.sh b/scripts/ci/update_apt_versions.sh index d86bbc6d..75d6732a 100755 --- a/scripts/ci/update_apt_versions.sh +++ b/scripts/ci/update_apt_versions.sh @@ -30,24 +30,9 @@ echo "[INFO] Re-adding updated APT version pins to Dockerfile..." scripts/dev_utils/dockerfile_update_addAptVersionNumbers.py "$DOCKERFILE_PATH" "$LOG_PATH" rm "$LOG_PATH" -echo "[INFO] Validating all pinned versions, removing invalid ones..." -has_invalid_versions=0 -while IFS= read -r match; do - pkg="$(echo "$match" | cut -d= -f1)" - ver="$(echo "$match" | cut -d= -f2)" - echo -n "Checking $pkg=$ver... " - if ! 
apt-cache madison "$pkg" | grep -q "$ver"; then - echo "NOT FOUND – removing pin" - sed -i "s|\b$pkg=$ver\b|$pkg|" "$DOCKERFILE_PATH" - has_invalid_versions=1 - else - echo "OK" - fi -done < <(grep -oP '\b[a-z0-9\.\-]+=[a-zA-Z0-9:~.+-]+\b' "$DOCKERFILE_PATH") - git fetch origin main if git diff --quiet origin/main..HEAD; then echo "NO_CHANGES=true" >> "$GITHUB_ENV" else echo "NO_CHANGES=false" >> "$GITHUB_ENV" -fi \ No newline at end of file +fi From 70152189b048d332af92e23a46987ffecb490032 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 21 Aug 2025 10:57:47 +0200 Subject: [PATCH 128/337] refactored listing licenses to avoid code duplication --- docker_config/master_template.yml | 6 +++--- scripts/_list_licenses.sh | 7 +++++++ 2 files changed, 10 insertions(+), 3 deletions(-) create mode 100755 scripts/_list_licenses.sh diff --git a/docker_config/master_template.yml b/docker_config/master_template.yml index fc1e1eb3..7ed98f3f 100644 --- a/docker_config/master_template.yml +++ b/docker_config/master_template.yml @@ -721,7 +721,7 @@ docker_cln_sh: | elif [ ! -z "$LIST_LICENSES" ]; then docker run -it --rm --name=$CONTAINER_NAME $DOCKER_IMAGE \ - /bin/bash -c "pip-licenses -s -u --order=license && distro2sbom -s --format json && grep 'DINOv2 code and model weights are released under' /torch_home/hub/facebookresearch_dinov2_main/README.md" + /bin/bash -c "/MediSwarm/scripts/_list_licenses.sh" elif [[ ! -z "$INTERACTIVE" ]]; then docker run --rm $TTY_OPT --detach-keys="ctrl-x" $DOCKER_OPTIONS $DOCKER_IMAGE /bin/bash @@ -786,7 +786,7 @@ docker_svr_sh: | elif [ ! -z "$LIST_LICENSES" ]; then docker run -it --rm --name=$CONTAINER_NAME \ $DOCKER_IMAGE \ - /bin/bash -c "pip-licenses -s -u --order=license && distro2sbom -s --format json && grep 'DINOv2 code and model weights are released under' /torch_home/hub/facebookresearch_dinov2_main/README.md" + /bin/bash -c "/MediSwarm/scripts/_list_licenses.sh" elif [ ! 
-z "$INTERACTIVE" ]; then docker run --rm -it --detach-keys="ctrl-x" --name=$CONTAINER_NAME \ -v $DIR/..:/startupkit/ -w /startupkit/startup/ \ @@ -824,7 +824,7 @@ docker_adm_sh: | if [ ! -z "$LIST_LICENSES" ]; then docker run -it --rm --name=$CONTAINER_NAME $DOCKER_IMAGE \ - /bin/bash -c "pip-licenses -s -u --order=license && distro2sbom -s --format json && grep 'DINOv2 code and model weights are released under' /torch_home/hub/facebookresearch_dinov2_main/README.md" + /bin/bash -c "/MediSwarm/scripts/_list_licenses.sh" exit 0 fi diff --git a/scripts/_list_licenses.sh b/scripts/_list_licenses.sh new file mode 100755 index 00000000..5ecec196 --- /dev/null +++ b/scripts/_list_licenses.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +# this script is called inside the ODELIA docker containers to list licenses of all pip and apt packages as well as for pre-trained weights + +pip-licenses -s -u --order=license +distro2sbom -s --format json +grep "DINOv2 code and model weights are released under" /torch_home/hub/facebookresearch_dinov2_main/README.md From 643ec39a187b0157c85a73ec904a55f50750ea52 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 21 Aug 2025 14:56:34 +0200 Subject: [PATCH 129/337] test whether model weight license is listed --- runTestsInDocker.sh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/runTestsInDocker.sh b/runTestsInDocker.sh index 608fdf1c..32246b72 100755 --- a/runTestsInDocker.sh +++ b/runTestsInDocker.sh @@ -69,17 +69,20 @@ cleanup_dummy_trainings () { check_license_listings () { cd "$CWD"/"$PROJECT_DIR/prod_00/admin@test.odelia/startup" - if ! $( ./docker.sh --no_pull --list_licenses 2>&1 | grep -q MIT ); then + ADMIN_LICENSES=$( ./docker.sh --no_pull --list_licenses 2>&1 ) + if ! $( echo $ADMIN_LICENSES | grep -q MIT ) || ! $( echo $ADMIN_LICENSES | grep -q "model weights" ); then echo "could not list licenses from admin startup kit" exit 1 fi cd "$CWD"/"$PROJECT_DIR/prod_00/server.local/startup/" - if ! 
$( ./docker.sh --no_pull --list_licenses 2>&1 | grep -q MIT ); then + SERVER_LICENSES=$( ./docker.sh --no_pull --list_licenses 2>&1 ) + if ! $( echo $SERVER_LICENSES | grep -q MIT ) || ! $( echo $SERVER_LICENSES | grep -q "model weights" ); then echo "could not list licenses from server startup kit" exit 1 fi cd "$CWD"/"$PROJECT_DIR/prod_00/client_A/startup/" - if ! $( ./docker.sh --data_dir /tmp/ --scratch_dir /tmp/scratch --GPU "$GPU_FOR_TESTING" --no_pull --list_licenses 2>&1 | grep -q MIT ); then + CLIENT_LICENSES=$( ./docker.sh --no_pull --list_licenses 2>&1 ) + if ! $( echo $CLIENT_LICENSES | grep -q MIT ) || ! $( echo $CLIENT_LICENSES | grep -q "model weights" ); then echo "could not list licenses from client startup kit" exit 1 fi From 0ccc65065807c58f711fbe0782375f16832b6d77 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 22 Aug 2025 10:01:44 +0200 Subject: [PATCH 130/337] added missing space --- docker_config/Dockerfile_ODELIA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 2edbe4d0..56cd3919 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -144,7 +144,7 @@ RUN apt install -y \ libgdbm-compat4=1.23-1 \ libgdbm6=1.23-1 \ libgirepository-1.0-1=1.72.0-1 \ - libglib2.0-0=2.72.4-0ubuntu2.6\ + libglib2.0-0=2.72.4-0ubuntu2.6 \ libglib2.0-data=2.72.4-0ubuntu2.6 \ libicu70=70.1-2 \ libip4tc2=1.8.7-1ubuntu5.2 \ From 7836bb7875b0c2cd08b7a7dba522b8f4f0cce621 Mon Sep 17 00:00:00 2001 From: oleschwen Date: Fri, 22 Aug 2025 10:06:16 +0200 Subject: [PATCH 131/337] chore: update apt versions in Dockerfile_ODELIA --- docker_config/Dockerfile_ODELIA | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 2edbe4d0..33cb901a 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -86,8 +86,8 @@ RUN apt install -y \ 
libnpth0=1.6-3build2 \ libpsl5=0.21.0-1.2build2 \ libpython3-stdlib=3.10.6-1~22.04.1 \ - libpython3.10-minimal=3.10.12-1~22.04.10 \ - libpython3.10-stdlib=3.10.12-1~22.04.10 \ + libpython3.10-minimal=3.10.12-1~22.04.11 \ + libpython3.10-stdlib=3.10.12-1~22.04.11 \ libreadline8=8.1.2-1 \ librtmp1=2.4+20151223.gitfa8646d.1-2build4 \ libsasl2-2=2.1.27+dfsg2-3ubuntu1.2 \ @@ -100,8 +100,8 @@ RUN apt install -y \ pinentry-curses=1.1.1-1build2 \ publicsuffix=20211207.1025-1 \ python3-minimal=3.10.6-1~22.04.1 \ - python3.10-minimal=3.10.12-1~22.04.10 \ - python3.10=3.10.12-1~22.04.10 \ + python3.10-minimal=3.10.12-1~22.04.11 \ + python3.10=3.10.12-1~22.04.11 \ python3=3.10.6-1~22.04.1 \ readline-common=8.1.2-1 \ unzip=6.0-26ubuntu3.2 \ @@ -144,7 +144,7 @@ RUN apt install -y \ libgdbm-compat4=1.23-1 \ libgdbm6=1.23-1 \ libgirepository-1.0-1=1.72.0-1 \ - libglib2.0-0=2.72.4-0ubuntu2.6\ + libglib2.0-0=2.72.4-0ubuntu2.6 \ libglib2.0-data=2.72.4-0ubuntu2.6 \ libicu70=70.1-2 \ libip4tc2=1.8.7-1ubuntu5.2 \ From cece3a78b78de3fd16d104d8fb59bc7683d0e072 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 22 Aug 2025 10:28:48 +0200 Subject: [PATCH 132/337] increased version number for swarm technical test --- odelia_image.version | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/odelia_image.version b/odelia_image.version index 4812aa29..c9a1f1c3 100644 --- a/odelia_image.version +++ b/odelia_image.version @@ -1,2 +1,2 @@ # version of the ODELIA Docker image, read by different scripts -1.0 \ No newline at end of file +1.0.1 From 292ccf3c8ddd45826304edb7cc6a12b3dcbd96ca Mon Sep 17 00:00:00 2001 From: Ultimate-Storm Date: Wed, 27 Aug 2025 06:10:02 +0200 Subject: [PATCH 133/337] chore: update apt versions in Dockerfile_ODELIA --- docker_config/Dockerfile_ODELIA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 33cb901a..551fa87b 100644 --- a/docker_config/Dockerfile_ODELIA +++ 
b/docker_config/Dockerfile_ODELIA @@ -53,7 +53,7 @@ RUN apt install -y \ libtasn1-6=4.18.0-4ubuntu0.1 \ libudev1=249.11-0ubuntu3.16 \ libuuid1=2.37.2-4ubuntu3.4 \ - linux-libc-dev=5.15.0-152.162 \ + linux-libc-dev=5.15.0-153.163 \ logsave=1.46.5-2ubuntu1.2 \ mount=2.37.2-4ubuntu3.4 \ openssl=3.0.2-0ubuntu1.19 \ From 23b35ef0eca0304bb1b3ab1d55acf844db92c7db Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 27 Aug 2025 09:48:46 +0200 Subject: [PATCH 134/337] renamed file for running integration tests --- runTestsInDocker.sh => runIntegrationTests.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename runTestsInDocker.sh => runIntegrationTests.sh (100%) diff --git a/runTestsInDocker.sh b/runIntegrationTests.sh similarity index 100% rename from runTestsInDocker.sh rename to runIntegrationTests.sh From f10e526666bb0d2d0343f616d843375be340c5e2 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 27 Aug 2025 09:54:16 +0200 Subject: [PATCH 135/337] refactored tests to be run in Docker: moved to separate scripts --- _runTestsInsideDocker.sh | 54 ------------------- runIntegrationTests.sh | 18 +++++-- ...run_controller_unit_tests_with_coverage.sh | 12 +++++ ...n_minimal_example_proof_of_concept_mode.sh | 19 +++++++ .../_run_minimal_example_simulation_mode.sh | 10 ++++ .../_run_minimal_example_standalone.sh | 10 ++++ .../_run_nvflare_unit_tests.sh | 10 ++++ 7 files changed, 75 insertions(+), 58 deletions(-) delete mode 100755 _runTestsInsideDocker.sh create mode 100755 tests/integration_tests/_run_controller_unit_tests_with_coverage.sh create mode 100755 tests/integration_tests/_run_minimal_example_proof_of_concept_mode.sh create mode 100755 tests/integration_tests/_run_minimal_example_simulation_mode.sh create mode 100755 tests/integration_tests/_run_minimal_example_standalone.sh create mode 100755 tests/integration_tests/_run_nvflare_unit_tests.sh diff --git a/_runTestsInsideDocker.sh b/_runTestsInsideDocker.sh deleted file mode 100755 index d3d07c18..00000000 --- 
a/_runTestsInsideDocker.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/usr/bin/env bash - -run_controller_unit_tests_with_coverage () { - # run unit tests of ODELIA swarm learning and report coverage - export MPLCONFIGDIR=/tmp - cd /MediSwarm/tests/unit_tests/controller - PYTHONPATH=/MediSwarm/controller/controller python3 -m coverage run --source=/MediSwarm/controller/controller -m unittest discover - coverage report -m - rm .coverage -} - -run_nvflare_unit_tests () { - cd /MediSwarm/docker_config/NVFlare - ./runtest.sh -c -r - coverage report -m - cd .. -} - -run_minimal_example_standalone () { - # run standalone version of minimal example - cd /MediSwarm/application/jobs/minimal_training_pytorch_cnn/app/custom/ - export TRAINING_MODE="local_training" - ./main.py -} - -run_minimal_example_simulation_mode () { - # run simulation mode for minimal example - cd /MediSwarm - export TRAINING_MODE="swarm" - nvflare simulator -w /tmp/minimal_training_pytorch_cnn -n 2 -t 2 application/jobs/minimal_training_pytorch_cnn -c simulated_node_0,simulated_node_1 -} - -run_minimal_example_proof_of_concept_mode () { - # run proof-of-concept mode for minimal example - cd /MediSwarm - export TRAINING_MODE="swarm" - nvflare poc prepare -c poc_client_0 poc_client_1 - nvflare poc prepare-jobs-dir -j application/jobs/ - nvflare poc start -ex admin@nvidia.com - sleep 15 - echo "Will submit job now after sleeping 15 seconds to allow the background process to complete" - nvflare job submit -j application/jobs/minimal_training_pytorch_cnn - sleep 60 - echo "Will shut down now after sleeping 60 seconds to allow the background process to complete" - sleep 2 - nvflare poc stop -} - -run_controller_unit_tests_with_coverage -# uncomment the following line to run NVFlare's unit tests (takes about 2 minutes and will install python packages in the container) -# run_nvflare_unit_tests -run_minimal_example_standalone -run_minimal_example_simulation_mode -run_minimal_example_proof_of_concept_mode diff --git 
a/runIntegrationTests.sh b/runIntegrationTests.sh index 329ee6cf..92f7f75b 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -10,9 +10,8 @@ if [ -z "$GPU_FOR_TESTING" ]; then export GPU_FOR_TESTING="all" fi - -run_tests () { - echo "[Run] Unit tests inside Docker..." +_run_test_in_docker() { + echo "[Run] " $1 " inside Docker ..." docker run --rm \ --shm-size=16g \ --ipc=host \ @@ -20,10 +19,21 @@ run_tests () { --ulimit stack=67108864 \ -v /tmp:/scratch \ --gpus="$GPU_FOR_TESTING" \ - --entrypoint=/MediSwarm/_runTestsInsideDocker.sh \ + --entrypoint=/MediSwarm/$1 \ "$DOCKER_IMAGE" } + +run_tests () { + _run_test_in_docker tests/integration_tests/_run_controller_unit_tests_with_coverage.sh + _run_test_in_docker tests/integration_tests/_run_minimal_example_standalone.sh + _run_test_in_docker tests/integration_tests/_run_minimal_example_simulation_mode.sh + _run_test_in_docker tests/integration_tests/_run_minimal_example_proof_of_concept_mode.sh + + # uncomment the following line to also run NVFlare's unit tests (takes about 2 minutes and will install python packages in the container) + # run_test_in_docker tests/integration_tests/_run_nvflare_unit_tests.sh +} + prepare_dummy_trainings () { echo "[Prepare] Startup kits for dummy project..." 
rm -rf "$PROJECT_DIR" diff --git a/tests/integration_tests/_run_controller_unit_tests_with_coverage.sh b/tests/integration_tests/_run_controller_unit_tests_with_coverage.sh new file mode 100755 index 00000000..87ef36d1 --- /dev/null +++ b/tests/integration_tests/_run_controller_unit_tests_with_coverage.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +run_controller_unit_tests_with_coverage () { + # run unit tests of ODELIA swarm learning and report coverage + export MPLCONFIGDIR=/tmp + cd /MediSwarm/tests/unit_tests/controller + PYTHONPATH=/MediSwarm/controller/controller python3 -m coverage run --source=/MediSwarm/controller/controller -m unittest discover + coverage report -m + rm .coverage +} + +run_controller_unit_tests_with_coverage diff --git a/tests/integration_tests/_run_minimal_example_proof_of_concept_mode.sh b/tests/integration_tests/_run_minimal_example_proof_of_concept_mode.sh new file mode 100755 index 00000000..9331ea7b --- /dev/null +++ b/tests/integration_tests/_run_minimal_example_proof_of_concept_mode.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +run_minimal_example_proof_of_concept_mode () { + # run proof-of-concept mode for minimal example + cd /MediSwarm + export TRAINING_MODE="swarm" + nvflare poc prepare -c poc_client_0 poc_client_1 + nvflare poc prepare-jobs-dir -j application/jobs/ + nvflare poc start -ex admin@nvidia.com + sleep 15 + echo "Will submit job now after sleeping 15 seconds to allow the background process to complete" + nvflare job submit -j application/jobs/minimal_training_pytorch_cnn + sleep 60 + echo "Will shut down now after sleeping 60 seconds to allow the background process to complete" + sleep 2 + nvflare poc stop +} + +run_minimal_example_proof_of_concept_mode diff --git a/tests/integration_tests/_run_minimal_example_simulation_mode.sh b/tests/integration_tests/_run_minimal_example_simulation_mode.sh new file mode 100755 index 00000000..4f87934d --- /dev/null +++ 
b/tests/integration_tests/_run_minimal_example_simulation_mode.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +run_minimal_example_simulation_mode () { + # run simulation mode for minimal example + cd /MediSwarm + export TRAINING_MODE="swarm" + nvflare simulator -w /tmp/minimal_training_pytorch_cnn -n 2 -t 2 application/jobs/minimal_training_pytorch_cnn -c simulated_node_0,simulated_node_1 +} + +run_minimal_example_simulation_mode diff --git a/tests/integration_tests/_run_minimal_example_standalone.sh b/tests/integration_tests/_run_minimal_example_standalone.sh new file mode 100755 index 00000000..79e10ddb --- /dev/null +++ b/tests/integration_tests/_run_minimal_example_standalone.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +run_minimal_example_standalone () { + # run standalone version of minimal example + cd /MediSwarm/application/jobs/minimal_training_pytorch_cnn/app/custom/ + export TRAINING_MODE="local_training" + ./main.py +} + +run_minimal_example_standalone diff --git a/tests/integration_tests/_run_nvflare_unit_tests.sh b/tests/integration_tests/_run_nvflare_unit_tests.sh new file mode 100755 index 00000000..efd3b502 --- /dev/null +++ b/tests/integration_tests/_run_nvflare_unit_tests.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash + +run_nvflare_unit_tests () { + cd /MediSwarm/docker_config/NVFlare + ./runtest.sh -c -r + coverage report -m + cd .. 
+} + +run_nvflare_unit_tests From c2f5e403b9d2c6c40be14cc7d587aecec8eab1de Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 27 Aug 2025 15:26:10 +0200 Subject: [PATCH 136/337] moved method to script for running integration tests --- runIntegrationTests.sh | 28 ++++++++++++++++++++++++++-- runTestsOutsideDocker.sh | 22 ---------------------- 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 92f7f75b..49a68bb4 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -10,8 +10,31 @@ if [ -z "$GPU_FOR_TESTING" ]; then export GPU_FOR_TESTING="all" fi +check_files_on_github () { + echo "[Run] Test whether expected content is available on github" + + CONTENT=$(curl -L https://github.com/KatherLab/MediSwarm/raw/refs/heads/main/LICENSE) + if echo "$CONTENT" | grep -q "MIT License" ; then + echo "Downloaded and verified license from github" + else + echo "Could not download and verify license" + exit 1 + fi + + CONTENT=$(curl -L https://github.com/KatherLab/MediSwarm/raw/refs/heads/main/README.md) + for ROLE in 'Swarm Participant' 'Developer' 'Swarm Operator'; + do + if echo "$CONTENT" | grep -q "$ROLE" ; then + echo "Instructions for $ROLE found" + else + echo "Instructions for role $ROLE missing" + exit 1 + fi + done +} + _run_test_in_docker() { - echo "[Run] " $1 " inside Docker ..." + echo "[Run]" $1 "inside Docker ..." 
docker run --rm \ --shm-size=16g \ --ipc=host \ @@ -23,7 +46,6 @@ _run_test_in_docker() { "$DOCKER_IMAGE" } - run_tests () { _run_test_in_docker tests/integration_tests/_run_controller_unit_tests_with_coverage.sh _run_test_in_docker tests/integration_tests/_run_minimal_example_standalone.sh @@ -78,12 +100,14 @@ cleanup_dummy_trainings () { } case "$1" in + check_files_on_github) check_files_on_github ;; run_tests) run_tests ;; prepare_dummy_trainings) prepare_dummy_trainings ;; run_dummy_training) run_dummy_training ;; run_3dcnn_tests) run_3dcnn_tests ;; cleanup) cleanup_dummy_trainings ;; all | "") + check_files_on_github run_tests prepare_dummy_trainings run_dummy_training diff --git a/runTestsOutsideDocker.sh b/runTestsOutsideDocker.sh index 4758e056..3e3f79a9 100755 --- a/runTestsOutsideDocker.sh +++ b/runTestsOutsideDocker.sh @@ -20,26 +20,6 @@ SCRATCH_DIR=$(mktemp -d) CWD=$(pwd) PROJECT_FILE="tests/provision/dummy_project_for_testing.yml" -check_files_on_github () { - CONTENT=$(curl -L https://github.com/KatherLab/MediSwarm/raw/refs/heads/main/LICENSE) - if echo "$CONTENT" | grep -q "MIT License" ; then - echo "Downloaded and verified license from github" - else - echo "Could not download and verify license" - exit 1 - fi - - CONTENT=$(curl -L https://github.com/KatherLab/MediSwarm/raw/refs/heads/main/README.md) - for ROLE in 'Swarm Participant' 'Developer' 'Swarm Operator'; - do - if echo "$CONTENT" | grep -q "$ROLE" ; then - echo "Instructions for $ROLE found" - else - echo "Instructions for role $ROLE missing" - exit 1 - fi - done -} create_second_startup_kit () { if [ ! 
-d "$PROJECT_DIR"/prod_00 ]; then @@ -190,8 +170,6 @@ run_dummy_training_in_swarm () { } run_tests () { - check_files_on_github - create_second_startup_kit create_synthetic_data From 05866f3fe2383f8c051f4f54aeb84eac745e2c67 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 27 Aug 2025 15:32:03 +0200 Subject: [PATCH 137/337] moved generating two sets of startup kits to script for integration tests --- runIntegrationTests.sh | 50 +++++++++++++++++++++++++++++++++++----- runTestsOutsideDocker.sh | 50 ---------------------------------------- 2 files changed, 44 insertions(+), 56 deletions(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 49a68bb4..f2f037ac 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -5,11 +5,15 @@ set -e VERSION=$(./getVersionNumber.sh) DOCKER_IMAGE=jefftud/odelia:$VERSION PROJECT_DIR="workspace/odelia_${VERSION}_dummy_project_for_testing" +SYNTHETIC_DATA_DIR=$(mktemp -d) +SCRATCH_DIR=$(mktemp -d) CWD=$(pwd) +PROJECT_FILE="tests/provision/dummy_project_for_testing.yml" if [ -z "$GPU_FOR_TESTING" ]; then export GPU_FOR_TESTING="all" fi + check_files_on_github () { echo "[Run] Test whether expected content is available on github" @@ -56,10 +60,44 @@ run_tests () { # run_test_in_docker tests/integration_tests/_run_nvflare_unit_tests.sh } -prepare_dummy_trainings () { - echo "[Prepare] Startup kits for dummy project..." - rm -rf "$PROJECT_DIR" - ./_buildStartupKits.sh tests/provision/dummy_project_for_testing.yml "$VERSION" +create_startup_kits_and_check_contained_files () { + echo "[Prepare] Startup kits for dummy project ..." + + if ! grep -q "127.0.0.1 server.local" /etc/hosts; then + echo "/etc/hosts needs to contain the following line, please add it." + echo "127.0.0.1 server.local localhost" + exit 1 + fi + + if [ ! 
-d "$PROJECT_DIR"/prod_00 ]; then + ./_buildStartupKits.sh $PROJECT_FILE $VERSION + fi + if [ -d "$PROJECT_DIR"/prod_01 ]; then + echo '"$PROJECT_DIR"/prod_01 exists, please remove/rename it' + exit 1 + fi + ./_buildStartupKits.sh $PROJECT_FILE $VERSION + + for FILE in 'client.crt' 'client.key' 'docker.sh' 'rootCA.pem'; + do + if [ -f "$PROJECT_DIR/prod_01/client_A/startup/$FILE" ] ; then + echo "$FILE found" + else + echo "$FILE missing" + exit 1 + fi + done + + ZIP_CONTENT=$(unzip -tv "$PROJECT_DIR/prod_01/client_B_${VERSION}.zip") + for FILE in 'client.crt' 'client.key' 'docker.sh' 'rootCA.pem'; + do + if echo "$ZIP_CONTENT" | grep -q "$FILE" ; then + echo "$FILE found in zip" + else + echo "$FILE missing in zip" + exit 1 + fi + done } run_dummy_training () { @@ -102,14 +140,14 @@ cleanup_dummy_trainings () { case "$1" in check_files_on_github) check_files_on_github ;; run_tests) run_tests ;; - prepare_dummy_trainings) prepare_dummy_trainings ;; + create_startup_kits) create_startup_kits_and_check_contained_files ;; run_dummy_training) run_dummy_training ;; run_3dcnn_tests) run_3dcnn_tests ;; cleanup) cleanup_dummy_trainings ;; all | "") check_files_on_github run_tests - prepare_dummy_trainings + create_startup_kits_and_check_contained_files run_dummy_training run_3dcnn_tests cleanup_dummy_trainings diff --git a/runTestsOutsideDocker.sh b/runTestsOutsideDocker.sh index 3e3f79a9..f990d11d 100755 --- a/runTestsOutsideDocker.sh +++ b/runTestsOutsideDocker.sh @@ -2,57 +2,7 @@ set -e -if ! grep -q "127.0.0.1 server.local" /etc/hosts; then - echo "/etc/hosts needs to contain the following line, please add it." 
- echo "127.0.0.1 server.local localhost" - exit 1 -fi - -if [ -z "$GPU_FOR_TESTING" ]; then - export GPU_FOR_TESTING="all" -fi - -VERSION=$(./getVersionNumber.sh) -DOCKER_IMAGE=jefftud/odelia:$VERSION -PROJECT_DIR="workspace/odelia_${VERSION}_dummy_project_for_testing" -SYNTHETIC_DATA_DIR=$(mktemp -d) -SCRATCH_DIR=$(mktemp -d) -CWD=$(pwd) -PROJECT_FILE="tests/provision/dummy_project_for_testing.yml" - - -create_second_startup_kit () { - if [ ! -d "$PROJECT_DIR"/prod_00 ]; then - echo '"$PROJECT_DIR"/prod_00 does not exist, please generate the startup kit first' - exit 1 - fi - if [ -d "$PROJECT_DIR"/prod_01 ]; then - echo '"$PROJECT_DIR"/prod_01 exists, please remove it' - exit 1 - fi - ./_buildStartupKits.sh $PROJECT_FILE $VERSION - - for FILE in 'client.crt' 'client.key' 'docker.sh' 'rootCA.pem'; - do - if [ -f "$PROJECT_DIR/prod_01/client_A/startup/$FILE" ] ; then - echo "$FILE found" - else - echo "$FILE missing" - exit 1 - fi - done - ZIP_CONTENT=$(unzip -tv "$PROJECT_DIR/prod_01/client_B_${VERSION}.zip") - for FILE in 'client.crt' 'client.key' 'docker.sh' 'rootCA.pem'; - do - if echo "$ZIP_CONTENT" | grep -q "$FILE" ; then - echo "$FILE found in zip" - else - echo "$FILE missing in zip" - exit 1 - fi - done -} create_synthetic_data () { docker run --rm \ From 1e25eb332888386204a880b9e9e76963df38df79 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 27 Aug 2025 15:38:19 +0200 Subject: [PATCH 138/337] dedicated function to generate synthetic data --- runIntegrationTests.sh | 24 ++++++++++++++---------- runTestsOutsideDocker.sh | 12 ------------ 2 files changed, 14 insertions(+), 22 deletions(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index f2f037ac..5e97ad9f 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -100,6 +100,17 @@ create_startup_kits_and_check_contained_files () { done } +create_synthetic_data () { + echo "[Prepare] Synthetic data ..." 
+ docker run --rm \ + -u $(id -u):$(id -g) \ + -v "$SYNTHETIC_DATA_DIR":/synthetic_data \ + -w /MediSwarm \ + $DOCKER_IMAGE \ + /bin/bash -c "python3 application/jobs/ODELIA_ternary_classification/app/scripts/create_synthetic_dataset/create_synthetic_dataset.py /synthetic_data" +} + + run_dummy_training () { echo "[Run] Dummy training session..." cd "$PROJECT_DIR/prod_00/client_A/startup/" @@ -108,16 +119,7 @@ run_dummy_training () { } run_3dcnn_tests () { - echo "[Run] Synthetic data + 3D CNN preflight check..." - SYNTHETIC_DATA_DIR=$(mktemp -d) - - # create synthetic data - docker run --rm \ - -u $(id -u):$(id -g) \ - -v "$SYNTHETIC_DATA_DIR":/synthetic_data \ - -w /MediSwarm \ - jefftud/odelia:$VERSION \ - /bin/bash -c "python3 application/jobs/ODELIA_ternary_classification/app/scripts/create_synthetic_dataset/create_synthetic_dataset.py /synthetic_data" + echo "[Run] 3D CNN preflight check..." # run tests using synthetic data cd "$PROJECT_DIR/prod_00/client_A/startup/" @@ -141,6 +143,7 @@ case "$1" in check_files_on_github) check_files_on_github ;; run_tests) run_tests ;; create_startup_kits) create_startup_kits_and_check_contained_files ;; + create_synthetic_data) create_synthetic_data ;; run_dummy_training) run_dummy_training ;; run_3dcnn_tests) run_3dcnn_tests ;; cleanup) cleanup_dummy_trainings ;; @@ -148,6 +151,7 @@ case "$1" in check_files_on_github run_tests create_startup_kits_and_check_contained_files + create_synthetic_data run_dummy_training run_3dcnn_tests cleanup_dummy_trainings diff --git a/runTestsOutsideDocker.sh b/runTestsOutsideDocker.sh index f990d11d..171dfde7 100755 --- a/runTestsOutsideDocker.sh +++ b/runTestsOutsideDocker.sh @@ -4,14 +4,6 @@ set -e -create_synthetic_data () { - docker run --rm \ - -u $(id -u):$(id -g) \ - -v "$SYNTHETIC_DATA_DIR":/synthetic_data \ - -w /MediSwarm \ - $DOCKER_IMAGE \ - /bin/bash -c "python3 application/jobs/ODELIA_ternary_classification/app/scripts/create_synthetic_dataset/create_synthetic_dataset.py 
/synthetic_data" -} cleanup_temporary_data () { rm -rf "$SYNTHETIC_DATA_DIR" @@ -120,10 +112,6 @@ run_dummy_training_in_swarm () { } run_tests () { - create_second_startup_kit - - create_synthetic_data - run_docker_gpu_preflight_check run_data_access_preflight_check From f7e06b5a82fbe2d003a9c9e6ced724781b59cb09 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 27 Aug 2025 15:56:28 +0200 Subject: [PATCH 139/337] more meaningful name for method --- runIntegrationTests.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 5e97ad9f..f7914481 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -50,7 +50,7 @@ _run_test_in_docker() { "$DOCKER_IMAGE" } -run_tests () { +run_local_tests () { _run_test_in_docker tests/integration_tests/_run_controller_unit_tests_with_coverage.sh _run_test_in_docker tests/integration_tests/_run_minimal_example_standalone.sh _run_test_in_docker tests/integration_tests/_run_minimal_example_simulation_mode.sh @@ -141,7 +141,7 @@ cleanup_dummy_trainings () { case "$1" in check_files_on_github) check_files_on_github ;; - run_tests) run_tests ;; + run_local_tests) run_local_tests ;; create_startup_kits) create_startup_kits_and_check_contained_files ;; create_synthetic_data) create_synthetic_data ;; run_dummy_training) run_dummy_training ;; @@ -149,7 +149,7 @@ case "$1" in cleanup) cleanup_dummy_trainings ;; all | "") check_files_on_github - run_tests + run_local_tests create_startup_kits_and_check_contained_files create_synthetic_data run_dummy_training From f0040641d8647a6346e1ba5f6381e6abe5525d59 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 27 Aug 2025 15:58:52 +0200 Subject: [PATCH 140/337] moved/merged cleanup to script for integration tests --- runIntegrationTests.sh | 18 ++++++++---------- runTestsOutsideDocker.sh | 7 ------- 2 files changed, 8 insertions(+), 17 deletions(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 
f7914481..340c74b5 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -128,14 +128,12 @@ run_3dcnn_tests () { ./docker.sh --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir /tmp/scratch --GPU "$GPU_FOR_TESTING" --no_pull --run_script /MediSwarm/_run3DdcnnptlTestsInDocker.sh cd "$CWD" - - # clean up synthetic data - rm -rf "$SYNTHETIC_DATA_DIR" || echo "Warning: cleanup failed" } - -cleanup_dummy_trainings () { - echo "[Cleanup] Removing dummy workspace..." +cleanup_temporary_data () { + echo "[Cleanup] Removing synthetic data, scratch directory, dummy workspace ..." + rm -rf "$SYNTHETIC_DATA_DIR" + rm -rf "$SCRATCH_DIR" rm -rf "$PROJECT_DIR" } @@ -146,15 +144,15 @@ case "$1" in create_synthetic_data) create_synthetic_data ;; run_dummy_training) run_dummy_training ;; run_3dcnn_tests) run_3dcnn_tests ;; - cleanup) cleanup_dummy_trainings ;; + cleanup) cleanup_temporary_data ;; all | "") check_files_on_github run_local_tests create_startup_kits_and_check_contained_files create_synthetic_data - run_dummy_training - run_3dcnn_tests - cleanup_dummy_trainings + # run_dummy_training + # run_3dcnn_tests + cleanup_temporary_data ;; *) echo "Unknown argument: $1"; exit 1 ;; esac diff --git a/runTestsOutsideDocker.sh b/runTestsOutsideDocker.sh index 171dfde7..49dfb8fa 100755 --- a/runTestsOutsideDocker.sh +++ b/runTestsOutsideDocker.sh @@ -3,13 +3,6 @@ set -e - - -cleanup_temporary_data () { - rm -rf "$SYNTHETIC_DATA_DIR" - rm -rf "$SCRATCH_DIR" -} - start_server_and_clients () { cd "$PROJECT_DIR"/prod_00 cd server.local/startup From 235f45eac8bca7e93cd0dc181905ec47bd358d45 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 28 Aug 2025 10:42:47 +0200 Subject: [PATCH 141/337] moved test for running Docker/GPU preflight check, i.e., extended existing test by checking output --- runIntegrationTests.sh | 23 ++++++++++++++++++----- runTestsOutsideDocker.sh | 16 ---------------- 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/runIntegrationTests.sh 
b/runIntegrationTests.sh index 340c74b5..2ab7b059 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -111,10 +111,20 @@ create_synthetic_data () { } -run_dummy_training () { - echo "[Run] Dummy training session..." +run_docker_gpu_preflight_check () { + # requires having built a startup kit + echo "[Run] Docker/GPU preflight check (local dummy training via startup kit) ..." cd "$PROJECT_DIR/prod_00/client_A/startup/" - ./docker.sh --data_dir /tmp/ --scratch_dir /tmp/scratch --GPU "$GPU_FOR_TESTING" --no_pull --dummy_training + CONSOLE_OUTPUT=docker_gpu_preflight_check_console_output.txt + ./docker.sh --scratch_dir "$SCRATCH_DIR"/client_A --GPU device=$GPU_FOR_TESTING --dummy_training --no_pull 2>&1 | tee "$CONSOLE_OUTPUT" + + if grep -q "Epoch 1: 100%" "$CONSOLE_OUTPUT" && grep -q "Training completed successfully" "$CONSOLE_OUTPUT"; then + echo "Expected output of Docker/GPU preflight check found" + else + echo "Missing expected output of Docker/GPU preflight check" + exit 1 + fi + cd "$CWD" } @@ -142,7 +152,7 @@ case "$1" in run_local_tests) run_local_tests ;; create_startup_kits) create_startup_kits_and_check_contained_files ;; create_synthetic_data) create_synthetic_data ;; - run_dummy_training) run_dummy_training ;; + run_docker_gpu_preflight_check) run_docker_gpu_preflight_check ;; run_3dcnn_tests) run_3dcnn_tests ;; cleanup) cleanup_temporary_data ;; all | "") @@ -150,9 +160,12 @@ case "$1" in run_local_tests create_startup_kits_and_check_contained_files create_synthetic_data - # run_dummy_training + run_docker_gpu_preflight_check # run_3dcnn_tests cleanup_temporary_data ;; *) echo "Unknown argument: $1"; exit 1 ;; esac + +# TODO adapt ./assets/readme/README.developer.md +# TODO adapt .github/workflows/pr-test.yaml diff --git a/runTestsOutsideDocker.sh b/runTestsOutsideDocker.sh index 49dfb8fa..b2461ad3 100755 --- a/runTestsOutsideDocker.sh +++ b/runTestsOutsideDocker.sh @@ -24,21 +24,6 @@ kill_server_and_clients () { docker kill 
odelia_swarm_server_flserver odelia_swarm_client_client_A odelia_swarm_client_client_B } -run_docker_gpu_preflight_check () { - cd "$PROJECT_DIR"/prod_00 - cd client_A/startup - CONSOLE_OUTPUT=docker_gpu_preflight_check_console_output.txt - ./docker.sh --scratch_dir "$SCRATCH_DIR"/client_A --GPU device=$GPU_FOR_TESTING --dummy_training --no_pull 2>&1 | tee "$CONSOLE_OUTPUT" - - if grep -q "Epoch 1: 100%" "$CONSOLE_OUTPUT" && grep -q "Training completed successfully" "$CONSOLE_OUTPUT"; then - echo "Expected output of Docker/GPU preflight check found" - else - echo "Missing expected output of Docker/GPU preflight check" - exit 1 - fi - - cd "$CWD" -} run_data_access_preflight_check () { cd "$PROJECT_DIR"/prod_00 @@ -105,7 +90,6 @@ run_dummy_training_in_swarm () { } run_tests () { - run_docker_gpu_preflight_check run_data_access_preflight_check start_server_and_clients From 172c041cf40177ca5e90f258bb2b7735a2424279 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 28 Aug 2025 10:50:42 +0200 Subject: [PATCH 142/337] moved method for running data access preflight check --- runIntegrationTests.sh | 26 +++++++++++++++++--------- runTestsOutsideDocker.sh | 19 ------------------- 2 files changed, 17 insertions(+), 28 deletions(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 2ab7b059..bc77b52b 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -128,18 +128,26 @@ run_docker_gpu_preflight_check () { cd "$CWD" } -run_3dcnn_tests () { - echo "[Run] 3D CNN preflight check..." 
- - # run tests using synthetic data - cd "$PROJECT_DIR/prod_00/client_A/startup/" - # preflight check (standalone) and swarm simulation mode - ./docker.sh --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir /tmp/scratch --GPU "$GPU_FOR_TESTING" --no_pull --preflight_check - ./docker.sh --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir /tmp/scratch --GPU "$GPU_FOR_TESTING" --no_pull --run_script /MediSwarm/_run3DdcnnptlTestsInDocker.sh +run_data_access_preflight_check () { + # requires having built a startup kit and synthetic dataset + echo "[Run] Data access preflight check..." + cd "$PROJECT_DIR"/prod_00 + cd client_A/startup + CONSOLE_OUTPUT=data_access_preflight_check_console_output.txt + ./docker.sh --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir "$SCRATCH_DIR"/client_A --GPU device=$GPU_FOR_TESTING --preflight_check --no_pull 2>&1 | tee $CONSOLE_OUTPUT + + if grep -q "Train set: 18, Val set: 6" "$CONSOLE_OUTPUT" && grep -q "Epoch 0: 100%" "$CONSOLE_OUTPUT"; then + echo "Expected output of Docker/GPU preflight check found" + else + echo "Missing expected output of Docker/GPU preflight check" + exit 1 + fi cd "$CWD" } +# TODO ./docker.sh --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir /tmp/scratch --GPU "$GPU_FOR_TESTING" --no_pull --run_script /MediSwarm/_run3DdcnnptlTestsInDocker.sh + cleanup_temporary_data () { echo "[Cleanup] Removing synthetic data, scratch directory, dummy workspace ..." 
rm -rf "$SYNTHETIC_DATA_DIR" @@ -161,7 +169,7 @@ case "$1" in create_startup_kits_and_check_contained_files create_synthetic_data run_docker_gpu_preflight_check - # run_3dcnn_tests + run_data_access_preflight_check cleanup_temporary_data ;; *) echo "Unknown argument: $1"; exit 1 ;; diff --git a/runTestsOutsideDocker.sh b/runTestsOutsideDocker.sh index b2461ad3..a72018ba 100755 --- a/runTestsOutsideDocker.sh +++ b/runTestsOutsideDocker.sh @@ -24,23 +24,6 @@ kill_server_and_clients () { docker kill odelia_swarm_server_flserver odelia_swarm_client_client_A odelia_swarm_client_client_B } - -run_data_access_preflight_check () { - cd "$PROJECT_DIR"/prod_00 - cd client_A/startup - CONSOLE_OUTPUT=data_access_preflight_check_console_output.txt - ./docker.sh --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir "$SCRATCH_DIR"/client_A --GPU device=$GPU_FOR_TESTING --preflight_check --no_pull 2>&1 | tee $CONSOLE_OUTPUT - - if grep -q "Train set: 18, Val set: 6" "$CONSOLE_OUTPUT" && grep -q "Epoch 0: 100%" "$CONSOLE_OUTPUT"; then - echo "Expected output of Docker/GPU preflight check found" - else - echo "Missing expected output of Docker/GPU preflight check" - exit 1 - fi - - cd "$CWD" -} - run_dummy_training_in_swarm () { cd "$PROJECT_DIR"/prod_00 cd admin@test.odelia/startup @@ -90,8 +73,6 @@ run_dummy_training_in_swarm () { } run_tests () { - run_data_access_preflight_check - start_server_and_clients run_dummy_training_in_swarm kill_server_and_clients From 7058c7dda61c040645876723d81954fda19a997f Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 28 Aug 2025 10:51:28 +0200 Subject: [PATCH 143/337] refactored so that individual steps (including cleanup) can be run separately --- runIntegrationTests.sh | 44 +++++++++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 7 deletions(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index bc77b52b..6627ceb0 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -51,9 +51,13 @@ 
_run_test_in_docker() { } run_local_tests () { + echo "[Run] Controller unit tests" _run_test_in_docker tests/integration_tests/_run_controller_unit_tests_with_coverage.sh + echo "[Run] Minimal example, standalone" _run_test_in_docker tests/integration_tests/_run_minimal_example_standalone.sh + echo "[Run] Minimal example, simulation mode" _run_test_in_docker tests/integration_tests/_run_minimal_example_simulation_mode.sh + echo "[Run] Minimal example, proof-of-concept mode" _run_test_in_docker tests/integration_tests/_run_minimal_example_proof_of_concept_mode.sh # uncomment the following line to also run NVFlare's unit tests (takes about 2 minutes and will install python packages in the container) @@ -156,13 +160,39 @@ cleanup_temporary_data () { } case "$1" in - check_files_on_github) check_files_on_github ;; - run_local_tests) run_local_tests ;; - create_startup_kits) create_startup_kits_and_check_contained_files ;; - create_synthetic_data) create_synthetic_data ;; - run_docker_gpu_preflight_check) run_docker_gpu_preflight_check ;; - run_3dcnn_tests) run_3dcnn_tests ;; - cleanup) cleanup_temporary_data ;; + check_files_on_github) + check_files_on_github + cleanup_temporary_data + ;; + + run_local_tests) + run_local_tests + cleanup_temporary_data + ;; + + create_startup_kits) + create_startup_kits_and_check_contained_files + cleanup_temporary_data + ;; + + create_synthetic_data) + create_synthetic_data + cleanup_temporary_data + ;; + + run_docker_gpu_preflight_check) + create_startup_kits_and_check_contained_files + run_docker_gpu_preflight_check + cleanup_temporary_data + ;; + + run_data_access_preflight_check) + create_startup_kits_and_check_contained_files + create_synthetic_data + run_data_access_preflight_check + cleanup_temporary_data + ;; + all | "") check_files_on_github run_local_tests From 41d044a4e15ecd48df166ff065920902d1f2b90f Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 28 Aug 2025 11:44:32 +0200 Subject: [PATCH 144/337] integrated 3dcnn 
training in simulation mode in Docker in test --- runIntegrationTests.sh | 13 ++++++++++++- .../integration_tests/_run_3dcnn_simulation_mode.sh | 0 2 files changed, 12 insertions(+), 1 deletion(-) rename _run3DdcnnptlTestsInDocker.sh => tests/integration_tests/_run_3dcnn_simulation_mode.sh (100%) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 6627ceb0..8443061f 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -150,7 +150,11 @@ run_data_access_preflight_check () { cd "$CWD" } -# TODO ./docker.sh --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir /tmp/scratch --GPU "$GPU_FOR_TESTING" --no_pull --run_script /MediSwarm/_run3DdcnnptlTestsInDocker.sh +run_simulation_mode_in_docker () { + # requires having built a startup kit and synthetic dataset + echo "[Run] Simulation mode of 3DCNN training in Docker" + _run_test_in_docker tests/integration_tests/_run_3dcnn_simulation_mode.sh +} cleanup_temporary_data () { echo "[Cleanup] Removing synthetic data, scratch directory, dummy workspace ..." 
@@ -193,6 +197,13 @@ case "$1" in cleanup_temporary_data ;; + run_simulation_mode_in_docker) + create_startup_kits_and_check_contained_files + create_synthetic_data + run_simulation_mode_in_docker + cleanup_temporary_data + ;; + all | "") check_files_on_github run_local_tests diff --git a/_run3DdcnnptlTestsInDocker.sh b/tests/integration_tests/_run_3dcnn_simulation_mode.sh similarity index 100% rename from _run3DdcnnptlTestsInDocker.sh rename to tests/integration_tests/_run_3dcnn_simulation_mode.sh From 8da161e6b0ad5bec639e9caa05579d2069ffd1df Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 28 Aug 2025 11:52:58 +0200 Subject: [PATCH 145/337] let scripts fail on error --- .../_run_controller_unit_tests_with_coverage.sh | 2 ++ .../_run_minimal_example_proof_of_concept_mode.sh | 2 ++ tests/integration_tests/_run_minimal_example_simulation_mode.sh | 2 ++ tests/integration_tests/_run_minimal_example_standalone.sh | 2 ++ tests/integration_tests/_run_nvflare_unit_tests.sh | 2 ++ 5 files changed, 10 insertions(+) diff --git a/tests/integration_tests/_run_controller_unit_tests_with_coverage.sh b/tests/integration_tests/_run_controller_unit_tests_with_coverage.sh index 87ef36d1..3d3b87dd 100755 --- a/tests/integration_tests/_run_controller_unit_tests_with_coverage.sh +++ b/tests/integration_tests/_run_controller_unit_tests_with_coverage.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +set -e + run_controller_unit_tests_with_coverage () { # run unit tests of ODELIA swarm learning and report coverage export MPLCONFIGDIR=/tmp diff --git a/tests/integration_tests/_run_minimal_example_proof_of_concept_mode.sh b/tests/integration_tests/_run_minimal_example_proof_of_concept_mode.sh index 9331ea7b..9e60b7fc 100755 --- a/tests/integration_tests/_run_minimal_example_proof_of_concept_mode.sh +++ b/tests/integration_tests/_run_minimal_example_proof_of_concept_mode.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +set -e + run_minimal_example_proof_of_concept_mode () { # run proof-of-concept mode for 
minimal example cd /MediSwarm diff --git a/tests/integration_tests/_run_minimal_example_simulation_mode.sh b/tests/integration_tests/_run_minimal_example_simulation_mode.sh index 4f87934d..e1fd931f 100755 --- a/tests/integration_tests/_run_minimal_example_simulation_mode.sh +++ b/tests/integration_tests/_run_minimal_example_simulation_mode.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +set -e + run_minimal_example_simulation_mode () { # run simulation mode for minimal example cd /MediSwarm diff --git a/tests/integration_tests/_run_minimal_example_standalone.sh b/tests/integration_tests/_run_minimal_example_standalone.sh index 79e10ddb..f0106342 100755 --- a/tests/integration_tests/_run_minimal_example_standalone.sh +++ b/tests/integration_tests/_run_minimal_example_standalone.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +set -e + run_minimal_example_standalone () { # run standalone version of minimal example cd /MediSwarm/application/jobs/minimal_training_pytorch_cnn/app/custom/ diff --git a/tests/integration_tests/_run_nvflare_unit_tests.sh b/tests/integration_tests/_run_nvflare_unit_tests.sh index efd3b502..890406c2 100755 --- a/tests/integration_tests/_run_nvflare_unit_tests.sh +++ b/tests/integration_tests/_run_nvflare_unit_tests.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +set -e + run_nvflare_unit_tests () { cd /MediSwarm/docker_config/NVFlare ./runtest.sh -c -r From 93883ef4c97c91a24c46bb4ad19b937b217462a9 Mon Sep 17 00:00:00 2001 From: Ultimate-Storm Date: Mon, 8 Sep 2025 06:10:41 +0200 Subject: [PATCH 146/337] chore: update apt versions in Dockerfile_ODELIA --- docker_config/Dockerfile_ODELIA | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 551fa87b..a2e58d48 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -120,11 +120,11 @@ RUN apt install -y \ dbus-user-session=1.12.20-2ubuntu4.1 \ dbus=1.12.20-2ubuntu4.1 \ dmsetup=2:1.02.175-2.1ubuntu5 
\ - docker-buildx-plugin=0.26.1-1~ubuntu.22.04~jammy \ - docker-ce-cli=5:28.3.3-1~ubuntu.22.04~jammy \ - docker-ce-rootless-extras=5:28.3.3-1~ubuntu.22.04~jammy \ - docker-ce=5:28.3.3-1~ubuntu.22.04~jammy \ - docker-compose-plugin=2.39.1-1~ubuntu.22.04~jammy \ + docker-buildx-plugin=0.27.0-1~ubuntu.22.04~jammy \ + docker-ce-cli=5:28.4.0-1~ubuntu.22.04~jammy \ + docker-ce-rootless-extras=5:28.4.0-1~ubuntu.22.04~jammy \ + docker-ce=5:28.4.0-1~ubuntu.22.04~jammy \ + docker-compose-plugin=2.39.2-1~ubuntu.22.04~jammy \ gir1.2-glib-2.0=1.72.0-1 \ git-man=1:2.34.1-1ubuntu1.15 \ git=1:2.34.1-1ubuntu1.15 \ From 62ef9123e40629a61eb0f2454e3abc70c8999ada Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 8 Sep 2025 15:17:44 +0200 Subject: [PATCH 147/337] moved (last remaining) test of server and clients to script for integration tests --- runIntegrationTests.sh | 91 +++++++++++++++++++++++++++++++++++++++- runTestsOutsideDocker.sh | 83 ------------------------------------ 2 files changed, 90 insertions(+), 84 deletions(-) delete mode 100755 runTestsOutsideDocker.sh diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 8443061f..c4a19622 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -37,6 +37,7 @@ check_files_on_github () { done } + _run_test_in_docker() { echo "[Run]" $1 "inside Docker ..." docker run --rm \ @@ -50,6 +51,7 @@ _run_test_in_docker() { "$DOCKER_IMAGE" } + run_local_tests () { echo "[Run] Controller unit tests" _run_test_in_docker tests/integration_tests/_run_controller_unit_tests_with_coverage.sh @@ -64,8 +66,9 @@ run_local_tests () { # run_test_in_docker tests/integration_tests/_run_nvflare_unit_tests.sh } + create_startup_kits_and_check_contained_files () { - echo "[Prepare] Startup kits for dummy project ..." + echo "[Prepare] Startup kits for test project ..." if ! grep -q "127.0.0.1 server.local" /etc/hosts; then echo "/etc/hosts needs to contain the following line, please add it." 
@@ -104,6 +107,7 @@ create_startup_kits_and_check_contained_files () { done } + create_synthetic_data () { echo "[Prepare] Synthetic data ..." docker run --rm \ @@ -132,6 +136,7 @@ run_docker_gpu_preflight_check () { cd "$CWD" } + run_data_access_preflight_check () { # requires having built a startup kit and synthetic dataset echo "[Run] Data access preflight check..." @@ -150,12 +155,86 @@ run_data_access_preflight_check () { cd "$CWD" } + run_simulation_mode_in_docker () { # requires having built a startup kit and synthetic dataset echo "[Run] Simulation mode of 3DCNN training in Docker" _run_test_in_docker tests/integration_tests/_run_3dcnn_simulation_mode.sh } + +start_server_and_clients () { + cd "$PROJECT_DIR"/prod_00 + cd server.local/startup + ./docker.sh --no_pull --start_server + cd ../.. + sleep 10 + + cd client_A/startup + ./docker.sh --no_pull --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir "$SCRATCH_DIR"/client_A --GPU device=$GPU_FOR_TESTING --start_client + cd ../.. + cd client_B/startup + ./docker.sh --no_pull --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir "$SCRATCH_DIR"/client_B --GPU device=$GPU_FOR_TESTING --start_client + sleep 5 + + cd "$CWD" +} + + +run_dummy_training_in_swarm () { + cd "$PROJECT_DIR"/prod_00 + cd admin@test.odelia/startup + "$CWD"/_testsOutsideDocker_submitDummyTraining.exp + docker kill fladmin + sleep 60 + cd "$CWD" + + cd "$PROJECT_DIR"/prod_00/server.local/startup + CONSOLE_OUTPUT=nohup.out + for EXPECTED_OUTPUT in 'Total clients: 2' 'updated status of client client_A on round 4' 'updated status of client client_B on round 4' 'all_done=True' 'Server runner finished.'; + do + if grep -q "$EXPECTED_OUTPUT" "$CONSOLE_OUTPUT"; then + echo "Expected output $EXPECTED_OUTPUT found" + else + echo "Expected output $EXPECTED_OUTPUT missing" + exit 1 + fi + done + cd "$CWD" + + cd "$PROJECT_DIR"/prod_00/client_A/startup + CONSOLE_OUTPUT=nohup.out + for EXPECTED_OUTPUT in 'Sending training result to aggregation client' 'Epoch 9: 
100%' ; + do + if grep -q "$EXPECTED_OUTPUT" "$CONSOLE_OUTPUT"; then + echo "Expected output $EXPECTED_OUTPUT found" + else + echo "Expected output $EXPECTED_OUTPUT missing" + exit 1 + fi + done + cd "$CWD" + + cd "$PROJECT_DIR"/prod_00/client_A/ + FILES_PRESENT=$(find . -type f -name "*.*") + for EXPECTED_FILE in 'custom/minimal_training.py' 'best_FL_global_model.pt' 'FL_global_model.pt' ; + do + if echo "$FILES_PRESENT" | grep -q "$EXPECTED_FILE" ; then + echo "Expected file $EXPECTED_FILE found" + else + echo "Expected file $EXPECTED_FILE missing" + exit 1 + fi + done + cd "$CWD" +} + + +kill_server_and_clients () { + docker kill odelia_swarm_server_flserver odelia_swarm_client_client_A odelia_swarm_client_client_B +} + + cleanup_temporary_data () { echo "[Cleanup] Removing synthetic data, scratch directory, dummy workspace ..." rm -rf "$SYNTHETIC_DATA_DIR" @@ -163,6 +242,7 @@ cleanup_temporary_data () { rm -rf "$PROJECT_DIR" } + case "$1" in check_files_on_github) check_files_on_github @@ -204,6 +284,15 @@ case "$1" in cleanup_temporary_data ;; + run_dummy_training_in_swarm) + create_startup_kits_and_check_contained_files + create_synthetic_data + start_server_and_clients + run_dummy_training_in_swarm + kill_server_and_clients + cleanup_temporary_data + ;; + all | "") check_files_on_github run_local_tests diff --git a/runTestsOutsideDocker.sh b/runTestsOutsideDocker.sh deleted file mode 100755 index a72018ba..00000000 --- a/runTestsOutsideDocker.sh +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/env bash - -set -e - - -start_server_and_clients () { - cd "$PROJECT_DIR"/prod_00 - cd server.local/startup - ./docker.sh --no_pull --start_server - cd ../.. - sleep 10 - - cd client_A/startup - ./docker.sh --no_pull --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir "$SCRATCH_DIR"/client_A --GPU device=$GPU_FOR_TESTING --start_client - cd ../.. 
- cd client_B/startup - ./docker.sh --no_pull --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir "$SCRATCH_DIR"/client_B --GPU device=$GPU_FOR_TESTING --start_client - sleep 5 - - cd "$CWD" -} - -kill_server_and_clients () { - docker kill odelia_swarm_server_flserver odelia_swarm_client_client_A odelia_swarm_client_client_B -} - -run_dummy_training_in_swarm () { - cd "$PROJECT_DIR"/prod_00 - cd admin@test.odelia/startup - ../../../../../_testsOutsideDocker_submitDummyTraining.exp - docker kill fladmin - sleep 60 - cd "$CWD" - - cd "$PROJECT_DIR"/prod_00/server.local/startup - CONSOLE_OUTPUT=nohup.out - for EXPECTED_OUTPUT in 'Total clients: 2' 'updated status of client client_A on round 4' 'updated status of client client_B on round 4' 'all_done=True' 'Server runner finished.'; - do - if grep -q "$EXPECTED_OUTPUT" "$CONSOLE_OUTPUT"; then - echo "Expected output $EXPECTED_OUTPUT found" - else - echo "Expected output $EXPECTED_OUTPUT missing" - exit 1 - fi - done - cd "$CWD" - - cd "$PROJECT_DIR"/prod_00/client_A/startup - CONSOLE_OUTPUT=nohup.out - for EXPECTED_OUTPUT in 'Sending training result to aggregation client' 'Epoch 9: 100%' ; - do - if grep -q "$EXPECTED_OUTPUT" "$CONSOLE_OUTPUT"; then - echo "Expected output $EXPECTED_OUTPUT found" - else - echo "Expected output $EXPECTED_OUTPUT missing" - exit 1 - fi - done - cd "$CWD" - - cd "$PROJECT_DIR"/prod_00/client_A/ - FILES_PRESENT=$(find . 
-type f -name "*.*") - for EXPECTED_FILE in 'custom/minimal_training.py' 'best_FL_global_model.pt' 'FL_global_model.pt' ; - do - if echo "$FILES_PRESENT" | grep -q "$EXPECTED_FILE" ; then - echo "Expected file $EXPECTED_FILE found" - else - echo "Expected file $EXPECTED_FILE missing" - exit 1 - fi - done - cd "$CWD" -} - -run_tests () { - start_server_and_clients - run_dummy_training_in_swarm - kill_server_and_clients - - cleanup_temporary_data -} - -run_tests From 1a06b1b4c8106df6019c786be856097acb2754a3 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 8 Sep 2025 15:31:53 +0200 Subject: [PATCH 148/337] removed unnecessary block --- runIntegrationTests.sh | 5 ----- 1 file changed, 5 deletions(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index c4a19622..f51999ce 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -259,11 +259,6 @@ case "$1" in cleanup_temporary_data ;; - create_synthetic_data) - create_synthetic_data - cleanup_temporary_data - ;; - run_docker_gpu_preflight_check) create_startup_kits_and_check_contained_files run_docker_gpu_preflight_check From de29705d3b8bb18c9f344ad09d02fbd4100afe69 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 8 Sep 2025 15:32:08 +0200 Subject: [PATCH 149/337] completed "all" section --- runIntegrationTests.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index f51999ce..8fc1a26d 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -295,6 +295,9 @@ case "$1" in create_synthetic_data run_docker_gpu_preflight_check run_data_access_preflight_check + start_server_and_clients + run_dummy_training_in_swarm + kill_server_and_clients cleanup_temporary_data ;; *) echo "Unknown argument: $1"; exit 1 ;; From 1004218a8266afc367d8f084dd677f7ff42d1390 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 8 Sep 2025 15:32:22 +0200 Subject: [PATCH 150/337] consistently output what is being run --- runIntegrationTests.sh | 6 ++++++ 1 
file changed, 6 insertions(+) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 8fc1a26d..46dfb634 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -63,6 +63,7 @@ run_local_tests () { _run_test_in_docker tests/integration_tests/_run_minimal_example_proof_of_concept_mode.sh # uncomment the following line to also run NVFlare's unit tests (takes about 2 minutes and will install python packages in the container) + # echo "[Run] NVFlare unit tests" # run_test_in_docker tests/integration_tests/_run_nvflare_unit_tests.sh } @@ -164,6 +165,8 @@ run_simulation_mode_in_docker () { start_server_and_clients () { + echo "[Run] Start server and client Docker containers ..." + cd "$PROJECT_DIR"/prod_00 cd server.local/startup ./docker.sh --no_pull --start_server @@ -182,6 +185,8 @@ start_server_and_clients () { run_dummy_training_in_swarm () { + echo "[Run] Dummy training in swarm ..." + cd "$PROJECT_DIR"/prod_00 cd admin@test.odelia/startup "$CWD"/_testsOutsideDocker_submitDummyTraining.exp @@ -231,6 +236,7 @@ run_dummy_training_in_swarm () { kill_server_and_clients () { + echo "[Cleanup] Kill server and client Docker containers ..." 
docker kill odelia_swarm_server_flserver odelia_swarm_client_client_A odelia_swarm_client_client_B } From b928e9260ddce8daaf50679dd9cb8bb7ff65bda5 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 9 Sep 2025 10:46:06 +0200 Subject: [PATCH 151/337] running simulation mode of 3D CNN training does not work yet, commented out --- runIntegrationTests.sh | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 46dfb634..1709ac96 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -157,13 +157,6 @@ run_data_access_preflight_check () { } -run_simulation_mode_in_docker () { - # requires having built a startup kit and synthetic dataset - echo "[Run] Simulation mode of 3DCNN training in Docker" - _run_test_in_docker tests/integration_tests/_run_3dcnn_simulation_mode.sh -} - - start_server_and_clients () { echo "[Run] Start server and client Docker containers ..." @@ -278,13 +271,6 @@ case "$1" in cleanup_temporary_data ;; - run_simulation_mode_in_docker) - create_startup_kits_and_check_contained_files - create_synthetic_data - run_simulation_mode_in_docker - cleanup_temporary_data - ;; - run_dummy_training_in_swarm) create_startup_kits_and_check_contained_files create_synthetic_data @@ -311,3 +297,18 @@ esac # TODO adapt ./assets/readme/README.developer.md # TODO adapt .github/workflows/pr-test.yaml + +# The following does not work yet. It should be included in "all" and in .github/workflows/pr-test.yaml once it works. 
+# +# run_simulation_mode_in_docker () { +# # requires having built a startup kit and synthetic dataset +# echo "[Run] Simulation mode of 3DCNN training in Docker" +# _run_test_in_docker tests/integration_tests/_run_3dcnn_simulation_mode.sh +# } +# +# run_simulation_mode_in_docker) +# create_startup_kits_and_check_contained_files +# create_synthetic_data +# run_simulation_mode_in_docker +# cleanup_temporary_data +# ;; From 8ad95cca2664dacca6d0d0cd79b861ceba866f9a Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 9 Sep 2025 11:05:38 +0200 Subject: [PATCH 152/337] expanded run_local_tests and moved unit test script to more suitable folder --- runIntegrationTests.sh | 30 +++++++++++++++---- .../_run_nvflare_unit_tests.sh | 0 2 files changed, 24 insertions(+), 6 deletions(-) rename tests/{integration_tests => unit_tests}/_run_nvflare_unit_tests.sh (100%) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 1709ac96..392b9c3e 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -52,19 +52,29 @@ _run_test_in_docker() { } -run_local_tests () { +run_unit_tests_controller(){ echo "[Run] Controller unit tests" _run_test_in_docker tests/integration_tests/_run_controller_unit_tests_with_coverage.sh +} + +run_dummy_training_standalone(){ echo "[Run] Minimal example, standalone" _run_test_in_docker tests/integration_tests/_run_minimal_example_standalone.sh +} + +run_dummy_training_simulation_mode(){ echo "[Run] Minimal example, simulation mode" _run_test_in_docker tests/integration_tests/_run_minimal_example_simulation_mode.sh +} + +run_dummy_training_poc_mode(){ echo "[Run] Minimal example, proof-of-concept mode" _run_test_in_docker tests/integration_tests/_run_minimal_example_proof_of_concept_mode.sh +} - # uncomment the following line to also run NVFlare's unit tests (takes about 2 minutes and will install python packages in the container) - # echo "[Run] NVFlare unit tests" - # run_test_in_docker 
tests/integration_tests/_run_nvflare_unit_tests.sh +run_nvflare_unit_tests(){ + echo "[Run] NVFlare unit tests" + _run_test_in_docker tests/unit_tests/_run_nvflare_unit_tests.sh } @@ -249,7 +259,11 @@ case "$1" in ;; run_local_tests) - run_local_tests + run_unit_tests_controller + run_dummy_training_standalone + run_dummy_training_simulation_mode + run_dummy_training_poc_mode + run_nvflare_unit_tests cleanup_temporary_data ;; @@ -282,7 +296,11 @@ case "$1" in all | "") check_files_on_github - run_local_tests + run_unit_tests_controller + run_dummy_training_standalone + run_dummy_training_simulation_mode + run_dummy_training_poc_mode + run_nvflare_unit_tests create_startup_kits_and_check_contained_files create_synthetic_data run_docker_gpu_preflight_check diff --git a/tests/integration_tests/_run_nvflare_unit_tests.sh b/tests/unit_tests/_run_nvflare_unit_tests.sh similarity index 100% rename from tests/integration_tests/_run_nvflare_unit_tests.sh rename to tests/unit_tests/_run_nvflare_unit_tests.sh From a3c281c10f8bd79ea104f24ba5ba66d6bff2e486 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 9 Sep 2025 13:33:23 +0200 Subject: [PATCH 153/337] disabled NVFlare unit tests as before --- runIntegrationTests.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 392b9c3e..ff6b5a6a 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -263,7 +263,7 @@ case "$1" in run_dummy_training_standalone run_dummy_training_simulation_mode run_dummy_training_poc_mode - run_nvflare_unit_tests + # run_nvflare_unit_tests # uncomment to enable NVFlare unit tests cleanup_temporary_data ;; @@ -300,7 +300,7 @@ case "$1" in run_dummy_training_standalone run_dummy_training_simulation_mode run_dummy_training_poc_mode - run_nvflare_unit_tests + # run_nvflare_unit_tests # uncomment to enable NVFlare unit tests create_startup_kits_and_check_contained_files create_synthetic_data run_docker_gpu_preflight_check 
From d8e9a8972f1aac25410ee3d282740e831348d471 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 9 Sep 2025 13:51:58 +0200 Subject: [PATCH 154/337] updated developer readme --- assets/readme/README.developer.md | 15 ++++++++++----- runIntegrationTests.sh | 3 +-- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/assets/readme/README.developer.md b/assets/readme/README.developer.md index fb6aafc3..0f789529 100644 --- a/assets/readme/README.developer.md +++ b/assets/readme/README.developer.md @@ -24,10 +24,10 @@ The project description specifies the swarm nodes etc. to be used for a swarm tr kits, running local trainings in the startup kit), you can manually push the image to DockerHub, provided you have the necessary rights. Make sure you are not re-using a version number for this purpose. -## Running Local Tests +## Running Tests ```bash - ./runTestsInDocker.sh + ./runIntegrationTests.sh ``` You should see @@ -36,10 +36,11 @@ You should see 2. output of a successful simulation run with two nodes 3. output of a successful proof-of-concept run run with two nodes 4. output of a set of startup kits being generated -5. output of a dummy training run using one of the startup kits -6. TODO update this to what the tests output now +5. output of a Docker/GPU preflight check using one of the startup kits +6. output of a data access preflight check using one of the startup kits +7. output of a dummy training run in a swarm consisting of one server and two client nodes -Optionally, uncomment running NVFlare unit tests in `_runTestsInsideDocker.sh`. +Optionally, uncomment running NVFlare unit tests. ## Distributing Startup Kits @@ -93,3 +94,7 @@ export CONFIG=original run in the swarm 3. Use the local tests to check if the code is swarm-ready 4. TODO more detailed instructions + +## Continuous Integration + +Tests to be executed after pushing to github are defined in `.github/workflows/pr-test.yaml`. 
diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index ff6b5a6a..86429103 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -313,10 +313,9 @@ case "$1" in *) echo "Unknown argument: $1"; exit 1 ;; esac -# TODO adapt ./assets/readme/README.developer.md # TODO adapt .github/workflows/pr-test.yaml -# The following does not work yet. It should be included in "all" and in .github/workflows/pr-test.yaml once it works. +# The following does not work yet. It should be included in "all", in ./assets/readme/README.developer.md and in .github/workflows/pr-test.yaml once it works. # # run_simulation_mode_in_docker () { # # requires having built a startup kit and synthetic dataset From 219ecf0cf377f69b9e1b4d4fb7a8f401d3d31adf Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 9 Sep 2025 14:02:46 +0200 Subject: [PATCH 155/337] run integration tests in CI in one go --- .github/workflows/pr-test.yaml | 27 ++------------------------- runIntegrationTests.sh | 3 +-- 2 files changed, 3 insertions(+), 27 deletions(-) diff --git a/.github/workflows/pr-test.yaml b/.github/workflows/pr-test.yaml index a75c14a5..64c18cd1 100644 --- a/.github/workflows/pr-test.yaml +++ b/.github/workflows/pr-test.yaml @@ -50,27 +50,10 @@ jobs: - name: Build Docker image and dummy startup kits run: ./buildDockerImageAndStartupKits.sh -p tests/provision/dummy_project_for_testing.yml --use-docker-cache - - name: Prepare dummy trainings - continue-on-error: true - run: | - ./runTestsInDocker.sh prepare_dummy_trainings - echo "Dummy training project prepared" - - - name: Run dummy training + - name: Run integration tests continue-on-error: false run: | - ./runTestsInDocker.sh run_dummy_training - echo "Dummy training finished" - echo "=== Checking log output ===" - ls -lh workspace/*/prod_00/client_A/logs || echo "No logs found for dummy training" - - - name: Run 3D CNN tests - continue-on-error: false - run: | - ./runTestsInDocker.sh run_3dcnn_tests - echo "3D CNN tests check 
finished" - echo "=== Checking synthetic log output ===" - ls -lh workspace/*/prod_00/client_A/logs || echo "No logs found for 3D CNN tests" + ./runIntegrationTests.sh - name: Run Unit Tests inside Docker continue-on-error: true @@ -78,9 +61,3 @@ jobs: ./runTestsInDocker.sh run_tests echo "=== [LOG CHECK] ===" docker logs $(docker ps -a -q --latest) | grep -i "error" && echo "Error found in logs" || echo "No error found" - - - name: Cleanup training artifacts - continue-on-error: true - run: | - ./runTestsInDocker.sh cleanup - echo "Cleanup finished" diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 86429103..7335860a 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -313,8 +313,7 @@ case "$1" in *) echo "Unknown argument: $1"; exit 1 ;; esac -# TODO adapt .github/workflows/pr-test.yaml - +# TODO # The following does not work yet. It should be included in "all", in ./assets/readme/README.developer.md and in .github/workflows/pr-test.yaml once it works. 
# # run_simulation_mode_in_docker () { From e722909d77dfe1416414127e75836a1347b7c072 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 9 Sep 2025 14:07:16 +0200 Subject: [PATCH 156/337] renamed expect script and moved it to more suitable location --- runIntegrationTests.sh | 2 +- .../integration_tests/_submitDummyTraining.exp | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename _testsOutsideDocker_submitDummyTraining.exp => tests/integration_tests/_submitDummyTraining.exp (100%) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 7335860a..f747f3e6 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -192,7 +192,7 @@ run_dummy_training_in_swarm () { cd "$PROJECT_DIR"/prod_00 cd admin@test.odelia/startup - "$CWD"/_testsOutsideDocker_submitDummyTraining.exp + "$CWD"/tests/integration_tests/_submitDummyTraining.exp docker kill fladmin sleep 60 cd "$CWD" diff --git a/_testsOutsideDocker_submitDummyTraining.exp b/tests/integration_tests/_submitDummyTraining.exp similarity index 100% rename from _testsOutsideDocker_submitDummyTraining.exp rename to tests/integration_tests/_submitDummyTraining.exp From 15efd9fad79ce4b27e31443eefbfe8f7288c6d30 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 9 Sep 2025 14:11:02 +0200 Subject: [PATCH 157/337] removed step using script that no longer exists --- .github/workflows/pr-test.yaml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/.github/workflows/pr-test.yaml b/.github/workflows/pr-test.yaml index 64c18cd1..8a93ba94 100644 --- a/.github/workflows/pr-test.yaml +++ b/.github/workflows/pr-test.yaml @@ -54,10 +54,3 @@ jobs: continue-on-error: false run: | ./runIntegrationTests.sh - - - name: Run Unit Tests inside Docker - continue-on-error: true - run: | - ./runTestsInDocker.sh run_tests - echo "=== [LOG CHECK] ===" - docker logs $(docker ps -a -q --latest) | grep -i "error" && echo "Error found in logs" || echo "No error found" From 2bac35e9127a185c5b3bfe87960840b8714909e2 Mon Sep 17 
00:00:00 2001 From: Ole Schwen Date: Tue, 9 Sep 2025 14:34:56 +0200 Subject: [PATCH 158/337] trying to enable test of 3D CNN in simulation mode --- runIntegrationTests.sh | 37 ++++++++++--------- .../_run_3dcnn_simulation_mode.sh | 9 +++-- 2 files changed, 25 insertions(+), 21 deletions(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index f747f3e6..ad9bb283 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -45,7 +45,8 @@ _run_test_in_docker() { --ipc=host \ --ulimit memlock=-1 \ --ulimit stack=67108864 \ - -v /tmp:/scratch \ + -v "$SYNTHETIC_DATA_DIR":/data \ + -v "$SCRATCH_DIR":/scratch \ --gpus="$GPU_FOR_TESTING" \ --entrypoint=/MediSwarm/$1 \ "$DOCKER_IMAGE" @@ -83,7 +84,7 @@ create_startup_kits_and_check_contained_files () { if ! grep -q "127.0.0.1 server.local" /etc/hosts; then echo "/etc/hosts needs to contain the following line, please add it." - echo "127.0.0.1 server.local localhost" + echo "127.0.0.1 server.local" exit 1 fi @@ -167,6 +168,13 @@ run_data_access_preflight_check () { } +run_3dcnn_simulation_mode () { + # requires having built a startup kit and synthetic dataset + echo "[Run] Simulation mode of 3DCNN training in Docker" + _run_test_in_docker tests/integration_tests/_run_3dcnn_simulation_mode.sh +} + + start_server_and_clients () { echo "[Run] Start server and client Docker containers ..." 
@@ -272,6 +280,13 @@ case "$1" in cleanup_temporary_data ;; + run_3dcnn_simulation_mode) + create_startup_kits_and_check_contained_files + create_synthetic_data + run_3dcnn_simulation_mode + cleanup_temporary_data + ;; + run_docker_gpu_preflight_check) create_startup_kits_and_check_contained_files run_docker_gpu_preflight_check @@ -301,8 +316,9 @@ case "$1" in run_dummy_training_simulation_mode run_dummy_training_poc_mode # run_nvflare_unit_tests # uncomment to enable NVFlare unit tests - create_startup_kits_and_check_contained_files create_synthetic_data + run_3dcnn_simulation_mode + create_startup_kits_and_check_contained_files run_docker_gpu_preflight_check run_data_access_preflight_check start_server_and_clients @@ -314,17 +330,4 @@ case "$1" in esac # TODO -# The following does not work yet. It should be included in "all", in ./assets/readme/README.developer.md and in .github/workflows/pr-test.yaml once it works. -# -# run_simulation_mode_in_docker () { -# # requires having built a startup kit and synthetic dataset -# echo "[Run] Simulation mode of 3DCNN training in Docker" -# _run_test_in_docker tests/integration_tests/_run_3dcnn_simulation_mode.sh -# } -# -# run_simulation_mode_in_docker) -# create_startup_kits_and_check_contained_files -# create_synthetic_data -# run_simulation_mode_in_docker -# cleanup_temporary_data -# ;; +# Once the 3D CNN simulation mode works, it should be mentioned in ./assets/readme/README.developer.md. 
diff --git a/tests/integration_tests/_run_3dcnn_simulation_mode.sh b/tests/integration_tests/_run_3dcnn_simulation_mode.sh index 7fb7a877..030e855e 100755 --- a/tests/integration_tests/_run_3dcnn_simulation_mode.sh +++ b/tests/integration_tests/_run_3dcnn_simulation_mode.sh @@ -11,11 +11,12 @@ run_3dcnn_simulation_mode () { sed -i 's/num_rounds = .*/num_rounds = 2/' ${TMPDIR}/ODELIA_ternary_classification/app/config/config_fed_server.conf export TRAINING_MODE="swarm" export SITE_NAME="client_A" + export DATA_DIR=/data + export SCRATCH_DIR=/scratch + export TORCH_HOME=/torch_home + export MODEL_NAME=MST + export CONFIG=unilateral nvflare simulator -w /tmp/ODELIA_ternary_classification -n 2 -t 2 ${TMPDIR}/ODELIA_ternary_classification -c client_A,client_B - unset TRAINING_MODE - unset SITE_NAME - rm -rf ${TMPDIR} - unset TMPDIR } run_3dcnn_simulation_mode From 7c85dd7fd2214716f2bba998f25e5d0170b388b9 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 9 Sep 2025 15:21:43 +0200 Subject: [PATCH 159/337] moved check of name resolution to where it is needed --- runIntegrationTests.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index ad9bb283..69cf1d1b 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -82,12 +82,6 @@ run_nvflare_unit_tests(){ create_startup_kits_and_check_contained_files () { echo "[Prepare] Startup kits for test project ..." - if ! grep -q "127.0.0.1 server.local" /etc/hosts; then - echo "/etc/hosts needs to contain the following line, please add it." - echo "127.0.0.1 server.local" - exit 1 - fi - if [ ! -d "$PROJECT_DIR"/prod_00 ]; then ./_buildStartupKits.sh $PROJECT_FILE $VERSION fi @@ -178,6 +172,12 @@ run_3dcnn_simulation_mode () { start_server_and_clients () { echo "[Run] Start server and client Docker containers ..." + if ! grep -q "127.0.0.1 server.local" /etc/hosts; then + echo "/etc/hosts needs to contain the following line, please add it." 
+ echo "127.0.0.1 server.local" + exit 1 + fi + cd "$PROJECT_DIR"/prod_00 cd server.local/startup ./docker.sh --no_pull --start_server From 966a099e9dd338a9c9d007d9aa14c40da2c3cbf1 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 9 Sep 2025 15:22:48 +0200 Subject: [PATCH 160/337] removed unnecessary step --- runIntegrationTests.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 69cf1d1b..5b2e914b 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -281,7 +281,6 @@ case "$1" in ;; run_3dcnn_simulation_mode) - create_startup_kits_and_check_contained_files create_synthetic_data run_3dcnn_simulation_mode cleanup_temporary_data From aa5c98325ab48b245d8f04ab0e264efdf4a5ada5 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 9 Sep 2025 15:23:08 +0200 Subject: [PATCH 161/337] made tests that do not use the startup kits callable individually --- runIntegrationTests.sh | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 5b2e914b..900009b8 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -266,17 +266,23 @@ case "$1" in cleanup_temporary_data ;; - run_local_tests) + run_unit_tests_controller) run_unit_tests_controller + cleanup_temporary_data + ;; + + run_dummy_training_standalone) run_dummy_training_standalone + cleanup_temporary_data + ;; + + run_dummy_training_simulation_mode) run_dummy_training_simulation_mode - run_dummy_training_poc_mode - # run_nvflare_unit_tests # uncomment to enable NVFlare unit tests cleanup_temporary_data ;; - create_startup_kits) - create_startup_kits_and_check_contained_files + run_dummy_training_poc_mode) + run_dummy_training_poc_mode cleanup_temporary_data ;; @@ -286,6 +292,11 @@ case "$1" in cleanup_temporary_data ;; + create_startup_kits) + create_startup_kits_and_check_contained_files + cleanup_temporary_data + ;; + run_docker_gpu_preflight_check) 
create_startup_kits_and_check_contained_files run_docker_gpu_preflight_check From 631c1f7d4eeb64cbf802b462c8324b74d0aa6a43 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 9 Sep 2025 15:23:31 +0200 Subject: [PATCH 162/337] call tests as separate steps in workflow --- .github/workflows/pr-test.yaml | 49 ++++++++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr-test.yaml b/.github/workflows/pr-test.yaml index 8a93ba94..abe73027 100644 --- a/.github/workflows/pr-test.yaml +++ b/.github/workflows/pr-test.yaml @@ -50,7 +50,52 @@ jobs: - name: Build Docker image and dummy startup kits run: ./buildDockerImageAndStartupKits.sh -p tests/provision/dummy_project_for_testing.yml --use-docker-cache - - name: Run integration tests + - name: Run integration test (check_files_on_github) continue-on-error: false run: | - ./runIntegrationTests.sh + ./runIntegrationTests.sh check_files_on_github + + - name: Run integration test (run_unit_tests_controller) + continue-on-error: false + run: | + ./runIntegrationTests.sh run_unit_tests_controller + + - name: Run integration test (run_dummy_training_standalone) + continue-on-error: false + run: | + ./runIntegrationTests.sh run_dummy_training_standalone + + - name: Run integration test (run_dummy_training_simulation_mode) + continue-on-error: false + run: | + ./runIntegrationTests.sh run_dummy_training_simulation_mode + + - name: Run integration test (run_dummy_training_poc_mode) + continue-on-error: false + run: | + ./runIntegrationTests.sh run_dummy_training_poc_mode + + - name: Run integration test (run_3dcnn_simulation_mode) + continue-on-error: false + run: | + ./runIntegrationTests.sh run_3dcnn_simulation_mode + + - name: Run integration test (create_startup_kits) + continue-on-error: false + run: | + ./runIntegrationTests.sh create_startup_kits + + - name: Run integration test (run_docker_gpu_preflight_check) + continue-on-error: false + run: | + 
./runIntegrationTests.sh run_docker_gpu_preflight_check + + - name: Run integration test (run_data_access_preflight_check) + continue-on-error: false + run: | + ./runIntegrationTests.sh run_data_access_preflight_check + + - name: Run integration test (run_dummy_training_in_swarm) + continue-on-error: false + run: | + ./runIntegrationTests.sh run_dummy_training_in_swarm From a51099a29fc3d55bf4802505c7c4422064635368 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 9 Sep 2025 15:33:30 +0200 Subject: [PATCH 163/337] arguments for docker run like in docker.sh from startup scripts to create files with permission for local user --- runIntegrationTests.sh | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 900009b8..95a4eda6 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -45,6 +45,8 @@ _run_test_in_docker() { --ipc=host \ --ulimit memlock=-1 \ --ulimit stack=67108864 \ + -u $(id -u):$(id -g) \ + -v /etc/passwd:/etc/passwd -v /etc/group:/etc/group \ -v "$SYNTHETIC_DATA_DIR":/data \ -v "$SCRATCH_DIR":/scratch \ --gpus="$GPU_FOR_TESTING" \ @@ -117,11 +119,12 @@ create_startup_kits_and_check_contained_files () { create_synthetic_data () { echo "[Prepare] Synthetic data ..." 
docker run --rm \ - -u $(id -u):$(id -g) \ - -v "$SYNTHETIC_DATA_DIR":/synthetic_data \ - -w /MediSwarm \ - $DOCKER_IMAGE \ - /bin/bash -c "python3 application/jobs/ODELIA_ternary_classification/app/scripts/create_synthetic_dataset/create_synthetic_dataset.py /synthetic_data" + -u $(id -u):$(id -g) \ + -v /etc/passwd:/etc/passwd -v /etc/group:/etc/group \ + -v "$SYNTHETIC_DATA_DIR":/synthetic_data \ + -w /MediSwarm \ + $DOCKER_IMAGE \ + /bin/bash -c "python3 application/jobs/ODELIA_ternary_classification/app/scripts/create_synthetic_dataset/create_synthetic_dataset.py /synthetic_data" } From 325b537994ce99d335c46fcb2751e31f4d2678ed Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 9 Sep 2025 15:44:52 +0200 Subject: [PATCH 164/337] write coverage file to location outside code directory --- .../_run_controller_unit_tests_with_coverage.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/integration_tests/_run_controller_unit_tests_with_coverage.sh b/tests/integration_tests/_run_controller_unit_tests_with_coverage.sh index 3d3b87dd..46e6e11c 100755 --- a/tests/integration_tests/_run_controller_unit_tests_with_coverage.sh +++ b/tests/integration_tests/_run_controller_unit_tests_with_coverage.sh @@ -5,10 +5,11 @@ set -e run_controller_unit_tests_with_coverage () { # run unit tests of ODELIA swarm learning and report coverage export MPLCONFIGDIR=/tmp + export COVERAGE_FILE=/tmp/.MediSwarm_coverage cd /MediSwarm/tests/unit_tests/controller PYTHONPATH=/MediSwarm/controller/controller python3 -m coverage run --source=/MediSwarm/controller/controller -m unittest discover coverage report -m - rm .coverage + rm "$COVERAGE_FILE" } run_controller_unit_tests_with_coverage From a2be697f066bfff17080c9c28e3a1bfbffe1f966 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 9 Sep 2025 15:56:56 +0200 Subject: [PATCH 165/337] ensure directory exists --- .../_run_minimal_example_proof_of_concept_mode.sh | 1 + 1 file changed, 1 insertion(+) diff --git 
a/tests/integration_tests/_run_minimal_example_proof_of_concept_mode.sh b/tests/integration_tests/_run_minimal_example_proof_of_concept_mode.sh index 9e60b7fc..ee26a4d0 100755 --- a/tests/integration_tests/_run_minimal_example_proof_of_concept_mode.sh +++ b/tests/integration_tests/_run_minimal_example_proof_of_concept_mode.sh @@ -4,6 +4,7 @@ set -e run_minimal_example_proof_of_concept_mode () { # run proof-of-concept mode for minimal example + mkdir -p ~/.nvflare cd /MediSwarm export TRAINING_MODE="swarm" nvflare poc prepare -c poc_client_0 poc_client_1 From e9f2228658c8a2669e006f72d72e25b997df4d70 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 9 Sep 2025 15:58:36 +0200 Subject: [PATCH 166/337] renamed server "localhost" so that it does not need mapping to an IP address --- runIntegrationTests.sh | 10 ++-------- tests/provision/dummy_project_for_testing.yml | 4 ++-- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 95a4eda6..eec1d22c 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -175,14 +175,8 @@ run_3dcnn_simulation_mode () { start_server_and_clients () { echo "[Run] Start server and client Docker containers ..." - if ! grep -q "127.0.0.1 server.local" /etc/hosts; then - echo "/etc/hosts needs to contain the following line, please add it." - echo "127.0.0.1 server.local" - exit 1 - fi - cd "$PROJECT_DIR"/prod_00 - cd server.local/startup + cd localhost/startup ./docker.sh --no_pull --start_server cd ../.. 
sleep 10 @@ -208,7 +202,7 @@ run_dummy_training_in_swarm () { sleep 60 cd "$CWD" - cd "$PROJECT_DIR"/prod_00/server.local/startup + cd "$PROJECT_DIR"/prod_00/localhost/startup CONSOLE_OUTPUT=nohup.out for EXPECTED_OUTPUT in 'Total clients: 2' 'updated status of client client_A on round 4' 'updated status of client client_B on round 4' 'all_done=True' 'Server runner finished.'; do diff --git a/tests/provision/dummy_project_for_testing.yml b/tests/provision/dummy_project_for_testing.yml index d4984d77..613f81ce 100644 --- a/tests/provision/dummy_project_for_testing.yml +++ b/tests/provision/dummy_project_for_testing.yml @@ -4,7 +4,7 @@ description: > Test setup. participants: - - name: server.local + - name: localhost type: server org: Test_Org fed_learn_port: 8002 @@ -34,7 +34,7 @@ builders: path: nvflare.ha.dummy_overseer_agent.DummyOverseerAgent overseer_exists: false args: - sp_end_point: server.local:8002:8003 + sp_end_point: localhost:8002:8003 - path: nvflare.lighter.impl.cert.CertBuilder - path: nvflare.lighter.impl.signature.SignatureBuilder From 0ef20d95ac16f8e93acf8c4e00ed74fde80cb17b Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 9 Sep 2025 16:10:50 +0200 Subject: [PATCH 167/337] allow local user to create home directory --- docker_config/Dockerfile_ODELIA | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index a2e58d48..0eada436 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -343,3 +343,6 @@ RUN ln -s /MediSwarm /fl_admin/transfer/MediSwarm # Copy pre-trained model weights to image COPY ./torch_home_cache /torch_home + +# allow creating home directory for local user inside container if needed +RUN chmod a+rwx /home From e6acae0cdea925ba34b3a1ffd97760d2a28aa7b4 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 9 Sep 2025 16:34:05 +0200 Subject: [PATCH 168/337] avoid name clashes of Docker containers --- buildDockerImageAndStartupKits.sh | 4 
+++- docker_config/master_template.yml | 8 ++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/buildDockerImageAndStartupKits.sh b/buildDockerImageAndStartupKits.sh index 1767cfef..5c894a4f 100755 --- a/buildDockerImageAndStartupKits.sh +++ b/buildDockerImageAndStartupKits.sh @@ -26,7 +26,7 @@ fi VERSION=`./getVersionNumber.sh` DOCKER_IMAGE=jefftud/odelia:$VERSION - +CONTAINER_VERSION_ID=`git rev-parse --short HEAD` # prepare clean version of source code repository clone for building Docker image @@ -42,6 +42,8 @@ cd ../.. rm .git -rf chmod a+rX . -R sed -i 's#__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_DOCKER_IMAGE__#'$VERSION'#' docker_config/master_template.yml +sed -i 's#__REPLACED_BY_CONTAINER_VERSION_IDENTIFIER_WHEN_BUILDING_DOCKER_IMAGE__#'$CONTAINER_VERSION_ID'#' docker_config/master_template.yml + cd $CWD diff --git a/docker_config/master_template.yml b/docker_config/master_template.yml index 0a2306db..0423403b 100644 --- a/docker_config/master_template.yml +++ b/docker_config/master_template.yml @@ -680,7 +680,7 @@ docker_cln_sh: | docker pull "$DOCKER_IMAGE" fi - CONTAINER_NAME=odelia_swarm_client_{~~client_name~~} + CONTAINER_NAME=odelia_swarm_client_{~~client_name~~}___REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_DOCKER_IMAGE__ DOCKER_OPTIONS_A="--name=$CONTAINER_NAME --gpus=$GPU2USE -u $(id -u):$(id -g)" DOCKER_MOUNTS="-v /etc/passwd:/etc/passwd -v /etc/group:/etc/group -v $DIR/..:/startupkit/ -v $MY_SCRATCH_DIR:/scratch/" if [[ ! -z "$MY_DATA_DIR" ]]; then @@ -697,7 +697,7 @@ docker_cln_sh: | --env GPU_DEVICE=$GPU2USE \ --env MODEL_NAME=MST \ --env CONFIG=unilateral \ - --env MEDISWARM_VERSION=__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_DOCKER_IMAGE__" + --env MEDISWARM_VERSION=__REPLACED_BY_CONTAINER_VERSION_IDENTIFIER_WHEN_BUILDING_DOCKER_IMAGE__ # Execution modes if [[ ! 
-z "$DUMMY_TRAINING" ]]; then @@ -764,7 +764,7 @@ docker_svr_sh: | docker pull $DOCKER_IMAGE fi svr_name="${SVR_NAME:-flserver}" - CONTAINER_NAME=odelia_swarm_server_$svr_name + CONTAINER_NAME=odelia_swarm_server_${svr_name}___REPLACED_BY_CONTAINER_VERSION_IDENTIFIER_WHEN_BUILDING_DOCKER_IMAGE__ rm -rf ../pid.fl ../daemon_pid.fl # clean up potential leftovers from previous run @@ -811,7 +811,7 @@ docker_adm_sh: | echo "Updating docker image" docker pull $DOCKER_IMAGE fi - CONTAINER_NAME=odelia_swarm_admin + CONTAINER_NAME=odelia_swarm_admin___REPLACED_BY_CONTAINER_VERSION_IDENTIFIER_WHEN_BUILDING_DOCKER_IMAGE__ echo "Starting docker with $DOCKER_IMAGE as $CONTAINER_NAME" docker run --rm -it --name=fladmin -v $DIR/../local/:/fl_admin/local/ -v $DIR/../startup/:/fl_admin/startup/ -w /fl_admin/startup/ $NETARG $DOCKER_IMAGE /bin/bash -c "./fl_admin.sh" From e27bb473659239393243ae8c55917bc6b1217def Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 9 Sep 2025 16:54:48 +0200 Subject: [PATCH 169/337] fixed replacement of version identifiers --- docker_config/master_template.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker_config/master_template.yml b/docker_config/master_template.yml index 0423403b..180eacf4 100644 --- a/docker_config/master_template.yml +++ b/docker_config/master_template.yml @@ -680,7 +680,7 @@ docker_cln_sh: | docker pull "$DOCKER_IMAGE" fi - CONTAINER_NAME=odelia_swarm_client_{~~client_name~~}___REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_DOCKER_IMAGE__ + CONTAINER_NAME=odelia_swarm_client_{~~client_name~~}___REPLACED_BY_CONTAINER_VERSION_IDENTIFIER_WHEN_BUILDING_DOCKER_IMAGE__ DOCKER_OPTIONS_A="--name=$CONTAINER_NAME --gpus=$GPU2USE -u $(id -u):$(id -g)" DOCKER_MOUNTS="-v /etc/passwd:/etc/passwd -v /etc/group:/etc/group -v $DIR/..:/startupkit/ -v $MY_SCRATCH_DIR:/scratch/" if [[ ! 
-z "$MY_DATA_DIR" ]]; then @@ -697,7 +697,7 @@ docker_cln_sh: | --env GPU_DEVICE=$GPU2USE \ --env MODEL_NAME=MST \ --env CONFIG=unilateral \ - --env MEDISWARM_VERSION=__REPLACED_BY_CONTAINER_VERSION_IDENTIFIER_WHEN_BUILDING_DOCKER_IMAGE__ + --env MEDISWARM_VERSION=__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_DOCKER_IMAGE__ # Execution modes if [[ ! -z "$DUMMY_TRAINING" ]]; then From 8f5141663890a7c37769b1bc8ef23a42a199773c Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 10 Sep 2025 11:16:25 +0200 Subject: [PATCH 170/337] fixed missing closing " --- docker_config/master_template.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker_config/master_template.yml b/docker_config/master_template.yml index 180eacf4..440f0633 100644 --- a/docker_config/master_template.yml +++ b/docker_config/master_template.yml @@ -697,7 +697,7 @@ docker_cln_sh: | --env GPU_DEVICE=$GPU2USE \ --env MODEL_NAME=MST \ --env CONFIG=unilateral \ - --env MEDISWARM_VERSION=__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_DOCKER_IMAGE__ + --env MEDISWARM_VERSION=__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_DOCKER_IMAGE__" # Execution modes if [[ ! -z "$DUMMY_TRAINING" ]]; then From d29ac3119475720f31b8eb6dbe4749571ac2cd4e Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 10 Sep 2025 13:55:14 +0200 Subject: [PATCH 171/337] wait longer so that sys_info sees both clients --- runIntegrationTests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index eec1d22c..d21a32d1 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -186,7 +186,7 @@ start_server_and_clients () { cd ../.. 
cd client_B/startup ./docker.sh --no_pull --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir "$SCRATCH_DIR"/client_B --GPU device=$GPU_FOR_TESTING --start_client - sleep 5 + sleep 8 cd "$CWD" } From 42cc98af9d26a5252bda5015f094f1167949bc09 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 10 Sep 2025 13:58:49 +0200 Subject: [PATCH 172/337] check that models for dummy training are small --- runIntegrationTests.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index d21a32d1..d8f560ac 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -239,6 +239,15 @@ run_dummy_training_in_swarm () { exit 1 fi done + + actualsize=$(wc -c <*/app_client_A/best_FL_global_model.pt) + if [ $actualsize -le 1048576 ]; then + echo "Checkpoint file size OK" + else + echo "Checkpoint too large: " $actualsize + exit 1 + fi + cd "$CWD" } From cbdf38a987b8300e2e38019d93df0c8717685cc2 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 10 Sep 2025 14:16:22 +0200 Subject: [PATCH 173/337] added check whether job ID is logged by server --- runIntegrationTests.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index d8f560ac..164e66b1 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -204,9 +204,10 @@ run_dummy_training_in_swarm () { cd "$PROJECT_DIR"/prod_00/localhost/startup CONSOLE_OUTPUT=nohup.out - for EXPECTED_OUTPUT in 'Total clients: 2' 'updated status of client client_A on round 4' 'updated status of client client_B on round 4' 'all_done=True' 'Server runner finished.'; + for EXPECTED_OUTPUT in 'Total clients: 2' 'updated status of client client_A on round 4' 'updated status of client client_B on round 4' 'all_done=True' 'Server runner finished.' 
\ + 'Start to the run Job: [0-9a-f]\{8\}-[0-9a-f]\{4\}-[0-9a-f]\{4\}-[0-9a-f]\{4\}-[0-9a-f]\{12\}'; do - if grep -q "$EXPECTED_OUTPUT" "$CONSOLE_OUTPUT"; then + if grep -q --regexp="$EXPECTED_OUTPUT" "$CONSOLE_OUTPUT"; then echo "Expected output $EXPECTED_OUTPUT found" else echo "Expected output $EXPECTED_OUTPUT missing" From a350349461a01f814e6611d97cb02cdeafad4ed8 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 10 Sep 2025 14:30:30 +0200 Subject: [PATCH 174/337] use defined container name for container running admin console --- docker_config/master_template.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker_config/master_template.yml b/docker_config/master_template.yml index 440f0633..ec989a7b 100644 --- a/docker_config/master_template.yml +++ b/docker_config/master_template.yml @@ -814,7 +814,7 @@ docker_adm_sh: | CONTAINER_NAME=odelia_swarm_admin___REPLACED_BY_CONTAINER_VERSION_IDENTIFIER_WHEN_BUILDING_DOCKER_IMAGE__ echo "Starting docker with $DOCKER_IMAGE as $CONTAINER_NAME" - docker run --rm -it --name=fladmin -v $DIR/../local/:/fl_admin/local/ -v $DIR/../startup/:/fl_admin/startup/ -w /fl_admin/startup/ $NETARG $DOCKER_IMAGE /bin/bash -c "./fl_admin.sh" + docker run --rm -it --name=$CONTAINER_NAME -v $DIR/../local/:/fl_admin/local/ -v $DIR/../startup/:/fl_admin/startup/ -w /fl_admin/startup/ $NETARG $DOCKER_IMAGE /bin/bash -c "./fl_admin.sh" compose_yaml: | services: From d4d50b918a5dc65490129d7632e84889a5d64c31 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 10 Sep 2025 14:31:37 +0200 Subject: [PATCH 175/337] use correct container names in `docker kill` --- runIntegrationTests.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 164e66b1..30167298 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -3,6 +3,7 @@ set -e VERSION=$(./getVersionNumber.sh) +CONTAINER_VERSION_SUFFIX=$(git rev-parse --short HEAD) 
DOCKER_IMAGE=jefftud/odelia:$VERSION PROJECT_DIR="workspace/odelia_${VERSION}_dummy_project_for_testing" SYNTHETIC_DATA_DIR=$(mktemp -d) @@ -198,7 +199,7 @@ run_dummy_training_in_swarm () { cd "$PROJECT_DIR"/prod_00 cd admin@test.odelia/startup "$CWD"/tests/integration_tests/_submitDummyTraining.exp - docker kill fladmin + docker kill odelia_swarm_admin_$CONTAINER_VERSION_SUFFIX sleep 60 cd "$CWD" @@ -255,7 +256,7 @@ run_dummy_training_in_swarm () { kill_server_and_clients () { echo "[Cleanup] Kill server and client Docker containers ..." - docker kill odelia_swarm_server_flserver odelia_swarm_client_client_A odelia_swarm_client_client_B + docker kill odelia_swarm_server_flserver_$CONTAINER_VERSION_SUFFIX odelia_swarm_client_client_A_$CONTAINER_VERSION_SUFFIX odelia_swarm_client_client_B_$CONTAINER_VERSION_SUFFIX } From d831f6697dbe10fd9d1040ab417752e3db103432 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 10 Sep 2025 14:56:08 +0200 Subject: [PATCH 176/337] updated instructions on building startup kits --- assets/readme/README.operator.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/assets/readme/README.operator.md b/assets/readme/README.operator.md index 101d5266..130629b7 100644 --- a/assets/readme/README.operator.md +++ b/assets/readme/README.operator.md @@ -24,12 +24,15 @@ For example, add the following line (replace `` with the server's actual IP ### Via Script (recommended) 1. Use, e.g., the file `application/provision/project_MEVIS_test.yml`, adapt as needed (network protocol etc.) -2. Call `buildStartupKits.sh /path/to/project_configuration.yml` to build the startup kits +2. Call `buildDockerImageAndStartupKits.sh -p /path/to/project_configuration.yml` to build the Docker image and the startup kits 3. Startup kits are generated to `workspace//prod_00/` -4. Deploy startup kits to the respective server/clients +4. Deploy startup kits to the respective server/client operators +5. 
Push the Docker image to the registry ### Via the Dashboard (not recommended) +Build the Docker image as described above. + ```bash docker run -d --rm \ --ipc=host -p 8443:8443 \ @@ -69,7 +72,7 @@ Access the dashboard at `https://localhost:8443` log in with the admin credentia 2. Client Sites > approve client sites 3. Project Home > freeze project -## Download startup kits +#### Download startup kits After setting up the project admin configuration, server and clients can download their startup kits. Store the passwords somewhere, they are only displayed once (or you can download them again). From 3c44e88a111968c8e07e8deb57643eb7c1d2611e Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 10 Sep 2025 14:56:17 +0200 Subject: [PATCH 177/337] check for keywords in documentation --- runIntegrationTests.sh | 44 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 5 deletions(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 30167298..23339c58 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -18,24 +18,59 @@ fi check_files_on_github () { echo "[Run] Test whether expected content is available on github" - CONTENT=$(curl -L https://github.com/KatherLab/MediSwarm/raw/refs/heads/main/LICENSE) - if echo "$CONTENT" | grep -q "MIT License" ; then + LICENSE_ON_GITHUB=$(curl -L https://github.com/KatherLab/MediSwarm/raw/refs/heads/main/LICENSE) + if echo "$LICENSE_ON_GITHUB" | grep -q "MIT License" ; then echo "Downloaded and verified license from github" else echo "Could not download and verify license" exit 1 fi - CONTENT=$(curl -L https://github.com/KatherLab/MediSwarm/raw/refs/heads/main/README.md) + MAIN_README=$(curl -L https://github.com/KatherLab/MediSwarm/raw/refs/heads/main/README.md) for ROLE in 'Swarm Participant' 'Developer' 'Swarm Operator'; do - if echo "$CONTENT" | grep -q "$ROLE" ; then + if echo "$MAIN_README" | grep -qie "$ROLE" ; then echo "Instructions for $ROLE found" else echo "Instructions for 
role $ROLE missing" exit 1 fi done + + PARTICIPANT_README=$(curl -L https://github.com/KatherLab/MediSwarm/raw/refs/heads/main/assets/readme/README.participant.md) + for EXPECTED_KEYWORDS in 'Prerequisites' 'RAM' 'Ubuntu' 'VPN' 'Prepare Dataset' './docker.sh' 'Local Training' 'Start Swarm Node'; + do + if echo "$PARTICIPANT_README" | grep -qie "$EXPECTED_KEYWORDS" ; then + echo "Instructions on $EXPECTED_KEYWORDS found" + else + echo "Instructions on $EXPECTED_KEYWORDS missing" + exit 1 + fi + done + + SWARM_OPERATOR_README=$(curl -L https://github.com/KatherLab/MediSwarm/raw/refs/heads/main/assets/readme/README.operator.md) + for EXPECTED_KEYWORDS in 'Create Startup Kits' 'Starting a Swarm Training'; + do + if echo "$SWARM_OPERATOR_README" | grep -qie "$EXPECTED_KEYWORDS" ; then + echo "Instructions on $EXPECTED_KEYWORDS found" + else + echo "Instructions on $EXPECTED_KEYWORDS missing" + exit 1 + fi + done + + APC_DEVELOPER_README=$(curl -L https://github.com/KatherLab/MediSwarm/raw/refs/heads/main/assets/readme/README.developer.md) + for EXPECTED_KEYWORDS in 'Contributing Application Code'; + do + if echo "$APC_DEVELOPER_README" | grep -qie "$EXPECTED_KEYWORDS" ; then + echo "Instructions on $EXPECTED_KEYWORDS found" + else + echo "Instructions on $EXPECTED_KEYWORDS missing" + exit 1 + fi + done + + } @@ -271,7 +306,6 @@ cleanup_temporary_data () { case "$1" in check_files_on_github) check_files_on_github - cleanup_temporary_data ;; run_unit_tests_controller) From 630931ddcff6e47c54373d52d874d5ea46b70bfa Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 10 Sep 2025 15:17:07 +0200 Subject: [PATCH 178/337] clean up temp dir in case more than this test is run in a container --- tests/integration_tests/_run_3dcnn_simulation_mode.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration_tests/_run_3dcnn_simulation_mode.sh b/tests/integration_tests/_run_3dcnn_simulation_mode.sh index 030e855e..a39da49d 100755 --- 
a/tests/integration_tests/_run_3dcnn_simulation_mode.sh +++ b/tests/integration_tests/_run_3dcnn_simulation_mode.sh @@ -17,6 +17,7 @@ run_3dcnn_simulation_mode () { export MODEL_NAME=MST export CONFIG=unilateral nvflare simulator -w /tmp/ODELIA_ternary_classification -n 2 -t 2 ${TMPDIR}/ODELIA_ternary_classification -c client_A,client_B + rm -rf ${TMPDIR} } run_3dcnn_simulation_mode From 6b84f073477b0a175b3a6e4ccd4305d6a1f23681 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 10 Sep 2025 15:17:23 +0200 Subject: [PATCH 179/337] updated documentation of test output --- assets/readme/README.developer.md | 13 +++++++------ runIntegrationTests.sh | 5 ----- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/assets/readme/README.developer.md b/assets/readme/README.developer.md index 0f789529..1215e574 100644 --- a/assets/readme/README.developer.md +++ b/assets/readme/README.developer.md @@ -33,12 +33,13 @@ The project description specifies the swarm nodes etc. to be used for a swarm tr You should see 1. several expected errors and warnings printed from unit tests that should succeed overall, and a coverage report -2. output of a successful simulation run with two nodes -3. output of a successful proof-of-concept run run with two nodes -4. output of a set of startup kits being generated -5. output of a Docker/GPU preflight check using one of the startup kits -6. output of a data access preflight check using one of the startup kits -7. output of a dummy training run in a swarm consisting of one server and two client nodes +2. output of a successful simulation run of a dummy training with two nodes +3. output of a successful proof-of-concept run of a dummy training with two nodes +4. output of a successful simulation run of a 3D CNN training using synthetic data with two nodes +5. output of a set of startup kits being generated +6. output of a Docker/GPU preflight check using one of the startup kits +7. 
output of a data access preflight check using one of the startup kits +8. output of a dummy training run in a swarm consisting of one server and two client nodes Optionally, uncomment running NVFlare unit tests. diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 23339c58..dc0add80 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -69,8 +69,6 @@ check_files_on_github () { exit 1 fi done - - } @@ -380,6 +378,3 @@ case "$1" in ;; *) echo "Unknown argument: $1"; exit 1 ;; esac - -# TODO -# Once the 3D CNN simulation mode works, it should be mentioned in ./assets/readme/README.developer.md. From fe0a20388b297e988e060cbfc8188e73863fd316 Mon Sep 17 00:00:00 2001 From: Ultimate-Storm Date: Thu, 11 Sep 2025 06:10:20 +0200 Subject: [PATCH 180/337] chore: update apt versions in Dockerfile_ODELIA --- docker_config/Dockerfile_ODELIA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index a2e58d48..4a77cb7c 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -167,7 +167,7 @@ RUN apt install -y \ libxcb1=1.14-3ubuntu3 \ libxdmcp6=1:1.1.3-0ubuntu5 \ libxext6=2:1.3.4-1build1 \ - libxml2=2.9.13+dfsg-1ubuntu0.8 \ + libxml2=2.9.13+dfsg-1ubuntu0.9 \ libxmuu1=2:1.1.3-3 \ libxtables12=1.8.7-1ubuntu5.2 \ netbase=6.3 \ From 52ce9d4ad8f551b5fa30c7adc1710b07ede4d8df Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 15 Sep 2025 14:21:42 +0200 Subject: [PATCH 181/337] check that aggregation and metrics are communicated --- runIntegrationTests.sh | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index dc0add80..8fac9cad 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -252,7 +252,7 @@ run_dummy_training_in_swarm () { cd "$PROJECT_DIR"/prod_00/client_A/startup CONSOLE_OUTPUT=nohup.out - for EXPECTED_OUTPUT in 'Sending training result to aggregation 
client' 'Epoch 9: 100%' ; + for EXPECTED_OUTPUT in 'Sending training result to aggregation client' 'Epoch 9: 100%' 'val/AUC_ROC'; do if grep -q "$EXPECTED_OUTPUT" "$CONSOLE_OUTPUT"; then echo "Expected output $EXPECTED_OUTPUT found" @@ -263,6 +263,16 @@ run_dummy_training_in_swarm () { done cd "$CWD" + for EXPECTED_OUTPUT in 'validation metric .* from client' 'aggregating [0-9]* update(s) at round [0-9]*'; + do + if grep -q --regexp="$EXPECTED_OUTPUT" "$PROJECT_DIR"/prod_00/client_?/startup/nohup.out; then + echo "Expected output $EXPECTED_OUTPUT found" + else + echo "Expected output $EXPECTED_OUTPUT missing" + exit 1 + fi + done + cd "$PROJECT_DIR"/prod_00/client_A/ FILES_PRESENT=$(find . -type f -name "*.*") for EXPECTED_FILE in 'custom/minimal_training.py' 'best_FL_global_model.pt' 'FL_global_model.pt' ; From b2a7405579919499d9bdd9b2cde68351846c62d7 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 15 Sep 2025 14:40:55 +0200 Subject: [PATCH 182/337] check number of rounds --- runIntegrationTests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 8fac9cad..ce0db11a 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -239,7 +239,7 @@ run_dummy_training_in_swarm () { cd "$PROJECT_DIR"/prod_00/localhost/startup CONSOLE_OUTPUT=nohup.out for EXPECTED_OUTPUT in 'Total clients: 2' 'updated status of client client_A on round 4' 'updated status of client client_B on round 4' 'all_done=True' 'Server runner finished.' 
\ - 'Start to the run Job: [0-9a-f]\{8\}-[0-9a-f]\{4\}-[0-9a-f]\{4\}-[0-9a-f]\{4\}-[0-9a-f]\{12\}'; + 'Start to the run Job: [0-9a-f]\{8\}-[0-9a-f]\{4\}-[0-9a-f]\{4\}-[0-9a-f]\{4\}-[0-9a-f]\{12\}' 'updated status of client client_B on round 4'; do if grep -q --regexp="$EXPECTED_OUTPUT" "$CONSOLE_OUTPUT"; then echo "Expected output $EXPECTED_OUTPUT found" From 66398935e7519a84474bfa04823f85a3d1699452 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 15 Sep 2025 14:46:10 +0200 Subject: [PATCH 183/337] check that dummy training ApC is available --- runIntegrationTests.sh | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index ce0db11a..402dd4d8 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -69,6 +69,17 @@ check_files_on_github () { exit 1 fi done + + DUMMY_TRAINING_APC=$(curl -L https://raw.githubusercontent.com/KatherLab/MediSwarm/refs/heads/main/application/jobs/minimal_training_pytorch_cnn/app/custom/main.py) + for EXPECTED_KEYWORDS in 'python3'; + do + if echo "$DUMMY_TRAINING_APC" | grep -qie "$EXPECTED_KEYWORDS" ; then + echo "Dummy Training ApC: $EXPECTED_KEYWORDS found" + else + echo "Dummy Training ApC: $EXPECTED_KEYWORDS missing" + exit 1 + fi + done } From 0f7e50eb68cf2fc7e2d72189d17453ccffa5b7f4 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 16 Sep 2025 16:50:33 +0200 Subject: [PATCH 184/337] temporarily removed failing test from CI workflow --- .github/workflows/pr-test.yaml | 5 ----- runIntegrationTests.sh | 1 + 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/.github/workflows/pr-test.yaml b/.github/workflows/pr-test.yaml index abe73027..c97f16aa 100644 --- a/.github/workflows/pr-test.yaml +++ b/.github/workflows/pr-test.yaml @@ -94,8 +94,3 @@ jobs: continue-on-error: false run: | ./runIntegrationTests.sh run_data_access_preflight_check - - - name: Run integration test (run_dummy_training_in_swarm) - continue-on-error: false - run: | - 
./runIntegrationTests.sh run_dummy_training_in_swarm diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 402dd4d8..49f07caf 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -378,6 +378,7 @@ case "$1" in run_dummy_training_in_swarm kill_server_and_clients cleanup_temporary_data + # TODO add to CI if we want this (currently not working) ;; all | "") From f0f490fc9272b1b86549522f4b6b22730dbeedb1 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 16 Sep 2025 16:59:30 +0200 Subject: [PATCH 185/337] test listing licenses --- .github/workflows/pr-test.yaml | 5 +++++ runIntegrationTests.sh | 25 +++++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/.github/workflows/pr-test.yaml b/.github/workflows/pr-test.yaml index c97f16aa..0ffa7007 100644 --- a/.github/workflows/pr-test.yaml +++ b/.github/workflows/pr-test.yaml @@ -85,6 +85,11 @@ jobs: run: | ./runIntegrationTests.sh create_startup_kits + - name: Run license listing test (run_list_licenses) + continue-on-error: false + run: | + ./runIntegrationTests.sh run_list_licenses + - name: Run integration test (run_docker_gpu_preflight_check) continue-on-error: false run: | diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 49f07caf..25a1f05c 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -173,6 +173,25 @@ create_synthetic_data () { } +run_list_licenses () { + cd "$PROJECT_DIR"/prod_00 + cd localhost/startup + LICENSES_LISTED=$(./docker.sh --list_licenses --no_pull) + + for EXPECTED_KEYWORDS in 'scikit-learn' 'torch' 'nvflare_mediswarm' 'BSD License' 'MIT License'; + do + if echo "$LICENSES_LISTED" | grep -qie "$EXPECTED_KEYWORDS" ; then + echo "Instructions on $EXPECTED_KEYWORDS found" + else + echo "Instructions on $EXPECTED_KEYWORDS missing" + exit 1 + fi + done + + cd "$CWD" +} + + run_docker_gpu_preflight_check () { # requires having built a startup kit echo "[Run] Docker/GPU preflight check (local dummy training via startup kit) ..." 
@@ -358,6 +377,12 @@ case "$1" in cleanup_temporary_data ;; + run_list_licenses) + create_startup_kits_and_check_contained_files + run_list_licenses + cleanup_temporary_data + ;; + run_docker_gpu_preflight_check) create_startup_kits_and_check_contained_files run_docker_gpu_preflight_check From ceb33ff0fec8857824710354bf8e82243ce28296 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 17 Sep 2025 11:33:32 +0200 Subject: [PATCH 186/337] Added test of pushing image to local registry (in separate Docker container) and pulling it from there. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This required additional changes: * changed name of Docker image for testing to localhost:5000/…, which should also prevent accidental push * parse name of Docker image from swarm project description yml rather than use hard-coded name * extended "delete old image versions" script accordingly --- .github/workflows/pr-test.yaml | 5 ++ _buildStartupKits.sh | 8 ++-- buildDockerImageAndStartupKits.sh | 25 +++++----- runIntegrationTests.sh | 48 +++++++++++++++++-- .../remove_old_odelia_docker_images.sh | 5 +- tests/provision/dummy_project_for_testing.yml | 2 +- 6 files changed, 71 insertions(+), 22 deletions(-) diff --git a/.github/workflows/pr-test.yaml b/.github/workflows/pr-test.yaml index 0ffa7007..2f1f733c 100644 --- a/.github/workflows/pr-test.yaml +++ b/.github/workflows/pr-test.yaml @@ -99,3 +99,8 @@ jobs: continue-on-error: false run: | ./runIntegrationTests.sh run_data_access_preflight_check + + - name: Run integration test (push_pull_image) + continue-on-error: false + run: | + ./runIntegrationTests.sh push_pull_image \ No newline at end of file diff --git a/_buildStartupKits.sh b/_buildStartupKits.sh index 29755d27..94950376 100755 --- a/_buildStartupKits.sh +++ b/_buildStartupKits.sh @@ -2,15 +2,17 @@ set -euo pipefail -if [ "$#" -ne 2 ]; then - echo "Usage: _buildStartupKits.sh SWARM_PROJECT.yml VERSION_STRING" +if [ "$#" -ne 3 ]; 
then + echo "Usage: _buildStartupKits.sh SWARM_PROJECT.yml VERSION_STRING CONTAINER_NAME" exit 1 fi PROJECT_YML=$1 VERSION=$2 +CONTAINER_NAME=$3 sed -i 's#__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_STARTUP_KITS__#'$VERSION'#' $PROJECT_YML + echo "Building startup kits for project $PROJECT_YML with version $VERSION" docker run --rm \ -u $(id -u):$(id -g) \ @@ -20,7 +22,7 @@ docker run --rm \ -w /workspace/ \ -e PROJECT_YML=$PROJECT_YML \ -e VERSION=$VERSION \ - jefftud/odelia:$VERSION \ + $CONTAINER_NAME \ /bin/bash -c "nvflare provision -p \$PROJECT_YML && ./_generateStartupKitArchives.sh \$PROJECT_YML \$VERSION"|| { echo "Docker run failed"; exit 1; } sed -i 's#'$VERSION'#__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_STARTUP_KITS__#' $PROJECT_YML diff --git a/buildDockerImageAndStartupKits.sh b/buildDockerImageAndStartupKits.sh index 5c894a4f..654e63cd 100755 --- a/buildDockerImageAndStartupKits.sh +++ b/buildDockerImageAndStartupKits.sh @@ -25,7 +25,6 @@ if [ -z "$PROJECT_FILE" ]; then fi VERSION=`./getVersionNumber.sh` -DOCKER_IMAGE=jefftud/odelia:$VERSION CONTAINER_VERSION_ID=`git rev-parse --short HEAD` # prepare clean version of source code repository clone for building Docker image @@ -41,16 +40,15 @@ git clean -x -q -f . cd ../.. rm .git -rf chmod a+rX . 
-R + +# replacements in copy of source code sed -i 's#__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_DOCKER_IMAGE__#'$VERSION'#' docker_config/master_template.yml sed -i 's#__REPLACED_BY_CONTAINER_VERSION_IDENTIFIER_WHEN_BUILDING_DOCKER_IMAGE__#'$CONTAINER_VERSION_ID'#' docker_config/master_template.yml -cd $CWD - - # prepare pre-trained model weights for being included in Docker image -MODEL_WEIGHTS_FILE='docker_config/torch_home_cache/hub/checkpoints/dinov2_vits14_pretrain.pth' -MODEL_LICENSE_FILE='docker_config/torch_home_cache/hub/facebookresearch_dinov2_main/LICENSE' +MODEL_WEIGHTS_FILE=$CWD'/docker_config/torch_home_cache/hub/checkpoints/dinov2_vits14_pretrain.pth' +MODEL_LICENSE_FILE=$CWD'/docker_config/torch_home_cache/hub/facebookresearch_dinov2_main/LICENSE' if [[ ! -f $MODEL_WEIGHTS_FILE || ! -f $MODEL_LICENSE_FILE ]]; then echo "Pre-trained model not available. Attempting download" HUBDIR=$(dirname $(dirname $MODEL_LICENSE_FILE)) @@ -63,22 +61,25 @@ if [[ ! -f $MODEL_WEIGHTS_FILE || ! 
-f $MODEL_LICENSE_FILE ]]; then fi if echo 2e405cee1bad14912278296d4f42e993 $MODEL_WEIGHTS_FILE | md5sum --check - && echo 153d2db1c329326a2d9f881317ea942e $MODEL_LICENSE_FILE | md5sum --check -; then - cp -r ./docker_config/torch_home_cache $CLEAN_SOURCE_DIR/torch_home_cache + cp -r $CWD/docker_config/torch_home_cache $CLEAN_SOURCE_DIR/torch_home_cache else exit 1 fi chmod a+rX $CLEAN_SOURCE_DIR/torch_home_cache -R +cd $CWD # build and print follow-up steps +CONTAINER_NAME=`grep " docker_image: " $PROJECT_FILE | sed 's/ docker_image: //' | sed 's#__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_STARTUP_KITS__#'$VERSION'#'` +echo $CONTAINER_NAME -docker build $DOCKER_BUILD_ARGS -t $DOCKER_IMAGE $CLEAN_SOURCE_DIR -f docker_config/Dockerfile_ODELIA +docker build $DOCKER_BUILD_ARGS -t $CONTAINER_NAME $CLEAN_SOURCE_DIR -f docker_config/Dockerfile_ODELIA -echo "Docker image $DOCKER_IMAGE built successfully" -echo "./_buildStartupKits.sh $PROJECT_FILE $VERSION" -./_buildStartupKits.sh $PROJECT_FILE $VERSION +echo "Docker image $CONTAINER_NAME built successfully" +echo "./_buildStartupKits.sh $PROJECT_FILE $VERSION $CONTAINER_NAME" +./_buildStartupKits.sh $PROJECT_FILE $VERSION $CONTAINER_NAME echo "Startup kits built successfully" rm -rf $CLEAN_SOURCE_DIR -echo "If you wish, manually push $DOCKER_IMAGE now" +echo "If you wish, manually push $CONTAINER_NAME now" diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 25a1f05c..f8e68533 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -4,7 +4,7 @@ set -e VERSION=$(./getVersionNumber.sh) CONTAINER_VERSION_SUFFIX=$(git rev-parse --short HEAD) -DOCKER_IMAGE=jefftud/odelia:$VERSION +DOCKER_IMAGE=localhost:5000/odelia:$VERSION PROJECT_DIR="workspace/odelia_${VERSION}_dummy_project_for_testing" SYNTHETIC_DATA_DIR=$(mktemp -d) SCRATCH_DIR=$(mktemp -d) @@ -130,13 +130,13 @@ create_startup_kits_and_check_contained_files () { echo "[Prepare] Startup kits for test project ..." if [ ! 
-d "$PROJECT_DIR"/prod_00 ]; then - ./_buildStartupKits.sh $PROJECT_FILE $VERSION + ./_buildStartupKits.sh $PROJECT_FILE $VERSION $DOCKER_IMAGE fi if [ -d "$PROJECT_DIR"/prod_01 ]; then - echo '"$PROJECT_DIR"/prod_01 exists, please remove/rename it' + echo '$PROJECT_DIR/prod_01 exists, please remove/rename it' exit 1 fi - ./_buildStartupKits.sh $PROJECT_FILE $VERSION + ./_buildStartupKits.sh $PROJECT_FILE $VERSION $DOCKER_IMAGE for FILE in 'client.crt' 'client.key' 'docker.sh' 'rootCA.pem'; do @@ -256,6 +256,35 @@ start_server_and_clients () { } +start_registry_docker_and_push () { + docker run -d --rm -p 5000:5000 --name local_test_registry_$CONTAINER_VERSION_SUFFIX registry:3 + sleep 3 + docker push localhost:5000/odelia:$VERSION +} + + +run_container_with_pulling () { + docker rmi localhost:5000/odelia:$VERSION + cd "$PROJECT_DIR"/prod_00 + cd localhost/startup + OUTPUT=$(./docker.sh --list_licenses) + + if echo "$OUTPUT" | grep -qie "Status: Downloaded newer image for localhost:5000/odelia:$VERSION" ; then + echo "Image pulled successfully" + else + echo "Instructions on $EXPECTED_KEYWORDS missing" + exit 1 + fi + + cd "$CWD" +} + + +kill_registry_docker () { + docker kill local_test_registry_$CONTAINER_VERSION_SUFFIX +} + + run_dummy_training_in_swarm () { echo "[Run] Dummy training in swarm ..." 
@@ -396,6 +425,13 @@ case "$1" in cleanup_temporary_data ;; + push_pull_image) + create_startup_kits_and_check_contained_files + start_registry_docker_and_push + run_container_with_pulling + kill_registry_docker + ;; + run_dummy_training_in_swarm) create_startup_kits_and_check_contained_files create_synthetic_data @@ -416,6 +452,9 @@ case "$1" in create_synthetic_data run_3dcnn_simulation_mode create_startup_kits_and_check_contained_files + start_registry_docker_and_push + run_container_with_pulling + kill_registry_docker run_docker_gpu_preflight_check run_data_access_preflight_check start_server_and_clients @@ -423,5 +462,6 @@ case "$1" in kill_server_and_clients cleanup_temporary_data ;; + *) echo "Unknown argument: $1"; exit 1 ;; esac diff --git a/scripts/dev_utils/remove_old_odelia_docker_images.sh b/scripts/dev_utils/remove_old_odelia_docker_images.sh index 7da4ee25..5f25f6d3 100755 --- a/scripts/dev_utils/remove_old_odelia_docker_images.sh +++ b/scripts/dev_utils/remove_old_odelia_docker_images.sh @@ -1,6 +1,7 @@ #!/usr/bin/env bash export OLD_ODELIA_DOCKER_IMAGES=$(docker image list | grep jefftud/odelia | sed 's|jefftud/odelia *[0-9a-z.-]* *||' | sed 's| *.*||' | tail -n +2) +export OLD_ODELIA_DOCKER_IMAGES_LOCAL=$(docker image list | grep localhost:5000/odelia | sed 's|localhost:5000/odelia *[0-9a-z.-]* *||' | sed 's| *.*||' | tail -n +2) echo "All docker images:" @@ -8,12 +9,12 @@ docker image list echo "The following Docker images are old ODELIA docker images:" -echo "$OLD_ODELIA_DOCKER_IMAGES" +echo "$OLD_ODELIA_DOCKER_IMAGES" "$OLD_ODELIA_DOCKER_IMAGES_LOCAL" read -p "Delete these Docker images, unless they have additional tags? 
(y/n): " answer if [[ "$answer" == "y" ]]; then - for image in $OLD_ODELIA_DOCKER_IMAGES; do + for image in $OLD_ODELIA_DOCKER_IMAGES $OLD_ODELIA_DOCKER_IMAGES_LOCAL; do docker rmi $image done fi diff --git a/tests/provision/dummy_project_for_testing.yml b/tests/provision/dummy_project_for_testing.yml index 613f81ce..5e658c78 100644 --- a/tests/provision/dummy_project_for_testing.yml +++ b/tests/provision/dummy_project_for_testing.yml @@ -29,7 +29,7 @@ builders: args: config_folder: config scheme: http - docker_image: jefftud/odelia:__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_STARTUP_KITS__ + docker_image: localhost:5000/odelia:__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_STARTUP_KITS__ overseer_agent: path: nvflare.ha.dummy_overseer_agent.DummyOverseerAgent overseer_exists: false From d74a31ddb7bb3005be168c77ba0c81ba84657601 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 17 Sep 2025 14:29:40 +0200 Subject: [PATCH 187/337] removed lengthy test step that does not provide much value from CI pipeline --- .github/workflows/pr-test.yaml | 5 ----- runIntegrationTests.sh | 1 + 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/.github/workflows/pr-test.yaml b/.github/workflows/pr-test.yaml index 2f1f733c..0ffa7007 100644 --- a/.github/workflows/pr-test.yaml +++ b/.github/workflows/pr-test.yaml @@ -99,8 +99,3 @@ jobs: continue-on-error: false run: | ./runIntegrationTests.sh run_data_access_preflight_check - - - name: Run integration test (push_pull_image) - continue-on-error: false - run: | - ./runIntegrationTests.sh push_pull_image \ No newline at end of file diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index f8e68533..e63bf993 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -430,6 +430,7 @@ case "$1" in start_registry_docker_and_push run_container_with_pulling kill_registry_docker + # TODO add to CI if we want this (takes several minutes) ;; run_dummy_training_in_swarm) From 
0be2cd5cacff9925999400dc118c9c4b41dce087 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 17 Sep 2025 14:29:08 +0200 Subject: [PATCH 188/337] more speaking names of the CI test steps --- .github/workflows/pr-test.yaml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/pr-test.yaml b/.github/workflows/pr-test.yaml index 0ffa7007..c54b5ce9 100644 --- a/.github/workflows/pr-test.yaml +++ b/.github/workflows/pr-test.yaml @@ -50,52 +50,52 @@ jobs: - name: Build Docker image and dummy startup kits run: ./buildDockerImageAndStartupKits.sh -p tests/provision/dummy_project_for_testing.yml --use-docker-cache - - name: Run integration test (check_files_on_github) + - name: Run integration test: check documentation on github continue-on-error: false run: | ./runIntegrationTests.sh check_files_on_github - - name: Run integration test (run_unit_tests_controller) + - name: Run controller unit tests continue-on-error: false run: | ./runIntegrationTests.sh run_unit_tests_controller - - name: Run integration test (run_dummy_training_standalone) + - name: Run dummy training standalone continue-on-error: false run: | ./runIntegrationTests.sh run_dummy_training_standalone - - name: Run integration test (run_dummy_training_simulation_mode) + - name: Run dummy training in simulation mode continue-on-error: false run: | ./runIntegrationTests.sh run_dummy_training_simulation_mode - - name: Run integration test (run_dummy_training_poc_mode) + - name: Run dummy training in proof-of-concept mode continue-on-error: false run: | ./runIntegrationTests.sh run_dummy_training_poc_mode - - name: Run integration test (run_3dcnn_simulation_mode) + - name: Run 3DCNN training in simulation mode continue-on-error: false run: | ./runIntegrationTests.sh run_3dcnn_simulation_mode - - name: Run integration test (create_startup_kits) + - name: Run integration test: creating startup kits continue-on-error: false run: | ./runIntegrationTests.sh 
create_startup_kits - - name: Run license listing test (run_list_licenses) + - name: Run intergration test: listing licenses continue-on-error: false run: | ./runIntegrationTests.sh run_list_licenses - - name: Run integration test (run_docker_gpu_preflight_check) + - name: Run integration test: Docker/GPU preflight check continue-on-error: false run: | ./runIntegrationTests.sh run_docker_gpu_preflight_check - - name: Run integration test (run_data_access_preflight_check) + - name: Run integration test: Data access preflight check continue-on-error: false run: | ./runIntegrationTests.sh run_data_access_preflight_check From 54f91823fccd63dea0da67356c3bc9cc59cbdf0b Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 17 Sep 2025 14:35:55 +0200 Subject: [PATCH 189/337] fixed syntax of workflow --- .github/workflows/pr-test.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/pr-test.yaml b/.github/workflows/pr-test.yaml index c54b5ce9..950064f2 100644 --- a/.github/workflows/pr-test.yaml +++ b/.github/workflows/pr-test.yaml @@ -50,7 +50,7 @@ jobs: - name: Build Docker image and dummy startup kits run: ./buildDockerImageAndStartupKits.sh -p tests/provision/dummy_project_for_testing.yml --use-docker-cache - - name: Run integration test: check documentation on github + - name: Run integration test checking documentation on github continue-on-error: false run: | ./runIntegrationTests.sh check_files_on_github @@ -80,22 +80,22 @@ jobs: run: | ./runIntegrationTests.sh run_3dcnn_simulation_mode - - name: Run integration test: creating startup kits + - name: Run integration test creating startup kits continue-on-error: false run: | ./runIntegrationTests.sh create_startup_kits - - name: Run intergration test: listing licenses + - name: Run intergration test listing licenses continue-on-error: false run: | ./runIntegrationTests.sh run_list_licenses - - name: Run integration test: Docker/GPU preflight check + - name: Run integration 
test Docker GPU preflight check continue-on-error: false run: | ./runIntegrationTests.sh run_docker_gpu_preflight_check - - name: Run integration test: Data access preflight check + - name: Run integration test Data access preflight check continue-on-error: false run: | ./runIntegrationTests.sh run_data_access_preflight_check From 1c31e0260a0b98c71ac7b4e1d9bc4ff0e86edf9e Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 17 Sep 2025 15:01:43 +0200 Subject: [PATCH 190/337] do not need -it for listing licenses --- docker_config/master_template.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker_config/master_template.yml b/docker_config/master_template.yml index ec989a7b..42e81588 100644 --- a/docker_config/master_template.yml +++ b/docker_config/master_template.yml @@ -776,7 +776,7 @@ docker_svr_sh: | --ipc=host $NETARG $DOCKER_IMAGE \ /bin/bash -c "nohup ./start.sh >> nohup.out 2>&1 && chmod a+r nohup.out && /bin/bash" elif [ ! -z "$LIST_LICENSES" ]; then - docker run -it --rm --name=$CONTAINER_NAME \ + docker run --rm --name=$CONTAINER_NAME \ $DOCKER_IMAGE \ /bin/bash -c "pip-licenses -s -u --order=license" elif [ ! -z "$INTERACTIVE" ]; then From baee7cd9eb2de8b8d7ac54a04a79274f49a4d551 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 19 Sep 2025 13:33:38 +0200 Subject: [PATCH 191/337] implemented test that client with incorrect startup kit cannot connect --- runIntegrationTests.sh | 54 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index e63bf993..1dc9dc3a 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -285,6 +285,51 @@ kill_registry_docker () { } +verify_wrong_client_does_not_connect () { + echo "[Run] Verify that client with outdated startup kit does not connect ..." 
+ + cp -r "$PROJECT_DIR"/prod_01 "$PROJECT_DIR"/prod_wrong_client + cd "$PROJECT_DIR"/prod_wrong_client + cd localhost/startup + ./docker.sh --no_pull --start_server + cd ../.. + sleep 10 + + rm client_A -rf + tar xvf "$CWD"/tests/integration_tests/outdated_startup_kit.tar.gz + sed -i 's#DOCKER_IMAGE=localhost:5000/odelia:1.0.1-dev.250919.095c1b7#DOCKER_IMAGE='$DOCKER_IMAGE'#' client_A/startup/docker.sh + sed -i 's#CONTAINER_NAME=odelia_swarm_client_client_A_095c1b7#CONTAINER_NAME=odelia_swarm_client_client_A_'$CONTAINER_VERSION_SUFFIX'#' client_A/startup/docker.sh + + cd client_A/startup + ./docker.sh --no_pull --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir "$SCRATCH_DIR"/client_A --GPU device=$GPU_FOR_TESTING --start_client + cd ../.. + + sleep 20 + + CONSOLE_OUTPUT_SERVER=localhost/startup/nohup.out + CONSOLE_OUTPUT_CLIENT=client_A/startup/nohup.out + + if grep -q "Total clients: 1" $CONSOLE_OUTPUT_SERVER; then + echo "Connection with non-authorized client" + exit 1 + else + echo "Connection rejected successfully by server" + fi + + if grep -q "SSLCertVerificationError" $CONSOLE_OUTPUT_CLIENT; then + echo "Connection rejected successfully by client" + else + echo "Could not verify that connection was rejected" + exit 1 + fi + + docker kill odelia_swarm_server_flserver_$CONTAINER_VERSION_SUFFIX odelia_swarm_client_client_A_$CONTAINER_VERSION_SUFFIX + rm -rf "$PROJECT_DIR"/prod_wrong_client + + cd "$CWD" +} + + run_dummy_training_in_swarm () { echo "[Run] Dummy training in swarm ..." 
@@ -433,6 +478,14 @@ case "$1" in # TODO add to CI if we want this (takes several minutes) ;; + check_wrong_startup_kit) + create_startup_kits_and_check_contained_files + create_synthetic_data + verify_wrong_client_does_not_connect + cleanup_temporary_data + # TODO add to CI if we want this + ;; + run_dummy_training_in_swarm) create_startup_kits_and_check_contained_files create_synthetic_data @@ -459,6 +512,7 @@ case "$1" in run_docker_gpu_preflight_check run_data_access_preflight_check start_server_and_clients + verify_wrong_client_does_not_connect run_dummy_training_in_swarm kill_server_and_clients cleanup_temporary_data From d9b7771da9eff6a2503045484de50133ae497537 Mon Sep 17 00:00:00 2001 From: Ultimate-Storm Date: Sun, 21 Sep 2025 06:10:07 +0200 Subject: [PATCH 192/337] chore: update apt versions in Dockerfile_ODELIA --- docker_config/Dockerfile_ODELIA | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 4a77cb7c..27df6e9f 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -120,11 +120,11 @@ RUN apt install -y \ dbus-user-session=1.12.20-2ubuntu4.1 \ dbus=1.12.20-2ubuntu4.1 \ dmsetup=2:1.02.175-2.1ubuntu5 \ - docker-buildx-plugin=0.27.0-1~ubuntu.22.04~jammy \ + docker-buildx-plugin=0.28.0-0~ubuntu.22.04~jammy \ docker-ce-cli=5:28.4.0-1~ubuntu.22.04~jammy \ docker-ce-rootless-extras=5:28.4.0-1~ubuntu.22.04~jammy \ docker-ce=5:28.4.0-1~ubuntu.22.04~jammy \ - docker-compose-plugin=2.39.2-1~ubuntu.22.04~jammy \ + docker-compose-plugin=2.39.4-0~ubuntu.22.04~jammy \ gir1.2-glib-2.0=1.72.0-1 \ git-man=1:2.34.1-1ubuntu1.15 \ git=1:2.34.1-1ubuntu1.15 \ From 0baaedc0b37ec289f6ea9723e5fce023298e9cad Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 22 Sep 2025 11:39:55 +0200 Subject: [PATCH 193/337] need to use correct startup kit --- assets/readme/README.participant.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/assets/readme/README.participant.md b/assets/readme/README.participant.md index e93eaf2a..08f5fe6f 100644 --- a/assets/readme/README.participant.md +++ b/assets/readme/README.participant.md @@ -73,7 +73,7 @@ The dataset must be in the following format. ## Prepare Training Participation -1. Extract startup kit provided by swarm operator +1. Extract the startup kit provided by swarm operator for the current experiment. ### Local Testing on Your Data @@ -164,3 +164,4 @@ For any issues, check if the commands above point to problems and contact your S * The directories listed as identifiers in the tables `annotation.csv` and `split.csv` should all be present, only those directories should be present * The tables should not have additional or duplicate columns, entries need to have the correct captitalization * Image and table folders and files need to be present in the folders specified via `--data_dir`. Symlinks to other locations do not work, they are not available in the Docker mount. +* The correct startup kit needs to be used. `SSLCertVerificationError` or `authentication failed` may indicate an incorrect startup kit incompatible with the current experiment. 
From 611813b3c3f6ffbf01c921d348598684aa10b2c3 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 27 Aug 2025 09:24:12 +0200 Subject: [PATCH 194/337] extended troubleshooting --- assets/readme/README.participant.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/assets/readme/README.participant.md b/assets/readme/README.participant.md index 08f5fe6f..3cf6bd4c 100644 --- a/assets/readme/README.participant.md +++ b/assets/readme/README.participant.md @@ -160,8 +160,9 @@ For any issues, check if the commands above point to problems and contact your S ## Troubleshooting +* Folders where files are located need to have the correct name * Image files need to have the correct file name including capitalization -* The directories listed as identifiers in the tables `annotation.csv` and `split.csv` should all be present, only those directories should be present +* The directories listed as identifiers in the tables `annotation.csv` and `split.csv` should all be present and named correctly (including capitalization), only those directories should be present * The tables should not have additional or duplicate columns, entries need to have the correct captitalization * Image and table folders and files need to be present in the folders specified via `--data_dir`. Symlinks to other locations do not work, they are not available in the Docker mount. * The correct startup kit needs to be used. `SSLCertVerificationError` or `authentication failed` may indicate an incorrect startup kit incompatible with the current experiment. 
From 3b547beffd58565bb6e2bd8ce926cccbc3ed3b74 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 23 Sep 2025 14:51:13 +0200 Subject: [PATCH 195/337] added file forgotten in baee7cd9eb2de8b8d7ac54a04a79274f49a4d551 --- .../outdated_startup_kit.tar.gz | Bin 0 -> 9917 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/integration_tests/outdated_startup_kit.tar.gz diff --git a/tests/integration_tests/outdated_startup_kit.tar.gz b/tests/integration_tests/outdated_startup_kit.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..ba3a984ec77afe5d8cfe1dd3747b3d0b57306e39 GIT binary patch literal 9917 zcma)fMN}LLpeqz8?(W6i-QBIo;10#LxVsm38>F}tEACL-eSi*5ad#M)cmLnNb6!@- z$uf&1lnF>M{~vJYhJMML^36}%uXum`$#7NtH9hEnH01;wV} zggiMrux}~>Q=|6fu8jv`ttkO3{WX`reo=^ENE&XVFA#>1TcqgcT>=GF_9-POPY~3t z(3H&KWzsk}3ySqm5u~9}!eW}*E4p!CIGQIPjZ!`=x;a*$Y7@b~Lre_q55sRc+#)4h zLs0sH+oA2n3~$*3-*McGicTBYfAVn3Xogsa6wn{fU1&7jqr$(P=0PytCEB49{j$Bt zZj$yZ15&i+nf59Ef6>m5iOI7|wf0(b7 z#s!vJ*qS~ap03PW#$b~+4#wpS1Khi-#;Jsq#u;?QH6PLt3kQvOr?kzzdiyW?ZBRc- z&vH#ZjH2F2yzc$qFMjvqH?P$8=G0#oLppYWMB8 zatztDnF;Zxme|SmP z{+-T6Q%kmgDd2@qaZv4GzUN)2aXph~MsVTD3kw3}O^E|sm<$)?cfXmPm%(E3xDyjk zS_+wwH8xEOvd~fBl@{mk+Ogjl8fJ(}B{d$i)?0ZCr7=w8n4~4Dzrp*vlvmRiBHroz zQMthh=<;?L+#_3wmxAGZh)0n0Ax=`_1x9V~{?MR3WLHd7oJCa10ipIuy*XxP*esb3 zv7#X#m+RWKc&6%2=2a@I;#OJ)7$#>Xb`7mHg|d65A3muH#&4*OoPDLt1LP#T)d

kY*LoZpd8U&ZboWufHzEiX3n*9?3%7-(^Ri)67m|NEo_(^&_}VJUJ4M8yQAKF z-yMi^)N}DY@eZZ5DJ?6}H3Ux3T;{L$R6`aQ&wl7aWoCSy*9H==`xJsPoE zVq;ndcu+ba%Na?wr|)D;*Pt&IuV4lFPOPZ?&GDLGfiuo`2$5mSXl*1^eD+YYdXu?j zODL~ayGM*JGTvjPYQ$$O$D`>lJjOY8cIpn;>YePtOK~nLYUVE5yi~zTIlpTMgCuPp zMJ}U#YB2I4=}3z1yv|OYBkl%h`T6x=+4bW1*kryh1wmm(pZ{D`>Be5M3Pp+`EbrR6 zzM2M@$>SkgN1j~gCzW$tnd@<`as3ClNwh{X%(jo2>~YvEknN>fZQdsqduLVG=?BaG zyLI6DZ*;(3qh?y1-n4C=2Z0~6;oC=3+`l0I#-hBp2u+pQGD;$&x6u8?D;L0 z8yhD$tQBX-{!;H&5+bc2U)Ro2!40%G6`4uCT743$U{U;yar5ZgJT#gDQdK5g+OiqA zOroyta;>=|l}!mtZgG5IHw(@8$H7{O+hZ^+ z3wZ#W@^6u!sNKsCLFAV;`}67=L6y7HO@iAe*%Xg^yFMCbm2_h74B9Jg4-733a~==!}D#rGDq4WBn>KLTemV2N)XQ0i z_+})+_5}~Tyojz+^)iS~+{+s4#I5Xz(n=$=X8Np${smx>xhtugwQh_fz6bFIx8nIplUvl4P=n)5A+ zS%=nv_XNd4%0G9SH!18=?@P5!zc%@I_%bmtuXn{}Z?WfISOxQbP1q1pe@qM}aw1+j zqS5A~Zss~Qp+8r1?AIE_RegN$kJ-CfEy~@TvCkwp3(nx~v2!f6K5&e50mj zqw)yze>WwgPzErSsQa)0?sjcZ8Q#PeH5jfij)@q9f;)2my=%MZ5b~LajsIp;%=@sU zDmYl_5}IBL>e!(+KtZWj%^?vUsR_xATaNmR);2>Sz(5y+;Czt=0Cj+V*^G5`{pCoq zs9}=NNyG#sxF|uIyHX-Pq=u8sD5_j z%aF}Ox|wY@tONNGE^2u#e#2gm4ECg;Cr=eA{yY~ zV*bHaqls%g$DcpaAq1Rl-l-$Cl3}%;BjLT7$F4-{B;39HVu9_xrH+XJvd^tDznhsS zu1^l7;3^^HjTCD}GZ0&6m;ISRn4z8lXz+iRMgb<#hGOZnZHbj^+L zFOk46E%3qyEXbWjL?i_&K18Axf%6NioI_VZ^hSP?H!tBv#Tw2_r;xLaGv=&vn8@5#Vv;5ogB^uTR% z!o4`8-Rf0Bre6#yMvmB*_k*JV>K^_9fy`kMeY``S*p5R!!uSDk6ybOXC)h&zncKLs z5AQQmda)N@z{ALsj%}C!4mluS^TGCKiNYk%$al!@XTP8b2&&frwpzZRd~C>jD2)Up z^q?Nj(SP8P=Rm$sqmb{~m6|Fm9o{}0QrAr6x!zY?LlUYJtSw@YFJ6Io!K&1CZ;_3` z+Cr9bLwq87Crc4G)wJ*$TGZ0@Q~VSTS6x9og!621t6p%eQC z+eoUNCHFETry1}*pAfP5>DdYf)mHH885;k zE$Z~uq-7wvB0|}F`n0Yo=z#8?>i&@+sK#^O@giAbNa6MDbS>oL7e;MeFv;YM72;Qe z$8Dza*Y?uwI%p9cQ(yECjsqws0Tl9)JhFcE((fwWKVaYuSLtoJ;^U@vdTDO%rr2{5 zf0bR05)Ab$Oz1Zbv5;#g_oTA^7b@6zd2Mt~5Q6FxX4H}HA?TCf{yUKpBvPrAb_X6d znIX^kt~VS-r`Ve>kX$EB90qm|IM~A95bwy|S@R0M`+8II^!#=Q=ndcqI`!+N4(Leh z42X4?i1*zV73eZ8>3yTMztOz7W83W3P!DSyiK+ct2Zs=PJ9{ghT4eO_fRs)F1~CuT zyMnjiuI*!tN?42~GSC}DDgGFh>jg#rQOn#8Tkh=Mf-}Wb0Nq&O8`7L?>kF%zl5t 
zpU&>8$<%n|YFmrw^Kf--^7gtx`fB6e$5f(OZ{KX+fL!n^vfs-o{_SlE&n}2&Ls0Acw?BQqKd7 zbO`eYRiw!90PhLR@#WA|ozMVLH|o#)W#PXun{!a$S0liEa?)jX3b)i&mqyk&9AdOn zKjElYrr7bld#Y6!t;*G<$o82R%IVmk_l`tT%hM?!_}4otU1~e}Ex!~6;Y|71Lzre}Ur6M#OW>)gSQudyzYX%@mNl^@x zi_VIa-`e5A2(v8!z3xwJlmy#}DTjtJQ2j0*v)>>96&deQVmvl=ToEr;qa0sO738AjEzxg3n7r6yR`FQt?36xgvWYjx-6cB{70f4CCINCCkP z@7#l?1*PPIrpO1^WQs$yGVHO)QJrxOLf%78F1#W}4q^^%q-hh9^VquV_OIQu&Q8;- zVriEjSG)XNmQ45iwhq*9_^R&VQ#cFnEDvvqDpUft^pR6}D9J>DxQZqE6hbcyOTc&c zPuXTow82NxzSW+DH37p@aisJ*e8cc?WG?^{)piFvgj|1RT zt?@#lz6PV72;o03YV7G@z@hNO7;|6eQUM8XHfah@!5A=ID}yM~DX#sg@9W|llrKHk z&?2b?0n&Zc=YGTLHJ5j&a?49L6ec$#3u^lPv;R%3Pkes@Znu z+<+(g?yq(RdY;n^`Y|BUv~Li>dV51B@)ioB%=GP;iorrK91kx&(pc2ps&=oCKG_1W7tz^Go@~Kxp!{LV~^O$ z%Tk_p+|ibF3PMF|O81yUWnRT|0r_ex^CCxpvt|8$^b&T~-9mGx@$a?AK3o9Uv~gG- zq??ygJ4|*(0%1P@__bEICXeY6L9Ryu zkO_!9`r5AQA&NWc<^Y0BsiuKM%ZaeMn{WgNVk)~<1=rfn+nH|2>Hv*0`}iTmtX4y~ zsPEZ8yQK2X-1z2P>7Dy&G0Op0y1W~DW7~4wrB;73vO*<~p_6(f*V7FLVq#+@U_gB< z9>=#PF&DN^mXm5Rv5i?n8%}V{jo^U`koy>Mj&m^{?oZ}ix1xk+S z0H-ZSp5CMYR(I`i1uZ}gQF*RvLOJN3AajKm>%rUp;O@eLCN{5LEya9~npw|5n(>b9 zhsu7UGbQPCy!sj{N=ZO16U}&Oa7NXa^(!I|IYu~p+uh`3MH_BApufHRhZFA8+TG}# zGUO>A_&#`~^0KXV?hSS2H3}(&24x>5i$b+co|Mq*t>JiCKZ$+3x)xoArmJ@amrCR@ zi;@N)N_4Qm744)ZH!84SG;E4F#y`V&Psr-aW^ z8dk6w9IP?U0FSCn%#M58+DI!RriWAP+I-9Pko$oQDf~^95!U6Sp8D-Rc?mCn6FV#|7?Xp^rv-dlHn|NL6 ze%9Rz#o76{cOnge>g70Fqi{;kN&H&psr=nlBg9Vv={6xzTF4`L&#n>vrW?(KgZmrL$SMMW||E=Ej^he+_v48=Y zyO<31zA(6Tbm_2kbt~=hSl>$*ziEWyWy5>TGp{ZAB6Z8O#2VWOH?sBW^*KM2Nwk&) z``I)3%Dy!4kKX~GF7HR|-J(Ndt) zzu7ad!bHTS(G5d~{3P71?!uZ5-7nvJy)F{P+wr`Lbnt==2~Ule9M;|z;qM9?TdcA2 zHYNky8yzRY`mm4H0hSEmdkr~~bxfNwdO|rKg=ww2R;~3(P|ATk!Z)Z8>Dy-FO&in< zmK2%*(Lk$(wn9UakNBWa@Cm8cb8bLtI#LU?Ygk*@>d`q6h#NO#Bf|H)1&ODsO-}~= z$U2V-{39-jsre|MhDT&@8`^@_?|=zlYxDW6+}sE!a^L&+eXig#hN!+cn+sN2IB@aU_|Ej)0{MT=4T80@L-K`@9*pq~KFv_DI84$xpOo zyq8MM;>X&ZgUb&vm>38lXS-IyP-317g_p)LNnBwFE!irH7d(s`qJuES7_4;nA64x~ z%H8Tpo;%dl14+Ymakc)?H9vXF*zk%aXNVOS3m488x$eGbz)HH#)%#lTl4DR_G5f6H 
z-#JyVmSIF9ooZ`iq|n6)PPK^MH4nbTM^>K4M?M`ZmUYWyohWHoC(|Qx7+D)oWXqrp z(hCGy#Zq%nYft@%h>J`pexp*kTbw#2BuTrQy2WAJEBiBfKeZtVE&Be>5ea7#O0J}l zc1^4PGN*KJwnZ2(^gxD5zx*i{^}o6HfWFMgeUB-rASpwnhv5G~0FQ7=^qOz^QrI15 z{><9V3Cr3-o#q{x`n&d|BSKM>rT=S3KFAOVAS?Fy?K5c?ZQ#T?n`JT$<3+d^(ijT4 z4*e)2t;p-RLHylwDFOp+RE?&qhPWp-S?F<2yaz|(YP>+e(;Q4rR zsoh55;qjHzy#(rW>}v5-U$w!b8CHe>A862_ADnG${PlWzT+U=*O3sHPmvMEEup03p z6nh~NuQ?^ERpwq{#Ve%ei?M+D-T}>{>Ti5nTg!x0U^P64glNG=0b@W+y!=7;fH8s1 z2WyF|H`0B<;e#axwsY$Z>>2fjL#wGD`uKjL?^O)O?{$gdGRG=YoG+s2tAdRP1D?b; zG}AFXQ>A?+apDSrX7uDeGSz`bEdtjRdFS3~OLVO3L9|G8DR72?mO=$3ltt-mqys8U zN1eYY#k0@+xh6VadVu>wU*IL%sW@H{s9wZ%G9+ct^lty7qp~P4SguRVtCtb7F?6EVi zQasehH%HIo`xf`X)oU#$6RavPGRL;my{Q*wZf1wE2KDDab#TQPyr3R$gyMK~>dhYbfnMhYhXA8^b8+`lmCLO^?4 zI&?ds?x#W^-)l4`=vRVYueiro_5VO|Lhdi1Nje@&w$pt?Reccd#6IHZc`)W7M>a4jL=;d*>YA9*B>jHxzS zz!W!b2%|WSXrc#rc;=JzokNu&Z}?|$uZue}Y>U`102kBAd2S59IOtbKpxWt)fb zxf3GMDE|M3P%Lz%CUZ3b(%-)a-9GA!g~s8Mk;m2jP~jCW&h$V_!EnnOxS2cZ=*`@V z4pd9pG-7fYUpx5UB-04`Q&^)t*rne4K=zOn?t72yKV=~RO$ZmcWvfEx^# za*X>M*0SKYRR3XTr@a~{bh7VLE1PBC$r4hjmGE1*q{DZC;`b^TC(EoC$CNr* z=}j(7|FS(AHk3v-KuVleJD1G%DwOm7`ZULYwbjqRVwc^r)i=F1xVWU56mVVtdu1a% zW=W4qWXqUJSLR4*d>m7Qpc-tEJ4r61mhI|!?L{74k%MAIo(7|n*+B1wWsKD2dlKcj zCXDY|`tj=}3p2p#i_L*lt|A}2!`!q!J>aSw# z?NwcorqCTWGd<&7-;CI~JNNqF&pOS2_)Y92R2b3CY#XPd8X z=KFpFtEoBX?(uRydXyojQJkv~yasmucb_JKzc$qy0w0yVwHKN?F+xnJQ zeOGG#B93ES1(RZby3oVJwqkrOtFq&UzAuKyAJ0zr&GZVi&oMY@OxcT7ij^N)4B#=# zwXO58u9;rR|9th-%C6aKLtro$;9F$HVUa77JVyE{F40#gu*$NQ{#i?D+;(|2Ldd*q zdXXu~pq#gQntw^YoPWnV%@qZ_VAEmG2l-G3X!*(Nd#LZa(=+S zNHb{93@TH2X8x-yQ3s#e2)UdDqW{+wDK#|@ACedB@r#R9Iw}YPn*4rOtLF?IQR9ki z9=uOMeSFo~bYTq)B#_2syf>{S)h~Kp&A2|f`ox9R>jtrhF?a&T021S)ux(bhfI8=# zE$(4w{~GGtt|OyW!37hB4vYCI`OZDvnjtsdjpJgFQ#lIY$zGrbJd2>PZH?Z6AY(I& z>c%y(zTYfh>RL2F#w%<;>~iWH^!4_6ap~gPe!=idf|2!0Yj*z`d!?2G;{$^hP2~aV z4_9etv3>sf7=zVMK8Il~pqVZ{M7-rKb8(o8@ozjDi2;3U^$jU-<<5jZx~qiD@cK~< zzfJ!2@jo-c$p7AX>Ef~=n(wOe>CLNc3n{i?E3WgKNPi!pmZ^&-uzSRP)BDnauNSa9 
z4e`I^s#%`}`%ydITuS41tLIzQBmQ}mtG7$ov)cq!n+Nwd3maFM;jM#(X$B~Eve?wL z92qflb=86z|IMhaYF{b->nukVjEpk=^mwo zmH8q5!jSDtL=fuAZ&nQHzX}IIlkMGglloOrefx)JqRo27Y9W+SpqFj5 zk55R6&}}%5n~*=q{A6ISwq{gS@?UP>`N?l}tLEbaP+qqA(j z^V7a3Lw3%4mz`i&%>{A1+dGh_YGDDE7Zz zOYO0fXjvEwJf?IDdT!vKBkK4k1*Z>z9adtj9Y_U;a=-OiBINPIEOmzP1Olaa{s-L` z^bfWkPht#yOR(sh!B^o)s-`k=6jOY-i$I77$QUT2YYi&pz*<)BxThfIjs@>%cTM;0 z-7yc7z3}d*QoU^ol%5^OcV=TS*KK{rnIqGQI|)_j$k(t-NE=^)JWM_uRc+nIU0dl- z&5UWFXx)4|;(5J393Io~B|_(z5Tof=3+dhGU=aTaeIz2d3;A|9TLR~Oz&Uv3|M@x| z+7jGBX#{OK;eCMm3?4o}@$*&FLt$fbcPrPEZiZHX3+X9Bxa@vsl8wlP>s0TG)^QTo z|0Yodsm@I7 Date: Tue, 23 Sep 2025 15:33:13 +0200 Subject: [PATCH 196/337] chore: update apt versions in Dockerfile_ODELIA --- docker_config/Dockerfile_ODELIA | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 27df6e9f..0dd9d9c1 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -28,10 +28,10 @@ RUN apt install -y \ e2fsprogs=1.46.5-2ubuntu1.2 \ gpgv=2.2.27-3ubuntu2.4 \ libblkid1=2.37.2-4ubuntu3.4 \ - libc-bin=2.35-0ubuntu3.10 \ - libc-dev-bin=2.35-0ubuntu3.10 \ - libc6-dev=2.35-0ubuntu3.10 \ - libc6=2.35-0ubuntu3.10 \ + libc-bin=2.35-0ubuntu3.11 \ + libc-dev-bin=2.35-0ubuntu3.11 \ + libc6-dev=2.35-0ubuntu3.11 \ + libc6=2.35-0ubuntu3.11 \ libcap2=1:2.44-1ubuntu0.22.04.2 \ libcom-err2=1.46.5-2ubuntu1.2 \ libext2fs2=1.46.5-2ubuntu1.2 \ From d6507acc84edbc24a82f09619bcdb460f55d896e Mon Sep 17 00:00:00 2001 From: Ultimate-Storm Date: Wed, 24 Sep 2025 06:10:04 +0200 Subject: [PATCH 197/337] chore: update apt versions in Dockerfile_ODELIA --- docker_config/Dockerfile_ODELIA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 0dd9d9c1..d1d0ac72 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -53,7 +53,7 @@ RUN apt install -y \ libtasn1-6=4.18.0-4ubuntu0.1 
\ libudev1=249.11-0ubuntu3.16 \ libuuid1=2.37.2-4ubuntu3.4 \ - linux-libc-dev=5.15.0-153.163 \ + linux-libc-dev=5.15.0-156.166 \ logsave=1.46.5-2ubuntu1.2 \ mount=2.37.2-4ubuntu3.4 \ openssl=3.0.2-0ubuntu1.19 \ From b4c5f2c03e5ef4045d3c6b133317bd70cae84fd0 Mon Sep 17 00:00:00 2001 From: GitHub CI Date: Wed, 24 Sep 2025 11:14:03 +0200 Subject: [PATCH 198/337] fix: update DATADIR path in pr-test.yaml for correct directory reference Signed-off-by: GitHub CI --- .github/workflows/pr-test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr-test.yaml b/.github/workflows/pr-test.yaml index a75c14a5..608349b7 100644 --- a/.github/workflows/pr-test.yaml +++ b/.github/workflows/pr-test.yaml @@ -17,7 +17,7 @@ jobs: timeout-minutes: 45 env: - DATADIR: /mnt/swarm_alpha/Odelia_challange/ODELIA_Challenge_unilateral/ + DATADIR: /mnt/sda1/Odelia_challange/ODELIA_Challenge_unilateral/ SCRATCHDIR: /mnt/scratch SITE_NAME: UKA PYTHONUNBUFFERED: 1 From 1bdc5fd18a819f013f94b11ace2baa4091989fa5 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 25 Sep 2025 11:12:17 +0200 Subject: [PATCH 199/337] more consistently use double [[ ]] in bash scripts --- docker_config/master_template.yml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/docker_config/master_template.yml b/docker_config/master_template.yml index 7ed98f3f..61daadc0 100644 --- a/docker_config/master_template.yml +++ b/docker_config/master_template.yml @@ -667,7 +667,7 @@ docker_cln_sh: | # Resolve script directory DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" - if [ ! -z "$MY_SCRATCH_DIR" ]; then + if [[ ! 
-z "$MY_SCRATCH_DIR" ]]; then mkdir -p "$MY_SCRATCH_DIR" chmod -R 777 "$MY_SCRATCH_DIR" fi @@ -678,7 +678,7 @@ docker_cln_sh: | # Docker image and container name DOCKER_IMAGE={~~docker_image~~} - if [ -z "$NOPULL" ]; then + if [[ -z "$NOPULL" ]]; then echo "Updating docker image" docker pull "$DOCKER_IMAGE" fi @@ -719,7 +719,7 @@ docker_cln_sh: | docker run -d -t --rm $DOCKER_OPTIONS $ENV_VARS --env TRAINING_MODE=swarm $DOCKER_IMAGE \ /bin/bash -c "nohup ./start.sh >> nohup.out 2>&1 && /bin/bash" - elif [ ! -z "$LIST_LICENSES" ]; then + elif [[ ! -z "$LIST_LICENSES" ]]; then docker run -it --rm --name=$CONTAINER_NAME $DOCKER_IMAGE \ /bin/bash -c "/MediSwarm/scripts/_list_licenses.sh" @@ -767,7 +767,7 @@ docker_svr_sh: | #NETARG="-p {~~admin_port~~}:{~~admin_port~~} -p {~~fed_learn_port~~}:{~~fed_learn_port~~}" DOCKER_IMAGE={~~docker_image~~} - if [ -z "$NOPULL" ]; then + if [[ -z "$NOPULL" ]]; then echo "Updating docker image" docker pull $DOCKER_IMAGE fi @@ -778,16 +778,16 @@ docker_svr_sh: | echo "Starting docker with $DOCKER_IMAGE as $CONTAINER_NAME" # Run docker with appropriate parameters - if [ ! -z "$START_SERVER" ]; then + if [[ ! -z "$START_SERVER" ]]; then docker run -d -t --rm --name=$CONTAINER_NAME \ -v $DIR/..:/startupkit/ -w /startupkit/startup/ \ --ipc=host $NETARG $DOCKER_IMAGE \ /bin/bash -c "nohup ./start.sh >> nohup.out 2>&1 && chmod a+r nohup.out && /bin/bash" - elif [ ! -z "$LIST_LICENSES" ]; then + elif [[ ! -z "$LIST_LICENSES" ]]; then docker run -it --rm --name=$CONTAINER_NAME \ $DOCKER_IMAGE \ /bin/bash -c "/MediSwarm/scripts/_list_licenses.sh" - elif [ ! -z "$INTERACTIVE" ]; then + elif [[ ! 
-z "$INTERACTIVE" ]]; then docker run --rm -it --detach-keys="ctrl-x" --name=$CONTAINER_NAME \ -v $DIR/..:/startupkit/ -w /startupkit/startup/ \ --ipc=host $NETARG $DOCKER_IMAGE \ @@ -816,13 +816,13 @@ docker_adm_sh: | NETARG="--net=host" DOCKER_IMAGE={~~docker_image~~} - if [ -z "$NOPULL" ]; then + if [[ -z "$NOPULL" ]]; then echo "Updating docker image" docker pull $DOCKER_IMAGE fi CONTAINER_NAME=odelia_swarm_admin - if [ ! -z "$LIST_LICENSES" ]; then + if [[ ! -z "$LIST_LICENSES" ]]; then docker run -it --rm --name=$CONTAINER_NAME $DOCKER_IMAGE \ /bin/bash -c "/MediSwarm/scripts/_list_licenses.sh" exit 0 From 90773c136661662d7cb47fb151f5728430f6af3c Mon Sep 17 00:00:00 2001 From: JieFu Zhu Date: Tue, 30 Sep 2025 14:27:40 +0200 Subject: [PATCH 200/337] Update README.participant.md with instructions for /etc/hosts --- assets/readme/README.participant.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/readme/README.participant.md b/assets/readme/README.participant.md index 3cf6bd4c..be8d4759 100644 --- a/assets/readme/README.participant.md +++ b/assets/readme/README.participant.md @@ -9,7 +9,7 @@ This guide is for data scientists and medical research sites participating in a - Software: Docker, OpenVPN, Git ## Setup - +0. Add this line to your `/etc/hosts`: `172.24.4.65 dl3.tud.de dl3` 1. Make sure your compute node satisfies the specification and has the necessary software installed. 2. Set up the VPN. A VPN is necessary so that the swarm nodes can communicate with each other securely across firewalls. For that purpose, 1. 
Install OpenVPN From 14858e74890f3bd385f4d1fdc1319edb6a6cd298 Mon Sep 17 00:00:00 2001 From: Ultimate-Storm Date: Wed, 1 Oct 2025 10:56:44 +0200 Subject: [PATCH 201/337] chore: update apt versions in Dockerfile_ODELIA --- docker_config/Dockerfile_ODELIA | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index d1d0ac72..2177d48f 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -24,7 +24,7 @@ RUN apt install -y \ bsdutils=1:2.37.2-4ubuntu3.4 \ ca-certificates=20240203~22.04.1 \ coreutils=8.32-4.1ubuntu1.2 \ - dpkg=1.21.1ubuntu2.3 \ + dpkg=1.21.1ubuntu2.6 \ e2fsprogs=1.46.5-2ubuntu1.2 \ gpgv=2.2.27-3ubuntu2.4 \ libblkid1=2.37.2-4ubuntu3.4 \ @@ -48,21 +48,21 @@ RUN apt install -y \ libseccomp2=2.5.3-2ubuntu3~22.04.1 \ libsmartcols1=2.37.2-4ubuntu3.4 \ libss2=1.46.5-2ubuntu1.2 \ - libssl3=3.0.2-0ubuntu1.19 \ + libssl3=3.0.2-0ubuntu1.20 \ libsystemd0=249.11-0ubuntu3.16 \ libtasn1-6=4.18.0-4ubuntu0.1 \ libudev1=249.11-0ubuntu3.16 \ libuuid1=2.37.2-4ubuntu3.4 \ - linux-libc-dev=5.15.0-156.166 \ + linux-libc-dev=5.15.0-157.167 \ logsave=1.46.5-2ubuntu1.2 \ mount=2.37.2-4ubuntu3.4 \ - openssl=3.0.2-0ubuntu1.19 \ + openssl=3.0.2-0ubuntu1.20 \ util-linux=2.37.2-4ubuntu3.4 # Install apt-transport-https curl gnupg lsb-release zip and dependencies at defined versions RUN apt install -y \ apt-transport-https=2.4.14 \ - curl=7.81.0-1ubuntu1.20 \ + curl=7.81.0-1ubuntu1.21 \ dirmngr=2.2.27-3ubuntu2.4 \ distro-info-data=0.52ubuntu0.9 \ gnupg-l10n=2.2.27-3ubuntu2.4 \ @@ -76,7 +76,7 @@ RUN apt install -y \ gpgsm=2.2.27-3ubuntu2.4 \ libassuan0=2.5.5-1build1 \ libbrotli1=1.0.9-2build6 \ - libcurl4=7.81.0-1ubuntu1.20 \ + libcurl4=7.81.0-1ubuntu1.21 \ libexpat1=2.4.7-1ubuntu0.6 \ libksba8=1.6.0-2ubuntu0.2 \ libldap-2.5-0=2.5.19+dfsg-0ubuntu0.22.04.1 \ @@ -116,7 +116,7 @@ RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings # 
Install docker-ce docker-ce-cli containerd.io and dependencies at fixed versions RUN apt install -y \ apparmor=3.0.4-2ubuntu2.4 \ - containerd.io=1.7.27-1 \ + containerd.io=1.7.28-0~ubuntu.22.04~jammy \ dbus-user-session=1.12.20-2ubuntu4.1 \ dbus=1.12.20-2ubuntu4.1 \ dmsetup=2:1.02.175-2.1ubuntu5 \ @@ -135,7 +135,7 @@ RUN apt install -y \ libbsd0=0.11.5-1 \ libcbor0.8=0.8.0-2ubuntu1 \ libcryptsetup12=2:2.4.3-1ubuntu1.3 \ - libcurl3-gnutls=7.81.0-1ubuntu1.20 \ + libcurl3-gnutls=7.81.0-1ubuntu1.21 \ libdbus-1-3=1.12.20-2ubuntu4.1 \ libdevmapper1.02.1=2:1.02.175-2.1ubuntu5 \ libedit2=3.1-20210910-1build1 \ From 4d964724df78653619e11787105673582facea41 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 1 Oct 2025 14:39:23 +0200 Subject: [PATCH 202/337] check all license output in integration test --- runIntegrationTests.sh | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 1dc9dc3a..c1ced7e0 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -174,21 +174,25 @@ create_synthetic_data () { run_list_licenses () { - cd "$PROJECT_DIR"/prod_00 - cd localhost/startup - LICENSES_LISTED=$(./docker.sh --list_licenses --no_pull) + cd "$CWD"/"$PROJECT_DIR/prod_00/admin@test.odelia/startup" + ADMIN_LICENSES=$( ./docker.sh --no_pull --list_licenses 2>&1 ) + SERVER_LICENSES=$( ./docker.sh --no_pull --list_licenses 2>&1 ) + cd "$CWD"/"$PROJECT_DIR/prod_00/client_A/startup/" + CLIENT_LICENSES=$( ./docker.sh --no_pull --list_licenses 2>&1 ) + cd "$CWD" - for EXPECTED_KEYWORDS in 'scikit-learn' 'torch' 'nvflare_mediswarm' 'BSD License' 'MIT License'; + for license_output in "$ADMIN_LICENSES" "$SERVER_LICENSES" "$CLIENT_LICENSES"; do - if echo "$LICENSES_LISTED" | grep -qie "$EXPECTED_KEYWORDS" ; then - echo "Instructions on $EXPECTED_KEYWORDS found" - else - echo "Instructions on $EXPECTED_KEYWORDS missing" - exit 1 - fi + for expected_keywords in 'scikit-learn' 'torch' 
'nvflare_mediswarm' 'BSD License' 'MIT License' 'model weights'; + do + if echo "$license_output" | grep -qie "$expected_keywords" ; then + echo "License check: $expected_keywords found" + else + echo "License check: $expected_keywords missing" + exit 1 + fi + done done - - cd "$CWD" } From ebc73ae5b15c99f353f0666ea72dd54a7e029227 Mon Sep 17 00:00:00 2001 From: Ultimate-Storm Date: Thu, 2 Oct 2025 06:09:54 +0200 Subject: [PATCH 203/337] chore: update apt versions in Dockerfile_ODELIA --- docker_config/Dockerfile_ODELIA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 2177d48f..2d2f018d 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -120,7 +120,7 @@ RUN apt install -y \ dbus-user-session=1.12.20-2ubuntu4.1 \ dbus=1.12.20-2ubuntu4.1 \ dmsetup=2:1.02.175-2.1ubuntu5 \ - docker-buildx-plugin=0.28.0-0~ubuntu.22.04~jammy \ + docker-buildx-plugin=0.29.0-0~ubuntu.22.04~jammy \ docker-ce-cli=5:28.4.0-1~ubuntu.22.04~jammy \ docker-ce-rootless-extras=5:28.4.0-1~ubuntu.22.04~jammy \ docker-ce=5:28.4.0-1~ubuntu.22.04~jammy \ From 5ce9145ae6d29a8c7d412fdd762e9949ffc02042 Mon Sep 17 00:00:00 2001 From: Ultimate-Storm Date: Mon, 6 Oct 2025 06:09:53 +0200 Subject: [PATCH 204/337] chore: update apt versions in Dockerfile_ODELIA --- docker_config/Dockerfile_ODELIA | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 2d2f018d..607e66e0 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -120,10 +120,10 @@ RUN apt install -y \ dbus-user-session=1.12.20-2ubuntu4.1 \ dbus=1.12.20-2ubuntu4.1 \ dmsetup=2:1.02.175-2.1ubuntu5 \ - docker-buildx-plugin=0.29.0-0~ubuntu.22.04~jammy \ - docker-ce-cli=5:28.4.0-1~ubuntu.22.04~jammy \ - docker-ce-rootless-extras=5:28.4.0-1~ubuntu.22.04~jammy \ - docker-ce=5:28.4.0-1~ubuntu.22.04~jammy \ + 
docker-buildx-plugin=0.29.1-1~ubuntu.22.04~jammy \ + docker-ce-cli=5:28.5.0-1~ubuntu.22.04~jammy \ + docker-ce-rootless-extras=5:28.5.0-1~ubuntu.22.04~jammy \ + docker-ce=5:28.5.0-1~ubuntu.22.04~jammy \ docker-compose-plugin=2.39.4-0~ubuntu.22.04~jammy \ gir1.2-glib-2.0=1.72.0-1 \ git-man=1:2.34.1-1ubuntu1.15 \ From 7b0ea147cd1f9d3cde4964b6c8c95c40b97afb35 Mon Sep 17 00:00:00 2001 From: Ultimate-Storm Date: Tue, 7 Oct 2025 06:09:50 +0200 Subject: [PATCH 205/337] chore: update apt versions in Dockerfile_ODELIA --- docker_config/Dockerfile_ODELIA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 607e66e0..8939e209 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -124,7 +124,7 @@ RUN apt install -y \ docker-ce-cli=5:28.5.0-1~ubuntu.22.04~jammy \ docker-ce-rootless-extras=5:28.5.0-1~ubuntu.22.04~jammy \ docker-ce=5:28.5.0-1~ubuntu.22.04~jammy \ - docker-compose-plugin=2.39.4-0~ubuntu.22.04~jammy \ + docker-compose-plugin=2.40.0-1~ubuntu.22.04~jammy \ gir1.2-glib-2.0=1.72.0-1 \ git-man=1:2.34.1-1ubuntu1.15 \ git=1:2.34.1-1ubuntu1.15 \ From d212f90a5e5344297a6d1778c486e0400baa8923 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 7 Oct 2025 10:42:58 +0200 Subject: [PATCH 206/337] removed apparently unused environment variable definitions --- .github/workflows/pr-test.yaml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/pr-test.yaml b/.github/workflows/pr-test.yaml index 962df74d..ac9bee08 100644 --- a/.github/workflows/pr-test.yaml +++ b/.github/workflows/pr-test.yaml @@ -17,9 +17,6 @@ jobs: timeout-minutes: 45 env: - DATADIR: /mnt/sda1/Odelia_challange/ODELIA_Challenge_unilateral/ - SCRATCHDIR: /mnt/scratch - SITE_NAME: UKA PYTHONUNBUFFERED: 1 From 1af55eb53561ae2c3b52b6221f8a715a4c74c881 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 7 Oct 2025 11:17:21 +0200 Subject: [PATCH 207/337] fixed directory where server 
startup kit is located --- runIntegrationTests.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index c1ced7e0..5923c208 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -176,6 +176,7 @@ create_synthetic_data () { run_list_licenses () { cd "$CWD"/"$PROJECT_DIR/prod_00/admin@test.odelia/startup" ADMIN_LICENSES=$( ./docker.sh --no_pull --list_licenses 2>&1 ) + cd "$CWD"/"$PROJECT_DIR/prod_00/localhost/startup/" SERVER_LICENSES=$( ./docker.sh --no_pull --list_licenses 2>&1 ) cd "$CWD"/"$PROJECT_DIR/prod_00/client_A/startup/" CLIENT_LICENSES=$( ./docker.sh --no_pull --list_licenses 2>&1 ) From 99f6a28aa2e20aa9f9c83bbbe54a87a8ba0a79be Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 7 Oct 2025 11:17:57 +0200 Subject: [PATCH 208/337] remove CR from mixed line endings --- runIntegrationTests.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 5923c208..d2cf8e06 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -174,12 +174,13 @@ create_synthetic_data () { run_list_licenses () { + # the output has mixed line endings, remove CRs cd "$CWD"/"$PROJECT_DIR/prod_00/admin@test.odelia/startup" - ADMIN_LICENSES=$( ./docker.sh --no_pull --list_licenses 2>&1 ) + ADMIN_LICENSES=$( ./docker.sh --no_pull --list_licenses | sed 's/\r//g' ) cd "$CWD"/"$PROJECT_DIR/prod_00/localhost/startup/" - SERVER_LICENSES=$( ./docker.sh --no_pull --list_licenses 2>&1 ) + SERVER_LICENSES=$( ./docker.sh --no_pull --list_licenses | sed 's/\r//g' ) cd "$CWD"/"$PROJECT_DIR/prod_00/client_A/startup/" - CLIENT_LICENSES=$( ./docker.sh --no_pull --list_licenses 2>&1 ) + CLIENT_LICENSES=$( ./docker.sh --no_pull --list_licenses | sed 's/\r//g' ) cd "$CWD" for license_output in "$ADMIN_LICENSES" "$SERVER_LICENSES" "$CLIENT_LICENSES"; From 272a2e6f85bbcb2818510124032e74a750d42779 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 7 Oct 2025 
11:35:25 +0200 Subject: [PATCH 209/337] removed "-it" from docker run for license listing --- docker_config/master_template.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docker_config/master_template.yml b/docker_config/master_template.yml index e8eb111f..db16ab87 100644 --- a/docker_config/master_template.yml +++ b/docker_config/master_template.yml @@ -720,7 +720,7 @@ docker_cln_sh: | /bin/bash -c "nohup ./start.sh >> nohup.out 2>&1 && /bin/bash" elif [[ ! -z "$LIST_LICENSES" ]]; then - docker run -it --rm --name=$CONTAINER_NAME $DOCKER_IMAGE \ + docker run --rm --name=$CONTAINER_NAME $DOCKER_IMAGE \ /bin/bash -c "/MediSwarm/scripts/_list_licenses.sh" elif [[ ! -z "$INTERACTIVE" ]]; then @@ -784,7 +784,7 @@ docker_svr_sh: | --ipc=host $NETARG $DOCKER_IMAGE \ /bin/bash -c "nohup ./start.sh >> nohup.out 2>&1 && chmod a+r nohup.out && /bin/bash" elif [[ ! -z "$LIST_LICENSES" ]]; then - docker run -it --rm --name=$CONTAINER_NAME \ + docker run --rm --name=$CONTAINER_NAME \ $DOCKER_IMAGE \ /bin/bash -c "/MediSwarm/scripts/_list_licenses.sh" elif [[ ! -z "$INTERACTIVE" ]]; then @@ -823,7 +823,7 @@ docker_adm_sh: | CONTAINER_NAME=odelia_swarm_admin___REPLACED_BY_CONTAINER_VERSION_IDENTIFIER_WHEN_BUILDING_DOCKER_IMAGE__ if [[ ! 
-z "$LIST_LICENSES" ]]; then - docker run -it --rm --name=$CONTAINER_NAME $DOCKER_IMAGE \ + docker run --rm --name=$CONTAINER_NAME $DOCKER_IMAGE \ /bin/bash -c "/MediSwarm/scripts/_list_licenses.sh" exit 0 fi From d73d220cb86bdf4888c4851f383d9c71409e04d5 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 7 Oct 2025 15:54:48 +0200 Subject: [PATCH 210/337] removed installation of Docker+dependencies in Docker image --- docker_config/Dockerfile_ODELIA | 82 --------------------------------- 1 file changed, 82 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index e154301f..c90b7b7c 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -107,88 +107,6 @@ RUN apt install -y \ unzip=6.0-26ubuntu3.2 \ zip=3.0-12build2 -# Prepare Docker installation -RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc \ - && chmod a+r /etc/apt/keyrings/docker.asc \ - && echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu $(. 
/etc/os-release && echo "${UBUNTU_CODENAME:-$VERSION_CODENAME}") stable" | tee /etc/apt/sources.list.d/docker.list > /dev/null \ - && apt update - -# Install docker-ce docker-ce-cli containerd.io and dependencies at fixed versions -RUN apt install -y \ - apparmor=3.0.4-2ubuntu2.4 \ - containerd.io=1.7.28-0~ubuntu.22.04~jammy \ - dbus-user-session=1.12.20-2ubuntu4.1 \ - dbus=1.12.20-2ubuntu4.1 \ - dmsetup=2:1.02.175-2.1ubuntu5 \ - docker-buildx-plugin=0.29.1-1~ubuntu.22.04~jammy \ - docker-ce-cli=5:28.5.0-1~ubuntu.22.04~jammy \ - docker-ce-rootless-extras=5:28.5.0-1~ubuntu.22.04~jammy \ - docker-ce=5:28.5.0-1~ubuntu.22.04~jammy \ - docker-compose-plugin=2.40.0-1~ubuntu.22.04~jammy \ - gir1.2-glib-2.0=1.72.0-1 \ - git-man=1:2.34.1-1ubuntu1.15 \ - git=1:2.34.1-1ubuntu1.15 \ - iptables=1.8.7-1ubuntu5.2 \ - less=590-1ubuntu0.22.04.3 \ - libapparmor1=3.0.4-2ubuntu2.4 \ - libargon2-1=0~20171227-0.3 \ - libbsd0=0.11.5-1 \ - libcbor0.8=0.8.0-2ubuntu1 \ - libcryptsetup12=2:2.4.3-1ubuntu1.3 \ - libcurl3-gnutls=7.81.0-1ubuntu1.21 \ - libdbus-1-3=1.12.20-2ubuntu4.1 \ - libdevmapper1.02.1=2:1.02.175-2.1ubuntu5 \ - libedit2=3.1-20210910-1build1 \ - liberror-perl=0.17029-1 \ - libfido2-1=1.10.0-1 \ - libgdbm-compat4=1.23-1 \ - libgdbm6=1.23-1 \ - libgirepository-1.0-1=1.72.0-1 \ - libglib2.0-0=2.72.4-0ubuntu2.6 \ - libglib2.0-data=2.72.4-0ubuntu2.6 \ - libicu70=70.1-2 \ - libip4tc2=1.8.7-1ubuntu5.2 \ - libip6tc2=1.8.7-1ubuntu5.2 \ - libjson-c5=0.15-3~ubuntu1.22.04.2 \ - libkmod2=29-1ubuntu1 \ - libltdl7=2.4.6-15build2 \ - libmd0=1.0.4-1build1 \ - libmnl0=1.0.4-3build2 \ - libnetfilter-conntrack3=1.0.9-1 \ - libnfnetlink0=1.0.1-3build3 \ - libnftnl11=1.2.1-1build1 \ - libnss-systemd=249.11-0ubuntu3.16 \ - libpam-systemd=249.11-0ubuntu3.16 \ - libperl5.34=5.34.0-3ubuntu1.5 \ - libslirp0=4.6.1-1build1 \ - libx11-6=2:1.7.5-1ubuntu0.3 \ - libx11-data=2:1.7.5-1ubuntu0.3 \ - libxau6=1:1.0.9-1build5 \ - libxcb1=1.14-3ubuntu3 \ - libxdmcp6=1:1.1.3-0ubuntu5 \ - libxext6=2:1.3.4-1build1 \ - 
libxml2=2.9.13+dfsg-1ubuntu0.9 \ - libxmuu1=2:1.1.3-3 \ - libxtables12=1.8.7-1ubuntu5.2 \ - netbase=6.3 \ - networkd-dispatcher=2.1-2ubuntu0.22.04.2 \ - openssh-client=1:8.9p1-3ubuntu0.13 \ - patch=2.7.6-7build2 \ - perl-base=5.34.0-3ubuntu1.5 \ - perl-modules-5.34=5.34.0-3ubuntu1.5 \ - perl=5.34.0-3ubuntu1.5 \ - pigz=2.6-1 \ - python3-dbus=1.2.18-3build1 \ - python3-gi=3.42.1-0ubuntu1 \ - shared-mime-info=2.1-2 \ - slirp4netns=1.0.1-2 \ - systemd-sysv=249.11-0ubuntu3.16 \ - systemd-timesyncd=249.11-0ubuntu3.16 \ - systemd=249.11-0ubuntu3.16 \ - xauth=1:1.1-1build2 \ - xdg-user-dirs=0.17-2ubuntu4 \ - xz-utils=5.2.5-2ubuntu1 - # Clean up apt cache RUN rm -rf /var/lib/apt/lists/* From f813ab07b1a886bfc8d985e453231f410e8d6ccc Mon Sep 17 00:00:00 2001 From: Ultimate-Storm Date: Thu, 9 Oct 2025 06:10:05 +0200 Subject: [PATCH 211/337] chore: update apt versions in Dockerfile_ODELIA --- docker_config/Dockerfile_ODELIA | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 8939e209..04c89dd4 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -49,9 +49,9 @@ RUN apt install -y \ libsmartcols1=2.37.2-4ubuntu3.4 \ libss2=1.46.5-2ubuntu1.2 \ libssl3=3.0.2-0ubuntu1.20 \ - libsystemd0=249.11-0ubuntu3.16 \ + libsystemd0=249.11-0ubuntu3.17 \ libtasn1-6=4.18.0-4ubuntu0.1 \ - libudev1=249.11-0ubuntu3.16 \ + libudev1=249.11-0ubuntu3.17 \ libuuid1=2.37.2-4ubuntu3.4 \ linux-libc-dev=5.15.0-157.167 \ logsave=1.46.5-2ubuntu1.2 \ @@ -121,9 +121,9 @@ RUN apt install -y \ dbus=1.12.20-2ubuntu4.1 \ dmsetup=2:1.02.175-2.1ubuntu5 \ docker-buildx-plugin=0.29.1-1~ubuntu.22.04~jammy \ - docker-ce-cli=5:28.5.0-1~ubuntu.22.04~jammy \ - docker-ce-rootless-extras=5:28.5.0-1~ubuntu.22.04~jammy \ - docker-ce=5:28.5.0-1~ubuntu.22.04~jammy \ + docker-ce-cli=5:28.5.1-1~ubuntu.22.04~jammy \ + docker-ce-rootless-extras=5:28.5.1-1~ubuntu.22.04~jammy \ + 
docker-ce=5:28.5.1-1~ubuntu.22.04~jammy \ docker-compose-plugin=2.40.0-1~ubuntu.22.04~jammy \ gir1.2-glib-2.0=1.72.0-1 \ git-man=1:2.34.1-1ubuntu1.15 \ @@ -157,8 +157,8 @@ RUN apt install -y \ libnetfilter-conntrack3=1.0.9-1 \ libnfnetlink0=1.0.1-3build3 \ libnftnl11=1.2.1-1build1 \ - libnss-systemd=249.11-0ubuntu3.16 \ - libpam-systemd=249.11-0ubuntu3.16 \ + libnss-systemd=249.11-0ubuntu3.17 \ + libpam-systemd=249.11-0ubuntu3.17 \ libperl5.34=5.34.0-3ubuntu1.5 \ libslirp0=4.6.1-1build1 \ libx11-6=2:1.7.5-1ubuntu0.3 \ @@ -182,9 +182,9 @@ RUN apt install -y \ python3-gi=3.42.1-0ubuntu1 \ shared-mime-info=2.1-2 \ slirp4netns=1.0.1-2 \ - systemd-sysv=249.11-0ubuntu3.16 \ - systemd-timesyncd=249.11-0ubuntu3.16 \ - systemd=249.11-0ubuntu3.16 \ + systemd-sysv=249.11-0ubuntu3.17 \ + systemd-timesyncd=249.11-0ubuntu3.17 \ + systemd=249.11-0ubuntu3.17 \ xauth=1:1.1-1build2 \ xdg-user-dirs=0.17-2ubuntu4 \ xz-utils=5.2.5-2ubuntu1 From a59974e869f65c09cbd2883954f8643c7fab5828 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 9 Oct 2025 11:27:42 +0200 Subject: [PATCH 212/337] use newer version of pip --- docker_config/Dockerfile_ODELIA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index bb2fc9fa..eef21b7e 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -198,7 +198,7 @@ RUN python3 -m pip uninstall -y conda conda-package-handling conda_index # Install specific versions of pip and setuptools RUN python3 -m pip install \ -U \ - pip==25.1.1 \ + pip==25.2 \ setuptools==80.8.0 # Install dependencies of NVFlare at fixed versions From 419dd384bc1be55f74741560b1afbab28c4ba6f5 Mon Sep 17 00:00:00 2001 From: Ultimate-Storm Date: Mon, 13 Oct 2025 06:10:35 +0200 Subject: [PATCH 213/337] chore: update apt versions in Dockerfile_ODELIA --- docker_config/Dockerfile_ODELIA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker_config/Dockerfile_ODELIA 
b/docker_config/Dockerfile_ODELIA index 04c89dd4..7a305ec6 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -116,7 +116,7 @@ RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings # Install docker-ce docker-ce-cli containerd.io and dependencies at fixed versions RUN apt install -y \ apparmor=3.0.4-2ubuntu2.4 \ - containerd.io=1.7.28-0~ubuntu.22.04~jammy \ + containerd.io=1.7.28-1~ubuntu.22.04~jammy \ dbus-user-session=1.12.20-2ubuntu4.1 \ dbus=1.12.20-2ubuntu4.1 \ dmsetup=2:1.02.175-2.1ubuntu5 \ From 28bd10938bf77604adb583fc60ad1cab91434ee5 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 16 Oct 2025 16:56:02 +0200 Subject: [PATCH 214/337] consistent use of GPU_FOR_TESTING in integration test script, shoud include --device= in case of sliced GPU --- runIntegrationTests.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 1dc9dc3a..d089f1bc 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -197,7 +197,7 @@ run_docker_gpu_preflight_check () { echo "[Run] Docker/GPU preflight check (local dummy training via startup kit) ..." 
cd "$PROJECT_DIR/prod_00/client_A/startup/" CONSOLE_OUTPUT=docker_gpu_preflight_check_console_output.txt - ./docker.sh --scratch_dir "$SCRATCH_DIR"/client_A --GPU device=$GPU_FOR_TESTING --dummy_training --no_pull 2>&1 | tee "$CONSOLE_OUTPUT" + ./docker.sh --scratch_dir "$SCRATCH_DIR"/client_A --GPU "$GPU_FOR_TESTING" --dummy_training --no_pull 2>&1 | tee "$CONSOLE_OUTPUT" if grep -q "Epoch 1: 100%" "$CONSOLE_OUTPUT" && grep -q "Training completed successfully" "$CONSOLE_OUTPUT"; then echo "Expected output of Docker/GPU preflight check found" @@ -216,7 +216,7 @@ run_data_access_preflight_check () { cd "$PROJECT_DIR"/prod_00 cd client_A/startup CONSOLE_OUTPUT=data_access_preflight_check_console_output.txt - ./docker.sh --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir "$SCRATCH_DIR"/client_A --GPU device=$GPU_FOR_TESTING --preflight_check --no_pull 2>&1 | tee $CONSOLE_OUTPUT + ./docker.sh --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir "$SCRATCH_DIR"/client_A --GPU "$GPU_FOR_TESTING" --preflight_check --no_pull 2>&1 | tee $CONSOLE_OUTPUT if grep -q "Train set: 18, Val set: 6" "$CONSOLE_OUTPUT" && grep -q "Epoch 0: 100%" "$CONSOLE_OUTPUT"; then echo "Expected output of Docker/GPU preflight check found" @@ -246,10 +246,10 @@ start_server_and_clients () { sleep 10 cd client_A/startup - ./docker.sh --no_pull --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir "$SCRATCH_DIR"/client_A --GPU device=$GPU_FOR_TESTING --start_client + ./docker.sh --no_pull --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir "$SCRATCH_DIR"/client_A --GPU "$GPU_FOR_TESTING" --start_client cd ../.. 
cd client_B/startup - ./docker.sh --no_pull --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir "$SCRATCH_DIR"/client_B --GPU device=$GPU_FOR_TESTING --start_client + ./docker.sh --no_pull --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir "$SCRATCH_DIR"/client_B --GPU "$GPU_FOR_TESTING" --start_client sleep 8 cd "$CWD" @@ -301,7 +301,7 @@ verify_wrong_client_does_not_connect () { sed -i 's#CONTAINER_NAME=odelia_swarm_client_client_A_095c1b7#CONTAINER_NAME=odelia_swarm_client_client_A_'$CONTAINER_VERSION_SUFFIX'#' client_A/startup/docker.sh cd client_A/startup - ./docker.sh --no_pull --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir "$SCRATCH_DIR"/client_A --GPU device=$GPU_FOR_TESTING --start_client + ./docker.sh --no_pull --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir "$SCRATCH_DIR"/client_A --GPU "$GPU_FOR_TESTING" --start_client cd ../.. sleep 20 From 881feb8c2d952a57a8ac0df94f499a5a2fa0d193 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 17 Oct 2025 09:58:06 +0200 Subject: [PATCH 215/337] extended documentation --- assets/readme/README.developer.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/assets/readme/README.developer.md b/assets/readme/README.developer.md index 1215e574..8d964565 100644 --- a/assets/readme/README.developer.md +++ b/assets/readme/README.developer.md @@ -26,6 +26,11 @@ The project description specifies the swarm nodes etc. to be used for a swarm tr ## Running Tests +* If you have multiple GPUs, use `GPU_FOR_TESTING="device=0" (or another device) +* If you have a sliced multiple GPUs, use `GPU_FOR_TESTING="device=0:0" (or another slice) +* Otherwise, leave this environment variable unset to use all GPUs. +* To run only specific tests, look at the options at the end of the script. 
+ ```bash ./runIntegrationTests.sh ``` From 4cc429f77b6531519ad264a3491c7e2372677156 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 17 Oct 2025 10:17:13 +0200 Subject: [PATCH 216/337] split function to start servers/clients --- runIntegrationTests.sh | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index d089f1bc..5a4024ad 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -236,8 +236,8 @@ run_3dcnn_simulation_mode () { } -start_server_and_clients () { - echo "[Run] Start server and client Docker containers ..." +start_server () { + echo "[Run] Start server Docker container ..." cd "$PROJECT_DIR"/prod_00 cd localhost/startup @@ -245,6 +245,14 @@ start_server_and_clients () { cd ../.. sleep 10 + cd "$CWD" +} + + +start_clients () { + echo "[Run] Start client Docker containers ..." + + cd "$PROJECT_DIR"/prod_00 cd client_A/startup ./docker.sh --no_pull --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir "$SCRATCH_DIR"/client_A --GPU "$GPU_FOR_TESTING" --start_client cd ../.. 
@@ -255,6 +263,11 @@ start_server_and_clients () { cd "$CWD" } +start_server_and_clients () { + start_server() + start_clients() +} + start_registry_docker_and_push () { docker run -d --rm -p 5000:5000 --name local_test_registry_$CONTAINER_VERSION_SUFFIX registry:3 From 854a4a98ee39bd0faac1464bec774d37441a8c4a Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 17 Oct 2025 10:19:43 +0200 Subject: [PATCH 217/337] changed order of tests to avoid interfering containers --- runIntegrationTests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 5a4024ad..8e117546 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -524,8 +524,8 @@ case "$1" in kill_registry_docker run_docker_gpu_preflight_check run_data_access_preflight_check - start_server_and_clients verify_wrong_client_does_not_connect + start_server_and_clients run_dummy_training_in_swarm kill_server_and_clients cleanup_temporary_data From ac12871d67db01ad341a4c669ada804fdd98bf6d Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 17 Oct 2025 10:34:48 +0200 Subject: [PATCH 218/337] sleep to allow container to be killed before proceeding --- runIntegrationTests.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 8e117546..90163d1e 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -337,6 +337,7 @@ verify_wrong_client_does_not_connect () { fi docker kill odelia_swarm_server_flserver_$CONTAINER_VERSION_SUFFIX odelia_swarm_client_client_A_$CONTAINER_VERSION_SUFFIX + sleep 3 rm -rf "$PROJECT_DIR"/prod_wrong_client cd "$CWD" From 22bc5727336af28c7bd51110d70f25f9241d2713 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 17 Oct 2025 10:37:21 +0200 Subject: [PATCH 219/337] fixed syntax --- runIntegrationTests.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 90163d1e..8870b160 100755 --- 
a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -264,8 +264,8 @@ start_clients () { } start_server_and_clients () { - start_server() - start_clients() + start_server + start_clients } From 803aed69dca07faed6da4810e2cd03e1b1391058 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 17 Oct 2025 11:33:37 +0200 Subject: [PATCH 220/337] extended documentation of integration tests --- assets/readme/README.developer.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/assets/readme/README.developer.md b/assets/readme/README.developer.md index 8d964565..0219674f 100644 --- a/assets/readme/README.developer.md +++ b/assets/readme/README.developer.md @@ -42,9 +42,11 @@ You should see 3. output of a successful proof-of-concept run of a dummy training with two nodes 4. output of a successful simulation run of a 3D CNN training using synthetic data with two nodes 5. output of a set of startup kits being generated -6. output of a Docker/GPU preflight check using one of the startup kits -7. output of a data access preflight check using one of the startup kits -8. output of a dummy training run in a swarm consisting of one server and two client nodes +6. output of pushing the Docker image to a local registry and pulling it from there (takes several minutes) +7. output of a Docker/GPU preflight check using one of the startup kits +8. output of a data access preflight check using one of the startup kits +9. output of an outdated client startup kit failing to connect to the server +10. output of a dummy training run in a swarm consisting of one server and two client nodes Optionally, uncomment running NVFlare unit tests. 
From 3e3cd6d8e9646ba8b93b191826989d80f4420c8c Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 17 Oct 2025 11:34:06 +0200 Subject: [PATCH 221/337] include NVFlare unit tests when running all integration tests --- assets/readme/README.developer.md | 2 -- runIntegrationTests.sh | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/assets/readme/README.developer.md b/assets/readme/README.developer.md index 0219674f..0d595563 100644 --- a/assets/readme/README.developer.md +++ b/assets/readme/README.developer.md @@ -48,8 +48,6 @@ You should see 9. output of an outdated client startup kit failing to connect to the server 10. output of a dummy training run in a swarm consisting of one server and two client nodes -Optionally, uncomment running NVFlare unit tests. - ## Distributing Startup Kits Distribute the startup kits to the clients. diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 8870b160..719da6c2 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -516,7 +516,7 @@ case "$1" in run_dummy_training_standalone run_dummy_training_simulation_mode run_dummy_training_poc_mode - # run_nvflare_unit_tests # uncomment to enable NVFlare unit tests + run_nvflare_unit_tests create_synthetic_data run_3dcnn_simulation_mode create_startup_kits_and_check_contained_files From e3fa177e23b6282d2b6b012a7dcc13ded4a12969 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 17 Oct 2025 14:23:28 +0200 Subject: [PATCH 222/337] added running dummy training in swarm to CI --- .github/workflows/pr-test.yaml | 5 +++++ runIntegrationTests.sh | 1 - 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pr-test.yaml b/.github/workflows/pr-test.yaml index ac9bee08..bc773807 100644 --- a/.github/workflows/pr-test.yaml +++ b/.github/workflows/pr-test.yaml @@ -96,3 +96,8 @@ jobs: continue-on-error: false run: | ./runIntegrationTests.sh run_data_access_preflight_check + + - name: Run dummy training in swarm + continue-on-error: 
false + run: | + ./runIntegrationTests.sh run_dummy_training_in_swarm diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 719da6c2..a94bb6f4 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -507,7 +507,6 @@ case "$1" in run_dummy_training_in_swarm kill_server_and_clients cleanup_temporary_data - # TODO add to CI if we want this (currently not working) ;; all | "") From 4a676e6a7b64ad026bd3fd8e40b156918b7fcd02 Mon Sep 17 00:00:00 2001 From: Ultimate-Storm Date: Mon, 20 Oct 2025 07:39:30 +0200 Subject: [PATCH 223/337] chore: update apt versions in Dockerfile_ODELIA --- docker_config/Dockerfile_ODELIA | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 9d8805e7..fe8d8609 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -53,7 +53,7 @@ RUN apt install -y \ libtasn1-6=4.18.0-4ubuntu0.1 \ libudev1=249.11-0ubuntu3.17 \ libuuid1=2.37.2-4ubuntu3.4 \ - linux-libc-dev=5.15.0-157.167 \ + linux-libc-dev=5.15.0-160.170 \ logsave=1.46.5-2ubuntu1.2 \ mount=2.37.2-4ubuntu3.4 \ openssl=3.0.2-0ubuntu1.20 \ @@ -124,7 +124,7 @@ RUN apt install -y \ docker-ce-cli=5:28.5.1-1~ubuntu.22.04~jammy \ docker-ce-rootless-extras=5:28.5.1-1~ubuntu.22.04~jammy \ docker-ce=5:28.5.1-1~ubuntu.22.04~jammy \ - docker-compose-plugin=2.40.0-1~ubuntu.22.04~jammy \ + docker-compose-plugin=2.40.1-1~ubuntu.22.04~jammy \ gir1.2-glib-2.0=1.72.0-1 \ git-man=1:2.34.1-1ubuntu1.15 \ git=1:2.34.1-1ubuntu1.15 \ From b02330c22289823798a643ac7b3dac60d64c413f Mon Sep 17 00:00:00 2001 From: Ultimate-Storm Date: Tue, 21 Oct 2025 06:10:14 +0200 Subject: [PATCH 224/337] chore: update apt versions in Dockerfile_ODELIA --- docker_config/Dockerfile_ODELIA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index fe8d8609..1ebe4295 100644 --- a/docker_config/Dockerfile_ODELIA +++ 
b/docker_config/Dockerfile_ODELIA @@ -64,7 +64,7 @@ RUN apt install -y \ apt-transport-https=2.4.14 \ curl=7.81.0-1ubuntu1.21 \ dirmngr=2.2.27-3ubuntu2.4 \ - distro-info-data=0.52ubuntu0.9 \ + distro-info-data=0.52ubuntu0.11 \ gnupg-l10n=2.2.27-3ubuntu2.4 \ gnupg-utils=2.2.27-3ubuntu2.4 \ gnupg=2.2.27-3ubuntu2.4 \ From e652c4f44b0f9c1e8183cae168f0491d8915f733 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 21 Oct 2025 14:36:55 +0200 Subject: [PATCH 225/337] verify that minimal example used in the Docker/GPU preflight check actually requires a GPU --- runIntegrationTests.sh | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index a94bb6f4..ea9ae02e 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -107,6 +107,24 @@ run_unit_tests_controller(){ run_dummy_training_standalone(){ echo "[Run] Minimal example, standalone" + OUTPUT_WITHOUT_GPU=$(docker run --rm \ + --shm-size=16g \ + --ipc=host \ + --ulimit memlock=-1 \ + --ulimit stack=67108864 \ + -u $(id -u):$(id -g) \ + -v /etc/passwd:/etc/passwd -v /etc/group:/etc/group \ + -v "$SYNTHETIC_DATA_DIR":/data \ + -v "$SCRATCH_DIR":/scratch \ + --entrypoint=/MediSwarm/tests/integration_tests/_run_minimal_example_standalone.sh \ + "$DOCKER_IMAGE" 2>&1 || echo "") + if echo "$OUTPUT_WITHOUT_GPU" | grep -q "RuntimeError: This example does not work without GPU" ; then + echo "Verified that minimal example requires GPU" + else + echo "Failed to verify that minimal example requires GPU" + exit 1 + fi + _run_test_in_docker tests/integration_tests/_run_minimal_example_standalone.sh } From 66350ffc34529d8b7452fcdddeb645d505f45a18 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 21 Oct 2025 14:50:57 +0200 Subject: [PATCH 226/337] verify that dummy training in Docker/GPU preflight check takes less than one minute --- runIntegrationTests.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/runIntegrationTests.sh 
b/runIntegrationTests.sh index ea9ae02e..19875cd4 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -215,7 +215,8 @@ run_docker_gpu_preflight_check () { echo "[Run] Docker/GPU preflight check (local dummy training via startup kit) ..." cd "$PROJECT_DIR/prod_00/client_A/startup/" CONSOLE_OUTPUT=docker_gpu_preflight_check_console_output.txt - ./docker.sh --scratch_dir "$SCRATCH_DIR"/client_A --GPU "$GPU_FOR_TESTING" --dummy_training --no_pull 2>&1 | tee "$CONSOLE_OUTPUT" + # also check that it finishes within one minute + timeout --signal=kill 1m ./docker.sh --scratch_dir "$SCRATCH_DIR"/client_A --GPU "$GPU_FOR_TESTING" --dummy_training --no_pull 2>&1 | tee "$CONSOLE_OUTPUT" if grep -q "Epoch 1: 100%" "$CONSOLE_OUTPUT" && grep -q "Training completed successfully" "$CONSOLE_OUTPUT"; then echo "Expected output of Docker/GPU preflight check found" From 29c74159d90390f4eb0d3b57dec4e6f3162234c7 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 21 Oct 2025 14:59:42 +0200 Subject: [PATCH 227/337] verify that data access preflight check takes less than one minute per round --- runIntegrationTests.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 19875cd4..52207cfc 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -235,7 +235,8 @@ run_data_access_preflight_check () { cd "$PROJECT_DIR"/prod_00 cd client_A/startup CONSOLE_OUTPUT=data_access_preflight_check_console_output.txt - ./docker.sh --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir "$SCRATCH_DIR"/client_A --GPU "$GPU_FOR_TESTING" --preflight_check --no_pull 2>&1 | tee $CONSOLE_OUTPUT + # also check that it finishes the single round within one minute + timeout --signal=kill 1m ./docker.sh --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir "$SCRATCH_DIR"/client_A --GPU "$GPU_FOR_TESTING" --preflight_check --no_pull 2>&1 | tee $CONSOLE_OUTPUT if grep -q "Train set: 18, Val set: 6" "$CONSOLE_OUTPUT" && grep -q "Epoch 0: 
100%" "$CONSOLE_OUTPUT"; then echo "Expected output of Docker/GPU preflight check found" From 05e06110ef9a05d65bb7d228b79dc08e50b6e48e Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 21 Oct 2025 15:44:36 +0200 Subject: [PATCH 228/337] test for more expected output in server log --- runIntegrationTests.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 52207cfc..0838d3e7 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -377,7 +377,13 @@ run_dummy_training_in_swarm () { cd "$PROJECT_DIR"/prod_00/localhost/startup CONSOLE_OUTPUT=nohup.out for EXPECTED_OUTPUT in 'Total clients: 2' 'updated status of client client_A on round 4' 'updated status of client client_B on round 4' 'all_done=True' 'Server runner finished.' \ - 'Start to the run Job: [0-9a-f]\{8\}-[0-9a-f]\{4\}-[0-9a-f]\{4\}-[0-9a-f]\{4\}-[0-9a-f]\{12\}' 'updated status of client client_B on round 4'; + 'Start to the run Job: [0-9a-f]\{8\}-[0-9a-f]\{4\}-[0-9a-f]\{4\}-[0-9a-f]\{4\}-[0-9a-f]\{12\}' 'updated status of client client_B on round 4' \ + '.*SwarmServerController - INFO - .*updated status of client client_B on round 3: .* action=start_learn_task, all_done=False' \ + '.*SwarmServerController - INFO - .*updated status of client client_B on round 3: .* action=finished_learn_task, all_done=False' \ + '.*ClientManager - INFO - Client: New client client_A.* joined.*' \ + '.*ClientManager - INFO - Client: New client client_B.* joined.*' \ + '.*ClientManager - INFO - Client: New client client_.* joined. Sent token: .* Total clients: 1' \ + '.*ClientManager - INFO - Client: New client client_.* joined. 
Sent token: .* Total clients: 2'; do if grep -q --regexp="$EXPECTED_OUTPUT" "$CONSOLE_OUTPUT"; then echo "Expected output $EXPECTED_OUTPUT found" From dd68c40d5b11e0ef67aa81da89fc530bde6580d6 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 21 Oct 2025 15:44:51 +0200 Subject: [PATCH 229/337] test for more expected output in client log --- runIntegrationTests.sh | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 0838d3e7..33982491 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -396,9 +396,19 @@ run_dummy_training_in_swarm () { cd "$PROJECT_DIR"/prod_00/client_A/startup CONSOLE_OUTPUT=nohup.out - for EXPECTED_OUTPUT in 'Sending training result to aggregation client' 'Epoch 9: 100%' 'val/AUC_ROC'; + for EXPECTED_OUTPUT in 'Sending training result to aggregation client' \ + 'Epoch 9: 100%' \ + 'val/AUC_ROC' \ + 'validation metric .* from client' \ + 'aggregating [0-9]* update(s) at round [0-9]*' \ + 'FederatedClient - INFO - Successfully registered client:client_A for project' \ + 'FederatedClient - INFO - Got engine after .* seconds' \ + 'FederatedClient - INFO - Got the new primary SP:' \ + 'SwarmClientController - INFO - .*: accepted learn request from client_.' \ + 'Gatherer - INFO - .*: Contribution from client_. ACCEPTED by the aggregator at round .' \ + "SwarmClientController - INFO - .*: Broadcasting learn task of round . to ['client_A', 'client_B']; aggr client is client_." 
do - if grep -q "$EXPECTED_OUTPUT" "$CONSOLE_OUTPUT"; then + if grep -q --regexp="$EXPECTED_OUTPUT" "$CONSOLE_OUTPUT"; then echo "Expected output $EXPECTED_OUTPUT found" else echo "Expected output $EXPECTED_OUTPUT missing" @@ -407,16 +417,6 @@ run_dummy_training_in_swarm () { done cd "$CWD" - for EXPECTED_OUTPUT in 'validation metric .* from client' 'aggregating [0-9]* update(s) at round [0-9]*'; - do - if grep -q --regexp="$EXPECTED_OUTPUT" "$PROJECT_DIR"/prod_00/client_?/startup/nohup.out; then - echo "Expected output $EXPECTED_OUTPUT found" - else - echo "Expected output $EXPECTED_OUTPUT missing" - exit 1 - fi - done - cd "$PROJECT_DIR"/prod_00/client_A/ FILES_PRESENT=$(find . -type f -name "*.*") for EXPECTED_FILE in 'custom/minimal_training.py' 'best_FL_global_model.pt' 'FL_global_model.pt' ; From 543c1ece3d2a0dc47e1cab7ded44d458a4c27061 Mon Sep 17 00:00:00 2001 From: Ultimate-Storm Date: Thu, 23 Oct 2025 06:10:02 +0200 Subject: [PATCH 230/337] chore: update apt versions in Dockerfile_ODELIA --- docker_config/Dockerfile_ODELIA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 1ebe4295..1ec9f97a 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -124,7 +124,7 @@ RUN apt install -y \ docker-ce-cli=5:28.5.1-1~ubuntu.22.04~jammy \ docker-ce-rootless-extras=5:28.5.1-1~ubuntu.22.04~jammy \ docker-ce=5:28.5.1-1~ubuntu.22.04~jammy \ - docker-compose-plugin=2.40.1-1~ubuntu.22.04~jammy \ + docker-compose-plugin=2.40.2-1~ubuntu.22.04~jammy \ gir1.2-glib-2.0=1.72.0-1 \ git-man=1:2.34.1-1ubuntu1.15 \ git=1:2.34.1-1ubuntu1.15 \ From 68f53c47f0b43c4f246db9018fd035bd5a784d5c Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 23 Oct 2025 10:07:19 +0200 Subject: [PATCH 231/337] prepare test that admin console with invalid certificate is rejected by server --- runIntegrationTests.sh | 18 ++++++++++++++---- .../outdated_startup_kit.tar.gz | Bin 9917 -> 
12991 bytes 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 33982491..9af5e0db 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -318,9 +318,10 @@ kill_registry_docker () { } -verify_wrong_client_does_not_connect () { - echo "[Run] Verify that client with outdated startup kit does not connect ..." +verify_wrong_certificates_are_rejected () { + echo "[Run] Verify that client and admin console with invalid certificate in startup kit do not connect ..." + # start server cp -r "$PROJECT_DIR"/prod_01 "$PROJECT_DIR"/prod_wrong_client cd "$PROJECT_DIR"/prod_wrong_client cd localhost/startup @@ -328,11 +329,16 @@ verify_wrong_client_does_not_connect () { cd ../.. sleep 10 + # inject invalid certificates from outdated startup kits rm client_A -rf + rm admin@test.odelia/ -rf tar xvf "$CWD"/tests/integration_tests/outdated_startup_kit.tar.gz sed -i 's#DOCKER_IMAGE=localhost:5000/odelia:1.0.1-dev.250919.095c1b7#DOCKER_IMAGE='$DOCKER_IMAGE'#' client_A/startup/docker.sh sed -i 's#CONTAINER_NAME=odelia_swarm_client_client_A_095c1b7#CONTAINER_NAME=odelia_swarm_client_client_A_'$CONTAINER_VERSION_SUFFIX'#' client_A/startup/docker.sh + sed -i 's#DOCKER_IMAGE=localhost:5000/odelia:1.0.1-dev.251023.e940002#DOCKER_IMAGE='$DOCKER_IMAGE'#' admin@test.odelia/startup/docker.sh + sed -i 's#CONTAINER_NAME=odelia_swarm_admin_e940002#CONTAINER_NAME=odelia_swarm_admin_'$CONTAINER_VERSION_SUFFIX'#' admin@test.odelia/startup/docker.sh + # start client and verify that it gets rejected cd client_A/startup ./docker.sh --no_pull --data_dir "$SYNTHETIC_DATA_DIR" --scratch_dir "$SCRATCH_DIR"/client_A --GPU "$GPU_FOR_TESTING" --start_client cd ../.. 
@@ -356,6 +362,10 @@ verify_wrong_client_does_not_connect () { exit 1 fi + # start admin console and verify that it gets rejected + echo "TODO" + + # cleanup docker kill odelia_swarm_server_flserver_$CONTAINER_VERSION_SUFFIX odelia_swarm_client_client_A_$CONTAINER_VERSION_SUFFIX sleep 3 rm -rf "$PROJECT_DIR"/prod_wrong_client @@ -521,7 +531,7 @@ case "$1" in check_wrong_startup_kit) create_startup_kits_and_check_contained_files create_synthetic_data - verify_wrong_client_does_not_connect + verify_wrong_certificates_are_rejected cleanup_temporary_data # TODO add to CI if we want this ;; @@ -550,7 +560,7 @@ case "$1" in kill_registry_docker run_docker_gpu_preflight_check run_data_access_preflight_check - verify_wrong_client_does_not_connect + verify_wrong_certificates_are_rejected start_server_and_clients run_dummy_training_in_swarm kill_server_and_clients diff --git a/tests/integration_tests/outdated_startup_kit.tar.gz b/tests/integration_tests/outdated_startup_kit.tar.gz index ba3a984ec77afe5d8cfe1dd3747b3d0b57306e39..4f293dada7ce163335e08160e527f33b49c9a047 100644 GIT binary patch literal 12991 zcmV;wGC<8AiwFP!000001ME6$lbhC({>-mnl~n5H$}{32gyfV{%Tq!E1QL>fRBDPh zh^Kf8rSjhw&VGf?_z1LN6eAjKcT}5c~S3dy&(Z&% zg7HE8%iKr`*U7)$LiqgnPr4QGJ^tSUK8%0wkFv_XVjlRs_``ktAq4pz|8D`Gj(_fS zH0QQFmtU%*5Ab9>{#5*7bd0>e|B;^I1qgkO2|fq^;qm|Ha{v(i3IVAoaFPr3#R2SFpT!FOJ~pP3UILDbvb>CM5SGUK2qg7j)_*7+>3 zg3Ks;$98jO9oxx`e%8EWgVn{US;H+va*Xs z9@M>&-Lzi5H^qPT4*Pqtd6tht{ zmZJ!Txnco<<7nwZQAFT)M8lDU$L-k0+=Nnr7c)fn`P#8|lhkcS!(2=oTGp1UROkHK z5($~%0a7NK4b4&lL(B>|orA3eyNjq0*HQ5@@0k zqFD&o2e@?hv&M>NC;cEoDx%j`G=p?igp3202;^9$6%G~@q7^O#Dk|1lfYTm_bhtCH zrr#nPMPUqq^d2nPXopSN2_fwH{$f*Dt=NM8n0Y9>i4b4MTb$%ncBO#Jrcmn(|n9mxB>>R=p#F`@HDbX`f-l}_Mh$=#- zl^jJd9JG6A06BQjSkim?W0TSs5PC>3kt_Zba0{(d0=^ zcG<|;BQr;hh#;Vt!TE)1 zRwgEPEM-h2b2K3nWZDu9us9M{AJfH#VOWJwJfHRTIh>gxd0k9=JPY;1In$IkVS5*~ z;uI#$E6fLus(3guqpMXY(E&y8zKjYd)P`cSrk!=zj^mGRV^`O19l z0|HhrEu!YtTp9DTGb@EOF?HcpY$|!;@K_}gsB$6+v@UCbMg4V6?_wsbb3@8jqi_X? 
z$Ape2P<-Nc<59bpG$u{=7pz*~=sBv3jG{d;^TyDjosAm#uu!5l%~7U?@y* znAZwp!fv^Jw1XsPZTR$oitT2dA%~cyByqQ9B;3l9rMXwa78_+J8F8o5kt$_ChyqE5 z5t!eu*Hbb%#`ucjHCa!Z*RO$V&S6CQ!#wn_&XNDl@E7lYZ_m45s^hu(KgN+K@BcWC zz~Aftw}9vBe}3%he`dU`|MWy!t|roivp1q%sX+tPEaC`-$P#~-`X7C){&#Ph{})Su z2UDK4&lyz?%$l(jX0BBy8@ z4fls=+AiAN+?xu?#JC{gdA`a0i`oWgv%+Jg8ivssVx3fDgRQd4x5;=j%wich(1mK| zyRa0Eoj5AHH-eohH`ylPwzAXAPz7CG_~T66YC=&I?3gQ}Y+S^2!O+WUDZ;>s+w;g& zCW>mD7H6ufe1>tR<&aH8?KtC*%SNwHioNC&PsOC8!?2rLPUtPkS}AuHd|-VzX5E2v z=OHY)&>$lTTNrn1HQm+Z7KV>A9AD0~xokK0HGe!;1#Z3*7`YF(4KT7i6X=RsAMWe? zetPDQL?vwDP*=4|IPT5@&0Y9-hlzd2>7}bL_NA#tuC-X!fD!MHHam+=bm?9XjPt0{ zFNA$PRLEn7_2k`3cQjI@RHbSrlQrQq+OcvPTU{bhcLHfNdcT2=BQI}!OEjK0sSc{!* zPE7eo8=7L+0itN9q2Qa$*k+cjC8z7d+?X|&ERr!@Pd77WdESMS3p(MIC^=EUUaGQ~ z!C0PMs7SZ7cbqzH(P5l3({*?Dgb6)APf}${Fml5#k8@!KtuXtjt+HjSR@o8QS?tOZ zn+Z4n&J>8jYx^~SLDg@()QIM@rzDKNy(whAj1 zPD36OpkxxK`#ML-$w;fCVK|BSkQr}^vomg6-}?_Zm3+GXKe}xCLLJZD|084g>HRN^ zOupCuZvj92*QhG;kr^Z-C#gZxD13nCHm?Ui{sXjZu>Z1yJn63)2S5Hn65GYycD>tB z%x}=|e-6N(2jH(G+o?xM6-6NO>rXKF(@$X5*c}DP+R{%G6ugPSq;?}CcW&LviEe!> zxGmE(DsJCg#CG%_JDsE3Tkxg;UB|v(c*i15{!O;o-xfZ-|KDDrf3c2d_J4QQdD8!Z zyGQ8z{{I&6vHcG;e&68u?YC~r|1bb=yb^@K-+%uFv{L}=!0K<`e}usxx&G|n<|aw+ zvMvSu3_gIkmUcYO1tw+IG92zu+x#(jkY0vG%Z zyt#Dty>ZL{?EG2!&KnorJ0-J?vTvx|cc6mU@SMSw;U3ewxXYD~>;T*Vpw^;7aAJoN z34wY2#+19FG4l9LCcS&G0(hfn+i&qlikhJG-xmuS1^DCpatHT)HZZTclzo%O;7u2# z&Ox2qUq9gmZ|aAXKDs(;^y=+-;vVssQs z+s8+k!S_P)QTU7V-+OiVOLRPQ|I^im5BmR;@8`d71AlbsW#FcHY?ODkllN`k!7C7c z*`u$~?|%JpO+kY`OWgG><-3Q!h;_6wubh`}=v5Y_hJDv%;toG}%X3e)yLA6o^sDqJ z^{yHpNxkBWB6=Td<=yHm+sx@jUHk42glpIGE_B)}_c9fS2qfbo|2?gsRM znB{4lU1`6954f~jneOV9}d61acfs`+}^!Ew6q=mGbgI zhLL;4Bbgnh$nKoPzPtYSpqKux5KX%j-+!Qkd+tAhhtEHOKR+J-XZydM@4j@6J+uE& z1V%sX|9-#!{U-3}{&%w_KxxD6355YyjP~{pm`VG)U*dn)u6)|91U|Q2`LtUJd~Un) zX}1#i+;-*DZYA)!?aHU!O5k(al|+tshODx7lQ0byh9#&5z_bShzK(;JS}5g2m}*Yx+a==UXWqc)<=gZeU=v1 zMXl&EMn=9&w;{_~A)U?RV-evYu{{T-VhN;)TbubZv;cmp8Zu{M65moo#xTx=Fv91U zI6e{zWw!ew|qL-UM90FSVZt(aYo>VzHosO2@x(yIw< 
zDw%e%QH-`)%`PQ|~;A}w<`o(AYb`k{AT-(fm1+yuf2}5}rLyTx`G^1K3k)4gA z9Q=!z&m#I9p}Q=_$n%t%4(FDH7L>HuRJaer6gL6%O{=j2j(V(Ci|e9Q3V~A=#7>XU z?XVz4m6Q%6p0@FYFy786LxJLA#fDpqUvhDR8Gt)EXvB1n;g_*^$|P=$ucF@LTHJCbTI7Ksh%%Z!L(7V)uJIK`Qv zI`Sc1P?foyaMr%c$yVY^PE=3(Nt=!6oCan`y%5_()K5#3xAf_l#g3FE>>(rbmIr>Z zw^WmzPcLP+5e_L+;wwCJXg55$t2T&50BP($tDcm@FpFon!_N>{2BoLyrXEwETbTer`U(#AELC z^c5jq(kw|=K{7nh+tplV_v>sr98CeU<{a|kFP(%^^c-6``%SpBb#>qE(3o-X5DB&u zajL@y&mh`0Axsz|K3Ez9Be}BJN&tG^AhL9*B{K{OQ<&klO<`Q`WrD8=QH?L9FE8ij z5S#rEdr#Wzwvpra_bW6m*Q~va!H9XurQ6pyJb4{uVuFI|)Q?@LO0r|me| zIL_3r@!=IIk_|L~?gr36KbX+LbOK0{K0i6e98F;_!tN^TER&eY?#B$?7-|><78Tgql)1q>z*wgELwk4FaNv%xm(EOVc9CL%3??iTq$RTH z_>|-xtF1L=zTSkHzucxW2)XF#F_`|zrRL9G3jbfi$M)aVJ^z|GetZ1~jt%zT_x%4q z9&Y~7y`U_1Xut6v+{>=1b-=ijQsBZe-rp}A%On$fjzX^OK z|0WJ|yX|WKcW?VWz!%nk97nJh@+Tqkd;Qlpfq%YwTo#Ajq?p*1IY5FZ3$8JJ?}uv% z-6k!goLx>Hw@7*pPC&`jTBe;048nW2;RF+hHCYmlis=gpr6^OC)pnbx%sg1C9Lrkv zWNYR@0lU6i_z^HSy{|K7f@N@(rKk}lqW--T4PY+0|${;*k~)B zg+l>ssS3he5WshINEx;kA zZXYpBzQ9+6Mjx8h7(3-->R4-)AeLyHkYd&CyE7GAkuVM_+`$&BIG;%G-@p1{^y0KiC0A2%p3TS=+NpSifTx47P=wQ%%UQT) z&uOSC$AizGl2$Si-(Q3k?}K^)Ljn#LJ7uv;k0=)KsDflCM_}eIPj?`&kx^l}i01@v z3cgw^m9otFvXM44RznKAU=PR5Ow!~vyh$05S1pn@h}|?f6f)fI)QQ#X$|;OBd)C0c ztuvR6!aOk8VmrCA`>FAF;dN$6NmCo=wb7i89lDp#&6%7sGhZagU*3MQSa)U6ym}IyP4fn$oyIXK7g_bTk=z zX&Z>0O6^u-%BTUdIDzV!8l&SC#6da3z>*=(JkWLsy+~Oa&Jt7O*A$#+?&Rz`DoKgY44z#_WmW}H?Ne&PtgR9nh4 zN+t6-V=`sW$4U#NvLP8EozCF%c`PiXtx4NRM`Y{~9v|s&(^g}A&xzAob!B3sdrF{U zw7Wo
o^0PFjd3ZT;Rz5a`|PMGk@>nHo@J*PQ_cWK&WEO2iwa7$%^afc#GC9&dm zHJ+2<{xohF-A*`sh0F|$7_0LmaB!1-cm@Hci1&)Q>PRlj;l&P-B?Ql6IinA)B2Q^j zEUYQA@j|qv8D_2?m);(j!qIUJB3JNwQq!c#iqS zW)7agXb%*}12i-2z?>!81X!6uso2%ZoYAudb4W~LB5u%K0#oA!X89TF(!PvL0(~X= z05!#r4r@~=6X}Jy$uDRsS zYl^hyOexgsT0pQCgPk$CD+D6uloHDmkJ!xno=MR@z-@qoolw(uMZs${8;kNz0jiC= zh}WEX&~gl-7;RhSddd2$qrs3#5_d*NrHs17X+7PnA#W_ITnJ0VlH2b^a|~~8e14Mr z(R12BLct1In3a+B__RR$6s#b3E6WtPGORI0I_ZTRG(gtMfLfqS7~|Y?j<@6b3|U~ zYgvi<#kL>pS57~8-j{3^UjpcfwkUyPX*Op$CViGTfPU5iJkK1(^`>*U+}QC{A_{S( z)EUwCO0fpcOBOq$+IZU;ur;qKC_c0S+U&Xj(pz!{%PWK5?{Y;lcg4(U4&WAr&+9Wf z1{?+2`?|D~DDIg{$PeSj)GJfpU( zf&tNiX113ykr*Nep3r*u^-TrtI z8E?~>8p4aAJ?_$p$1VueVEffUfD+`iT{xAmD+Mb-oHp^Mo4t=uiCq|XhSQ9ims z&y8B-F%`3JUToY~!wWZMsNzo4}h( zv}L&xkk==74b@U_lZk>T(*5v7!Ni(!w7yp9eI>MT%Bdb5izEWhH&TWgb5rplzLv=a zx+cL;g^*l1GF)u*{XM&fvta79mmfF2m*q2#YO5KMAVLR%3fx-~Ip2_GJVO|EZt16; z4ey&Kkkuu+pQ|QJ2XvDH^VPxb+yz9=&e0i+0jkN4p(Qc9Q<`p4rKagwu@nf_=@&Yj z(5EaDp`s1JR(UX|sJ~lk*ep;1Mwhr5d33vC-!rVN2TSiWnO6Uv@Uj2*;0}B>9AEJN zVFZ8a{}b?c|L@ztA6fr%O*R*=p|c^yelEN?dZ{f&anp4EsEN9E$Pm2`54uH`4dKHf z99S2nU#F+hpqbZz;B6wi$Dj_sJ?ia!0N1tiQIYo_Iv4ETkkdYd_|hTfT>DK?-;Ra) zOtWlzi;WiTBN3io`(fM+6$?*dytiOzRn>JX`EcRI7E}gSmw+}@JPpS`;dlA%r)%iV(+#YA^dq0jwx*fNppr}We+<-xh zJwIH(xkQ2QMfc-1>$EL8H+szMzke8{m#3GCj{N8K&;R*J8rDVo(3MyBA6&{m{_Jt` ze-%EO|6c?tz6Opj#Qz`?e!2fg!o>Ia|C_+4^Z$`c{{MBF8q&}Iei@4oiS5I3Fqn|H zN~5PmPY?xDKSC7l!VdZ2n(cnutNGAl`7k6J#d-Lc-#*}oUH?UjyBqme3{l&bA7``w z_l3NxmOj=e3|nNkEt${f{|_&HKKcLoF5B}GfFX&11^=8k-Qo7j&ukNy>Z+b9j z?yZIU9Hcw{SPs^iXsXN2TAZlM_qVh}w-;BD{nzWqC-+w7!wVm^Jet=_VMW5=ym_~=}7o*oczo4|kU&5$4Mvi$LWj=oM)y@Gu5 zynrVL%jn&^LC+fB?P~w}!TX_bY}Ccx`}^5$RMg>nweP@(O&tyvXAp<|`jqz8{rl_P zO_#5Xef^iwPk%*6Z%61)FBmikxxak>57%yAFz+9C_hQNY!|>AR^*{T+|9W-zue@ndp(W?$)j%G_s)`i#fkM@)M5?mwP!$omy6@~cm?sCN$Q!4veeix<_qdqqCD ze0^7P*Z<)ntk(mAABJqBA?f8N)0=a8&GoT|HhM)L#qEDI7fE)H|Y!xa5;3_q96^YPxlaGi`_56gteGNYe9^XBb7JMR& zAM^-LFTLu~$mvH?a=E?`{#gdxOS)&ojdHK>ud2}tayZA}i%j_~q+h9+I7QcfA{cO+ 
z=I)+$;4>&52k|dsxPR{@g1*`7!4O}6A^Wp!v_BF)vj4x{vCj|i1^*uz{@KUN_%DHc zxBtHdeChm0W;p*jcs&1E3R240AqALXD@FzVa^fJh&IuZx{ND2)C;!9VwJs~FBB|v<7@CA`*zL>Xp`Q`IV+v;AB%ec)4$x@jkW|w zqsW;PN`~`5nYW<@bc^|X4fzaw4%hJ|SVxaC{n2O>T9m~4^*ZM_a~t5GJmKmS3+E^1 zu;0Pv^+qO*TT@XEq$xPp=EiEQs7+01MhOS@SWBQXN@{ImWVG+a?gH>VD`59t&`?4X z<~9mCj6pww2XzF*gdJn?Ltkv#jbz~|ti+OCMH(F79-Qasf||567#M*bf^BuwuzT!k zF|>G5>WRN;K99>xco@9?zVjnR&-nXA~K>(avO zZiQqeamcKz1ei#R6M)?cQFHDV1k@mKdR7t$aO}o0gDs+L(-3gbK!hF!ci6+qY;$QY zlhDKxHKG+w3+?s+es2%piJm)o0B<)fZ*@4}HjQZW;e5J^*P-XA^?ziNC;sIwGOSJVjmgKrRGLZ*w~|nz^3i>bn^^RHmDQ4k=NB4OFhD&y>jy zS_~0#*mDkbTErM2SA~XApGyQ+Pu8B&FvWn=k^0nUkV!j;J6$(OAx??GMw5*srVG(| z;#$_BO`1f$Rr&xiG)0n?IQ<~>gLIdXYZH^lLbgHw1Ar3$pOzQ)f4}M4Us=AC|J{M^ z=l^!+$NK;GDgVs%Kh<%TYP3n+!ouRB#lQa7rxHXAcl=cUxK_)xv3KPU7e&-^A$!7} z`{~^Uc@8)M&)LSlXr}RSg+f={dcvW5eALRef4%G^oL^Q&n&+EG6uIN|YY1}{WZO@t zCTbt6aJBY3a0*P0VF_FAFZj zX#SUe|7ZRG+rhqH%9s5A6aM?P{}286|L^;huf>0t;MAEXwuzuN>M|jWXI&KR5M3Lm z|K0une}og@%*r>#e}}o@Y!1h3dANory>{LbsJy|`umWdu@i6aZ!=N>2li`-J8DKJ2 zguxtJehBZbc+kVs_q<01l(54 z^5d1iXt@$*K~t1V$z*7IBDoSaOfb`Qp&g40cev>G6%VFZO|pZT#kxty8%cc@7{y+C zn9N zH9f$#+2Fd&ifM*f$L?7E3;6G^5q}l_yMhzlHi_-PE{w3g)AWh;5GU)Zgg3*kA4XX? 
z^w6Hc0G)MA57B@~njwbF+LL!i z_n>bkK)A|7ChuZ(8`p?QtJlvugHe}5Httm9)4A=h_VuJr==0o}k;+<~I(=m*baTUO z;i*EB05z3H(z>+`rUs{>X0!_93E7Mm`_jM!pZbZ3LKMgkA{ zcGO#u0MQSn9XHOpQlPsm(SrqwXJy|c1Fk8D`+>Y)FZ8~ob=zhL&;A3#nrHL>w}XAY zl-K?L22_GQ-}(RD?vL|7ze{<^|9@AzLi#=BB$nBKgULCavHCaum=jqmt7U3rRXfZD z*-njg%i9#KplRY4(L-E45d^v1S@H^Jx%{g=Qnkd&j2cdr4Oo%y9o_#UPI3wUe7y%y zR9>=E;p4-jm0aBEvk(5?6f<^K{ytu`ciKR>wER%* zU%jhMt9ay;fBINZzZjgn6&Sk+Npv+)OVHB`G&U_S`>22}k6{-4@aJRHQMSwSTHTAOmjU_y~YVTlh3eEhsdmF02$ib*laOPoUKuN@N>cak)NBci1r}zTYR9Fh^k`tNFY4jU{oEPR;AHdYs1$)d9kG~wVf3D$l>|WPdmQPlMde1`d7I6M|6@i2up=W^0ff}*SW6W65#*g2G#yJF};TIf5x;vxpK=h zOQ%+f>5XC;eu-8Zdl9Ywv9ON#NH(htsx_<5>WA-RsHE6^%CJD)r7a96wRW%t_tRJ+_evUGLh^3qS0 zmE(oC;d?cLkxEZ028xTNgmR^lA4~U&rv&2o-7g7j|8tpc%89BZy7&~8m@g1qyANmW zzCb3hTtPHsi?@NrNKg!HUEO}!?_eWg(bn*`` z2Th#5TsN z#Zz<3V1N8Bn&*|&r=(cWEgruQ_JszDvEf?cJFn}XBl4ZEccNiLdPz)Ke8_)n{6RI8 zc0{4e!bChZ+83i;G1eSGol26o=;79v{4QVqxfUSkU#^F17x@(}=w%#s)Wuh+pqiB4 zAf)G;#gy>=vg26$%U^2Mz4v9R*V&D@)Ojw)mGA!6k68pUMxtuwqiyWtj}KZBIgk0n za{*sIR_3`Tuj7F~Zcty=+1(p~`&jt@{`YS+)~nJWZ&CHm;B{_c8)dTTDH6F(DZLNy ziU+5mrxvT8mF$l|cU3HM^rXOI*uQPzeiW!>MpOdIRa26xRhfxDpME1ezvs4vyAD)4 zlGW5xrE)-bZfjhJfYqGgsHs;>ZBNS;Mg5~umbGsyUoRO{MUVWTTu8WGO?Yw?e<{V+ zja9nYlo9G}Yfk~=Y5Fp%ndstPAAfzlZt3(x)UQvewzbCh+!Xv={@*t^_Ty5%u>aSD z8{Oymf9)UZf8V3L<@}ETeLDZ+r5v}{Qrv*IStKcl~uSebknhWAcFxrvFknA834-I^72ES|hO+gp6&Tlj+I zaihy4J~r?948^Y!ZP)cSYYC!6)!6|&$|NvK1=y6#_yF^;k>2H216^pU_`zyJ^R*jp zRcbG_{rRZXO4zvzu3Bl2V<4@s5I7L-Mrfxl%l5iAXZff_%=oY??`Xb(uQ`;(>`Y@@ zIML=|1W^Kdj_C9(S&PT>KSHJtfFjfIo0x@W>5A%b(4@D>+8cM}`sRjVNr*kztxj9+ zD`Y4zT^nQim>$g-^=c^g`Tf{mS%N>}x|o(x%(4 z7&>?#=_Ga3C3C_R>e!YC#IY-zMV}w9hNwuB!#YJSkP$D4BexzWTGU)6VLf%2waaXI zbPi&R6sIs|PVPc4Q1l(Zg)p|`G*xtsys#{#KF#Xb;;3Z@#S^?wFJyUVSl!VW?DP}+ zupss(9T~f(-ZX&Fv&So^2x3;L6bd^aE zv4epz?&@wkGjl+5CoMMT8y=Z#b$%L6u!c8}8v?I)1D==n)Iqb%S?VzuEOhzCA%kh3 zv#@Zii`pP=`@ndh%{_CpYv$@q&c)aYV|LY}GB{$W;1I{FVc*{)9Eauo&O~2;6M3J6 zH+iBz&H(v$C>Q_l+aLRJDPNEO+iZ7!;QxM?a`*j9Se!H}!n7mq_i3}?55D7z_#d6S 
z{NM8Y_aEm!f0yza{>MGQMr0(>ewkRV;iu*8CUibC(|3!&KR#7{AP-jX1eZ}h^u2QX z{5g<_rB@&uC!<@ss;6!)M}ocpsi6kJQmYor-``h?-*19+d4KmkITg9nJ^H@&=Vxnr zsk`*oX`3>gmH(pUPC{v&6--3+~V5y(MHt z*$0X#sXT}{xdtb?X8eYERM#2biVeZ1Ess|7D-K>71CG7D5W_1gEgd5>%Cm=|_wa9b zG;02w=8ELg<@(cauX8vxir(ubmSaqh-R~_d2A4|ZdCkxra~sul)ruwPGd!KHe!o?2 zm6Y*j)ncknm+P_pD#gl=9-cS<;>dWht0UOQKWQ8AlJ&d^jwPvF^AsLwFLVXh`@YEHoh+Q_$^Siu z)vka4C7p{Tdgqr3_H_?n$MtVhvS%~?w+91U|3{%Y{%_m;0Xq!jW~!~(Du1C}Wc1`@ z&YVkrSti&N$*e4|5?Rs)Z>jezCLHgIrRMWZGp8bKuF8Y>qE1}oYEnJKrDQ&_@Pn;F zuTl7Bnn{MPcvCs`qeyo{IUBR5)&R&RiOE__OM7)6w;hLm@b#%17MN$=WVqP?jUx31 zXcRhJ1CU1J{1>zzU!u2we(=&W{DAWZ000000000000000003|hJ^>lYuV(F}tEACL-eSi*5ad#M)cmLnNb6!@- z$uf&1lnF>M{~vJYhJMML^36}%uXum`$#7NtH9hEnH01;wV} zggiMrux}~>Q=|6fu8jv`ttkO3{WX`reo=^ENE&XVFA#>1TcqgcT>=GF_9-POPY~3t z(3H&KWzsk}3ySqm5u~9}!eW}*E4p!CIGQIPjZ!`=x;a*$Y7@b~Lre_q55sRc+#)4h zLs0sH+oA2n3~$*3-*McGicTBYfAVn3Xogsa6wn{fU1&7jqr$(P=0PytCEB49{j$Bt zZj$yZ15&i+nf59Ef6>m5iOI7|wf0(b7 z#s!vJ*qS~ap03PW#$b~+4#wpS1Khi-#;Jsq#u;?QH6PLt3kQvOr?kzzdiyW?ZBRc- z&vH#ZjH2F2yzc$qFMjvqH?P$8=G0#oLppYWMB8 zatztDnF;Zxme|SmP z{+-T6Q%kmgDd2@qaZv4GzUN)2aXph~MsVTD3kw3}O^E|sm<$)?cfXmPm%(E3xDyjk zS_+wwH8xEOvd~fBl@{mk+Ogjl8fJ(}B{d$i)?0ZCr7=w8n4~4Dzrp*vlvmRiBHroz zQMthh=<;?L+#_3wmxAGZh)0n0Ax=`_1x9V~{?MR3WLHd7oJCa10ipIuy*XxP*esb3 zv7#X#m+RWKc&6%2=2a@I;#OJ)7$#>Xb`7mHg|d65A3muH#&4*OoPDLt1LP#T)d

kY*LoZpd8U&ZboWufHzEiX3n*9?3%7-(^Ri)67m|NEo_(^&_}VJUJ4M8yQAKF z-yMi^)N}DY@eZZ5DJ?6}H3Ux3T;{L$R6`aQ&wl7aWoCSy*9H==`xJsPoE zVq;ndcu+ba%Na?wr|)D;*Pt&IuV4lFPOPZ?&GDLGfiuo`2$5mSXl*1^eD+YYdXu?j zODL~ayGM*JGTvjPYQ$$O$D`>lJjOY8cIpn;>YePtOK~nLYUVE5yi~zTIlpTMgCuPp zMJ}U#YB2I4=}3z1yv|OYBkl%h`T6x=+4bW1*kryh1wmm(pZ{D`>Be5M3Pp+`EbrR6 zzM2M@$>SkgN1j~gCzW$tnd@<`as3ClNwh{X%(jo2>~YvEknN>fZQdsqduLVG=?BaG zyLI6DZ*;(3qh?y1-n4C=2Z0~6;oC=3+`l0I#-hBp2u+pQGD;$&x6u8?D;L0 z8yhD$tQBX-{!;H&5+bc2U)Ro2!40%G6`4uCT743$U{U;yar5ZgJT#gDQdK5g+OiqA zOroyta;>=|l}!mtZgG5IHw(@8$H7{O+hZ^+ z3wZ#W@^6u!sNKsCLFAV;`}67=L6y7HO@iAe*%Xg^yFMCbm2_h74B9Jg4-733a~==!}D#rGDq4WBn>KLTemV2N)XQ0i z_+})+_5}~Tyojz+^)iS~+{+s4#I5Xz(n=$=X8Np${smx>xhtugwQh_fz6bFIx8nIplUvl4P=n)5A+ zS%=nv_XNd4%0G9SH!18=?@P5!zc%@I_%bmtuXn{}Z?WfISOxQbP1q1pe@qM}aw1+j zqS5A~Zss~Qp+8r1?AIE_RegN$kJ-CfEy~@TvCkwp3(nx~v2!f6K5&e50mj zqw)yze>WwgPzErSsQa)0?sjcZ8Q#PeH5jfij)@q9f;)2my=%MZ5b~LajsIp;%=@sU zDmYl_5}IBL>e!(+KtZWj%^?vUsR_xATaNmR);2>Sz(5y+;Czt=0Cj+V*^G5`{pCoq zs9}=NNyG#sxF|uIyHX-Pq=u8sD5_j z%aF}Ox|wY@tONNGE^2u#e#2gm4ECg;Cr=eA{yY~ zV*bHaqls%g$DcpaAq1Rl-l-$Cl3}%;BjLT7$F4-{B;39HVu9_xrH+XJvd^tDznhsS zu1^l7;3^^HjTCD}GZ0&6m;ISRn4z8lXz+iRMgb<#hGOZnZHbj^+L zFOk46E%3qyEXbWjL?i_&K18Axf%6NioI_VZ^hSP?H!tBv#Tw2_r;xLaGv=&vn8@5#Vv;5ogB^uTR% z!o4`8-Rf0Bre6#yMvmB*_k*JV>K^_9fy`kMeY``S*p5R!!uSDk6ybOXC)h&zncKLs z5AQQmda)N@z{ALsj%}C!4mluS^TGCKiNYk%$al!@XTP8b2&&frwpzZRd~C>jD2)Up z^q?Nj(SP8P=Rm$sqmb{~m6|Fm9o{}0QrAr6x!zY?LlUYJtSw@YFJ6Io!K&1CZ;_3` z+Cr9bLwq87Crc4G)wJ*$TGZ0@Q~VSTS6x9og!621t6p%eQC z+eoUNCHFETry1}*pAfP5>DdYf)mHH885;k zE$Z~uq-7wvB0|}F`n0Yo=z#8?>i&@+sK#^O@giAbNa6MDbS>oL7e;MeFv;YM72;Qe z$8Dza*Y?uwI%p9cQ(yECjsqws0Tl9)JhFcE((fwWKVaYuSLtoJ;^U@vdTDO%rr2{5 zf0bR05)Ab$Oz1Zbv5;#g_oTA^7b@6zd2Mt~5Q6FxX4H}HA?TCf{yUKpBvPrAb_X6d znIX^kt~VS-r`Ve>kX$EB90qm|IM~A95bwy|S@R0M`+8II^!#=Q=ndcqI`!+N4(Leh z42X4?i1*zV73eZ8>3yTMztOz7W83W3P!DSyiK+ct2Zs=PJ9{ghT4eO_fRs)F1~CuT zyMnjiuI*!tN?42~GSC}DDgGFh>jg#rQOn#8Tkh=Mf-}Wb0Nq&O8`7L?>kF%zl5t 
zpU&>8$<%n|YFmrw^Kf--^7gtx`fB6e$5f(OZ{KX+fL!n^vfs-o{_SlE&n}2&Ls0Acw?BQqKd7 zbO`eYRiw!90PhLR@#WA|ozMVLH|o#)W#PXun{!a$S0liEa?)jX3b)i&mqyk&9AdOn zKjElYrr7bld#Y6!t;*G<$o82R%IVmk_l`tT%hM?!_}4otU1~e}Ex!~6;Y|71Lzre}Ur6M#OW>)gSQudyzYX%@mNl^@x zi_VIa-`e5A2(v8!z3xwJlmy#}DTjtJQ2j0*v)>>96&deQVmvl=ToEr;qa0sO738AjEzxg3n7r6yR`FQt?36xgvWYjx-6cB{70f4CCINCCkP z@7#l?1*PPIrpO1^WQs$yGVHO)QJrxOLf%78F1#W}4q^^%q-hh9^VquV_OIQu&Q8;- zVriEjSG)XNmQ45iwhq*9_^R&VQ#cFnEDvvqDpUft^pR6}D9J>DxQZqE6hbcyOTc&c zPuXTow82NxzSW+DH37p@aisJ*e8cc?WG?^{)piFvgj|1RT zt?@#lz6PV72;o03YV7G@z@hNO7;|6eQUM8XHfah@!5A=ID}yM~DX#sg@9W|llrKHk z&?2b?0n&Zc=YGTLHJ5j&a?49L6ec$#3u^lPv;R%3Pkes@Znu z+<+(g?yq(RdY;n^`Y|BUv~Li>dV51B@)ioB%=GP;iorrK91kx&(pc2ps&=oCKG_1W7tz^Go@~Kxp!{LV~^O$ z%Tk_p+|ibF3PMF|O81yUWnRT|0r_ex^CCxpvt|8$^b&T~-9mGx@$a?AK3o9Uv~gG- zq??ygJ4|*(0%1P@__bEICXeY6L9Ryu zkO_!9`r5AQA&NWc<^Y0BsiuKM%ZaeMn{WgNVk)~<1=rfn+nH|2>Hv*0`}iTmtX4y~ zsPEZ8yQK2X-1z2P>7Dy&G0Op0y1W~DW7~4wrB;73vO*<~p_6(f*V7FLVq#+@U_gB< z9>=#PF&DN^mXm5Rv5i?n8%}V{jo^U`koy>Mj&m^{?oZ}ix1xk+S z0H-ZSp5CMYR(I`i1uZ}gQF*RvLOJN3AajKm>%rUp;O@eLCN{5LEya9~npw|5n(>b9 zhsu7UGbQPCy!sj{N=ZO16U}&Oa7NXa^(!I|IYu~p+uh`3MH_BApufHRhZFA8+TG}# zGUO>A_&#`~^0KXV?hSS2H3}(&24x>5i$b+co|Mq*t>JiCKZ$+3x)xoArmJ@amrCR@ zi;@N)N_4Qm744)ZH!84SG;E4F#y`V&Psr-aW^ z8dk6w9IP?U0FSCn%#M58+DI!RriWAP+I-9Pko$oQDf~^95!U6Sp8D-Rc?mCn6FV#|7?Xp^rv-dlHn|NL6 ze%9Rz#o76{cOnge>g70Fqi{;kN&H&psr=nlBg9Vv={6xzTF4`L&#n>vrW?(KgZmrL$SMMW||E=Ej^he+_v48=Y zyO<31zA(6Tbm_2kbt~=hSl>$*ziEWyWy5>TGp{ZAB6Z8O#2VWOH?sBW^*KM2Nwk&) z``I)3%Dy!4kKX~GF7HR|-J(Ndt) zzu7ad!bHTS(G5d~{3P71?!uZ5-7nvJy)F{P+wr`Lbnt==2~Ule9M;|z;qM9?TdcA2 zHYNky8yzRY`mm4H0hSEmdkr~~bxfNwdO|rKg=ww2R;~3(P|ATk!Z)Z8>Dy-FO&in< zmK2%*(Lk$(wn9UakNBWa@Cm8cb8bLtI#LU?Ygk*@>d`q6h#NO#Bf|H)1&ODsO-}~= z$U2V-{39-jsre|MhDT&@8`^@_?|=zlYxDW6+}sE!a^L&+eXig#hN!+cn+sN2IB@aU_|Ej)0{MT=4T80@L-K`@9*pq~KFv_DI84$xpOo zyq8MM;>X&ZgUb&vm>38lXS-IyP-317g_p)LNnBwFE!irH7d(s`qJuES7_4;nA64x~ z%H8Tpo;%dl14+Ymakc)?H9vXF*zk%aXNVOS3m488x$eGbz)HH#)%#lTl4DR_G5f6H 
z-#JyVmSIF9ooZ`iq|n6)PPK^MH4nbTM^>K4M?M`ZmUYWyohWHoC(|Qx7+D)oWXqrp z(hCGy#Zq%nYft@%h>J`pexp*kTbw#2BuTrQy2WAJEBiBfKeZtVE&Be>5ea7#O0J}l zc1^4PGN*KJwnZ2(^gxD5zx*i{^}o6HfWFMgeUB-rASpwnhv5G~0FQ7=^qOz^QrI15 z{><9V3Cr3-o#q{x`n&d|BSKM>rT=S3KFAOVAS?Fy?K5c?ZQ#T?n`JT$<3+d^(ijT4 z4*e)2t;p-RLHylwDFOp+RE?&qhPWp-S?F<2yaz|(YP>+e(;Q4rR zsoh55;qjHzy#(rW>}v5-U$w!b8CHe>A862_ADnG${PlWzT+U=*O3sHPmvMEEup03p z6nh~NuQ?^ERpwq{#Ve%ei?M+D-T}>{>Ti5nTg!x0U^P64glNG=0b@W+y!=7;fH8s1 z2WyF|H`0B<;e#axwsY$Z>>2fjL#wGD`uKjL?^O)O?{$gdGRG=YoG+s2tAdRP1D?b; zG}AFXQ>A?+apDSrX7uDeGSz`bEdtjRdFS3~OLVO3L9|G8DR72?mO=$3ltt-mqys8U zN1eYY#k0@+xh6VadVu>wU*IL%sW@H{s9wZ%G9+ct^lty7qp~P4SguRVtCtb7F?6EVi zQasehH%HIo`xf`X)oU#$6RavPGRL;my{Q*wZf1wE2KDDab#TQPyr3R$gyMK~>dhYbfnMhYhXA8^b8+`lmCLO^?4 zI&?ds?x#W^-)l4`=vRVYueiro_5VO|Lhdi1Nje@&w$pt?Reccd#6IHZc`)W7M>a4jL=;d*>YA9*B>jHxzS zz!W!b2%|WSXrc#rc;=JzokNu&Z}?|$uZue}Y>U`102kBAd2S59IOtbKpxWt)fb zxf3GMDE|M3P%Lz%CUZ3b(%-)a-9GA!g~s8Mk;m2jP~jCW&h$V_!EnnOxS2cZ=*`@V z4pd9pG-7fYUpx5UB-04`Q&^)t*rne4K=zOn?t72yKV=~RO$ZmcWvfEx^# za*X>M*0SKYRR3XTr@a~{bh7VLE1PBC$r4hjmGE1*q{DZC;`b^TC(EoC$CNr* z=}j(7|FS(AHk3v-KuVleJD1G%DwOm7`ZULYwbjqRVwc^r)i=F1xVWU56mVVtdu1a% zW=W4qWXqUJSLR4*d>m7Qpc-tEJ4r61mhI|!?L{74k%MAIo(7|n*+B1wWsKD2dlKcj zCXDY|`tj=}3p2p#i_L*lt|A}2!`!q!J>aSw# z?NwcorqCTWGd<&7-;CI~JNNqF&pOS2_)Y92R2b3CY#XPd8X z=KFpFtEoBX?(uRydXyojQJkv~yasmucb_JKzc$qy0w0yVwHKN?F+xnJQ zeOGG#B93ES1(RZby3oVJwqkrOtFq&UzAuKyAJ0zr&GZVi&oMY@OxcT7ij^N)4B#=# zwXO58u9;rR|9th-%C6aKLtro$;9F$HVUa77JVyE{F40#gu*$NQ{#i?D+;(|2Ldd*q zdXXu~pq#gQntw^YoPWnV%@qZ_VAEmG2l-G3X!*(Nd#LZa(=+S zNHb{93@TH2X8x-yQ3s#e2)UdDqW{+wDK#|@ACedB@r#R9Iw}YPn*4rOtLF?IQR9ki z9=uOMeSFo~bYTq)B#_2syf>{S)h~Kp&A2|f`ox9R>jtrhF?a&T021S)ux(bhfI8=# zE$(4w{~GGtt|OyW!37hB4vYCI`OZDvnjtsdjpJgFQ#lIY$zGrbJd2>PZH?Z6AY(I& z>c%y(zTYfh>RL2F#w%<;>~iWH^!4_6ap~gPe!=idf|2!0Yj*z`d!?2G;{$^hP2~aV z4_9etv3>sf7=zVMK8Il~pqVZ{M7-rKb8(o8@ozjDi2;3U^$jU-<<5jZx~qiD@cK~< zzfJ!2@jo-c$p7AX>Ef~=n(wOe>CLNc3n{i?E3WgKNPi!pmZ^&-uzSRP)BDnauNSa9 
z4e`I^s#%`}`%ydITuS41tLIzQBmQ}mtG7$ov)cq!n+Nwd3maFM;jM#(X$B~Eve?wL z92qflb=86z|IMhaYF{b->nukVjEpk=^mwo zmH8q5!jSDtL=fuAZ&nQHzX}IIlkMGglloOrefx)JqRo27Y9W+SpqFj5 zk55R6&}}%5n~*=q{A6ISwq{gS@?UP>`N?l}tLEbaP+qqA(j z^V7a3Lw3%4mz`i&%>{A1+dGh_YGDDE7Zz zOYO0fXjvEwJf?IDdT!vKBkK4k1*Z>z9adtj9Y_U;a=-OiBINPIEOmzP1Olaa{s-L` z^bfWkPht#yOR(sh!B^o)s-`k=6jOY-i$I77$QUT2YYi&pz*<)BxThfIjs@>%cTM;0 z-7yc7z3}d*QoU^ol%5^OcV=TS*KK{rnIqGQI|)_j$k(t-NE=^)JWM_uRc+nIU0dl- z&5UWFXx)4|;(5J393Io~B|_(z5Tof=3+dhGU=aTaeIz2d3;A|9TLR~Oz&Uv3|M@x| z+7jGBX#{OK;eCMm3?4o}@$*&FLt$fbcPrPEZiZHX3+X9Bxa@vsl8wlP>s0TG)^QTo z|0Yodsm@I7 Date: Thu, 23 Oct 2025 10:10:35 +0200 Subject: [PATCH 232/337] drop creating synthetic data unnecessary for this test --- runIntegrationTests.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 9af5e0db..95f491af 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -530,7 +530,6 @@ case "$1" in check_wrong_startup_kit) create_startup_kits_and_check_contained_files - create_synthetic_data verify_wrong_certificates_are_rejected cleanup_temporary_data # TODO add to CI if we want this From 109d19e43129c83b32d4a9ef6020737b5ab0bbe4 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 23 Oct 2025 10:13:14 +0200 Subject: [PATCH 233/337] implemented test that admin console with invalid certificate is rejected by server --- runIntegrationTests.sh | 10 +++++++++- tests/integration_tests/_attemptAdminConsoleLogin.exp | 8 ++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) create mode 100755 tests/integration_tests/_attemptAdminConsoleLogin.exp diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 95f491af..4cdf3b51 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -363,7 +363,15 @@ verify_wrong_certificates_are_rejected () { fi # start admin console and verify that it gets rejected - echo "TODO" + cd admin@test.odelia/startup + 
CONSOLE_OUTPUT_ADMIN=$("$CWD"/tests/integration_tests/_attemptAdminConsoleLogin.exp) + if echo "$CONSOLE_OUTPUT_ADMIN" | grep -q "Communication Error - please try later"; then + echo "Connection rejected successfully" + else + echo "Connection with non-authorized admin console" + exit 1 + fi + cd ../.. # cleanup docker kill odelia_swarm_server_flserver_$CONTAINER_VERSION_SUFFIX odelia_swarm_client_client_A_$CONTAINER_VERSION_SUFFIX diff --git a/tests/integration_tests/_attemptAdminConsoleLogin.exp b/tests/integration_tests/_attemptAdminConsoleLogin.exp new file mode 100755 index 00000000..00a51ee7 --- /dev/null +++ b/tests/integration_tests/_attemptAdminConsoleLogin.exp @@ -0,0 +1,8 @@ +#!/usr/bin/env expect + +spawn ./docker.sh --no_pull +expect "User Name: " +send "admin@test.odelia\r" +sleep 30 +expect "Trying to login, please wait ..." +expect "Communication Error - please try later" From 0bbd5102928c7a33e5338b188f981c044c1a9b24 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 23 Oct 2025 13:39:58 +0200 Subject: [PATCH 234/337] check if startup kit has option for local training --- runIntegrationTests.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 4cdf3b51..14df3124 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -166,6 +166,13 @@ create_startup_kits_and_check_contained_files () { fi done + if grep -q "\-\-local_training" "$PROJECT_DIR/prod_01/client_A/startup/docker.sh"; then + echo "Expected option for running local training found" + else + echo "Missing option for running local training" + exit 1 + fi + ZIP_CONTENT=$(unzip -tv "$PROJECT_DIR/prod_01/client_B_${VERSION}.zip") for FILE in 'client.crt' 'client.key' 'docker.sh' 'rootCA.pem'; do From b73e66618d09e292d0e8ef87ee05543fb71fc11c Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 23 Oct 2025 15:20:58 +0200 Subject: [PATCH 235/337] check whether continue-on-error works as expected --- 
.github/workflows/pr-test.yaml | 22 +++++++++++----------- runIntegrationTests.sh | 1 + 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/.github/workflows/pr-test.yaml b/.github/workflows/pr-test.yaml index bc773807..cf794c02 100644 --- a/.github/workflows/pr-test.yaml +++ b/.github/workflows/pr-test.yaml @@ -48,56 +48,56 @@ jobs: run: ./buildDockerImageAndStartupKits.sh -p tests/provision/dummy_project_for_testing.yml --use-docker-cache - name: Run integration test checking documentation on github - continue-on-error: false + continue-on-error: true run: | ./runIntegrationTests.sh check_files_on_github - name: Run controller unit tests - continue-on-error: false + continue-on-error: true run: | ./runIntegrationTests.sh run_unit_tests_controller - name: Run dummy training standalone - continue-on-error: false + continue-on-error: true run: | ./runIntegrationTests.sh run_dummy_training_standalone - name: Run dummy training in simulation mode - continue-on-error: false + continue-on-error: true run: | ./runIntegrationTests.sh run_dummy_training_simulation_mode - name: Run dummy training in proof-of-concept mode - continue-on-error: false + continue-on-error: true run: | ./runIntegrationTests.sh run_dummy_training_poc_mode - name: Run 3DCNN training in simulation mode - continue-on-error: false + continue-on-error: true run: | ./runIntegrationTests.sh run_3dcnn_simulation_mode - name: Run integration test creating startup kits - continue-on-error: false + continue-on-error: true run: | ./runIntegrationTests.sh create_startup_kits - name: Run intergration test listing licenses - continue-on-error: false + continue-on-error: true run: | ./runIntegrationTests.sh run_list_licenses - name: Run integration test Docker GPU preflight check - continue-on-error: false + continue-on-error: true run: | ./runIntegrationTests.sh run_docker_gpu_preflight_check - name: Run integration test Data access preflight check - continue-on-error: false + continue-on-error: true 
run: | ./runIntegrationTests.sh run_data_access_preflight_check - name: Run dummy training in swarm - continue-on-error: false + continue-on-error: true run: | ./runIntegrationTests.sh run_dummy_training_in_swarm diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 14df3124..ba317950 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -16,6 +16,7 @@ fi check_files_on_github () { + exit 1 echo "[Run] Test whether expected content is available on github" LICENSE_ON_GITHUB=$(curl -L https://github.com/KatherLab/MediSwarm/raw/refs/heads/main/LICENSE) From 2d8df026638fd59d4a400617bb153b4d28130915 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 23 Oct 2025 15:35:21 +0200 Subject: [PATCH 236/337] testing if: always() --- .github/workflows/pr-test.yaml | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/.github/workflows/pr-test.yaml b/.github/workflows/pr-test.yaml index cf794c02..8062c2b8 100644 --- a/.github/workflows/pr-test.yaml +++ b/.github/workflows/pr-test.yaml @@ -48,56 +48,62 @@ jobs: run: ./buildDockerImageAndStartupKits.sh -p tests/provision/dummy_project_for_testing.yml --use-docker-cache - name: Run integration test checking documentation on github - continue-on-error: true + continue-on-error: false run: | ./runIntegrationTests.sh check_files_on_github - name: Run controller unit tests - continue-on-error: true + continue-on-error: false run: | ./runIntegrationTests.sh run_unit_tests_controller - name: Run dummy training standalone - continue-on-error: true + continue-on-error: false run: | ./runIntegrationTests.sh run_dummy_training_standalone - name: Run dummy training in simulation mode - continue-on-error: true + continue-on-error: false run: | ./runIntegrationTests.sh run_dummy_training_simulation_mode - name: Run dummy training in proof-of-concept mode - continue-on-error: true + continue-on-error: false run: | ./runIntegrationTests.sh run_dummy_training_poc_mode - name: 
Run 3DCNN training in simulation mode - continue-on-error: true + continue-on-error: false run: | ./runIntegrationTests.sh run_3dcnn_simulation_mode - name: Run integration test creating startup kits - continue-on-error: true + continue-on-error: false run: | ./runIntegrationTests.sh create_startup_kits - name: Run intergration test listing licenses - continue-on-error: true + continue-on-error: false run: | ./runIntegrationTests.sh run_list_licenses - name: Run integration test Docker GPU preflight check - continue-on-error: true + continue-on-error: false run: | ./runIntegrationTests.sh run_docker_gpu_preflight_check - name: Run integration test Data access preflight check - continue-on-error: true + continue-on-error: false run: | ./runIntegrationTests.sh run_data_access_preflight_check - name: Run dummy training in swarm - continue-on-error: true + continue-on-error: false run: | ./runIntegrationTests.sh run_dummy_training_in_swarm + + - name: Kill orphaned containers from this run (if any) + continue-on-error: false + if: always() + run: | + echo "TODO clean up" From 1685a0cdb6671fa1a2f4ca816fe00cad2d8c3339 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 23 Oct 2025 15:56:26 +0200 Subject: [PATCH 237/337] kill potentially orphaned containers at the end of the test --- .github/workflows/pr-test.yaml | 2 +- runIntegrationTests.sh | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/.github/workflows/pr-test.yaml b/.github/workflows/pr-test.yaml index 8062c2b8..3ab36d4b 100644 --- a/.github/workflows/pr-test.yaml +++ b/.github/workflows/pr-test.yaml @@ -106,4 +106,4 @@ jobs: continue-on-error: false if: always() run: | - echo "TODO clean up" + ./runIntegrationTests.sh kill_server_and_clients diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index ba317950..c7794fa9 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -16,7 +16,6 @@ fi check_files_on_github () { - exit 1 echo "[Run] Test whether expected 
content is available on github" LICENSE_ON_GITHUB=$(curl -L https://github.com/KatherLab/MediSwarm/raw/refs/heads/main/LICENSE) @@ -468,8 +467,8 @@ run_dummy_training_in_swarm () { kill_server_and_clients () { - echo "[Cleanup] Kill server and client Docker containers ..." - docker kill odelia_swarm_server_flserver_$CONTAINER_VERSION_SUFFIX odelia_swarm_client_client_A_$CONTAINER_VERSION_SUFFIX odelia_swarm_client_client_B_$CONTAINER_VERSION_SUFFIX + echo "[Cleanup] Kill server and client Docker containers if running ..." + docker kill odelia_swarm_server_flserver_$CONTAINER_VERSION_SUFFIX odelia_swarm_client_client_A_$CONTAINER_VERSION_SUFFIX odelia_swarm_client_client_B_$CONTAINER_VERSION_SUFFIX || true } @@ -560,6 +559,10 @@ case "$1" in cleanup_temporary_data ;; + kill_server_and_clients) + kill_server_and_clients + ;; + all | "") check_files_on_github run_unit_tests_controller From 8776c545b0250c22da2c46f048ca41c7f405317c Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 24 Oct 2025 08:41:41 +0200 Subject: [PATCH 238/337] check if containers were killed --- runIntegrationTests.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index c7794fa9..fc664356 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -469,6 +469,7 @@ run_dummy_training_in_swarm () { kill_server_and_clients () { echo "[Cleanup] Kill server and client Docker containers if running ..." 
docker kill odelia_swarm_server_flserver_$CONTAINER_VERSION_SUFFIX odelia_swarm_client_client_A_$CONTAINER_VERSION_SUFFIX odelia_swarm_client_client_B_$CONTAINER_VERSION_SUFFIX || true + docker ps } From bcd362eac9067994b9944388a19210d4a1cca746 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 24 Oct 2025 09:22:07 +0200 Subject: [PATCH 239/337] try killing orphaned containers by name pattern --- runIntegrationTests.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index fc664356..e44283af 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -470,6 +470,8 @@ kill_server_and_clients () { echo "[Cleanup] Kill server and client Docker containers if running ..." docker kill odelia_swarm_server_flserver_$CONTAINER_VERSION_SUFFIX odelia_swarm_client_client_A_$CONTAINER_VERSION_SUFFIX odelia_swarm_client_client_B_$CONTAINER_VERSION_SUFFIX || true docker ps + docker kill $(docker container ls -q --filter name=odelia_swarm_server_flserver_\*) $(docker container ls -q --filter name=odelia_swarm_client_\*) || true + docker ps } From 28d15c2ebde4dd3ca72dc19794fcd14baa61f21c Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 24 Oct 2025 09:23:10 +0200 Subject: [PATCH 240/337] Revert "try killing orphaned containers by name pattern" This reverts commit bcd362eac9067994b9944388a19210d4a1cca746. --- runIntegrationTests.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index e44283af..fc664356 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -470,8 +470,6 @@ kill_server_and_clients () { echo "[Cleanup] Kill server and client Docker containers if running ..." 
docker kill odelia_swarm_server_flserver_$CONTAINER_VERSION_SUFFIX odelia_swarm_client_client_A_$CONTAINER_VERSION_SUFFIX odelia_swarm_client_client_B_$CONTAINER_VERSION_SUFFIX || true docker ps - docker kill $(docker container ls -q --filter name=odelia_swarm_server_flserver_\*) $(docker container ls -q --filter name=odelia_swarm_client_\*) || true - docker ps } From 9dfc9f0fac9208c8879fc5fa6d21604252ffc7d1 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 24 Oct 2025 09:23:16 +0200 Subject: [PATCH 241/337] Revert "check if containers were killed" This reverts commit 8776c545b0250c22da2c46f048ca41c7f405317c. --- runIntegrationTests.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index fc664356..c7794fa9 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -469,7 +469,6 @@ run_dummy_training_in_swarm () { kill_server_and_clients () { echo "[Cleanup] Kill server and client Docker containers if running ..." docker kill odelia_swarm_server_flserver_$CONTAINER_VERSION_SUFFIX odelia_swarm_client_client_A_$CONTAINER_VERSION_SUFFIX odelia_swarm_client_client_B_$CONTAINER_VERSION_SUFFIX || true - docker ps } From 8bbfc274ac7e0769d858851e8965779f0a8ed5f0 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 24 Oct 2025 09:27:20 +0200 Subject: [PATCH 242/337] setup for local openvpn server (same as in dev-122-vpn-from-within-container branch, but without hard-coded certificates) --- tests/local_vpn/Dockerfile_openvpnserver | 11 + tests/local_vpn/README.txt | 17 + tests/local_vpn/_build_docker.sh | 6 + .../_openvpn_certificate_creation.sh | 67 ++++ tests/local_vpn/_openvpn_start.sh | 31 ++ tests/local_vpn/ca_user/ca_setup.sh | 19 ++ tests/local_vpn/client_configs/.gitignore | 1 + tests/local_vpn/client_configs/client.conf | 138 ++++++++ tests/local_vpn/client_configs/make_ovpn.sh | 18 ++ .../local_vpn/create_openvpn_certificates.sh | 5 + tests/local_vpn/run_docker_openvpnserver.sh | 5 + 
tests/local_vpn/server_config/.gitignore | 1 + tests/local_vpn/server_config/ccd/client_A | 1 + tests/local_vpn/server_config/ccd/client_B | 1 + tests/local_vpn/server_config/server.conf | 304 ++++++++++++++++++ 15 files changed, 625 insertions(+) create mode 100644 tests/local_vpn/Dockerfile_openvpnserver create mode 100644 tests/local_vpn/README.txt create mode 100755 tests/local_vpn/_build_docker.sh create mode 100644 tests/local_vpn/_openvpn_certificate_creation.sh create mode 100644 tests/local_vpn/_openvpn_start.sh create mode 100755 tests/local_vpn/ca_user/ca_setup.sh create mode 100644 tests/local_vpn/client_configs/.gitignore create mode 100755 tests/local_vpn/client_configs/client.conf create mode 100755 tests/local_vpn/client_configs/make_ovpn.sh create mode 100755 tests/local_vpn/create_openvpn_certificates.sh create mode 100755 tests/local_vpn/run_docker_openvpnserver.sh create mode 100644 tests/local_vpn/server_config/.gitignore create mode 100644 tests/local_vpn/server_config/ccd/client_A create mode 100644 tests/local_vpn/server_config/ccd/client_B create mode 100755 tests/local_vpn/server_config/server.conf diff --git a/tests/local_vpn/Dockerfile_openvpnserver b/tests/local_vpn/Dockerfile_openvpnserver new file mode 100644 index 00000000..8270f8fa --- /dev/null +++ b/tests/local_vpn/Dockerfile_openvpnserver @@ -0,0 +1,11 @@ +FROM ubuntu:22.04 + +RUN apt update +RUN apt install -y easy-rsa openvpn openssl ufw joe patch +RUN apt install -y openssh-server net-tools + +RUN useradd ca_user + +COPY _openvpn_certificate_creation.sh / +COPY _openvpn_start.sh / +RUN chmod u+x /*.sh diff --git a/tests/local_vpn/README.txt b/tests/local_vpn/README.txt new file mode 100644 index 00000000..5cc3e826 --- /dev/null +++ b/tests/local_vpn/README.txt @@ -0,0 +1,17 @@ +# Following https://www.digitalocean.com/community/tutorials/how-to-set-up-and-configure-an-openvpn-server-on-ubuntu-20-04 +# but on 22.04 + +Setup +----- +./create_openvpn_certificates.sh builds a 
docker image and creates certificates and .ovpn config files for the clients specified in _openvpn_certificate_creation.sh +Modify server_config/server.conf and client_configs/client.conf to modify network configuration. +Files to use on the server and client are created in server_config/ and client_configs/ + +Usage +----- +./run_docker_openvpnserver.sh builds a docker image and starts OpenVPN server in the docker container. +Modify _openvpn_start.sh for further firewall etc. configuration. + +Disclaimer +---------- +This configuration is not necessarily secure and should not be re-used unless you know what you are doing. diff --git a/tests/local_vpn/_build_docker.sh b/tests/local_vpn/_build_docker.sh new file mode 100755 index 00000000..0df1ce0f --- /dev/null +++ b/tests/local_vpn/_build_docker.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +# TODO should this be named "latest"? Do we need to pin versions? +# TODO think about splitting building certificates from running the VPN container + +docker build -t odelia_testing_openvpnserver:latest .
-f Dockerfile_openvpnserver diff --git a/tests/local_vpn/_openvpn_certificate_creation.sh b/tests/local_vpn/_openvpn_certificate_creation.sh new file mode 100644 index 00000000..a815f001 --- /dev/null +++ b/tests/local_vpn/_openvpn_certificate_creation.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash + +# Roughly following https://www.digitalocean.com/community/tutorials/how-to-set-up-and-configure-an-openvpn-server-on-ubuntu-20-04 +# but on 22.04 + +chown ca_user:ca_user /home/ca_user/ -R +chmod a+rwX /home/ca_user/ -R +/bin/su - -c '/home/ca_user/ca_setup.sh' ca_user + +mkdir ~/easy-rsa +ln -s /usr/share/easy-rsa/* ~/easy-rsa/ +cd ~/easy-rsa + +echo 'set_var EASYRSA_ALGO "ec"' > vars +echo 'set_var EASYRSA_DIGEST "sha512"' >> vars + +./easyrsa init-pki + +rm /server_config/ca.crt \ + /server_config/server.crt \ + /server_config/server.key \ + /server_config/ta.key -f + +rm -rf /client_configs/keys +mkdir -p /client_configs/keys/ + +export EASYRSA_BATCH=1 +./easyrsa gen-req server nopass + +cp ~/easy-rsa/pki/reqs/server.req /tmp/ +chmod a+r /tmp/server.req +/bin/su - -c "export EASYRSA_BATCH=1 && cd ~/easy-rsa/ && ./easyrsa import-req /tmp/server.req server && ./easyrsa sign-req server server" ca_user + +cd ~/easy-rsa +openvpn --genkey secret ta.key +cp ta.key /client_configs/keys/ +cp /home/ca_user/easy-rsa/pki/ca.crt /client_configs/keys/ + +# copy/create files to where they are needed +cp /home/ca_user/easy-rsa/pki/ca.crt /server_config/ +cp /home/ca_user/easy-rsa/pki/issued/server.crt /server_config/ +cp ~/easy-rsa/pki/private/server.key /server_config/ +cp ~/easy-rsa/ta.key /server_config/ + +mkdir /server_config/ccd + +i=4 +for client in testserver.local admin@test.odelia client_A client_B; do + cd ~/easy-rsa + EASYRSA_BATCH=1 EASYRSA_REQ_CN=$client ./easyrsa gen-req $client nopass + cp pki/private/$client.key /client_configs/keys/ + + cp ~/easy-rsa/pki/reqs/$client.req /tmp/ + chmod a+r /tmp/$client.req + /bin/su - -c "export EASYRSA_BATCH=1 && cd ~/easy-rsa/ && 
./easyrsa import-req /tmp/$client.req $client && ./easyrsa sign-req client $client" ca_user + cp /home/ca_user/easy-rsa/pki/issued/$client.crt /client_configs/keys/ + + cd /client_configs + ./make_ovpn.sh $client + + echo "ifconfig-push 10.8.0."$i" 255.0.0.0" > /server_config/ccd/$client + i=$((i+1)) +done + +chmod a+rwX /client_configs -R +chmod a+rwX /server_config -R +chmod a+rwX /home/ca_user -R diff --git a/tests/local_vpn/_openvpn_start.sh b/tests/local_vpn/_openvpn_start.sh new file mode 100644 index 00000000..62d1a864 --- /dev/null +++ b/tests/local_vpn/_openvpn_start.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash + +echo "net.ipv4.ip_forward = 1" >> /etc/sysctl.conf +sysctl -p + +echo "MTBhMTEsMTkKPiAjIFNUQVJUIE9QRU5WUE4gUlVMRVMKPiAjIE5BVCB0YWJsZSBydWxlcwo+ICpuYXQKPiA6UE9TVFJPVVRJTkcgQUNDRVBUIFswOjBdCj4gIyBBbGxvdyB0cmFmZmljIGZyb20gT3BlblZQTiBjbGllbnQgdG8gZXRoMCAoY2hhbmdlIHRvIHRoZSBpbnRlcmZhY2UgeW91IGRpc2NvdmVyZWQhKQo+IC1BIFBPU1RST1VUSU5HIC1zIDEwLjguMC4wLzggLW8gZXRoMCAtaiBNQVNRVUVSQURFCj4gQ09NTUlUCj4gIyBFTkQgT1BFTlZQTiBSVUxFUwo+IAo=" | base64 -d > before.rules.patch +patch /etc/ufw/before.rules before.rules.patch +rm before.rules.patch + +echo "MTljMTkKPCBERUZBVUxUX0ZPUldBUkRfUE9MSUNZPSJEUk9QIgotLS0KPiBERUZBVUxUX0ZPUldBUkRfUE9MSUNZPSJBQ0NFUFQiCg==" | base64 -d > ufw.patch +patch /etc/default/ufw ufw.patch +rm ufw.patch + +ufw allow 9194/udp +ufw allow OpenSSH +ufw disable +ufw enable + +cp /server_config/ca.crt /etc/openvpn/server/ +cp /server_config/server.conf /etc/openvpn/server/ +cp /server_config/server.crt /etc/openvpn/server/ +cp /server_config/server.key /etc/openvpn/server/ +cp /server_config/ta.key /etc/openvpn/server/ +cp /server_config/ccd /etc/openvpn/ccd -r + +# write log to folder on host +cd server_config + +nohup openvpn --duplicate-cn --client-to-client --config /etc/openvpn/server/server.conf & +sleep 2 +chmod a+r /server_config/nohup.out diff --git a/tests/local_vpn/ca_user/ca_setup.sh b/tests/local_vpn/ca_user/ca_setup.sh new file mode 100755 
index 00000000..e7ac3992 --- /dev/null +++ b/tests/local_vpn/ca_user/ca_setup.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +rm -rf ~/easy-rsa +mkdir ~/easy-rsa +ln -s /usr/share/easy-rsa/* ~/easy-rsa/ +cd ~/easy-rsa +./easyrsa init-pki + +echo 'set_var EASYRSA_REQ_COUNTRY "DE"' > vars +echo 'set_var EASYRSA_REQ_PROVINCE "Bremen"' >> vars +echo 'set_var EASYRSA_REQ_CITY "Bremen"' >> vars +echo 'set_var EASYRSA_REQ_ORG "ODELIA_MEVIS"' >> vars +echo 'set_var EASYRSA_REQ_EMAIL "admin@mevis.odelia"' >> vars +echo 'set_var EASYRSA_REQ_OU "Testing"' >> vars +echo 'set_var EASYRSA_ALGO "ec"' >> vars +echo 'set_var EASYRSA_DIGEST "sha512"' >> vars + +export EASYRSA_BATCH=1 +./easyrsa build-ca nopass diff --git a/tests/local_vpn/client_configs/.gitignore b/tests/local_vpn/client_configs/.gitignore new file mode 100644 index 00000000..dee56957 --- /dev/null +++ b/tests/local_vpn/client_configs/.gitignore @@ -0,0 +1 @@ +*.* \ No newline at end of file diff --git a/tests/local_vpn/client_configs/client.conf b/tests/local_vpn/client_configs/client.conf new file mode 100755 index 00000000..49669b71 --- /dev/null +++ b/tests/local_vpn/client_configs/client.conf @@ -0,0 +1,138 @@ +############################################## +# Sample client-side OpenVPN 2.0 config file # +# for connecting to multi-client server. # +# # +# This configuration can be used by multiple # +# clients, however each client should have # +# its own cert and key files. # +# # +# On Windows, you might want to rename this # +# file so it has a .ovpn extension # +############################################## + +# Specify that we are a client and that we +# will be pulling certain config file directives +# from the server. +client + +# Use the same setting as you are using on +# the server. +# On most systems, the VPN will not function +# unless you partially or fully disable +# the firewall for the TUN/TAP interface. 
+;dev tap +dev tun + +# Windows needs the TAP-Win32 adapter name +# from the Network Connections panel +# if you have more than one. On XP SP2, +# you may need to disable the firewall +# for the TAP adapter. +;dev-node MyTap + +# Are we connecting to a TCP or +# UDP server? Use the same setting as +# on the server. +;proto tcp +proto udp + +# The hostname/IP and port of the server. +# You can have multiple remote entries +# to load balance between the servers. +remote 172.17.0.1 9194 + +# Choose a random host from the remote +# list for load-balancing. Otherwise +# try hosts in the order specified. +;remote-random + +# Keep trying indefinitely to resolve the +# host name of the OpenVPN server. Very useful +# on machines which are not permanently connected +# to the internet such as laptops. +resolv-retry infinite + +# Most clients don't need to bind to +# a specific local port number. +nobind + +# Downgrade privileges after initialization (non-Windows only) +user nobody +group nogroup + +# Try to preserve some state across restarts. +persist-key +persist-tun + +# If you are connecting through an +# HTTP proxy to reach the actual OpenVPN +# server, put the proxy server/IP and +# port number here. See the man page +# if your proxy server requires +# authentication. +;http-proxy-retry # retry on connection failures +;http-proxy [proxy server] [proxy port #] + +# Wireless networks often produce a lot +# of duplicate packets. Set this flag +# to silence duplicate packet warnings. +;mute-replay-warnings + +# SSL/TLS parms. +# See the server config file for more +# description. It's best to use +# a separate .crt/.key file pair +# for each client. A single ca +# file can be used for all clients. + +# Verify server certificate by checking that the +# certificate has the correct key usage set. 
+# This is an important precaution to protect against +# a potential attack discussed here: +# http://openvpn.net/howto.html#mitm +# +# To use this feature, you will need to generate +# your server certificates with the keyUsage set to +# digitalSignature, keyEncipherment +# and the extendedKeyUsage to +# serverAuth +# EasyRSA can do this for you. +remote-cert-tls server + +# If a tls-auth key is used on the server +# then every client must also have the key. +;tls-auth ta.key 1 + +# Select a cryptographic cipher. +# If the cipher option is used on the server +# then you must also specify it here. +# Note that v2.4 client/server will automatically +# negotiate AES-256-GCM in TLS mode. +# See also the data-ciphers option in the manpage +;cipher AES-256-CBC +cipher AES-256-GCM + +auth SHA256 + +# Enable compression on the VPN link. +# Don't enable this unless it is also +# enabled in the server config file. +#comp-lzo + +# Set log file verbosity. +verb 3 + +# Silence repeating messages +;mute 20 + +key-direction 1 + +; script-security 2 +; up /etc/openvpn/update-resolv-conf +; down /etc/openvpn/update-resolv-conf + +; script-security 2 +; up /etc/openvpn/update-systemd-resolved +; down /etc/openvpn/update-systemd-resolved +; down-pre +; dhcp-option DOMAIN-ROUTE . 
diff --git a/tests/local_vpn/client_configs/make_ovpn.sh b/tests/local_vpn/client_configs/make_ovpn.sh new file mode 100755 index 00000000..6a73d7f7 --- /dev/null +++ b/tests/local_vpn/client_configs/make_ovpn.sh @@ -0,0 +1,18 @@ +#!/bin/bash + +# First argument: Client identifier + +KEY_DIR=./keys +BASE_CONFIG=./client.conf + +cat ${BASE_CONFIG} \ + <(echo -e '') \ + ${KEY_DIR}/ca.crt \ + <(echo -e '\n') \ + ${KEY_DIR}/${1}.crt \ + <(echo -e '\n') \ + ${KEY_DIR}/${1}.key \ + <(echo -e '\n') \ + ${KEY_DIR}/ta.key \ + <(echo -e '') \ + > ${1}_client.ovpn diff --git a/tests/local_vpn/create_openvpn_certificates.sh b/tests/local_vpn/create_openvpn_certificates.sh new file mode 100755 index 00000000..91aef0dc --- /dev/null +++ b/tests/local_vpn/create_openvpn_certificates.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash + +./_build_docker.sh + +docker run --rm -v ./ca_user:/home/ca_user -v ./client_configs:/client_configs -v ./server_config:/server_config -p 9194:9194/udp --cap-add=NET_ADMIN --privileged --name odelia_testing_openvpnserver odelia_testing_openvpnserver:latest /bin/bash -c "./_openvpn_certificate_creation.sh" diff --git a/tests/local_vpn/run_docker_openvpnserver.sh b/tests/local_vpn/run_docker_openvpnserver.sh new file mode 100755 index 00000000..f501f811 --- /dev/null +++ b/tests/local_vpn/run_docker_openvpnserver.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash + +./_build_docker.sh + +docker run -d -t --rm -v ./ca_user:/home/ca_user -v ./server_config:/server_config -p 9194:9194/udp --cap-add=NET_ADMIN --privileged --name odelia_testing_openvpnserver odelia_testing_openvpnserver:latest /bin/bash -c "./_openvpn_start.sh && /bin/bash" diff --git a/tests/local_vpn/server_config/.gitignore b/tests/local_vpn/server_config/.gitignore new file mode 100644 index 00000000..dee56957 --- /dev/null +++ b/tests/local_vpn/server_config/.gitignore @@ -0,0 +1 @@ +*.* \ No newline at end of file diff --git a/tests/local_vpn/server_config/ccd/client_A 
b/tests/local_vpn/server_config/ccd/client_A new file mode 100644 index 00000000..2009193e --- /dev/null +++ b/tests/local_vpn/server_config/ccd/client_A @@ -0,0 +1 @@ +ifconfig-push 10.8.0.6 255.0.0.0 diff --git a/tests/local_vpn/server_config/ccd/client_B b/tests/local_vpn/server_config/ccd/client_B new file mode 100644 index 00000000..da607617 --- /dev/null +++ b/tests/local_vpn/server_config/ccd/client_B @@ -0,0 +1 @@ +ifconfig-push 10.8.0.7 255.0.0.0 diff --git a/tests/local_vpn/server_config/server.conf b/tests/local_vpn/server_config/server.conf new file mode 100755 index 00000000..8342f7be --- /dev/null +++ b/tests/local_vpn/server_config/server.conf @@ -0,0 +1,304 @@ +################################################# +# Sample OpenVPN 2.0 config file for # +# multi-client server. # +# # +# This file is for the server side # +# of a many-clients <-> one-server # +# OpenVPN configuration. # +# # +# OpenVPN also supports # +# single-machine <-> single-machine # +# configurations (See the Examples page # +# on the web site for more info). # +# # +# This config should work on Windows # +# or Linux/BSD systems. Remember on # +# Windows to quote pathnames and use # +# double backslashes, e.g.: # +# "C:\\Program Files\\OpenVPN\\config\\foo.key" # +# # +# Comments are preceded with '#' or ';' # +################################################# + +# Which local IP address should OpenVPN +# listen on? (optional) +;local a.b.c.d + +# Which TCP/UDP port should OpenVPN listen on? +# If you want to run multiple OpenVPN instances +# on the same machine, use a different port +# number for each one. You will need to +# open up this port on your firewall. +port 9194 + +# TCP or UDP server? +;proto tcp +proto udp + +# "dev tun" will create a routed IP tunnel, +# "dev tap" will create an ethernet tunnel. +# Use "dev tap0" if you are ethernet bridging +# and have precreated a tap0 virtual interface +# and bridged it with your ethernet interface. 
+# If you want to control access policies +# over the VPN, you must create firewall +# rules for the TUN/TAP interface. +# On non-Windows systems, you can give +# an explicit unit number, such as tun0. +# On Windows, use "dev-node" for this. +# On most systems, the VPN will not function +# unless you partially or fully disable +# the firewall for the TUN/TAP interface. +;dev tap +dev tun + +# Windows needs the TAP-Win32 adapter name +# from the Network Connections panel if you +# have more than one. On XP SP2 or higher, +# you may need to selectively disable the +# Windows firewall for the TAP adapter. +# Non-Windows systems usually don't need this. +;dev-node MyTap + +# SSL/TLS root certificate (ca), certificate +# (cert), and private key (key). Each client +# and the server must have their own cert and +# key file. The server and all clients will +# use the same ca file. +# +# See the "easy-rsa" directory for a series +# of scripts for generating RSA certificates +# and private keys. Remember to use +# a unique Common Name for the server +# and each of the client certificates. +# +# Any X509 key management system can be used. +# OpenVPN can also use a PKCS #12 formatted key file +# (see "pkcs12" directive in man page). +ca /etc/openvpn/server/ca.crt +cert /etc/openvpn/server/server.crt +key /etc/openvpn/server/server.key # This file should be kept secret + +# Diffie hellman parameters. +# Generate your own with: +# openssl dhparam -out dh1024.pem 1024 +# Substitute 2048 for 1024 if you are using +# 2048 bit keys. +;dh dh1024.pem +dh none + +# Configure server mode and supply a VPN subnet +# for OpenVPN to draw client addresses from. +# The server will take 10.8.0.1 for itself, +# the rest will be made available to clients. +# Each client will be able to reach the server +# on 10.8.0.1. Comment this line out if you are +# ethernet bridging. See the man page for more info.
+server 10.8.0.0 255.255.255.0 + +# Maintain a record of client <-> virtual IP address +# associations in this file. If OpenVPN goes down or +# is restarted, reconnecting clients can be assigned +# the same virtual IP address from the pool that was +# previously assigned. +ifconfig-pool-persist ipp.txt + +# Configure server mode for ethernet bridging. +# You must first use your OS's bridging capability +# to bridge the TAP interface with the ethernet +# NIC interface. Then you must manually set the +# IP/netmask on the bridge interface, here we +# assume 10.8.0.4/255.255.255.0. Finally we +# must set aside an IP range in this subnet +# (start=10.8.0.50 end=10.8.0.100) to allocate +# to connecting clients. Leave this line commented +# out unless you are ethernet bridging. +;server-bridge 10.8.0.4 255.255.255.0 10.8.0.50 10.8.0.100 + +# Configure server mode for ethernet bridging +# using a DHCP-proxy, where clients talk +# to the OpenVPN server-side DHCP server +# to receive their IP address allocation +# and DNS server addresses. You must first use +# your OS's bridging capability to bridge the TAP +# interface with the ethernet NIC interface. +# Note: this mode only works on clients (such as +# Windows), where the client-side TAP adapter is +# bound to a DHCP client. +;server-bridge + +# Push routes to the client to allow it +# to reach other private subnets behind +# the server. Remember that these +# private subnets will also need +# to know to route the OpenVPN client +# address pool (10.8.0.0/255.255.255.0) +# back to the OpenVPN server. +;push "route 192.168.10.0 255.255.255.0" +;push "route 192.168.20.0 255.255.255.0" + +# To assign specific IP addresses to specific +# clients or if a connecting client has a private +# subnet behind it that should also have VPN access, +# use the subdirectory "ccd" for client-specific +# configuration files (see man page for more info). 
+ +# EXAMPLE: Suppose the client +# having the certificate common name "Thelonious" +# also has a small subnet behind his connecting +# machine, such as 192.168.40.128/255.255.255.248. +# First, uncomment out these lines: +;client-config-dir ccd +;route 192.168.40.128 255.255.255.248 +# Then create a file ccd/Thelonious with this line: +# iroute 192.168.40.128 255.255.255.248 +# This will allow Thelonious' private subnet to +# access the VPN. This example will only work +# if you are routing, not bridging, i.e. you are +# using "dev tun" and "server" directives. + +# EXAMPLE: Suppose you want to give +# Thelonious a fixed VPN IP address of 10.9.0.1. +# First uncomment out these lines: +client-config-dir ccd +;route 10.9.0.0 255.255.255.252 +# Then add this line to ccd/Thelonious: +# ifconfig-push 10.9.0.1 10.9.0.2 + +# Suppose that you want to enable different +# firewall access policies for different groups +# of clients. There are two methods: +# (1) Run multiple OpenVPN daemons, one for each +# group, and firewall the TUN/TAP interface +# for each group/daemon appropriately. +# (2) (Advanced) Create a script to dynamically +# modify the firewall in response to access +# from different clients. See man +# page for more info on learn-address script. +;learn-address ./script + +# If enabled, this directive will configure +# all clients to redirect their default +# network gateway through the VPN, causing +# all IP traffic such as web browsing and +# and DNS lookups to go through the VPN +# (The OpenVPN server machine may need to NAT +# or bridge the TUN/TAP interface to the internet +# in order for this to work properly). +;push "redirect-gateway def1 bypass-dhcp" + +# Certain Windows-specific network settings +# can be pushed to clients, such as DNS +# or WINS server addresses. CAVEAT: +# http://openvpn.net/faq.html#dhcpcaveats +# The addresses below refer to the public +# DNS servers provided by opendns.com. 
+;push "dhcp-option DNS 208.67.222.222" +;push "dhcp-option DNS 208.67.220.220" + +# Uncomment this directive to allow different +# clients to be able to "see" each other. +# By default, clients will only see the server. +# To force clients to only see the server, you +# will also need to appropriately firewall the +# server's TUN/TAP interface. +;client-to-client + +# Uncomment this directive if multiple clients +# might connect with the same certificate/key +# files or common names. This is recommended +# only for testing purposes. For production use, +# each client should have its own certificate/key +# pair. +# +# IF YOU HAVE NOT GENERATED INDIVIDUAL +# CERTIFICATE/KEY PAIRS FOR EACH CLIENT, +# EACH HAVING ITS OWN UNIQUE "COMMON NAME", +# UNCOMMENT THIS LINE OUT. +;duplicate-cn + +# The keepalive directive causes ping-like +# messages to be sent back and forth over +# the link so that each side knows when +# the other side has gone down. +# Ping every 2 seconds, assume that remote +# peer is down if no ping received during +# a 10 second time period. +keepalive 2 10 + +# For extra security beyond that provided +# by SSL/TLS, create an "HMAC firewall" +# to help block DoS attacks and UDP port flooding. +# +# Generate with: +# openvpn --genkey --secret ta.key +# +# The server and each client must have +# a copy of this key. +# The second parameter should be '0' +# on the server and '1' on the clients. +;tls-auth ta.key 0 # This file is secret +tls-crypt /etc/openvpn/server/ta.key + +# Select a cryptographic cipher. +# This config item must be copied to +# the client config file as well. +;cipher BF-CBC # Blowfish (default) +;cipher AES-128-CBC # AES +;cipher DES-EDE3-CBC # Triple-DES +cipher AES-256-GCM + +auth SHA256 + +# Enable compression on the VPN link. +# If you enable it here, you must also +# enable it in the client config file. +;comp-lzo + +# The maximum number of concurrently connected +# clients we want to allow.
+;max-clients 100 + +# It's a good idea to reduce the OpenVPN +# daemon's privileges after initialization. +# +# You can uncomment this out on +# non-Windows systems. +user nobody +group nogroup + +# The persist options will try to avoid +# accessing certain resources on restart +# that may no longer be accessible because +# of the privilege downgrade. +persist-key +persist-tun + +# Output a short status file showing +# current connections, truncated +# and rewritten every minute. +status openvpn-status.log + +# By default, log messages will go to the syslog (or +# on Windows, if running as a service, they will go to +# the "\Program Files\OpenVPN\log" directory). +# Use log or log-append to override this default. +# "log" will truncate the log file on OpenVPN startup, +# while "log-append" will append to it. Use one +# or the other (but not both). +;log openvpn.log +;log-append openvpn.log + +# Set the appropriate level of log +# file verbosity. +# +# 0 is silent, except for fatal errors +# 4 is reasonable for general usage +# 5 and 6 can help to debug connection problems +# 9 is extremely verbose +verb 3 + +# Silence repeating messages. At most 20 +# sequential messages of the same message +# category will be output to the log. 
+;mute 20 From 1b9d91afe9e7a467a86e5c43068cb05e5b494e10 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 24 Oct 2025 09:47:29 +0200 Subject: [PATCH 243/337] throttle VPN to 30 Mbit/s --- tests/local_vpn/_openvpn_start.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/local_vpn/_openvpn_start.sh b/tests/local_vpn/_openvpn_start.sh index 62d1a864..059cbd6c 100644 --- a/tests/local_vpn/_openvpn_start.sh +++ b/tests/local_vpn/_openvpn_start.sh @@ -29,3 +29,6 @@ cd server_config nohup openvpn --duplicate-cn --client-to-client --config /etc/openvpn/server/server.conf & sleep 2 chmod a+r /server_config/nohup.out + +tc qdisc add dev eth0 root tbf rate 30mbit burst 5mbit limit 16gbit + From 77039de68971ad4d886870ad6852c8142ddd1ae3 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 24 Oct 2025 10:07:52 +0200 Subject: [PATCH 244/337] upgraded to ubuntu 24.04 base image, upgrade packages, install iproute2 for tc --- tests/local_vpn/Dockerfile_openvpnserver | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/local_vpn/Dockerfile_openvpnserver b/tests/local_vpn/Dockerfile_openvpnserver index 8270f8fa..931bd276 100644 --- a/tests/local_vpn/Dockerfile_openvpnserver +++ b/tests/local_vpn/Dockerfile_openvpnserver @@ -1,8 +1,9 @@ -FROM ubuntu:22.04 +FROM ubuntu:24.04 RUN apt update +RUN apt upgrade -y RUN apt install -y easy-rsa openvpn openssl ufw joe patch -RUN apt install -y openssh-server net-tools +RUN apt install -y openssh-server iproute2 RUN useradd ca_user From 72bf2f4308afcc33aa26ce2e4c2a05ab1cb71c36 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 24 Oct 2025 10:28:25 +0200 Subject: [PATCH 245/337] configured testing swarm with two clients and modified ports --- application/provision/project_MEVIS_test.yml | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/application/provision/project_MEVIS_test.yml b/application/provision/project_MEVIS_test.yml index b4e962c1..a36b5ea1 100644 --- 
a/application/provision/project_MEVIS_test.yml +++ b/application/provision/project_MEVIS_test.yml @@ -4,29 +4,17 @@ description: > Test setup. participants: - - name: odeliaswarmvm.local + - name: odelia-vm-a-localvpn type: server org: MEVIS_Test - fed_learn_port: 8002 - admin_port: 8003 + fed_learn_port: 8022 + admin_port: 8023 - name: CAM type: client org: MEVIS_Test - name: MHA type: client org: MEVIS_Test - - name: RUMC - type: client - org: MEVIS_Test - - name: UKA - type: client - org: MEVIS_Test - - name: UMCU - type: client - org: MEVIS_Test - - name: Centralized - type: client - org: MEVIS_Test - name: admin@mevis.odelia type: admin org: MEVIS_Test @@ -66,7 +54,7 @@ builders: # overseer_exists: false args: - sp_end_point: odeliaswarmvm.local:8002:8003 + sp_end_point: odeliaswarmvm.local:8022:8023 - path: nvflare.lighter.impl.cert.CertBuilder - path: nvflare.lighter.impl.signature.SignatureBuilder From d407ce6f2806830dbcc724c85aff104cc801873e Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 24 Oct 2025 11:03:28 +0200 Subject: [PATCH 246/337] vpn test server --- tests/local_vpn/client_configs/client.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/local_vpn/client_configs/client.conf b/tests/local_vpn/client_configs/client.conf index 49669b71..298fd2ec 100755 --- a/tests/local_vpn/client_configs/client.conf +++ b/tests/local_vpn/client_configs/client.conf @@ -39,7 +39,7 @@ proto udp # The hostname/IP and port of the server. # You can have multiple remote entries # to load balance between the servers. -remote 172.17.0.1 9194 +remote odelia-vm-a 9194 # Choose a random host from the remote # list for load-balancing. 
Otherwise From 8eaa994695ff7a27e92bd9869ab5932089faeb35 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 24 Oct 2025 11:18:07 +0200 Subject: [PATCH 247/337] omit tests/local_vpn from Docker image --- buildDockerImageAndStartupKits.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/buildDockerImageAndStartupKits.sh b/buildDockerImageAndStartupKits.sh index 654e63cd..c490123d 100755 --- a/buildDockerImageAndStartupKits.sh +++ b/buildDockerImageAndStartupKits.sh @@ -32,7 +32,7 @@ CONTAINER_VERSION_ID=`git rev-parse --short HEAD` CWD=`pwd` CLEAN_SOURCE_DIR=`mktemp -d` mkdir $CLEAN_SOURCE_DIR/MediSwarm -rsync -ax --exclude workspace . $CLEAN_SOURCE_DIR/MediSwarm/ +rsync -ax --exclude workspace --exclude tests/local_vpn . $CLEAN_SOURCE_DIR/MediSwarm/ cd $CLEAN_SOURCE_DIR/MediSwarm git clean -x -q -f . cd docker_config/NVFlare From e677c4005f8209499e0c1b21f911fa60cb57a94a Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 24 Oct 2025 14:43:41 +0200 Subject: [PATCH 248/337] consistent server host name --- application/provision/project_MEVIS_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/application/provision/project_MEVIS_test.yml b/application/provision/project_MEVIS_test.yml index a36b5ea1..2701cb4d 100644 --- a/application/provision/project_MEVIS_test.yml +++ b/application/provision/project_MEVIS_test.yml @@ -54,7 +54,7 @@ builders: # overseer_exists: false args: - sp_end_point: odeliaswarmvm.local:8022:8023 + sp_end_point: odelia-vm-a-localvpn:8022:8023 - path: nvflare.lighter.impl.cert.CertBuilder - path: nvflare.lighter.impl.signature.SignatureBuilder From 5192c766dc2ddc971b7917a0e79cdeabe73f8650 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 27 Oct 2025 14:15:37 +0100 Subject: [PATCH 249/337] avoid nested quotes --- runIntegrationTests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 0249c69b..97e9de36 100755 --- 
a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -437,7 +437,7 @@ run_dummy_training_in_swarm () { 'FederatedClient - INFO - Got the new primary SP:' \ 'SwarmClientController - INFO - .*: accepted learn request from client_.' \ 'Gatherer - INFO - .*: Contribution from client_. ACCEPTED by the aggregator at round .' \ - "SwarmClientController - INFO - .*: Broadcasting learn task of round . to ['client_A', 'client_B']; aggr client is client_." + 'SwarmClientController - INFO - .*: Broadcasting learn task of round . to .*; aggr client is client_.' do if grep -q --regexp="$EXPECTED_OUTPUT" "$CONSOLE_OUTPUT"; then echo "Expected output $EXPECTED_OUTPUT found" From 2be94badc73e46d0502e1194f5a8d2c293a2a320 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 27 Oct 2025 14:47:43 +0100 Subject: [PATCH 250/337] wait an additional minute for training to finish --- runIntegrationTests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 97e9de36..c7028615 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -402,7 +402,7 @@ run_dummy_training_in_swarm () { cd admin@test.odelia/startup "$CWD"/tests/integration_tests/_submitDummyTraining.exp docker kill odelia_swarm_admin_$CONTAINER_VERSION_SUFFIX - sleep 60 + sleep 120 cd "$CWD" cd "$PROJECT_DIR"/prod_00/localhost/startup From 97704261531ab7ed9e9bf139a58e8cd2049ae65d Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 27 Oct 2025 15:11:34 +0100 Subject: [PATCH 251/337] no synthetic data needed for dummy training --- runIntegrationTests.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index c7028615..1d927569 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -558,7 +558,6 @@ case "$1" in run_dummy_training_in_swarm) create_startup_kits_and_check_contained_files - create_synthetic_data start_server_and_clients run_dummy_training_in_swarm kill_server_and_clients From 
ef25440cb4b5e80631fccaa127fefb07b680f6c5 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 27 Oct 2025 15:12:01 +0100 Subject: [PATCH 252/337] remove synthetic data before running dummy training --- runIntegrationTests.sh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 1d927569..0cd5d0ff 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -478,8 +478,13 @@ kill_server_and_clients () { } +cleanup_synthetic_data () { + echo "[Cleanup] Removing synthetic data ..." + rm -rf "$SYNTHETIC_DATA_DIR"/* +} + cleanup_temporary_data () { - echo "[Cleanup] Removing synthetic data, scratch directory, dummy workspace ..." + echo "[Cleanup] Removing synthetic data directory, scratch directory, dummy workspace ..." rm -rf "$SYNTHETIC_DATA_DIR" rm -rf "$SCRATCH_DIR" rm -rf "$PROJECT_DIR" @@ -584,6 +589,7 @@ case "$1" in run_docker_gpu_preflight_check run_data_access_preflight_check verify_wrong_certificates_are_rejected + cleanup_synthetic_data start_server_and_clients run_dummy_training_in_swarm kill_server_and_clients From 73cf8bf2be2daeff1ed768a28b194f2c3bdec95d Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 28 Oct 2025 11:42:13 +0100 Subject: [PATCH 253/337] NVFlare unit tests need write access --- runIntegrationTests.sh | 10 ++++++++-- tests/unit_tests/_run_nvflare_unit_tests.sh | 1 + 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 0cd5d0ff..71ee15ae 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -140,10 +140,16 @@ run_dummy_training_poc_mode(){ run_nvflare_unit_tests(){ echo "[Run] NVFlare unit tests" - _run_test_in_docker tests/unit_tests/_run_nvflare_unit_tests.sh + docker run --rm \ + --shm-size=16g \ + --ipc=host \ + --ulimit memlock=-1 \ + --ulimit stack=67108864 \ + --gpus="$GPU_FOR_TESTING" \ + --entrypoint=/MediSwarm/tests/unit_tests/_run_nvflare_unit_tests.sh \ + "$DOCKER_IMAGE" 
} - create_startup_kits_and_check_contained_files () { echo "[Prepare] Startup kits for test project ..." diff --git a/tests/unit_tests/_run_nvflare_unit_tests.sh b/tests/unit_tests/_run_nvflare_unit_tests.sh index 890406c2..13ce9ff0 100755 --- a/tests/unit_tests/_run_nvflare_unit_tests.sh +++ b/tests/unit_tests/_run_nvflare_unit_tests.sh @@ -3,6 +3,7 @@ set -e run_nvflare_unit_tests () { + chmod a+rwX /MediSwarm -R cd /MediSwarm/docker_config/NVFlare ./runtest.sh -c -r coverage report -m From a842d228c3a331628c05240a022e01462ef361ee Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 28 Oct 2025 13:32:20 +0100 Subject: [PATCH 254/337] wait longer for swarm training to finish --- runIntegrationTests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 71ee15ae..6ebf2b27 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -408,7 +408,7 @@ run_dummy_training_in_swarm () { cd admin@test.odelia/startup "$CWD"/tests/integration_tests/_submitDummyTraining.exp docker kill odelia_swarm_admin_$CONTAINER_VERSION_SUFFIX - sleep 120 + sleep 180 cd "$CWD" cd "$PROJECT_DIR"/prod_00/localhost/startup From 69785d050f644cd2dfa72484b9d53bdc5e1e9954 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 28 Oct 2025 13:32:29 +0100 Subject: [PATCH 255/337] print output if expected contents are missing for better debugging --- runIntegrationTests.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 6ebf2b27..f0f928dc 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -426,6 +426,7 @@ run_dummy_training_in_swarm () { echo "Expected output $EXPECTED_OUTPUT found" else echo "Expected output $EXPECTED_OUTPUT missing" + cat "$CONSOLE_OUTPUT" exit 1 fi done @@ -449,6 +450,7 @@ run_dummy_training_in_swarm () { echo "Expected output $EXPECTED_OUTPUT found" else echo "Expected output $EXPECTED_OUTPUT missing" + cat "$CONSOLE_OUTPUT" exit 1 
fi done From 9229c4b1b8f7c243777b31a8d8ccde40efd8952e Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 28 Oct 2025 13:57:43 +0100 Subject: [PATCH 256/337] Revert "wait longer for swarm training to finish" This reverts commit a842d228c3a331628c05240a022e01462ef361ee. --- runIntegrationTests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index f0f928dc..fb3b9d5a 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -408,7 +408,7 @@ run_dummy_training_in_swarm () { cd admin@test.odelia/startup "$CWD"/tests/integration_tests/_submitDummyTraining.exp docker kill odelia_swarm_admin_$CONTAINER_VERSION_SUFFIX - sleep 180 + sleep 120 cd "$CWD" cd "$PROJECT_DIR"/prod_00/localhost/startup From f1d4f9cbda65a4f9f04c486549c81b2686d82971 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 28 Oct 2025 14:14:28 +0100 Subject: [PATCH 257/337] removed own swarm server and client controllers, removed corresponding unit tests, use NVFlare controller classes instead --- .../app/config/config_fed_client.conf | 2 +- .../app/config/config_fed_server.conf | 2 +- .../cifar10/app/config/config_fed_client.conf | 2 +- .../cifar10/app/config/config_fed_server.conf | 2 +- .../app/config/config_fed_client.conf | 2 +- .../app/config/config_fed_server.conf | 2 +- controller/controller/__init__.py | 2 - controller/controller/gatherer.py | 215 ------- controller/controller/swarm_client_ctl.py | 528 ------------------ controller/controller/swarm_server_ctl.py | 120 ---- controller/setup.py | 12 - docker_config/Dockerfile_ODELIA | 5 - runIntegrationTests.sh | 11 - ...run_controller_unit_tests_with_coverage.sh | 15 - tests/unit_tests/controller/.coveragerc | 3 - tests/unit_tests/controller/test_gatherer.py | 260 --------- .../test_swarm_client_controller.py | 177 ------ .../test_swarm_server_controller.py | 212 ------- 18 files changed, 6 insertions(+), 1566 deletions(-) delete mode 100644 
controller/controller/__init__.py delete mode 100644 controller/controller/gatherer.py delete mode 100644 controller/controller/swarm_client_ctl.py delete mode 100644 controller/controller/swarm_server_ctl.py delete mode 100644 controller/setup.py delete mode 100755 tests/integration_tests/_run_controller_unit_tests_with_coverage.sh delete mode 100644 tests/unit_tests/controller/.coveragerc delete mode 100644 tests/unit_tests/controller/test_gatherer.py delete mode 100644 tests/unit_tests/controller/test_swarm_client_controller.py delete mode 100644 tests/unit_tests/controller/test_swarm_server_controller.py diff --git a/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf b/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf index 42ac6ebb..3b82afec 100644 --- a/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf +++ b/application/jobs/ODELIA_ternary_classification/app/config/config_fed_client.conf @@ -27,7 +27,7 @@ tasks = ["swarm_*"] executor { # client-side controller for training and logic and aggregation management - path = "controller.SwarmClientController" + path = "nvflare.app_common.ccwf.SwarmClientController" args { # train task must be implemented by Executor learn_task_name = "train" diff --git a/application/jobs/ODELIA_ternary_classification/app/config/config_fed_server.conf b/application/jobs/ODELIA_ternary_classification/app/config/config_fed_server.conf index fe11655d..c9b6cdd5 100644 --- a/application/jobs/ODELIA_ternary_classification/app/config/config_fed_server.conf +++ b/application/jobs/ODELIA_ternary_classification/app/config/config_fed_server.conf @@ -13,7 +13,7 @@ workflows = [ { # server-side controller to manage job life cycle id = "swarm_controller" - path = "controller.SwarmServerController" + path = "nvflare.app_common.ccwf.SwarmServerController" args { # can also set aggregation clients and train clients, see class for all available args num_rounds 
= 20 diff --git a/application/jobs/cifar10/app/config/config_fed_client.conf b/application/jobs/cifar10/app/config/config_fed_client.conf index b10895db..a639cf81 100755 --- a/application/jobs/cifar10/app/config/config_fed_client.conf +++ b/application/jobs/cifar10/app/config/config_fed_client.conf @@ -22,7 +22,7 @@ executors = [ tasks = ["swarm_*"] executor { # client-side controller for training and logic and aggregration management - path = "controller.SwarmClientController" + path = "nvflare.app_common.ccwf.SwarmClientController" args { # train task must be implemented by Executor learn_task_name = "train" diff --git a/application/jobs/cifar10/app/config/config_fed_server.conf b/application/jobs/cifar10/app/config/config_fed_server.conf index f4f2fd7c..ff387ba2 100755 --- a/application/jobs/cifar10/app/config/config_fed_server.conf +++ b/application/jobs/cifar10/app/config/config_fed_server.conf @@ -23,7 +23,7 @@ workflows = [ { # server-side controller to manage job life cycle id = "swarm_controller" - path = "controller.SwarmServerController" + path = "nvflare.app_common.ccwf.SwarmServerController" args { # can also set aggregation clients and train clients, see class for all available args num_rounds = 3 diff --git a/application/jobs/minimal_training_pytorch_cnn/app/config/config_fed_client.conf b/application/jobs/minimal_training_pytorch_cnn/app/config/config_fed_client.conf index ff18ef95..92c80ae5 100644 --- a/application/jobs/minimal_training_pytorch_cnn/app/config/config_fed_client.conf +++ b/application/jobs/minimal_training_pytorch_cnn/app/config/config_fed_client.conf @@ -27,7 +27,7 @@ tasks = ["swarm_*"] executor { # client-side controller for training and logic and aggregation management - path = "controller.SwarmClientController" + path = "nvflare.app_common.ccwf.SwarmClientController" args { # train task must be implemented by Executor learn_task_name = "train" diff --git 
a/application/jobs/minimal_training_pytorch_cnn/app/config/config_fed_server.conf b/application/jobs/minimal_training_pytorch_cnn/app/config/config_fed_server.conf index 0ece834f..2f2dafc5 100644 --- a/application/jobs/minimal_training_pytorch_cnn/app/config/config_fed_server.conf +++ b/application/jobs/minimal_training_pytorch_cnn/app/config/config_fed_server.conf @@ -13,7 +13,7 @@ workflows = [ { # server-side controller to manage job life cycle id = "swarm_controller" - path = "controller.SwarmServerController" + path = "nvflare.app_common.ccwf.SwarmServerController" args { # can also set aggregation clients and train clients, see class for all available args num_rounds = 5 diff --git a/controller/controller/__init__.py b/controller/controller/__init__.py deleted file mode 100644 index 32bff9fa..00000000 --- a/controller/controller/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .swarm_client_ctl import SwarmClientController -from .swarm_server_ctl import SwarmServerController diff --git a/controller/controller/gatherer.py b/controller/controller/gatherer.py deleted file mode 100644 index 878ce560..00000000 --- a/controller/controller/gatherer.py +++ /dev/null @@ -1,215 +0,0 @@ -import threading -import time -import numpy as np - -from nvflare.apis.fl_component import FLComponent -from nvflare.apis.fl_constant import ReturnCode -from nvflare.apis.fl_context import FLContext -from nvflare.apis.shareable import Shareable, make_reply -from nvflare.app_common.abstract.aggregator import Aggregator -from nvflare.app_common.abstract.metric_comparator import MetricComparator -from nvflare.app_common.app_constant import AppConstants -from nvflare.app_common.app_event_type import AppEventType -from nvflare.app_common.ccwf.client_ctl import ClientSideController -from nvflare.app_common.ccwf.common import Constant -from nvflare.security.logging import secure_format_traceback - - -class _TrainerStatus: - """ - Internal class to keep track of trainer's status including reply 
time. - """ - def __init__(self, name: str): - self.name = name - self.reply_time = None - - -class Gatherer(FLComponent): - """ - Gatherer class responsible for gathering and aggregating training results from multiple clients - during the swarm learning process. - """ - def __init__( - self, - task_data: Shareable, - fl_ctx: FLContext, - for_round: int, - executor: ClientSideController, - aggregator: Aggregator, - metric_comparator: MetricComparator, - all_clients: list, - trainers: list, - min_responses_required: int, - wait_time_after_min_resps_received: float, - timeout, - ): - super().__init__() - self.fl_ctx = fl_ctx - self.executor = executor - self.aggregator = aggregator - self.metric_comparator = metric_comparator - self.all_clients = all_clients - self.trainers = trainers - self.for_round = for_round - self.trainer_statuses = {} - self.start_time = time.time() - self.timeout = timeout - - for t in trainers: - self.trainer_statuses[t] = _TrainerStatus(t) - if min_responses_required <= 0 or min_responses_required >= len(trainers): - min_responses_required = len(trainers) - self.min_responses_required = min_responses_required - self.wait_time_after_min_resps_received = wait_time_after_min_resps_received - self.min_resps_received_time = None - self.lock = threading.Lock() - self.current_best_client = task_data.get_header(Constant.CLIENT) - self.current_best_global_metric = task_data.get_header(Constant.METRIC) - self.current_best_round = task_data.get_header(Constant.ROUND) - if not self.current_best_client: - self.log_info(fl_ctx, "gatherer starting from scratch") - else: - self.log_info( - fl_ctx, - f"gatherer starting with previous best result from client {self.current_best_client} " - f"with metric {self.current_best_global_metric} " - f"at round {self.current_best_round}", - ) - - def gather(self, client_name: str, result: Shareable, fl_ctx: FLContext) -> Shareable: - """ - Gather the results from a client and perform aggregation if applicable. 
- """ - with self.lock: - try: - return self._do_gather(client_name, result, fl_ctx) - except Exception as e: - self.log_error(fl_ctx, f"Exception gathering: {secure_format_traceback()}") - return make_reply(ReturnCode.EXECUTION_EXCEPTION) - - def _do_gather(self, client_name: str, result: Shareable, fl_ctx: FLContext) -> Shareable: - result_round = result.get_header(AppConstants.CURRENT_ROUND) - ts = self.trainer_statuses.get(client_name) - if not ts: - self.log_error( - fl_ctx, f"Received result from {client_name} for round {result_round}, but it is not a trainer" - ) - return make_reply(ReturnCode.EXECUTION_EXCEPTION) - - if result_round > self.for_round: - self.log_error( - fl_ctx, - f"Logic error: received result from {client_name} for round {result_round}, " - f"which is > gatherer's current round {self.for_round}", - ) - self.executor.update_status(action="gather", error=ReturnCode.EXECUTION_EXCEPTION) - return make_reply(ReturnCode.EXECUTION_EXCEPTION) - - if result_round < self.for_round: - self.log_warning( - fl_ctx, - f"Received late result from {client_name} for round {result_round}, " - f"which is < gatherer's current round {self.for_round}", - ) - - if result_round == self.for_round: - now = time.time() - ts.reply_time = now - if not self.min_resps_received_time: - num_resps_received = sum(1 for ts in self.trainer_statuses.values() if ts.reply_time) - if num_resps_received >= self.min_responses_required: - self.min_resps_received_time = now - - rc = result.get_return_code(ReturnCode.OK) - if rc != ReturnCode.OK: - self.log_error(fl_ctx, f"Bad result from {client_name} for round {result_round}: {rc}.") - return make_reply(ReturnCode.EXECUTION_EXCEPTION) - - fl_ctx.set_prop(AppConstants.CURRENT_ROUND, self.for_round, private=True, sticky=True) - fl_ctx.set_prop(AppConstants.TRAINING_RESULT, result, private=True, sticky=False) - self.fire_event(AppEventType.BEFORE_CONTRIBUTION_ACCEPT, fl_ctx) - - accepted = self.aggregator.accept(result, fl_ctx) - 
accepted_msg = "ACCEPTED" if accepted else "REJECTED" - self.log_info( - fl_ctx, f"Contribution from {client_name} {accepted_msg} by the aggregator at round {result_round}." - ) - - fl_ctx.set_prop(AppConstants.AGGREGATION_ACCEPTED, accepted, private=True, sticky=False) - self.fire_event(AppEventType.AFTER_CONTRIBUTION_ACCEPT, fl_ctx) - return make_reply(ReturnCode.OK) - - def aggregate(self): - """ - Perform the aggregation of results gathered from trainers. - """ - - def _is_valid_value(value: float) -> bool: - return ( value is not None ) and ( not np.isnan(value) ) - - fl_ctx = self.fl_ctx - self.log_info(fl_ctx, f"Start aggregation for round {self.for_round}") - self.fire_event(AppEventType.BEFORE_AGGREGATION, fl_ctx) - try: - aggr_result = self.aggregator.aggregate(fl_ctx) - except Exception as e: - self.log_error(fl_ctx, f"Exception in aggregation: {secure_format_traceback()}") - self.executor.update_status(action="aggregate", error=ReturnCode.EXECUTION_EXCEPTION) - return make_reply(ReturnCode.EXECUTION_EXCEPTION) - - fl_ctx.set_prop(AppConstants.AGGREGATION_RESULT, aggr_result, private=True, sticky=False) - self.fire_event(AppEventType.AFTER_AGGREGATION, fl_ctx) - self.log_info(fl_ctx, f"Finished aggregation for round {self.for_round}") - - mine_is_better = False - if _is_valid_value(self.current_best_global_metric): - if ( - _is_valid_value(self.executor.best_metric) - and self.metric_comparator.compare(self.executor.best_metric, self.current_best_global_metric) > 0 - ): - mine_is_better = True - elif _is_valid_value(self.executor.best_metric): - mine_is_better = True - - # Determine if the local metric is better than the current global best - if mine_is_better: - self.log_info( - fl_ctx, f"I got better metric {self.executor.best_metric} at round {self.executor.best_round}" - ) - best_round = self.executor.best_round - best_metric = self.executor.best_metric - best_client = self.executor.me - else: - best_round = self.current_best_round - best_metric = 
self.current_best_global_metric - best_client = self.current_best_client - - self.log_info(fl_ctx, f"Global best metric is {best_metric} from client {best_client} at round {best_round}") - - aggr_result.set_header(Constant.ROUND, best_round) - aggr_result.set_header(Constant.METRIC, best_metric) - aggr_result.set_header(Constant.CLIENT, best_client) - - return aggr_result - - def is_done(self): - """ - Check if the gather process is complete, either by receiving all responses or timing out. - For compatibility with NVFlare Gatherer, do not return False, but None. - """ - unfinished = sum(1 for s in self.trainer_statuses.values() if not s.reply_time) - if unfinished == 0: - return True - - now = time.time() - if self.timeout and now - self.start_time > self.timeout: - self.log_warning(self.fl_ctx, f"Gatherer for round {self.for_round} timed out after {self.timeout} seconds") - return True - - if self.min_resps_received_time and now - self.min_resps_received_time > self.wait_time_after_min_resps_received: - self.log_info( - self.fl_ctx, - f"Gatherer for round {self.for_round} exit after {self.wait_time_after_min_resps_received} seconds " - f"since received minimum responses", - ) - return True diff --git a/controller/controller/swarm_client_ctl.py b/controller/controller/swarm_client_ctl.py deleted file mode 100644 index 244924ba..00000000 --- a/controller/controller/swarm_client_ctl.py +++ /dev/null @@ -1,528 +0,0 @@ -import copy -import logging -import random -import threading -import time - -from controller.gatherer import Gatherer - -from nvflare.apis.controller_spec import Task -from nvflare.apis.fl_constant import FLContextKey, ReturnCode -from nvflare.apis.fl_context import FLContext -from nvflare.apis.shareable import Shareable, make_reply -from nvflare.apis.signal import Signal -from nvflare.app_common.abstract.aggregator import Aggregator -from nvflare.app_common.abstract.learnable import Learnable -from nvflare.app_common.abstract.metric_comparator import 
MetricComparator -from nvflare.app_common.app_constant import AppConstants -from nvflare.app_common.app_event_type import AppEventType -from nvflare.app_common.ccwf.client_ctl import ClientSideController -from nvflare.app_common.ccwf.common import Constant, NumberMetricComparator, ResultType, make_task_name -from nvflare.fuel.utils.validation_utils import check_non_empty_str, check_positive_int, check_positive_number -from nvflare.security.logging import secure_format_traceback - - -class SwarmClientController(ClientSideController): - """ - The SwarmClientController class manages the client-side execution of the swarm learning workflow. - It handles the training, aggregation, and communication with other clients in a decentralized manner. - """ - def __init__( - self, - task_name_prefix=Constant.TN_PREFIX_SWARM, # Prefix for tasks associated with the swarm workflow - learn_task_name=AppConstants.TASK_TRAIN, # Name of the task to be executed for learning - persistor_id=AppConstants.DEFAULT_PERSISTOR_ID, # ID of the persistor component - shareable_generator_id=AppConstants.DEFAULT_SHAREABLE_GENERATOR_ID, # ID of the shareable generator component - aggregator_id=AppConstants.DEFAULT_AGGREGATOR_ID, # ID of the aggregator component - metric_comparator_id=None, # Optional ID for a custom metric comparator - learn_task_check_interval=Constant.LEARN_TASK_CHECK_INTERVAL, # Interval for checking learning tasks (in seconds) - learn_task_abort_timeout=Constant.LEARN_TASK_ABORT_TIMEOUT, # Timeout for aborting a learning task (in seconds) - learn_task_ack_timeout=Constant.LEARN_TASK_ACK_TIMEOUT, # Timeout for acknowledging a learning task (in seconds) - learn_task_timeout=None, # Timeout for the overall learning task (in seconds) - final_result_ack_timeout=Constant.FINAL_RESULT_ACK_TIMEOUT, # Timeout for acknowledging the final result (in seconds) - min_responses_required: int = 1, # Minimum number of responses required to proceed - wait_time_after_min_resps_received: float = 
10.0, # Time to wait after minimum responses are received (in seconds) - ): - """ - Initializes the SwarmClientController, validating the input parameters and setting up internal state. - """ - try: - # Validate required arguments - check_non_empty_str("learn_task_name", learn_task_name) - check_non_empty_str("persistor_id", persistor_id) - check_non_empty_str("shareable_generator_id", shareable_generator_id) - check_non_empty_str("aggregator_id", aggregator_id) - - if metric_comparator_id is not None: - check_non_empty_str("metric_comparator_id", metric_comparator_id) - - if learn_task_timeout is not None: - check_positive_number("learn_task_timeout", learn_task_timeout) - - check_positive_int("min_responses_required", min_responses_required) - check_positive_number("wait_time_after_min_resps_received", wait_time_after_min_resps_received) - - super().__init__( - task_name_prefix=task_name_prefix, - learn_task_name=learn_task_name, - persistor_id=persistor_id, - shareable_generator_id=shareable_generator_id, - learn_task_check_interval=learn_task_check_interval, - learn_task_ack_timeout=learn_task_ack_timeout, - learn_task_abort_timeout=learn_task_abort_timeout, - final_result_ack_timeout=final_result_ack_timeout, - allow_busy_task=True, - ) - # Initialize internal variables - self.metric_comparator_id = metric_comparator_id - self.metric_comparator = None - self.report_learn_result_task_name = make_task_name(task_name_prefix, Constant.BASENAME_REPORT_LEARN_RESULT) - self.learn_task_timeout = learn_task_timeout - self.min_responses_required = min_responses_required - self.wait_time_after_min_resps_received = wait_time_after_min_resps_received - self.aggregator_id = aggregator_id - self.aggregator = None - self.gatherer = None - self.gatherer_waiter = threading.Event() - self.trainers = None - self.aggrs = None - self.is_trainer = False - self.is_aggr = False - self.last_aggr_round_done = -1 - except Exception as e: - 
logging.getLogger("SwarmClientController").log(logging.ERROR, f"Error during initialization: {e}") - # cannot log via self.log_error because we have no FLContext here - raise - - def process_config(self, fl_ctx: FLContext): - """ - Process the configuration for the swarm learning workflow, identifying the roles of the current client. - """ - try: - all_clients = self.get_config_prop(Constant.CLIENTS) - - self.trainers = self.get_config_prop(Constant.TRAIN_CLIENTS) - if not self.trainers: - self.trainers = all_clients - self.is_trainer = (self.me in self.trainers) - - self.aggrs = self.get_config_prop(Constant.AGGR_CLIENTS) - if not self.aggrs: - self.aggrs = all_clients - self.is_aggr = (self.me in self.aggrs) - - # Register message handler for sharing results - self.engine.register_aux_message_handler( - topic=self.topic_for_my_workflow(Constant.TOPIC_SHARE_RESULT), - message_handle_func=self._process_share_result, - ) - except Exception as e: - self.log_error(fl_ctx, f"Exception during process_config: {secure_format_traceback()}") - raise - - def execute(self, task_name: str, shareable: Shareable, fl_ctx: FLContext, abort_signal: Signal) -> Shareable: - """ - Execute a specific task based on the task name. Handles both regular and learning result tasks. - """ - try: - if task_name == self.report_learn_result_task_name: - return self._process_learn_result(shareable, fl_ctx, abort_signal) - return super().execute(task_name, shareable, fl_ctx, abort_signal) - except Exception as e: - self.log_error(fl_ctx, f"Exception during execute: {secure_format_traceback()}") - return make_reply(ReturnCode.EXECUTION_EXCEPTION) - - def start_run(self, fl_ctx: FLContext): - """ - Start the swarm learning run, setting up the aggregator and metric comparator components. 
- """ - try: - super().start_run(fl_ctx) - self.aggregator = self.engine.get_component(self.aggregator_id) - if not isinstance(self.aggregator, Aggregator): - self.system_panic( - f"Aggregator {self.aggregator_id} must be an Aggregator but got {type(self.aggregator)}", - fl_ctx, - ) - return - - if self.metric_comparator_id: - self.metric_comparator = self.engine.get_component(self.metric_comparator_id) - if not isinstance(self.metric_comparator, MetricComparator): - self.system_panic( - f"Metric comparator {self.metric_comparator_id} must be a MetricComparator " - f"but got {type(self.metric_comparator)}", - fl_ctx, - ) - return - else: - self.metric_comparator = NumberMetricComparator() - - # Start a thread to monitor the gather process - aggr_thread = threading.Thread(target=self._monitor_gather) - aggr_thread.daemon = True - aggr_thread.start() - self.log_info(fl_ctx, "Started aggregator thread") - except Exception as e: - self.log_error(fl_ctx, f"Exception during start_run: {secure_format_traceback()}") - raise - - def handle_event(self, event_type: str, fl_ctx: FLContext): - """ - Handle specific events, such as when a global best model is available, updating the client's status. 
- """ - try: - if event_type == AppEventType.GLOBAL_BEST_MODEL_AVAILABLE: - client = fl_ctx.get_prop(Constant.CLIENT) - if client and client != self.me: - return - - self.best_metric = fl_ctx.get_prop(AppConstants.VALIDATION_RESULT) - self.best_result = copy.deepcopy(fl_ctx.get_prop(AppConstants.GLOBAL_MODEL)) - self.log_info(fl_ctx, f"Got GLOBAL_BEST_MODEL_AVAILABLE: best metric={self.best_metric}") - current_round = fl_ctx.get_prop(AppConstants.CURRENT_ROUND) - self.best_round = current_round - self.update_status(last_round=current_round, action="better_aggregation") - else: - super().handle_event(event_type, fl_ctx) - except Exception as e: - self.log_error(fl_ctx, f"Exception during handle_event: {secure_format_traceback()}") - raise - - def start_workflow(self, shareable: Shareable, fl_ctx: FLContext, abort_signal: Signal) -> Shareable: - """ - Start the swarm learning workflow by scattering tasks to the appropriate clients. - """ - try: - clients = self.get_config_prop(Constant.CLIENTS) - aggr_clients = self.get_config_prop(Constant.AGGR_CLIENTS, []) - train_clients = self.get_config_prop(Constant.TRAIN_CLIENTS, []) - - self.log_info( - fl_ctx, f"Starting Swarm Workflow on clients {clients}, aggrs {aggr_clients}, trainers {train_clients}" - ) - - if not self._scatter( - task_data=shareable, for_round=self.get_config_prop(Constant.START_ROUND, 0), fl_ctx=fl_ctx - ): - return make_reply(ReturnCode.EXECUTION_EXCEPTION) - - self.log_info(fl_ctx, "Started Swarm Workflow") - return make_reply(ReturnCode.OK) - except Exception as e: - self.log_error(fl_ctx, f"Exception during start_workflow: {secure_format_traceback()}") - return make_reply(ReturnCode.EXECUTION_EXCEPTION) - - def _scatter(self, task_data: Shareable, for_round: int, fl_ctx: FLContext) -> bool: - """ - Distribute learning tasks to the training and aggregation clients for a specific round. 
- """ - try: - clients = self.get_config_prop(Constant.TRAIN_CLIENTS) - aggr_clients = self.get_config_prop(Constant.AGGR_CLIENTS) - - aggr = random.choice(aggr_clients) - - task_data.set_header(AppConstants.CURRENT_ROUND, for_round) - task_data.add_cookie(AppConstants.CONTRIBUTION_ROUND, for_round) - task_data.set_header(Constant.AGGREGATOR, aggr) - - targets = copy.copy(clients) - if aggr not in targets: - targets.append(aggr) - - self.log_info(fl_ctx, f"Broadcasting learn task of round {for_round} to {targets}; aggr client is {aggr}") - return self.send_learn_task(targets=targets, request=task_data, fl_ctx=fl_ctx) - except Exception as e: - self.log_error(fl_ctx, f"Exception during _scatter: {secure_format_traceback()}") - return False - - def _monitor_gather(self): - """ - Monitor the gather process to check if the aggregation for a round is complete. - """ - while True: - if self.asked_to_stop: - return - - gatherer = self.gatherer - if gatherer: - assert isinstance(gatherer, Gatherer) - if gatherer.is_done(): - self.last_aggr_round_done = gatherer.for_round - self.gatherer = None - self.gatherer_waiter.clear() - try: - self._end_gather(gatherer) - except Exception as e: - self.log_error(gatherer.fl_ctx, f"Exception ending gatherer: {secure_format_traceback()}") - self.update_status(action="aggregate", error=ReturnCode.EXECUTION_EXCEPTION) - time.sleep(0.2) - - def _end_gather(self, gatherer: Gatherer): - """ - Finalize the aggregation process and determine the next steps in the workflow. 
- """ - fl_ctx = gatherer.fl_ctx - try: - aggr_result = gatherer.aggregate() - except Exception as e: - self.log_error(fl_ctx, f"Exception in aggregation: {secure_format_traceback()}") - self.update_status(action="aggregate", error=ReturnCode.EXECUTION_EXCEPTION) - return - - self.log_debug(fl_ctx, f"Aggregation result: {aggr_result}") - global_weights = self.shareable_generator.shareable_to_learnable(aggr_result, fl_ctx) - self.record_last_result(fl_ctx, gatherer.for_round, global_weights) - - num_rounds_done = gatherer.for_round - self.get_config_prop(Constant.START_ROUND, 0) + 1 - if num_rounds_done >= self.get_config_prop(AppConstants.NUM_ROUNDS): - self.log_info(fl_ctx, f"Swarm Learning Done: number of rounds completed {num_rounds_done}") - - self._distribute_final_results(aggr_result, fl_ctx) - return - - next_round_data = self.shareable_generator.learnable_to_shareable(global_weights, fl_ctx) - assert isinstance(next_round_data, Shareable) - - best_round = aggr_result.get_header(Constant.ROUND) - best_metric = aggr_result.get_header(Constant.METRIC) - best_client = aggr_result.get_header(Constant.CLIENT) - - if best_client: - next_round_data.set_header(Constant.ROUND, best_round) - next_round_data.set_header(Constant.CLIENT, best_client) - next_round_data.set_header(Constant.METRIC, best_metric) - - self._scatter(next_round_data, gatherer.for_round + 1, gatherer.fl_ctx) - - def _ask_to_share_best_result(self, client: str, metric, fl_ctx: FLContext): - """ - Request the client with the best metric to share its result with the other clients. 
- """ - try: - self.log_info(fl_ctx, f"Client {client} has the best metric {metric} - asking it to share result") - resp = self.engine.send_aux_request( - targets=[client], - topic=self.topic_for_my_workflow(Constant.TOPIC_SHARE_RESULT), - request=Shareable(), - timeout=self.final_result_ack_timeout, - fl_ctx=fl_ctx, - secure=False, - ) - - assert isinstance(resp, dict) - reply = resp.get(client) - if not reply: - self.log_error(fl_ctx, f"Failed to ask client {client} to share final result") - return - - if not isinstance(reply, Shareable): - self.log_error(fl_ctx, f"Client {client} failed to respond to share final result request") - return - - rc = reply.get_return_code() - if rc != ReturnCode.OK: - self.log_error(fl_ctx, f"Client {client} failed to respond to share final result request: {rc}") - except Exception as e: - self.log_error(fl_ctx, f"Exception during _ask_to_share_best_result: {secure_format_traceback()}") - - def _distribute_final_results(self, aggr_result: Shareable, fl_ctx: FLContext): - """ - Distribute the final results of the swarm learning process to all clients. 
- """ - try: - best_client = aggr_result.get_header(Constant.CLIENT) - best_metric = aggr_result.get_header(Constant.METRIC) - - if best_client: - if best_client == self.me: - self.log_info(fl_ctx, f"I have the global best metric {best_metric}") - self.broadcast_final_result( - fl_ctx, ResultType.BEST, self.best_result, self.best_metric, self.best_round - ) - else: - self._ask_to_share_best_result(best_client, best_metric, fl_ctx) - else: - self.log_info(fl_ctx, "No global best result!") - - self.log_info(fl_ctx, "Distributing last result") - self.broadcast_final_result(fl_ctx, ResultType.LAST, self.last_result, round_num=self.last_round) - except Exception as e: - self.log_error(fl_ctx, f"Exception during _distribute_final_results: {secure_format_traceback()}") - - def _process_learn_result(self, request: Shareable, fl_ctx: FLContext, abort_signal: Signal) -> Shareable: - """ - Process the learning result received from a peer client. - """ - try: - peer_ctx = fl_ctx.get_peer_context() - assert isinstance(peer_ctx, FLContext) - client_name = peer_ctx.get_identity_name() - current_round = request.get_header(AppConstants.CURRENT_ROUND) - self.log_info(fl_ctx, f"Got training result from {client_name} for round {current_round}") - - peer_ctx.set_prop(FLContextKey.SHAREABLE, request) - - gatherer = self.gatherer - if not gatherer: - if current_round <= self.last_aggr_round_done: - self.log_info(fl_ctx, f"Dropped result from late {client_name} for round {current_round}") - return make_reply(ReturnCode.OK) - - self.log_info(fl_ctx, f"Got result from {client_name} for round {current_round} before gatherer setup") - self.gatherer_waiter.wait(self.learn_task_abort_timeout) - - if abort_signal.triggered: - return make_reply(ReturnCode.TASK_ABORTED) - - gatherer = self.gatherer - if not gatherer: - self.log_error(fl_ctx, f"Still no gatherer after {self.learn_task_abort_timeout} seconds") - self.log_error(fl_ctx, f"Ignored result from {client_name} for round {current_round} 
since no gatherer") - self.update_status(action="wait_for_gatherer", error=ReturnCode.EXECUTION_EXCEPTION) - return make_reply(ReturnCode.EXECUTION_EXCEPTION) - - assert isinstance(gatherer, Gatherer) - if gatherer.for_round != current_round: - self.log_warning( - fl_ctx, - f"Got result from {client_name} for round {current_round}, " - f"but I'm waiting for round {gatherer.for_round}", - ) - return gatherer.gather(client_name, request, fl_ctx) - except Exception as e: - self.log_exception(fl_ctx, f"Exception processing learn result: {secure_format_traceback()}") - self.log_error(fl_ctx, f"Exception processing learn result: {e}") - self.update_status(action="process_learn_result", error=ReturnCode.EXECUTION_EXCEPTION) - return make_reply(ReturnCode.EXECUTION_EXCEPTION) - - def do_learn_task(self, name: str, task_data: Shareable, fl_ctx: FLContext, abort_signal: Signal): - """ - Perform the learning task as part of the swarm learning workflow, handling the training and communication - with the aggregator. 
- """ - try: - current_round = task_data.get_header(AppConstants.CURRENT_ROUND) - self.update_status(last_round=current_round, action="start_learn_task") - - aggr = task_data.get_header(Constant.AGGREGATOR) - if not aggr: - self.log_error(fl_ctx, f"Missing aggregation client for round {current_round}") - self.update_status(action="do_learn_task", error=ReturnCode.EXECUTION_EXCEPTION) - return - - self.log_info(fl_ctx, f"Round {current_round} started.") - task_data.set_header(FLContextKey.TASK_NAME, name) - - base_model = fl_ctx.get_prop(AppConstants.GLOBAL_MODEL) - if not base_model: - base_model = Learnable() - fl_ctx.set_prop(AppConstants.GLOBAL_MODEL, base_model, private=True, sticky=True) - global_weights = self.shareable_generator.shareable_to_learnable(task_data, fl_ctx) - - self.log_debug(fl_ctx, f"Current global model: {global_weights}") - - fl_ctx.set_prop(AppConstants.GLOBAL_MODEL, global_weights, private=True, sticky=True) - fl_ctx.set_prop(AppConstants.CURRENT_ROUND, current_round, private=True, sticky=True) - self.fire_event(AppEventType.ROUND_STARTED, fl_ctx) - - if self.me == aggr: - gatherer = self.gatherer - if gatherer: - self.log_error( - fl_ctx, - f"Logic error: got task for round {current_round} while gathering for round {gatherer.for_round}", - ) - self.update_status(action="do_learn_task", error=ReturnCode.EXECUTION_EXCEPTION) - return - - self.log_info(fl_ctx, f"Setting up the gatherer for round {current_round}") - - self.gatherer = Gatherer( - fl_ctx=fl_ctx, - all_clients=self.get_config_prop(Constant.CLIENTS), - metric_comparator=self.metric_comparator, - trainers=self.trainers, - for_round=current_round, - timeout=self.learn_task_timeout, - min_responses_required=self.min_responses_required, - wait_time_after_min_resps_received=self.wait_time_after_min_resps_received, - aggregator=self.aggregator, - executor=self, - task_data=task_data, - ) - self.gatherer_waiter.set() - - if self.is_trainer: - result = self.execute_learn_task(task_data, 
fl_ctx, abort_signal) - - rc = result.get_return_code(ReturnCode.OK) - if rc != ReturnCode.OK: - self.log_error(fl_ctx, f"Learn executor failed: {rc}") - self.update_status(action="learner_execution", error=rc) - return - - self.log_info(fl_ctx, f"Sending training result to aggregation client {aggr}") - task = Task( - name=self.report_learn_result_task_name, - data=result, - timeout=int(self.learn_task_ack_timeout), - secure=self.is_task_secure(fl_ctx), - ) - - resp = self.broadcast_and_wait( - task=task, - targets=[aggr], - min_responses=1, - fl_ctx=fl_ctx, - ) - - reply = resp.get(aggr) - if not reply: - self.log_error(fl_ctx, f"Failed to receive reply from aggregation client: {aggr}") - self.update_status(action="receive_learn_result_reply", error=ReturnCode.EXECUTION_EXCEPTION) - return - - if not isinstance(reply, Shareable): - self.log_error( - fl_ctx, f"Bad reply from aggregation client {aggr}: expect Shareable but got {type(reply)}" - ) - self.update_status(action="receive_learn_result_reply", error=ReturnCode.EXECUTION_EXCEPTION) - return - - rc = reply.get_return_code(ReturnCode.OK) - if rc != ReturnCode.OK: - self.log_error(fl_ctx, f"Bad return code from aggregation client {aggr}: {rc}") - self.update_status(action="receive_learn_result_reply", error=ReturnCode.EXECUTION_EXCEPTION) - return - - self.log_info(fl_ctx, f"Finished round {current_round}") - self.update_status(last_round=current_round, action="finished_learn_task") - except Exception as e: - self.log_error(fl_ctx, f"Exception during do_learn_task: {secure_format_traceback()}") - self.update_status(action="do_learn_task", error=ReturnCode.EXECUTION_EXCEPTION) - - def _process_share_result(self, topic: str, request: Shareable, fl_ctx: FLContext) -> Shareable: - """ - Process a request from another client to share the current best result. 
- """ - try: - peer_ctx = fl_ctx.get_peer_context() - assert isinstance(peer_ctx, FLContext) - client_name = peer_ctx.get_identity_name() - if not self.best_result: - self.log_error( - fl_ctx, f"Got request from {client_name} to share my best result, but I don't have a best result" - ) - return make_reply(ReturnCode.BAD_REQUEST_DATA) - - self.update_status(action="start_share_result_request_process") - self.broadcast_final_result( - fl_ctx, ResultType.BEST, self.best_result, metric=self.best_metric, round_num=self.best_round - ) - return make_reply(ReturnCode.OK) - except Exception as e: - self.log_error(fl_ctx, f"Exception during _process_share_result: {secure_format_traceback()}") - self.log_error(fl_ctx, f"Exception during _process_share_result: {e}") - return make_reply(ReturnCode.EXECUTION_EXCEPTION) diff --git a/controller/controller/swarm_server_ctl.py b/controller/controller/swarm_server_ctl.py deleted file mode 100644 index b356d1d7..00000000 --- a/controller/controller/swarm_server_ctl.py +++ /dev/null @@ -1,120 +0,0 @@ -from nvflare.apis.fl_context import FLContext -from nvflare.app_common.ccwf.common import Constant -from nvflare.app_common.ccwf.server_ctl import ServerSideController -from nvflare.fuel.utils.validation_utils import DefaultValuePolicy, normalize_config_arg, validate_candidates - - -class SwarmServerController(ServerSideController): - """ - The SwarmServerController class manages the server side of the swarm learning workflow, a decentralized - form of federated learning. This controller is responsible for managing the overall job status and ensuring - the proper execution of the learning workflow across multiple rounds. 
- """ - - def __init__( - self, - num_rounds: int, # Number of training rounds to be performed across the workflow - start_round: int = 0, # Initial round to start training (default is 0) - task_name_prefix=Constant.TN_PREFIX_SWARM, # Prefix for naming tasks, default is 'swarm' - start_task_timeout=Constant.START_TASK_TIMEOUT, # Timeout for starting a task (in seconds) - configure_task_timeout=Constant.CONFIG_TASK_TIMEOUT, # Timeout for configuring a task (in seconds) - task_check_period: float = Constant.TASK_CHECK_INTERVAL, # Interval for checking task status (in seconds) - job_status_check_interval: float = Constant.JOB_STATUS_CHECK_INTERVAL, # Interval for checking job status (in seconds) - end_workflow_timeout: float = Constant.END_WORKFLOW_TIMEOUT, # Timeout for ending the workflow (in seconds) - participating_clients=None, # List of clients participating in the job - result_clients=None, # List of clients to receive the final model - starting_client=None, # Client responsible for initiating the workflow - max_status_report_interval: float = Constant.PER_CLIENT_STATUS_REPORT_TIMEOUT, # Max interval for client status reporting (in seconds) - progress_timeout: float = Constant.WORKFLOW_PROGRESS_TIMEOUT, # Timeout for overall workflow progress (in seconds) - private_p2p: bool = True, # Flag to indicate private peer-to-peer communication - aggr_clients=None, # Clients designated for aggregation - train_clients=None, # Clients designated for training - ): - """ - Initializes the SwarmServerController. This includes setting up the base ServerSideController and handling - client configurations for training and aggregation. 
- """ - try: - # Normalize and validate result_clients and starting_client inputs - result_clients = normalize_config_arg(result_clients) - starting_client = normalize_config_arg(starting_client) - - # Initialize the ServerSideController with validated arguments - super().__init__( - num_rounds=num_rounds, - start_round=start_round, - task_name_prefix=task_name_prefix, - start_task_timeout=start_task_timeout, - configure_task_timeout=configure_task_timeout, - task_check_period=task_check_period, - job_status_check_interval=job_status_check_interval, - end_workflow_timeout=end_workflow_timeout, - participating_clients=participating_clients, - result_clients=result_clients, - result_clients_policy=DefaultValuePolicy.ALL, - starting_client=starting_client, - starting_client_policy=DefaultValuePolicy.ANY, - max_status_report_interval=max_status_report_interval, - progress_timeout=progress_timeout, - private_p2p=private_p2p, - ) - - # If aggr_clients or train_clients are not provided, initialize them as empty lists - if not aggr_clients: - aggr_clients = [] - - if not train_clients: - train_clients = [] - - # Assign aggregation and training clients - self.aggr_clients = aggr_clients - self.train_clients = train_clients - except Exception as e: - self.log_error(None, f"Error during initialization: {e}") - raise - - def start_controller(self, fl_ctx: FLContext): - """ - Starts the SwarmServerController, initiating the swarm learning process. This method validates the client - assignments and ensures that every participating client is designated as either a training or aggregation client. 
- """ - try: - # Call the base class method to start the controller - super().start_controller(fl_ctx) - - # Validate and assign train_clients based on participating_clients - self.train_clients = validate_candidates( - var_name="train_clients", - candidates=self.train_clients, - base=self.participating_clients, - default_policy=DefaultValuePolicy.ALL, - allow_none=False, - ) - - # Validate and assign aggr_clients based on participating_clients - self.aggr_clients = validate_candidates( - var_name="aggr_clients", - candidates=self.aggr_clients, - base=self.participating_clients, - default_policy=DefaultValuePolicy.ALL, - allow_none=False, - ) - - # Ensure every participating client is in at least one category: training or aggregation - for c in self.participating_clients: - if c not in self.train_clients and c not in self.aggr_clients: - raise RuntimeError(f"Config Error: client {c} is neither train client nor aggr client") - except Exception as e: - self.log_error(fl_ctx, f"Error during start_controller: {e}") - raise - - def prepare_config(self): - """ - Prepares and returns the configuration for the current swarm learning round, including the lists of - aggregation and training clients. 
- """ - try: - return {Constant.AGGR_CLIENTS: self.aggr_clients, Constant.TRAIN_CLIENTS: self.train_clients} - except Exception as e: - self.log_error(None, f"Error during prepare_config: {e}") - raise diff --git a/controller/setup.py b/controller/setup.py deleted file mode 100644 index 169120c6..00000000 --- a/controller/setup.py +++ /dev/null @@ -1,12 +0,0 @@ -# controller/setup.py -from setuptools import setup, find_packages - -setup( - name='controller', - version='0.1.0', - packages=find_packages(), - install_requires=[], - author='Jeff', - author_email='jiefu.zhu@tu-dresden.de', - description='A package for SwarmClientController and SwarmServerController', -) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index a138ca05..8da82653 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -256,11 +256,6 @@ COPY ./MediSwarm/docker_config/master_template.yml /workspace/nvflare/nvflare/li RUN python3 -m pip install /workspace/nvflare RUN rm -rf /workspace/nvflare -# Install the ODELIA controller package from local source -COPY ./MediSwarm/controller /workspace/controller -RUN python3 -m pip install /workspace/controller -RUN rm -rf /workspace/controller - # Copy the source code for local training and deploying to the swarm COPY ./MediSwarm /MediSwarm RUN mkdir -p /fl_admin/transfer diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 62c9dbd8..83ef6e88 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -100,11 +100,6 @@ _run_test_in_docker() { } -run_unit_tests_controller(){ - echo "[Run] Controller unit tests" - _run_test_in_docker tests/integration_tests/_run_controller_unit_tests_with_coverage.sh -} - run_dummy_training_standalone(){ echo "[Run] Minimal example, standalone" _run_test_in_docker tests/integration_tests/_run_minimal_example_standalone.sh @@ -440,11 +435,6 @@ case "$1" in check_files_on_github ;; - run_unit_tests_controller) - run_unit_tests_controller - 
cleanup_temporary_data - ;; - run_dummy_training_standalone) run_dummy_training_standalone cleanup_temporary_data @@ -517,7 +507,6 @@ case "$1" in all | "") check_files_on_github - run_unit_tests_controller run_dummy_training_standalone run_dummy_training_simulation_mode run_dummy_training_poc_mode diff --git a/tests/integration_tests/_run_controller_unit_tests_with_coverage.sh b/tests/integration_tests/_run_controller_unit_tests_with_coverage.sh deleted file mode 100755 index 46e6e11c..00000000 --- a/tests/integration_tests/_run_controller_unit_tests_with_coverage.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/env bash - -set -e - -run_controller_unit_tests_with_coverage () { - # run unit tests of ODELIA swarm learning and report coverage - export MPLCONFIGDIR=/tmp - export COVERAGE_FILE=/tmp/.MediSwarm_coverage - cd /MediSwarm/tests/unit_tests/controller - PYTHONPATH=/MediSwarm/controller/controller python3 -m coverage run --source=/MediSwarm/controller/controller -m unittest discover - coverage report -m - rm "$COVERAGE_FILE" -} - -run_controller_unit_tests_with_coverage diff --git a/tests/unit_tests/controller/.coveragerc b/tests/unit_tests/controller/.coveragerc deleted file mode 100644 index bcc99efd..00000000 --- a/tests/unit_tests/controller/.coveragerc +++ /dev/null @@ -1,3 +0,0 @@ -[report] -exclude_also = - pass diff --git a/tests/unit_tests/controller/test_gatherer.py b/tests/unit_tests/controller/test_gatherer.py deleted file mode 100644 index 1e1a485c..00000000 --- a/tests/unit_tests/controller/test_gatherer.py +++ /dev/null @@ -1,260 +0,0 @@ -import unittest -import time -import logging -from mock import mock -from numpy import NaN - -from gatherer import _TrainerStatus, Gatherer - -from nvflare.apis.signal import Signal -from nvflare.apis.shareable import Shareable, make_reply -from nvflare.apis.fl_constant import ReturnCode -from nvflare.apis.fl_context import FLContext -from nvflare.app_common.app_constant import AppConstants -from 
nvflare.app_common.ccwf.client_ctl import ClientSideController -from nvflare.app_common.ccwf.common import Constant, NumberMetricComparator -from nvflare.app_common.abstract.aggregator import Aggregator - - -class MockedResult(Shareable): - def __init__(self, current_round: int): - super().__init__() - self.current_round: int = current_round - - def get_header(self, key): - if key == AppConstants.CURRENT_ROUND: - return self.current_round - - def get_return_code(self, _): - return ReturnCode.OK - -class MockedResultRaisingException(MockedResult): - def get_header(self, _): - raise Exception("Test exception") - -class MockedResultFailing(MockedResult): - def get_return_code(self, _): - return ReturnCode.EXECUTION_RESULT_ERROR - - -class MockedAggregator(Aggregator): - def accept(self, _a: Shareable, _b: FLContext) -> bool: - return True - - def aggregate(self, _: FLContext) -> Shareable: - return Shareable() - -class MockedAggregatorRaisingException(MockedAggregator): - def aggregate(self, _: FLContext) -> Shareable: - raise Exception("foo") - -class MockedClientSideController(ClientSideController): - def __init__(self): - super().__init__(task_name_prefix="test_prefix") - - def do_learn_task(self, _a: str, _b: Shareable, _c: FLContext, _d: Signal): - pass - - def start_workflow(self, _a: Shareable, _b: FLContext, _c: Signal) -> Shareable: - pass - -class EventCatcher(): - def __init__(self): - self.events_caught = [] - - def catch_event(self, *event_args): - self.events_caught.append(event_args[0]) - -class TestGatherer(unittest.TestCase): - CLIENT_THAT_TRAINS = "client_a" - CLIENT_THAT_DOES_NOT_TRAIN = "client_b" - OTHER_CLIENT_THAT_TRAINS = "client_c" - YET_ANOTHER_CLIENT_THAT_TRAINS = "client_d" - - def _get_gatherer(self, - task_data = Shareable(), - for_round = 0, - all_clients = [CLIENT_THAT_TRAINS, CLIENT_THAT_DOES_NOT_TRAIN], - trainers = [CLIENT_THAT_TRAINS], - min_responses_required = 1, - timeout = 0.1 - ): - return Gatherer(task_data = task_data, - 
fl_ctx = self.fl_context, - for_round = for_round, - executor = MockedClientSideController(), - aggregator = self.aggregator, - metric_comparator = NumberMetricComparator(), - all_clients = all_clients, - trainers = trainers, - min_responses_required = min_responses_required, - wait_time_after_min_resps_received = 0.1, - timeout = timeout) - - def setUp(self): - self.fl_context = FLContext() - self.aggregator = MockedAggregator() - self.gatherer = self._get_gatherer() - self.testee_logger = logging.getLogger("Gatherer") - - - def test_trainer_status_can_be_accessed(self): - name = "test name" - now = time.time() - trainer_status = _TrainerStatus(name) - trainer_status.reply_time = now - self.assertEqual(name, trainer_status.name) - self.assertEqual(now, trainer_status.reply_time) - - def test_gatherer_initialization_logs_correctly(self): - for current_best_client, expected_message in ((None, "INFO:Gatherer:[identity=, run=?]: gatherer starting from scratch"), - (self.CLIENT_THAT_TRAINS, "INFO:Gatherer:[identity=, run=?]: gatherer starting with previous best result from client client_a with metric None at round None")): - with (self.assertLogs(self.testee_logger, logging.INFO) as log): - task_data = Shareable() - task_data.set_header(Constant.CLIENT, current_best_client) - self.gatherer = self._get_gatherer(task_data=task_data) - self.assertEqual(log.output, [expected_message]) - - def test_gatherer_returns_error_on_result_from_non_training_client(self): - result = MockedResult(0) - with self.assertLogs(logging.getLogger("Gatherer"), logging.ERROR) as log: - response = self.gatherer.gather(self.CLIENT_THAT_DOES_NOT_TRAIN, result, self.fl_context) - self.assertEqual(make_reply(ReturnCode.EXECUTION_EXCEPTION), response) - expected_message = f"ERROR:Gatherer:[identity=, run=?]: Received result from {self.CLIENT_THAT_DOES_NOT_TRAIN} for round 0, but it is not a trainer" - self.assertIn(expected_message, log.output) - - def 
test_gatherer_receives_from_earlier_round_logs_warning(self): - current_round = 2 - self.gatherer = self._get_gatherer(for_round=current_round) - result = MockedResult(current_round - 1) - with self.assertLogs(logging.getLogger("Gatherer"), logging.INFO) as log: - self.gatherer.gather(self.CLIENT_THAT_TRAINS, result, self.fl_context) - expected_message = f"WARNING:Gatherer:[identity=, run=?]: Received late result from {self.CLIENT_THAT_TRAINS} for round {current_round-1}, which is < gatherer's current round {current_round}" - self.assertIn(expected_message, log.output) - - def test_gatherer_receives_from_later_round_logs_warning(self): - current_round = 1 - self.gatherer = self._get_gatherer(for_round=current_round) - result = MockedResult(current_round + 1) - with self.assertLogs(logging.getLogger("Gatherer"), logging.ERROR) as log: - response = self.gatherer.gather(self.CLIENT_THAT_TRAINS, result, self.fl_context) - self.assertEqual(make_reply(ReturnCode.EXECUTION_EXCEPTION), response) - expected_message = f"ERROR:Gatherer:[identity=, run=?]: Logic error: received result from {self.CLIENT_THAT_TRAINS} for round {current_round+1}, which is > gatherer's current round {current_round}" - self.assertIn(expected_message, log.output) - - def test_gatherer_logs_exception_from_gathering(self): - result = MockedResultRaisingException(0) - with self.assertLogs(logging.getLogger("Gatherer"), logging.ERROR) as log: # but does not raise exception - self.gatherer.gather(self.CLIENT_THAT_TRAINS, result, self.fl_context) - self.assertTrue(log.output[0].startswith('ERROR:Gatherer:[identity=, run=?]: Exception gathering: Traceback')) - - def test_gatherer_gathering_from_current_round_with_enough_responses_gets_logged_and_events_are_fired(self): - event_catcher = EventCatcher() - - current_round = 0 - result = MockedResult(current_round) - with self.assertLogs(logging.getLogger("Gatherer"), logging.INFO) as log, \ - mock.patch("nvflare.apis.fl_component.FLComponent.fire_event", 
side_effect=event_catcher.catch_event): - response = self.gatherer.gather(self.CLIENT_THAT_TRAINS, result, self.fl_context) - self.assertEqual(make_reply(ReturnCode.OK), response) - expected_entry = f"INFO:Gatherer:[identity=, run=?]: Contribution from {self.CLIENT_THAT_TRAINS} ACCEPTED by the aggregator at round 0." - self.assertTrue(expected_entry in log.output) - self.assertListEqual(event_catcher.events_caught, ["_before_contribution_accept", "_after_contribution_accept"]) - - def test_gatherer_gathering_bad_result_gets_logged(self): - current_round = 0 - result = MockedResultFailing(current_round) - with self.assertLogs(logging.getLogger("Gatherer"), logging.ERROR) as log: - response = self.gatherer.gather(self.CLIENT_THAT_TRAINS, result, self.fl_context) - self.assertEqual(make_reply(ReturnCode.EXECUTION_EXCEPTION), response) - expected_entry = f"ERROR:Gatherer:[identity=, run=?]: Bad result from {self.CLIENT_THAT_TRAINS} for round {current_round}: EXECUTION_RESULT_ERROR." - self.assertTrue(expected_entry in log.output) - - def _set_metrics(self, executor_best: float, current_best: float): - self.gatherer.executor.best_metric = executor_best - self.gatherer.current_best_global_metric = current_best - - def test_aggregating_returns_error_on_exception_during_aggregation(self): - self.gatherer.aggregator = MockedAggregatorRaisingException() - self._set_metrics(0.0, 0.0) - self.assertEqual(make_reply(ReturnCode.EXECUTION_EXCEPTION), self.gatherer.aggregate()) - - def _prepare_for_aggregation(self, executor_best: float, current_best: float): - current_round = 0 - result = MockedResult(current_round) - self.gatherer = self._get_gatherer() - self._set_metrics(executor_best, current_best) - self.gatherer.gather(self.CLIENT_THAT_TRAINS, result, self.fl_context) - - def test_aggregating_determines_best_metric_correctly(self): - for executor_best, current_best, best, first_is_better in ((0.4, 0.6, 0.6, False ), # other is better (note: for metrics, larger is better) - 
(0.6, 0.4, 0.6, True ), # own is better - (0.5, 0.5, 0.5, False ), # own is not better, take other - (None, 0.5, 0.5, False ), # 0.5 is better than None - (0.5, None, 0.5, True ), - (0.5, NaN, 0.5, True ), - (NaN, 0.5, 0.5, False )): - self._prepare_for_aggregation(executor_best, current_best) - with self.assertLogs(logging.getLogger("Gatherer"), logging.INFO) as log: - result = self.gatherer.aggregate() - - if first_is_better: - self.assertTrue("INFO:Gatherer:[identity=, run=?]: Finished aggregation for round 0" == log.output[-3]) - self.assertTrue(log.output[-2].startswith("INFO:Gatherer:[identity=, run=?]: I got better metric")) - else: - self.assertTrue("INFO:Gatherer:[identity=, run=?]: Finished aggregation for round 0" == log.output[-2]) - self.assertAlmostEqual(best, result.get_header(Constant.METRIC)) - - def test_aggregating_fires_events(self): - event_catcher = EventCatcher() - - self._prepare_for_aggregation(0.4, 0.6) - with mock.patch("nvflare.apis.fl_component.FLComponent.fire_event", side_effect=event_catcher.catch_event): - self.gatherer.aggregate() - - self.assertListEqual(event_catcher.events_caught, ["_before_aggregation", "_after_aggregation"]) - - def test_gatherer_is_done_if_all_are_finished(self): - for trainer in self.gatherer.trainer_statuses.keys(): - self.gatherer.trainer_statuses[trainer].reply_time = time.time() - self.assertTrue(self.gatherer.is_done()) - - def test_gatherer_is_done_if_timeout(self): - time.sleep(0.11) - with self.assertLogs(logging.getLogger("Gatherer"), logging.INFO) as log: - self.assertTrue(self.gatherer.is_done()) - self.assertTrue("WARNING:Gatherer:[identity=, run=?]: Gatherer for round 0 timed out after 0.1 seconds" in log.output) - - def test_gatherer_is_done_if_enough_responses_received(self): - self.gatherer = self._get_gatherer(all_clients=[self.CLIENT_THAT_TRAINS, self.OTHER_CLIENT_THAT_TRAINS], - trainers=[self.CLIENT_THAT_TRAINS, self.OTHER_CLIENT_THAT_TRAINS], - min_responses_required=1) - 
self.gatherer.trainer_statuses[self.OTHER_CLIENT_THAT_TRAINS].reply_time = time.time() - time.sleep(0.11) - with self.assertLogs(logging.getLogger("Gatherer"), logging.INFO) as log: - self.assertTrue(self.gatherer.is_done()) - self.assertTrue("WARNING:Gatherer:[identity=, run=?]: Gatherer for round 0 timed out after 0.1 seconds" in log.output) - - def test_gatherer_is_not_done_if_no_trainer_is_finished(self): - self.assertIsNone(self.gatherer.is_done()) - - def test_gatherer_is_not_done_if_insufficient_responses_received(self): - self.gatherer = self._get_gatherer(all_clients=[self.CLIENT_THAT_TRAINS, self.OTHER_CLIENT_THAT_TRAINS], - trainers=[self.CLIENT_THAT_TRAINS, self.OTHER_CLIENT_THAT_TRAINS], - min_responses_required=2) - self.gatherer.trainer_statuses[self.OTHER_CLIENT_THAT_TRAINS].reply_time = time.time() - self.assertIsNone(self.gatherer.is_done()) - - def test_gatherer_is_done_if_enough_responses_received_and_waiting_time_expired(self): - self.gatherer = self._get_gatherer(all_clients=[self.CLIENT_THAT_TRAINS, self.OTHER_CLIENT_THAT_TRAINS, self.YET_ANOTHER_CLIENT_THAT_TRAINS], - trainers=[self.CLIENT_THAT_TRAINS, self.OTHER_CLIENT_THAT_TRAINS, self.YET_ANOTHER_CLIENT_THAT_TRAINS], - min_responses_required=2, - timeout=0.5) - now = time.time() - self.gatherer.trainer_statuses[self.CLIENT_THAT_TRAINS].reply_time = now - self.gatherer.trainer_statuses[self.OTHER_CLIENT_THAT_TRAINS].reply_time = now - self.gatherer.min_resps_received_time = now - time.sleep(0.11) - with self.assertLogs(logging.getLogger("Gatherer"), logging.INFO) as log: - self.assertTrue(self.gatherer.is_done()) - self.assertTrue("INFO:Gatherer:[identity=, run=?]: Gatherer for round 0 exit after 0.1 seconds since received minimum responses" in log.output) diff --git a/tests/unit_tests/controller/test_swarm_client_controller.py b/tests/unit_tests/controller/test_swarm_client_controller.py deleted file mode 100644 index 845c0bee..00000000 --- 
a/tests/unit_tests/controller/test_swarm_client_controller.py +++ /dev/null @@ -1,177 +0,0 @@ -import unittest -import logging -from mock import mock - -from nvflare.apis.fl_constant import ReturnCode -from nvflare.apis.fl_context import FLContext, FLContextManager -from nvflare.apis.shareable import Shareable, make_reply -from nvflare.apis.signal import Signal -from nvflare.app_common.app_event_type import AppEventType -from nvflare.app_common.ccwf.common import Constant - -from swarm_client_ctl import SwarmClientController - - -TASK_NAME_PREFIX = "test_prefix" -LEARN_TASK_NAME = "test_learn_task" - - -class MockedEngineForTesting: - def __init__(self): - self.fl_ctx_mgr = FLContextManager(engine=self) - - def new_context(self): - context = self.fl_ctx_mgr.new_context() - return context - - def register_aux_message_handler(self, topic, message_handle_func): - pass - - def fire_event(self, event_type, fl_ctx): - pass - - -class TestSwarmClientController(unittest.TestCase): - def setup_controller(self, - task_name_prefix=TASK_NAME_PREFIX, - learn_task_name=LEARN_TASK_NAME, - persistor_id="test_persistor_id", - shareable_generator_id="test_generator_id", - aggregator_id="test_aggregator_id", - **kwargs - ): - self.controller = SwarmClientController( - task_name_prefix=task_name_prefix, - learn_task_name=learn_task_name, - persistor_id=persistor_id, - shareable_generator_id=shareable_generator_id, - aggregator_id=aggregator_id, - **kwargs - ) - - def setUp(self): - """ - Set up a mock FLContext and instantiate the SwarmClientController with test data - for unit testing. - """ - self.controller = None - self.setup_controller() - self.engine = None - self.testee_logger = logging.getLogger("SwarmClientController") - - def test_initialization_sets_members_correctly(self): - """ - Test the initialization of the SwarmClientController to ensure proper assignment - of attributes. 
- """ - self.assertIsInstance(self.controller, SwarmClientController) - self.assertEqual(self.controller.task_name_prefix, TASK_NAME_PREFIX) - self.assertEqual(self.controller.learn_task_name, LEARN_TASK_NAME) - metric_comparator_id = "test_metric_comparator_id" - self.setup_controller(metric_comparator_id=metric_comparator_id) - self.assertEqual(self.controller.metric_comparator_id, metric_comparator_id) - - def test_incorrect_initializations_raise_errors_and_logs(self): - for argument_empty in ("learn_task_name","persistor_id", "shareable_generator_id", "aggregator_id", "metric_comparator_id"): - with self.assertLogs(self.testee_logger, logging.ERROR) as log, self.assertRaises(ValueError) as error: - self.setup_controller(**{argument_empty: ""}) - self.assertTrue(log.output[0].startswith(f"ERROR:SwarmClientController:Error during initialization: {argument_empty} must not be empty")) - - for nonpositive_number, value in (("learn_task_timeout", -1.0), ("min_responses_required", 0), ("wait_time_after_min_resps_received", 0.0), ("learn_task_timeout", 0.0)): - # no need to distinguish between float and int here - with self.assertLogs(self.testee_logger, logging.ERROR) as log, self.assertRaises(ValueError) as error: - self.setup_controller(**{nonpositive_number: value}) - self.assertTrue(log.output[0].startswith(f"ERROR:SwarmClientController:Error during initialization: {nonpositive_number} must > 0, but got {value}")) - - def _setup_for_processing_config(self, config): - self.setup_controller() - fl_engine = MockedEngineForTesting() - fl_context = fl_engine.new_context() - self.controller.me = config[Constant.CLIENTS][0] - self.controller.engine = fl_engine - self.controller.config = config - return fl_context - - def test_process_config_sets_client_roles_correctly(self): - """ - Test the process_config method to verify correct role assignment as trainer or aggregator. 
- """ - for config in ({Constant.CLIENTS: ["C1", "C2", "C3"], Constant.TRAIN_CLIENTS: ["C1", "C2"], Constant.AGGR_CLIENTS: ["C2", "C3"]}, - {Constant.CLIENTS: ["C1", "C2", "C3"], Constant.TRAIN_CLIENTS: ["C2", "C3"], Constant.AGGR_CLIENTS: ["C1", "C3"]}, - {Constant.CLIENTS: ["C1", "C2", "C3"], Constant.TRAIN_CLIENTS: ["C1", "C2"], Constant.AGGR_CLIENTS: ["C1", "C3"]}, - {Constant.CLIENTS: ["C1", "C2", "C3"], Constant.TRAIN_CLIENTS: ["C1", "C2"] }, - {Constant.CLIENTS: ["C1", "C2", "C3"], Constant.AGGR_CLIENTS: ["C2", "C3"]}, - {Constant.CLIENTS: ["C1", "C2", "C3"], }): - fl_context = self._setup_for_processing_config(config) - self.controller.process_config(fl_context) - is_trainer = ( Constant.TRAIN_CLIENTS not in config.keys() ) or ( "C1" in config[Constant.TRAIN_CLIENTS] ) - is_aggregator = (Constant.AGGR_CLIENTS not in config.keys()) or ("C1" in config[Constant.AGGR_CLIENTS]) - self.assertEqual(self.controller.is_trainer, is_trainer) - self.assertEqual(self.controller.is_aggr, is_aggregator) - - def test_process_config_raises_errors_and_logs(self): - fl_context = FLContext() - with self.assertLogs(self.testee_logger, logging.ERROR) as log, self.assertRaises(TypeError) as error: - self.controller.process_config(fl_context) - self.assertTrue(log.output[0].startswith("ERROR:SwarmClientController:[identity=, run=?]: Exception during process_config")) - - def _setup_for_executing(self, config): - fl_context = self._setup_for_processing_config(config) - shareable = Shareable() - shareable[Constant.CONFIG] = None - abort_signal = Signal() - return fl_context, shareable, abort_signal - - def test_execute_returns_if_no_exception(self): - """ - Test the execute method to ensure proper handling of the task execution flow. 
- """ - for task_name, expected_result in (("test_prefix_report_learn_result", {"__headers__": {"__rc__": "EXECUTION_EXCEPTION"}}), - ("wrong_task_name", {"__headers__": {"__rc__": "TASK_UNKNOWN"}})): - config = {Constant.CLIENTS: ["C1", "C2", "C3"], Constant.TRAIN_CLIENTS: ["C1", "C2"], Constant.AGGR_CLIENTS: ["C2", "C3"]} - fl_context, shareable, abort_signal = self._setup_for_executing(config) - result = self.controller.execute(task_name, shareable, fl_context, abort_signal) - self.assertDictEqual(result, expected_result) - - def test_execute_logs_and_returns_on_exception(self): - config = {Constant.CLIENTS: ["C1", "C2", "C3"], Constant.TRAIN_CLIENTS: ["C1", "C2"], Constant.AGGR_CLIENTS: ["C2", "C3"]} - fl_context, shareable, abort_signal = self._setup_for_executing(config) - - with self.assertLogs(self.testee_logger, logging.ERROR) as log: - with mock.patch("swarm_client_ctl.SwarmClientController._process_learn_result", side_effect=Exception("exception")): - result = self.controller.execute("test_prefix_report_learn_result", shareable, fl_context, abort_signal) - self.assertEqual(result, make_reply(ReturnCode.EXECUTION_EXCEPTION)) - self.assertTrue(log.output[0].startswith("ERROR:SwarmClientController:[identity=, run=?]: Exception during execute")) - - def test_handle_event_unexpected_event_does_not_fail(self): - fl_context = FLContext() - result = self.controller.handle_event(AppEventType.LOCAL_BEST_MODEL_AVAILABLE, fl_context) - self.assertIsNone(result) - - def test_handle_event_expected_event_logged_correctly(self): - fl_context = FLContext() - # TODO Unlike error logs, the SwarmClientController logs this only once via self.log_info, as opposed to self.log_error followed by logger.error. - # Unify this when unifying logging. 
- with self.assertLogs(logging.getLogger("SwarmClientController"), logging.INFO) as log: - self.controller.handle_event(AppEventType.GLOBAL_BEST_MODEL_AVAILABLE, fl_context) - self.assertEqual(log.output[0], "INFO:SwarmClientController:[identity=, run=?]: Got GLOBAL_BEST_MODEL_AVAILABLE: best metric=None") - - def test_handle_event_other_client_affected_does_not_fail(self): - fl_context = FLContext() - fl_context.set_prop(Constant.CLIENT, "C1") - self.controller.me = "C2" - self.controller.handle_event(AppEventType.GLOBAL_BEST_MODEL_AVAILABLE, fl_context) - - def test_handle_event_logs_and_raises_exception(self): - fl_context = FLContext() - - with self.assertLogs(self.testee_logger, logging.ERROR) as log, self.assertRaises(Exception) as error: - with mock.patch("swarm_client_ctl.SwarmClientController.update_status", side_effect=Exception("exception")): - self.controller.handle_event(AppEventType.GLOBAL_BEST_MODEL_AVAILABLE, fl_context) - self.assertTrue(log.output[0].startswith("ERROR:SwarmClientController:[identity=, run=?]: Exception during handle_event")) - - """ - The start_run, start_workflow, and learn_task methods are not unit-tested (yet) - because a minimum setup to run them is not straight-forward. - They are probably better tested as part of an integration test? 
- """ diff --git a/tests/unit_tests/controller/test_swarm_server_controller.py b/tests/unit_tests/controller/test_swarm_server_controller.py deleted file mode 100644 index d3ed971e..00000000 --- a/tests/unit_tests/controller/test_swarm_server_controller.py +++ /dev/null @@ -1,212 +0,0 @@ -import unittest -import logging -from dataclasses import dataclass - -from swarm_server_ctl import SwarmServerController -from nvflare.apis.fl_context import FLContextManager -from nvflare.app_common.ccwf.common import Constant -from nvflare.apis.fl_constant import FLContextKey - - -@dataclass -class Client: - name: str - - -class MockedEngineForTesting: - def __init__(self, job_id, clients): - self.job_id = job_id - self.clients = [Client(i) for i in clients] - self.fl_ctx_mgr = FLContextManager(engine=self) - - def new_context(self): - context = self.fl_ctx_mgr.new_context() - context.set_prop(FLContextKey.WORKFLOW, self.job_id) - return context - - def get_clients(self): - return self.clients - - def fire_event(self, _a, _b): - pass - - -class TestSwarmServerController(unittest.TestCase): - CLIENT_THAT_TRAINS = "client1" - CLIENT_THAT_AGGREGATES = "client2" - CLIENT_THAT_TRAINS_AND_AGGREGATES = "client3" - CLIENT_THAT_DOES_NOTHING = "client4" - CLIENT_THAT_IS_NOT_INVOLVED = "client5" - OTHER_CLIENT_THAT_TRAINS = "client6" - OTHER_CLIENT_THAT_AGGREGATES = "client7" - DEFAULT_NUM_ROUNDS = 2 - - def _set_up(self, clients): - """ - Set up a mock FLContext and instantiate the SwarmServerController with test data - for unit testing. 
- """ - self._engine = MockedEngineForTesting(job_id="UnitTestJob", clients=clients) - self.fl_ctx = self._engine.new_context() - self.testee_logger = logging.getLogger("SwarmServerController") - - def setUp(self): - self._set_up(clients=[]) - - - def _get_minimum_valid_controller(self): - participating_clients = [self.CLIENT_THAT_TRAINS, self.CLIENT_THAT_AGGREGATES] - controller = SwarmServerController(num_rounds=TestSwarmServerController.DEFAULT_NUM_ROUNDS, - participating_clients=participating_clients, - starting_client=self.CLIENT_THAT_TRAINS, - train_clients=[self.CLIENT_THAT_TRAINS], - aggr_clients=[self.CLIENT_THAT_AGGREGATES]) - self._set_up(clients=participating_clients) - return controller - - def _verify_exception_and_error_log(self, error, log, log_prefix: str, expected_message: str): - self.assertEqual(expected_message, str(error.exception)) - self.assertIn(f"{log_prefix}: {expected_message}", log.output) - - def _verify_exception_and_error_log_during_constructor(self, error, log, expected_message: str): - log_prefix = "ERROR:SwarmServerController:Error during initialization" - self._verify_exception_and_error_log(error, log, log_prefix, expected_message) - - def _initialize_start_finalize(self, controller): - controller.initialize_run(self.fl_ctx) - controller.start_controller(self.fl_ctx) - controller.finalize_run(self.fl_ctx) - - def test_initialization_initializes_correctly(self): - """ - Test the initialization of the SwarmServerController to ensure proper assignment - of attributes. - """ - controller = self._get_minimum_valid_controller() - self.assertIsInstance(controller, SwarmServerController) - self.assertEqual(self.DEFAULT_NUM_ROUNDS, controller.num_rounds) - self.assertEqual(self.CLIENT_THAT_TRAINS, controller.starting_client) - - def test_prepare_config_initializes_correctly(self): - """ - Test the preparation of the configuration dictionary, ensuring it includes the - aggregation and training clients. 
- """ - controller = self._get_minimum_valid_controller() - config = controller.prepare_config() - self.assertIn(Constant.AGGR_CLIENTS, config) - self.assertIn(Constant.TRAIN_CLIENTS, config) - - def test_starting_controller_succeeds(self): - """ - Test the start_controller method to verify that the participating clients are - correctly assigned as training or aggregation clients. - """ - controller = self._get_minimum_valid_controller() - controller.initialize_run(self.fl_ctx) - controller.start_controller(self.fl_ctx) - self.assertIn(self.CLIENT_THAT_TRAINS, controller.train_clients) - self.assertIn(self.CLIENT_THAT_AGGREGATES, controller.aggr_clients) - controller.finalize_run(self.fl_ctx) - - def test_invalid_starting_client_raises_error(self): - """ - Test the behavior when an invalid starting_client is provided, ensuring that a - ValueError is raised. - """ - participating_clients = [self.CLIENT_THAT_TRAINS, self.CLIENT_THAT_AGGREGATES] - controller = SwarmServerController(num_rounds=TestSwarmServerController.DEFAULT_NUM_ROUNDS, - participating_clients=participating_clients, - starting_client=self.CLIENT_THAT_IS_NOT_INVOLVED, - train_clients=[self.CLIENT_THAT_TRAINS], - aggr_clients=[self.CLIENT_THAT_AGGREGATES]) - self._set_up(clients=participating_clients) - with self.assertRaises(ValueError) as error: - controller.initialize_run(self.fl_ctx) - self.assertEqual(f"invalid value '{self.CLIENT_THAT_IS_NOT_INVOLVED}' in 'starting_client'", str(error.exception)) - controller.finalize_run(self.fl_ctx) - - def test_unspecified_training_client_succeeds_initialization(self): - participating_clients = [self.CLIENT_THAT_AGGREGATES, self.OTHER_CLIENT_THAT_AGGREGATES] - controller = SwarmServerController(num_rounds=TestSwarmServerController.DEFAULT_NUM_ROUNDS, - participating_clients=participating_clients, - starting_client=self.CLIENT_THAT_AGGREGATES, - # no train_clients given - aggr_clients=participating_clients) - self._set_up(clients=participating_clients) - 
self._initialize_start_finalize(controller) - - def test_no_training_client_succeeds_initialization(self): - participating_clients = [self.CLIENT_THAT_AGGREGATES, self.OTHER_CLIENT_THAT_AGGREGATES] - controller = SwarmServerController(num_rounds=TestSwarmServerController.DEFAULT_NUM_ROUNDS, - participating_clients=participating_clients, - starting_client=self.CLIENT_THAT_AGGREGATES, - train_clients=[], - aggr_clients=participating_clients) - self._set_up(clients=participating_clients) - self._initialize_start_finalize(controller) - - def test_unspecified_aggregating_client_succeeds_initialization(self): - participating_clients = [self.CLIENT_THAT_TRAINS, self.OTHER_CLIENT_THAT_TRAINS] - controller = SwarmServerController(num_rounds=TestSwarmServerController.DEFAULT_NUM_ROUNDS, - participating_clients=participating_clients, - starting_client=self.CLIENT_THAT_TRAINS, - train_clients=participating_clients) # no aggr_clients given - self._set_up(clients=participating_clients) - self._initialize_start_finalize(controller) - - def test_no_aggregating_client_succeeds_initialization(self): - participating_clients = [self.CLIENT_THAT_TRAINS, self.OTHER_CLIENT_THAT_TRAINS] - controller = SwarmServerController(num_rounds=TestSwarmServerController.DEFAULT_NUM_ROUNDS, - participating_clients=participating_clients, - starting_client=self.CLIENT_THAT_TRAINS, - train_clients=participating_clients, - aggr_clients=[]) - self._set_up(clients=participating_clients) - self._initialize_start_finalize(controller) - - def test_uncategorized_client_raises_error(self): - """ - Test the scenario where a participating client is neither in train_clients nor - aggr_clients, ensuring that a RuntimeError is raised. 
- """ - participating_clients = [self.CLIENT_THAT_TRAINS, self.CLIENT_THAT_AGGREGATES, self.CLIENT_THAT_DOES_NOTHING] - controller = SwarmServerController(num_rounds=TestSwarmServerController.DEFAULT_NUM_ROUNDS, - participating_clients=participating_clients, - starting_client=self.CLIENT_THAT_TRAINS, - train_clients=[self.CLIENT_THAT_TRAINS], - aggr_clients=[self.CLIENT_THAT_AGGREGATES]) - self._set_up(clients=participating_clients) - expected_message = f"Config Error: client {self.CLIENT_THAT_DOES_NOTHING} is neither train client nor aggr client" - log_prefix = f"ERROR:SwarmServerController:[identity=, run=?, wf=UnitTestJob]: Error during start_controller" - with self.assertLogs(self.testee_logger, logging.DEBUG) as log, self.assertRaises(RuntimeError) as error: - controller.initialize_run(self.fl_ctx) - self._verify_exception_and_error_log(error, log, log_prefix, expected_message) - controller.finalize_run(self.fl_ctx) - - def test_client_can_be_both_training_and_aggregating(self): - participating_clients = [self.CLIENT_THAT_TRAINS, self.CLIENT_THAT_AGGREGATES, self.CLIENT_THAT_TRAINS_AND_AGGREGATES] - controller = SwarmServerController(num_rounds=TestSwarmServerController.DEFAULT_NUM_ROUNDS, - participating_clients=participating_clients, - starting_client=self.CLIENT_THAT_TRAINS, - train_clients=[self.CLIENT_THAT_TRAINS, self.CLIENT_THAT_TRAINS_AND_AGGREGATES], - aggr_clients=[self.CLIENT_THAT_AGGREGATES, self.CLIENT_THAT_TRAINS_AND_AGGREGATES]) - self._set_up(clients=participating_clients) - self._initialize_start_finalize(controller) - - def test_one_participating_client_fails_initialization(self): - with self.assertLogs(self.testee_logger, logging.DEBUG) as log, self.assertRaises(ValueError) as error: - _ = SwarmServerController(num_rounds=TestSwarmServerController.DEFAULT_NUM_ROUNDS, - participating_clients=[self.CLIENT_THAT_TRAINS], - starting_client=self.CLIENT_THAT_TRAINS) - expected_message = f"Not enough participating_clients: must > 1, but got 
['{self.CLIENT_THAT_TRAINS}']" - self._verify_exception_and_error_log_during_constructor(error, log, expected_message) - - def test_error_in_prepare_config_is_raised(self): - controller = self._get_minimum_valid_controller() - del controller.train_clients # do something (that usually would not make any sense) to trigger an exception/error thrown in prepare_config - expected_message = "'SwarmServerController' object has no attribute 'train_clients'" - log_prefix = f"ERROR:SwarmServerController:Error during prepare_config" - with self.assertLogs(self.testee_logger, logging.DEBUG) as log, self.assertRaises(AttributeError) as error: - controller.prepare_config() - self._verify_exception_and_error_log(error, log, log_prefix, expected_message) From 3e01f1c45bf7abc429ddb08a31ca7149007df425 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 28 Oct 2025 14:19:42 +0100 Subject: [PATCH 258/337] removed modified value for end_workflow_timeout which is not supported by the default server controller --- .../app/config/config_fed_server.conf | 1 - .../app/config/config_fed_server.conf | 1 - 2 files changed, 2 deletions(-) diff --git a/application/jobs/ODELIA_ternary_classification/app/config/config_fed_server.conf b/application/jobs/ODELIA_ternary_classification/app/config/config_fed_server.conf index c9b6cdd5..335b173e 100644 --- a/application/jobs/ODELIA_ternary_classification/app/config/config_fed_server.conf +++ b/application/jobs/ODELIA_ternary_classification/app/config/config_fed_server.conf @@ -19,7 +19,6 @@ workflows = [ num_rounds = 20 start_task_timeout = 360000 progress_timeout = 360000 - end_workflow_timeout = 360000 configure_task_timeout = 360000 max_status_report_interval = 360000 } diff --git a/application/jobs/minimal_training_pytorch_cnn/app/config/config_fed_server.conf b/application/jobs/minimal_training_pytorch_cnn/app/config/config_fed_server.conf index 2f2dafc5..619199b8 100644 --- 
a/application/jobs/minimal_training_pytorch_cnn/app/config/config_fed_server.conf +++ b/application/jobs/minimal_training_pytorch_cnn/app/config/config_fed_server.conf @@ -19,7 +19,6 @@ workflows = [ num_rounds = 5 start_task_timeout = 36000 progress_timeout = 36000 - end_workflow_timeout = 36000 configure_task_timeout = 36000 max_status_report_interval = 36000 } From 12d8ca09ab0705786ce2f60d5f7eb41e8e0653ad Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 29 Oct 2025 14:05:51 +0100 Subject: [PATCH 259/337] added clients --- application/provision/project_MEVIS_test.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/application/provision/project_MEVIS_test.yml b/application/provision/project_MEVIS_test.yml index 2701cb4d..2ee62242 100644 --- a/application/provision/project_MEVIS_test.yml +++ b/application/provision/project_MEVIS_test.yml @@ -15,6 +15,15 @@ participants: - name: MHA type: client org: MEVIS_Test + - name: RUMC + type: client + org: MEVIS_Test + - name: UKA + type: client + org: MEVIS_Test + - name: UZH + type: client + org: MEVIS_Test - name: admin@mevis.odelia type: admin org: MEVIS_Test From b46549622323a7cfc13f394a982800e670a5f089 Mon Sep 17 00:00:00 2001 From: Ultimate-Storm Date: Thu, 30 Oct 2025 05:09:43 +0100 Subject: [PATCH 260/337] chore: update apt versions in Dockerfile_ODELIA --- docker_config/Dockerfile_ODELIA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index a138ca05..4507c1c1 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -53,7 +53,7 @@ RUN apt install -y \ libtasn1-6=4.18.0-4ubuntu0.1 \ libudev1=249.11-0ubuntu3.17 \ libuuid1=2.37.2-4ubuntu3.4 \ - linux-libc-dev=5.15.0-160.170 \ + linux-libc-dev=5.15.0-161.171 \ logsave=1.46.5-2ubuntu1.2 \ mount=2.37.2-4ubuntu3.4 \ openssl=3.0.2-0ubuntu1.20 \ From 3c9a58ec8a94d898e05bead69606dce940e710c1 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 29 Oct 
2025 15:15:26 +0100 Subject: [PATCH 261/337] simplified check for expected output in server and client logs --- runIntegrationTests.sh | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index fb3b9d5a..94f170d9 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -411,16 +411,18 @@ run_dummy_training_in_swarm () { sleep 120 cd "$CWD" + # check for expected output in server log (clients joined, job ID assigned, 5 rounds, start of round logged, finished training logged) cd "$PROJECT_DIR"/prod_00/localhost/startup CONSOLE_OUTPUT=nohup.out - for EXPECTED_OUTPUT in 'Total clients: 2' 'updated status of client client_A on round 4' 'updated status of client client_B on round 4' 'all_done=True' 'Server runner finished.' \ - 'Start to the run Job: [0-9a-f]\{8\}-[0-9a-f]\{4\}-[0-9a-f]\{4\}-[0-9a-f]\{4\}-[0-9a-f]\{12\}' 'updated status of client client_B on round 4' \ - '.*SwarmServerController - INFO - .*updated status of client client_B on round 3: .* action=start_learn_task, all_done=False' \ - '.*SwarmServerController - INFO - .*updated status of client client_B on round 3: .* action=finished_learn_task, all_done=False' \ - '.*ClientManager - INFO - Client: New client client_A.* joined.*' \ - '.*ClientManager - INFO - Client: New client client_B.* joined.*' \ - '.*ClientManager - INFO - Client: New client client_.* joined. Sent token: .* Total clients: 1' \ - '.*ClientManager - INFO - Client: New client client_.* joined. Sent token: .* Total clients: 2'; + for EXPECTED_OUTPUT in 'Client: New client client_A.* joined.*' \ + 'Client: New client client_B.* joined.*' \ + 'Client: New client client_.* joined. Sent token: .* Total clients: 1' \ + 'Client: New client client_.* joined. 
Sent token: .* Total clients: 2' \ + 'Start to the run Job: [0-9a-f\-]\+' \ + 'updated status of client client_A on round 4: .* action=start_learn_task, all_done=False' \ + 'updated status of client client_B on round 4: .* action=start_learn_task, all_done=False' \ + 'all_done=True' \ + 'Server runner finished.'; do if grep -q --regexp="$EXPECTED_OUTPUT" "$CONSOLE_OUTPUT"; then echo "Expected output $EXPECTED_OUTPUT found" @@ -432,6 +434,7 @@ run_dummy_training_in_swarm () { done cd "$CWD" + # check for expected output in client log cd "$PROJECT_DIR"/prod_00/client_A/startup CONSOLE_OUTPUT=nohup.out for EXPECTED_OUTPUT in 'Sending training result to aggregation client' \ @@ -439,12 +442,12 @@ run_dummy_training_in_swarm () { 'val/AUC_ROC' \ 'validation metric .* from client' \ 'aggregating [0-9]* update(s) at round [0-9]*' \ - 'FederatedClient - INFO - Successfully registered client:client_A for project' \ - 'FederatedClient - INFO - Got engine after .* seconds' \ - 'FederatedClient - INFO - Got the new primary SP:' \ - 'SwarmClientController - INFO - .*: accepted learn request from client_.' \ - 'Gatherer - INFO - .*: Contribution from client_. ACCEPTED by the aggregator at round .' \ - 'SwarmClientController - INFO - .*: Broadcasting learn task of round . to .*; aggr client is client_.' + 'Successfully registered client:client_A for project' \ + 'Got engine after .* seconds' \ + 'Got the new primary SP:' \ + 'accepted learn request from client_.' \ + 'Contribution from client_. ACCEPTED by the aggregator at round .' \ + 'Broadcasting learn task of round . to .*; aggr client is client_.' 
do if grep -q --regexp="$EXPECTED_OUTPUT" "$CONSOLE_OUTPUT"; then echo "Expected output $EXPECTED_OUTPUT found" From 827ae5447a298b6308e5a4e4ae68cab393e90c5d Mon Sep 17 00:00:00 2001 From: Ultimate-Storm Date: Tue, 4 Nov 2025 05:09:39 +0100 Subject: [PATCH 262/337] chore: update apt versions in Dockerfile_ODELIA --- docker_config/Dockerfile_ODELIA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 4507c1c1..bd9dd90b 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -94,7 +94,7 @@ RUN apt install -y \ libsasl2-modules-db=2.1.27+dfsg2-3ubuntu1.2 \ libsasl2-modules=2.1.27+dfsg2-3ubuntu1.2 \ libsqlite3-0=3.37.2-2ubuntu0.5 \ - libssh-4=0.9.6-2ubuntu0.22.04.4 \ + libssh-4=0.9.6-2ubuntu0.22.04.5 \ lsb-release=11.1.0ubuntu4 \ media-types=7.0.0 \ pinentry-curses=1.1.1-1build2 \ From 3edc3eb2bce831fc64c1c3f57f9f391487eb5354 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 4 Nov 2025 13:29:40 +0100 Subject: [PATCH 263/337] added ubuntu versions known to work --- assets/readme/README.participant.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/readme/README.participant.md b/assets/readme/README.participant.md index be8d4759..f90454cc 100644 --- a/assets/readme/README.participant.md +++ b/assets/readme/README.participant.md @@ -5,7 +5,7 @@ This guide is for data scientists and medical research sites participating in a ## Prerequisites - Hardware: Min. 
32GB RAM, 8 cores, NVIDIA GPU with 24GB VRAM, 4TB storage -- OS: Ubuntu 20.04 LTS +- OS: Ubuntu 20.04 LTS, 22.04 LTS, or 24.04 LTS - Software: Docker, OpenVPN, Git ## Setup From b43fa6ea8ff9737af38846de148637cf79c49092 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 4 Nov 2025 13:29:57 +0100 Subject: [PATCH 264/337] git not required for participating --- assets/readme/README.participant.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/readme/README.participant.md b/assets/readme/README.participant.md index f90454cc..51cf59db 100644 --- a/assets/readme/README.participant.md +++ b/assets/readme/README.participant.md @@ -6,7 +6,7 @@ This guide is for data scientists and medical research sites participating in a - Hardware: Min. 32GB RAM, 8 cores, NVIDIA GPU with 24GB VRAM, 4TB storage - OS: Ubuntu 20.04 LTS, 22.04 LTS, or 24.04 LTS -- Software: Docker, OpenVPN, Git +- Software: Docker, OpenVPN ## Setup 0. Add this line to your `/etc/hosts`: `172.24.4.65 dl3.tud.de dl3` From 7e4e372c79a9a095137cbbdb162ea11be368ec3f Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 4 Nov 2025 13:34:14 +0100 Subject: [PATCH 265/337] example as reminder that site name should end in "_1" --- assets/readme/README.participant.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/readme/README.participant.md b/assets/readme/README.participant.md index 51cf59db..c42ce479 100644 --- a/assets/readme/README.participant.md +++ b/assets/readme/README.participant.md @@ -79,7 +79,7 @@ The dataset must be in the following format. 1. 
Directories ```bash - export SITE_NAME= + export SITE_NAME= export DATADIR= export SCRATCHDIR= ``` From 5a9c275ed9861685d0cd9fe265ac5bbd46ae28c0 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 4 Nov 2025 13:34:33 +0100 Subject: [PATCH 266/337] consistently end sentences --- assets/readme/README.participant.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/assets/readme/README.participant.md b/assets/readme/README.participant.md index c42ce479..3fe53c55 100644 --- a/assets/readme/README.participant.md +++ b/assets/readme/README.participant.md @@ -160,9 +160,9 @@ For any issues, check if the commands above point to problems and contact your S ## Troubleshooting -* Folders where files are located need to have the correct name -* Image files need to have the correct file name including capitalization -* The directories listed as identifiers in the tables `annotation.csv` and `split.csv` should all be present and named correctly (including capitalization), only those directories should be present -* The tables should not have additional or duplicate columns, entries need to have the correct captitalization +* Folders where files are located need to have the correct name. +* Image files need to have the correct file name including capitalization. +* The directories listed as identifiers in the tables `annotation.csv` and `split.csv` should all be present and named correctly (including capitalization), only those directories should be present. +* The tables should not have additional or duplicate columns, entries need to have the correct captitalization. * Image and table folders and files need to be present in the folders specified via `--data_dir`. Symlinks to other locations do not work, they are not available in the Docker mount. * The correct startup kit needs to be used. `SSLCertVerificationError` or `authentication failed` may indicate an incorrect startup kit incompatible with the current experiment. 
From e5d9d02a701edaadd5a0e4fbaa02f3e3aca56df2 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 4 Nov 2025 13:34:50 +0100 Subject: [PATCH 267/337] added VPN pitfall --- assets/readme/README.participant.md | 1 + 1 file changed, 1 insertion(+) diff --git a/assets/readme/README.participant.md b/assets/readme/README.participant.md index 3fe53c55..4c52892b 100644 --- a/assets/readme/README.participant.md +++ b/assets/readme/README.participant.md @@ -166,3 +166,4 @@ For any issues, check if the commands above point to problems and contact your S * The tables should not have additional or duplicate columns, entries need to have the correct captitalization. * Image and table folders and files need to be present in the folders specified via `--data_dir`. Symlinks to other locations do not work, they are not available in the Docker mount. * The correct startup kit needs to be used. `SSLCertVerificationError` or `authentication failed` may indicate an incorrect startup kit incompatible with the current experiment. +* Do not start the VPN connection more than once on the same machine or on more than one machine at the same time. 
From 77a02946cbd5df867b8a80fd15562a4f823af705 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 7 Nov 2025 10:28:02 +0100 Subject: [PATCH 268/337] throttle local VPN to 60 Mbit/s, matching production setup more closely --- tests/local_vpn/_openvpn_start.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/local_vpn/_openvpn_start.sh b/tests/local_vpn/_openvpn_start.sh index 059cbd6c..3483614e 100644 --- a/tests/local_vpn/_openvpn_start.sh +++ b/tests/local_vpn/_openvpn_start.sh @@ -30,5 +30,4 @@ nohup openvpn --duplicate-cn --client-to-client --config /etc/openvpn/server/ser sleep 2 chmod a+r /server_config/nohup.out -tc qdisc add dev eth0 root tbf rate 30mbit burst 5mbit limit 16gbit - +tc qdisc add dev eth0 root tbf rate 60mbit burst 5mbit limit 16gbit From 93f7dbb140a511545a668493a00e36ee0b573e45 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 7 Nov 2025 10:29:03 +0100 Subject: [PATCH 269/337] changed test site name --- application/provision/project_MEVIS_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/application/provision/project_MEVIS_test.yml b/application/provision/project_MEVIS_test.yml index 2ee62242..318d9982 100644 --- a/application/provision/project_MEVIS_test.yml +++ b/application/provision/project_MEVIS_test.yml @@ -21,7 +21,7 @@ participants: - name: UKA type: client org: MEVIS_Test - - name: UZH + - name: UMCU type: client org: MEVIS_Test - name: admin@mevis.odelia From 2435e4e7c2d736b3b62b1034460f4bf9ed612deb Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 11 Nov 2025 10:16:50 +0100 Subject: [PATCH 270/337] use git archive rather than copy source code directory and clean it up afterwards --- buildDockerImageAndStartupKits.sh | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/buildDockerImageAndStartupKits.sh b/buildDockerImageAndStartupKits.sh index 654e63cd..96b43927 100755 --- a/buildDockerImageAndStartupKits.sh +++ b/buildDockerImageAndStartupKits.sh @@ 
-32,13 +32,12 @@ CONTAINER_VERSION_ID=`git rev-parse --short HEAD` CWD=`pwd` CLEAN_SOURCE_DIR=`mktemp -d` mkdir $CLEAN_SOURCE_DIR/MediSwarm -rsync -ax --exclude workspace . $CLEAN_SOURCE_DIR/MediSwarm/ -cd $CLEAN_SOURCE_DIR/MediSwarm -git clean -x -q -f . +git archive --format=tar HEAD | tar x -C $CLEAN_SOURCE_DIR/MediSwarm/ cd docker_config/NVFlare -git clean -x -q -f . +git archive --format=tar HEAD | tar x -C $CLEAN_SOURCE_DIR/MediSwarm/docker_config/NVFlare cd ../.. -rm .git -rf + +cd $CLEAN_SOURCE_DIR/MediSwarm chmod a+rX . -R # replacements in copy of source code From 6ec15de485ab5c9c83b721e4f49ed3739e0d2c55 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 13 Nov 2025 13:58:17 +0100 Subject: [PATCH 271/337] extracted copying cached pretrained model weights to separate script --- _cacheAndCopyPretrainedModelWeights.sh | 28 ++++++++++++++++++++++++++ buildDockerImageAndStartupKits.sh | 23 +-------------------- 2 files changed, 29 insertions(+), 22 deletions(-) create mode 100755 _cacheAndCopyPretrainedModelWeights.sh diff --git a/_cacheAndCopyPretrainedModelWeights.sh b/_cacheAndCopyPretrainedModelWeights.sh new file mode 100755 index 00000000..c615f5e9 --- /dev/null +++ b/_cacheAndCopyPretrainedModelWeights.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +set -e + +SOURCE_DIR=$1 +TARGET_DIR=$2 + +# prepare pre-trained model weights for being included in Docker image + +MODEL_WEIGHTS_FILE=$SOURCE_DIR'/docker_config/torch_home_cache/hub/checkpoints/dinov2_vits14_pretrain.pth' +MODEL_LICENSE_FILE=$SOURCE_DIR'/docker_config/torch_home_cache/hub/facebookresearch_dinov2_main/LICENSE' +if [[ ! -f $MODEL_WEIGHTS_FILE || ! -f $MODEL_LICENSE_FILE ]]; then + echo "Pre-trained model not available. 
Attempting download" + HUBDIR=$(dirname $(dirname $MODEL_LICENSE_FILE)) + mkdir -p $(dirname $MODEL_WEIGHTS_FILE) + wget https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_pretrain.pth -O $MODEL_WEIGHTS_FILE + wget https://github.com/facebookresearch/dinov2/archive/refs/heads/main.zip -O /tmp/dinov2.zip + unzip /tmp/dinov2.zip -d $HUBDIR + mv $HUBDIR/dinov2-main $HUBDIR/$(basename $(dirname $MODEL_LICENSE_FILE)) + touch $HUBDIR/trusted_list +fi + +if echo 2e405cee1bad14912278296d4f42e993 $MODEL_WEIGHTS_FILE | md5sum --check - && echo 153d2db1c329326a2d9f881317ea942e $MODEL_LICENSE_FILE | md5sum --check -; then + cp -r $SOURCE_DIR/docker_config/torch_home_cache $TARGET_DIR/torch_home_cache +else + exit 1 +fi +chmod a+rX $TARGET_DIR/torch_home_cache -R diff --git a/buildDockerImageAndStartupKits.sh b/buildDockerImageAndStartupKits.sh index 96b43927..4d3bc5e4 100755 --- a/buildDockerImageAndStartupKits.sh +++ b/buildDockerImageAndStartupKits.sh @@ -44,28 +44,7 @@ chmod a+rX . -R sed -i 's#__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_DOCKER_IMAGE__#'$VERSION'#' docker_config/master_template.yml sed -i 's#__REPLACED_BY_CONTAINER_VERSION_IDENTIFIER_WHEN_BUILDING_DOCKER_IMAGE__#'$CONTAINER_VERSION_ID'#' docker_config/master_template.yml -# prepare pre-trained model weights for being included in Docker image - -MODEL_WEIGHTS_FILE=$CWD'/docker_config/torch_home_cache/hub/checkpoints/dinov2_vits14_pretrain.pth' -MODEL_LICENSE_FILE=$CWD'/docker_config/torch_home_cache/hub/facebookresearch_dinov2_main/LICENSE' -if [[ ! -f $MODEL_WEIGHTS_FILE || ! -f $MODEL_LICENSE_FILE ]]; then - echo "Pre-trained model not available. 
Attempting download" - HUBDIR=$(dirname $(dirname $MODEL_LICENSE_FILE)) - mkdir -p $(dirname $MODEL_WEIGHTS_FILE) - wget https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_pretrain.pth -O $MODEL_WEIGHTS_FILE - wget https://github.com/facebookresearch/dinov2/archive/refs/heads/main.zip -O /tmp/dinov2.zip - unzip /tmp/dinov2.zip -d $HUBDIR - mv $HUBDIR/dinov2-main $HUBDIR/$(basename $(dirname $MODEL_LICENSE_FILE)) - touch $HUBDIR/trusted_list -fi - -if echo 2e405cee1bad14912278296d4f42e993 $MODEL_WEIGHTS_FILE | md5sum --check - && echo 153d2db1c329326a2d9f881317ea942e $MODEL_LICENSE_FILE | md5sum --check -; then - cp -r $CWD/docker_config/torch_home_cache $CLEAN_SOURCE_DIR/torch_home_cache -else - exit 1 -fi -chmod a+rX $CLEAN_SOURCE_DIR/torch_home_cache -R - +./_cacheAndCopyPretrainedModelWeights.sh $CWD $CLEAN_SOURCE_DIR cd $CWD # build and print follow-up steps From 4ca17323783f4e7c2cc92b8dc921e85469e80b19 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 13 Nov 2025 14:04:34 +0100 Subject: [PATCH 272/337] refactored to split steps --- _cacheAndCopyPretrainedModelWeights.sh | 48 ++++++++++++++++---------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/_cacheAndCopyPretrainedModelWeights.sh b/_cacheAndCopyPretrainedModelWeights.sh index c615f5e9..81a228e2 100755 --- a/_cacheAndCopyPretrainedModelWeights.sh +++ b/_cacheAndCopyPretrainedModelWeights.sh @@ -2,27 +2,39 @@ set -e -SOURCE_DIR=$1 -TARGET_DIR=$2 - # prepare pre-trained model weights for being included in Docker image +SOURCE_DIR=$1 +TARGET_DIR=$2 MODEL_WEIGHTS_FILE=$SOURCE_DIR'/docker_config/torch_home_cache/hub/checkpoints/dinov2_vits14_pretrain.pth' MODEL_LICENSE_FILE=$SOURCE_DIR'/docker_config/torch_home_cache/hub/facebookresearch_dinov2_main/LICENSE' -if [[ ! -f $MODEL_WEIGHTS_FILE || ! -f $MODEL_LICENSE_FILE ]]; then - echo "Pre-trained model not available. 
Attempting download" - HUBDIR=$(dirname $(dirname $MODEL_LICENSE_FILE)) - mkdir -p $(dirname $MODEL_WEIGHTS_FILE) - wget https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_pretrain.pth -O $MODEL_WEIGHTS_FILE - wget https://github.com/facebookresearch/dinov2/archive/refs/heads/main.zip -O /tmp/dinov2.zip - unzip /tmp/dinov2.zip -d $HUBDIR - mv $HUBDIR/dinov2-main $HUBDIR/$(basename $(dirname $MODEL_LICENSE_FILE)) - touch $HUBDIR/trusted_list -fi -if echo 2e405cee1bad14912278296d4f42e993 $MODEL_WEIGHTS_FILE | md5sum --check - && echo 153d2db1c329326a2d9f881317ea942e $MODEL_LICENSE_FILE | md5sum --check -; then +cache_files () { + if [[ ! -f $MODEL_WEIGHTS_FILE || ! -f $MODEL_LICENSE_FILE ]]; then + echo "Pre-trained model not available. Attempting download" + HUBDIR=$(dirname $(dirname $MODEL_LICENSE_FILE)) + mkdir -p $(dirname $MODEL_WEIGHTS_FILE) + wget https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_pretrain.pth -O $MODEL_WEIGHTS_FILE + wget https://github.com/facebookresearch/dinov2/archive/refs/heads/main.zip -O /tmp/dinov2.zip + unzip /tmp/dinov2.zip -d $HUBDIR + mv $HUBDIR/dinov2-main $HUBDIR/$(basename $(dirname $MODEL_LICENSE_FILE)) + touch $HUBDIR/trusted_list + fi +} + +verify_files () { + if echo 2e405cee1bad14912278296d4f42e993 $MODEL_WEIGHTS_FILE | md5sum --check - && echo 153d2db1c329326a2d9f881317ea942e $MODEL_LICENSE_FILE | md5sum --check -; then + echo "File contents verified successfully." 
+ else + exit 1 + fi +} + +copy_files() { cp -r $SOURCE_DIR/docker_config/torch_home_cache $TARGET_DIR/torch_home_cache -else - exit 1 -fi -chmod a+rX $TARGET_DIR/torch_home_cache -R + chmod a+rX $TARGET_DIR/torch_home_cache -R +} + +cache_files +verify_files +copy_files From 561249626f53ff8b58ad284f5be1fb1672bc93ec Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 13 Nov 2025 14:08:28 +0100 Subject: [PATCH 273/337] meaningful error message --- _cacheAndCopyPretrainedModelWeights.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/_cacheAndCopyPretrainedModelWeights.sh b/_cacheAndCopyPretrainedModelWeights.sh index 81a228e2..535c1453 100755 --- a/_cacheAndCopyPretrainedModelWeights.sh +++ b/_cacheAndCopyPretrainedModelWeights.sh @@ -26,6 +26,7 @@ verify_files () { if echo 2e405cee1bad14912278296d4f42e993 $MODEL_WEIGHTS_FILE | md5sum --check - && echo 153d2db1c329326a2d9f881317ea942e $MODEL_LICENSE_FILE | md5sum --check -; then echo "File contents verified successfully." else + echo "Unexpected file contents." 
exit 1 fi } From 5f90178ee81fa7e952d842d77a3d470f8511ab95 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 17 Nov 2025 10:48:13 +0100 Subject: [PATCH 274/337] removed removed test also from CI --- .github/workflows/pr-test.yaml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/pr-test.yaml b/.github/workflows/pr-test.yaml index 3ab36d4b..d0257023 100644 --- a/.github/workflows/pr-test.yaml +++ b/.github/workflows/pr-test.yaml @@ -52,11 +52,6 @@ jobs: run: | ./runIntegrationTests.sh check_files_on_github - - name: Run controller unit tests - continue-on-error: false - run: | - ./runIntegrationTests.sh run_unit_tests_controller - - name: Run dummy training standalone continue-on-error: false run: | From c68e688f1cca8208b5dfc9ae8e0a8faf668945e7 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 17 Nov 2025 13:20:22 +0100 Subject: [PATCH 275/337] changed capitalization of expected output to what the NVFlare classes print --- runIntegrationTests.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 8d375d24..e6ecdc57 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -432,7 +432,7 @@ run_dummy_training_in_swarm () { # check for expected output in client log cd "$PROJECT_DIR"/prod_00/client_A/startup CONSOLE_OUTPUT=nohup.out - for EXPECTED_OUTPUT in 'Sending training result to aggregation client' \ + for EXPECTED_OUTPUT in 'sending training result to aggregation client' \ 'Epoch 9: 100%' \ 'val/AUC_ROC' \ 'validation metric .* from client' \ @@ -442,7 +442,7 @@ run_dummy_training_in_swarm () { 'Got the new primary SP:' \ 'accepted learn request from client_.' \ 'Contribution from client_. ACCEPTED by the aggregator at round .' \ - 'Broadcasting learn task of round . to .*; aggr client is client_.' + 'broadcasting learn task of round . to .*; aggr client is client_.' 
do if grep -q --regexp="$EXPECTED_OUTPUT" "$CONSOLE_OUTPUT"; then echo "Expected output $EXPECTED_OUTPUT found" From 233ceaf46eb7689e3759973f17bde8345456d119 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 17 Nov 2025 13:25:34 +0100 Subject: [PATCH 276/337] print error message after output to keep error visible --- runIntegrationTests.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index e6ecdc57..ac0790eb 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -422,8 +422,8 @@ run_dummy_training_in_swarm () { if grep -q --regexp="$EXPECTED_OUTPUT" "$CONSOLE_OUTPUT"; then echo "Expected output $EXPECTED_OUTPUT found" else - echo "Expected output $EXPECTED_OUTPUT missing" cat "$CONSOLE_OUTPUT" + echo "Expected output $EXPECTED_OUTPUT missing" exit 1 fi done @@ -447,8 +447,8 @@ run_dummy_training_in_swarm () { if grep -q --regexp="$EXPECTED_OUTPUT" "$CONSOLE_OUTPUT"; then echo "Expected output $EXPECTED_OUTPUT found" else - echo "Expected output $EXPECTED_OUTPUT missing" cat "$CONSOLE_OUTPUT" + echo "Expected output $EXPECTED_OUTPUT missing" exit 1 fi done From d319093146dcb0f6a0aa363df7ac7b95765c3b2c Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 18 Nov 2025 11:32:05 +0100 Subject: [PATCH 277/337] swarm config file for testing controller changes (should be removed again before merging to main) --- .../project_TestControllerChanges.yml | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 application/provision/project_TestControllerChanges.yml diff --git a/application/provision/project_TestControllerChanges.yml b/application/provision/project_TestControllerChanges.yml new file mode 100644 index 00000000..554dc6d7 --- /dev/null +++ b/application/provision/project_TestControllerChanges.yml @@ -0,0 +1,69 @@ +api_version: 3 +name: odelia___REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_STARTUP_KITS___controller_changes_test +description: Five-node 
swarm to test removing custom controller classes + +participants: + # change example.com to the FQDN of the server + - name: dl3.tud.de + type: server + org: TUD + fed_learn_port: 8012 + admin_port: 8013 + - name: CAM_1 + type: client + org: Cambridge_challenge_data + - name: MHA_1 + type: client + org: MHA_challenge_data + - name: RUMC_1 + type: client + org: RUMC_challenge_data + - name: UKA_1 + type: client + org: UKA_challenge_data + - name: UMCU_1 + type: client + org: UMCU_challenge_data + - name: jiefu.zhu@tu-dresden.de + type: admin + org: TUD + role: project_admin + +# The same methods in all builders are called in their order defined in builders section +builders: + - path: nvflare.lighter.impl.workspace.WorkspaceBuilder + args: + template_file: master_template.yml + - path: nvflare.lighter.impl.template.TemplateBuilder + - path: nvflare.lighter.impl.static_file.StaticFileBuilder + args: + # config_folder can be set to inform NVIDIA FLARE where to get configuration + config_folder: config + + # scheme for communication driver (currently supporting the default, grpc, only). + scheme: http + + # app_validator is used to verify if uploaded app has proper structures + # if not set, no app_validator is included in fed_server.json + # app_validator: PATH_TO_YOUR_OWN_APP_VALIDATOR + + # when docker_image is set to a docker image name, docker.sh will be generated on server/client/admin + docker_image: jefftud/odelia:__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_STARTUP_KITS__ + + # download_job_url is set to http://download.server.com/ as default in fed_server.json. You can override this + # to different url. + # download_job_url: http://download.server.com/ + + overseer_agent: + path: nvflare.ha.dummy_overseer_agent.DummyOverseerAgent + # if overseer_exists is true, args here are ignored. Provisioning + # tool will fill role, name and other local parameters automatically. 
+ # if overseer_exists is false, args in this section will be used and the sp_end_point + # must match the server defined above in the format of SERVER_NAME:FL_PORT:ADMIN_PORT + # + overseer_exists: false + args: + sp_end_point: dl3.tud.de:8012:8013 + + - path: nvflare.lighter.impl.cert.CertBuilder + - path: nvflare.lighter.impl.signature.SignatureBuilder From 3225a2ad1b1f397ec7b15af0040703b79b9ced24 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 21 Nov 2025 09:58:46 +0100 Subject: [PATCH 278/337] Revert "swarm config file for testing controller changes (should be removed again before merging to main)" This reverts commit d319093146dcb0f6a0aa363df7ac7b95765c3b2c. --- .../project_TestControllerChanges.yml | 69 ------------------- 1 file changed, 69 deletions(-) delete mode 100644 application/provision/project_TestControllerChanges.yml diff --git a/application/provision/project_TestControllerChanges.yml b/application/provision/project_TestControllerChanges.yml deleted file mode 100644 index 554dc6d7..00000000 --- a/application/provision/project_TestControllerChanges.yml +++ /dev/null @@ -1,69 +0,0 @@ -api_version: 3 -name: odelia___REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_STARTUP_KITS___controller_changes_test -description: Five-node swarm to test removing custom controller classes - -participants: - # change example.com to the FQDN of the server - - name: dl3.tud.de - type: server - org: TUD - fed_learn_port: 8012 - admin_port: 8013 - - name: CAM_1 - type: client - org: Cambridge_challenge_data - - name: MHA_1 - type: client - org: MHA_challenge_data - - name: RUMC_1 - type: client - org: RUMC_challenge_data - - name: UKA_1 - type: client - org: UKA_challenge_data - - name: UMCU_1 - type: client - org: UMCU_challenge_data - - name: jiefu.zhu@tu-dresden.de - type: admin - org: TUD - role: project_admin - -# The same methods in all builders are called in their order defined in builders section -builders: - - path: 
nvflare.lighter.impl.workspace.WorkspaceBuilder - args: - template_file: master_template.yml - - path: nvflare.lighter.impl.template.TemplateBuilder - - path: nvflare.lighter.impl.static_file.StaticFileBuilder - args: - # config_folder can be set to inform NVIDIA FLARE where to get configuration - config_folder: config - - # scheme for communication driver (currently supporting the default, grpc, only). - scheme: http - - # app_validator is used to verify if uploaded app has proper structures - # if not set, no app_validator is included in fed_server.json - # app_validator: PATH_TO_YOUR_OWN_APP_VALIDATOR - - # when docker_image is set to a docker image name, docker.sh will be generated on server/client/admin - docker_image: jefftud/odelia:__REPLACED_BY_CURRENT_VERSION_NUMBER_WHEN_BUILDING_STARTUP_KITS__ - - # download_job_url is set to http://download.server.com/ as default in fed_server.json. You can override this - # to different url. - # download_job_url: http://download.server.com/ - - overseer_agent: - path: nvflare.ha.dummy_overseer_agent.DummyOverseerAgent - # if overseer_exists is true, args here are ignored. Provisioning - # tool will fill role, name and other local parameters automatically. 
- # if overseer_exists is false, args in this section will be used and the sp_end_point - # must match the server defined above in the format of SERVER_NAME:FL_PORT:ADMIN_PORT - # - overseer_exists: false - args: - sp_end_point: dl3.tud.de:8012:8013 - - - path: nvflare.lighter.impl.cert.CertBuilder - - path: nvflare.lighter.impl.signature.SignatureBuilder From 7897d4bf1507f5065082f35e70e0b71b4aee7f05 Mon Sep 17 00:00:00 2001 From: Ultimate-Storm Date: Tue, 25 Nov 2025 05:12:54 +0100 Subject: [PATCH 279/337] chore: update apt versions in Dockerfile_ODELIA --- docker_config/Dockerfile_ODELIA | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index bd9dd90b..8e6e94a5 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -86,8 +86,8 @@ RUN apt install -y \ libnpth0=1.6-3build2 \ libpsl5=0.21.0-1.2build2 \ libpython3-stdlib=3.10.6-1~22.04.1 \ - libpython3.10-minimal=3.10.12-1~22.04.11 \ - libpython3.10-stdlib=3.10.12-1~22.04.11 \ + libpython3.10-minimal=3.10.12-1~22.04.12 \ + libpython3.10-stdlib=3.10.12-1~22.04.12 \ libreadline8=8.1.2-1 \ librtmp1=2.4+20151223.gitfa8646d.1-2build4 \ libsasl2-2=2.1.27+dfsg2-3ubuntu1.2 \ @@ -100,8 +100,8 @@ RUN apt install -y \ pinentry-curses=1.1.1-1build2 \ publicsuffix=20211207.1025-1 \ python3-minimal=3.10.6-1~22.04.1 \ - python3.10-minimal=3.10.12-1~22.04.11 \ - python3.10=3.10.12-1~22.04.11 \ + python3.10-minimal=3.10.12-1~22.04.12 \ + python3.10=3.10.12-1~22.04.12 \ python3=3.10.6-1~22.04.1 \ readline-common=8.1.2-1 \ unzip=6.0-26ubuntu3.2 \ From efabeec4c095c5e36c919e920bdcb040b4b2e258 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 27 Nov 2025 15:58:36 +0100 Subject: [PATCH 280/337] made compatible with Docker 29.0, untested on earlier versions --- scripts/dev_utils/remove_old_odelia_docker_images.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/scripts/dev_utils/remove_old_odelia_docker_images.sh b/scripts/dev_utils/remove_old_odelia_docker_images.sh index 5f25f6d3..a32cf3d0 100755 --- a/scripts/dev_utils/remove_old_odelia_docker_images.sh +++ b/scripts/dev_utils/remove_old_odelia_docker_images.sh @@ -1,11 +1,11 @@ #!/usr/bin/env bash -export OLD_ODELIA_DOCKER_IMAGES=$(docker image list | grep jefftud/odelia | sed 's|jefftud/odelia *[0-9a-z.-]* *||' | sed 's| *.*||' | tail -n +2) -export OLD_ODELIA_DOCKER_IMAGES_LOCAL=$(docker image list | grep localhost:5000/odelia | sed 's|localhost:5000/odelia *[0-9a-z.-]* *||' | sed 's| *.*||' | tail -n +2) +export OLD_ODELIA_DOCKER_IMAGES=$(docker image list --no-trunc | grep jefftud/odelia | sed 's|jefftud/odelia *[0-9a-z.-]* *sha256:||' | sed 's| *.*||' | tail -n +2) +export OLD_ODELIA_DOCKER_IMAGES_LOCAL=$(docker image list --no-trunc | grep localhost:5000/odelia | sed 's|localhost:5000/odelia *[0-9a-z.-]* *sha256:||' | sed 's| *.*||' | tail -n +2) echo "All docker images:" -docker image list +docker image list --no-trunc echo "The following Docker images are old ODELIA docker images:" From 533f8a4f68ce757961dabb425c3efdf62acc654b Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 28 Nov 2025 11:24:24 +0100 Subject: [PATCH 281/337] slightly restructured participant README, added hint to ping server host to identify network issues --- assets/readme/README.participant.md | 32 ++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/assets/readme/README.participant.md b/assets/readme/README.participant.md index 4c52892b..d0804bf5 100644 --- a/assets/readme/README.participant.md +++ b/assets/readme/README.participant.md @@ -150,15 +150,33 @@ To have a baseline for swarm training, train the same model in a comparable way - TODO describe prediction results once implemented - **TensorBoard logs** are stored in their respective folders inside the run directory -5. 
(Optional) You can verify that the container is running properly: - ```bash - docker ps # Check if odelia_swarm_client_$SITE_NAME is listed - nvidia-smi # Check if the GPU is busy training (it will be idling while waiting for model transfer) - tail -f nohup.out # Follow training log - ``` +## Troubleshooting + +### Container Running Properly? + +You can verify that the container is running properly: + +```bash +docker ps # Check if odelia_swarm_client_$SITE_NAME is listed +nvidia-smi # Check if the GPU is busy training (it will be idling while waiting for model transfer) +tail -f nohup.out # Follow training log +``` + For any issues, check if the commands above point to problems and contact your Swarm Operator. -## Troubleshooting +### Connection to Swarm Server Working? + +Let the following command run for an hour or so + +```bash +ping dl3.tud.de +``` + +* If dl3.tud.de cannot be resolved, double-check whether it is contained in `/etc/hosts` +* If it cannot be reached at all, double-check if the VPN connection is working. +* If intermittent package loss occurs, double-check if your network connection is working properly. Creating new VPN credentials and certificate for connection may also help, contact your Swarm Operator for this purpose. + +### Further Possible Issues * Folders where files are located need to have the correct name. * Image files need to have the correct file name including capitalization. 
From 76d14a77f3948fb2146f962e7f09f23ac18e0ace Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 1 Dec 2025 16:43:48 +0100 Subject: [PATCH 282/337] removed python packages that were only needed for testing --- docker_config/Dockerfile_ODELIA | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 233c71a5..efc7a20b 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -230,14 +230,11 @@ RUN python3 -m pip install \ typing-inspection==0.4.1 \ xxhash==3.5.0 -# Install packages needed for testing -RUN python3 -m pip install \ - coverage==7.8.2 \ - mock==5.2.0 # Install packages needed for listing licenses of installed pip packages RUN python3 -m pip install \ pip-licenses==5.0.0 \ prettytable==3.16.0 + # Install packages needed for creating SBOM of apt packages RUN python3 -m pip install \ defusedxml==0.7.1 \ From cceafb26e702940072ff24e89b83a30cfd20da47 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 1 Dec 2025 17:10:52 +0100 Subject: [PATCH 283/337] removed nvflare unit tests from "all" integration tests as it fails for insufficient GPU memory --- runIntegrationTests.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index ac0790eb..291d2a81 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -502,6 +502,11 @@ case "$1" in check_files_on_github ;; + run_nvflare_unit_tests) + run_nvflare_unit_tests + # TODO add to CI or "all" section if we want this (takes several minutes and fails for insufficient GPU memory) + ;; + run_dummy_training_standalone) run_dummy_training_standalone cleanup_temporary_data @@ -579,7 +584,6 @@ case "$1" in run_dummy_training_standalone run_dummy_training_simulation_mode run_dummy_training_poc_mode - run_nvflare_unit_tests create_synthetic_data run_3dcnn_simulation_mode create_startup_kits_and_check_contained_files From 
c465962c40fd2dfd082bdd1dd34974c5d1d6d32f Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 8 Dec 2025 15:39:47 +0100 Subject: [PATCH 284/337] changed pip-licenses output to json format, consistent with distro2sbom, and added fields --- scripts/_list_licenses.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/_list_licenses.sh b/scripts/_list_licenses.sh index 5ecec196..923a6d51 100755 --- a/scripts/_list_licenses.sh +++ b/scripts/_list_licenses.sh @@ -2,6 +2,6 @@ # this script is called inside the ODELIA docker containers to list licenses of all pip and apt packages as well as for pre-trained weights -pip-licenses -s -u --order=license +pip-licenses --with-system --with-urls --with-description --format json distro2sbom -s --format json grep "DINOv2 code and model weights are released under" /torch_home/hub/facebookresearch_dinov2_main/README.md From 8e70d08c4fc96b2e8c2b7355919bb6ee4b13d476 Mon Sep 17 00:00:00 2001 From: Ultimate-Storm Date: Mon, 8 Dec 2025 16:17:15 +0100 Subject: [PATCH 285/337] chore: update apt versions in Dockerfile_ODELIA --- docker_config/Dockerfile_ODELIA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 8e6e94a5..f3560d58 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -53,7 +53,7 @@ RUN apt install -y \ libtasn1-6=4.18.0-4ubuntu0.1 \ libudev1=249.11-0ubuntu3.17 \ libuuid1=2.37.2-4ubuntu3.4 \ - linux-libc-dev=5.15.0-161.171 \ + linux-libc-dev=5.15.0-163.173 \ logsave=1.46.5-2ubuntu1.2 \ mount=2.37.2-4ubuntu3.4 \ openssl=3.0.2-0ubuntu1.20 \ From 2ce19783d69bf3e1c023edc6cda29f013af0f83f Mon Sep 17 00:00:00 2001 From: Ultimate-Storm Date: Mon, 15 Dec 2025 05:24:03 +0100 Subject: [PATCH 286/337] chore: update apt versions in Dockerfile_ODELIA --- docker_config/Dockerfile_ODELIA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker_config/Dockerfile_ODELIA 
b/docker_config/Dockerfile_ODELIA index 02d9be57..4d321c74 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -53,7 +53,7 @@ RUN apt install -y \ libtasn1-6=4.18.0-4ubuntu0.1 \ libudev1=249.11-0ubuntu3.17 \ libuuid1=2.37.2-4ubuntu3.4 \ - linux-libc-dev=5.15.0-163.173 \ + linux-libc-dev=5.15.0-164.174 \ logsave=1.46.5-2ubuntu1.2 \ mount=2.37.2-4ubuntu3.4 \ openssl=3.0.2-0ubuntu1.20 \ From 35e461b0461c6ace333798991cde9507e6d7bf66 Mon Sep 17 00:00:00 2001 From: Ultimate-Storm Date: Thu, 8 Jan 2026 05:23:12 +0100 Subject: [PATCH 287/337] chore: update apt versions in Dockerfile_ODELIA --- docker_config/Dockerfile_ODELIA | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 4d321c74..d30e50b8 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -26,7 +26,7 @@ RUN apt install -y \ coreutils=8.32-4.1ubuntu1.2 \ dpkg=1.21.1ubuntu2.6 \ e2fsprogs=1.46.5-2ubuntu1.2 \ - gpgv=2.2.27-3ubuntu2.4 \ + gpgv=2.2.27-3ubuntu2.5 \ libblkid1=2.37.2-4ubuntu3.4 \ libc-bin=2.35-0ubuntu3.11 \ libc-dev-bin=2.35-0ubuntu3.11 \ @@ -63,17 +63,17 @@ RUN apt install -y \ RUN apt install -y \ apt-transport-https=2.4.14 \ curl=7.81.0-1ubuntu1.21 \ - dirmngr=2.2.27-3ubuntu2.4 \ + dirmngr=2.2.27-3ubuntu2.5 \ distro-info-data=0.52ubuntu0.11 \ - gnupg-l10n=2.2.27-3ubuntu2.4 \ - gnupg-utils=2.2.27-3ubuntu2.4 \ - gnupg=2.2.27-3ubuntu2.4 \ - gpg-agent=2.2.27-3ubuntu2.4 \ - gpg-wks-client=2.2.27-3ubuntu2.4 \ - gpg-wks-server=2.2.27-3ubuntu2.4 \ - gpg=2.2.27-3ubuntu2.4 \ - gpgconf=2.2.27-3ubuntu2.4 \ - gpgsm=2.2.27-3ubuntu2.4 \ + gnupg-l10n=2.2.27-3ubuntu2.5 \ + gnupg-utils=2.2.27-3ubuntu2.5 \ + gnupg=2.2.27-3ubuntu2.5 \ + gpg-agent=2.2.27-3ubuntu2.5 \ + gpg-wks-client=2.2.27-3ubuntu2.5 \ + gpg-wks-server=2.2.27-3ubuntu2.5 \ + gpg=2.2.27-3ubuntu2.5 \ + gpgconf=2.2.27-3ubuntu2.5 \ + gpgsm=2.2.27-3ubuntu2.5 \ libassuan0=2.5.5-1build1 \ 
libbrotli1=1.0.9-2build6 \ libcurl4=7.81.0-1ubuntu1.21 \ From 237ee83d813c7f872e50923f8231fb38c2af1c45 Mon Sep 17 00:00:00 2001 From: Ultimate-Storm Date: Tue, 13 Jan 2026 05:23:00 +0100 Subject: [PATCH 288/337] chore: update apt versions in Dockerfile_ODELIA --- docker_config/Dockerfile_ODELIA | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index d30e50b8..20bf5b31 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -50,7 +50,7 @@ RUN apt install -y \ libss2=1.46.5-2ubuntu1.2 \ libssl3=3.0.2-0ubuntu1.20 \ libsystemd0=249.11-0ubuntu3.17 \ - libtasn1-6=4.18.0-4ubuntu0.1 \ + libtasn1-6=4.18.0-4ubuntu0.2 \ libudev1=249.11-0ubuntu3.17 \ libuuid1=2.37.2-4ubuntu3.4 \ linux-libc-dev=5.15.0-164.174 \ @@ -86,8 +86,8 @@ RUN apt install -y \ libnpth0=1.6-3build2 \ libpsl5=0.21.0-1.2build2 \ libpython3-stdlib=3.10.6-1~22.04.1 \ - libpython3.10-minimal=3.10.12-1~22.04.12 \ - libpython3.10-stdlib=3.10.12-1~22.04.12 \ + libpython3.10-minimal=3.10.12-1~22.04.13 \ + libpython3.10-stdlib=3.10.12-1~22.04.13 \ libreadline8=8.1.2-1 \ librtmp1=2.4+20151223.gitfa8646d.1-2build4 \ libsasl2-2=2.1.27+dfsg2-3ubuntu1.2 \ @@ -100,8 +100,8 @@ RUN apt install -y \ pinentry-curses=1.1.1-1build2 \ publicsuffix=20211207.1025-1 \ python3-minimal=3.10.6-1~22.04.1 \ - python3.10-minimal=3.10.12-1~22.04.12 \ - python3.10=3.10.12-1~22.04.12 \ + python3.10-minimal=3.10.12-1~22.04.13 \ + python3.10=3.10.12-1~22.04.13 \ python3=3.10.6-1~22.04.1 \ readline-common=8.1.2-1 \ unzip=6.0-26ubuntu3.2 \ From 74ef4d513a0b83d7c1610c773483b4ce3c096a24 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 14 Jan 2026 15:27:33 +0100 Subject: [PATCH 289/337] compute ground truth and class probabilities for aggregated model --- .../app/custom/threedcnn_ptl.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git 
a/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py b/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py index ad291652..31b359f2 100644 --- a/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py +++ b/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py @@ -140,9 +140,30 @@ def prepare_training(logger, max_epochs: int, site_name: str): return data_module, model, checkpointing, trainer, path_run_dir, env_vars +def output_GT_and_classprobs_csv(model, data_module: DataModule) -> None: + results = [] + for batch in data_module.val_dataloader(): + source, target = batch['source'], batch['target'] + + with torch.no_grad(): + logits = model(source) # .to(torch.float) + + # Transfer logits to integer + pred_prob = model.logits2probabilities(logits) + + for b in range(pred_prob.size(0)): + results.append({ + 'GT': target[b].tolist(), + 'NN_prob': pred_prob[b].tolist(), + }) + print(results) + brmpf + + def validate_and_train(logger, data_module, model, trainer) -> None: logger.info("--- Validate global model ---") trainer.validate(model, datamodule=data_module) + output_GT_and_classprobs_csv(model, data_module) logger.info("--- Train new model ---") trainer.fit(model, datamodule=data_module) From 92f60a4560b4888124150ebf9ae65d4f0f72c776 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 15 Jan 2026 13:47:36 +0100 Subject: [PATCH 290/337] output ground truth and predictions to csv --- .../app/custom/main.py | 4 +- .../app/custom/threedcnn_ptl.py | 44 +++++++++++-------- 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/application/jobs/ODELIA_ternary_classification/app/custom/main.py b/application/jobs/ODELIA_ternary_classification/app/custom/main.py index b86d6665..03f04d77 100755 --- a/application/jobs/ODELIA_ternary_classification/app/custom/main.py +++ b/application/jobs/ODELIA_ternary_classification/app/custom/main.py @@ -53,10 +53,10 @@ def main(): input_model 
= flare.receive() logger.info(f"Current round: {input_model.current_round}") - threedcnn_ptl.validate_and_train(logger, data_module, model, trainer) + threedcnn_ptl.validate_and_train(logger, data_module, model, trainer, path_run_dir) elif TRAINING_MODE in [TM_PREFLIGHT_CHECK, TM_LOCAL_TRAINING]: - threedcnn_ptl.validate_and_train(logger, data_module, model, trainer) + threedcnn_ptl.validate_and_train(logger, data_module, model, trainer, path_run_dir) if TRAINING_MODE in [TM_LOCAL_TRAINING, TM_SWARM]: threedcnn_ptl.finalize_training(logger, model, checkpointing, trainer, path_run_dir, env_vars) diff --git a/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py b/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py index 31b359f2..95172cbd 100644 --- a/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py +++ b/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py @@ -9,7 +9,7 @@ import torch.multiprocessing as mp import logging - +import csv def get_num_epochs_per_round(site_name: str) -> int: NUM_EPOCHS_FOR_SITE = { @@ -140,30 +140,38 @@ def prepare_training(logger, max_epochs: int, site_name: str): return data_module, model, checkpointing, trainer, path_run_dir, env_vars -def output_GT_and_classprobs_csv(model, data_module: DataModule) -> None: - results = [] - for batch in data_module.val_dataloader(): - source, target = batch['source'], batch['target'] +def output_GT_and_classprobs_csv(model, data_module: DataModule, epoch: int, csv_filename) -> None: + def _determine_GT_and_classprobs(model, data_module: DataModule): + results = [] + for batch in data_module.val_dataloader(): + source, target = batch['source'], batch['target'] + + with torch.no_grad(): + logits = model(source) + + pred_prob = model.logits2probabilities(logits) - with torch.no_grad(): - logits = model(source) # .to(torch.float) + for b in range(pred_prob.size(0)): + results.append({'GT': target[b].tolist(), + 
'pred_prob': pred_prob[b].tolist(), + }) + return results - # Transfer logits to integer - pred_prob = model.logits2probabilities(logits) + def output_csv(results, epoch: int, csv_filename) -> None: + with open(csv_filename, 'a') as csvfile: + datawriter = csv.writer(csvfile) + for datapoint in results: + output_data = [epoch, datapoint['GT'][0]] + datapoint['pred_prob'] + datawriter.writerow(output_data) - for b in range(pred_prob.size(0)): - results.append({ - 'GT': target[b].tolist(), - 'NN_prob': pred_prob[b].tolist(), - }) - print(results) - brmpf + results = _determine_GT_and_classprobs(model, data_module) + output_csv(results, epoch, csv_filename) -def validate_and_train(logger, data_module, model, trainer) -> None: +def validate_and_train(logger, data_module, model, trainer, path_run_dir) -> None: logger.info("--- Validate global model ---") trainer.validate(model, datamodule=data_module) - output_GT_and_classprobs_csv(model, data_module) + output_GT_and_classprobs_csv(model, data_module, trainer.current_epoch, path_run_dir/'aggregated_model_results.csv') logger.info("--- Train new model ---") trainer.fit(model, datamodule=data_module) From 0b43321e66b46c2fa4cb3e27e29ac73a89565409 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 16 Jan 2026 13:59:50 +0100 Subject: [PATCH 291/337] notes on cloning the MediSwarm repository with submodules --- assets/readme/README.developer.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/assets/readme/README.developer.md b/assets/readme/README.developer.md index 0d595563..3da6b675 100644 --- a/assets/readme/README.developer.md +++ b/assets/readme/README.developer.md @@ -1,5 +1,17 @@ # Usage for MediSwarm and Application Code Developers +## Cloning the Repository + +We use a git submodule for a fork of NVFlare, so the MediSwarm repository should be cloned using + ```bash + git clone https://github.com/KatherLab/MediSwarm.git --recurse-submodules + ``` + +If you have a clone without having 
initialized the submodule, use the following command in the MediSwarm directory + ```bash + git submodule update --init --recursive + ``` + ## Versioning of ODELIA Docker Images If needed, update the version number in file [odelia_image.version](../../odelia_image.version). It will be used @@ -82,7 +94,6 @@ export CONFIG=original ./docker.sh --data_dir $DATADIR --scratch_dir $SCRATCHDIR --GPU device=1 --start_client ``` - ## Running the Application 1. **CIFAR-10 example:** From 12ee6f12adabb8c737190a190894fce45fce741e Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 16 Jan 2026 14:23:54 +0100 Subject: [PATCH 292/337] added checkpoint to output ground truth and class probabilities after each epoch --- .../app/custom/threedcnn_ptl.py | 78 +++++++++++-------- 1 file changed, 47 insertions(+), 31 deletions(-) diff --git a/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py b/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py index 95172cbd..413c9924 100644 --- a/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py +++ b/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py @@ -1,7 +1,7 @@ from sklearn.model_selection import train_test_split import torch from pytorch_lightning import Trainer -from pytorch_lightning.callbacks import ModelCheckpoint +from pytorch_lightning.callbacks import ModelCheckpoint, Callback from pytorch_lightning.loggers import TensorBoardLogger from data.datamodules import DataModule from models import ResNet, MST @@ -11,6 +11,9 @@ import logging import csv +FILENAME_GT_PREDPROB_AGGREGATED_MODEL = 'aggregated_model_gt_and_classprob.csv' +FILENAME_GT_PREDPROB_SITE_MODEL = 'site_model_gt_and_classprob.csv' + def get_num_epochs_per_round(site_name: str) -> int: NUM_EPOCHS_FOR_SITE = { "TUD_1": 2, "TUD_2": 4, "TUD_3": 8, @@ -70,6 +73,45 @@ def create_run_directory(env_vars): ) +def output_GT_and_classprobs_csv(model, data_module: DataModule, epoch: int, 
csv_filename) -> None: + def _determine_GT_and_classprobs(model, data_module: DataModule): + results = [] + device = torch.device('cuda') + for batch in data_module.val_dataloader(): + source, target = batch['source'], batch['target'] + + with torch.no_grad(): + logits = model.to(device)(source.to(device)) + + pred_prob = model.logits2probabilities(logits) + + for b in range(pred_prob.size(0)): + results.append({'GT': target[b].tolist(), + 'pred_prob': pred_prob[b].tolist(), + }) + return results + + def output_csv(results, epoch: int, csv_filename) -> None: + with open(csv_filename, 'a') as csvfile: + datawriter = csv.writer(csvfile) + for datapoint in results: + output_data = [epoch, datapoint['GT'][0]] + datapoint['pred_prob'] + datawriter.writerow(output_data) + + results = _determine_GT_and_classprobs(model, data_module) + output_csv(results, epoch, csv_filename) + + +class GT_PredProb_Output_Callback(Callback): + def __init__(self, data_module, csv_filename): + self.data_module = data_module + self.csv_filename = csv_filename + super().__init__() + + def on_train_epoch_end(self, trainer, pl_module): + output_GT_and_classprobs_csv(pl_module, self.data_module, trainer.current_epoch, self.csv_filename) + + def prepare_training(logger, max_epochs: int, site_name: str): try: env_vars = load_environment_variables() @@ -119,12 +161,14 @@ def prepare_training(logger, max_epochs: int, site_name: str): mode=min_max, ) + gt_predprob_output = GT_PredProb_Output_Callback(data_module, path_run_dir/FILENAME_GT_PREDPROB_SITE_MODEL) + trainer = Trainer( accelerator='gpu', accumulate_grad_batches=1, precision='16-mixed', default_root_dir=str(path_run_dir), - callbacks=[checkpointing], + callbacks=[checkpointing, gt_predprob_output], enable_checkpointing=True, check_val_every_n_epoch=1, log_every_n_steps=log_every_n_steps, @@ -140,38 +184,10 @@ def prepare_training(logger, max_epochs: int, site_name: str): return data_module, model, checkpointing, trainer, path_run_dir, 
env_vars -def output_GT_and_classprobs_csv(model, data_module: DataModule, epoch: int, csv_filename) -> None: - def _determine_GT_and_classprobs(model, data_module: DataModule): - results = [] - for batch in data_module.val_dataloader(): - source, target = batch['source'], batch['target'] - - with torch.no_grad(): - logits = model(source) - - pred_prob = model.logits2probabilities(logits) - - for b in range(pred_prob.size(0)): - results.append({'GT': target[b].tolist(), - 'pred_prob': pred_prob[b].tolist(), - }) - return results - - def output_csv(results, epoch: int, csv_filename) -> None: - with open(csv_filename, 'a') as csvfile: - datawriter = csv.writer(csvfile) - for datapoint in results: - output_data = [epoch, datapoint['GT'][0]] + datapoint['pred_prob'] - datawriter.writerow(output_data) - - results = _determine_GT_and_classprobs(model, data_module) - output_csv(results, epoch, csv_filename) - - def validate_and_train(logger, data_module, model, trainer, path_run_dir) -> None: logger.info("--- Validate global model ---") trainer.validate(model, datamodule=data_module) - output_GT_and_classprobs_csv(model, data_module, trainer.current_epoch, path_run_dir/'aggregated_model_results.csv') + output_GT_and_classprobs_csv(model, data_module, trainer.current_epoch, path_run_dir/FILENAME_GT_PREDPROB_AGGREGATED_MODEL) logger.info("--- Train new model ---") trainer.fit(model, datamodule=data_module) From 4ae24334eb9c5030371322b5edbf98644ca1cd4e Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 16 Jan 2026 15:01:43 +0100 Subject: [PATCH 293/337] output validation predictions for aggregated model only in swarm training --- .../jobs/ODELIA_ternary_classification/app/custom/main.py | 2 +- .../app/custom/threedcnn_ptl.py | 5 +++-- application/provision/project_MEVIS_test.yml | 4 ++-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/application/jobs/ODELIA_ternary_classification/app/custom/main.py 
b/application/jobs/ODELIA_ternary_classification/app/custom/main.py index 03f04d77..2d85aa0c 100755 --- a/application/jobs/ODELIA_ternary_classification/app/custom/main.py +++ b/application/jobs/ODELIA_ternary_classification/app/custom/main.py @@ -56,7 +56,7 @@ def main(): threedcnn_ptl.validate_and_train(logger, data_module, model, trainer, path_run_dir) elif TRAINING_MODE in [TM_PREFLIGHT_CHECK, TM_LOCAL_TRAINING]: - threedcnn_ptl.validate_and_train(logger, data_module, model, trainer, path_run_dir) + threedcnn_ptl.validate_and_train(logger, data_module, model, trainer, path_run_dir, output_GT_and_classprob=False) if TRAINING_MODE in [TM_LOCAL_TRAINING, TM_SWARM]: threedcnn_ptl.finalize_training(logger, model, checkpointing, trainer, path_run_dir, env_vars) diff --git a/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py b/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py index 413c9924..b72d408e 100644 --- a/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py +++ b/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py @@ -184,10 +184,11 @@ def prepare_training(logger, max_epochs: int, site_name: str): return data_module, model, checkpointing, trainer, path_run_dir, env_vars -def validate_and_train(logger, data_module, model, trainer, path_run_dir) -> None: +def validate_and_train(logger, data_module, model, trainer, path_run_dir, output_GT_and_classprob=True) -> None: logger.info("--- Validate global model ---") trainer.validate(model, datamodule=data_module) - output_GT_and_classprobs_csv(model, data_module, trainer.current_epoch, path_run_dir/FILENAME_GT_PREDPROB_AGGREGATED_MODEL) + if output_GT_and_classprob: + output_GT_and_classprobs_csv(model, data_module, trainer.current_epoch, path_run_dir/FILENAME_GT_PREDPROB_AGGREGATED_MODEL) logger.info("--- Train new model ---") trainer.fit(model, datamodule=data_module) diff --git 
a/application/provision/project_MEVIS_test.yml b/application/provision/project_MEVIS_test.yml index 318d9982..c8becf14 100644 --- a/application/provision/project_MEVIS_test.yml +++ b/application/provision/project_MEVIS_test.yml @@ -4,7 +4,7 @@ description: > Test setup. participants: - - name: odelia-vm-a-localvpn + - name: localhost type: server org: MEVIS_Test fed_learn_port: 8022 @@ -63,7 +63,7 @@ builders: # overseer_exists: false args: - sp_end_point: odelia-vm-a-localvpn:8022:8023 + sp_end_point: localhost:8022:8023 - path: nvflare.lighter.impl.cert.CertBuilder - path: nvflare.lighter.impl.signature.SignatureBuilder From b61b888db95ab71b05ebe07da26fb7cec52a7933 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 19 Jan 2026 11:02:00 +0100 Subject: [PATCH 294/337] verify that csv with ground truth and class probability values is written --- runIntegrationTests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 291d2a81..c700949b 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -456,7 +456,7 @@ run_dummy_training_in_swarm () { cd "$PROJECT_DIR"/prod_00/client_A/ FILES_PRESENT=$(find . 
-type f -name "*.*") - for EXPECTED_FILE in 'custom/minimal_training.py' 'best_FL_global_model.pt' 'FL_global_model.pt' ; + for EXPECTED_FILE in 'custom/minimal_training.py' 'best_FL_global_model.pt' 'FL_global_model.pt' 'site_model_gt_and_classprob.csv'; do if echo "$FILES_PRESENT" | grep -q "$EXPECTED_FILE" ; then echo "Expected file $EXPECTED_FILE found" From 6f380707090ebd6e67f700d3c1621ce9b5419700 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 19 Jan 2026 11:24:18 +0100 Subject: [PATCH 295/337] =?UTF-8?q?Revert=20"verify=20that=20csv=20with=20?= =?UTF-8?q?ground=20truth=20and=20class=20probability=20values=20is=20writ?= =?UTF-8?q?ten"=20=E2=80=93=20does=20not=20make=20sense=20as=20the=20dummy?= =?UTF-8?q?=20training=20does=20not=20generate=20these=20files?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit b61b888db95ab71b05ebe07da26fb7cec52a7933. --- runIntegrationTests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index c700949b..291d2a81 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -456,7 +456,7 @@ run_dummy_training_in_swarm () { cd "$PROJECT_DIR"/prod_00/client_A/ FILES_PRESENT=$(find . 
-type f -name "*.*") - for EXPECTED_FILE in 'custom/minimal_training.py' 'best_FL_global_model.pt' 'FL_global_model.pt' 'site_model_gt_and_classprob.csv'; + for EXPECTED_FILE in 'custom/minimal_training.py' 'best_FL_global_model.pt' 'FL_global_model.pt' ; do if echo "$FILES_PRESENT" | grep -q "$EXPECTED_FILE" ; then echo "Expected file $EXPECTED_FILE found" From 9feb6040e4b834a33688e4223168b1ea5a35db94 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 19 Jan 2026 18:20:19 +0100 Subject: [PATCH 296/337] documented user name needed when starting admin console --- assets/readme/README.operator.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/assets/readme/README.operator.md b/assets/readme/README.operator.md index 130629b7..e2677f6c 100644 --- a/assets/readme/README.operator.md +++ b/assets/readme/README.operator.md @@ -85,4 +85,5 @@ passwords somewhere, they are only displayed once (or you can download them agai prevent encrypted archives) 4. Make sure the participants have started their clients via the respective startup kits, see below 5. Start the *admin* startup kit using the respective `startup/docker.sh` script to start the admin console -6. Deploy a job by `submit_job ` +6. Log in using the user name configured as "name" of the node of type "admin" (only user name needed, auth happens via certificate) +7. 
Deploy a job by `submit_job ` From 3cb1cb8f88384cf02fc7e48d661257cc3bd33437 Mon Sep 17 00:00:00 2001 From: Ultimate-Storm Date: Thu, 22 Jan 2026 05:27:28 +0100 Subject: [PATCH 297/337] chore: update apt versions in Dockerfile_ODELIA --- docker_config/Dockerfile_ODELIA | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 20bf5b31..ee6e2690 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -28,10 +28,10 @@ RUN apt install -y \ e2fsprogs=1.46.5-2ubuntu1.2 \ gpgv=2.2.27-3ubuntu2.5 \ libblkid1=2.37.2-4ubuntu3.4 \ - libc-bin=2.35-0ubuntu3.11 \ - libc-dev-bin=2.35-0ubuntu3.11 \ - libc6-dev=2.35-0ubuntu3.11 \ - libc6=2.35-0ubuntu3.11 \ + libc-bin=2.35-0ubuntu3.12 \ + libc-dev-bin=2.35-0ubuntu3.12 \ + libc6-dev=2.35-0ubuntu3.12 \ + libc6=2.35-0ubuntu3.12 \ libcap2=1:2.44-1ubuntu0.22.04.2 \ libcom-err2=1.46.5-2ubuntu1.2 \ libext2fs2=1.46.5-2ubuntu1.2 \ From 025d09737235c79aa7d47d1d4e3434e61b73fb69 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 22 Jan 2026 18:03:08 +0100 Subject: [PATCH 298/337] added troubleshooting options when the disk is full --- assets/readme/README.participant.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/assets/readme/README.participant.md b/assets/readme/README.participant.md index d0804bf5..d3dfb50c 100644 --- a/assets/readme/README.participant.md +++ b/assets/readme/README.participant.md @@ -185,3 +185,11 @@ ping dl3.tud.de * Image and table folders and files need to be present in the folders specified via `--data_dir`. Symlinks to other locations do not work, they are not available in the Docker mount. * The correct startup kit needs to be used. `SSLCertVerificationError` or `authentication failed` may indicate an incorrect startup kit incompatible with the current experiment. * Do not start the VPN connection more than once on the same machine or on more than one machine at the same time. 
+* Disk full. This can have multiple reasons: + * Failed trainings may have accumulated large logs. Identify which startup kit folders are big (`du -hsc`). Maybe compression is already a solution, otherwise delete/move elsewhere what is no longer needed. + * Many trainings accumulate many checkpoints (can be GB of data per training). Compression won’t help, possibly delete/move elsewhere what is no longer needed. + * Intermediate steps or unnecessary input for data conversion may have accumulated. + * Docker may have accumulated many images. Delete unnecessary old images (in particular on a development workstation, they tend to accumulate quickly). You can use [remove_old_odelia_docker_images.sh](../../scripts/dev_utils/remove_old_odelia_docker_images.sh) to remove all but the latest one (if that is what you want). Afterwards, call `docker system prune`. +* If you have partitioned your system to have a small system partition and a large data partition, you probably want to configure the container storage to happen on the data partition. + * This can be configured via `echo '{"data-root": "/data/var_lib_docker", "features": {"containerd-snapshotter": true}}' > /etc/docker/daemon.json` (where the containerd-snapshotter may or may not be necessary). + * If the `data-root` is on an external, network or otherwise slow drive, you need to make sure it is available when the container daemon is started, otherwise you will not see previous containers after a reboot. Maybe `sed -i "s/After=/After=SERVICE_PROVIDING_YOUR_DATA_DRIVE.service /g" /usr/lib/systemd/system/containerd.service` is also helpful for you to configure this. 
\ No newline at end of file From 31b9e2a4f1ad9b5c5ab2a3643fd9ff4eebf3a8da Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 23 Jan 2026 15:50:48 +0100 Subject: [PATCH 299/337] added note on (in our case) redundant entries for server and overseer --- assets/readme/README.operator.md | 1 + 1 file changed, 1 insertion(+) diff --git a/assets/readme/README.operator.md b/assets/readme/README.operator.md index e2677f6c..148bae1b 100644 --- a/assets/readme/README.operator.md +++ b/assets/readme/README.operator.md @@ -24,6 +24,7 @@ For example, add the following line (replace `` with the server's actual IP ### Via Script (recommended) 1. Use, e.g., the file `application/provision/project_MEVIS_test.yml`, adapt as needed (network protocol etc.) + * when adapting the server host name or ports, the server’s `name:`, `fed_learn_port`, and `admin_port` must match the `sp_end_point` in the `overseer_agent` section 2. Call `buildDockerImageAndStartupKits.sh -p /path/to/project_configuration.yml` to build the Docker image and the startup kits 3. Startup kits are generated to `workspace//prod_00/` 4. 
Deploy startup kits to the respective server/client operators From f42f8d09d3268b80ac537a910ccf88ce8dae3bd3 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 26 Jan 2026 09:56:21 +0100 Subject: [PATCH 300/337] output ground truth and class probabilities also for training data --- .../app/custom/threedcnn_ptl.py | 39 +++++++++++++------ application/provision/project_MEVIS_test.yml | 6 +-- 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py b/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py index b72d408e..cdd220aa 100644 --- a/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py +++ b/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py @@ -11,8 +11,11 @@ import logging import csv -FILENAME_GT_PREDPROB_AGGREGATED_MODEL = 'aggregated_model_gt_and_classprob.csv' -FILENAME_GT_PREDPROB_SITE_MODEL = 'site_model_gt_and_classprob.csv' +FILENAME_GT_PREDPROB_AGGREGATED_MODEL_TRAIN = 'aggregated_model_gt_and_classprob_train.csv' +FILENAME_GT_PREDPROB_SITE_MODEL_TRAIN = 'site_model_gt_and_classprob_train.csv' + +FILENAME_GT_PREDPROB_AGGREGATED_MODEL_VALIDATION = 'aggregated_model_gt_and_classprob_validation.csv' +FILENAME_GT_PREDPROB_SITE_MODEL_VALIDATION = 'site_model_gt_and_classprob_validation.csv' def get_num_epochs_per_round(site_name: str) -> int: NUM_EPOCHS_FOR_SITE = { @@ -73,11 +76,11 @@ def create_run_directory(env_vars): ) -def output_GT_and_classprobs_csv(model, data_module: DataModule, epoch: int, csv_filename) -> None: - def _determine_GT_and_classprobs(model, data_module: DataModule): +def output_GT_and_classprobs_csv(model, data_module: DataModule, epoch: int, csv_filename_train, csv_filename_validation) -> None: + def _determine_GT_and_classprobs(model, data_loader: torch.utils.data.dataloader.DataLoader): results = [] device = torch.device('cuda') - for batch in data_module.val_dataloader(): + for 
batch in data_loader: source, target = batch['source'], batch['target'] with torch.no_grad(): @@ -98,18 +101,26 @@ def output_csv(results, epoch: int, csv_filename) -> None: output_data = [epoch, datapoint['GT'][0]] + datapoint['pred_prob'] datawriter.writerow(output_data) - results = _determine_GT_and_classprobs(model, data_module) - output_csv(results, epoch, csv_filename) + results_train = _determine_GT_and_classprobs(model, data_module.train_dataloader()) + output_csv(results_train, epoch, csv_filename_train) + results_validation = _determine_GT_and_classprobs(model, data_module.val_dataloader()) + output_csv(results_validation, epoch, csv_filename_validation) + class GT_PredProb_Output_Callback(Callback): - def __init__(self, data_module, csv_filename): + def __init__(self, data_module, csv_filename_train, csv_filename_validation): self.data_module = data_module - self.csv_filename = csv_filename + self.csv_filename_train = csv_filename_train + self.csv_filename_validation = csv_filename_validation super().__init__() def on_train_epoch_end(self, trainer, pl_module): - output_GT_and_classprobs_csv(pl_module, self.data_module, trainer.current_epoch, self.csv_filename) + output_GT_and_classprobs_csv(pl_module, + self.data_module, + trainer.current_epoch, + self.csv_filename_train, + self.csv_filename_validation) def prepare_training(logger, max_epochs: int, site_name: str): @@ -161,7 +172,9 @@ def prepare_training(logger, max_epochs: int, site_name: str): mode=min_max, ) - gt_predprob_output = GT_PredProb_Output_Callback(data_module, path_run_dir/FILENAME_GT_PREDPROB_SITE_MODEL) + gt_predprob_output = GT_PredProb_Output_Callback(data_module, + path_run_dir/FILENAME_GT_PREDPROB_SITE_MODEL_TRAIN, + path_run_dir/FILENAME_GT_PREDPROB_SITE_MODEL_VALIDATION) trainer = Trainer( accelerator='gpu', @@ -188,7 +201,9 @@ def validate_and_train(logger, data_module, model, trainer, path_run_dir, output logger.info("--- Validate global model ---") trainer.validate(model, 
datamodule=data_module) if output_GT_and_classprob: - output_GT_and_classprobs_csv(model, data_module, trainer.current_epoch, path_run_dir/FILENAME_GT_PREDPROB_AGGREGATED_MODEL) + output_GT_and_classprobs_csv(model, data_module, trainer.current_epoch, + path_run_dir/FILENAME_GT_PREDPROB_AGGREGATED_MODEL_TRAIN, + path_run_dir/FILENAME_GT_PREDPROB_AGGREGATED_MODEL_VALIDATION) logger.info("--- Train new model ---") trainer.fit(model, datamodule=data_module) diff --git a/application/provision/project_MEVIS_test.yml b/application/provision/project_MEVIS_test.yml index c8becf14..2039cc7a 100644 --- a/application/provision/project_MEVIS_test.yml +++ b/application/provision/project_MEVIS_test.yml @@ -7,8 +7,8 @@ participants: - name: localhost type: server org: MEVIS_Test - fed_learn_port: 8022 - admin_port: 8023 + fed_learn_port: 8032 + admin_port: 8033 - name: CAM type: client org: MEVIS_Test @@ -63,7 +63,7 @@ builders: # overseer_exists: false args: - sp_end_point: localhost:8022:8023 + sp_end_point: localhost:8032:8033 - path: nvflare.lighter.impl.cert.CertBuilder - path: nvflare.lighter.impl.signature.SignatureBuilder From 93e75490a11fb5b3f0fe901ed60d08d49662efae Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 26 Jan 2026 11:16:32 +0100 Subject: [PATCH 301/337] extended list of output files (included csvs written now) and updated paths --- assets/readme/README.participant.md | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/assets/readme/README.participant.md b/assets/readme/README.participant.md index d0804bf5..79c947d2 100644 --- a/assets/readme/README.participant.md +++ b/assets/readme/README.participant.md @@ -141,14 +141,15 @@ To have a baseline for swarm training, train the same model in a comparable way sudo chmod a+r nohup.out ``` -4. Output files: - - **Training logs and checkpoints** are saved under: - ``` - $SCRATCHDIR/runs/$SITE_NAME// - ``` - - **Best checkpoint** usually saved as `best.ckpt` or `last.ckpt` +4. 
Output files are located in the directory of the startup kit + - Training log: `/log.txt` + - Class probabilities for each round/epoch for training/validation data: `/app_$SITE_NAME/runs/$SITE_NAME//{aggregated,site}_model_gt_and_classprob_{train,validation}.csv` + - Best checkpoint for local data: `/app_$SITE_NAME/runs/$SITE_NAME//epoch=….ckpt` + - Last checkpoint for local data: `/app_$SITE_NAME/runs/$SITE_NAME//last.ckpt` + - Last aggregated model: ``/app_$SITE_NAME/FL_global_model.pt` + - TensorBoard logs: `/app_$SITE_NAME/runs/$SITE_NAME//lightning_logs` + - Code that was used for training: `/app_$SITE_NAME/custom` - TODO describe prediction results once implemented - - **TensorBoard logs** are stored in their respective folders inside the run directory ## Troubleshooting From d452e6690b48d5be322614a3e9d847b9463962cb Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 26 Jan 2026 11:17:51 +0100 Subject: [PATCH 302/337] fixed markdown syntax --- assets/readme/README.participant.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/readme/README.participant.md b/assets/readme/README.participant.md index 79c947d2..0b6a8136 100644 --- a/assets/readme/README.participant.md +++ b/assets/readme/README.participant.md @@ -146,7 +146,7 @@ To have a baseline for swarm training, train the same model in a comparable way - Class probabilities for each round/epoch for training/validation data: `/app_$SITE_NAME/runs/$SITE_NAME//{aggregated,site}_model_gt_and_classprob_{train,validation}.csv` - Best checkpoint for local data: `/app_$SITE_NAME/runs/$SITE_NAME//epoch=….ckpt` - Last checkpoint for local data: `/app_$SITE_NAME/runs/$SITE_NAME//last.ckpt` - - Last aggregated model: ``/app_$SITE_NAME/FL_global_model.pt` + - Last aggregated model: `/app_$SITE_NAME/FL_global_model.pt` - TensorBoard logs: `/app_$SITE_NAME/runs/$SITE_NAME//lightning_logs` - Code that was used for training: `/app_$SITE_NAME/custom` - TODO describe prediction results once 
implemented From 80c77475580dae2272afa787097d2e3ba983a517 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 26 Jan 2026 13:36:02 +0100 Subject: [PATCH 303/337] removed oudated and thus possibly confusing README file information that is still relevant has been moved to separate README files per target group --- assets/readme/README_old.md | 362 ------------------------------------ 1 file changed, 362 deletions(-) delete mode 100644 assets/readme/README_old.md diff --git a/assets/readme/README_old.md b/assets/readme/README_old.md deleted file mode 100644 index 516d4d0b..00000000 --- a/assets/readme/README_old.md +++ /dev/null @@ -1,362 +0,0 @@ -# Introduction - -MediSwarm is an open-source project dedicated to advancing medical deep learning through swarm intelligence, leveraging -the NVFlare platform. Developed in collaboration with the Odelia consortium, this repository aims to create a -decentralized and collaborative framework for medical research and applications. - -## Key Features - -- **Swarm Learning:** Utilizes swarm intelligence principles to improve model performance and adaptability. -- **NVFlare Integration:** Built on NVFlare, providing robust and scalable federated learning capabilities. -- **Data Privacy:** Ensures data security and compliance with privacy regulations by keeping data local to each - institution. -- **Collaborative Research:** Facilitates collaboration among medical researchers and institutions for enhanced - outcomes. -- **Extensible Framework:** Designed to support various medical applications and easily integrate with existing - workflows. - -## Prerequisites - -### Hardware recommendations - -* 64 GB of RAM (32 GB is the absolute minimum) -* 16 CPU cores (8 is the absolute minimum) -* an NVIDIA GPU with 48 GB of RAM (24 GB is the minimum) -* 8 TB of Storage (4 TB is the absolute minimum) - -We demonstrate that the system can run on lightweight hardware like this. 
For less than 10k EUR, you can configure -systems from suppliers like Lambda, Dell Precision, and Dell Alienware. - -### Operating System - -* Ubuntu 20.04 LTS - -### Software - -* Docker -* openvpn -* git - -### Cloning the repository - - ```bash - git clone https://github.com/KatherLab/MediSwarm.git --recurse-submodules - ``` - -* The last argument is necessary because we are using a git submodule for the (ODELIA fork of - NVFlare)[https://github.com/KatherLab/NVFlare_MediSwarm] -* If you have cloned it without this argument, use `git submodule update --init --recursive` - -### VPN - -A VPN is necessary so that the swarm nodes can communicate with each other securely across firewalls. For that purpose, - -1. Install OpenVPN - ```bash - sudo apt-get install openvpn - ``` -2. If you have a graphical user interface(GUI), follow this guide to connect to the - VPN: [VPN setup guide(GUI).pdf](assets/VPN%20setup%20guide%28GUI%29.pdf) -3. If you have a command line interface(CLI), follow this guide to connect to the - VPN: [VPN setup guide(CLI).md](assets/VPN%20setup%20guide%28CLI%29.md) - -# Usage for Swarm Participants - -## Setup - -1. Make sure your compute node satisfies the specification and has the necessary software installed. -2. Clone the repository and connect the client node to the VPN as described above. TODO is cloning the repository - necessary for swarm participants? -3. TODO anything else? - -## Prepare Dataset - -1. see Step 3: Prepare Data in (this document)[application/jobs/ODELIA_ternary_classification/app/scripts/README.md] - -## Prepare Training Participation - -1. Extract startup kit provided by swarm operator - -## Run Pre-Flight Check - -1. Directories - ```bash - export SITE_NAME= # TODO should be defined above, also needed for dataset location - export DATADIR= - export SCRATCHDIR= - ``` -2. From the directory where you unpacked the startup kit, - ```bash - cd $SITE_NAME/startup - ``` -3. 
Verify that your Docker/GPU setup is working - ```bash - ./docker.sh --scratch_dir $SCRATCHDIR --GPU device=0 --dummy_training - ``` - * This will pull the Docker image, which might take a while. - * If you have multiple GPUs and 0 is busy, use a different one. - * The “training” itself should take less than minute and does not yield a meaningful classification performance. -4. Verify that your local data can be accessed and the model can be trained locally - ```bash - ./docker.sh --data_dir $DATADIR --scratch_dir $SCRATCHDIR --GPU device=0 --preflight_check - ``` - * Training time depends on the size of the local dataset. - -## Configurable Parameters for docker.sh - -TODO consider what should be described and recommended as configurable here, given that the goal of the startup kits is -to ensure everyone runs the same training - -When launching the client using `./docker.sh`, the following environment variables are automatically passed into the -container. You can override them to customize training behavior: - -| Environment Variable | Default | Description | -|----------------------|-----------------|----------------------------------------------------------------------| -| `SITE_NAME` | *from flag* | Name of your local site, e.g. `TUD_1`, passed via `--start_client` | -| `DATA_DIR` | *from flag* | Path to the host folder that contains your local data | -| `SCRATCH_DIR` | *from flag* | Path for saving training outputs and temporary files | -| `GPU_DEVICE` | `device=0` | GPU identifier to use inside the container (or `all`) | -| `MODEL` | `MST` | Model architecture, choices: `MST`, `ResNet` | -| `INSTITUTION` | `ODELIA` | Institution name, used to group experiment logs | -| `CONFIG` | `unilateral` | Configuration schema for dataset (e.g. label scheme) | -| `NUM_EPOCHS` | `1` (test mode) | Number of training epochs (used in preflight/local training) | -| `TRAINING_MODE` | derived | Internal use. 
Automatically set based on flags like `--start_client` | - -These are injected into the container as `--env` variables. You can modify their defaults by editing `docker.sh` or -exporting before run: - -```bash -export MODEL=ResNet -export CONFIG=original -./docker.sh --data_dir $DATADIR --scratch_dir $SCRATCHDIR --GPU device=1 --start_client -``` - -## Start Swarm Node - -1. From the directory where you unpacked the startup kit: - ```bash - cd $SITE_NAME/startup # Skip this if you just ran the pre-flight check - ``` - -2. Start the client: - ```bash - ./docker.sh --data_dir $DATADIR --scratch_dir $SCRATCHDIR --GPU device=0 --start_client - ``` - If you have multiple GPUs and 0 is busy, use a different one. - -3. Console output is captured in `nohup.out`, which may have been created with limited permissions in the container, so - make it readable if necessary: - ```bash - sudo chmod a+r nohup.out - ``` - -4. Output files: - - **Training logs and checkpoints** are saved under: - ``` - $SCRATCHDIR/runs/$SITE_NAME// - ``` - - **Best checkpoint** usually saved as `best.ckpt` or `last.ckpt` - - **Prediction results**, if enabled, will appear in subfolders of the same directory - - **TensorBoard logs**, if activated, are stored in their respective folders inside the run directory - - TODO what is enabled/activated should be hard-coded, adapt accordingly - -5. (Optional) You can verify that the container is running properly: - ```bash - docker ps # Check if odelia_swarm_client_$SITE_NAME is listed - nvidia-smi # Check if the GPU is busy training (it will be idling while waiting for model transfer) - tail -f nohup.out # Follow training log - ``` - -## Run Local Training - -1. From the directory where you unpacked the startup kit - ```bash - cd $SITE_NAME/startup - ``` -2. Start local training - ```bash - /docker.sh --data_dir $DATADIR --scratch_dir $SCRATCHDIR --GPU all --local_training - ``` - * TODO update when handling of the number of epochs has been implemented -3. 
Output files - * TODO describe - -# Usage for MediSwarm and Application Code Developers - -## Versioning of ODELIA Docker Images - -If needed, update the version number in file (odelia_image.version)[odelia_image.version]. It will be used automatically -for the Docker image and startup kits. - -## Build the Docker Image and Startup Kits - -The Docker image contains all dependencies for administrative purposes (dashboard, command-line provisioning, admin -console, server) as well as for running the 3DCNN pipeline under the pytorch-lightning framework. -The project description specifies the swarm nodes etc. to be used for a swarm training. - -```bash -cd MediSwarm -./buildDockerImageAndStartupKits.sh -p application/provision/ -``` - -1. Make sure you have no uncommitted changes. -2. If package versions are still not available, you may have to check what the current version is and update the - `Dockerfile` accordingly. Version numbers are hard-coded to avoid issues due to silently different versions being - installed. -3. After successful build (and after verifying that everything works as expected, i.e., local tests, building startup - kits, running local trainings in the startup kit), you can manually push the image to DockerHub, provided you have - the necessary rights. Make sure you are not re-using a version number for this purpose. - -## Running Local Tests - - ```bash - ./runTestsInDocker.sh - ``` - -You should see - -1. several expected errors and warnings printed from unit tests that should succeed overall, and a coverage report -2. output of a successful simulation run with two nodes -3. output of a successful proof-of-concept run run with two nodes -4. output of a set of startup kits being generated -5. output of a dummy training run using one of the startup kits -6. TODO update this to what the tests output now - -Optionally, uncomment running NVFlare unit tests in `_runTestsInsideDocker.sh`. 
- -## Distributing Startup Kits - -Distribute the startup kits to the clients. - -## Running the Application - -1. **CIFAR-10 example:** - See [cifar10/README.md](application/jobs/cifar10/README.md) -2. **Minimal PyTorch CNN example:** - See [application/jobs/minimal_training_pytorch_cnn/README.md](application/jobs/minimal_training_pytorch_cnn/README.md) -3. **3D CNN for classifying breast tumors:** - See [ODELIA_ternary_classification/README.md](application/jobs/ODELIA_ternary_classification/README.md) - -## Contributing Application Code - -1. Take a look at application/jobs/minimal_training_pytorch_cnn for a minimal example how pytorch code can be adapted to - work with NVFlare -2. Take a look at application/jobs/ODELIA_ternary_classification for a more relastic example of pytorch code that can - run in the swarm -3. Use the local tests to check if the code is swarm-ready -4. TODO more detailed instructions - -# Usage for Swarm Operators - -## Setting up a Swarm - -Production mode is designed for secure, real-world deployments. It supports both local and remote setups, whether -on-premise or in the cloud. For more details, refer to -the [NVFLARE Production Mode](https://nvflare.readthedocs.io/en/2.4.1/real_world_fl.html). - -To set up production mode, follow these steps: - -## Edit `/etc/hosts` - -Ensure that your `/etc/hosts` file includes the correct host mappings. All hosts need to be able to communicate to the -server node. - -For example, add the following line (replace `` with the server's actual IP address): - -```plaintext - dl3.tud.de dl3 -``` - -## Create Startup Kits - -### Via Script (recommended) - -1. Use, e.g., the file `application/provision/project_MEVIS_test.yml`, adapt as needed (network protocol etc.) -2. Call `buildStartupKits.sh /path/to/project_configuration.yml` to build the startup kits -3. Startup kits are generated to `workspace//prod_00/` -4. 
Deploy startup kits to the respective server/clients - -### Via the Dashboard (not recommended) - -```bash -docker run -d --rm \ - --ipc=host -p 8443:8443 \ - --name=odelia_swarm_admin \ - -v /var/run/docker.sock:/var/run/docker.sock \ - \ - /bin/bash -c "nvflare dashboard --start --local --cred :" -``` - -using some credentials chosen for the swarm admin account. - -Access the dashboard in a web browser at `https://localhost:8443` log in with these credentials, and configure the -project: - -1. enter project short name, name, description -2. enter docker download link: jefftud/odelia: -3. if needed, enter dates -4. click save -5. Server Configuration > Server (DNS name): -6. click make project public - -#### Register client per site - -Access the dashboard at `https://:8443`. - -1. register a user -2. enter organziation (corresponding to the site) -3. enter role (e.g., org admin) -4. add a site (note: must not contain spaces, best use alphanumerical name) -5. specify number of GPUs and their memory - -#### Approve clients and finish configuration - -Access the dashboard at `https://localhost:8443` log in with the admin credentials. - -1. Users Dashboard > approve client user -2. Client Sites > approve client sites -3. Project Home > freeze project - -## Download startup kits - -After setting up the project admin configuration, server and clients can download their startup kits. Store the -passwords somewhere, they are only displayed once (or you can download them again). - -## Starting a Swarm Training - -1. Connect the *server* host to the VPN as described above. -2. Start the *server* startup kit using the respective `startup/docker.sh` script with the option to start the server -3. Provide the *client* startup kits to the swarm participants (be aware that email providers or other channels may - prevent encrypted archives) -4. Make sure the participants have started their clients via the respective startup kits, see below -5. 
Start the *admin* startup kit using the respective `startup/docker.sh` script to start the admin console -6. Deploy a job by `submit_job ` - -# License - -This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. - -# Maintainers - -[Jeff](https://github.com/Ultimate-Storm) -[Ole Schwen](mailto:ole.schwen@mevis.fraunhofer.de) -[Steffen Renisch](mailto:steffen.renisch@mevis.fraunhofer.de) - -# Contributing - -Feel free to dive in! [Open an issue](https://github.com/KatherLab/MediSwarm/issues) or submit pull requests. - -# Credits - -This project utilizes platforms and resources from the following repositories: - -- **[NVFLARE](https://github.com/NVIDIA/NVFlare)**: NVFLARE (NVIDIA Federated Learning Application Runtime Environment) - is an open-source framework that provides a robust and scalable platform for federated learning applications. We have - integrated NVFLARE to efficiently handle the federated learning aspects of our project. - -Special thanks to the contributors and maintainers of these repositories for their valuable work and support. - ---- - -For more details about NVFLARE and its features, please visit -the [NVFLARE GitHub repository](https://github.com/NVIDIA/NVFlare). From 6795846fc08ce9dfd69943a599b1f0b52d8c5ef7 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 26 Jan 2026 14:08:17 +0100 Subject: [PATCH 304/337] elaborated on making application code MediSwarm-compatible --- assets/readme/README.developer.md | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/assets/readme/README.developer.md b/assets/readme/README.developer.md index 3da6b675..b0452d34 100644 --- a/assets/readme/README.developer.md +++ b/assets/readme/README.developer.md @@ -29,7 +29,7 @@ The project description specifies the swarm nodes etc. to be used for a swarm tr ``` 1. Make sure you have no uncommitted changes. -2. 
If package versions are still not available, you may have to check what the current version is and update the +2. If package versions are no longer available, you may have to check what the current version is and update the `Dockerfile` accordingly. Version numbers are hard-coded to avoid issues due to silently different versions being installed. 3. After successful build (and after verifying that everything works as expected, i.e., local tests, building startup @@ -105,13 +105,30 @@ export CONFIG=original ## Contributing Application Code -1. Take a look at application/jobs/minimal_training_pytorch_cnn for a minimal example how pytorch code can be adapted to - work with NVFlare -2. Take a look at application/jobs/ODELIA_ternary_classification for a more realistic example of pytorch code that can - run in the swarm -3. Use the local tests to check if the code is swarm-ready -4. TODO more detailed instructions +* Take a look at application/jobs/minimal_training_pytorch_cnn for a minimal example how pytorch code can be adapted to work with NVFlare +* Take a look at application/jobs/ODELIA_ternary_classification for a more realistic example of pytorch code that can run in the swarm +* If your application code needs additonal/other/newer Python packages than installed via [Dockerfile_ODELIA](../../docker_config/Dockerfile_ODELIA), create and use an adapted Dockerfile for building the Docker image + * Ensure (by checking against the installation log) that all packages and dependenciese are installed explicitly at pinned versions. +* If your application code needs, e.g., other pre-trained weights in the image, adapt [_cacheAndCopyPretrainedModelWeights.sh](../../_cacheAndCopyPretrainedModelWeights.sh) and [_list_licenses.sh](../../scripts/_list_licenses.sh) + +To make sure your code is swarm-compatible and to isolate potential issues, we recommend the following steps. + +1. 
Create a small dataset (potentially a synthetic one; see, e.g., [create_synthetic_dataset.py](../../application/jobs/ODELIA_ternary_classification/app/scripts/create_synthetic_dataset/create_synthetic_dataset.py)). + This avoids data issues and allows faster feedback cycles. +2. Start with a working version outside the swarm framework in a known environment. + This way, you have a known-to-work baseline. +3. Make sure the code runs in the Docker container in "local training" mode, i.e., without the swarm learning framework, either manually or like in [_run_minimal_example_standalone.sh](../../tests/integration_tests/_run_minimal_example_standalone.sh) + This will tell you if the code is compatible with the Docker container at hand. +4. Make sure the code runs in NVFlare simulation mode, see [_run_3dcnn_simulation_mode.sh](../../tests/integration_tests/_run_3dcnn_simulation_mode.sh). + This checks compatibility of the code with the swarm training framework by running different clients in different threads. +5. Make sure the code runs in NVFlare proof-of-concept mode, see [_run_minimal_example_proof_of_concept_mode.sh](../../tests/integration_tests/_run_minimal_example_proof_of_concept_mode.sh). + Proof-of-concept mode runs different clients in different processes. + This step probably provides few additional insights if simulation mode has already succeeded and can possibly be skipped. +6. Make sure the code runs in an actual swarm training. + +TODO iterate instructions and add missing details ## Continuous Integration Tests to be executed after pushing to github are defined in `.github/workflows/pr-test.yaml`. +This largely builds on the integration tests defined above, running those that finish within reasonable time. 
From 1b3c7275bfda4450e2190c7f58447151e224676a Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 27 Jan 2026 14:12:45 +0100 Subject: [PATCH 305/337] increased version number --- odelia_image.version | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/odelia_image.version b/odelia_image.version index c9a1f1c3..1e799ca4 100644 --- a/odelia_image.version +++ b/odelia_image.version @@ -1,2 +1,2 @@ # version of the ODELIA Docker image, read by different scripts -1.0.1 +1.0.2 From d5dbf2ca3d9e3eaf92e58bf67b6802ca4cf2b22b Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 28 Jan 2026 11:42:09 +0100 Subject: [PATCH 306/337] manually upgraded apt versions --- docker_config/Dockerfile_ODELIA | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index ee6e2690..aa9831b7 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -48,7 +48,7 @@ RUN apt install -y \ libseccomp2=2.5.3-2ubuntu3~22.04.1 \ libsmartcols1=2.37.2-4ubuntu3.4 \ libss2=1.46.5-2ubuntu1.2 \ - libssl3=3.0.2-0ubuntu1.20 \ + libssl3=3.0.2-0ubuntu1.21 \ libsystemd0=249.11-0ubuntu3.17 \ libtasn1-6=4.18.0-4ubuntu0.2 \ libudev1=249.11-0ubuntu3.17 \ @@ -56,7 +56,7 @@ RUN apt install -y \ linux-libc-dev=5.15.0-164.174 \ logsave=1.46.5-2ubuntu1.2 \ mount=2.37.2-4ubuntu3.4 \ - openssl=3.0.2-0ubuntu1.20 \ + openssl=3.0.2-0ubuntu1.21 \ util-linux=2.37.2-4ubuntu3.4 # Install apt-transport-https curl gnupg lsb-release zip and dependencies at defined versions From 5adffcf67ee3ecf7c62765ba4c8f467ba0e1ae5c Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 28 Jan 2026 11:42:09 +0100 Subject: [PATCH 307/337] manually upgraded apt versions --- docker_config/Dockerfile_ODELIA | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index ee6e2690..aa9831b7 100644 --- a/docker_config/Dockerfile_ODELIA +++ 
b/docker_config/Dockerfile_ODELIA @@ -48,7 +48,7 @@ RUN apt install -y \ libseccomp2=2.5.3-2ubuntu3~22.04.1 \ libsmartcols1=2.37.2-4ubuntu3.4 \ libss2=1.46.5-2ubuntu1.2 \ - libssl3=3.0.2-0ubuntu1.20 \ + libssl3=3.0.2-0ubuntu1.21 \ libsystemd0=249.11-0ubuntu3.17 \ libtasn1-6=4.18.0-4ubuntu0.2 \ libudev1=249.11-0ubuntu3.17 \ @@ -56,7 +56,7 @@ RUN apt install -y \ linux-libc-dev=5.15.0-164.174 \ logsave=1.46.5-2ubuntu1.2 \ mount=2.37.2-4ubuntu3.4 \ - openssl=3.0.2-0ubuntu1.20 \ + openssl=3.0.2-0ubuntu1.21 \ util-linux=2.37.2-4ubuntu3.4 # Install apt-transport-https curl gnupg lsb-release zip and dependencies at defined versions From eede564ce7b06a4fdbc8bed2a21d9d4b43c9e0d6 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 28 Jan 2026 11:42:09 +0100 Subject: [PATCH 308/337] manually upgraded apt versions --- docker_config/Dockerfile_ODELIA | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index ee6e2690..aa9831b7 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -48,7 +48,7 @@ RUN apt install -y \ libseccomp2=2.5.3-2ubuntu3~22.04.1 \ libsmartcols1=2.37.2-4ubuntu3.4 \ libss2=1.46.5-2ubuntu1.2 \ - libssl3=3.0.2-0ubuntu1.20 \ + libssl3=3.0.2-0ubuntu1.21 \ libsystemd0=249.11-0ubuntu3.17 \ libtasn1-6=4.18.0-4ubuntu0.2 \ libudev1=249.11-0ubuntu3.17 \ @@ -56,7 +56,7 @@ RUN apt install -y \ linux-libc-dev=5.15.0-164.174 \ logsave=1.46.5-2ubuntu1.2 \ mount=2.37.2-4ubuntu3.4 \ - openssl=3.0.2-0ubuntu1.20 \ + openssl=3.0.2-0ubuntu1.21 \ util-linux=2.37.2-4ubuntu3.4 # Install apt-transport-https curl gnupg lsb-release zip and dependencies at defined versions From f8bcaf7ce964f73ea380e8375b8708fdeff900a6 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 29 Jan 2026 13:20:06 +0100 Subject: [PATCH 309/337] call expect script via command --- runIntegrationTests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runIntegrationTests.sh 
b/runIntegrationTests.sh index 291d2a81..b9d2648d 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -401,7 +401,7 @@ run_dummy_training_in_swarm () { cd "$PROJECT_DIR"/prod_00 cd admin@test.odelia/startup - "$CWD"/tests/integration_tests/_submitDummyTraining.exp + expect -f "$CWD"/tests/integration_tests/_submitDummyTraining.exp docker kill odelia_swarm_admin_$CONTAINER_VERSION_SUFFIX sleep 120 cd "$CWD" From 70be233ae83660cc1fbc87c7fbc350fd05e1458e Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 29 Jan 2026 13:33:46 +0100 Subject: [PATCH 310/337] show permissions of file failing CI --- runIntegrationTests.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index b9d2648d..8496e9bf 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -401,6 +401,7 @@ run_dummy_training_in_swarm () { cd "$PROJECT_DIR"/prod_00 cd admin@test.odelia/startup + ls -l "$CWD"/tests/integration_tests/ expect -f "$CWD"/tests/integration_tests/_submitDummyTraining.exp docker kill odelia_swarm_admin_$CONTAINER_VERSION_SUFFIX sleep 120 From dd7396ecbefc87a9261acc1b8df8a667b423e2cb Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 29 Jan 2026 13:50:22 +0100 Subject: [PATCH 311/337] further debugging of failing CI --- runIntegrationTests.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 8496e9bf..fa46149a 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -401,7 +401,8 @@ run_dummy_training_in_swarm () { cd "$PROJECT_DIR"/prod_00 cd admin@test.odelia/startup - ls -l "$CWD"/tests/integration_tests/ + cat "$CWD"/tests/integration_tests/_submitDummyTraining.exp + expect -v expect -f "$CWD"/tests/integration_tests/_submitDummyTraining.exp docker kill odelia_swarm_admin_$CONTAINER_VERSION_SUFFIX sleep 120 From 27a458e0f24a3d731f4b962516ce4cb09aa1c052 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 29 Jan 2026 14:05:25 
+0100 Subject: [PATCH 312/337] temporarily removed tests from CI for faster feedback --- .github/workflows/pr-test.yaml | 40 ---------------------------------- 1 file changed, 40 deletions(-) diff --git a/.github/workflows/pr-test.yaml b/.github/workflows/pr-test.yaml index d0257023..522563ac 100644 --- a/.github/workflows/pr-test.yaml +++ b/.github/workflows/pr-test.yaml @@ -52,46 +52,6 @@ jobs: run: | ./runIntegrationTests.sh check_files_on_github - - name: Run dummy training standalone - continue-on-error: false - run: | - ./runIntegrationTests.sh run_dummy_training_standalone - - - name: Run dummy training in simulation mode - continue-on-error: false - run: | - ./runIntegrationTests.sh run_dummy_training_simulation_mode - - - name: Run dummy training in proof-of-concept mode - continue-on-error: false - run: | - ./runIntegrationTests.sh run_dummy_training_poc_mode - - - name: Run 3DCNN training in simulation mode - continue-on-error: false - run: | - ./runIntegrationTests.sh run_3dcnn_simulation_mode - - - name: Run integration test creating startup kits - continue-on-error: false - run: | - ./runIntegrationTests.sh create_startup_kits - - - name: Run intergration test listing licenses - continue-on-error: false - run: | - ./runIntegrationTests.sh run_list_licenses - - - name: Run integration test Docker GPU preflight check - continue-on-error: false - run: | - ./runIntegrationTests.sh run_docker_gpu_preflight_check - - - name: Run integration test Data access preflight check - continue-on-error: false - run: | - ./runIntegrationTests.sh run_data_access_preflight_check - - name: Run dummy training in swarm continue-on-error: false run: | From 17f54b2e8744093116852c5b4e4f4aeedc127b33 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 29 Jan 2026 14:05:36 +0100 Subject: [PATCH 313/337] check folder permissions --- runIntegrationTests.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 
fa46149a..ec64f62e 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -401,8 +401,7 @@ run_dummy_training_in_swarm () { cd "$PROJECT_DIR"/prod_00 cd admin@test.odelia/startup - cat "$CWD"/tests/integration_tests/_submitDummyTraining.exp - expect -v + namei -f "$CWD"/tests/integration_tests/_submitDummyTraining.exp expect -f "$CWD"/tests/integration_tests/_submitDummyTraining.exp docker kill odelia_swarm_admin_$CONTAINER_VERSION_SUFFIX sleep 120 From a754e063d942ded5e864d28f600ca2bad5be9e99 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 29 Jan 2026 14:14:23 +0100 Subject: [PATCH 314/337] temporarily removed another step not necessary for debugging --- .github/workflows/pr-test.yaml | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/.github/workflows/pr-test.yaml b/.github/workflows/pr-test.yaml index 522563ac..4726a6e8 100644 --- a/.github/workflows/pr-test.yaml +++ b/.github/workflows/pr-test.yaml @@ -34,16 +34,6 @@ jobs: echo "VERSION=$VERSION" echo "version=$VERSION" >> $GITHUB_OUTPUT - - name: Build Docker image for real project (MEVIS) - run: | - chmod +x buildDockerImageAndStartupKits.sh - ./buildDockerImageAndStartupKits.sh -p application/provision/project_MEVIS_test.yml - - - name: Show workspace path for MEVIS project - run: | - echo "WORKSPACE_PATH: ${{ env.WORKSPACE_PATH }}" - find workspace -maxdepth 1 -type d -name "odelia_*_MEVIS_test" || echo "No workspace found" - - name: Build Docker image and dummy startup kits run: ./buildDockerImageAndStartupKits.sh -p tests/provision/dummy_project_for_testing.yml --use-docker-cache From 15a1d551d21989b13fe7735cd5648f8a06c5d1d3 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 29 Jan 2026 14:14:31 +0100 Subject: [PATCH 315/337] fixed option --- runIntegrationTests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index ec64f62e..8ecbe5f7 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -401,7 
+401,7 @@ run_dummy_training_in_swarm () { cd "$PROJECT_DIR"/prod_00 cd admin@test.odelia/startup - namei -f "$CWD"/tests/integration_tests/_submitDummyTraining.exp + namei -l "$CWD"/tests/integration_tests/_submitDummyTraining.exp expect -f "$CWD"/tests/integration_tests/_submitDummyTraining.exp docker kill odelia_swarm_admin_$CONTAINER_VERSION_SUFFIX sleep 120 From f6f163c70b12c8b0b4dd8103bc604c094f8f774a Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 29 Jan 2026 14:39:56 +0100 Subject: [PATCH 316/337] Revert "temporarily removed tests from CI for faster feedback" This reverts commit 27a458e0f24a3d731f4b962516ce4cb09aa1c052. --- .github/workflows/pr-test.yaml | 40 ++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/.github/workflows/pr-test.yaml b/.github/workflows/pr-test.yaml index 4726a6e8..cb1f7171 100644 --- a/.github/workflows/pr-test.yaml +++ b/.github/workflows/pr-test.yaml @@ -42,6 +42,46 @@ jobs: run: | ./runIntegrationTests.sh check_files_on_github + - name: Run dummy training standalone + continue-on-error: false + run: | + ./runIntegrationTests.sh run_dummy_training_standalone + + - name: Run dummy training in simulation mode + continue-on-error: false + run: | + ./runIntegrationTests.sh run_dummy_training_simulation_mode + + - name: Run dummy training in proof-of-concept mode + continue-on-error: false + run: | + ./runIntegrationTests.sh run_dummy_training_poc_mode + + - name: Run 3DCNN training in simulation mode + continue-on-error: false + run: | + ./runIntegrationTests.sh run_3dcnn_simulation_mode + + - name: Run integration test creating startup kits + continue-on-error: false + run: | + ./runIntegrationTests.sh create_startup_kits + + - name: Run intergration test listing licenses + continue-on-error: false + run: | + ./runIntegrationTests.sh run_list_licenses + + - name: Run integration test Docker GPU preflight check + continue-on-error: false + run: | + ./runIntegrationTests.sh 
run_docker_gpu_preflight_check + + - name: Run integration test Data access preflight check + continue-on-error: false + run: | + ./runIntegrationTests.sh run_data_access_preflight_check + - name: Run dummy training in swarm continue-on-error: false run: | From 09eefdfd3313d00bfa359b6033dd4bf215df797e Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 29 Jan 2026 14:40:07 +0100 Subject: [PATCH 317/337] Revert "temporarily removed another step not necessary for debugging" This reverts commit a754e063d942ded5e864d28f600ca2bad5be9e99. --- .github/workflows/pr-test.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/pr-test.yaml b/.github/workflows/pr-test.yaml index cb1f7171..d0257023 100644 --- a/.github/workflows/pr-test.yaml +++ b/.github/workflows/pr-test.yaml @@ -34,6 +34,16 @@ jobs: echo "VERSION=$VERSION" echo "version=$VERSION" >> $GITHUB_OUTPUT + - name: Build Docker image for real project (MEVIS) + run: | + chmod +x buildDockerImageAndStartupKits.sh + ./buildDockerImageAndStartupKits.sh -p application/provision/project_MEVIS_test.yml + + - name: Show workspace path for MEVIS project + run: | + echo "WORKSPACE_PATH: ${{ env.WORKSPACE_PATH }}" + find workspace -maxdepth 1 -type d -name "odelia_*_MEVIS_test" || echo "No workspace found" + - name: Build Docker image and dummy startup kits run: ./buildDockerImageAndStartupKits.sh -p tests/provision/dummy_project_for_testing.yml --use-docker-cache From 6fc5784e3808a4b075b98321acf9b3ee1be3c27a Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 29 Jan 2026 14:40:44 +0100 Subject: [PATCH 318/337] removed debugging experiments --- runIntegrationTests.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/runIntegrationTests.sh b/runIntegrationTests.sh index 8ecbe5f7..b9d2648d 100755 --- a/runIntegrationTests.sh +++ b/runIntegrationTests.sh @@ -401,7 +401,6 @@ run_dummy_training_in_swarm () { cd "$PROJECT_DIR"/prod_00 cd admin@test.odelia/startup - namei -l 
"$CWD"/tests/integration_tests/_submitDummyTraining.exp expect -f "$CWD"/tests/integration_tests/_submitDummyTraining.exp docker kill odelia_swarm_admin_$CONTAINER_VERSION_SUFFIX sleep 120 From 602fb264bbba73793beaec3d3cafda5177b088fd Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Wed, 28 Jan 2026 11:42:09 +0100 Subject: [PATCH 319/337] manually upgraded apt versions --- docker_config/Dockerfile_ODELIA | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index ee6e2690..aa9831b7 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -48,7 +48,7 @@ RUN apt install -y \ libseccomp2=2.5.3-2ubuntu3~22.04.1 \ libsmartcols1=2.37.2-4ubuntu3.4 \ libss2=1.46.5-2ubuntu1.2 \ - libssl3=3.0.2-0ubuntu1.20 \ + libssl3=3.0.2-0ubuntu1.21 \ libsystemd0=249.11-0ubuntu3.17 \ libtasn1-6=4.18.0-4ubuntu0.2 \ libudev1=249.11-0ubuntu3.17 \ @@ -56,7 +56,7 @@ RUN apt install -y \ linux-libc-dev=5.15.0-164.174 \ logsave=1.46.5-2ubuntu1.2 \ mount=2.37.2-4ubuntu3.4 \ - openssl=3.0.2-0ubuntu1.20 \ + openssl=3.0.2-0ubuntu1.21 \ util-linux=2.37.2-4ubuntu3.4 # Install apt-transport-https curl gnupg lsb-release zip and dependencies at defined versions From 6a21655d7fefe0e9087803b4ea9e0fc82a9e82b4 Mon Sep 17 00:00:00 2001 From: Ultimate-Storm Date: Fri, 30 Jan 2026 05:37:17 +0100 Subject: [PATCH 320/337] chore: update apt versions in Dockerfile_ODELIA --- docker_config/Dockerfile_ODELIA | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index ee6e2690..bf7833c0 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -48,15 +48,15 @@ RUN apt install -y \ libseccomp2=2.5.3-2ubuntu3~22.04.1 \ libsmartcols1=2.37.2-4ubuntu3.4 \ libss2=1.46.5-2ubuntu1.2 \ - libssl3=3.0.2-0ubuntu1.20 \ + libssl3=3.0.2-0ubuntu1.21 \ libsystemd0=249.11-0ubuntu3.17 \ libtasn1-6=4.18.0-4ubuntu0.2 \ 
libudev1=249.11-0ubuntu3.17 \ libuuid1=2.37.2-4ubuntu3.4 \ - linux-libc-dev=5.15.0-164.174 \ + linux-libc-dev=5.15.0-168.178 \ logsave=1.46.5-2ubuntu1.2 \ mount=2.37.2-4ubuntu3.4 \ - openssl=3.0.2-0ubuntu1.20 \ + openssl=3.0.2-0ubuntu1.21 \ util-linux=2.37.2-4ubuntu3.4 # Install apt-transport-https curl gnupg lsb-release zip and dependencies at defined versions From 0747a77d43d4748297d2f95a1e6f2cf77a869174 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 30 Jan 2026 09:57:21 +0100 Subject: [PATCH 321/337] log hash of dataset UIDs --- .../app/custom/threedcnn_ptl.py | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py b/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py index ad291652..f8c1a531 100644 --- a/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py +++ b/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py @@ -7,6 +7,8 @@ from models import ResNet, MST from env_config import load_environment_variables, prepare_odelia_dataset, generate_run_directory import torch.multiprocessing as mp +from hashlib import sha3_224 as hash_function +from typing import List import logging @@ -28,6 +30,32 @@ def set_up_logging(): return logger +def log_data_hash(dm: DataModule, logger) -> None: + def _get_imagename_hashes(dataloader) -> List[str]: + h = hash_function() + hashes = [] + for batch in dataloader: + assert (len(batch['uid']) == 1) # currently only implemented for batch size 1 + h.update(batch['uid'][0].encode('utf-8')) + print(h.hexdigest()) + hashes.append(h.hexdigest()) + return hashes + + def _check_for_duplicates(strings: List[str]) -> None: + if len(strings) != len(set(strings)): + print("Duplicates detected. 
Please make sure this was intended") + + imagename_hashes_train = _get_imagename_hashes(dm.train_dataloader()) + imagename_hashes_validation = _get_imagename_hashes(dm.val_dataloader()) + _check_for_duplicates(imagename_hashes_train + imagename_hashes_validation) + imagename_hashes_train.sort() + imagename_hashes_validation.sort() + all_hashes = ''.join(imagename_hashes_train) + ''.join(imagename_hashes_validation) + h = hash_function() + h.update(all_hashes.encode('utf-8')) + logger.info(f"Data hash: f{h.hexdigest()}") + + def set_up_data_module(logger): torch.set_float32_matmul_precision('high') ds_train, ds_val, path_run_dir, run_name = prepare_odelia_dataset() @@ -56,6 +84,8 @@ def set_up_data_module(logger): # logger.info(f"Label '{label}': {pct:.2f}% of training set, Count: {distribution['counts'][label]}") # logger.info(f"Number of unique labels: {len(distribution['counts'])}") + log_data_hash(dm, logger) + loss_kwargs = {} return dm, path_run_dir, run_name, num_classes, loss_kwargs From 3649a5ccf8f56b3ee800d78b7b09b267fde08b73 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 30 Jan 2026 13:27:52 +0100 Subject: [PATCH 322/337] extend hash to image data --- .../app/custom/threedcnn_ptl.py | 43 ++++++++++++++----- 1 file changed, 32 insertions(+), 11 deletions(-) diff --git a/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py b/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py index f8c1a531..03705146 100644 --- a/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py +++ b/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py @@ -8,7 +8,7 @@ from env_config import load_environment_variables, prepare_odelia_dataset, generate_run_directory import torch.multiprocessing as mp from hashlib import sha3_224 as hash_function -from typing import List +from typing import List, Tuple import logging @@ -37,20 +37,41 @@ def _get_imagename_hashes(dataloader) -> List[str]: for 
batch in dataloader: assert (len(batch['uid']) == 1) # currently only implemented for batch size 1 h.update(batch['uid'][0].encode('utf-8')) - print(h.hexdigest()) hashes.append(h.hexdigest()) return hashes - def _check_for_duplicates(strings: List[str]) -> None: + def _get_imagedata_hashes(dataloader) -> List[str]: + h = hash_function() + hashes = [] + for batch in dataloader: + assert (len(batch['source']) == 1) # currently only implemented for batch size 1 + h.update(batch['source'][0].detach().cpu().numpy().data) + hashes.append(h.hexdigest()) + return hashes + + def _check_for_duplicates(strings: List[str], where: str) -> None: if len(strings) != len(set(strings)): - print("Duplicates detected. Please make sure this was intended") - - imagename_hashes_train = _get_imagename_hashes(dm.train_dataloader()) - imagename_hashes_validation = _get_imagename_hashes(dm.val_dataloader()) - _check_for_duplicates(imagename_hashes_train + imagename_hashes_validation) - imagename_hashes_train.sort() - imagename_hashes_validation.sort() - all_hashes = ''.join(imagename_hashes_train) + ''.join(imagename_hashes_validation) + print(f"Duplicate {where} detected. 
Please make sure this was intended") + + def _get_imagename_hashes_train_val(dm: DataModule) -> Tuple[str, str]: + imagename_hashes_train = _get_imagename_hashes(dm.train_dataloader()) + imagename_hashes_validation = _get_imagename_hashes(dm.val_dataloader()) + _check_for_duplicates(imagename_hashes_train + imagename_hashes_validation, 'image UIDs') + imagename_hashes_train.sort() + imagename_hashes_validation.sort() + return imagename_hashes_train, imagename_hashes_validation + + def _get_imagedata_hashes_train_val(dm: DataModule) -> Tuple[str, str]: + imagedata_hashes_train = _get_imagedata_hashes(dm.train_dataloader()) + imagedata_hashes_validation = _get_imagedata_hashes(dm.val_dataloader()) + _check_for_duplicates(imagedata_hashes_train + imagedata_hashes_validation, 'image data') + imagedata_hashes_train.sort() + imagedata_hashes_validation.sort() + return imagedata_hashes_train, imagedata_hashes_validation + + imagename_hashes_train, imagename_hashes_validation = _get_imagename_hashes_train_val(dm) + imagedata_hashes_train, imagedata_hashes_validation = _get_imagedata_hashes_train_val(dm) + all_hashes = ''.join(imagename_hashes_train) + ''.join(imagename_hashes_validation) + ''.join(imagedata_hashes_train) + ''.join(imagedata_hashes_validation) h = hash_function() h.update(all_hashes.encode('utf-8')) logger.info(f"Data hash: f{h.hexdigest()}") From 5a4768f58b39865931ec445a7c71812c250f698c Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 2 Feb 2026 14:48:06 +0100 Subject: [PATCH 323/337] documented output files of local training --- assets/readme/README.participant.md | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/assets/readme/README.participant.md b/assets/readme/README.participant.md index d897a69b..753a1e8f 100644 --- a/assets/readme/README.participant.md +++ b/assets/readme/README.participant.md @@ -113,8 +113,12 @@ To have a baseline for swarm training, train the same model in a comparable way 
./docker.sh --data_dir $DATADIR --scratch_dir $SCRATCHDIR --GPU device=0 --local_training 2>&1 | tee local_training_console_output.txt ``` * This currently runs 100 epochs (somewhat comparable to 20 rounds with 5 epochs each in the swarm case). -3. Output files - * Same as for the swarm training (see below). +3. Output files are located in the directory of the startup kit: + * Logged output during training: `local_training_console_output.txt` + * Class probabilities for each round/epoch for training/validation data: `runs/$SITE_NAME//{aggregated,site}_model_gt_and_classprob_{train,validation}.csv` + * Best checkpoint for local data: `runs/$SITE_NAME//epoch=….ckpt` + * Last checkpoint for local data: `runs/$SITE_NAME//last.ckpt` + * TensorBoard logs: `runs/$SITE_NAME//lightning_logs` ### Start Swarm Node @@ -141,15 +145,15 @@ To have a baseline for swarm training, train the same model in a comparable way sudo chmod a+r nohup.out ``` -4. Output files are located in the directory of the startup kit - - Training log: `/log.txt` - - Class probabilities for each round/epoch for training/validation data: `/app_$SITE_NAME/runs/$SITE_NAME//{aggregated,site}_model_gt_and_classprob_{train,validation}.csv` - - Best checkpoint for local data: `/app_$SITE_NAME/runs/$SITE_NAME//epoch=….ckpt` - - Last checkpoint for local data: `/app_$SITE_NAME/runs/$SITE_NAME//last.ckpt` - - Last aggregated model: `/app_$SITE_NAME/FL_global_model.pt` - - TensorBoard logs: `/app_$SITE_NAME/runs/$SITE_NAME//lightning_logs` - - Code that was used for training: `/app_$SITE_NAME/custom` - - TODO describe prediction results once implemented +4. 
Output files are located in the directory of the startup kit: + * Training log: `/log.txt` + * Class probabilities for each round/epoch for training/validation data: `/app_$SITE_NAME/runs/$SITE_NAME//{aggregated,site}_model_gt_and_classprob_{train,validation}.csv` + * Best checkpoint for local data: `/app_$SITE_NAME/runs/$SITE_NAME//epoch=….ckpt` + * Last checkpoint for local data: `/app_$SITE_NAME/runs/$SITE_NAME//last.ckpt` + * Last aggregated model: `/app_$SITE_NAME/FL_global_model.pt` + * TensorBoard logs: `/app_$SITE_NAME/runs/$SITE_NAME//lightning_logs` + * Code that was used for training: `/app_$SITE_NAME/custom` + * TODO describe prediction results once implemented ## Troubleshooting From 680ba6d75a1dd6064394664cd75c8821ef966fe1 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 2 Feb 2026 14:51:49 +0100 Subject: [PATCH 324/337] local training has no aggregated models --- assets/readme/README.participant.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/assets/readme/README.participant.md b/assets/readme/README.participant.md index 753a1e8f..861dee52 100644 --- a/assets/readme/README.participant.md +++ b/assets/readme/README.participant.md @@ -115,7 +115,7 @@ To have a baseline for swarm training, train the same model in a comparable way * This currently runs 100 epochs (somewhat comparable to 20 rounds with 5 epochs each in the swarm case). 3. 
Output files are located in the directory of the startup kit: * Logged output during training: `local_training_console_output.txt` - * Class probabilities for each round/epoch for training/validation data: `runs/$SITE_NAME//{aggregated,site}_model_gt_and_classprob_{train,validation}.csv` + * Class probabilities for each round/epoch for training/validation data: `runs/$SITE_NAME//site_model_gt_and_classprob_{train,validation}.csv` * Best checkpoint for local data: `runs/$SITE_NAME//epoch=….ckpt` * Last checkpoint for local data: `runs/$SITE_NAME//last.ckpt` * TensorBoard logs: `runs/$SITE_NAME//lightning_logs` From c61211f1135a88d8b2622032c2f3dcd78ccdfe77 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 2 Feb 2026 17:33:38 +0100 Subject: [PATCH 325/337] noted difference that local training results end up in startup, swarm training results do not --- assets/readme/README.participant.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/assets/readme/README.participant.md b/assets/readme/README.participant.md index 861dee52..9ff01ba1 100644 --- a/assets/readme/README.participant.md +++ b/assets/readme/README.participant.md @@ -114,11 +114,11 @@ To have a baseline for swarm training, train the same model in a comparable way ``` * This currently runs 100 epochs (somewhat comparable to 20 rounds with 5 epochs each in the swarm case). 3. 
Output files are located in the directory of the startup kit: - * Logged output during training: `local_training_console_output.txt` - * Class probabilities for each round/epoch for training/validation data: `runs/$SITE_NAME//site_model_gt_and_classprob_{train,validation}.csv` - * Best checkpoint for local data: `runs/$SITE_NAME//epoch=….ckpt` - * Last checkpoint for local data: `runs/$SITE_NAME//last.ckpt` - * TensorBoard logs: `runs/$SITE_NAME//lightning_logs` + * Logged output during training: `startup/local_training_console_output.txt` + * Class probabilities for each round/epoch for training/validation data: `startup/runs/$SITE_NAME//site_model_gt_and_classprob_{train,validation}.csv` + * Best checkpoint for local data: `startup/runs/$SITE_NAME//epoch=….ckpt` + * Last checkpoint for local data: `startup/runs/$SITE_NAME//last.ckpt` + * TensorBoard logs: `startup/runs/$SITE_NAME//lightning_logs` ### Start Swarm Node @@ -145,7 +145,7 @@ To have a baseline for swarm training, train the same model in a comparable way sudo chmod a+r nohup.out ``` -4. Output files are located in the directory of the startup kit: +4. 
Output files are located in the directory of the startup kit (note: unlike local training results, this is *not* in the `startup` directory) * Training log: `/log.txt` * Class probabilities for each round/epoch for training/validation data: `/app_$SITE_NAME/runs/$SITE_NAME//{aggregated,site}_model_gt_and_classprob_{train,validation}.csv` * Best checkpoint for local data: `/app_$SITE_NAME/runs/$SITE_NAME//epoch=….ckpt` From 95232d4d12efa878cb66a4209588f50739a508df Mon Sep 17 00:00:00 2001 From: oleschwen Date: Tue, 3 Feb 2026 09:47:18 +0100 Subject: [PATCH 326/337] chore: update apt versions in Dockerfile_ODELIA --- docker_config/Dockerfile_ODELIA | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index bf7833c0..a788e91b 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -28,10 +28,10 @@ RUN apt install -y \ e2fsprogs=1.46.5-2ubuntu1.2 \ gpgv=2.2.27-3ubuntu2.5 \ libblkid1=2.37.2-4ubuntu3.4 \ - libc-bin=2.35-0ubuntu3.12 \ - libc-dev-bin=2.35-0ubuntu3.12 \ - libc6-dev=2.35-0ubuntu3.12 \ - libc6=2.35-0ubuntu3.12 \ + libc-bin=2.35-0ubuntu3.13 \ + libc-dev-bin=2.35-0ubuntu3.13 \ + libc6-dev=2.35-0ubuntu3.13 \ + libc6=2.35-0ubuntu3.13 \ libcap2=1:2.44-1ubuntu0.22.04.2 \ libcom-err2=1.46.5-2ubuntu1.2 \ libext2fs2=1.46.5-2ubuntu1.2 \ @@ -79,8 +79,8 @@ RUN apt install -y \ libcurl4=7.81.0-1ubuntu1.21 \ libexpat1=2.4.7-1ubuntu0.6 \ libksba8=1.6.0-2ubuntu0.2 \ - libldap-2.5-0=2.5.19+dfsg-0ubuntu0.22.04.1 \ - libldap-common=2.5.19+dfsg-0ubuntu0.22.04.1 \ + libldap-2.5-0=2.5.20+dfsg-0ubuntu0.22.04.1 \ + libldap-common=2.5.20+dfsg-0ubuntu0.22.04.1 \ libmpdec3=2.5.1-2build2 \ libnghttp2-14=1.43.0-1ubuntu0.2 \ libnpth0=1.6-3build2 \ From e28e20caa38b19bc9d23c3d69799dc193209c07e Mon Sep 17 00:00:00 2001 From: JieFu Zhu Date: Tue, 3 Feb 2026 13:51:28 +0100 Subject: [PATCH 327/337] Add files via upload --- assets/openvpn_always_on_setup_guide.md | 154 
++++++++++++++++++++++++ 1 file changed, 154 insertions(+) create mode 100644 assets/openvpn_always_on_setup_guide.md diff --git a/assets/openvpn_always_on_setup_guide.md b/assets/openvpn_always_on_setup_guide.md new file mode 100644 index 00000000..254e4e0d --- /dev/null +++ b/assets/openvpn_always_on_setup_guide.md @@ -0,0 +1,154 @@ +# OpenVPN Setup on Ubuntu (DL0) + +This guide explains how to set up an OpenVPN connection on **Ubuntu** using a provided `.ovpn` file and credentials, with: + +- Automatic reconnection if the VPN drops +- Automatic connection on server reboot +- Secure credential handling +- systemd-based management (no cron jobs) + +--- + +## 1. Install OpenVPN + +```bash +sudo apt update +sudo apt install -y openvpn +``` + +--- + +## 2. Place the `.ovpn` Configuration File + +Copy your VPN configuration file to the OpenVPN client directory and rename it: + +```bash +sudo cp 14357-de-9957-tt8rw.accessgate.cloud.ovpn \ +/etc/openvpn/client/.conf +``` + +OpenVPN expects `.conf` files in this directory for systemd integration. + +--- + +## 3. Create a Credentials File + +Create a credentials file so the VPN can connect automatically without interactive login. + +```bash +sudo nano /etc/openvpn/client/.auth +``` + +Add the following **two lines**: + +``` + + +``` + +Secure the file: + +```bash +sudo chmod 600 /etc/openvpn/client/.auth +sudo chown root:root /etc/openvpn/client/.auth +``` + +--- + +## 4. Update VPN Configuration for Auto-Reconnect + +Edit the configuration file: + +```bash +sudo nano /etc/openvpn/client/.conf +``` + +Ensure the following lines are present (add them if missing): + +```conf +auth-user-pass /etc/openvpn/client/.auth +auth-nocache + +persist-key +persist-tun +resolv-retry infinite +keepalive 10 60 +``` + +These options ensure: +- Infinite retry if the server is temporarily unreachable +- Automatic reconnection if the tunnel drops +- Tunnel persistence across reconnects + +--- + +## 5. 
Enable VPN Auto-Start on Boot + +Enable the OpenVPN client service: + +```bash +sudo systemctl enable openvpn-client@ +``` + +Start the VPN immediately: + +```bash +sudo systemctl start openvpn-client@ +``` + +--- + +## 6. Verify VPN Status + +Check service status: + +```bash +sudo systemctl status openvpn-client@ +``` + +Follow logs (useful for debugging): + +```bash +journalctl -u openvpn-client@ -f +``` + +--- + +## 7. Confirm VPN Connectivity + +Check network interfaces: + +```bash +ip a +``` + +Or verify your public IP: + +```bash +curl ifconfig.me +``` + +The IP should correspond to the VPN, not your ISP. + +--- + +## 8. Behavior Summary + +| Event | Result | +|------|-------| +| VPN connection drops | Automatically reconnects | +| VPN server unavailable | Retries indefinitely | +| Server reboot | VPN reconnects on startup | +| Network delay during boot | systemd retries until successful | + +--- + +## Notes + +- Credentials are stored securely with restricted permissions. +- No cron jobs or custom watchdog scripts are required. +- Configuration is suitable for headless servers. + +--- + +**End of document** From e046100acdc5fb3f82d7ce2cf84725961399cd49 Mon Sep 17 00:00:00 2001 From: JieFu Zhu Date: Tue, 3 Feb 2026 13:54:43 +0100 Subject: [PATCH 328/337] Revise OpenVPN setup guide with additional notes Updated the title and added reminders about replacing placeholders and contacting for lost config files. 
--- assets/openvpn_always_on_setup_guide.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/assets/openvpn_always_on_setup_guide.md b/assets/openvpn_always_on_setup_guide.md index 254e4e0d..bb7a575b 100644 --- a/assets/openvpn_always_on_setup_guide.md +++ b/assets/openvpn_always_on_setup_guide.md @@ -1,4 +1,4 @@ -# OpenVPN Setup on Ubuntu (DL0) +# OpenVPN Setup on Ubuntu This guide explains how to set up an OpenVPN connection on **Ubuntu** using a provided `.ovpn` file and credentials, with: @@ -8,6 +8,9 @@ This guide explains how to set up an OpenVPN connection on **Ubuntu** using a pr - systemd-based management (no cron jobs) --- +Please remember to replace the institute placeholder in the file and service names below with your actual institute. +Please ask Jeff if you have lost your OpenVPN config file (`14357-de-9957-tt8rw.accessgate.cloud.ovpn`) or your credentials. + ## 1. Install OpenVPN @@ -151,4 +154,3 @@ The IP should correspond to the VPN, not your ISP. --- -**End of document** From bb1bf7dea4a3091bff00e2f7438824e3ae1c9cf5 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Thu, 5 Feb 2026 13:56:07 +0100 Subject: [PATCH 329/337] use original dataset (without augmentation) for computing hash --- .../custom/data/datasets/dataset_3d_odelia.py | 2 + .../app/custom/env_config.py | 13 +++- .../app/custom/threedcnn_ptl.py | 60 +++++++++++-------- 3 files changed, 48 insertions(+), 27 deletions(-) diff --git a/application/jobs/ODELIA_ternary_classification/app/custom/data/datasets/dataset_3d_odelia.py b/application/jobs/ODELIA_ternary_classification/app/custom/data/datasets/dataset_3d_odelia.py index eba4aa12..1998b718 100644 --- a/application/jobs/ODELIA_ternary_classification/app/custom/data/datasets/dataset_3d_odelia.py +++ b/application/jobs/ODELIA_ternary_classification/app/custom/data/datasets/dataset_3d_odelia.py @@ -87,6 +87,8 @@ def __init__( ImageOrSubjectToTensor() if to_tensor else tio.Lambda(lambda x: x) ]) + elif transform == 'USE_UNPROCESSED_IMAGES': + self.transform =
tio.Compose([tio.Lambda(lambda x: x)]) else: self.transform = transform diff --git a/application/jobs/ODELIA_ternary_classification/app/custom/env_config.py b/application/jobs/ODELIA_ternary_classification/app/custom/env_config.py index 93efb091..ebaf4ab4 100755 --- a/application/jobs/ODELIA_ternary_classification/app/custom/env_config.py +++ b/application/jobs/ODELIA_ternary_classification/app/custom/env_config.py @@ -1,7 +1,7 @@ import os from datetime import datetime from pathlib import Path - +from data.datasets import ODELIA_Dataset3D def load_environment_variables(): return { @@ -37,7 +37,6 @@ def prepare_odelia_dataset(): path_run_dir = Path.cwd() / 'runs' / institution / run_name path_run_dir.mkdir(parents=True, exist_ok=True) - from data.datasets import ODELIA_Dataset3D ds_train = ODELIA_Dataset3D(institutions=institution, split='train', config=config, random_flip=True, random_rotate=True, random_inverse=False, noise=True) ds_val = ODELIA_Dataset3D(institutions=institution, split='val', config=config) @@ -49,6 +48,16 @@ def prepare_odelia_dataset(): return ds_train, ds_val, path_run_dir, run_name +def prepare_odelia_dataset_without_augmentation(): + institution = os.environ.get('INSTITUTION', os.environ['SITE_NAME']) + config = os.environ.get('CONFIG', 'unilateral') + + ds_train = ODELIA_Dataset3D(institutions=institution, split='train', config=config, transform='USE_UNPROCESSED_IMAGES') + ds_val = ODELIA_Dataset3D(institutions=institution, split='val', config=config, transform='USE_UNPROCESSED_IMAGES') + + return ds_train, ds_val + + def generate_run_directory(scratch_dir, task_data_name, model_name, local_compare_flag): current_time = datetime.now().strftime("%Y_%m_%d_%H%M%S") mode = 'local_compare' if local_compare_flag else 'swarm_learning' diff --git a/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py b/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py index f1fb4a8b..106793e7 100644 --- 
a/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py +++ b/application/jobs/ODELIA_ternary_classification/app/custom/threedcnn_ptl.py @@ -5,7 +5,7 @@ from pytorch_lightning.loggers import TensorBoardLogger from data.datamodules import DataModule from models import ResNet, MST -from env_config import load_environment_variables, prepare_odelia_dataset, generate_run_directory +from env_config import load_environment_variables, prepare_odelia_dataset, prepare_odelia_dataset_without_augmentation, generate_run_directory import torch.multiprocessing as mp from hashlib import sha3_224 as hash_function from typing import List, Tuple @@ -37,35 +37,36 @@ def set_up_logging(): def log_data_hash(dm: DataModule, logger) -> None: - def _get_imagename_hashes(dataloader) -> List[str]: - h = hash_function() + def _hexdigest(data) -> str: + return hash_function(data).hexdigest() + + def _hexdigest_string(data) -> str: + return _hexdigest(data.encode('utf-8')) + + def _get_imageuid_hashes(dataloader) -> List[str]: hashes = [] for batch in dataloader: assert (len(batch['uid']) == 1) # currently only implemented for batch size 1 - h.update(batch['uid'][0].encode('utf-8')) - hashes.append(h.hexdigest()) + hashes.append(_hexdigest_string(batch['uid'][0])) return hashes def _get_imagedata_hashes(dataloader) -> List[str]: - h = hash_function() hashes = [] for batch in dataloader: - assert (len(batch['source']) == 1) # currently only implemented for batch size 1 - h.update(batch['source'][0].detach().cpu().numpy().data) - hashes.append(h.hexdigest()) + hashes.append(_hexdigest(batch['source']['data'][0].detach().cpu().numpy().data)) return hashes def _check_for_duplicates(strings: List[str], where: str) -> None: if len(strings) != len(set(strings)): print(f"Duplicate {where} detected. 
Please make sure this was intended") - def _get_imagename_hashes_train_val(dm: DataModule) -> Tuple[str, str]: - imagename_hashes_train = _get_imagename_hashes(dm.train_dataloader()) - imagename_hashes_validation = _get_imagename_hashes(dm.val_dataloader()) - _check_for_duplicates(imagename_hashes_train + imagename_hashes_validation, 'image UIDs') - imagename_hashes_train.sort() - imagename_hashes_validation.sort() - return imagename_hashes_train, imagename_hashes_validation + def _get_imageuid_hashes_train_val(dm: DataModule) -> Tuple[str, str]: + imageuid_hashes_train = _get_imageuid_hashes(dm.train_dataloader()) + imageuid_hashes_validation = _get_imageuid_hashes(dm.val_dataloader()) + _check_for_duplicates(imageuid_hashes_train + imageuid_hashes_validation, 'image UIDs') + imageuid_hashes_train.sort() + imageuid_hashes_validation.sort() + return ''.join(imageuid_hashes_train), ''.join(imageuid_hashes_validation) def _get_imagedata_hashes_train_val(dm: DataModule) -> Tuple[str, str]: imagedata_hashes_train = _get_imagedata_hashes(dm.train_dataloader()) @@ -73,18 +74,29 @@ def _get_imagedata_hashes_train_val(dm: DataModule) -> Tuple[str, str]: _check_for_duplicates(imagedata_hashes_train + imagedata_hashes_validation, 'image data') imagedata_hashes_train.sort() imagedata_hashes_validation.sort() - return imagedata_hashes_train, imagedata_hashes_validation + return ''.join(imagedata_hashes_train), ''.join(imagedata_hashes_validation) - imagename_hashes_train, imagename_hashes_validation = _get_imagename_hashes_train_val(dm) + imageuid_hashes_train, imageuid_hashes_validation = _get_imageuid_hashes_train_val(dm) imagedata_hashes_train, imagedata_hashes_validation = _get_imagedata_hashes_train_val(dm) - all_hashes = ''.join(imagename_hashes_train) + ''.join(imagename_hashes_validation) + ''.join(imagedata_hashes_train) + ''.join(imagedata_hashes_validation) - h = hash_function() - h.update(all_hashes.encode('utf-8')) - logger.info(f"Data hash: f{h.hexdigest()}") + 
hash_all = _hexdigest_string(imageuid_hashes_train + imageuid_hashes_validation + imagedata_hashes_train + imagedata_hashes_validation) + logger.info(f"Data hash: {hash_all}") def set_up_data_module(logger): + def _log_dataset_hash(logger) -> None: + ds_train_woaug, ds_val_woaug = prepare_odelia_dataset_without_augmentation() + datamodule = DataModule( + ds_train=ds_train_woaug, + ds_val=ds_val_woaug, + batch_size=1, + pin_memory=True, + weights=None, + num_workers=mp.cpu_count(), + ) + log_data_hash(datamodule, logger) + torch.set_float32_matmul_precision('high') + _log_dataset_hash(logger) ds_train, ds_val, path_run_dir, run_name = prepare_odelia_dataset() num_classes = sum(ds_train.class_labels_num) logger.info(f"Dataset path: {ds_train}") @@ -97,7 +109,7 @@ def set_up_data_module(logger): dm = DataModule( ds_train=ds_train, ds_val=ds_val, - ds_test=ds_val, + ds_test=ds_val, # TODO shouldn't this remain unset? batch_size=1, pin_memory=True, weights=None, @@ -111,8 +123,6 @@ def set_up_data_module(logger): # logger.info(f"Label '{label}': {pct:.2f}% of training set, Count: {distribution['counts'][label]}") # logger.info(f"Number of unique labels: {len(distribution['counts'])}") - log_data_hash(dm, logger) - loss_kwargs = {} return dm, path_run_dir, run_name, num_classes, loss_kwargs From 95518741448778a4ba32fed4932dc3550ca30102 Mon Sep 17 00:00:00 2001 From: Ultimate-Storm Date: Fri, 6 Feb 2026 05:40:09 +0100 Subject: [PATCH 330/337] chore: update apt versions in Dockerfile_ODELIA --- docker_config/Dockerfile_ODELIA | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index a788e91b..691f1462 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -86,8 +86,8 @@ RUN apt install -y \ libnpth0=1.6-3build2 \ libpsl5=0.21.0-1.2build2 \ libpython3-stdlib=3.10.6-1~22.04.1 \ - libpython3.10-minimal=3.10.12-1~22.04.13 \ - 
libpython3.10-stdlib=3.10.12-1~22.04.13 \ + libpython3.10-minimal=3.10.12-1~22.04.14 \ + libpython3.10-stdlib=3.10.12-1~22.04.14 \ libreadline8=8.1.2-1 \ librtmp1=2.4+20151223.gitfa8646d.1-2build4 \ libsasl2-2=2.1.27+dfsg2-3ubuntu1.2 \ @@ -100,8 +100,8 @@ RUN apt install -y \ pinentry-curses=1.1.1-1build2 \ publicsuffix=20211207.1025-1 \ python3-minimal=3.10.6-1~22.04.1 \ - python3.10-minimal=3.10.12-1~22.04.13 \ - python3.10=3.10.12-1~22.04.13 \ + python3.10-minimal=3.10.12-1~22.04.14 \ + python3.10=3.10.12-1~22.04.14 \ python3=3.10.6-1~22.04.1 \ readline-common=8.1.2-1 \ unzip=6.0-26ubuntu3.2 \ From 7b99f5dfe905c0285e7e753eca34f6cbd07c6d54 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Fri, 6 Feb 2026 10:12:35 +0100 Subject: [PATCH 331/337] improve Docker build caching by installing apt packages (with version changes) later --- docker_config/Dockerfile_ODELIA | 161 ++++++++++++++++---------------- 1 file changed, 82 insertions(+), 79 deletions(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 691f1462..336721bf 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -17,98 +17,21 @@ RUN apt install -y \ apt-utils=2.4.14 \ libapt-pkg6.0=2.4.14 -# Update versions of installed packages +# Install python and dependencies RUN apt install -y \ - base-files=12ubuntu4.7 \ - bash=5.1-6ubuntu1.1 \ - bsdutils=1:2.37.2-4ubuntu3.4 \ - ca-certificates=20240203~22.04.1 \ - coreutils=8.32-4.1ubuntu1.2 \ - dpkg=1.21.1ubuntu2.6 \ - e2fsprogs=1.46.5-2ubuntu1.2 \ - gpgv=2.2.27-3ubuntu2.5 \ - libblkid1=2.37.2-4ubuntu3.4 \ - libc-bin=2.35-0ubuntu3.13 \ - libc-dev-bin=2.35-0ubuntu3.13 \ - libc6-dev=2.35-0ubuntu3.13 \ - libc6=2.35-0ubuntu3.13 \ - libcap2=1:2.44-1ubuntu0.22.04.2 \ - libcom-err2=1.46.5-2ubuntu1.2 \ - libext2fs2=1.46.5-2ubuntu1.2 \ - libgnutls30=3.7.3-4ubuntu1.7 \ - libgssapi-krb5-2=1.19.2-2ubuntu0.7 \ - libk5crypto3=1.19.2-2ubuntu0.7 \ - libkrb5-3=1.19.2-2ubuntu0.7 \ - libkrb5support0=1.19.2-2ubuntu0.7 \
- libmount1=2.37.2-4ubuntu3.4 \ - libpam-modules-bin=1.4.0-11ubuntu2.6 \ - libpam-modules=1.4.0-11ubuntu2.6 \ - libpam-runtime=1.4.0-11ubuntu2.6 \ - libpam0g=1.4.0-11ubuntu2.6 \ - libseccomp2=2.5.3-2ubuntu3~22.04.1 \ - libsmartcols1=2.37.2-4ubuntu3.4 \ - libss2=1.46.5-2ubuntu1.2 \ - libssl3=3.0.2-0ubuntu1.21 \ - libsystemd0=249.11-0ubuntu3.17 \ - libtasn1-6=4.18.0-4ubuntu0.2 \ - libudev1=249.11-0ubuntu3.17 \ - libuuid1=2.37.2-4ubuntu3.4 \ - linux-libc-dev=5.15.0-168.178 \ - logsave=1.46.5-2ubuntu1.2 \ - mount=2.37.2-4ubuntu3.4 \ - openssl=3.0.2-0ubuntu1.21 \ - util-linux=2.37.2-4ubuntu3.4 - -# Install apt-transport-https curl gnupg lsb-release zip and dependencies at defined versions -RUN apt install -y \ - apt-transport-https=2.4.14 \ - curl=7.81.0-1ubuntu1.21 \ - dirmngr=2.2.27-3ubuntu2.5 \ - distro-info-data=0.52ubuntu0.11 \ - gnupg-l10n=2.2.27-3ubuntu2.5 \ - gnupg-utils=2.2.27-3ubuntu2.5 \ - gnupg=2.2.27-3ubuntu2.5 \ - gpg-agent=2.2.27-3ubuntu2.5 \ - gpg-wks-client=2.2.27-3ubuntu2.5 \ - gpg-wks-server=2.2.27-3ubuntu2.5 \ - gpg=2.2.27-3ubuntu2.5 \ - gpgconf=2.2.27-3ubuntu2.5 \ - gpgsm=2.2.27-3ubuntu2.5 \ - libassuan0=2.5.5-1build1 \ - libbrotli1=1.0.9-2build6 \ - libcurl4=7.81.0-1ubuntu1.21 \ libexpat1=2.4.7-1ubuntu0.6 \ - libksba8=1.6.0-2ubuntu0.2 \ - libldap-2.5-0=2.5.20+dfsg-0ubuntu0.22.04.1 \ - libldap-common=2.5.20+dfsg-0ubuntu0.22.04.1 \ libmpdec3=2.5.1-2build2 \ - libnghttp2-14=1.43.0-1ubuntu0.2 \ - libnpth0=1.6-3build2 \ - libpsl5=0.21.0-1.2build2 \ libpython3-stdlib=3.10.6-1~22.04.1 \ libpython3.10-minimal=3.10.12-1~22.04.14 \ libpython3.10-stdlib=3.10.12-1~22.04.14 \ libreadline8=8.1.2-1 \ - librtmp1=2.4+20151223.gitfa8646d.1-2build4 \ - libsasl2-2=2.1.27+dfsg2-3ubuntu1.2 \ - libsasl2-modules-db=2.1.27+dfsg2-3ubuntu1.2 \ - libsasl2-modules=2.1.27+dfsg2-3ubuntu1.2 \ libsqlite3-0=3.37.2-2ubuntu0.5 \ - libssh-4=0.9.6-2ubuntu0.22.04.5 \ - lsb-release=11.1.0ubuntu4 \ media-types=7.0.0 \ - pinentry-curses=1.1.1-1build2 \ - publicsuffix=20211207.1025-1 \ 
python3-minimal=3.10.6-1~22.04.1 \ python3.10-minimal=3.10.12-1~22.04.14 \ python3.10=3.10.12-1~22.04.14 \ python3=3.10.6-1~22.04.1 \ - readline-common=8.1.2-1 \ - unzip=6.0-26ubuntu3.2 \ - zip=3.0-12build2 - -# Clean up apt cache -RUN rm -rf /var/lib/apt/lists/* + readline-common=8.1.2-1 # uninstall conda to prevent usage and avoid and potential repository license issues RUN python3 -m pip uninstall -y conda conda-package-handling conda_index @@ -245,6 +168,86 @@ RUN python3 -m pip install \ # Clean up pip cache RUN python3 -m pip cache purge +# Update versions of installed packages +RUN apt install -y \ + base-files=12ubuntu4.7 \ + bash=5.1-6ubuntu1.1 \ + bsdutils=1:2.37.2-4ubuntu3.4 \ + ca-certificates=20240203~22.04.1 \ + coreutils=8.32-4.1ubuntu1.2 \ + dpkg=1.21.1ubuntu2.6 \ + e2fsprogs=1.46.5-2ubuntu1.2 \ + gpgv=2.2.27-3ubuntu2.5 \ + libblkid1=2.37.2-4ubuntu3.4 \ + libc-bin=2.35-0ubuntu3.13 \ + libc-dev-bin=2.35-0ubuntu3.13 \ + libc6-dev=2.35-0ubuntu3.13 \ + libc6=2.35-0ubuntu3.13 \ + libcap2=1:2.44-1ubuntu0.22.04.2 \ + libcom-err2=1.46.5-2ubuntu1.2 \ + libext2fs2=1.46.5-2ubuntu1.2 \ + libgnutls30=3.7.3-4ubuntu1.7 \ + libgssapi-krb5-2=1.19.2-2ubuntu0.7 \ + libk5crypto3=1.19.2-2ubuntu0.7 \ + libkrb5-3=1.19.2-2ubuntu0.7 \ + libkrb5support0=1.19.2-2ubuntu0.7 \ + libmount1=2.37.2-4ubuntu3.4 \ + libpam-modules-bin=1.4.0-11ubuntu2.6 \ + libpam-modules=1.4.0-11ubuntu2.6 \ + libpam-runtime=1.4.0-11ubuntu2.6 \ + libpam0g=1.4.0-11ubuntu2.6 \ + libseccomp2=2.5.3-2ubuntu3~22.04.1 \ + libsmartcols1=2.37.2-4ubuntu3.4 \ + libss2=1.46.5-2ubuntu1.2 \ + libssl3=3.0.2-0ubuntu1.21 \ + libsystemd0=249.11-0ubuntu3.17 \ + libtasn1-6=4.18.0-4ubuntu0.2 \ + libudev1=249.11-0ubuntu3.17 \ + libuuid1=2.37.2-4ubuntu3.4 \ + linux-libc-dev=5.15.0-168.178 \ + logsave=1.46.5-2ubuntu1.2 \ + mount=2.37.2-4ubuntu3.4 \ + openssl=3.0.2-0ubuntu1.21 \ + util-linux=2.37.2-4ubuntu3.4 + +# Install apt-transport-https curl gnupg lsb-release zip and dependencies at defined versions +RUN apt install -y \ 
+ apt-transport-https=2.4.14 \ + curl=7.81.0-1ubuntu1.21 \ + dirmngr=2.2.27-3ubuntu2.5 \ + distro-info-data=0.52ubuntu0.11 \ + gnupg-l10n=2.2.27-3ubuntu2.5 \ + gnupg-utils=2.2.27-3ubuntu2.5 \ + gnupg=2.2.27-3ubuntu2.5 \ + gpg-agent=2.2.27-3ubuntu2.5 \ + gpg-wks-client=2.2.27-3ubuntu2.5 \ + gpg-wks-server=2.2.27-3ubuntu2.5 \ + gpg=2.2.27-3ubuntu2.5 \ + gpgconf=2.2.27-3ubuntu2.5 \ + gpgsm=2.2.27-3ubuntu2.5 \ + libassuan0=2.5.5-1build1 \ + libbrotli1=1.0.9-2build6 \ + libcurl4=7.81.0-1ubuntu1.21 \ + libksba8=1.6.0-2ubuntu0.2 \ + libldap-2.5-0=2.5.20+dfsg-0ubuntu0.22.04.1 \ + libldap-common=2.5.20+dfsg-0ubuntu0.22.04.1 \ + libnghttp2-14=1.43.0-1ubuntu0.2 \ + libnpth0=1.6-3build2 \ + libpsl5=0.21.0-1.2build2 \ + librtmp1=2.4+20151223.gitfa8646d.1-2build4 \ + libsasl2-2=2.1.27+dfsg2-3ubuntu1.2 \ + libsasl2-modules-db=2.1.27+dfsg2-3ubuntu1.2 \ + libsasl2-modules=2.1.27+dfsg2-3ubuntu1.2 \ + libssh-4=0.9.6-2ubuntu0.22.04.5 \ + lsb-release=11.1.0ubuntu4 \ + pinentry-curses=1.1.1-1build2 \ + publicsuffix=20211207.1025-1 \ + unzip=6.0-26ubuntu3.2 \ + zip=3.0-12build2 + +# Clean up apt cache +RUN rm -rf /var/lib/apt/lists/* + # install ODELIA fork of NVFlare from local source WORKDIR /workspace/ COPY ./MediSwarm/docker_config/NVFlare /workspace/nvflare From 6341503002ab13de13119fb254fd4ac4c100d4bf Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 9 Feb 2026 11:07:29 +0100 Subject: [PATCH 332/337] added what to check --- assets/openvpn_always_on_setup_guide.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/assets/openvpn_always_on_setup_guide.md b/assets/openvpn_always_on_setup_guide.md index bb7a575b..b723a88f 100644 --- a/assets/openvpn_always_on_setup_guide.md +++ b/assets/openvpn_always_on_setup_guide.md @@ -125,6 +125,8 @@ Check network interfaces: ip a ``` +You should see an interface named `tun0` or similar with an IP address starting with `172.24.4.` + Or verify your public IP: ```bash @@ -153,4 +155,3 @@ The IP should correspond to the VPN, not 
your ISP. - Configuration is suitable for headless servers. --- - From 3f5689cd29bda8546c1c74f49e7c0c2638926f31 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 9 Feb 2026 15:26:15 +0100 Subject: [PATCH 333/337] removed instructions for ifconfig.me, they do not apply in case of split tunneling --- assets/openvpn_always_on_setup_guide.md | 8 -------- 1 file changed, 8 deletions(-) diff --git a/assets/openvpn_always_on_setup_guide.md b/assets/openvpn_always_on_setup_guide.md index b723a88f..559c8be4 100644 --- a/assets/openvpn_always_on_setup_guide.md +++ b/assets/openvpn_always_on_setup_guide.md @@ -127,14 +127,6 @@ ip a You should see an interface named `tun0` or similar with an IP address starting with `172.24.4.` -Or verify your public IP: - -```bash -curl ifconfig.me -``` - -The IP should correspond to the VPN, not your ISP. - --- ## 8. Behavior Summary From 8c0f8f6045225f078f4f84379f3e964832f3ec47 Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Mon, 9 Feb 2026 15:36:08 +0100 Subject: [PATCH 334/337] table layout in source --- assets/openvpn_always_on_setup_guide.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/assets/openvpn_always_on_setup_guide.md b/assets/openvpn_always_on_setup_guide.md index 559c8be4..1c75b58d 100644 --- a/assets/openvpn_always_on_setup_guide.md +++ b/assets/openvpn_always_on_setup_guide.md @@ -131,11 +131,11 @@ You should see an interface named `tun0` or similar with an IP address starting ## 8. 
Behavior Summary -| Event | Result | -|------|-------| -| VPN connection drops | Automatically reconnects | -| VPN server unavailable | Retries indefinitely | -| Server reboot | VPN reconnects on startup | +| Event | Result | +|---------------------------|----------------------------------| +| VPN connection drops | Automatically reconnects | +| VPN server unavailable | Retries indefinitely | +| Server reboot | VPN reconnects on startup | | Network delay during boot | systemd retries until successful | --- From 4e18a4f63e2058648ee1092b5f6638d08cdbf5e4 Mon Sep 17 00:00:00 2001 From: Ultimate-Storm Date: Tue, 10 Feb 2026 05:51:49 +0100 Subject: [PATCH 335/337] chore: update apt versions in Dockerfile_ODELIA --- docker_config/Dockerfile_ODELIA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index 336721bf..ac43f8e8 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -204,7 +204,7 @@ RUN apt install -y \ libtasn1-6=4.18.0-4ubuntu0.2 \ libudev1=249.11-0ubuntu3.17 \ libuuid1=2.37.2-4ubuntu3.4 \ - linux-libc-dev=5.15.0-168.178 \ + linux-libc-dev=5.15.0-170.180 \ logsave=1.46.5-2ubuntu1.2 \ mount=2.37.2-4ubuntu3.4 \ openssl=3.0.2-0ubuntu1.21 \ From aa892d7aa2b4d48a12eec415dc7e2a64f91b6e9b Mon Sep 17 00:00:00 2001 From: Ole Schwen Date: Tue, 10 Feb 2026 13:44:55 +0100 Subject: [PATCH 336/337] link to current VPN installation guide, removed previous one after moving troubleshooting section --- assets/VPN setup guide(CLI).md | 91 -------------------- assets/openvpn_always_on_setup_guide.md | 15 ++++ assets/readme/README.participant.md | 11 +-- scripts/client_node_setup/setup_vpntunnel.sh | 67 -------------- 4 files changed, 16 insertions(+), 168 deletions(-) delete mode 100644 assets/VPN setup guide(CLI).md delete mode 100755 scripts/client_node_setup/setup_vpntunnel.sh diff --git a/assets/VPN setup guide(CLI).md b/assets/VPN setup guide(CLI).md deleted file 
mode 100644 index e70fe26c..00000000 --- a/assets/VPN setup guide(CLI).md +++ /dev/null @@ -1,91 +0,0 @@ -# GoodAccess VPN Setup Guide (CLI) - -This guide provides step-by-step instructions to set up and connect to GoodAccess VPN on a Linux machine using the command-line interface (CLI). - ---- - -## Step 1: Receive VPN Credentials and Setup Files - -You will receive the following via a secure communication channel (e.g., email): -- VPN **username** and **password** -- A setup file (typically in `.zip` format) - ---- - -## Step 2: Unzip the Setup File - -1. Locate the received `.zip` file. -2. Extract its contents: - ```sh - unzip .zip - ``` -3. Move the `.ovpn` file to the correct directory: - ```sh - mv .ovpn assets/openvpn_configs/good_access - ``` -4. Open the `.ovpn` file and verify that the following line exists: - ``` - auth-user-pass /etc/openvpn/credentials - ``` - - If it does not exist, **manually add it** after `auth-user-pass`. - ---- - -## Step 3: Establish a New VPN Connection - -To initiate a VPN connection, run: -```sh -sh envsetup_scripts/setup_vpntunnel.sh -d -n -``` -Where `` is your institute's name. For the ODELIA project, choose from: -- TUD -- Ribera -- VHIO -- Radboud -- UKA -- UMCU -- MHA -- Cambridge -- USZ -- MEVIS - -You will be prompted to enter the **username** and **password** provided by TUD. - ---- - -## Step 4: Verify VPN Connection - -To confirm that you are connected, check your IP address: -```sh -hostname -I -``` -You should see an IP in the range **172.24.4.xx/22**. - ---- - -## Step 5: Reconnect to VPN - -If your machine restarts or loses connection, reconnect by running: -```sh -sh envsetup_scripts/setup_vpntunnel.sh -``` - -The `.ovpn` file assigned to you by TUD is required for re-establishing the connection. 
- -For further troubleshooting, refer to the VPN Connect Guide on the GoodAccess support page: -[GoodAccess VPN Connect Guide](https://support.goodaccess.com/configuration-guides/linux) - - - -## Step 6: Troubleshooting — Disconnecting Existing VPN Connections - -Some users have experienced that connecting to GoodAccess **disconnects an existing VPN or ssh connection**. -This may happen because OpenVPN is configured to redirect all network traffic through the GoodAccess tunnel, which overrides your local or other VPN routes and may make the machine inaccessible in its local network. - -If this occurs, you can prevent the redirection by starting OpenVPN with: -```sh -openvpn --config .ovpn --pull-filter ignore redirect-gateway -``` -This tells the OpenVPN client **not** to override your default gateway, allowing your other VPN or ssh connection to remain active. - -> **Note:** This behavior was observed by Aitor and Ole after certain OpenVPN updates. The above command has been effective in resolving the issue. \ No newline at end of file diff --git a/assets/openvpn_always_on_setup_guide.md b/assets/openvpn_always_on_setup_guide.md index 1c75b58d..d32e5481 100644 --- a/assets/openvpn_always_on_setup_guide.md +++ b/assets/openvpn_always_on_setup_guide.md @@ -147,3 +147,18 @@ You should see an interface named `tun0` or similar with an IP address starting - Configuration is suitable for headless servers. --- + +## Troubleshooting + +### Disconnecting Existing VPN Connections + +Some users have experienced that connecting to GoodAccess **disconnects an existing VPN or ssh connection**. +This may happen because OpenVPN is configured to redirect all network traffic through the GoodAccess tunnel, which overrides your local or other VPN routes and may make the machine inaccessible in its local network. 
+ +If this occurs, you can prevent the redirection by starting OpenVPN with +```sh +openvpn --config .ovpn --pull-filter ignore redirect-gateway +``` +This tells the OpenVPN client **not** to override your default gateway, allowing your other VPN or ssh connection to remain active. + +**TODO** describe how this can be configured when starting the VPN as a system service. \ No newline at end of file diff --git a/assets/readme/README.participant.md b/assets/readme/README.participant.md index 9ff01ba1..62f5f238 100644 --- a/assets/readme/README.participant.md +++ b/assets/readme/README.participant.md @@ -11,16 +11,7 @@ This guide is for data scientists and medical research sites participating in a ## Setup 0. Add this line to your `/etc/hosts`: `172.24.4.65 dl3.tud.de dl3` 1. Make sure your compute node satisfies the specification and has the necessary software installed. -2. Set up the VPN. A VPN is necessary so that the swarm nodes can communicate with each other securely across firewalls. For that purpose, - 1. Install OpenVPN - ```bash - sudo apt-get install openvpn - ``` - 2. If you have a graphical user interface(GUI), follow this guide to connect to the - VPN: [VPN setup guide(GUI).pdf](../VPN%20setup%20guide%28GUI%29.pdf) - 3. If you have a command line interface(CLI), follow this guide to connect to the - VPN: [VPN setup guide(CLI).md](../VPN%20setup%20guide%28CLI%29.md) - 4. You may want to clone this repository or selectively download VPN-related scripts for this purpose. +2. Set up the VPN. A VPN is necessary so that the swarm nodes can communicate with each other securely across firewalls. For that purpose, follow the instructions in [openvpn_always_on_setup_guide.md](../openvpn_always_on_setup_guide.md). 
## Prepare Dataset diff --git a/scripts/client_node_setup/setup_vpntunnel.sh b/scripts/client_node_setup/setup_vpntunnel.sh deleted file mode 100755 index 7a53d00b..00000000 --- a/scripts/client_node_setup/setup_vpntunnel.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env bash - -set -e - -ACTION="nochange" - -while [[ "$#" -gt 0 ]]; do - case $1 in - -d) host_index="$2"; shift ;; - -n) ACTION="new";; - -h) ACTION="help";; - *) echo "Unknown parameter passed: $1"; exit 1 ;; - esac - shift -done - -if [[ $ACTION = "help" ]]; then - echo "Usage: setup_vpntunnel.sh -d [-n]" - echo " -n one-time setup" - echo "" - exit 1 -fi - -if [ -z "$host_index" ]; then - echo "Please specify your host index via option -d " - echo "Host index should be chosen from [TUD, Ribera, VHIO, Radboud, UKA, Utrecht, Mitera, Cambridge, Zurich]" - exit 1 -fi - -if [ $ACTION = "new" ]; then - # to the initial setup - echo "Setting up VPN tunnel for swarm learning ..." - sudo apt-get -y install openvpn - - echo "Please enter your vpn credentials (ask TUD maintainer for the account and password if you don't have the data yet)" - read -p "vpn account: " vpn_account - stty -echo - read -p "vpn password: " vpn_password - stty echo - printf '%s\n' $vpn_account $vpn_password | sudo tee /etc/openvpn/credentials > /dev/null - sudo chmod 600 /etc/openvpn/credentials -fi - -if [[ ! -f ./assets/openvpn_configs/good_access/$host_index.ovpn ]]; then - echo "Configuration file ./assets/openvpn_configs/good_access/$host_index.ovpn not found" - exit 1 -fi - -if [[ ! -f /etc/openvpn/credentials ]]; then - echo "Credentials file /etc/openvpn/credentials not found, please use option -n to create it" - exit 1 -fi - -echo "Starting VPN tunnel for swarm learning ..." -sudo nohup openvpn --config ./assets/openvpn_configs/good_access/$host_index.ovpn & -sleep 3 -sudo chmod a+r nohup.out - -if [ $? -ne 0 ]; then - echo "An error occurred while running the script. 
Please check the output above or nohup.out for more details." - exit 1 -fi - -hostname -I - -echo "You should see an IP address of the form 172.24.4.x in the output above." -echo "If it does not appear, try 'hostname -I' again in 10 or 20 seconds." From 4a795e0025e84b93ec9782c49cebc9b5208ff636 Mon Sep 17 00:00:00 2001 From: Ultimate-Storm Date: Wed, 11 Feb 2026 05:50:01 +0100 Subject: [PATCH 337/337] chore: update apt versions in Dockerfile_ODELIA --- docker_config/Dockerfile_ODELIA | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker_config/Dockerfile_ODELIA b/docker_config/Dockerfile_ODELIA index ac43f8e8..eb3e56bd 100644 --- a/docker_config/Dockerfile_ODELIA +++ b/docker_config/Dockerfile_ODELIA @@ -19,7 +19,7 @@ RUN apt install -y \ # Install python and dependencies RUN apt install -y \ - libexpat1=2.4.7-1ubuntu0.6 \ + libexpat1=2.4.7-1ubuntu0.7 \ libmpdec3=2.5.1-2build2 \ libpython3-stdlib=3.10.6-1~22.04.1 \ libpython3.10-minimal=3.10.12-1~22.04.14 \