From b33d48e2182a6a9e8c4323febbe694578320a924 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 18 Jul 2025 12:35:19 +0200 Subject: [PATCH 01/79] upgraded rclone version --- ci/github/helpers/install_rclone_docker_volume_plugin.bash | 2 +- scripts/install_rclone.bash | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/github/helpers/install_rclone_docker_volume_plugin.bash b/ci/github/helpers/install_rclone_docker_volume_plugin.bash index 1f0e54658fbc..b8ec9552c533 100755 --- a/ci/github/helpers/install_rclone_docker_volume_plugin.bash +++ b/ci/github/helpers/install_rclone_docker_volume_plugin.bash @@ -10,7 +10,7 @@ set -o pipefail # don't hide errors within pipes IFS=$'\n\t' # Installation instructions from https://rclone.org/docker/ -R_CLONE_VERSION="1.66.0" +R_CLONE_VERSION="1.70.3" mkdir --parents /var/lib/docker-plugins/rclone/config mkdir --parents /var/lib/docker-plugins/rclone/cache docker plugin install rclone/docker-volume-rclone:amd64-${R_CLONE_VERSION} args="-v" --alias rclone --grant-all-permissions diff --git a/scripts/install_rclone.bash b/scripts/install_rclone.bash index e6378cdd9b34..31ec36302a9b 100755 --- a/scripts/install_rclone.bash +++ b/scripts/install_rclone.bash @@ -9,7 +9,7 @@ set -o nounset # abort on unbound variable set -o pipefail # don't hide errors within pipes IFS=$'\n\t' -R_CLONE_VERSION="1.63.1" +R_CLONE_VERSION="1.70.3" TARGETARCH="${TARGETARCH:-amd64}" echo "platform ${TARGETARCH}" From efa0435d9618758eba641f31b46e2638874be714 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 18 Jul 2025 12:47:38 +0200 Subject: [PATCH 02/79] using faster settings for rclone --- .../src/settings_library/r_clone.py | 26 +++++++++++++++++++ .../simcore_sdk/node_ports_common/r_clone.py | 13 ++++++++-- .../simcore_service_agent/services/backup.py | 2 +- 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/packages/settings-library/src/settings_library/r_clone.py 
b/packages/settings-library/src/settings_library/r_clone.py index d1a6472e9c67..adb87ee45726 100644 --- a/packages/settings-library/src/settings_library/r_clone.py +++ b/packages/settings-library/src/settings_library/r_clone.py @@ -36,3 +36,29 @@ class RCloneSettings(BaseCustomSettings): description="`--buffer-size X`: sets the amount of RAM to use for each individual transfer", ), ] = "16M" + + R_CLONE_OPTION_CHECKERS: Annotated[ + NonNegativeInt, + Field( + description="`--checkers X`: sets the number checkers", + ), + ] = 32 + + R_CLONE_S3_UPLOAD_CONCURRENCY: Annotated[ + NonNegativeInt, + Field( + description="`--s3-upload-concurrency X`: sets the number of concurrent uploads to S3", + ), + ] = 8 + + R_CLONE_CHUNK_SIZE: Annotated[ + str, + Field(description="`--s3-chunk-size X`: sets the chunk size for S3"), + ] = "128M" + + R_CLONE_ORDER_BY: Annotated[ + str, + Field( + description="`--order-by X`: sets the order of file upload, e.g., 'size,mixed'", + ), + ] = "size,mixed" diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone.py index db5e107b753c..69b005e6ed0a 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone.py @@ -30,8 +30,7 @@ _logger = logging.getLogger(__name__) -class BaseRCloneError(OsparcErrorMixin, RuntimeError): - ... +class BaseRCloneError(OsparcErrorMixin, RuntimeError): ... 
class RCloneFailedError(BaseRCloneError): @@ -207,6 +206,16 @@ async def _sync_sources( # https://forum.rclone.org/t/how-to-set-a-memory-limit/10230/4 "--buffer-size", # docs https://rclone.org/docs/#buffer-size-size r_clone_settings.R_CLONE_OPTION_BUFFER_SIZE, + "--checkers", + f"{r_clone_settings.R_CLONE_OPTION_CHECKERS}", + "--s3-upload-concurrency", + f"{r_clone_settings.R_CLONE_S3_UPLOAD_CONCURRENCY}", + "--s3-chunk-size", + r_clone_settings.R_CLONE_CHUNK_SIZE, + # handles the order of file upload + "--order-by", + r_clone_settings.R_CLONE_ORDER_BY, + "--fast-list", "--use-json-log", # frequent polling for faster progress updates "--stats", diff --git a/services/agent/src/simcore_service_agent/services/backup.py b/services/agent/src/simcore_service_agent/services/backup.py index a7e125af0c42..ebd94ee27303 100644 --- a/services/agent/src/simcore_service_agent/services/backup.py +++ b/services/agent/src/simcore_service_agent/services/backup.py @@ -162,7 +162,7 @@ async def _store_in_s3( # below two options reduce to a minimum the memory footprint # https://forum.rclone.org/t/how-to-set-a-memory-limit/10230/4 "--buffer-size", # docs https://rclone.org/docs/#buffer-size-size - "0M", + "16M", "--stats", "5s", "--stats-one-line", From 55c43402e0b5d87549c7af8dc1951d0c2f5b695e Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Thu, 21 Aug 2025 10:50:47 +0200 Subject: [PATCH 03/79] removed unsued feature --- .github/workflows/ci-testing-deploy.yml | 2 - .../install_rclone_docker_volume_plugin.bash | 18 -- services/director-v2/requirements/_test.in | 1 - services/director-v2/requirements/_test.txt | 54 ----- .../core/settings.py | 8 - .../docker_service_specs/sidecar.py | 13 -- .../scheduler/_core/_events_utils.py | 15 +- .../modules/dynamic_sidecar/volumes.py | 101 +--------- ...t_dynamic_sidecar_nodeports_integration.py | 186 +++--------------- 9 files changed, 35 insertions(+), 363 deletions(-) delete mode 100755 
ci/github/helpers/install_rclone_docker_volume_plugin.bash diff --git a/.github/workflows/ci-testing-deploy.yml b/.github/workflows/ci-testing-deploy.yml index b130c5254ba4..9c0f75fbd567 100644 --- a/.github/workflows/ci-testing-deploy.yml +++ b/.github/workflows/ci-testing-deploy.yml @@ -1590,8 +1590,6 @@ jobs: with: python-version: ${{ matrix.python }} cache-dependency-glob: "**/director-v2/requirements/ci.txt" - - name: setup rclone docker volume plugin - run: sudo ./ci/github/helpers/install_rclone_docker_volume_plugin.bash - name: Download and load Docker images uses: ./.github/actions/download-load-docker-images with: diff --git a/ci/github/helpers/install_rclone_docker_volume_plugin.bash b/ci/github/helpers/install_rclone_docker_volume_plugin.bash deleted file mode 100755 index b8ec9552c533..000000000000 --- a/ci/github/helpers/install_rclone_docker_volume_plugin.bash +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash -# -# Installs the latest version of rclone plugin -# - -# http://redsymbol.net/articles/unofficial-bash-strict-mode/ -set -o errexit # abort on nonzero exitstatus -set -o nounset # abort on unbound variable -set -o pipefail # don't hide errors within pipes -IFS=$'\n\t' - -# Installation instructions from https://rclone.org/docker/ -R_CLONE_VERSION="1.70.3" -mkdir --parents /var/lib/docker-plugins/rclone/config -mkdir --parents /var/lib/docker-plugins/rclone/cache -docker plugin install rclone/docker-volume-rclone:amd64-${R_CLONE_VERSION} args="-v" --alias rclone --grant-all-permissions -docker plugin list -docker plugin inspect rclone diff --git a/services/director-v2/requirements/_test.in b/services/director-v2/requirements/_test.in index 34b5327e2538..409b03549001 100644 --- a/services/director-v2/requirements/_test.in +++ b/services/director-v2/requirements/_test.in @@ -10,7 +10,6 @@ --constraint _base.txt aio_pika -aioboto3 alembic # migration due to pytest_simcore.postgres_service2 asgi_lifespan async-asgi-testclient # replacement for 
fastapi.testclient.TestClient [see b) below] diff --git a/services/director-v2/requirements/_test.txt b/services/director-v2/requirements/_test.txt index 619015cc94b0..23366bd01891 100644 --- a/services/director-v2/requirements/_test.txt +++ b/services/director-v2/requirements/_test.txt @@ -2,33 +2,10 @@ aio-pika==9.5.5 # via # -c requirements/_base.txt # -r requirements/_test.in -aioboto3==14.3.0 - # via -r requirements/_test.in -aiobotocore==2.22.0 - # via aioboto3 -aiofiles==24.1.0 - # via - # -c requirements/_base.txt - # aioboto3 -aiohappyeyeballs==2.6.1 - # via - # -c requirements/_base.txt - # aiohttp -aiohttp==3.12.12 - # via - # -c requirements/../../../requirements/constraints.txt - # -c requirements/_base.txt - # aiobotocore -aioitertools==0.12.0 - # via aiobotocore aiormq==6.8.1 # via # -c requirements/_base.txt # aio-pika -aiosignal==1.3.2 - # via - # -c requirements/_base.txt - # aiohttp alembic==1.15.2 # via # -c requirements/_base.txt @@ -44,17 +21,9 @@ async-asgi-testclient==1.4.11 attrs==25.3.0 # via # -c requirements/_base.txt - # aiohttp # pytest-docker bokeh==3.7.3 # via dask -boto3==1.37.3 - # via aiobotocore -botocore==1.37.3 - # via - # aiobotocore - # boto3 - # s3transfer certifi==2025.4.26 # via # -c requirements/../../../requirements/constraints.txt @@ -103,11 +72,6 @@ fakeredis==2.30.3 # via -r requirements/_test.in flaky==3.8.1 # via -r requirements/_test.in -frozenlist==1.6.0 - # via - # -c requirements/_base.txt - # aiohttp - # aiosignal fsspec==2025.3.2 # via # -c requirements/_base.txt @@ -151,11 +115,6 @@ jinja2==3.1.6 # bokeh # dask # distributed -jmespath==1.0.1 - # via - # aiobotocore - # boto3 - # botocore locket==1.0.0 # via # -c requirements/_base.txt @@ -180,8 +139,6 @@ msgpack==1.1.0 multidict==6.4.4 # via # -c requirements/_base.txt - # aiobotocore - # aiohttp # async-asgi-testclient # yarl mypy==1.16.1 @@ -227,7 +184,6 @@ pprintpp==0.4.0 propcache==0.3.1 # via # -c requirements/_base.txt - # aiohttp # yarl psutil==7.0.0 # 
via @@ -263,8 +219,6 @@ pytest-xdist==3.8.0 python-dateutil==2.9.0.post0 # via # -c requirements/_base.txt - # aiobotocore - # botocore # pandas pytz==2025.2 # via pandas @@ -287,8 +241,6 @@ requests==2.32.4 # docker respx==0.22.0 # via -r requirements/_test.in -s3transfer==0.11.3 - # via boto3 six==1.17.0 # via # -c requirements/_base.txt @@ -348,21 +300,15 @@ urllib3==2.5.0 # via # -c requirements/../../../requirements/constraints.txt # -c requirements/_base.txt - # botocore # distributed # docker # requests -wrapt==1.17.2 - # via - # -c requirements/_base.txt - # aiobotocore xyzservices==2025.4.0 # via bokeh yarl==1.20.0 # via # -c requirements/_base.txt # aio-pika - # aiohttp # aiormq zict==3.0.0 # via diff --git a/services/director-v2/src/simcore_service_director_v2/core/settings.py b/services/director-v2/src/simcore_service_director_v2/core/settings.py index 03f256b01b0e..72e830fff467 100644 --- a/services/director-v2/src/simcore_service_director_v2/core/settings.py +++ b/services/director-v2/src/simcore_service_director_v2/core/settings.py @@ -128,14 +128,6 @@ class AppSettings(BaseApplicationSettings, MixinLoggingSettings): ) DIRECTOR_V2_DEV_FEATURES_ENABLED: bool = False - DIRECTOR_V2_DEV_FEATURE_R_CLONE_MOUNTS_ENABLED: bool = Field( - default=False, - description=( - "Under development feature. If enabled state " - "is saved using rclone docker volumes." 
- ), - ) - # for passing self-signed certificate to spawned services DIRECTOR_V2_SELF_SIGNED_SSL_SECRET_ID: str = Field( default="", diff --git a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_service_specs/sidecar.py b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_service_specs/sidecar.py index dd05734f237f..2a2a8edabedf 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_service_specs/sidecar.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_service_specs/sidecar.py @@ -321,19 +321,6 @@ async def _get_mounts( storage_directory_name=_storage_directory_name, ) ) - # for now only enable this with dev features enabled - elif app_settings.DIRECTOR_V2_DEV_FEATURE_R_CLONE_MOUNTS_ENABLED: - mounts.append( - DynamicSidecarVolumesPathsResolver.mount_r_clone( - swarm_stack_name=dynamic_services_scheduler_settings.SWARM_STACK_NAME, - path=path_to_mount, - node_uuid=scheduler_data.node_uuid, - service_run_id=scheduler_data.run_id, - project_id=scheduler_data.project_id, - user_id=scheduler_data.user_id, - r_clone_settings=dynamic_sidecar_settings.DYNAMIC_SIDECAR_R_CLONE_SETTINGS, - ) - ) else: mounts.append( DynamicSidecarVolumesPathsResolver.mount_entry( diff --git a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/scheduler/_core/_events_utils.py b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/scheduler/_core/_events_utils.py index e5083d5895cd..3aec31d163dd 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/scheduler/_core/_events_utils.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/scheduler/_core/_events_utils.py @@ -356,16 +356,10 @@ async def attempt_pod_removal_and_data_saving( try: tasks = [ - service_push_outputs(app, scheduler_data.node_uuid, sidecars_client) + 
service_push_outputs(app, scheduler_data.node_uuid, sidecars_client), + service_save_state(app, scheduler_data.node_uuid, sidecars_client), ] - # When enabled no longer uploads state via nodeports - # It uses rclone mounted volumes for this task. - if not app_settings.DIRECTOR_V2_DEV_FEATURE_R_CLONE_MOUNTS_ENABLED: - tasks.append( - service_save_state(app, scheduler_data.node_uuid, sidecars_client) - ) - await logged_gather(*tasks, max_concurrency=2) scheduler_data.dynamic_sidecar.were_state_and_outputs_saved = True @@ -547,11 +541,8 @@ async def _restore_service_state_with_metrics() -> None: tasks = [ _pull_user_services_images_with_metrics(), _pull_output_ports_with_metrics(), + _restore_service_state_with_metrics(), ] - # When enabled no longer downloads state via nodeports - # S3 is used to store state paths - if not app_settings.DIRECTOR_V2_DEV_FEATURE_R_CLONE_MOUNTS_ENABLED: - tasks.append(_restore_service_state_with_metrics()) await limited_gather(*tasks, limit=3) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/volumes.py b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/volumes.py index bf375b29eede..71630b814cb4 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/volumes.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/volumes.py @@ -1,6 +1,6 @@ import os from pathlib import Path -from typing import Any +from typing import Any, Final from models_library.api_schemas_directorv2.services import ( CHARS_IN_VOLUME_NAME_BEFORE_DIR_NAME, @@ -20,72 +20,8 @@ WRITE_SIZE, AwsEfsSettings, ) -from settings_library.r_clone import S3Provider -from ...core.dynamic_services_settings.sidecar import RCloneSettings -from .errors import DynamicSidecarError - -DY_SIDECAR_SHARED_STORE_PATH = Path("/shared-store") - - -def _get_s3_volume_driver_config( - r_clone_settings: RCloneSettings, - project_id: ProjectID, - node_uuid: NodeID, - 
storage_directory_name: str, -) -> dict[str, Any]: - assert "/" not in storage_directory_name # nosec - driver_config: dict[str, Any] = { - "Name": "rclone", - "Options": { - "type": "s3", - "s3-access_key_id": r_clone_settings.R_CLONE_S3.S3_ACCESS_KEY, - "s3-secret_access_key": r_clone_settings.R_CLONE_S3.S3_SECRET_KEY, - "path": f"{r_clone_settings.R_CLONE_S3.S3_BUCKET_NAME}/{project_id}/{node_uuid}/{storage_directory_name}", - "allow-other": "true", - "vfs-cache-mode": r_clone_settings.R_CLONE_VFS_CACHE_MODE.value, - # Directly connected to how much time it takes for - # files to appear on remote s3, please se discussion - # SEE https://forum.rclone.org/t/file-added-to-s3-on-one-machine-not-visible-on-2nd-machine-unless-mount-is-restarted/20645 - # SEE https://rclone.org/commands/rclone_mount/#vfs-directory-cache - "dir-cache-time": f"{r_clone_settings.R_CLONE_DIR_CACHE_TIME_SECONDS}s", - "poll-interval": f"{r_clone_settings.R_CLONE_POLL_INTERVAL_SECONDS}s", - }, - } - if r_clone_settings.R_CLONE_S3.S3_ENDPOINT: - driver_config["Options"][ - "s3-endpoint" - ] = r_clone_settings.R_CLONE_S3.S3_ENDPOINT - - extra_options: dict[str, str] | None = None - - if r_clone_settings.R_CLONE_PROVIDER == S3Provider.MINIO: - extra_options = { - "s3-provider": "Minio", - "s3-region": "us-east-1", - "s3-location_constraint": "", - "s3-server_side_encryption": "", - } - elif r_clone_settings.R_CLONE_PROVIDER == S3Provider.CEPH: - extra_options = { - "s3-provider": "Ceph", - "s3-acl": "private", - } - elif r_clone_settings.R_CLONE_PROVIDER == S3Provider.AWS: - extra_options = { - "s3-provider": "AWS", - "s3-region": r_clone_settings.R_CLONE_S3.S3_REGION, - "s3-acl": "private", - } - else: - msg = f"Unexpected, all {S3Provider.__name__} should be covered" - raise DynamicSidecarError(msg=msg) - - assert extra_options is not None # nosec - options: dict[str, Any] = driver_config["Options"] - options.update(extra_options) - - return driver_config +DY_SIDECAR_SHARED_STORE_PATH: 
Final[Path] = Path("/shared-store") def _get_efs_volume_driver_config( @@ -225,39 +161,6 @@ def mount_user_preferences( volume_size_limit="10M" if has_quota_support else None, ) - @classmethod - def mount_r_clone( - cls, - swarm_stack_name: str, - path: Path, - node_uuid: NodeID, - service_run_id: ServiceRunID, - project_id: ProjectID, - user_id: UserID, - r_clone_settings: RCloneSettings, - ) -> dict[str, Any]: - return { - "Source": cls.source(path, node_uuid, service_run_id), - "Target": cls.target(path), - "Type": "volume", - "VolumeOptions": { - "Labels": { - "source": cls.source(path, node_uuid, service_run_id), - "run_id": f"{service_run_id}", - "node_uuid": f"{node_uuid}", - "study_id": f"{project_id}", - "user_id": f"{user_id}", - "swarm_stack_name": swarm_stack_name, - }, - "DriverConfig": _get_s3_volume_driver_config( - r_clone_settings=r_clone_settings, - project_id=project_id, - node_uuid=node_uuid, - storage_directory_name=cls.volume_name(path).strip("_"), - ), - }, - } - @classmethod def mount_efs( cls, diff --git a/services/director-v2/tests/integration/02/test_dynamic_sidecar_nodeports_integration.py b/services/director-v2/tests/integration/02/test_dynamic_sidecar_nodeports_integration.py index 66d68768f059..b2364beabaa4 100644 --- a/services/director-v2/tests/integration/02/test_dynamic_sidecar_nodeports_integration.py +++ b/services/director-v2/tests/integration/02/test_dynamic_sidecar_nodeports_integration.py @@ -14,7 +14,6 @@ from typing import Any, NamedTuple, cast from uuid import uuid4 -import aioboto3 import aiodocker import httpx import pytest @@ -94,7 +93,6 @@ is_legacy, patch_dynamic_service_url, run_command, - sleep_for, ) from yarl import URL @@ -330,26 +328,6 @@ async def db_manager(sqlalchemy_async_engine: AsyncEngine) -> DBManager: return DBManager(sqlalchemy_async_engine, application_name=APP_NAME) -def _is_docker_r_clone_plugin_installed() -> bool: - return "rclone:" in run_command("docker plugin ls") - - -@pytest.fixture( - 
scope="session", - params={ - # NOTE: There is an issue with the docker rclone volume plugin: - # SEE https://github.com/rclone/rclone/issues/6059 - # Disabling rclone test until this is fixed. - # "true", - "false", - }, -) -def dev_feature_r_clone_enabled(request) -> str: - if request.param == "true" and not _is_docker_r_clone_plugin_installed(): - pytest.skip("Required docker plugin `rclone` not installed.") - return request.param - - @pytest.fixture async def patch_storage_setup( mocker: MockerFixture, @@ -375,7 +353,6 @@ def mock_env( mock_env: EnvVarsDict, monkeypatch: pytest.MonkeyPatch, network_name: str, - dev_feature_r_clone_enabled: str, dask_scheduler_service: str, dask_scheduler_auth: ClusterAuthentication, minimal_configuration: None, @@ -422,7 +399,6 @@ def mock_env( "RABBIT_HOST": f"{get_localhost_ip()}", "POSTGRES_HOST": f"{get_localhost_ip()}", "R_CLONE_PROVIDER": "MINIO", - "DIRECTOR_V2_DEV_FEATURE_R_CLONE_MOUNTS_ENABLED": dev_feature_r_clone_enabled, "COMPUTATIONAL_BACKEND_ENABLED": "true", "COMPUTATIONAL_BACKEND_DASK_CLIENT_ENABLED": "true", "COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_URL": dask_scheduler_service, @@ -711,36 +687,6 @@ async def _fetch_data_via_data_manager( return save_to -async def _fetch_data_via_aioboto( - r_clone_settings: RCloneSettings, - dir_tag: str, - temp_dir: Path, - node_id: NodeIDStr, - project_id: ProjectID, -) -> Path: - save_to = temp_dir / f"aioboto_{dir_tag}_{uuid4()}" - save_to.mkdir(parents=True, exist_ok=True) - - session = aioboto3.Session( - aws_access_key_id=r_clone_settings.R_CLONE_S3.S3_ACCESS_KEY, - aws_secret_access_key=r_clone_settings.R_CLONE_S3.S3_SECRET_KEY, - ) - async with session.resource( - "s3", endpoint_url=r_clone_settings.R_CLONE_S3.S3_ENDPOINT - ) as s3: - bucket = await s3.Bucket(r_clone_settings.R_CLONE_S3.S3_BUCKET_NAME) - async for s3_object in bucket.objects.all(): - key_path = f"{project_id}/{node_id}/{DY_SERVICES_R_CLONE_DIR_NAME}/" - if s3_object.key.startswith(key_path): - 
file_object = await s3_object.get() - file_path = save_to / s3_object.key.replace(key_path, "") - print(f"Saving file to {file_path}") - file_content = await file_object["Body"].read() - file_path.write_bytes(file_content) - - return save_to - - async def _start_and_wait_for_dynamic_services_ready( director_v2_client: httpx.AsyncClient, product_name: str, @@ -1075,39 +1021,13 @@ async def test_nodeports_integration( app_settings.DYNAMIC_SERVICES.DYNAMIC_SIDECAR.DYNAMIC_SIDECAR_R_CLONE_SETTINGS ) - if app_settings.DIRECTOR_V2_DEV_FEATURE_R_CLONE_MOUNTS_ENABLED: - await sleep_for( - WAIT_FOR_R_CLONE_VOLUME_TO_SYNC_DATA, - "Waiting for rclone to sync data from the docker volume", - ) - - dy_path_volume_before = ( - await _fetch_data_via_aioboto( - r_clone_settings=r_clone_settings, - dir_tag="dy", - temp_dir=tmp_path, - node_id=services_node_uuids.dy, - project_id=current_study.uuid, - ) - if app_settings.DIRECTOR_V2_DEV_FEATURE_R_CLONE_MOUNTS_ENABLED - else await _fetch_data_from_container( - dir_tag="dy", service_uuid=services_node_uuids.dy, temp_dir=tmp_path - ) + dy_path_volume_before = await _fetch_data_from_container( + dir_tag="dy", service_uuid=services_node_uuids.dy, temp_dir=tmp_path ) - dy_compose_spec_path_volume_before = ( - await _fetch_data_via_aioboto( - r_clone_settings=r_clone_settings, - dir_tag="dy_compose_spec", - temp_dir=tmp_path, - node_id=services_node_uuids.dy_compose_spec, - project_id=current_study.uuid, - ) - if app_settings.DIRECTOR_V2_DEV_FEATURE_R_CLONE_MOUNTS_ENABLED - else await _fetch_data_from_container( - dir_tag="dy_compose_spec", - service_uuid=services_node_uuids.dy_compose_spec, - temp_dir=tmp_path, - ) + dy_compose_spec_path_volume_before = await _fetch_data_from_container( + dir_tag="dy_compose_spec", + service_uuid=services_node_uuids.dy_compose_spec, + temp_dir=tmp_path, ) # STEP 5 @@ -1125,52 +1045,26 @@ async def test_nodeports_integration( await _wait_for_dy_services_to_fully_stop(async_client) - if 
app_settings.DIRECTOR_V2_DEV_FEATURE_R_CLONE_MOUNTS_ENABLED: - await sleep_for( - WAIT_FOR_R_CLONE_VOLUME_TO_SYNC_DATA, - "Waiting for rclone to sync data from the docker volume", - ) - - dy_path_data_manager_before = ( - await _fetch_data_via_aioboto( - r_clone_settings=r_clone_settings, - dir_tag="dy", - temp_dir=tmp_path, - node_id=services_node_uuids.dy, - project_id=current_study.uuid, - ) - if app_settings.DIRECTOR_V2_DEV_FEATURE_R_CLONE_MOUNTS_ENABLED - else await _fetch_data_via_data_manager( - r_clone_settings=r_clone_settings, - dir_tag="dy", - user_id=current_user["id"], - project_id=current_study.uuid, - service_uuid=NodeID(services_node_uuids.dy), - temp_dir=tmp_path, - io_log_redirect_cb=mock_io_log_redirect_cb, - faker=faker, - ) + dy_path_data_manager_before = await _fetch_data_via_data_manager( + r_clone_settings=r_clone_settings, + dir_tag="dy", + user_id=current_user["id"], + project_id=current_study.uuid, + service_uuid=NodeID(services_node_uuids.dy), + temp_dir=tmp_path, + io_log_redirect_cb=mock_io_log_redirect_cb, + faker=faker, ) - dy_compose_spec_path_data_manager_before = ( - await _fetch_data_via_aioboto( - r_clone_settings=r_clone_settings, - dir_tag="dy_compose_spec", - temp_dir=tmp_path, - node_id=services_node_uuids.dy_compose_spec, - project_id=current_study.uuid, - ) - if app_settings.DIRECTOR_V2_DEV_FEATURE_R_CLONE_MOUNTS_ENABLED - else await _fetch_data_via_data_manager( - r_clone_settings=r_clone_settings, - dir_tag="dy_compose_spec", - user_id=current_user["id"], - project_id=current_study.uuid, - service_uuid=NodeID(services_node_uuids.dy_compose_spec), - temp_dir=tmp_path, - io_log_redirect_cb=mock_io_log_redirect_cb, - faker=faker, - ) + dy_compose_spec_path_data_manager_before = await _fetch_data_via_data_manager( + r_clone_settings=r_clone_settings, + dir_tag="dy_compose_spec", + user_id=current_user["id"], + project_id=current_study.uuid, + service_uuid=NodeID(services_node_uuids.dy_compose_spec), + temp_dir=tmp_path, + 
io_log_redirect_cb=mock_io_log_redirect_cb, + faker=faker, ) # STEP 6 @@ -1185,33 +1079,13 @@ async def test_nodeports_integration( catalog_url=services_endpoint["catalog"], ) - dy_path_volume_after = ( - await _fetch_data_via_aioboto( - r_clone_settings=r_clone_settings, - dir_tag="dy", - temp_dir=tmp_path, - node_id=services_node_uuids.dy, - project_id=current_study.uuid, - ) - if app_settings.DIRECTOR_V2_DEV_FEATURE_R_CLONE_MOUNTS_ENABLED - else await _fetch_data_from_container( - dir_tag="dy", service_uuid=services_node_uuids.dy, temp_dir=tmp_path - ) + dy_path_volume_after = await _fetch_data_from_container( + dir_tag="dy", service_uuid=services_node_uuids.dy, temp_dir=tmp_path ) - dy_compose_spec_path_volume_after = ( - await _fetch_data_via_aioboto( - r_clone_settings=r_clone_settings, - dir_tag="dy_compose_spec", - temp_dir=tmp_path, - node_id=services_node_uuids.dy_compose_spec, - project_id=current_study.uuid, - ) - if app_settings.DIRECTOR_V2_DEV_FEATURE_R_CLONE_MOUNTS_ENABLED - else await _fetch_data_from_container( - dir_tag="dy_compose_spec", - service_uuid=services_node_uuids.dy_compose_spec, - temp_dir=tmp_path, - ) + dy_compose_spec_path_volume_after = await _fetch_data_from_container( + dir_tag="dy_compose_spec", + service_uuid=services_node_uuids.dy_compose_spec, + temp_dir=tmp_path, ) # STEP 7 From 9ff8d304af57c1018d6cc99ccb3ca898787b03df Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Thu, 21 Aug 2025 10:56:14 +0200 Subject: [PATCH 04/79] removed unused --- .../modules/dynamic_sidecar/docker_service_specs/sidecar.py | 2 -- .../modules/dynamic_sidecar/scheduler/_core/_events_utils.py | 1 - 2 files changed, 3 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_service_specs/sidecar.py b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_service_specs/sidecar.py index 2a2a8edabedf..485447466d65 100644 --- 
a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_service_specs/sidecar.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_service_specs/sidecar.py @@ -241,7 +241,6 @@ async def _get_mounts( scheduler_data: SchedulerData, dynamic_sidecar_settings: DynamicSidecarSettings, dynamic_services_scheduler_settings: DynamicServicesSchedulerSettings, - app_settings: AppSettings, has_quota_support: bool, rpc_client: RabbitMQRPCClient, is_efs_enabled: bool, @@ -433,7 +432,6 @@ async def get_dynamic_sidecar_spec( # pylint:disable=too-many-arguments# noqa: scheduler_data=scheduler_data, dynamic_services_scheduler_settings=dynamic_services_scheduler_settings, dynamic_sidecar_settings=dynamic_sidecar_settings, - app_settings=app_settings, has_quota_support=has_quota_support, rpc_client=rpc_client, is_efs_enabled=user_extra_properties.is_efs_enabled, diff --git a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/scheduler/_core/_events_utils.py b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/scheduler/_core/_events_utils.py index 3aec31d163dd..205585ec779f 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/scheduler/_core/_events_utils.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/scheduler/_core/_events_utils.py @@ -472,7 +472,6 @@ async def wait_for_sidecar_api(app: FastAPI, scheduler_data: SchedulerData) -> N async def prepare_services_environment( app: FastAPI, scheduler_data: SchedulerData ) -> None: - app_settings: AppSettings = app.state.settings sidecars_client = await get_sidecars_client(app, scheduler_data.node_uuid) dynamic_sidecar_endpoint = scheduler_data.endpoint From 720ee1b1a5879cc485aa422d84db131c532a79e6 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 22 Aug 2025 07:03:07 +0200 Subject: [PATCH 05/79] moved config_file placement --- 
.../src/servicelib/r_clone_utils.py | 13 +++++++++++++ .../tests/tests_r_clone_utils.py | 11 +++++++++++ .../simcore_sdk/node_ports_common/r_clone.py | 17 +++-------------- .../tests/unit/test_node_ports_v2_r_clone.py | 7 ------- 4 files changed, 27 insertions(+), 21 deletions(-) create mode 100644 packages/service-library/src/servicelib/r_clone_utils.py create mode 100644 packages/service-library/tests/tests_r_clone_utils.py diff --git a/packages/service-library/src/servicelib/r_clone_utils.py b/packages/service-library/src/servicelib/r_clone_utils.py new file mode 100644 index 000000000000..933bd29d2e63 --- /dev/null +++ b/packages/service-library/src/servicelib/r_clone_utils.py @@ -0,0 +1,13 @@ +from collections.abc import AsyncIterator +from contextlib import asynccontextmanager + +from aiofiles import tempfile + + +@asynccontextmanager +async def config_file(config: str) -> AsyncIterator[str]: + async with tempfile.NamedTemporaryFile("w") as f: + await f.write(config) + await f.flush() + assert isinstance(f.name, str) # nosec + yield f.name diff --git a/packages/service-library/tests/tests_r_clone_utils.py b/packages/service-library/tests/tests_r_clone_utils.py new file mode 100644 index 000000000000..94f9a826ab03 --- /dev/null +++ b/packages/service-library/tests/tests_r_clone_utils.py @@ -0,0 +1,11 @@ +from pathlib import Path + +from faker import Faker +from servicelib.r_clone_utils import config_file + + +async def test_config_file(faker: Faker) -> None: + text_to_write = faker.text() + async with config_file(text_to_write) as file_name: + assert text_to_write == Path(file_name).read_text() + assert Path(file_name).exists() is False diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone.py index 69b005e6ed0a..8156bba3ba44 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone.py +++ 
b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone.py @@ -3,16 +3,14 @@ import re import shlex from asyncio.streams import StreamReader -from collections.abc import AsyncIterator -from contextlib import asynccontextmanager from pathlib import Path from typing import Final from aiocache import cached # type: ignore[import-untyped] -from aiofiles import tempfile from common_library.errors_classes import OsparcErrorMixin from pydantic import AnyUrl, BaseModel, ByteSize from servicelib.progress_bar import ProgressBarData +from servicelib.r_clone_utils import config_file from servicelib.utils import logged_gather from settings_library.r_clone import RCloneSettings from settings_library.utils_r_clone import get_r_clone_config @@ -45,15 +43,6 @@ class RCloneDirectoryNotFoundError(BaseRCloneError): ) -@asynccontextmanager -async def _config_file(config: str) -> AsyncIterator[str]: - async with tempfile.NamedTemporaryFile("w") as f: - await f.write(config) - await f.flush() - assert isinstance(f.name, str) # nosec - yield f.name - - async def _read_stream(stream: StreamReader, r_clone_log_parsers: list[BaseLogParser]): while True: line: bytes = await stream.readline() @@ -149,7 +138,7 @@ async def _get_folder_size( r_clone_config_file_content = get_r_clone_config( r_clone_settings, s3_config_key=s3_config_key ) - async with _config_file(r_clone_config_file_content) as config_file_name: + async with config_file(r_clone_config_file_content) as config_file_name: r_clone_command = ( "rclone", f"--config {config_file_name}", @@ -193,7 +182,7 @@ async def _sync_sources( r_clone_config_file_content = get_r_clone_config( r_clone_settings, s3_config_key=s3_config_key ) - async with _config_file(r_clone_config_file_content) as config_file_name: + async with config_file(r_clone_config_file_content) as config_file_name: r_clone_command = ( "rclone", "--config", diff --git a/packages/simcore-sdk/tests/unit/test_node_ports_v2_r_clone.py 
b/packages/simcore-sdk/tests/unit/test_node_ports_v2_r_clone.py index 181813559fbd..372fdd17774c 100644 --- a/packages/simcore-sdk/tests/unit/test_node_ports_v2_r_clone.py +++ b/packages/simcore-sdk/tests/unit/test_node_ports_v2_r_clone.py @@ -71,13 +71,6 @@ async def test_is_r_clone_available_cached( assert await r_clone.is_r_clone_available(None) is False -async def test__config_file(faker: Faker) -> None: - text_to_write = faker.text() - async with r_clone._config_file(text_to_write) as file_name: # noqa: SLF001 - assert text_to_write == Path(file_name).read_text() - assert Path(file_name).exists() is False - - async def test__async_command_ok() -> None: result = await r_clone._async_r_clone_command("ls", "-la") # noqa: SLF001 assert len(result) > 0 From 36f9ebf42c173f95420e96c0c543f972620b281b Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Mon, 8 Dec 2025 13:26:32 +0100 Subject: [PATCH 06/79] renamed --- .../src/settings_library/r_clone.py | 1 + .../src/settings_library/utils_r_clone.py | 18 +++++++++++++----- .../tests/test_utils_r_clone.py | 4 ++-- .../simcore_sdk/node_ports_common/r_clone.py | 7 +++---- 4 files changed, 19 insertions(+), 11 deletions(-) diff --git a/packages/settings-library/src/settings_library/r_clone.py b/packages/settings-library/src/settings_library/r_clone.py index adb87ee45726..91ddbf4f68c0 100644 --- a/packages/settings-library/src/settings_library/r_clone.py +++ b/packages/settings-library/src/settings_library/r_clone.py @@ -9,6 +9,7 @@ class S3Provider(StrEnum): AWS = "AWS" + AWS_MOTO = "AWS_MOTO" CEPH = "CEPH" MINIO = "MINIO" diff --git a/packages/settings-library/src/settings_library/utils_r_clone.py b/packages/settings-library/src/settings_library/utils_r_clone.py index cda4f878ad5b..ed4551221743 100644 --- a/packages/settings-library/src/settings_library/utils_r_clone.py +++ b/packages/settings-library/src/settings_library/utils_r_clone.py @@ -15,21 +15,29 @@ _PROVIDER_SETTINGS_OPTIONS: dict[S3Provider, dict[str, str]] = { # 
NOTE: # AWS_SESSION_TOKEN should be required for STS S3Provider.AWS: {"provider": "AWS"}, + S3Provider.AWS_MOTO: { + "provider": "Other", + "force_path_style": "true", + "endpoint": "{endpoint}", + }, S3Provider.CEPH: {"provider": "Ceph", "endpoint": "{endpoint}"}, S3Provider.MINIO: {"provider": "Minio", "endpoint": "{endpoint}"}, } -def _format_config(settings_options: dict[str, str], s3_config_key: str) -> str: +def format_config(config_key: str, settings_options: dict[str, str]) -> str: + """creates .ini file content for a given rclone configuration""" config = configparser.ConfigParser() - config[s3_config_key] = settings_options + config[config_key] = settings_options with StringIO() as string_io: config.write(string_io) string_io.seek(0) return string_io.read() -def get_r_clone_config(r_clone_settings: RCloneSettings, *, s3_config_key: str) -> str: +def get_s3_r_clone_config( + r_clone_settings: RCloneSettings, *, s3_config_key: str +) -> str: """ Arguments: r_clone_settings -- current rclone configuration @@ -44,8 +52,8 @@ def get_r_clone_config(r_clone_settings: RCloneSettings, *, s3_config_key: str) _PROVIDER_SETTINGS_OPTIONS[r_clone_settings.R_CLONE_PROVIDER] ) - r_clone_config_template = _format_config( - settings_options=settings_options, s3_config_key=s3_config_key + r_clone_config_template = format_config( + config_key=s3_config_key, settings_options=settings_options ) # replace entries in template diff --git a/packages/settings-library/tests/test_utils_r_clone.py b/packages/settings-library/tests/test_utils_r_clone.py index 82dabf47daf8..a1a0be0207ab 100644 --- a/packages/settings-library/tests/test_utils_r_clone.py +++ b/packages/settings-library/tests/test_utils_r_clone.py @@ -5,7 +5,7 @@ from settings_library.r_clone import RCloneSettings, S3Provider from settings_library.utils_r_clone import ( _COMMON_SETTINGS_OPTIONS, - get_r_clone_config, + get_s3_r_clone_config, resolve_provider, ) @@ -24,7 +24,7 @@ def r_clone_settings( def 
test_r_clone_config_template_replacement(r_clone_settings: RCloneSettings) -> None: - r_clone_config = get_r_clone_config(r_clone_settings, s3_config_key="target-s3") + r_clone_config = get_s3_r_clone_config(r_clone_settings, s3_config_key="target-s3") print(r_clone_config) assert "{endpoint}" not in r_clone_config diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone.py index 8156bba3ba44..ea49122d1d86 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone.py @@ -13,7 +13,7 @@ from servicelib.r_clone_utils import config_file from servicelib.utils import logged_gather from settings_library.r_clone import RCloneSettings -from settings_library.utils_r_clone import get_r_clone_config +from settings_library.utils_r_clone import get_s3_r_clone_config from ._utils import BaseLogParser from .r_clone_utils import ( @@ -81,7 +81,6 @@ async def _async_r_clone_command( [asyncio.create_task(_read_stream(proc.stdout, [*r_clone_log_parsers]))] ) - # NOTE: ANE not sure why you do this call here. The above one already reads out the stream. 
_stdout, _stderr = await proc.communicate() command_output = command_result_parser.get_output() @@ -135,7 +134,7 @@ async def _get_folder_size( folder: Path, s3_config_key: str, ) -> ByteSize: - r_clone_config_file_content = get_r_clone_config( + r_clone_config_file_content = get_s3_r_clone_config( r_clone_settings, s3_config_key=s3_config_key ) async with config_file(r_clone_config_file_content) as config_file_name: @@ -179,7 +178,7 @@ async def _sync_sources( s3_config_key=s3_config_key, ) - r_clone_config_file_content = get_r_clone_config( + r_clone_config_file_content = get_s3_r_clone_config( r_clone_settings, s3_config_key=s3_config_key ) async with config_file(r_clone_config_file_content) as config_file_name: From 611e99a0d9ccf1572f7263b03bd25e8768231e72 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Tue, 9 Dec 2025 14:43:31 +0100 Subject: [PATCH 07/79] working version --- .../modules/rclone/__init__.py | 0 .../modules/rclone/_config_provider.py | 24 + .../modules/rclone/_mount.py | 385 +++++++++++++ .../tests/unit/rclone/test__mount.py | 521 ++++++++++++++++++ 4 files changed, 930 insertions(+) create mode 100644 services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/rclone/__init__.py create mode 100644 services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/rclone/_config_provider.py create mode 100644 services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/rclone/_mount.py create mode 100644 services/dynamic-sidecar/tests/unit/rclone/test__mount.py diff --git a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/rclone/__init__.py b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/rclone/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/rclone/_config_provider.py b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/rclone/_config_provider.py new file mode 100644 
index 000000000000..e7a372bdf7ef --- /dev/null +++ b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/rclone/_config_provider.py @@ -0,0 +1,24 @@ +from enum import Enum, auto +from typing import Final + +from settings_library.utils_r_clone import get_s3_r_clone_config +from simcore_service_dynamic_sidecar.core.settings import ApplicationSettings + +CONFIG_KEY: Final[str] = "MOUNT_REMOTE" + + +class MountRemoteType(Enum): + S3 = auto() + + +def get_config_content( + settings: ApplicationSettings, mount_remote_type: MountRemoteType +) -> str: + match mount_remote_type: + case MountRemoteType.S3: + return get_s3_r_clone_config( + settings.DY_SIDECAR_R_CLONE_SETTINGS, s3_config_key=CONFIG_KEY + ) + case _: + msg = f"Mount type {mount_remote_type} not implemented" + raise NotImplementedError(msg) diff --git a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/rclone/_mount.py b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/rclone/_mount.py new file mode 100644 index 000000000000..bf56ed16371c --- /dev/null +++ b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/rclone/_mount.py @@ -0,0 +1,385 @@ +import asyncio +import logging +import os +from collections.abc import Awaitable, Callable +from contextlib import AsyncExitStack +from datetime import timedelta +from pathlib import Path +from typing import Any, Final +from uuid import uuid4 + +import httpx +from common_library.errors_classes import OsparcErrorMixin +from fastapi import FastAPI +from httpx import AsyncClient +from models_library.basic_types import PortInt +from models_library.progress_bar import ProgressReport +from pydantic import BaseModel, NonNegativeFloat +from servicelib.container_utils import run_command_in_container +from servicelib.logging_utils import log_catch +from servicelib.r_clone_utils import config_file +from simcore_service_dynamic_sidecar.core.settings import ( + ApplicationSettings, + RCloneMountSettings, +) 
+from tenacity import ( + before_sleep_log, + retry, + retry_if_exception_type, + stop_after_delay, + wait_fixed, +) + +from ._config_provider import CONFIG_KEY, MountRemoteType, get_config_content + +_logger = logging.getLogger(__name__) + + +_DEFAULT_REMOTE_CONTROL_HOST: Final[str] = "localhost" +_MAX_WAIT_RC_HTTP_INTERFACE_READY: Final[timedelta] = timedelta(seconds=10) +_DEFAULT_UPDATE_INTERVAL: Final[timedelta] = timedelta(seconds=1) +_DEFAULT_R_CLONE_CLIENT_REQUEST_TIMEOUT: Final[timedelta] = timedelta(seconds=5) + + +class _BaseRcloneMountError(OsparcErrorMixin, RuntimeError): + pass + + +class _ProcessAlreadyStartedError(_BaseRcloneMountError): + msg_template: str = "Process already started with pid='{pid}' via '{command}'" + + +class _MountAlreadyStartedError(_BaseRcloneMountError): + msg_template: str = ( + "Mount process already stareted with pid='{pid}' via '{command}'" + ) + + +class _WaitingForTransfersToCompleteError(_BaseRcloneMountError): + msg_template: str = "Waiting for all transfers to complete" + + +class _WaitingForQueueToBeEmptyError(_BaseRcloneMountError): + msg_template: str = "Waiting for VFS queue to be empty: queue={queue}" + + +type MountID = str + + +def _get_command__pid_of_background_command(command: str) -> str: + return f"sh -c '{command} & echo $!'" + + +def _get_command__sigterm_process(pid: str) -> str: + return f"kill -SIGTERM {pid}" + + +class DaemonProcessManager: + """manage a command that is meant to run in a container forever""" + + def __init__(self, command: str, *, timeout: NonNegativeFloat = 5) -> None: + self.command = command + self.timeout = timeout + self.pid: str | None = None + + async def _run_in_container(self, command: str) -> str: + self_container = os.environ["HOSTNAME"] + return await run_command_in_container( + self_container, command=command, timeout=self.timeout + ) + + async def start(self): + if self.pid: + raise _ProcessAlreadyStartedError(pid=self.pid, command=self.command) + + command_result = 
await self._run_in_container( + command=_get_command__pid_of_background_command(self.command) + ) + # pid is printed as the first line of the output + self.pid = command_result.strip().split("\n")[0] + _logger.debug("Started rclone mount with pid=%s", self.pid) + + async def stop(self): + if self.pid is None: + return + + # since the process could have failed to start or failed shortly after + # starting the pid mind not be corresponding to a running process + # and will raise an error + with log_catch(_logger, reraise=False): + await self._run_in_container( + command=_get_command__sigterm_process(self.pid) + ) + + +def _get_rclone_mount_command( + config_file_path: str, + remote_path: Path, + local_mount_path: Path, + vfs_cache_path: Path, + rc_addr: str, + rc_user: str, + rc_password: str, +) -> str: + escaped_remote_path = f"{remote_path}".lstrip("/") + command: list[str] = [ + "rclone", + "--config", + config_file_path, + f"--log-file=/tmp/rclone-debug{uuid4()}.log", + "-vv", + "mount", + f"{CONFIG_KEY}:{escaped_remote_path}", + f"{local_mount_path}", + "--vfs-cache-mode full", + "--vfs-write-back", + "1s", # write-back delay + "--vfs-fast-fingerprint", # recommended for s3 backend + "--no-modtime", # don't read/write the modification time + "--cache-dir", + f"{vfs_cache_path}", + "--rc", + f"--rc-addr={rc_addr}", + "--rc-enable-metrics", + f"--rc-user='{rc_user}'", + f"--rc-pass='{rc_password}'", + "--allow-non-empty", + ] + return " ".join(command) + + +class MountActivity(BaseModel): + transferring: dict[str, ProgressReport] + queued: list[str] + + +class RCloneRCInterfaceClient: + def __init__( + self, + remote_control_port: PortInt, + r_clone_mount_settings: RCloneMountSettings, + *, + update_handler: Callable[[MountActivity], Awaitable[None]], + remote_control_host: str = _DEFAULT_REMOTE_CONTROL_HOST, + update_interval: timedelta = _DEFAULT_UPDATE_INTERVAL, + r_clone_client_timeout: timedelta = _DEFAULT_R_CLONE_CLIENT_REQUEST_TIMEOUT, + ) -> None: + 
self._r_clone_mount_settings = r_clone_mount_settings + self._update_interval_seconds = update_interval.total_seconds() + self._r_clone_client_timeout = r_clone_client_timeout + self._update_handler = update_handler + + self._rc_host = remote_control_host + self._rc_port = remote_control_port + self.rc_user = f"{uuid4()}" + self.rc_password = f"{uuid4()}" + + self._cleanup_stack = AsyncExitStack() + self._client: AsyncClient | None = None + + self._continue_running: bool = True + self._transfer_monitor: asyncio.Task | None = None + + async def setup(self) -> None: + self._client = await self._cleanup_stack.enter_async_context( + AsyncClient(timeout=self._r_clone_client_timeout.total_seconds()) + ) + self._transfer_monitor = asyncio.create_task(self._monitor()) + + async def teardown(self) -> None: + if self._transfer_monitor is not None: + self._continue_running = False + await self._transfer_monitor + self._transfer_monitor = None + + await self._cleanup_stack.aclose() + + @property + def _base_url(self) -> str: + return f"http://{self._rc_host}:{self._rc_port}" + + async def _request(self, method: str, path: str) -> Any: + assert self._client is not None # nosec + + response = await self._client.request( + method, f"{self._base_url}/{path}", auth=(self.rc_user, self.rc_password) + ) + response.raise_for_status() + result = response.json() + _logger.debug("'%s %s' replied with: %s", method, path, result) + return result + + async def _post_core_stats(self) -> dict: + return await self._request("POST", "core/stats") + + async def _post_vfs_queue(self) -> dict: + return await self._request("POST", "vfs/queue") + + async def _rc_noop(self) -> dict: + return await self._request("POST", "rc/noop") + + async def _monitor(self) -> None: + while self._continue_running: + await asyncio.sleep(self._update_interval_seconds) + + core_stats, vfs_queue = await asyncio.gather( + self._post_core_stats(), self._post_vfs_queue() + ) + + mount_activity = MountActivity( + 
transferring=( + { + x["name"]: ProgressReport(actual_value=x["percentage"] / 100) + for x in core_stats["transferring"] + } + if "transferring" in core_stats + else {} + ), + queued=[x["name"] for x in vfs_queue["queue"]], + ) + + await self._update_handler(mount_activity) + + @retry( + wait=wait_fixed(1), + stop=stop_after_delay(_MAX_WAIT_RC_HTTP_INTERFACE_READY.total_seconds()), + reraise=True, + retry=retry_if_exception_type(httpx.HTTPError), + before_sleep=before_sleep_log(_logger, logging.WARNING), + ) + async def wait_for_interface_to_be_ready(self) -> None: + await self._rc_noop() + + async def wait_for_all_transfers_to_complete(self) -> None: + @retry( + wait=wait_fixed(1), + stop=stop_after_delay( + self._r_clone_mount_settings.R_CLONE_MOUNT_TRANSFERS_COMPLETED_TIMEOUT.total_seconds() + ), + reraise=True, + retry=retry_if_exception_type( + (_WaitingForQueueToBeEmptyError, _WaitingForTransfersToCompleteError) + ), + before_sleep=before_sleep_log(_logger, logging.WARNING), + ) + async def _() -> None: + core_stats, vfs_queue = await asyncio.gather( + self._post_core_stats(), self._post_vfs_queue() + ) + + if ( + core_stats["transfers"] != core_stats["totalTransfers"] + or "transferring" in core_stats + ): + raise _WaitingForTransfersToCompleteError + + queue = vfs_queue["queue"] + if len(queue) != 0: + raise _WaitingForQueueToBeEmptyError(queue=queue) + + await _() + + +class TrackedMount: + def __init__( + self, + settings: ApplicationSettings, + remote_type: MountRemoteType, + *, + rc_port: PortInt, + remote_path: Path, + local_mount_path: Path, + vfs_cache_path: Path, + ) -> None: + self.settings = settings + self.mount_type = remote_type + self.rc_port = rc_port + self.remote_path = remote_path + self.local_mount_path = local_mount_path + self.vfs_cache_path = vfs_cache_path + + async def _handler( + mount_activity: MountActivity, + ) -> None: + _logger.debug("mount_activity=%s", mount_activity) + + self.rc_interface = RCloneRCInterfaceClient( + 
remote_control_port=rc_port, + r_clone_mount_settings=settings.R_CLONE_MOUNT_SETTINGS, + update_handler=_handler, + ) + + # used internally to handle the mount command + self._daemon_manager: DaemonProcessManager | None = None + self._cleanup_stack = AsyncExitStack() + + async def setup(self) -> None: + pass + + async def teardown(self) -> None: + await self.stop_mount() + + async def start_mount(self) -> None: + if self._daemon_manager is not None: + raise _MountAlreadyStartedError( + pid=self._daemon_manager.pid, command=self._daemon_manager.command + ) + + config_file_path = await self._cleanup_stack.enter_async_context( + config_file(get_config_content(self.settings, self.mount_type)) + ) + + self._daemon_manager = DaemonProcessManager( + command=_get_rclone_mount_command( + config_file_path=config_file_path, + remote_path=self.remote_path, + local_mount_path=self.local_mount_path, + vfs_cache_path=self.vfs_cache_path, + rc_addr=f"0.0.0.0:{self.rc_port}", + rc_user=self.rc_interface.rc_user, + rc_password=self.rc_interface.rc_password, + ) + ) + await self._daemon_manager.start() + await self.rc_interface.setup() + await self.rc_interface.wait_for_interface_to_be_ready() + + async def stop_mount(self) -> None: + if self._daemon_manager is None: + return + + await self.rc_interface.wait_for_all_transfers_to_complete() + await self.rc_interface.teardown() + + await self._daemon_manager.stop() + self._daemon_manager = None + + await self._cleanup_stack.aclose() + + +class RCloneMountManager: + def __init__(self, app: FastAPI) -> None: + self.app = app + # keep track of all started mount commands via their pid and http endpoint, might need different ports for the http API + # add rc-user and rc-password the the config stored here so that nobody can access without credentials + + self._started_mounts: dict[MountID, TrackedMount] = {} + + async def start_mount(self, remote_type: MountRemoteType) -> MountID: + # create a mount via some configuration and keep track of 
it + pass + + async def stop_mount(self, mount_id: MountID) -> None: + pass + + async def setup(self) -> None: + pass + + async def teardown(self) -> None: + # await for all to terminate, limited gather + pass + + +# TODO: be able to mange multiple sources to be mounted + +# TODO: oauth atuthorization pattern needs to be setup for non S3 providers diff --git a/services/dynamic-sidecar/tests/unit/rclone/test__mount.py b/services/dynamic-sidecar/tests/unit/rclone/test__mount.py new file mode 100644 index 000000000000..18e0ae6fab48 --- /dev/null +++ b/services/dynamic-sidecar/tests/unit/rclone/test__mount.py @@ -0,0 +1,521 @@ +# pylint: disable=protected-access +# pylint: disable=redefined-outer-name +# pylint: disable=unused-argument +import os +import re +import secrets +from collections.abc import AsyncIterable, AsyncIterator +from contextlib import asynccontextmanager +from pathlib import Path +from typing import Final, cast + +import aioboto3 +import aiodocker +import aiofiles +import pytest +from aiobotocore.session import ClientCreatorContext +from aiodocker.networks import DockerNetwork +from botocore.client import Config +from faker import Faker +from models_library.api_schemas_storage.storage_schemas import S3BucketName +from models_library.basic_types import PortInt +from pydantic import ByteSize, TypeAdapter +from pytest_mock import MockerFixture +from pytest_simcore.helpers.monkeypatch_envs import EnvVarsDict, setenvs_from_dict +from servicelib.container_utils import run_command_in_container +from servicelib.file_utils import create_sha256_checksum +from servicelib.logging_utils import _dampen_noisy_loggers +from servicelib.utils import limited_gather +from simcore_service_dynamic_sidecar.core.settings import ApplicationSettings +from simcore_service_dynamic_sidecar.modules.rclone import _mount +from simcore_service_dynamic_sidecar.modules.rclone._config_provider import ( + MountRemoteType, +) +from simcore_service_dynamic_sidecar.modules.rclone._mount 
import ( + DaemonProcessManager, + TrackedMount, +) +from types_aiobotocore_s3 import S3Client + +_dampen_noisy_loggers(("botocore", "aiobotocore", "aioboto3", "moto.server")) + + +@pytest.fixture +def r_clone_version(package_dir: Path) -> str: + install_rclone_bash = ( + (package_dir / ".." / ".." / ".." / "..").resolve() + / "scripts" + / "install_rclone.bash" + ) + assert install_rclone_bash.exists() + + match = re.search(r'R_CLONE_VERSION="([\d.]+)"', install_rclone_bash.read_text()) + assert match + return match.group(1) + + +@pytest.fixture +def local_s3_content_path(tmpdir: Path) -> Path: + # path where s3 are created and then uploaded form + path = Path(tmpdir) / "copy_to_s3" + path.mkdir(parents=True, exist_ok=True) + return path + + +@pytest.fixture +def r_clone_local_mount_path(tmpdir: Path) -> Path: + # where rclone mount will make the files available + path = Path(tmpdir) / "r_clone_local_mount_path" + path.mkdir(parents=True, exist_ok=True) + return path + + +@pytest.fixture +def config_path(tmpdir: Path) -> Path: + # where the configuration path for rclone is found inside the container + path = Path(tmpdir) / "config_path" + path.mkdir(parents=True, exist_ok=True) + return path + + +@pytest.fixture +def mock_config_file(config_path: Path, faker: Faker, mocker: MockerFixture) -> None: + # ensure this returns a path where the config is living which has to be mounted in the container + # replace context manager with one that writes here + @asynccontextmanager + async def config_file(config: str) -> AsyncIterator[str]: + file_path = config_path / f"{faker.uuid4()}" + file_path.write_text(config) + yield f"{file_path}" + + file_path.unlink() + + mocker.patch.object(_mount, "config_file", config_file) + + +_MONITORING_PORT: Final[PortInt] = 5572 + + +@pytest.fixture +async def docker_network() -> AsyncIterable[DockerNetwork]: + async with aiodocker.Docker() as client: + network_to_attach = await client.networks.create({"Name": "a_test_network"}) + try: + 
yield network_to_attach + finally: + await network_to_attach.delete() + + +@pytest.fixture +async def r_clone_container( + r_clone_version: str, + r_clone_local_mount_path: Path, + config_path: Path, + monkeypatch: pytest.MonkeyPatch, + docker_network: DockerNetwork, +) -> AsyncIterable[str]: + async with aiodocker.Docker() as client: + container = await client.containers.run( + config={ + "Image": f"rclone/rclone:{r_clone_version}", + "Entrypoint": ["/bin/sh", "-c", "apk add findutils && sleep 10000"], + "ExposedPorts": {f"{_MONITORING_PORT}/tcp": {}}, + "HostConfig": { + "PortBindings": { + f"{_MONITORING_PORT}/tcp": [{"HostPort": f"{_MONITORING_PORT}"}] + }, + "Binds": [ + f"{r_clone_local_mount_path}:{r_clone_local_mount_path}:rw", + f"{config_path}:{config_path}:rw", + ], + "Devices": [ + { + "PathOnHost": "/dev/fuse", + "PathInContainer": "/dev/fuse", + "CgroupPermissions": "rwm", + } + ], + "CapAdd": ["SYS_ADMIN"], + "SecurityOpt": ["apparmor:unconfined", "seccomp:unconfined"], + }, + } + ) + container_inspect = await container.show() + + container_name = container_inspect["Name"][1:] + monkeypatch.setenv("HOSTNAME", container_name) + + await docker_network.connect({"Container": container.id}) + + try: + yield container.id + finally: + await container.delete(force=True) + + +@pytest.fixture +async def moto_container(docker_network: DockerNetwork) -> AsyncIterable[None]: + async with aiodocker.Docker() as client: + container = await client.containers.run( + config={ + "Image": "motoserver/moto:latest", + "ExposedPorts": {"5000/tcp": {}}, + "HostConfig": { + "PortBindings": {"5000/tcp": [{"HostPort": "5000"}]}, + }, + "Env": ["MOTO_PORT=5000"], + }, + name="moto", + ) + await docker_network.connect({"Container": container.id}) + + try: + yield None + finally: + await container.delete(force=True) + + +async def test_daemon_container_process(r_clone_container: str): + container_process = DaemonProcessManager("sleep 10000") + await container_process.start() + 
assert container_process.pid + + ps_command = "ps -o pid,stat,comm" + result = await container_process._run_in_container(ps_command) # noqa: SLF001 + assert f"{container_process.pid} S" in result # check sleeping + + await container_process.stop() + await container_process._run_in_container(ps_command) # noqa: SLF001 + assert f"{container_process.pid} Z" not in result # check killed + + +@pytest.fixture +def mock_environment( + monkeypatch: pytest.MonkeyPatch, mock_environment: EnvVarsDict +) -> EnvVarsDict: + setenvs_from_dict( + monkeypatch, + { + "R_CLONE_PROVIDER": "AWS_MOTO", + "S3_ENDPOINT": "http://moto:5000", + "S3_ACCESS_KEY": "test", + "S3_BUCKET_NAME": "test", + "S3_SECRET_KEY": "test", + "S3_REGION": "us-east-1", + }, + ) + return mock_environment + + +@pytest.fixture +def application_settings(mock_environment: EnvVarsDict) -> ApplicationSettings: + return ApplicationSettings.create_from_envs() + + +@pytest.fixture +def remote_path() -> Path: + return Path("test") + + +@pytest.fixture +async def s3_client( + application_settings: ApplicationSettings, +) -> AsyncIterable[S3Client]: + s3_settings = application_settings.DY_SIDECAR_R_CLONE_SETTINGS.R_CLONE_S3 + session = aioboto3.Session() + session_client = session.client( + "s3", + endpoint_url=f"{s3_settings.S3_ENDPOINT}".replace("moto", "localhost"), + aws_access_key_id=s3_settings.S3_ACCESS_KEY, + aws_secret_access_key=s3_settings.S3_SECRET_KEY, + region_name=s3_settings.S3_REGION, + config=Config(signature_version="s3v4"), + ) + assert isinstance(session_client, ClientCreatorContext) # nosec + async with session_client as client: + client = cast(S3Client, client) + yield client + + +@pytest.fixture +def bucket_name(application_settings: ApplicationSettings) -> S3BucketName: + return TypeAdapter(S3BucketName).validate_python( + application_settings.DY_SIDECAR_R_CLONE_SETTINGS.R_CLONE_S3.S3_BUCKET_NAME, + ) + + +def _secure_randint(a: int, b: int) -> int: + return a + secrets.randbelow(b - a + 1) + + 
+_DEFAULT_CHUCNK_SIZE: Final[ByteSize] = TypeAdapter(ByteSize).validate_python("1kb") + + +async def _get_random_file( + faker: Faker, + *, + store_to: Path, + file_size: ByteSize, + chunk_size: ByteSize = _DEFAULT_CHUCNK_SIZE, +) -> Path: + # creates a file in a path and returns it's hash + # generate a random file of size X and a random path inside the directory + + path_in_folder = Path( + faker.file_path(depth=_secure_randint(0, 5), extension="bin") + ).relative_to("/") + file_path = store_to / path_in_folder + + # ensure parent directory exists + file_path.parent.mkdir(parents=True, exist_ok=True) + assert file_path.parent.exists() + + async with aiofiles.open(file_path, "wb") as file: + written = 0 + while written < file_size: + to_write = min(chunk_size, file_size - written) + chunk = os.urandom(to_write) + await file.write(chunk) + written += to_write + + return path_in_folder + + +def _get_random_file_size() -> ByteSize: + return TypeAdapter(ByteSize).validate_python(f"{_secure_randint(1,1024)}Kb") + + +@pytest.fixture +async def create_files_in_s3( + application_settings: ApplicationSettings, + moto_container: None, + s3_client: S3Client, + bucket_name: S3BucketName, + faker: Faker, + remote_path: Path, + local_s3_content_path: Path, +) -> AsyncIterable[None]: + + await s3_client.create_bucket(Bucket=bucket_name) + + async def _create_file() -> None: + path_in_folder = await _get_random_file( + faker, + store_to=local_s3_content_path, + file_size=_get_random_file_size(), + ) + file_path = local_s3_content_path / path_in_folder + assert file_path.exists() + await s3_client.upload_file( + Filename=f"{file_path}", + Bucket=bucket_name, + Key=f"{remote_path/path_in_folder}", + ) + + files_to_create = _secure_randint(5, 20) + await limited_gather(*[_create_file() for _ in range(files_to_create)], limit=5) + + yield None + + files_in_bucket = await s3_client.list_objects_v2(Bucket=bucket_name) + + await limited_gather( + *[ + 
s3_client.delete_object(Bucket=bucket_name, Key=obj["Key"]) + for obj in files_in_bucket.get("Contents", []) + ], + limit=10, + ) + + # check all content form s3 was removed + files_in_bucket = await s3_client.list_objects_v2(Bucket=bucket_name) + assert files_in_bucket.get("Contents", []) == [] + + +@pytest.fixture +def mock_default_remote_control_host(mocker: MockerFixture) -> None: + mocker.patch( + "simcore_service_dynamic_sidecar.modules.rclone._mount._DEFAULT_REMOTE_CONTROL_HOST", + "0.0.0.0", # noqa: S104 + ) + + +@pytest.fixture +def vfs_cache_path(tmpdir: Path) -> Path: + # path inside the docker container where the vfs cache will be stored + # for tests this can be just placed in the tmp directory ? + # TODO: for better tests it's better that is mounted as a volume + return Path("/tmp/rclone_cache") # noqa: S108 + + +@pytest.fixture +async def tracked_mount( + mock_default_remote_control_host: None, + r_clone_container: str, + mock_config_file: None, + application_settings: ApplicationSettings, + remote_path: Path, + r_clone_local_mount_path: Path, + vfs_cache_path: Path, +) -> AsyncIterable[TrackedMount]: + tracked_mount = TrackedMount( + application_settings, + MountRemoteType.S3, + rc_port=_MONITORING_PORT, + remote_path=remote_path, + local_mount_path=r_clone_local_mount_path, + vfs_cache_path=vfs_cache_path, + ) + await tracked_mount.setup() + + yield tracked_mount + + await tracked_mount.teardown() + + +async def _get_file_checksums_from_local_path( + local_s3_content_path: Path, +) -> dict[Path, str]: + local_checksums = {} + for dirpath, _, filenames in os.walk(local_s3_content_path): + for filename in filenames: + file_path = Path(dirpath) / filename + relative_path = file_path.relative_to(local_s3_content_path) + + async with aiofiles.open(file_path, "rb") as file: + checksum = await create_sha256_checksum(file) + + local_checksums[relative_path] = checksum + return local_checksums + + +async def _get_file_checksums_from_container( + 
remote_path: Path, + r_clone_container: str, + bucket_name: S3BucketName, +) -> dict[Path, str]: + remote_checksum_and_files = await run_command_in_container( + r_clone_container, + command=f"find {remote_path} -type f -exec sha256sum {{}} \\;", + timeout=30, + ) + + def _parse_entry(entry: str) -> tuple[Path, str]: + checksum, file_path = entry.strip().split() + relative_path = ( + Path(file_path).relative_to(remote_path).relative_to(Path(bucket_name)) + ) + return relative_path, checksum + + return dict( + [_parse_entry(x) for x in remote_checksum_and_files.strip().split("\n")] + ) + + +async def _get_files_from_s3( + s3_client: S3Client, + bucket_name: S3BucketName, +) -> dict[Path, str]: + """Download files from S3 and return their SHA256 checksums.""" + files_in_bucket = await s3_client.list_objects_v2(Bucket=bucket_name) + + async def _get_file_checksum(key: str) -> tuple[Path, str]: + response = await s3_client.get_object(Bucket=bucket_name, Key=key) + checksum = await create_sha256_checksum(response["Body"]) + return Path(key).relative_to(Path(bucket_name)), checksum + + results = await limited_gather( + *[ + _get_file_checksum(obj["Key"]) + for obj in files_in_bucket.get("Contents", []) + ], + limit=10, + ) + + return dict(results) + + +async def _assert_local_content_in_s3( + s3_client: S3Client, + bucket_name: S3BucketName, + local_s3_content_path: Path, +) -> None: + files_local_folder = await _get_file_checksums_from_local_path( + local_s3_content_path + ) + files_from_s3 = await _get_files_from_s3(s3_client, bucket_name) + + assert files_local_folder == files_from_s3 + + +async def _assert_same_files_in_all_places( + s3_client: S3Client, + bucket_name: S3BucketName, + r_clone_container: str, + r_clone_local_mount_path: Path, +) -> None: + files_from_container = await _get_file_checksums_from_container( + r_clone_local_mount_path, r_clone_container, bucket_name + ) + files_from_s3 = await _get_files_from_s3(s3_client, bucket_name) + assert 
async def _change_file_in_container(remote_path: Path, r_clone_container: str) -> None:
    """Overwrite the file at `remote_path` (inside the container) with random bytes."""
    await run_command_in_container(
        r_clone_container,
        command=f"dd if=/dev/urandom of={remote_path} bs={_get_random_file_size()} count=1",
        timeout=30,
    )


async def test_r_clone_mount(
    create_files_in_s3: None,
    tracked_mount: TrackedMount,
    r_clone_local_mount_path: Path,
    # maybe drop
    s3_client: S3Client,
    bucket_name: S3BucketName,
    r_clone_container: str,
    local_s3_content_path: Path,
):
    """End-to-end: mount, verify initial content, mutate via the mount, verify sync."""
    await tracked_mount.start_mount()

    # baseline: the initially uploaded content is visible in S3
    await _assert_local_content_in_s3(s3_client, bucket_name, local_s3_content_path)

    def _get_random_file_in_container() -> Path:
        # pick one of the originally generated files, addressed through the mount
        return (
            r_clone_local_mount_path
            / bucket_name
            / secrets.choice(
                [x for x in local_s3_content_path.rglob("*") if x.is_file()]
            ).relative_to(local_s3_content_path)
        )

    # change and check all is the same
    files_to_change = {_get_random_file_in_container() for _ in range(15)}
    await limited_gather(
        *[_change_file_in_container(x, r_clone_container) for x in files_to_change],
        limit=10,
    )

    await tracked_mount.rc_interface.wait_for_all_transfers_to_complete()
    await _assert_same_files_in_all_places(
        s3_client,
        bucket_name,
        r_clone_container,
        r_clone_local_mount_path,
    )

    await tracked_mount.stop_mount()


# TODO:
# better real world tests
# use this to mount a folder like the node directly in a separate path on the sidecar and expose this
# to the users somehow
# so that we can use it from a jupyter-math


# TODO: we need a mode to check if rclone mount properly resumes the mounting in case of crash and restart
# we need a test for this one
insertions(+), 7 deletions(-) diff --git a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/rclone/_mount.py b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/rclone/_mount.py index bf56ed16371c..c7670bae52fc 100644 --- a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/rclone/_mount.py +++ b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/rclone/_mount.py @@ -3,7 +3,7 @@ import os from collections.abc import Awaitable, Callable from contextlib import AsyncExitStack -from datetime import timedelta +from datetime import UTC, datetime, timedelta from pathlib import Path from typing import Any, Final from uuid import uuid4 @@ -40,6 +40,8 @@ _DEFAULT_UPDATE_INTERVAL: Final[timedelta] = timedelta(seconds=1) _DEFAULT_R_CLONE_CLIENT_REQUEST_TIMEOUT: Final[timedelta] = timedelta(seconds=5) +_DEFAULT_MOUNT_ACTIVITY_UPDATE_INTERVAL: Final[timedelta] = timedelta(seconds=5) + class _BaseRcloneMountError(OsparcErrorMixin, RuntimeError): pass @@ -290,6 +292,7 @@ def __init__( remote_path: Path, local_mount_path: Path, vfs_cache_path: Path, + mount_activity_update_interval: timedelta = _DEFAULT_MOUNT_ACTIVITY_UPDATE_INTERVAL, ) -> None: self.settings = settings self.mount_type = remote_type @@ -298,21 +301,38 @@ def __init__( self.local_mount_path = local_mount_path self.vfs_cache_path = vfs_cache_path - async def _handler( - mount_activity: MountActivity, - ) -> None: - _logger.debug("mount_activity=%s", mount_activity) - self.rc_interface = RCloneRCInterfaceClient( remote_control_port=rc_port, r_clone_mount_settings=settings.R_CLONE_MOUNT_SETTINGS, - update_handler=_handler, + update_handler=self._progress_handler, ) + self._last_mount_activity: MountActivity | None = None + self._last_mount_activity_update: datetime = datetime.fromtimestamp(0, UTC) + self._mount_activity_update_interval = mount_activity_update_interval # used internally to handle the mount command self._daemon_manager: 
DaemonProcessManager | None = None self._cleanup_stack = AsyncExitStack() + async def _progress_handler(self, mount_activity: MountActivity) -> None: + now = datetime.now(UTC) + + enough_time_passed = ( + now - self._last_mount_activity_update + > self._mount_activity_update_interval + ) + + if enough_time_passed and self._last_mount_activity != mount_activity: + self._last_mount_activity = mount_activity + self._last_mount_activity_update = now + + # NOTE: this could also be useful if pushed to the UI + _logger.info( + "Activity for '%s': %s", + self.local_mount_path, + self._last_mount_activity, + ) + async def setup(self) -> None: pass From 66102a317d8ce7199602d6bda8eb0a710b916009 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Tue, 9 Dec 2025 15:12:29 +0100 Subject: [PATCH 09/79] refactor --- .../core/settings.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/core/settings.py b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/core/settings.py index 78ebce5d2c30..bf2a0a63dcb1 100644 --- a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/core/settings.py +++ b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/core/settings.py @@ -47,6 +47,19 @@ class ResourceTrackingSettings(BaseApplicationSettings): ) +class RCloneMountSettings(BaseApplicationSettings): + R_CLONE_MOUNT_TRANSFERS_COMPLETED_TIMEOUT: timedelta = Field( + default=timedelta(minutes=60), + description="max amount of time to wait when closing the rclone mount", + ) + + _validate_r_clone_mount_transfers_completed_timeout = ( + validate_numeric_string_as_timedelta( + "R_CLONE_MOUNT_TRANSFERS_COMPLETED_TIMEOUT" + ) + ) + + class SystemMonitorSettings(BaseApplicationSettings): DY_SIDECAR_SYSTEM_MONITOR_TELEMETRY_ENABLE: bool = Field( default=False, description="enabled/disabled disk usage monitoring" @@ -202,6 +215,10 @@ class ApplicationSettings(BaseApplicationSettings, 
class RCloneMountSettings(BaseCustomSettings):
    """all settings related to mounting go here"""

    # upper bound on waiting for rclone to flush pending uploads when a mount
    # is being closed; generous because large files may still be uploading
    R_CLONE_MOUNT_TRANSFERS_COMPLETED_TIMEOUT: timedelta = Field(
        default=timedelta(minutes=60),
        description="max amount of time to wait when closing the rclone mount",
    )

    # NOTE(review): presumably allows plain numeric (seconds) strings from the
    # environment in addition to ISO durations — confirm against the validator
    _validate_r_clone_mount_transfers_completed_timeout = (
        validate_numeric_string_as_timedelta(
            "R_CLONE_MOUNT_TRANSFERS_COMPLETED_TIMEOUT"
        )
    )
# Name of the rclone remote-config section used by every generated config file;
# mount targets are addressed as "<CONFIG_KEY>:<path>"
CONFIG_KEY: Final[str] = "MOUNT_REMOTE"


class MountRemoteType(Enum):
    """Kinds of remote backends an rclone mount can point at."""

    S3 = auto()


def get_config_content(
    r_clone_settings: RCloneSettings, mount_remote_type: MountRemoteType
) -> str:
    """Render the rclone config-file content for the requested remote type.

    Raises:
        NotImplementedError: for remote types without a config renderer yet.
    """
    match mount_remote_type:
        case MountRemoteType.S3:
            return get_s3_r_clone_config(r_clone_settings, s3_config_key=CONFIG_KEY)
        case _:
            msg = f"Mount type {mount_remote_type} not implemented"
            raise NotImplementedError(msg)
class _BaseRcloneMountError(OsparcErrorMixin, RuntimeError):
    """Base class of all rclone-mount related errors."""


class _ProcessAlreadyStartedError(_BaseRcloneMountError):
    msg_template: str = "Process already started with pid='{pid}' via '{command}'"


class _MountAlreadyStartedError(_BaseRcloneMountError):
    # BUGFIX: "stareted" -> "started" typo in user-visible message
    msg_template: str = (
        "Mount process already started with pid='{pid}' via '{command}'"
    )


class _WaitingForTransfersToCompleteError(_BaseRcloneMountError):
    msg_template: str = "Waiting for all transfers to complete"


class _WaitingForQueueToBeEmptyError(_BaseRcloneMountError):
    msg_template: str = "Waiting for VFS queue to be empty: queue={queue}"


def _get_command__pid_of_background_command(command: str) -> str:
    """Wrap `command` so it runs in the background and its PID is echoed
    as the first line of output.

    NOTE: `command` is placed inside single quotes, so it must not itself
    contain single quotes.
    """
    return f"sh -c '{command} & echo $!'"


def _get_command__sigterm_process(pid: str) -> str:
    """Return a shell command sending SIGTERM to `pid`."""
    return f"kill -SIGTERM {pid}"


class DaemonProcessManager:
    """manage a command that is meant to run in a container forever"""

    def __init__(self, command: str, *, timeout: NonNegativeFloat = 5) -> None:
        self.command = command
        # per-exec timeout (seconds) for commands run inside the container
        self.timeout = timeout
        # pid of the daemonized process, set by start(), cleared by stop()
        self.pid: str | None = None

    async def _run_in_container(self, command: str) -> str:
        # inside a docker container HOSTNAME is the container's own id/name
        self_container = os.environ["HOSTNAME"]
        return await run_command_in_container(
            self_container, command=command, timeout=self.timeout
        )

    async def start(self):
        """Launch the command in the background and remember its pid.

        Raises:
            _ProcessAlreadyStartedError: if this manager already started a process.
        """
        if self.pid:
            raise _ProcessAlreadyStartedError(pid=self.pid, command=self.command)

        command_result = await self._run_in_container(
            command=_get_command__pid_of_background_command(self.command)
        )
        # pid is printed as the first line of the output
        self.pid = command_result.strip().split("\n")[0]
        _logger.debug("Started rclone mount with pid=%s", self.pid)

    async def stop(self):
        """Best-effort SIGTERM of the daemonized process; safe to call repeatedly."""
        if self.pid is None:
            return

        # since the process could have failed to start or failed shortly after
        # starting, the pid might not correspond to a running process
        # and the kill command would raise an error
        with log_catch(_logger, reraise=False):
            await self._run_in_container(
                command=_get_command__sigterm_process(self.pid)
            )
        # BUGFIX: reset pid so the manager does not stay permanently "started"
        # after a stop (start() would otherwise always raise)
        self.pid = None


def _get_rclone_mount_command(
    config_file_path: str,
    remote_path: Path,
    local_mount_path: Path,
    vfs_cache_path: Path,
    rc_addr: str,
    rc_user: str,
    rc_password: str,
) -> str:
    """Assemble the `rclone mount` invocation exposing the remote-control HTTP API.

    NOTE: the returned string is later wrapped in single quotes by
    _get_command__pid_of_background_command, so it must not contain
    single quotes itself.
    """
    # rclone remote paths are addressed relative to the remote root
    escaped_remote_path = f"{remote_path}".lstrip("/")
    command: list[str] = [
        "rclone",
        "--config",
        config_file_path,
        # TODO: maybe a reproducible log path instead of a random one for simpler access to logs?
        f"--log-file=/tmp/rclone-debug{uuid4()}.log",  # noqa: S108
        "-vv",
        "mount",
        f"{CONFIG_KEY}:{escaped_remote_path}",
        f"{local_mount_path}",
        "--vfs-cache-mode full",
        "--vfs-write-back",
        "1s",  # write-back delay TODO: could be part of the settings?
        "--vfs-fast-fingerprint",  # recommended for s3 backend
        "--no-modtime",  # don't read/write the modification time
        "--cache-dir",
        f"{vfs_cache_path}",
        "--rc",
        f"--rc-addr={rc_addr}",
        "--rc-enable-metrics",
        # BUGFIX: do NOT quote the credentials — the full command is wrapped
        # in single quotes when daemonized and nested single quotes would
        # terminate the outer quoting; both values are UUIDs (no shell
        # metacharacters), so quoting is unnecessary
        f"--rc-user={rc_user}",
        f"--rc-pass={rc_password}",
        "--allow-non-empty",
    ]
    return " ".join(command)
class MountActivity(BaseModel):
    """Snapshot of what an rclone mount is currently doing."""

    # file name -> upload progress (fraction in [0, 1])
    transferring: dict[str, ProgressReport]
    # names of files waiting in the VFS upload queue
    queued: list[str]


class RCloneRCInterfaceClient:
    """Client for rclone's remote-control (rc) HTTP API of a single mount.

    Once `setup()` is called, a background task polls transfer stats at
    `update_interval` and forwards a MountActivity to `update_handler`.
    Also exposes helpers to wait for the rc API to come up and for all
    pending transfers to be flushed.
    """

    def __init__(
        self,
        remote_control_port: PortInt,
        r_clone_mount_settings: RCloneMountSettings,
        *,
        update_handler: Callable[[MountActivity], Awaitable[None]],
        remote_control_host: str = _DEFAULT_REMOTE_CONTROL_HOST,
        update_interval: timedelta = _DEFAULT_UPDATE_INTERVAL,
        r_clone_client_timeout: timedelta = _DEFAULT_R_CLONE_CLIENT_REQUEST_TIMEOUT,
    ) -> None:
        self._r_clone_mount_settings = r_clone_mount_settings
        self._update_interval_seconds = update_interval.total_seconds()
        self._r_clone_client_timeout = r_clone_client_timeout
        self._update_handler = update_handler

        self._rc_host = remote_control_host
        self._rc_port = remote_control_port
        # fresh random credentials per client instance; passed to the rclone
        # process via --rc-user/--rc-pass and used for HTTP basic auth below
        self.rc_user = f"{uuid4()}"
        self.rc_password = f"{uuid4()}"

        self._cleanup_stack = AsyncExitStack()
        self._client: AsyncClient | None = None

        # flag polled by the monitor task; set to False by teardown()
        self._continue_running: bool = True
        self._transfer_monitor: asyncio.Task | None = None

    async def setup(self) -> None:
        """Create the HTTP client and start the background monitor task."""
        self._client = await self._cleanup_stack.enter_async_context(
            AsyncClient(timeout=self._r_clone_client_timeout.total_seconds())
        )
        self._transfer_monitor = asyncio.create_task(self._monitor())

    async def teardown(self) -> None:
        """Stop the monitor task (cooperatively) and close the HTTP client.

        NOTE(review): may wait up to `update_interval` for the monitor's
        sleep to elapse; if the monitor died with an exception it is
        re-raised here — confirm that is intended.
        """
        if self._transfer_monitor is not None:
            self._continue_running = False
            await self._transfer_monitor
            self._transfer_monitor = None

        await self._cleanup_stack.aclose()

    @property
    def _base_url(self) -> str:
        return f"http://{self._rc_host}:{self._rc_port}"

    async def _request(self, method: str, path: str) -> Any:
        """Issue an authenticated rc request and return the decoded JSON body.

        Raises httpx errors on connection failures / non-2xx responses.
        """
        assert self._client is not None  # nosec

        response = await self._client.request(
            method, f"{self._base_url}/{path}", auth=(self.rc_user, self.rc_password)
        )
        response.raise_for_status()
        result = response.json()
        _logger.debug("'%s %s' replied with: %s", method, path, result)
        return result

    async def _post_core_stats(self) -> dict:
        # global transfer statistics of the rclone process
        return await self._request("POST", "core/stats")

    async def _post_vfs_queue(self) -> dict:
        # files queued for upload by the VFS write-back cache
        return await self._request("POST", "vfs/queue")

    async def _rc_noop(self) -> dict:
        # used only as a liveness probe for the rc API
        return await self._request("POST", "rc/noop")

    async def _monitor(self) -> None:
        """Poll stats/queue forever and push a MountActivity to the handler.

        NOTE(review): there is no exception handling here — a single failed
        rc request ends the task silently until teardown() awaits it; confirm
        whether retries are wanted.
        """
        while self._continue_running:
            await asyncio.sleep(self._update_interval_seconds)

            core_stats, vfs_queue = await asyncio.gather(
                self._post_core_stats(), self._post_vfs_queue()
            )

            mount_activity = MountActivity(
                transferring=(
                    {
                        # rclone reports percentage 0..100 -> normalize to 0..1
                        x["name"]: ProgressReport(actual_value=x["percentage"] / 100)
                        for x in core_stats["transferring"]
                    }
                    if "transferring" in core_stats
                    else {}
                ),
                queued=[x["name"] for x in vfs_queue["queue"]],
            )

            await self._update_handler(mount_activity)

    @retry(
        wait=wait_fixed(1),
        stop=stop_after_delay(_MAX_WAIT_RC_HTTP_INTERFACE_READY.total_seconds()),
        reraise=True,
        retry=retry_if_exception_type(httpx.HTTPError),
        before_sleep=before_sleep_log(_logger, logging.WARNING),
    )
    async def wait_for_interface_to_be_ready(self) -> None:
        """Block until the rc HTTP API answers (retries for up to 10s)."""
        await self._rc_noop()

    async def wait_for_all_transfers_to_complete(self) -> None:
        """
        Should be waited before closing the mount
        to ensure all data is transferred to remote.
        """

        @retry(
            wait=wait_fixed(1),
            stop=stop_after_delay(
                self._r_clone_mount_settings.R_CLONE_MOUNT_TRANSFERS_COMPLETED_TIMEOUT.total_seconds()
            ),
            reraise=True,
            retry=retry_if_exception_type(
                (_WaitingForQueueToBeEmptyError, _WaitingForTransfersToCompleteError)
            ),
            before_sleep=before_sleep_log(_logger, logging.WARNING),
        )
        async def _() -> None:
            core_stats, vfs_queue = await asyncio.gather(
                self._post_core_stats(), self._post_vfs_queue()
            )

            # done == total and nothing currently transferring
            if (
                core_stats["transfers"] != core_stats["totalTransfers"]
                or "transferring" in core_stats
            ):
                raise _WaitingForTransfersToCompleteError

            # the VFS write-back queue must also be drained
            queue = vfs_queue["queue"]
            if len(queue) != 0:
                raise _WaitingForQueueToBeEmptyError(queue=queue)

        await _()


class TrackedMount:
    """One rclone mount: owns its daemon process and its rc-API client."""

    def __init__(
        self,
        r_clone_settings: RCloneSettings,
        remote_type: MountRemoteType,
        *,
        rc_port: PortInt,
        remote_path: Path,
        local_mount_path: Path,
        vfs_cache_path: Path,
        mount_activity_update_interval: timedelta = _DEFAULT_MOUNT_ACTIVITY_UPDATE_INTERVAL,
    ) -> None:
        self.r_clone_settings = r_clone_settings
        self.mount_type = remote_type
        self.rc_port = rc_port
        self.remote_path = remote_path
        self.local_mount_path = local_mount_path
        self.vfs_cache_path = vfs_cache_path

        self.rc_interface = RCloneRCInterfaceClient(
            remote_control_port=rc_port,
            r_clone_mount_settings=r_clone_settings.R_CLONE_MOUNT_SETTINGS,
            update_handler=self._progress_handler,
        )
        # throttling state for activity logging
        self._last_mount_activity: MountActivity | None = None
        self._last_mount_activity_update: datetime = datetime.fromtimestamp(0, UTC)
        self._mount_activity_update_interval = mount_activity_update_interval

        # used internally to handle the mount command
        self._daemon_manager: DaemonProcessManager | None = None
        self._cleanup_stack = AsyncExitStack()

    async def _progress_handler(self, mount_activity: MountActivity) -> None:
        """Log mount activity, at most once per update interval and only on change.

        NOTE(review): a change arriving within the interval is dropped, not
        deferred — it is never logged unless a later change occurs; confirm
        this is the intended throttling semantics.
        """
        now = datetime.now(UTC)

        enough_time_passed = (
            now - self._last_mount_activity_update
            > self._mount_activity_update_interval
        )

        if enough_time_passed and self._last_mount_activity != mount_activity:
            self._last_mount_activity = mount_activity
            self._last_mount_activity_update = now

            # NOTE: this could also be useful if pushed to the UI
            _logger.info(
                "Activity for '%s': %s",
                self.local_mount_path,
                self._last_mount_activity,
            )

    async def teardown(self) -> None:
        await self.stop_mount()

    async def start_mount(self) -> None:
        """Write the rclone config, daemonize `rclone mount` and wait for its rc API.

        Raises:
            _MountAlreadyStartedError: if the mount was already started.
        """
        if self._daemon_manager is not None:
            raise _MountAlreadyStartedError(
                pid=self._daemon_manager.pid, command=self._daemon_manager.command
            )

        config_file_path = await self._cleanup_stack.enter_async_context(
            config_file(get_config_content(self.r_clone_settings, self.mount_type))
        )

        self._daemon_manager = DaemonProcessManager(
            command=_get_rclone_mount_command(
                config_file_path=config_file_path,
                remote_path=self.remote_path,
                local_mount_path=self.local_mount_path,
                vfs_cache_path=self.vfs_cache_path,
                rc_addr=f"0.0.0.0:{self.rc_port}",
                rc_user=self.rc_interface.rc_user,
                rc_password=self.rc_interface.rc_password,
            )
        )
        await self._daemon_manager.start()
        await self.rc_interface.setup()
        # NOTE(review): if this wait fails the daemon stays running with
        # _daemon_manager set — a retry of start_mount() will raise; confirm
        # callers treat this as fatal
        await self.rc_interface.wait_for_interface_to_be_ready()

    async def stop_mount(self) -> None:
        """Flush pending uploads, then shut down the rc client and the daemon.

        No-op when the mount was never started (or already stopped).
        """
        if self._daemon_manager is None:
            return

        # make sure no data is lost before terminating the rclone process
        await self.rc_interface.wait_for_all_transfers_to_complete()
        await self.rc_interface.teardown()

        await self._daemon_manager.stop()
        self._daemon_manager = None

        # removes the temporary rclone config file
        await self._cleanup_stack.aclose()
class RCloneMountManager:
    """Registry/facade that owns all TrackedMount instances of the application."""

    def __init__(
        self, r_clone_settings: RCloneSettings, common_vfs_cache_path: Path
    ) -> None:
        self.r_clone_settings = r_clone_settings
        self.common_vfs_cache_path = common_vfs_cache_path

        # mount_id -> running mount; entries are removed when a mount is stopped
        self._started_mounts: dict[str, TrackedMount] = {}

    @staticmethod
    def _get_mount_id(local_mount_path: Path) -> str:
        # flatten the path into an identifier usable both as a dict key and
        # as a directory name for the per-mount vfs cache
        return f"{local_mount_path}".replace("/", "_")

    async def start_mount(
        self,
        remote_type: MountRemoteType,
        remote_path: Path,
        local_mount_path: Path,
        vfs_cache_path_overwrite: Path | None = None,
    ) -> None:
        """Start mounting `remote_path` at `local_mount_path`.

        NOTE(review): starting twice for the same `local_mount_path` would
        silently replace (and leak) the first mount; callers are expected to
        stop it first.
        """
        mount_id = self._get_mount_id(local_mount_path)
        vfs_cache_path = (
            vfs_cache_path_overwrite or self.common_vfs_cache_path
        ) / mount_id
        vfs_cache_path.mkdir(parents=True, exist_ok=True)

        # unused_port() does blocking socket work -> keep the event loop free
        free_port = await asyncio.get_running_loop().run_in_executor(None, unused_port)

        tracked_mount = TrackedMount(
            self.r_clone_settings,
            remote_type,
            rc_port=free_port,
            remote_path=remote_path,
            local_mount_path=local_mount_path,
            vfs_cache_path=vfs_cache_path,
        )
        await tracked_mount.start_mount()

        self._started_mounts[mount_id] = tracked_mount

    async def wait_for_transfers_to_complete(self, local_mount_path: Path) -> None:
        """Block until all pending uploads of the given mount are flushed.

        Raises:
            KeyError: if no mount was started for `local_mount_path`.
        """
        mount_id = self._get_mount_id(local_mount_path)
        tracked_mount = self._started_mounts[mount_id]

        await tracked_mount.rc_interface.wait_for_all_transfers_to_complete()

    async def stop_mount(self, local_mount_path: Path) -> None:
        """Stop the given mount and forget about it.

        Raises:
            KeyError: if no mount was started for `local_mount_path`.
        """
        mount_id = self._get_mount_id(local_mount_path)
        tracked_mount = self._started_mounts[mount_id]

        await tracked_mount.stop_mount()
        # BUGFIX: drop the entry so teardown() does not stop it again and the
        # same path can be mounted anew (stopped mounts were previously kept
        # in the registry forever)
        self._started_mounts.pop(mount_id, None)

    async def setup(self) -> None:
        pass

    async def teardown(self) -> None:
        """Stop all still-running mounts concurrently."""
        # TODO: consider bounding concurrency (e.g. with limited_gather)
        await asyncio.gather(
            *[mount.teardown() for mount in self._started_mounts.values()]
        )
        self._started_mounts.clear()


# TODO: oauth authorization pattern needs to be setup for non S3 providers
@pytest.fixture
def r_clone_version(package_dir: Path) -> str:
    """Extract the rclone version pinned in scripts/install_rclone.bash.

    Keeps the rclone image used by these tests in sync with the version
    installed in production images.
    """
    install_rclone_bash = (
        (package_dir / ".." / ".." / ".." / "..").resolve()
        / "scripts"
        / "install_rclone.bash"
    )
    assert install_rclone_bash.exists()

    match = re.search(r'R_CLONE_VERSION="([\d.]+)"', install_rclone_bash.read_text())
    assert match
    return match.group(1)


@pytest.fixture
def local_s3_content_path(tmpdir: Path) -> Path:
    # path where s3 files are created and then uploaded from
    path = Path(tmpdir) / "copy_to_s3"
    path.mkdir(parents=True, exist_ok=True)
    return path


@pytest.fixture
def r_clone_local_mount_path(tmpdir: Path) -> Path:
    # where rclone mount will make the files available
    path = Path(tmpdir) / "r_clone_local_mount_path"
    path.mkdir(parents=True, exist_ok=True)
    return path


@pytest.fixture
def config_path(tmpdir: Path) -> Path:
    # where the configuration path for rclone is found inside the container
    path = Path(tmpdir) / "config_path"
    path.mkdir(parents=True, exist_ok=True)
    return path


@pytest.fixture
def mock_config_file(config_path: Path, faker: Faker, mocker: MockerFixture) -> None:
    """Replace the production `config_file` context manager with one that
    writes into `config_path`, which is bind-mounted into the rclone container."""

    @asynccontextmanager
    async def config_file(config: str) -> AsyncIterator[str]:
        file_path = config_path / f"{faker.uuid4()}"
        file_path.write_text(config)
        yield f"{file_path}"

        file_path.unlink()

    mocker.patch.object(_core, "config_file", config_file)


# fixed host port for the rclone remote-control API of the test container
_MONITORING_PORT: Final[PortInt] = 5572


@pytest.fixture
async def docker_network() -> AsyncIterable[DockerNetwork]:
    """Dedicated docker network so the rclone and moto containers can reach
    each other by name."""
    async with aiodocker.Docker() as client:
        network_to_attach = await client.networks.create({"Name": "a_test_network"})
        try:
            yield network_to_attach
        finally:
            await network_to_attach.delete()
@pytest.fixture
async def moto_container(docker_network: DockerNetwork) -> AsyncIterable[None]:
    """Run a moto (AWS mock) server container named 'moto' on the test network."""
    async with aiodocker.Docker() as client:
        container = await client.containers.run(
            config={
                "Image": "motoserver/moto:latest",
                "ExposedPorts": {"5000/tcp": {}},
                "HostConfig": {
                    "PortBindings": {"5000/tcp": [{"HostPort": "5000"}]},
                },
                "Env": ["MOTO_PORT=5000"],
            },
            name="moto",
        )
        await docker_network.connect({"Container": container.id})

        try:
            yield None
        finally:
            await container.delete(force=True)


async def test_daemon_container_process(r_clone_container: str):
    """Daemonized process is reported running after start() and gone after stop()."""
    container_process = DaemonProcessManager("sleep 10000")
    await container_process.start()
    assert container_process.pid

    ps_command = "ps -o pid,stat,comm"
    result = await container_process._run_in_container(ps_command)  # noqa: SLF001
    assert f"{container_process.pid} S" in result  # check sleeping

    await container_process.stop()
    # BUGFIX: the post-stop `ps` output was previously discarded and the
    # assertion below re-checked the stale pre-stop output; capture it so the
    # test actually verifies the process was killed
    result = await container_process._run_in_container(ps_command)  # noqa: SLF001
    assert f"{container_process.pid} Z" not in result  # check killed


@pytest.fixture
def mock_environment(monkeypatch: pytest.MonkeyPatch) -> EnvVarsDict:
    """Environment pointing the rclone settings at the moto container."""
    return setenvs_from_dict(
        monkeypatch,
        {
            "R_CLONE_PROVIDER": "AWS_MOTO",
            "S3_ENDPOINT": "http://moto:5000",
            "S3_ACCESS_KEY": "test",
            "S3_BUCKET_NAME": "test",
            "S3_SECRET_KEY": "test",
            "S3_REGION": "us-east-1",
        },
    )


@pytest.fixture
def r_clone_settings(mock_environment: EnvVarsDict) -> RCloneSettings:
    return RCloneSettings.create_from_envs()


@pytest.fixture
def remote_path() -> Path:
    # path inside the bucket under which test files are uploaded
    return Path("test")


@pytest.fixture
async def s3_client(r_clone_settings: RCloneSettings) -> AsyncIterable[S3Client]:
    """S3 client talking to moto from the host (hence 'moto' -> 'localhost')."""
    s3_settings = r_clone_settings.R_CLONE_S3
    session = aioboto3.Session()
    session_client = session.client(
        "s3",
        endpoint_url=f"{s3_settings.S3_ENDPOINT}".replace("moto", "localhost"),
        aws_access_key_id=s3_settings.S3_ACCESS_KEY,
        aws_secret_access_key=s3_settings.S3_SECRET_KEY,
        region_name=s3_settings.S3_REGION,
        config=Config(signature_version="s3v4"),
    )
    assert isinstance(session_client, ClientCreatorContext)  # nosec
    async with session_client as client:
        client = cast(S3Client, client)
        yield client


@pytest.fixture
def bucket_name(r_clone_settings: RCloneSettings) -> S3BucketName:
    return TypeAdapter(S3BucketName).validate_python(
        r_clone_settings.R_CLONE_S3.S3_BUCKET_NAME
    )


def _secure_randint(a: int, b: int) -> int:
    """Return a cryptographically secure random integer in the inclusive range [a, b]."""
    return a + secrets.randbelow(b - a + 1)


# NOTE(review): name has a typo ("CHUCNK"); it is also referenced as a default
# in _get_random_file below — rename both together in a follow-up
_DEFAULT_CHUCNK_SIZE: Final[ByteSize] = TypeAdapter(ByteSize).validate_python("1kb")
async def _get_random_file(
    faker: Faker,
    *,
    store_to: Path,
    file_size: ByteSize,
    chunk_size: ByteSize = _DEFAULT_CHUCNK_SIZE,
) -> Path:
    # creates a random binary file at a random depth below `store_to`
    # and returns its path relative to `store_to`

    path_in_folder = Path(
        faker.file_path(depth=_secure_randint(0, 5), extension="bin")
    ).relative_to("/")
    file_path = store_to / path_in_folder

    # ensure parent directory exists
    file_path.parent.mkdir(parents=True, exist_ok=True)
    assert file_path.parent.exists()

    # write random content in chunks to keep memory usage bounded
    async with aiofiles.open(file_path, "wb") as file:
        written = 0
        while written < file_size:
            to_write = min(chunk_size, file_size - written)
            chunk = os.urandom(to_write)
            await file.write(chunk)
            written += to_write

    return path_in_folder


def _get_random_file_size() -> ByteSize:
    # uniform random size between 1Kb and 1024Kb
    return TypeAdapter(ByteSize).validate_python(f"{_secure_randint(1,1024)}Kb")


@pytest.fixture
async def create_files_in_s3(
    r_clone_settings: RCloneSettings,
    moto_container: None,
    s3_client: S3Client,
    bucket_name: S3BucketName,
    faker: Faker,
    remote_path: Path,
    local_s3_content_path: Path,
) -> AsyncIterable[None]:
    """Create the bucket, upload 5-20 random files below `remote_path`,
    and empty the bucket again on teardown."""

    await s3_client.create_bucket(Bucket=bucket_name)

    async def _create_file() -> None:
        path_in_folder = await _get_random_file(
            faker,
            store_to=local_s3_content_path,
            file_size=_get_random_file_size(),
        )
        file_path = local_s3_content_path / path_in_folder
        assert file_path.exists()
        await s3_client.upload_file(
            Filename=f"{file_path}",
            Bucket=bucket_name,
            Key=f"{remote_path/path_in_folder}",
        )

    files_to_create = _secure_randint(5, 20)
    await limited_gather(*[_create_file() for _ in range(files_to_create)], limit=5)

    yield None

    # NOTE(review): list_objects_v2 returns at most 1000 keys per call; fine
    # for <=20 files here, but pagination would be needed for larger sets
    files_in_bucket = await s3_client.list_objects_v2(Bucket=bucket_name)

    await limited_gather(
        *[
            s3_client.delete_object(Bucket=bucket_name, Key=obj["Key"])
            for obj in files_in_bucket.get("Contents", [])
        ],
        limit=10,
    )

    # check all content from s3 was removed
    files_in_bucket = await s3_client.list_objects_v2(Bucket=bucket_name)
    assert files_in_bucket.get("Contents", []) == []


@pytest.fixture
def mock_rc_port_with_default(mocker: MockerFixture) -> None:
    # force the manager to pick the port published by the test container
    mocker.patch(
        "simcore_sdk.node_ports_common._r_clone_mount._core.unused_port",
        return_value=_MONITORING_PORT,
    )
+ +@pytest.fixture +def vfs_cache_path(tmpdir: Path) -> Path: + # path inside the docker container where the vfs cache will be stored + # for tests this can be just placed in the tmp directory ? + # TODO: for better tests it's better that is mounted as a volume + return Path("/tmp/rclone_cache") # noqa: S108 + + +@pytest.fixture +async def single_mount_r_clone_mount_manager( + mock_rc_port_with_default: None, + r_clone_container: str, + mock_config_file: None, + r_clone_settings: RCloneSettings, + vfs_cache_path: Path, +) -> AsyncIterable[RCloneMountManager]: + r_clone_mount_manager = RCloneMountManager(r_clone_settings, vfs_cache_path) + + yield r_clone_mount_manager + + await r_clone_mount_manager.teardown() + + +async def _get_file_checksums_from_local_path( + local_s3_content_path: Path, +) -> dict[Path, str]: + local_checksums = {} + for dirpath, _, filenames in os.walk(local_s3_content_path): + for filename in filenames: + file_path = Path(dirpath) / filename + relative_path = file_path.relative_to(local_s3_content_path) + + async with aiofiles.open(file_path, "rb") as file: + checksum = await create_sha256_checksum(file) + + local_checksums[relative_path] = checksum + return local_checksums + + +async def _get_file_checksums_from_container( + remote_path: Path, + r_clone_container: str, + bucket_name: S3BucketName, +) -> dict[Path, str]: + remote_checksum_and_files = await run_command_in_container( + r_clone_container, + command=f"find {remote_path} -type f -exec sha256sum {{}} \\;", + timeout=30, + ) + + def _parse_entry(entry: str) -> tuple[Path, str]: + checksum, file_path = entry.strip().split() + relative_path = ( + Path(file_path).relative_to(remote_path).relative_to(Path(bucket_name)) + ) + return relative_path, checksum + + return dict( + [_parse_entry(x) for x in remote_checksum_and_files.strip().split("\n")] + ) + + +async def _get_files_from_s3( + s3_client: S3Client, + bucket_name: S3BucketName, +) -> dict[Path, str]: + """Download files from S3 
and return their SHA256 checksums.""" + files_in_bucket = await s3_client.list_objects_v2(Bucket=bucket_name) + + async def _get_file_checksum(key: str) -> tuple[Path, str]: + response = await s3_client.get_object(Bucket=bucket_name, Key=key) + checksum = await create_sha256_checksum(response["Body"]) + return Path(key).relative_to(Path(bucket_name)), checksum + + results = await limited_gather( + *[ + _get_file_checksum(obj["Key"]) + for obj in files_in_bucket.get("Contents", []) + ], + limit=10, + ) + + return dict(results) + + +async def _assert_local_content_in_s3( + s3_client: S3Client, + bucket_name: S3BucketName, + local_s3_content_path: Path, +) -> None: + files_local_folder = await _get_file_checksums_from_local_path( + local_s3_content_path + ) + files_from_s3 = await _get_files_from_s3(s3_client, bucket_name) + + assert files_local_folder == files_from_s3 + + +async def _assert_same_files_in_all_places( + s3_client: S3Client, + bucket_name: S3BucketName, + r_clone_container: str, + r_clone_local_mount_path: Path, +) -> None: + files_from_container = await _get_file_checksums_from_container( + r_clone_local_mount_path, r_clone_container, bucket_name + ) + files_from_s3 = await _get_files_from_s3(s3_client, bucket_name) + assert files_from_container == files_from_s3 + + +async def _change_file_in_container(remote_path: Path, r_clone_container: str) -> None: + await run_command_in_container( + r_clone_container, + command=f"dd if=/dev/urandom of={remote_path} bs={_get_random_file_size()} count=1", + timeout=30, + ) + + +async def test_tracked_mount_waits_for_files_before_finalizing( + create_files_in_s3: None, + single_mount_r_clone_mount_manager: RCloneMountManager, + r_clone_local_mount_path: Path, + # maybe drop + s3_client: S3Client, + bucket_name: S3BucketName, + r_clone_container: str, + local_s3_content_path: Path, + remote_path: Path, +): + await single_mount_r_clone_mount_manager.start_mount( + MountRemoteType.S3, remote_path, 
r_clone_local_mount_path + ) + + await _assert_local_content_in_s3(s3_client, bucket_name, local_s3_content_path) + + def _get_random_file_in_container() -> Path: + return ( + r_clone_local_mount_path + / bucket_name + / secrets.choice( + [x for x in local_s3_content_path.rglob("*") if x.is_file()] + ).relative_to(local_s3_content_path) + ) + + # change and check all is the same + files_to_change = {_get_random_file_in_container() for _ in range(15)} + await limited_gather( + *[_change_file_in_container(x, r_clone_container) for x in files_to_change], + limit=10, + ) + + await single_mount_r_clone_mount_manager.wait_for_transfers_to_complete( + r_clone_local_mount_path + ) + await _assert_same_files_in_all_places( + s3_client, + bucket_name, + r_clone_container, + r_clone_local_mount_path, + ) + + await single_mount_r_clone_mount_manager.stop_mount(r_clone_local_mount_path) + + +# TODO: we need a mode to check if rclone mount properly resumes the mounting in case of crash and restart +# we need a test for this one From d29003762102659c513d28b436a1a11cffca7923 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Wed, 10 Dec 2025 13:13:07 +0100 Subject: [PATCH 11/79] moved away --- .../modules/rclone/_config_provider.py | 24 - .../modules/rclone/_mount.py | 405 -------------- .../tests/unit/rclone/test__mount.py | 521 ------------------ 3 files changed, 950 deletions(-) delete mode 100644 services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/rclone/_config_provider.py delete mode 100644 services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/rclone/_mount.py delete mode 100644 services/dynamic-sidecar/tests/unit/rclone/test__mount.py diff --git a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/rclone/_config_provider.py b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/rclone/_config_provider.py deleted file mode 100644 index e7a372bdf7ef..000000000000 --- 
a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/rclone/_config_provider.py +++ /dev/null @@ -1,24 +0,0 @@ -from enum import Enum, auto -from typing import Final - -from settings_library.utils_r_clone import get_s3_r_clone_config -from simcore_service_dynamic_sidecar.core.settings import ApplicationSettings - -CONFIG_KEY: Final[str] = "MOUNT_REMOTE" - - -class MountRemoteType(Enum): - S3 = auto() - - -def get_config_content( - settings: ApplicationSettings, mount_remote_type: MountRemoteType -) -> str: - match mount_remote_type: - case MountRemoteType.S3: - return get_s3_r_clone_config( - settings.DY_SIDECAR_R_CLONE_SETTINGS, s3_config_key=CONFIG_KEY - ) - case _: - msg = f"Mount type {mount_remote_type} not implemented" - raise NotImplementedError(msg) diff --git a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/rclone/_mount.py b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/rclone/_mount.py deleted file mode 100644 index c7670bae52fc..000000000000 --- a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/rclone/_mount.py +++ /dev/null @@ -1,405 +0,0 @@ -import asyncio -import logging -import os -from collections.abc import Awaitable, Callable -from contextlib import AsyncExitStack -from datetime import UTC, datetime, timedelta -from pathlib import Path -from typing import Any, Final -from uuid import uuid4 - -import httpx -from common_library.errors_classes import OsparcErrorMixin -from fastapi import FastAPI -from httpx import AsyncClient -from models_library.basic_types import PortInt -from models_library.progress_bar import ProgressReport -from pydantic import BaseModel, NonNegativeFloat -from servicelib.container_utils import run_command_in_container -from servicelib.logging_utils import log_catch -from servicelib.r_clone_utils import config_file -from simcore_service_dynamic_sidecar.core.settings import ( - ApplicationSettings, - RCloneMountSettings, -) -from tenacity import 
( - before_sleep_log, - retry, - retry_if_exception_type, - stop_after_delay, - wait_fixed, -) - -from ._config_provider import CONFIG_KEY, MountRemoteType, get_config_content - -_logger = logging.getLogger(__name__) - - -_DEFAULT_REMOTE_CONTROL_HOST: Final[str] = "localhost" -_MAX_WAIT_RC_HTTP_INTERFACE_READY: Final[timedelta] = timedelta(seconds=10) -_DEFAULT_UPDATE_INTERVAL: Final[timedelta] = timedelta(seconds=1) -_DEFAULT_R_CLONE_CLIENT_REQUEST_TIMEOUT: Final[timedelta] = timedelta(seconds=5) - -_DEFAULT_MOUNT_ACTIVITY_UPDATE_INTERVAL: Final[timedelta] = timedelta(seconds=5) - - -class _BaseRcloneMountError(OsparcErrorMixin, RuntimeError): - pass - - -class _ProcessAlreadyStartedError(_BaseRcloneMountError): - msg_template: str = "Process already started with pid='{pid}' via '{command}'" - - -class _MountAlreadyStartedError(_BaseRcloneMountError): - msg_template: str = ( - "Mount process already stareted with pid='{pid}' via '{command}'" - ) - - -class _WaitingForTransfersToCompleteError(_BaseRcloneMountError): - msg_template: str = "Waiting for all transfers to complete" - - -class _WaitingForQueueToBeEmptyError(_BaseRcloneMountError): - msg_template: str = "Waiting for VFS queue to be empty: queue={queue}" - - -type MountID = str - - -def _get_command__pid_of_background_command(command: str) -> str: - return f"sh -c '{command} & echo $!'" - - -def _get_command__sigterm_process(pid: str) -> str: - return f"kill -SIGTERM {pid}" - - -class DaemonProcessManager: - """manage a command that is meant to run in a container forever""" - - def __init__(self, command: str, *, timeout: NonNegativeFloat = 5) -> None: - self.command = command - self.timeout = timeout - self.pid: str | None = None - - async def _run_in_container(self, command: str) -> str: - self_container = os.environ["HOSTNAME"] - return await run_command_in_container( - self_container, command=command, timeout=self.timeout - ) - - async def start(self): - if self.pid: - raise 
_ProcessAlreadyStartedError(pid=self.pid, command=self.command) - - command_result = await self._run_in_container( - command=_get_command__pid_of_background_command(self.command) - ) - # pid is printed as the first line of the output - self.pid = command_result.strip().split("\n")[0] - _logger.debug("Started rclone mount with pid=%s", self.pid) - - async def stop(self): - if self.pid is None: - return - - # since the process could have failed to start or failed shortly after - # starting the pid mind not be corresponding to a running process - # and will raise an error - with log_catch(_logger, reraise=False): - await self._run_in_container( - command=_get_command__sigterm_process(self.pid) - ) - - -def _get_rclone_mount_command( - config_file_path: str, - remote_path: Path, - local_mount_path: Path, - vfs_cache_path: Path, - rc_addr: str, - rc_user: str, - rc_password: str, -) -> str: - escaped_remote_path = f"{remote_path}".lstrip("/") - command: list[str] = [ - "rclone", - "--config", - config_file_path, - f"--log-file=/tmp/rclone-debug{uuid4()}.log", - "-vv", - "mount", - f"{CONFIG_KEY}:{escaped_remote_path}", - f"{local_mount_path}", - "--vfs-cache-mode full", - "--vfs-write-back", - "1s", # write-back delay - "--vfs-fast-fingerprint", # recommended for s3 backend - "--no-modtime", # don't read/write the modification time - "--cache-dir", - f"{vfs_cache_path}", - "--rc", - f"--rc-addr={rc_addr}", - "--rc-enable-metrics", - f"--rc-user='{rc_user}'", - f"--rc-pass='{rc_password}'", - "--allow-non-empty", - ] - return " ".join(command) - - -class MountActivity(BaseModel): - transferring: dict[str, ProgressReport] - queued: list[str] - - -class RCloneRCInterfaceClient: - def __init__( - self, - remote_control_port: PortInt, - r_clone_mount_settings: RCloneMountSettings, - *, - update_handler: Callable[[MountActivity], Awaitable[None]], - remote_control_host: str = _DEFAULT_REMOTE_CONTROL_HOST, - update_interval: timedelta = _DEFAULT_UPDATE_INTERVAL, - 
r_clone_client_timeout: timedelta = _DEFAULT_R_CLONE_CLIENT_REQUEST_TIMEOUT, - ) -> None: - self._r_clone_mount_settings = r_clone_mount_settings - self._update_interval_seconds = update_interval.total_seconds() - self._r_clone_client_timeout = r_clone_client_timeout - self._update_handler = update_handler - - self._rc_host = remote_control_host - self._rc_port = remote_control_port - self.rc_user = f"{uuid4()}" - self.rc_password = f"{uuid4()}" - - self._cleanup_stack = AsyncExitStack() - self._client: AsyncClient | None = None - - self._continue_running: bool = True - self._transfer_monitor: asyncio.Task | None = None - - async def setup(self) -> None: - self._client = await self._cleanup_stack.enter_async_context( - AsyncClient(timeout=self._r_clone_client_timeout.total_seconds()) - ) - self._transfer_monitor = asyncio.create_task(self._monitor()) - - async def teardown(self) -> None: - if self._transfer_monitor is not None: - self._continue_running = False - await self._transfer_monitor - self._transfer_monitor = None - - await self._cleanup_stack.aclose() - - @property - def _base_url(self) -> str: - return f"http://{self._rc_host}:{self._rc_port}" - - async def _request(self, method: str, path: str) -> Any: - assert self._client is not None # nosec - - response = await self._client.request( - method, f"{self._base_url}/{path}", auth=(self.rc_user, self.rc_password) - ) - response.raise_for_status() - result = response.json() - _logger.debug("'%s %s' replied with: %s", method, path, result) - return result - - async def _post_core_stats(self) -> dict: - return await self._request("POST", "core/stats") - - async def _post_vfs_queue(self) -> dict: - return await self._request("POST", "vfs/queue") - - async def _rc_noop(self) -> dict: - return await self._request("POST", "rc/noop") - - async def _monitor(self) -> None: - while self._continue_running: - await asyncio.sleep(self._update_interval_seconds) - - core_stats, vfs_queue = await asyncio.gather( - 
self._post_core_stats(), self._post_vfs_queue() - ) - - mount_activity = MountActivity( - transferring=( - { - x["name"]: ProgressReport(actual_value=x["percentage"] / 100) - for x in core_stats["transferring"] - } - if "transferring" in core_stats - else {} - ), - queued=[x["name"] for x in vfs_queue["queue"]], - ) - - await self._update_handler(mount_activity) - - @retry( - wait=wait_fixed(1), - stop=stop_after_delay(_MAX_WAIT_RC_HTTP_INTERFACE_READY.total_seconds()), - reraise=True, - retry=retry_if_exception_type(httpx.HTTPError), - before_sleep=before_sleep_log(_logger, logging.WARNING), - ) - async def wait_for_interface_to_be_ready(self) -> None: - await self._rc_noop() - - async def wait_for_all_transfers_to_complete(self) -> None: - @retry( - wait=wait_fixed(1), - stop=stop_after_delay( - self._r_clone_mount_settings.R_CLONE_MOUNT_TRANSFERS_COMPLETED_TIMEOUT.total_seconds() - ), - reraise=True, - retry=retry_if_exception_type( - (_WaitingForQueueToBeEmptyError, _WaitingForTransfersToCompleteError) - ), - before_sleep=before_sleep_log(_logger, logging.WARNING), - ) - async def _() -> None: - core_stats, vfs_queue = await asyncio.gather( - self._post_core_stats(), self._post_vfs_queue() - ) - - if ( - core_stats["transfers"] != core_stats["totalTransfers"] - or "transferring" in core_stats - ): - raise _WaitingForTransfersToCompleteError - - queue = vfs_queue["queue"] - if len(queue) != 0: - raise _WaitingForQueueToBeEmptyError(queue=queue) - - await _() - - -class TrackedMount: - def __init__( - self, - settings: ApplicationSettings, - remote_type: MountRemoteType, - *, - rc_port: PortInt, - remote_path: Path, - local_mount_path: Path, - vfs_cache_path: Path, - mount_activity_update_interval: timedelta = _DEFAULT_MOUNT_ACTIVITY_UPDATE_INTERVAL, - ) -> None: - self.settings = settings - self.mount_type = remote_type - self.rc_port = rc_port - self.remote_path = remote_path - self.local_mount_path = local_mount_path - self.vfs_cache_path = vfs_cache_path - - 
self.rc_interface = RCloneRCInterfaceClient( - remote_control_port=rc_port, - r_clone_mount_settings=settings.R_CLONE_MOUNT_SETTINGS, - update_handler=self._progress_handler, - ) - self._last_mount_activity: MountActivity | None = None - self._last_mount_activity_update: datetime = datetime.fromtimestamp(0, UTC) - self._mount_activity_update_interval = mount_activity_update_interval - - # used internally to handle the mount command - self._daemon_manager: DaemonProcessManager | None = None - self._cleanup_stack = AsyncExitStack() - - async def _progress_handler(self, mount_activity: MountActivity) -> None: - now = datetime.now(UTC) - - enough_time_passed = ( - now - self._last_mount_activity_update - > self._mount_activity_update_interval - ) - - if enough_time_passed and self._last_mount_activity != mount_activity: - self._last_mount_activity = mount_activity - self._last_mount_activity_update = now - - # NOTE: this could also be useful if pushed to the UI - _logger.info( - "Activity for '%s': %s", - self.local_mount_path, - self._last_mount_activity, - ) - - async def setup(self) -> None: - pass - - async def teardown(self) -> None: - await self.stop_mount() - - async def start_mount(self) -> None: - if self._daemon_manager is not None: - raise _MountAlreadyStartedError( - pid=self._daemon_manager.pid, command=self._daemon_manager.command - ) - - config_file_path = await self._cleanup_stack.enter_async_context( - config_file(get_config_content(self.settings, self.mount_type)) - ) - - self._daemon_manager = DaemonProcessManager( - command=_get_rclone_mount_command( - config_file_path=config_file_path, - remote_path=self.remote_path, - local_mount_path=self.local_mount_path, - vfs_cache_path=self.vfs_cache_path, - rc_addr=f"0.0.0.0:{self.rc_port}", - rc_user=self.rc_interface.rc_user, - rc_password=self.rc_interface.rc_password, - ) - ) - await self._daemon_manager.start() - await self.rc_interface.setup() - await self.rc_interface.wait_for_interface_to_be_ready() 
- - async def stop_mount(self) -> None: - if self._daemon_manager is None: - return - - await self.rc_interface.wait_for_all_transfers_to_complete() - await self.rc_interface.teardown() - - await self._daemon_manager.stop() - self._daemon_manager = None - - await self._cleanup_stack.aclose() - - -class RCloneMountManager: - def __init__(self, app: FastAPI) -> None: - self.app = app - # keep track of all started mount commands via their pid and http endpoint, might need different ports for the http API - # add rc-user and rc-password the the config stored here so that nobody can access without credentials - - self._started_mounts: dict[MountID, TrackedMount] = {} - - async def start_mount(self, remote_type: MountRemoteType) -> MountID: - # create a mount via some configuration and keep track of it - pass - - async def stop_mount(self, mount_id: MountID) -> None: - pass - - async def setup(self) -> None: - pass - - async def teardown(self) -> None: - # await for all to terminate, limited gather - pass - - -# TODO: be able to mange multiple sources to be mounted - -# TODO: oauth atuthorization pattern needs to be setup for non S3 providers diff --git a/services/dynamic-sidecar/tests/unit/rclone/test__mount.py b/services/dynamic-sidecar/tests/unit/rclone/test__mount.py deleted file mode 100644 index 18e0ae6fab48..000000000000 --- a/services/dynamic-sidecar/tests/unit/rclone/test__mount.py +++ /dev/null @@ -1,521 +0,0 @@ -# pylint: disable=protected-access -# pylint: disable=redefined-outer-name -# pylint: disable=unused-argument -import os -import re -import secrets -from collections.abc import AsyncIterable, AsyncIterator -from contextlib import asynccontextmanager -from pathlib import Path -from typing import Final, cast - -import aioboto3 -import aiodocker -import aiofiles -import pytest -from aiobotocore.session import ClientCreatorContext -from aiodocker.networks import DockerNetwork -from botocore.client import Config -from faker import Faker -from 
models_library.api_schemas_storage.storage_schemas import S3BucketName -from models_library.basic_types import PortInt -from pydantic import ByteSize, TypeAdapter -from pytest_mock import MockerFixture -from pytest_simcore.helpers.monkeypatch_envs import EnvVarsDict, setenvs_from_dict -from servicelib.container_utils import run_command_in_container -from servicelib.file_utils import create_sha256_checksum -from servicelib.logging_utils import _dampen_noisy_loggers -from servicelib.utils import limited_gather -from simcore_service_dynamic_sidecar.core.settings import ApplicationSettings -from simcore_service_dynamic_sidecar.modules.rclone import _mount -from simcore_service_dynamic_sidecar.modules.rclone._config_provider import ( - MountRemoteType, -) -from simcore_service_dynamic_sidecar.modules.rclone._mount import ( - DaemonProcessManager, - TrackedMount, -) -from types_aiobotocore_s3 import S3Client - -_dampen_noisy_loggers(("botocore", "aiobotocore", "aioboto3", "moto.server")) - - -@pytest.fixture -def r_clone_version(package_dir: Path) -> str: - install_rclone_bash = ( - (package_dir / ".." / ".." / ".." 
/ "..").resolve() - / "scripts" - / "install_rclone.bash" - ) - assert install_rclone_bash.exists() - - match = re.search(r'R_CLONE_VERSION="([\d.]+)"', install_rclone_bash.read_text()) - assert match - return match.group(1) - - -@pytest.fixture -def local_s3_content_path(tmpdir: Path) -> Path: - # path where s3 are created and then uploaded form - path = Path(tmpdir) / "copy_to_s3" - path.mkdir(parents=True, exist_ok=True) - return path - - -@pytest.fixture -def r_clone_local_mount_path(tmpdir: Path) -> Path: - # where rclone mount will make the files available - path = Path(tmpdir) / "r_clone_local_mount_path" - path.mkdir(parents=True, exist_ok=True) - return path - - -@pytest.fixture -def config_path(tmpdir: Path) -> Path: - # where the configuration path for rclone is found inside the container - path = Path(tmpdir) / "config_path" - path.mkdir(parents=True, exist_ok=True) - return path - - -@pytest.fixture -def mock_config_file(config_path: Path, faker: Faker, mocker: MockerFixture) -> None: - # ensure this returns a path where the config is living which has to be mounted in the container - # replace context manager with one that writes here - @asynccontextmanager - async def config_file(config: str) -> AsyncIterator[str]: - file_path = config_path / f"{faker.uuid4()}" - file_path.write_text(config) - yield f"{file_path}" - - file_path.unlink() - - mocker.patch.object(_mount, "config_file", config_file) - - -_MONITORING_PORT: Final[PortInt] = 5572 - - -@pytest.fixture -async def docker_network() -> AsyncIterable[DockerNetwork]: - async with aiodocker.Docker() as client: - network_to_attach = await client.networks.create({"Name": "a_test_network"}) - try: - yield network_to_attach - finally: - await network_to_attach.delete() - - -@pytest.fixture -async def r_clone_container( - r_clone_version: str, - r_clone_local_mount_path: Path, - config_path: Path, - monkeypatch: pytest.MonkeyPatch, - docker_network: DockerNetwork, -) -> AsyncIterable[str]: - async with 
aiodocker.Docker() as client: - container = await client.containers.run( - config={ - "Image": f"rclone/rclone:{r_clone_version}", - "Entrypoint": ["/bin/sh", "-c", "apk add findutils && sleep 10000"], - "ExposedPorts": {f"{_MONITORING_PORT}/tcp": {}}, - "HostConfig": { - "PortBindings": { - f"{_MONITORING_PORT}/tcp": [{"HostPort": f"{_MONITORING_PORT}"}] - }, - "Binds": [ - f"{r_clone_local_mount_path}:{r_clone_local_mount_path}:rw", - f"{config_path}:{config_path}:rw", - ], - "Devices": [ - { - "PathOnHost": "/dev/fuse", - "PathInContainer": "/dev/fuse", - "CgroupPermissions": "rwm", - } - ], - "CapAdd": ["SYS_ADMIN"], - "SecurityOpt": ["apparmor:unconfined", "seccomp:unconfined"], - }, - } - ) - container_inspect = await container.show() - - container_name = container_inspect["Name"][1:] - monkeypatch.setenv("HOSTNAME", container_name) - - await docker_network.connect({"Container": container.id}) - - try: - yield container.id - finally: - await container.delete(force=True) - - -@pytest.fixture -async def moto_container(docker_network: DockerNetwork) -> AsyncIterable[None]: - async with aiodocker.Docker() as client: - container = await client.containers.run( - config={ - "Image": "motoserver/moto:latest", - "ExposedPorts": {"5000/tcp": {}}, - "HostConfig": { - "PortBindings": {"5000/tcp": [{"HostPort": "5000"}]}, - }, - "Env": ["MOTO_PORT=5000"], - }, - name="moto", - ) - await docker_network.connect({"Container": container.id}) - - try: - yield None - finally: - await container.delete(force=True) - - -async def test_daemon_container_process(r_clone_container: str): - container_process = DaemonProcessManager("sleep 10000") - await container_process.start() - assert container_process.pid - - ps_command = "ps -o pid,stat,comm" - result = await container_process._run_in_container(ps_command) # noqa: SLF001 - assert f"{container_process.pid} S" in result # check sleeping - - await container_process.stop() - await container_process._run_in_container(ps_command) # 
noqa: SLF001 - assert f"{container_process.pid} Z" not in result # check killed - - -@pytest.fixture -def mock_environment( - monkeypatch: pytest.MonkeyPatch, mock_environment: EnvVarsDict -) -> EnvVarsDict: - setenvs_from_dict( - monkeypatch, - { - "R_CLONE_PROVIDER": "AWS_MOTO", - "S3_ENDPOINT": "http://moto:5000", - "S3_ACCESS_KEY": "test", - "S3_BUCKET_NAME": "test", - "S3_SECRET_KEY": "test", - "S3_REGION": "us-east-1", - }, - ) - return mock_environment - - -@pytest.fixture -def application_settings(mock_environment: EnvVarsDict) -> ApplicationSettings: - return ApplicationSettings.create_from_envs() - - -@pytest.fixture -def remote_path() -> Path: - return Path("test") - - -@pytest.fixture -async def s3_client( - application_settings: ApplicationSettings, -) -> AsyncIterable[S3Client]: - s3_settings = application_settings.DY_SIDECAR_R_CLONE_SETTINGS.R_CLONE_S3 - session = aioboto3.Session() - session_client = session.client( - "s3", - endpoint_url=f"{s3_settings.S3_ENDPOINT}".replace("moto", "localhost"), - aws_access_key_id=s3_settings.S3_ACCESS_KEY, - aws_secret_access_key=s3_settings.S3_SECRET_KEY, - region_name=s3_settings.S3_REGION, - config=Config(signature_version="s3v4"), - ) - assert isinstance(session_client, ClientCreatorContext) # nosec - async with session_client as client: - client = cast(S3Client, client) - yield client - - -@pytest.fixture -def bucket_name(application_settings: ApplicationSettings) -> S3BucketName: - return TypeAdapter(S3BucketName).validate_python( - application_settings.DY_SIDECAR_R_CLONE_SETTINGS.R_CLONE_S3.S3_BUCKET_NAME, - ) - - -def _secure_randint(a: int, b: int) -> int: - return a + secrets.randbelow(b - a + 1) - - -_DEFAULT_CHUCNK_SIZE: Final[ByteSize] = TypeAdapter(ByteSize).validate_python("1kb") - - -async def _get_random_file( - faker: Faker, - *, - store_to: Path, - file_size: ByteSize, - chunk_size: ByteSize = _DEFAULT_CHUCNK_SIZE, -) -> Path: - # creates a file in a path and returns it's hash - # generate a 
random file of size X and a random path inside the directory - - path_in_folder = Path( - faker.file_path(depth=_secure_randint(0, 5), extension="bin") - ).relative_to("/") - file_path = store_to / path_in_folder - - # ensure parent directory exists - file_path.parent.mkdir(parents=True, exist_ok=True) - assert file_path.parent.exists() - - async with aiofiles.open(file_path, "wb") as file: - written = 0 - while written < file_size: - to_write = min(chunk_size, file_size - written) - chunk = os.urandom(to_write) - await file.write(chunk) - written += to_write - - return path_in_folder - - -def _get_random_file_size() -> ByteSize: - return TypeAdapter(ByteSize).validate_python(f"{_secure_randint(1,1024)}Kb") - - -@pytest.fixture -async def create_files_in_s3( - application_settings: ApplicationSettings, - moto_container: None, - s3_client: S3Client, - bucket_name: S3BucketName, - faker: Faker, - remote_path: Path, - local_s3_content_path: Path, -) -> AsyncIterable[None]: - - await s3_client.create_bucket(Bucket=bucket_name) - - async def _create_file() -> None: - path_in_folder = await _get_random_file( - faker, - store_to=local_s3_content_path, - file_size=_get_random_file_size(), - ) - file_path = local_s3_content_path / path_in_folder - assert file_path.exists() - await s3_client.upload_file( - Filename=f"{file_path}", - Bucket=bucket_name, - Key=f"{remote_path/path_in_folder}", - ) - - files_to_create = _secure_randint(5, 20) - await limited_gather(*[_create_file() for _ in range(files_to_create)], limit=5) - - yield None - - files_in_bucket = await s3_client.list_objects_v2(Bucket=bucket_name) - - await limited_gather( - *[ - s3_client.delete_object(Bucket=bucket_name, Key=obj["Key"]) - for obj in files_in_bucket.get("Contents", []) - ], - limit=10, - ) - - # check all content form s3 was removed - files_in_bucket = await s3_client.list_objects_v2(Bucket=bucket_name) - assert files_in_bucket.get("Contents", []) == [] - - -@pytest.fixture -def 
mock_default_remote_control_host(mocker: MockerFixture) -> None: - mocker.patch( - "simcore_service_dynamic_sidecar.modules.rclone._mount._DEFAULT_REMOTE_CONTROL_HOST", - "0.0.0.0", # noqa: S104 - ) - - -@pytest.fixture -def vfs_cache_path(tmpdir: Path) -> Path: - # path inside the docker container where the vfs cache will be stored - # for tests this can be just placed in the tmp directory ? - # TODO: for better tests it's better that is mounted as a volume - return Path("/tmp/rclone_cache") # noqa: S108 - - -@pytest.fixture -async def tracked_mount( - mock_default_remote_control_host: None, - r_clone_container: str, - mock_config_file: None, - application_settings: ApplicationSettings, - remote_path: Path, - r_clone_local_mount_path: Path, - vfs_cache_path: Path, -) -> AsyncIterable[TrackedMount]: - tracked_mount = TrackedMount( - application_settings, - MountRemoteType.S3, - rc_port=_MONITORING_PORT, - remote_path=remote_path, - local_mount_path=r_clone_local_mount_path, - vfs_cache_path=vfs_cache_path, - ) - await tracked_mount.setup() - - yield tracked_mount - - await tracked_mount.teardown() - - -async def _get_file_checksums_from_local_path( - local_s3_content_path: Path, -) -> dict[Path, str]: - local_checksums = {} - for dirpath, _, filenames in os.walk(local_s3_content_path): - for filename in filenames: - file_path = Path(dirpath) / filename - relative_path = file_path.relative_to(local_s3_content_path) - - async with aiofiles.open(file_path, "rb") as file: - checksum = await create_sha256_checksum(file) - - local_checksums[relative_path] = checksum - return local_checksums - - -async def _get_file_checksums_from_container( - remote_path: Path, - r_clone_container: str, - bucket_name: S3BucketName, -) -> dict[Path, str]: - remote_checksum_and_files = await run_command_in_container( - r_clone_container, - command=f"find {remote_path} -type f -exec sha256sum {{}} \\;", - timeout=30, - ) - - def _parse_entry(entry: str) -> tuple[Path, str]: - checksum, 
file_path = entry.strip().split() - relative_path = ( - Path(file_path).relative_to(remote_path).relative_to(Path(bucket_name)) - ) - return relative_path, checksum - - return dict( - [_parse_entry(x) for x in remote_checksum_and_files.strip().split("\n")] - ) - - -async def _get_files_from_s3( - s3_client: S3Client, - bucket_name: S3BucketName, -) -> dict[Path, str]: - """Download files from S3 and return their SHA256 checksums.""" - files_in_bucket = await s3_client.list_objects_v2(Bucket=bucket_name) - - async def _get_file_checksum(key: str) -> tuple[Path, str]: - response = await s3_client.get_object(Bucket=bucket_name, Key=key) - checksum = await create_sha256_checksum(response["Body"]) - return Path(key).relative_to(Path(bucket_name)), checksum - - results = await limited_gather( - *[ - _get_file_checksum(obj["Key"]) - for obj in files_in_bucket.get("Contents", []) - ], - limit=10, - ) - - return dict(results) - - -async def _assert_local_content_in_s3( - s3_client: S3Client, - bucket_name: S3BucketName, - local_s3_content_path: Path, -) -> None: - files_local_folder = await _get_file_checksums_from_local_path( - local_s3_content_path - ) - files_from_s3 = await _get_files_from_s3(s3_client, bucket_name) - - assert files_local_folder == files_from_s3 - - -async def _assert_same_files_in_all_places( - s3_client: S3Client, - bucket_name: S3BucketName, - r_clone_container: str, - r_clone_local_mount_path: Path, -) -> None: - files_from_container = await _get_file_checksums_from_container( - r_clone_local_mount_path, r_clone_container, bucket_name - ) - files_from_s3 = await _get_files_from_s3(s3_client, bucket_name) - assert files_from_container == files_from_s3 - - -async def _change_file_in_container(remote_path: Path, r_clone_container: str) -> None: - await run_command_in_container( - r_clone_container, - command=f"dd if=/dev/urandom of={remote_path} bs={_get_random_file_size()} count=1", - timeout=30, - ) - - -async def test_r_clone_mount( - 
create_files_in_s3: None, - tracked_mount: TrackedMount, - r_clone_local_mount_path: Path, - # maybe drop - s3_client: S3Client, - bucket_name: S3BucketName, - r_clone_container: str, - local_s3_content_path: Path, -): - await tracked_mount.start_mount() - - await _assert_local_content_in_s3(s3_client, bucket_name, local_s3_content_path) - - def _get_random_file_in_container() -> Path: - return ( - r_clone_local_mount_path - / bucket_name - / secrets.choice( - [x for x in local_s3_content_path.rglob("*") if x.is_file()] - ).relative_to(local_s3_content_path) - ) - - # change and check all is the same - files_to_change = {_get_random_file_in_container() for _ in range(15)} - await limited_gather( - *[_change_file_in_container(x, r_clone_container) for x in files_to_change], - limit=10, - ) - - await tracked_mount.rc_interface.wait_for_all_transfers_to_complete() - await _assert_same_files_in_all_places( - s3_client, - bucket_name, - r_clone_container, - r_clone_local_mount_path, - ) - - await tracked_mount.stop_mount() - - -# TODO: -# better real world tests -# use this to mount a folder like the node directly in a separate path on the sidecar and expose this -# to the users somehow -# so that we can use it form a jupyter-math - - -# TODO: we need a mode to check if rclone mount properly resumes the mounting in case of crash and restart -# we need a test for this one From 0bca30e9a23bc50e3fafcf5caece9f43956e59af Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Wed, 10 Dec 2025 13:13:21 +0100 Subject: [PATCH 12/79] refactored --- .../core/settings.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/core/settings.py b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/core/settings.py index bf2a0a63dcb1..78ebce5d2c30 100644 --- a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/core/settings.py +++ 
b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/core/settings.py @@ -47,19 +47,6 @@ class ResourceTrackingSettings(BaseApplicationSettings): ) -class RCloneMountSettings(BaseApplicationSettings): - R_CLONE_MOUNT_TRANSFERS_COMPLETED_TIMEOUT: timedelta = Field( - default=timedelta(minutes=60), - description="max amount of time to wait when closing the rclone mount", - ) - - _validate_r_clone_mount_transfers_completed_timeout = ( - validate_numeric_string_as_timedelta( - "R_CLONE_MOUNT_TRANSFERS_COMPLETED_TIMEOUT" - ) - ) - - class SystemMonitorSettings(BaseApplicationSettings): DY_SIDECAR_SYSTEM_MONITOR_TELEMETRY_ENABLE: bool = Field( default=False, description="enabled/disabled disk usage monitoring" @@ -215,10 +202,6 @@ class ApplicationSettings(BaseApplicationSettings, MixinLoggingSettings): description="settings for opentelemetry tracing", ) - R_CLONE_MOUNT_SETTINGS: RCloneMountSettings = Field( - json_schema_extra={"auto_default_from_env": True} - ) - @property def are_prometheus_metrics_enabled(self) -> bool: return ( # pylint: disable=no-member From 52e32b2a85029c964fb6656c337dd502693a592d Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Wed, 10 Dec 2025 13:20:08 +0100 Subject: [PATCH 13/79] refactor --- .../_r_clone_mount/__init__.py | 8 ++++-- .../node_ports_common/_r_clone_mount/_core.py | 26 +++++++++++++++---- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/_r_clone_mount/__init__.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/_r_clone_mount/__init__.py index cac25b70ce20..c7541ea95de3 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/_r_clone_mount/__init__.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/_r_clone_mount/__init__.py @@ -1,3 +1,7 @@ -from ._core import RCloneMountManager +from ._core import MountAlreadyStartedError, MountNotStartedError, RCloneMountManager -__all__: tuple[str, ...] 
= ("RCloneMountManager",) +__all__: tuple[str, ...] = ( + "MountAlreadyStartedError", + "MountNotStartedError", + "RCloneMountManager", +) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/_r_clone_mount/_core.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/_r_clone_mount/_core.py index d7d72e628c4f..962321b21c42 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/_r_clone_mount/_core.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/_r_clone_mount/_core.py @@ -48,7 +48,7 @@ class _ProcessAlreadyStartedError(_BaseRcloneMountError): msg_template: str = "Process already started with pid='{pid}' via '{command}'" -class _MountAlreadyStartedError(_BaseRcloneMountError): +class _TrackedMountAlreadyStartedError(_BaseRcloneMountError): msg_template: str = ( "Mount process already stareted with pid='{pid}' via '{command}'" ) @@ -62,6 +62,14 @@ class _WaitingForQueueToBeEmptyError(_BaseRcloneMountError): msg_template: str = "Waiting for VFS queue to be empty: queue={queue}" +class MountAlreadyStartedError(_BaseRcloneMountError): + msg_template: str = "Mount already started for local path='{local_mount_path}'" + + +class MountNotStartedError(_BaseRcloneMountError): + msg_template: str = "Mount not started for local path='{local_mount_path}'" + + def _get_command__pid_of_background_command(command: str) -> str: return f"sh -c '{command} & echo $!'" @@ -337,7 +345,7 @@ async def teardown(self) -> None: async def start_mount(self) -> None: if self._daemon_manager is not None: - raise _MountAlreadyStartedError( + raise _TrackedMountAlreadyStartedError( pid=self._daemon_manager.pid, command=self._daemon_manager.command ) @@ -394,6 +402,10 @@ async def start_mount( vfs_cache_path_overwrite: Path | None = None, ) -> None: mount_id = self._get_mount_id(local_mount_path) + if mount_id in self._started_mounts: + tracked_mount = self._started_mounts[mount_id] + raise MountAlreadyStartedError(local_mount_path=local_mount_path) + 
vfs_cache_path = ( vfs_cache_path_overwrite or self.common_vfs_cache_path ) / mount_id @@ -415,21 +427,25 @@ async def start_mount( async def wait_for_transfers_to_complete(self, local_mount_path: Path) -> None: mount_id = self._get_mount_id(local_mount_path) - tracked_mount = self._started_mounts[mount_id] + if mount_id not in self._started_mounts: + raise MountNotStartedError(local_mount_path=local_mount_path) + tracked_mount = self._started_mounts[mount_id] await tracked_mount.rc_interface.wait_for_all_transfers_to_complete() async def stop_mount(self, local_mount_path: Path) -> None: mount_id = self._get_mount_id(local_mount_path) - tracked_mount = self._started_mounts[mount_id] + if mount_id not in self._started_mounts: + raise MountNotStartedError(local_mount_path=local_mount_path) + tracked_mount = self._started_mounts[mount_id] await tracked_mount.stop_mount() async def setup(self) -> None: pass async def teardown(self) -> None: - # await for all to terminate, limited gather + # shutdown still ongoing mounts await asyncio.gather( *[mount.teardown() for mount in self._started_mounts.values()] ) From fcec8a1007f0edffd35d52446f62ff1fc182e838 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Wed, 10 Dec 2025 13:41:56 +0100 Subject: [PATCH 14/79] added migration --- ...a01f1c_added_use_r_clone_mounting_field.py | 36 +++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 packages/postgres-database/src/simcore_postgres_database/migration/versions/611b7fa01f1c_added_use_r_clone_mounting_field.py diff --git a/packages/postgres-database/src/simcore_postgres_database/migration/versions/611b7fa01f1c_added_use_r_clone_mounting_field.py b/packages/postgres-database/src/simcore_postgres_database/migration/versions/611b7fa01f1c_added_use_r_clone_mounting_field.py new file mode 100644 index 000000000000..fa93517b25fc --- /dev/null +++ 
b/packages/postgres-database/src/simcore_postgres_database/migration/versions/611b7fa01f1c_added_use_r_clone_mounting_field.py @@ -0,0 +1,36 @@ +"""added use_r_clone_mounting field + +Revision ID: 611b7fa01f1c +Revises: a85557c02d71 +Create Date: 2025-12-10 12:40:46.573251+00:00 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "611b7fa01f1c" +down_revision = "a85557c02d71" +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.add_column( + "groups_extra_properties", + sa.Column( + "use_r_clone_mounting", + sa.Boolean(), + server_default=sa.text("false"), + nullable=False, + ), + ) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.drop_column("groups_extra_properties", "use_r_clone_mounting") + # ### end Alembic commands ### From b844c137a5ff9babce413bd73fc15db8de881286 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Wed, 10 Dec 2025 13:42:22 +0100 Subject: [PATCH 15/79] added user_r_clone_mounting --- .../models/groups_extra_properties.py | 7 +++++++ .../utils_groups_extra_properties.py | 1 + 2 files changed, 8 insertions(+) diff --git a/packages/postgres-database/src/simcore_postgres_database/models/groups_extra_properties.py b/packages/postgres-database/src/simcore_postgres_database/models/groups_extra_properties.py index e25a1bd3b2bd..84e28a42c884 100644 --- a/packages/postgres-database/src/simcore_postgres_database/models/groups_extra_properties.py +++ b/packages/postgres-database/src/simcore_postgres_database/models/groups_extra_properties.py @@ -68,6 +68,13 @@ server_default=sa.sql.expression.false(), doc="If true, will mount efs distributed file system when dynamic services starts", ), + sa.Column( + "use_r_clone_mounting", + sa.Boolean(), + nullable=False, + server_default=sa.sql.expression.false(), + doc="If true, will mount efs distributed 
file system when dynamic services starts", + ), sa.UniqueConstraint( "group_id", "product_name", name="group_id_product_name_uniqueness" ), diff --git a/packages/postgres-database/src/simcore_postgres_database/utils_groups_extra_properties.py b/packages/postgres-database/src/simcore_postgres_database/utils_groups_extra_properties.py index b1cb32abf9f2..e645bcda535a 100644 --- a/packages/postgres-database/src/simcore_postgres_database/utils_groups_extra_properties.py +++ b/packages/postgres-database/src/simcore_postgres_database/utils_groups_extra_properties.py @@ -40,6 +40,7 @@ class GroupExtraProperties(FromRowMixin): created: datetime.datetime modified: datetime.datetime enable_efs: bool + use_r_clone_mounting: bool def _list_table_entries_ordered_by_group_type_stmt(user_id: int, product_name: str): From 3a986f052e3574c8a5833fc743dd5b4503060248 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Wed, 10 Dec 2025 14:12:58 +0100 Subject: [PATCH 16/79] added new interface --- .../core/application.py | 3 +++ .../modules/long_running_tasks.py | 8 ++++++ .../modules/r_clone_mount_manager.py | 27 +++++++++++++++++++ 3 files changed, 38 insertions(+) create mode 100644 services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/r_clone_mount_manager.py diff --git a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/core/application.py b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/core/application.py index 75461aaeda0c..c9b8bf15718d 100644 --- a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/core/application.py +++ b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/core/application.py @@ -29,6 +29,7 @@ from ..modules.notifications import setup_notifications from ..modules.outputs import setup_outputs from ..modules.prometheus_metrics import setup_prometheus_metrics +from ..modules.r_clone_mount_manager import setup_r_clone_mount_manager from ..modules.resource_tracking import setup_resource_tracking from 
..modules.system_monitor import setup_system_monitor from ..modules.user_services_preferences import setup_user_services_preferences @@ -193,6 +194,8 @@ def create_app() -> FastAPI: setup_user_services_preferences(app) + setup_r_clone_mount_manager(app) + if application_settings.are_prometheus_metrics_enabled: setup_prometheus_metrics(app) diff --git a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py index 9387a3867fae..69a12ea5c1fa 100644 --- a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py +++ b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py @@ -55,6 +55,7 @@ from ..modules.mounted_fs import MountedVolumes from ..modules.notifications._notifications_ports import PortNotifier from ..modules.outputs import OutputsManager, event_propagation_disabled +from ..modules.r_clone_mount_manager import get_r_clone_mount_manager from .long_running_tasks_utils import ( ensure_read_permissions_on_user_service_data, run_before_shutdown_actions, @@ -351,7 +352,9 @@ async def _restore_state_folder( progress_bar: ProgressBarData, state_path: Path, ) -> None: + assert settings.DY_SIDECAR_PRODUCT_NAME is not None # nosec await data_manager.pull( + product_name=settings.DY_SIDECAR_PRODUCT_NAME, user_id=settings.DY_SIDECAR_USER_ID, project_id=settings.DY_SIDECAR_PROJECT_ID, node_uuid=settings.DY_SIDECAR_NODE_ID, @@ -362,6 +365,8 @@ async def _restore_state_folder( r_clone_settings=settings.DY_SIDECAR_R_CLONE_SETTINGS, progress_bar=progress_bar, legacy_state=_get_legacy_state_with_dy_volumes_path(settings), + application_name=f"{APP_NAME}-{settings.DY_SIDECAR_NODE_ID}", + mount_manager=get_r_clone_mount_manager(app), ) @@ -424,7 +429,9 @@ async def _save_state_folder( state_path: Path, mounted_volumes: MountedVolumes, ) -> None: + assert 
settings.DY_SIDECAR_PRODUCT_NAME is not None # nosec await data_manager.push( + product_name=settings.DY_SIDECAR_PRODUCT_NAME, user_id=settings.DY_SIDECAR_USER_ID, project_id=settings.DY_SIDECAR_PROJECT_ID, node_uuid=settings.DY_SIDECAR_NODE_ID, @@ -437,6 +444,7 @@ async def _save_state_folder( progress_bar=progress_bar, legacy_state=_get_legacy_state_with_dy_volumes_path(settings), application_name=f"{APP_NAME}-{settings.DY_SIDECAR_NODE_ID}", + mount_manager=get_r_clone_mount_manager(app), ) diff --git a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/r_clone_mount_manager.py b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/r_clone_mount_manager.py new file mode 100644 index 000000000000..6507ae4a54f8 --- /dev/null +++ b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/r_clone_mount_manager.py @@ -0,0 +1,27 @@ +from fastapi import FastAPI +from simcore_sdk.node_ports_common.r_clone_mount import RCloneMountManager + +from ..core.settings import ApplicationSettings + + +def setup_r_clone_mount_manager(app: FastAPI): + settings: ApplicationSettings = app.state.settings + + async def _on_startup() -> None: + + app.state.r_clone_mount_manager = r_clone_mount_manager = RCloneMountManager( + settings.DY_SIDECAR_R_CLONE_SETTINGS + ) + await r_clone_mount_manager.setup() + + async def _on_shutdown() -> None: + r_clone_mount_manager: RCloneMountManager = app.state.r_clone_mount_manager + await r_clone_mount_manager.teardown() + + app.add_event_handler("startup", _on_startup) + app.add_event_handler("shutdown", _on_shutdown) + + +def get_r_clone_mount_manager(app: FastAPI) -> RCloneMountManager: + assert isinstance(app.state.r_clone_mount_manager, RCloneMountManager) # nosec + return app.state.r_clone_mount_manager From e2e57101cc1dfd3b5a55a87d018f7a028b3252f3 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Wed, 10 Dec 2025 14:45:06 +0100 Subject: [PATCH 17/79] refactored --- .../src/settings_library/r_clone.py 
| 18 ++- .../src/simcore_sdk/node_data/data_manager.py | 103 ++++++++++++++-- .../node_ports_common/dbmanager.py | 18 +++ .../__init__.py | 2 + .../_config_provider.py | 0 .../_core.py | 23 ++-- ..._node_ports_common__r_clone_mount__core.py | 112 +++++++++--------- 7 files changed, 193 insertions(+), 83 deletions(-) rename packages/simcore-sdk/src/simcore_sdk/node_ports_common/{_r_clone_mount => r_clone_mount}/__init__.py (74%) rename packages/simcore-sdk/src/simcore_sdk/node_ports_common/{_r_clone_mount => r_clone_mount}/_config_provider.py (100%) rename packages/simcore-sdk/src/simcore_sdk/node_ports_common/{_r_clone_mount => r_clone_mount}/_core.py (95%) diff --git a/packages/settings-library/src/settings_library/r_clone.py b/packages/settings-library/src/settings_library/r_clone.py index b0d7e20b77c3..974dfb0f5915 100644 --- a/packages/settings-library/src/settings_library/r_clone.py +++ b/packages/settings-library/src/settings_library/r_clone.py @@ -1,5 +1,6 @@ from datetime import timedelta from enum import StrEnum +from pathlib import Path from typing import Annotated from common_library.pydantic_validators import validate_numeric_string_as_timedelta @@ -19,9 +20,20 @@ class S3Provider(StrEnum): class RCloneMountSettings(BaseCustomSettings): """all settings related to mounting go here""" - R_CLONE_MOUNT_TRANSFERS_COMPLETED_TIMEOUT: timedelta = Field( - default=timedelta(minutes=60), - description="max amount of time to wait when closing the rclone mount", + R_CLONE_MOUNT_TRANSFERS_COMPLETED_TIMEOUT: Annotated[ + timedelta, + Field( + description="max amount of time to wait when closing the rclone mount", + ), + ] = timedelta(minutes=60) + + R_CLONE_MOUNT_VFS_CACHE_PATH: Annotated[ + Path, + Field( + description="common directory where all vfs-caches will be mounted to", + ), + ] = Path( + "/tmp/vfs-caching" # noqa: S108 ) _validate_r_clone_mount_transfers_completed_timeout = ( diff --git a/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py 
b/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py index fbb9b1980110..7b39a44102c2 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py @@ -2,6 +2,7 @@ from pathlib import Path from tempfile import TemporaryDirectory +from models_library.products import ProductName from models_library.projects import ProjectID from models_library.projects_nodes_io import NodeID, StorageFileID from models_library.service_settings_labels import LegacyState @@ -16,6 +17,7 @@ from ..node_ports_common.constants import SIMCORE_LOCATION from ..node_ports_common.dbmanager import DBManager from ..node_ports_common.file_io_utils import LogRedirectCB +from ..node_ports_common.r_clone_mount import MountRemoteType, RCloneMountManager _logger = logging.getLogger(__name__) @@ -182,7 +184,25 @@ async def _delete_legacy_archive( ) -async def push( # pylint: disable=too-many-arguments +async def _use_r_clone_mounting( + application_name: str, user_id: UserID, product_name: ProductName +) -> bool: + return ( + await DBManager(application_name=application_name).get_group_extra_properties( + user_id=user_id, product_name=product_name + ) + ).use_r_clone_mounting + + +async def _stop_mount( + mount_manager: RCloneMountManager, destination_path: Path +) -> None: + await mount_manager.wait_for_transfers_to_complete(destination_path) + await mount_manager.stop_mount(destination_path) + + +async def push( # pylint: disable=too-many-arguments # noqa: PLR0913 + product_name: ProductName, user_id: UserID, project_id: ProjectID, node_uuid: NodeID, @@ -194,19 +214,23 @@ async def push( # pylint: disable=too-many-arguments progress_bar: ProgressBarData, legacy_state: LegacyState | None, application_name: str, + mount_manager: RCloneMountManager, ) -> None: """pushes and removes the legacy archive if present""" - await _push_directory( - user_id=user_id, - project_id=project_id, - node_uuid=node_uuid, - 
source_path=source_path, - r_clone_settings=r_clone_settings, - exclude_patterns=exclude_patterns, - io_log_redirect_cb=io_log_redirect_cb, - progress_bar=progress_bar, - ) + if _use_r_clone_mounting(application_name, user_id, product_name): + await _stop_mount(mount_manager, source_path) + else: + await _push_directory( + user_id=user_id, + project_id=project_id, + node_uuid=node_uuid, + source_path=source_path, + r_clone_settings=r_clone_settings, + exclude_patterns=exclude_patterns, + io_log_redirect_cb=io_log_redirect_cb, + progress_bar=progress_bar, + ) archive_exists = await _state_metadata_entry_exists( user_id=user_id, @@ -244,7 +268,24 @@ async def push( # pylint: disable=too-many-arguments ) -async def pull( +async def _start_mount_if_required( + mount_manager: RCloneMountManager, + application_name: str, + product_name: ProductName, + user_id: UserID, + project_id: ProjectID, + node_uuid: NodeID, + destination_path: Path, +) -> None: + if not _use_r_clone_mounting(application_name, user_id, product_name): + return + + s3_object = __create_s3_object_key(project_id, node_uuid, destination_path) + await mount_manager.start_mount(MountRemoteType.S3, s3_object, destination_path) + + +async def pull( # pylint: disable=too-many-arguments # noqa: PLR0913 + product_name: ProductName, user_id: UserID, project_id: ProjectID, node_uuid: NodeID, @@ -254,6 +295,8 @@ async def pull( r_clone_settings: RCloneSettings, progress_bar: ProgressBarData, legacy_state: LegacyState | None, + application_name: str, + mount_manager: RCloneMountManager, ) -> None: """restores the state folder""" @@ -286,6 +329,15 @@ async def pull( progress_bar=progress_bar, legacy_destination_path=legacy_state.old_state_path, ) + await _start_mount_if_required( + mount_manager, + application_name, + product_name, + user_id, + project_id, + node_uuid, + destination_path, + ) return state_archive_exists = await _state_metadata_entry_exists( @@ -305,6 +357,15 @@ async def pull( 
io_log_redirect_cb=io_log_redirect_cb, progress_bar=progress_bar, ) + await _start_mount_if_required( + mount_manager, + application_name, + product_name, + user_id, + project_id, + node_uuid, + destination_path, + ) return state_directory_exists = await _state_metadata_entry_exists( @@ -324,6 +385,24 @@ async def pull( r_clone_settings=r_clone_settings, progress_bar=progress_bar, ) + await _start_mount_if_required( + mount_manager, + application_name, + product_name, + user_id, + project_id, + node_uuid, + destination_path, + ) return + await _start_mount_if_required( + mount_manager, + application_name, + product_name, + user_id, + project_id, + node_uuid, + destination_path, + ) _logger.debug("No content previously saved for '%s'", destination_path) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/dbmanager.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/dbmanager.py index 21c0f0173b91..d74656d63eb2 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/dbmanager.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/dbmanager.py @@ -2,6 +2,7 @@ import sqlalchemy as sa from common_library.json_serialization import json_dumps, json_loads +from models_library.products import ProductName from models_library.projects import ProjectID from models_library.users import UserID from pydantic import TypeAdapter @@ -13,6 +14,10 @@ update_for_run_id_and_node_id, ) from simcore_postgres_database.utils_comp_runs import get_latest_run_id_for_project +from simcore_postgres_database.utils_groups_extra_properties import ( + GroupExtraProperties, + GroupExtraPropertiesRepo, +) from sqlalchemy.ext.asyncio import AsyncConnection, AsyncEngine from .exceptions import NodeNotFound, ProjectNotFoundError @@ -192,3 +197,16 @@ async def get_project_owner_user_id(self, project_id: ProjectID) -> UserID: if prj_owner is None: raise ProjectNotFoundError(project_id) return TypeAdapter(UserID).validate_python(prj_owner) + + async def 
get_group_extra_properties( + self, user_id: UserID, product_name: ProductName + ) -> GroupExtraProperties: + async with ( + DBContextManager( + self._db_engine, application_name=self._application_name + ) as engine, + engine.connect() as connection, + ): + return await GroupExtraPropertiesRepo.get_aggregated_properties_for_user( + connection, user_id=user_id, product_name=product_name + ) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/_r_clone_mount/__init__.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/__init__.py similarity index 74% rename from packages/simcore-sdk/src/simcore_sdk/node_ports_common/_r_clone_mount/__init__.py rename to packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/__init__.py index c7541ea95de3..106fadfb009f 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/_r_clone_mount/__init__.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/__init__.py @@ -1,7 +1,9 @@ +from ._config_provider import MountRemoteType from ._core import MountAlreadyStartedError, MountNotStartedError, RCloneMountManager __all__: tuple[str, ...] 
= ( "MountAlreadyStartedError", "MountNotStartedError", + "MountRemoteType", "RCloneMountManager", ) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/_r_clone_mount/_config_provider.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_config_provider.py similarity index 100% rename from packages/simcore-sdk/src/simcore_sdk/node_ports_common/_r_clone_mount/_config_provider.py rename to packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_config_provider.py diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/_r_clone_mount/_core.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py similarity index 95% rename from packages/simcore-sdk/src/simcore_sdk/node_ports_common/_r_clone_mount/_core.py rename to packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py index 962321b21c42..4fa24b89902d 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/_r_clone_mount/_core.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py @@ -13,6 +13,7 @@ from httpx import AsyncClient from models_library.basic_types import PortInt from models_library.progress_bar import ProgressReport +from models_library.projects_nodes_io import StorageFileID from pydantic import BaseModel, NonNegativeFloat from servicelib.container_utils import run_command_in_container from servicelib.logging_utils import log_catch @@ -118,7 +119,7 @@ async def stop(self): def _get_rclone_mount_command( config_file_path: str, - remote_path: Path, + remote_path: StorageFileID, local_mount_path: Path, vfs_cache_path: Path, rc_addr: str, @@ -138,8 +139,8 @@ def _get_rclone_mount_command( "--vfs-cache-mode full", "--vfs-write-back", "1s", # write-back delay TODO: could be part of the settings? 
- "--vfs-fast-fingerprint", # recommended for s3 backend - "--no-modtime", # don't read/write the modification time + "--vfs-fast-fingerprint", # recommended for s3 backend TODO: could be part of the settings? + "--no-modtime", # don't read/write the modification time TODO: could be part of the settings? "--cache-dir", f"{vfs_cache_path}", "--rc", @@ -296,7 +297,7 @@ def __init__( remote_type: MountRemoteType, *, rc_port: PortInt, - remote_path: Path, + remote_path: StorageFileID, local_mount_path: Path, vfs_cache_path: Path, mount_activity_update_interval: timedelta = _DEFAULT_MOUNT_ACTIVITY_UPDATE_INTERVAL, @@ -356,7 +357,7 @@ async def start_mount(self) -> None: self._daemon_manager = DaemonProcessManager( command=_get_rclone_mount_command( config_file_path=config_file_path, - remote_path=self.remote_path, + remote_path=f"{self.r_clone_settings.R_CLONE_S3.S3_BUCKET_NAME}/{self.remote_path}", local_mount_path=self.local_mount_path, vfs_cache_path=self.vfs_cache_path, rc_addr=f"0.0.0.0:{self.rc_port}", @@ -382,11 +383,11 @@ async def stop_mount(self) -> None: class RCloneMountManager: - def __init__( - self, r_clone_settings: RCloneSettings, common_vfs_cache_path: Path - ) -> None: + def __init__(self, r_clone_settings: RCloneSettings) -> None: self.r_clone_settings = r_clone_settings - self.common_vfs_cache_path = common_vfs_cache_path + self._common_vfs_cache_path = ( + self.r_clone_settings.R_CLONE_MOUNT_SETTINGS.R_CLONE_MOUNT_VFS_CACHE_PATH + ) self._started_mounts: dict[str, TrackedMount] = {} @@ -397,7 +398,7 @@ def _get_mount_id(local_mount_path: Path) -> str: async def start_mount( self, remote_type: MountRemoteType, - remote_path: Path, + remote_path: StorageFileID, local_mount_path: Path, vfs_cache_path_overwrite: Path | None = None, ) -> None: @@ -407,7 +408,7 @@ async def start_mount( raise MountAlreadyStartedError(local_mount_path=local_mount_path) vfs_cache_path = ( - vfs_cache_path_overwrite or self.common_vfs_cache_path + vfs_cache_path_overwrite 
or self._common_vfs_cache_path ) / mount_id vfs_cache_path.mkdir(parents=True, exist_ok=True) diff --git a/packages/simcore-sdk/tests/unit/test_node_ports_common__r_clone_mount__core.py b/packages/simcore-sdk/tests/unit/test_node_ports_common__r_clone_mount__core.py index a89ba51623d3..0ed2c0926f2a 100644 --- a/packages/simcore-sdk/tests/unit/test_node_ports_common__r_clone_mount__core.py +++ b/packages/simcore-sdk/tests/unit/test_node_ports_common__r_clone_mount__core.py @@ -19,6 +19,7 @@ from faker import Faker from models_library.api_schemas_storage.storage_schemas import S3BucketName from models_library.basic_types import PortInt +from models_library.projects_nodes_io import StorageFileID from pydantic import ByteSize, TypeAdapter from pytest_mock import MockerFixture from pytest_simcore.helpers.monkeypatch_envs import EnvVarsDict, setenvs_from_dict @@ -27,11 +28,11 @@ from servicelib.logging_utils import _dampen_noisy_loggers from servicelib.utils import limited_gather from settings_library.r_clone import RCloneSettings -from simcore_sdk.node_ports_common._r_clone_mount import RCloneMountManager, _core -from simcore_sdk.node_ports_common._r_clone_mount._config_provider import ( +from simcore_sdk.node_ports_common.r_clone_mount import RCloneMountManager, _core +from simcore_sdk.node_ports_common.r_clone_mount._config_provider import ( MountRemoteType, ) -from simcore_sdk.node_ports_common._r_clone_mount._core import ( +from simcore_sdk.node_ports_common.r_clone_mount._core import ( DaemonProcessManager, ) from types_aiobotocore_s3 import S3Client @@ -189,16 +190,39 @@ async def test_daemon_container_process(r_clone_container: str): @pytest.fixture -def mock_environment(monkeypatch: pytest.MonkeyPatch) -> EnvVarsDict: +def vfs_cache_path(tmpdir: Path) -> Path: + # path inside the docker container where the vfs cache will be stored + # for tests this can be just placed in the tmp directory ? 
+ # TODO: for better tests it's better that is mounted as a volume + return Path("/tmp/rclone_cache") # noqa: S108 + + +@pytest.fixture +def remote_path(faker: Faker) -> StorageFileID: + return TypeAdapter(StorageFileID).validate_python( + f"{faker.uuid4()}/{faker.uuid4()}/mounted-path" + ) + + +@pytest.fixture +def bucket_name() -> S3BucketName: + return TypeAdapter(S3BucketName).validate_python("osparc-data") + + +@pytest.fixture +def mock_environment( + monkeypatch: pytest.MonkeyPatch, vfs_cache_path: Path, bucket_name: S3BucketName +) -> EnvVarsDict: return setenvs_from_dict( monkeypatch, { "R_CLONE_PROVIDER": "AWS_MOTO", "S3_ENDPOINT": "http://moto:5000", "S3_ACCESS_KEY": "test", - "S3_BUCKET_NAME": "test", + "S3_BUCKET_NAME": bucket_name, "S3_SECRET_KEY": "test", "S3_REGION": "us-east-1", + "R_CLONE_MOUNT_VFS_CACHE_PATH": f"{vfs_cache_path}", }, ) @@ -208,11 +232,6 @@ def r_clone_settings(mock_environment: EnvVarsDict) -> RCloneSettings: return RCloneSettings.create_from_envs() -@pytest.fixture -def remote_path() -> Path: - return Path("test") - - @pytest.fixture async def s3_client(r_clone_settings: RCloneSettings) -> AsyncIterable[S3Client]: s3_settings = r_clone_settings.R_CLONE_S3 @@ -231,13 +250,6 @@ async def s3_client(r_clone_settings: RCloneSettings) -> AsyncIterable[S3Client] yield client -@pytest.fixture -def bucket_name(r_clone_settings: RCloneSettings) -> S3BucketName: - return TypeAdapter(S3BucketName).validate_python( - r_clone_settings.R_CLONE_S3.S3_BUCKET_NAME - ) - - def _secure_randint(a: int, b: int) -> int: return a + secrets.randbelow(b - a + 1) @@ -286,7 +298,7 @@ async def create_files_in_s3( s3_client: S3Client, bucket_name: S3BucketName, faker: Faker, - remote_path: Path, + remote_path: StorageFileID, local_s3_content_path: Path, ) -> AsyncIterable[None]: @@ -329,28 +341,19 @@ async def _create_file() -> None: @pytest.fixture def mock_rc_port_with_default(mocker: MockerFixture) -> None: mocker.patch( - 
"simcore_sdk.node_ports_common._r_clone_mount._core.unused_port", + "simcore_sdk.node_ports_common.r_clone_mount._core.unused_port", return_value=_MONITORING_PORT, ) -@pytest.fixture -def vfs_cache_path(tmpdir: Path) -> Path: - # path inside the docker container where the vfs cache will be stored - # for tests this can be just placed in the tmp directory ? - # TODO: for better tests it's better that is mounted as a volume - return Path("/tmp/rclone_cache") # noqa: S108 - - @pytest.fixture async def single_mount_r_clone_mount_manager( mock_rc_port_with_default: None, r_clone_container: str, mock_config_file: None, r_clone_settings: RCloneSettings, - vfs_cache_path: Path, ) -> AsyncIterable[RCloneMountManager]: - r_clone_mount_manager = RCloneMountManager(r_clone_settings, vfs_cache_path) + r_clone_mount_manager = RCloneMountManager(r_clone_settings) yield r_clone_mount_manager @@ -374,21 +377,17 @@ async def _get_file_checksums_from_local_path( async def _get_file_checksums_from_container( - remote_path: Path, - r_clone_container: str, - bucket_name: S3BucketName, + path_in_container: Path, r_clone_container: str ) -> dict[Path, str]: remote_checksum_and_files = await run_command_in_container( r_clone_container, - command=f"find {remote_path} -type f -exec sha256sum {{}} \\;", + command=f"find {path_in_container} -type f -exec sha256sum {{}} \\;", timeout=30, ) def _parse_entry(entry: str) -> tuple[Path, str]: checksum, file_path = entry.strip().split() - relative_path = ( - Path(file_path).relative_to(remote_path).relative_to(Path(bucket_name)) - ) + relative_path = Path(file_path).relative_to(path_in_container) return relative_path, checksum return dict( @@ -397,8 +396,7 @@ def _parse_entry(entry: str) -> tuple[Path, str]: async def _get_files_from_s3( - s3_client: S3Client, - bucket_name: S3BucketName, + s3_client: S3Client, bucket_name: S3BucketName, remote_path: StorageFileID ) -> dict[Path, str]: """Download files from S3 and return their SHA256 checksums.""" 
files_in_bucket = await s3_client.list_objects_v2(Bucket=bucket_name) @@ -406,7 +404,7 @@ async def _get_files_from_s3( async def _get_file_checksum(key: str) -> tuple[Path, str]: response = await s3_client.get_object(Bucket=bucket_name, Key=key) checksum = await create_sha256_checksum(response["Body"]) - return Path(key).relative_to(Path(bucket_name)), checksum + return Path(key).relative_to(Path(remote_path)), checksum results = await limited_gather( *[ @@ -423,11 +421,12 @@ async def _assert_local_content_in_s3( s3_client: S3Client, bucket_name: S3BucketName, local_s3_content_path: Path, + remote_path: StorageFileID, ) -> None: files_local_folder = await _get_file_checksums_from_local_path( local_s3_content_path ) - files_from_s3 = await _get_files_from_s3(s3_client, bucket_name) + files_from_s3 = await _get_files_from_s3(s3_client, bucket_name, remote_path) assert files_local_folder == files_from_s3 @@ -437,18 +436,21 @@ async def _assert_same_files_in_all_places( bucket_name: S3BucketName, r_clone_container: str, r_clone_local_mount_path: Path, + remote_path: StorageFileID, ) -> None: files_from_container = await _get_file_checksums_from_container( - r_clone_local_mount_path, r_clone_container, bucket_name + r_clone_local_mount_path, r_clone_container ) - files_from_s3 = await _get_files_from_s3(s3_client, bucket_name) + files_from_s3 = await _get_files_from_s3(s3_client, bucket_name, remote_path) assert files_from_container == files_from_s3 -async def _change_file_in_container(remote_path: Path, r_clone_container: str) -> None: +async def _change_file_in_container( + path_in_container: Path, r_clone_container: str +) -> None: await run_command_in_container( r_clone_container, - command=f"dd if=/dev/urandom of={remote_path} bs={_get_random_file_size()} count=1", + command=f"dd if=/dev/urandom of={path_in_container} bs={_get_random_file_size()} count=1", timeout=30, ) @@ -462,25 +464,24 @@ async def test_tracked_mount_waits_for_files_before_finalizing( 
bucket_name: S3BucketName, r_clone_container: str, local_s3_content_path: Path, - remote_path: Path, + remote_path: StorageFileID, ): await single_mount_r_clone_mount_manager.start_mount( MountRemoteType.S3, remote_path, r_clone_local_mount_path ) - await _assert_local_content_in_s3(s3_client, bucket_name, local_s3_content_path) + await _assert_local_content_in_s3( + s3_client, bucket_name, local_s3_content_path, remote_path + ) def _get_random_file_in_container() -> Path: - return ( - r_clone_local_mount_path - / bucket_name - / secrets.choice( - [x for x in local_s3_content_path.rglob("*") if x.is_file()] - ).relative_to(local_s3_content_path) - ) + return r_clone_local_mount_path / secrets.choice( + [x for x in local_s3_content_path.rglob("*") if x.is_file()] + ).relative_to(local_s3_content_path) # change and check all is the same files_to_change = {_get_random_file_in_container() for _ in range(15)} + await limited_gather( *[_change_file_in_container(x, r_clone_container) for x in files_to_change], limit=10, @@ -490,10 +491,7 @@ def _get_random_file_in_container() -> Path: r_clone_local_mount_path ) await _assert_same_files_in_all_places( - s3_client, - bucket_name, - r_clone_container, - r_clone_local_mount_path, + s3_client, bucket_name, r_clone_container, r_clone_local_mount_path, remote_path ) await single_mount_r_clone_mount_manager.stop_mount(r_clone_local_mount_path) From 106521a34151a7ecd8f468d8846b5f40c4f38343 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Wed, 10 Dec 2025 14:53:38 +0100 Subject: [PATCH 18/79] adde some logging --- .../node_ports_common/r_clone_mount/_core.py | 83 +++++++++++-------- 1 file changed, 50 insertions(+), 33 deletions(-) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py index 4fa24b89902d..dc3ec2ea93a3 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py +++ 
b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py @@ -16,7 +16,7 @@ from models_library.projects_nodes_io import StorageFileID from pydantic import BaseModel, NonNegativeFloat from servicelib.container_utils import run_command_in_container -from servicelib.logging_utils import log_catch +from servicelib.logging_utils import log_catch, log_context from servicelib.r_clone_utils import config_file from servicelib.utils import unused_port from settings_library.r_clone import RCloneMountSettings, RCloneSettings @@ -402,45 +402,62 @@ async def start_mount( local_mount_path: Path, vfs_cache_path_overwrite: Path | None = None, ) -> None: - mount_id = self._get_mount_id(local_mount_path) - if mount_id in self._started_mounts: - tracked_mount = self._started_mounts[mount_id] - raise MountAlreadyStartedError(local_mount_path=local_mount_path) - - vfs_cache_path = ( - vfs_cache_path_overwrite or self._common_vfs_cache_path - ) / mount_id - vfs_cache_path.mkdir(parents=True, exist_ok=True) - - free_port = await asyncio.get_running_loop().run_in_executor(None, unused_port) - - tracked_mount = TrackedMount( - self.r_clone_settings, - remote_type, - rc_port=free_port, - remote_path=remote_path, - local_mount_path=local_mount_path, - vfs_cache_path=vfs_cache_path, - ) - await tracked_mount.start_mount() + with log_context( + _logger, + logging.INFO, + f"mounting {local_mount_path=} from {remote_path=}", + log_duration=True, + ): + mount_id = self._get_mount_id(local_mount_path) + if mount_id in self._started_mounts: + tracked_mount = self._started_mounts[mount_id] + raise MountAlreadyStartedError(local_mount_path=local_mount_path) + + vfs_cache_path = ( + vfs_cache_path_overwrite or self._common_vfs_cache_path + ) / mount_id + vfs_cache_path.mkdir(parents=True, exist_ok=True) + + free_port = await asyncio.get_running_loop().run_in_executor( + None, unused_port + ) - self._started_mounts[mount_id] = tracked_mount + tracked_mount = TrackedMount( + 
self.r_clone_settings, + remote_type, + rc_port=free_port, + remote_path=remote_path, + local_mount_path=local_mount_path, + vfs_cache_path=vfs_cache_path, + ) + await tracked_mount.start_mount() + + self._started_mounts[mount_id] = tracked_mount async def wait_for_transfers_to_complete(self, local_mount_path: Path) -> None: - mount_id = self._get_mount_id(local_mount_path) - if mount_id not in self._started_mounts: - raise MountNotStartedError(local_mount_path=local_mount_path) + with log_context( + _logger, + logging.INFO, + f"wait for transfers to complete {local_mount_path=}", + log_duration=True, + ): + mount_id = self._get_mount_id(local_mount_path) + if mount_id not in self._started_mounts: + raise MountNotStartedError(local_mount_path=local_mount_path) - tracked_mount = self._started_mounts[mount_id] - await tracked_mount.rc_interface.wait_for_all_transfers_to_complete() + tracked_mount = self._started_mounts[mount_id] + await tracked_mount.rc_interface.wait_for_all_transfers_to_complete() async def stop_mount(self, local_mount_path: Path) -> None: - mount_id = self._get_mount_id(local_mount_path) - if mount_id not in self._started_mounts: - raise MountNotStartedError(local_mount_path=local_mount_path) + with log_context( + _logger, logging.INFO, f"unmounting {local_mount_path=}", log_duration=True + ): + mount_id = self._get_mount_id(local_mount_path) + if mount_id not in self._started_mounts: + raise MountNotStartedError(local_mount_path=local_mount_path) - tracked_mount = self._started_mounts[mount_id] - await tracked_mount.stop_mount() + tracked_mount = self._started_mounts[mount_id] + await tracked_mount.stop_mount() async def setup(self) -> None: pass From fa5e997ca1b1fc3e0a88bfd583716bd996c7c978 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Wed, 10 Dec 2025 17:18:37 +0100 Subject: [PATCH 19/79] refactor --- .../src/simcore_sdk/node_data/data_manager.py | 21 ++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git 
a/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py b/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py index 7b39a44102c2..a449fb863771 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py @@ -184,16 +184,6 @@ async def _delete_legacy_archive( ) -async def _use_r_clone_mounting( - application_name: str, user_id: UserID, product_name: ProductName -) -> bool: - return ( - await DBManager(application_name=application_name).get_group_extra_properties( - user_id=user_id, product_name=product_name - ) - ).use_r_clone_mounting - - async def _stop_mount( mount_manager: RCloneMountManager, destination_path: Path ) -> None: @@ -218,7 +208,7 @@ async def push( # pylint: disable=too-many-arguments # noqa: PLR0913 ) -> None: """pushes and removes the legacy archive if present""" - if _use_r_clone_mounting(application_name, user_id, product_name): + if await mount_manager.was_mount_started(source_path): await _stop_mount(mount_manager, source_path) else: await _push_directory( @@ -277,7 +267,14 @@ async def _start_mount_if_required( node_uuid: NodeID, destination_path: Path, ) -> None: - if not _use_r_clone_mounting(application_name, user_id, product_name): + group_extra_properties = await DBManager( + application_name=application_name + ).get_group_extra_properties(user_id=user_id, product_name=product_name) + + _logger.debug("group_extra_properties=%s", group_extra_properties) + + if group_extra_properties.use_r_clone_mounting is False: + _logger.debug("RClone mounting not required") return s3_object = __create_s3_object_key(project_id, node_uuid, destination_path) From 091ac4f87c98f7cf108663c9ef26d936c1d1ad99 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 12 Dec 2025 10:13:56 +0100 Subject: [PATCH 20/79] required --- services/dynamic-sidecar/docker/boot.sh | 4 +++ .../modules/long_running_tasks.py | 28 ++++++++++++++++++- 2 files changed, 31 
insertions(+), 1 deletion(-) diff --git a/services/dynamic-sidecar/docker/boot.sh b/services/dynamic-sidecar/docker/boot.sh index 984ef554a3b8..39ed82bc802a 100755 --- a/services/dynamic-sidecar/docker/boot.sh +++ b/services/dynamic-sidecar/docker/boot.sh @@ -48,6 +48,10 @@ DYNAMIC_SIDECAR_REMOTE_DEBUGGING_PORT=${DYNAMIC_SIDECAR_REMOTE_DEBUGGING_PORT:-3 SERVER_LOG_LEVEL=$(echo "${APP_LOG_LEVEL}" | tr '[:upper:]' '[:lower:]') echo "$INFO" "Log-level app/server: $APP_LOG_LEVEL/$SERVER_LOG_LEVEL" +R_CLONE_VERSION=$(rclone version | head -n1 | awk '{print $2}' | sed 's/^v//') && \ + echo "R_CLONE_VERSION=${R_CLONE_VERSION}" && \ + export R_CLONE_VERSION + if [ "${SC_BOOT_MODE}" = "debug" ]; then reload_dir_packages=$(fdfind src /devel/packages --exec echo '--reload-dir {} ' | tr '\n' ' ') diff --git a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py index 69a12ea5c1fa..e790e3249f75 100644 --- a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py +++ b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py @@ -351,7 +351,28 @@ async def _restore_state_folder( settings: ApplicationSettings, progress_bar: ProgressBarData, state_path: Path, + mounted_volumes: MountedVolumes, ) -> None: + async def _resolve_volume_path(path: Path) -> dict: + not_dy_volume = path.relative_to(settings.DYNAMIC_SIDECAR_DY_VOLUMES_MOUNT_DIR) + matcher = f":/{not_dy_volume}" + + async for entry in mounted_volumes.iter_state_paths_to_docker_volumes( + settings.DY_SIDECAR_RUN_ID + ): + if entry.endswith(matcher): + mount_str = entry.replace(f"/{not_dy_volume}", f"{path}") + source, target = mount_str.split(":") + return { + "Type": "bind", + "Source": source, + "Target": target, + "BindOptions": {"Propagation": "rshared"}, + } + + msg = f"Could not resolve volume path for {path}" + raise 
RuntimeError(msg) + assert settings.DY_SIDECAR_PRODUCT_NAME is not None # nosec await data_manager.pull( product_name=settings.DY_SIDECAR_PRODUCT_NAME, @@ -367,6 +388,7 @@ async def _restore_state_folder( legacy_state=_get_legacy_state_with_dy_volumes_path(settings), application_name=f"{APP_NAME}-{settings.DY_SIDECAR_NODE_ID}", mount_manager=get_r_clone_mount_manager(app), + handler_get_bind_path=_resolve_volume_path, ) @@ -405,7 +427,11 @@ async def restore_user_services_state_paths( await logged_gather( *( _restore_state_folder( - app, settings=settings, progress_bar=root_progress, state_path=path + app, + settings=settings, + progress_bar=root_progress, + state_path=path, + mounted_volumes=mounted_volumes, ) for path in mounted_volumes.disk_state_paths_iter() ), From c6871b311e2647b1bce679059cbf41bd2306db43 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 12 Dec 2025 10:14:25 +0100 Subject: [PATCH 21/79] added rlcone version in settings --- packages/settings-library/src/settings_library/r_clone.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/packages/settings-library/src/settings_library/r_clone.py b/packages/settings-library/src/settings_library/r_clone.py index 974dfb0f5915..5ac2ac22fab0 100644 --- a/packages/settings-library/src/settings_library/r_clone.py +++ b/packages/settings-library/src/settings_library/r_clone.py @@ -36,6 +36,14 @@ class RCloneMountSettings(BaseCustomSettings): "/tmp/vfs-caching" # noqa: S108 ) + R_CLONE_VERSION: Annotated[ + str | None, + Field( + pattern=r"^\d+\.\d+\.\d+$", + description="version of rclone to use for the mounts", + ), + ] = None + _validate_r_clone_mount_transfers_completed_timeout = ( validate_numeric_string_as_timedelta( "R_CLONE_MOUNT_TRANSFERS_COMPLETED_TIMEOUT" From 3a70e9a286410d1c0a863d76d0bb1002d3049d22 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 12 Dec 2025 10:21:23 +0100 Subject: [PATCH 22/79] working version --- .../src/simcore_sdk/node_data/data_manager.py | 28 +- 
.../r_clone_mount/__init__.py | 8 +- .../node_ports_common/r_clone_mount/_core.py | 397 +++++++++++++----- 3 files changed, 310 insertions(+), 123 deletions(-) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py b/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py index a449fb863771..290f7464e69a 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py @@ -17,17 +17,21 @@ from ..node_ports_common.constants import SIMCORE_LOCATION from ..node_ports_common.dbmanager import DBManager from ..node_ports_common.file_io_utils import LogRedirectCB -from ..node_ports_common.r_clone_mount import MountRemoteType, RCloneMountManager +from ..node_ports_common.r_clone_mount import ( + GetBindPathProtocol, + MountRemoteType, + RCloneMountManager, +) _logger = logging.getLogger(__name__) def __create_s3_object_key( - project_id: ProjectID, node_uuid: NodeID, file_path: Path | str + project_id: ProjectID, node_id: NodeID, file_path: Path | str ) -> StorageFileID: file_name = file_path.name if isinstance(file_path, Path) else file_path return TypeAdapter(StorageFileID).validate_python( - f"{project_id}/{node_uuid}/{file_name}" + f"{project_id}/{node_id}/{file_name}" ) @@ -264,8 +268,9 @@ async def _start_mount_if_required( product_name: ProductName, user_id: UserID, project_id: ProjectID, - node_uuid: NodeID, + node_id: NodeID, destination_path: Path, + handler_get_bind_path: GetBindPathProtocol, ) -> None: group_extra_properties = await DBManager( application_name=application_name @@ -277,8 +282,14 @@ async def _start_mount_if_required( _logger.debug("RClone mounting not required") return - s3_object = __create_s3_object_key(project_id, node_uuid, destination_path) - await mount_manager.start_mount(MountRemoteType.S3, s3_object, destination_path) + s3_object = __create_s3_object_key(project_id, node_id, destination_path) + await mount_manager.start_mount( + 
node_id, + MountRemoteType.S3, + s3_object, + destination_path, + handler_get_bind_path=handler_get_bind_path, + ) async def pull( # pylint: disable=too-many-arguments # noqa: PLR0913 @@ -294,6 +305,7 @@ async def pull( # pylint: disable=too-many-arguments # noqa: PLR0913 legacy_state: LegacyState | None, application_name: str, mount_manager: RCloneMountManager, + handler_get_bind_path: GetBindPathProtocol, ) -> None: """restores the state folder""" @@ -334,6 +346,7 @@ async def pull( # pylint: disable=too-many-arguments # noqa: PLR0913 project_id, node_uuid, destination_path, + handler_get_bind_path, ) return @@ -362,6 +375,7 @@ async def pull( # pylint: disable=too-many-arguments # noqa: PLR0913 project_id, node_uuid, destination_path, + handler_get_bind_path, ) return @@ -390,6 +404,7 @@ async def pull( # pylint: disable=too-many-arguments # noqa: PLR0913 project_id, node_uuid, destination_path, + handler_get_bind_path, ) return @@ -401,5 +416,6 @@ async def pull( # pylint: disable=too-many-arguments # noqa: PLR0913 project_id, node_uuid, destination_path, + handler_get_bind_path, ) _logger.debug("No content previously saved for '%s'", destination_path) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/__init__.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/__init__.py index 106fadfb009f..187621da2d39 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/__init__.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/__init__.py @@ -1,7 +1,13 @@ from ._config_provider import MountRemoteType -from ._core import MountAlreadyStartedError, MountNotStartedError, RCloneMountManager +from ._core import ( + GetBindPathProtocol, + MountAlreadyStartedError, + MountNotStartedError, + RCloneMountManager, +) __all__: tuple[str, ...] 
= ( + "GetBindPathProtocol", "MountAlreadyStartedError", "MountNotStartedError", "MountRemoteType", diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py index dc3ec2ea93a3..56dbe5776867 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py @@ -4,20 +4,23 @@ from collections.abc import Awaitable, Callable from contextlib import AsyncExitStack from datetime import UTC, datetime, timedelta +from functools import cached_property from pathlib import Path -from typing import Any, Final +from textwrap import dedent +from typing import Any, Final, Protocol from uuid import uuid4 +import aiodocker import httpx +from aiodocker.containers import DockerContainer +from aiodocker.networks import DockerNetwork from common_library.errors_classes import OsparcErrorMixin from httpx import AsyncClient from models_library.basic_types import PortInt from models_library.progress_bar import ProgressReport -from models_library.projects_nodes_io import StorageFileID -from pydantic import BaseModel, NonNegativeFloat -from servicelib.container_utils import run_command_in_container -from servicelib.logging_utils import log_catch, log_context -from servicelib.r_clone_utils import config_file +from models_library.projects_nodes_io import NodeID, StorageFileID +from pydantic import BaseModel +from servicelib.logging_utils import log_context from servicelib.utils import unused_port from settings_library.r_clone import RCloneMountSettings, RCloneSettings from tenacity import ( @@ -33,25 +36,26 @@ _logger = logging.getLogger(__name__) -_DEFAULT_REMOTE_CONTROL_HOST: Final[str] = "localhost" _MAX_WAIT_RC_HTTP_INTERFACE_READY: Final[timedelta] = timedelta(seconds=10) _DEFAULT_UPDATE_INTERVAL: Final[timedelta] = timedelta(seconds=1) 
_DEFAULT_R_CLONE_CLIENT_REQUEST_TIMEOUT: Final[timedelta] = timedelta(seconds=5) _DEFAULT_MOUNT_ACTIVITY_UPDATE_INTERVAL: Final[timedelta] = timedelta(seconds=5) +_DOCKER_PREFIX_MOUNT: Final[str] = "rclone-mount" -class _BaseRcloneMountError(OsparcErrorMixin, RuntimeError): - pass +_NOT_FOUND: Final[int] = 404 +type MountId = str -class _ProcessAlreadyStartedError(_BaseRcloneMountError): - msg_template: str = "Process already started with pid='{pid}' via '{command}'" + +class _BaseRcloneMountError(OsparcErrorMixin, RuntimeError): + pass -class _TrackedMountAlreadyStartedError(_BaseRcloneMountError): +class _ContainerAlreadyStartedError(_BaseRcloneMountError): msg_template: str = ( - "Mount process already stareted with pid='{pid}' via '{command}'" + "Mount process already stareted via container='{container}' with command='{command}'" ) @@ -71,54 +75,165 @@ class MountNotStartedError(_BaseRcloneMountError): msg_template: str = "Mount not started for local path='{local_mount_path}'" -def _get_command__pid_of_background_command(command: str) -> str: - return f"sh -c '{command} & echo $!'" +def _get_self_container_id() -> str: + # in docker the hostname is the container id + return os.environ["HOSTNAME"] -def _get_command__sigterm_process(pid: str) -> str: - return f"kill -SIGTERM {pid}" +class GetBindPathProtocol(Protocol): + async def __call__(self, path: Path) -> dict: ... 
-class DaemonProcessManager: - """manage a command that is meant to run in a container forever""" +class ContainerManager: + def __init__( + self, + node_id: NodeID, + r_clone_version: str, + remote_control_port: PortInt, + local_mount_path: Path, + r_clone_config_content: str, + remote_path: str, + vfs_cache_path: Path, + rc_user: str, + rc_password: str, + *, + handler_get_bind_path: GetBindPathProtocol, + ) -> None: + self.node_id = node_id + self.r_clone_version = r_clone_version + self.remote_control_port = remote_control_port + self.local_mount_path = local_mount_path + self.r_clone_config_content = r_clone_config_content + self.handler_get_bind_path = handler_get_bind_path + + self.command = _get_rclone_mount_command( + r_clone_config_content=r_clone_config_content, + remote_path=remote_path, + local_mount_path=self.local_mount_path, + vfs_cache_path=vfs_cache_path, + rc_addr=f"0.0.0.0:{remote_control_port}", + rc_user=rc_user, + rc_password=rc_password, + ) - def __init__(self, command: str, *, timeout: NonNegativeFloat = 5) -> None: - self.command = command - self.timeout = timeout - self.pid: str | None = None + self._cleanup_stack = AsyncExitStack() + self._client: aiodocker.Docker | None = None - async def _run_in_container(self, command: str) -> str: - self_container = os.environ["HOSTNAME"] - return await run_command_in_container( - self_container, command=command, timeout=self.timeout - ) + self._r_clone_container: DockerContainer | None = None + self._r_clone_network: DockerNetwork | None = None + + @cached_property + def r_clone_container_name(self) -> str: + return f"{_DOCKER_PREFIX_MOUNT}-c{self.node_id}{_get_mount_id(self.local_mount_path)}"[ + :63 + ] + + @cached_property + def _r_clone_network_name(self) -> str: + return f"{_DOCKER_PREFIX_MOUNT}-n{self.node_id}{_get_mount_id(self.local_mount_path)}"[ + :63 + ] + + @property + def _aiodocker_client(self) -> aiodocker.Docker: + assert self._client is not None # nosec + return self._client async 
def start(self): - if self.pid: - raise _ProcessAlreadyStartedError(pid=self.pid, command=self.command) + self._client = await self._cleanup_stack.enter_async_context(aiodocker.Docker()) + # TODO: toss away docker session when done with it do not maintain object in memory to avoid issues + # better more robust way of doing it - command_result = await self._run_in_container( - command=_get_command__pid_of_background_command(self.command) + try: + existing_container = await self._aiodocker_client.containers.get( + self.r_clone_container_name + ) + await existing_container.delete(force=True) + except aiodocker.exceptions.DockerError as e: + if e.status != _NOT_FOUND: + raise + + try: + existing_network = DockerNetwork( + self._aiodocker_client, self._r_clone_network_name + ) + await existing_network.show() + await existing_network.delete() + except aiodocker.exceptions.DockerError as e: + if e.status != _NOT_FOUND: + raise + + self._r_clone_network = await self._aiodocker_client.networks.create( + {"Name": self._r_clone_network_name, "Attachable": True} + ) + await self._r_clone_network.connect({"Container": _get_self_container_id()}) + + # create rclone container attached to the network + self._r_clone_container = await self._aiodocker_client.containers.run( + config={ + "Image": f"rclone/rclone:{self.r_clone_version}", + "Entrypoint": [ + "/bin/sh", + "-c", + f"{self.command} && sleep 100000 || sleep 100000000 ", + ], + "ExposedPorts": {f"{self.remote_control_port}/tcp": {}}, + "HostConfig": { + "NetworkMode": self._r_clone_network_name, + "Binds": [], + "Mounts": [await self.handler_get_bind_path(self.local_mount_path)], + "Devices": [ + { + "PathOnHost": "/dev/fuse", + "PathInContainer": "/dev/fuse", + "CgroupPermissions": "rwm", + } + ], + "CapAdd": ["SYS_ADMIN"], + "SecurityOpt": ["apparmor:unconfined", "seccomp:unconfined"], + }, + }, + name=self.r_clone_container_name, + ) + container_inspect = await self._r_clone_container.show() + _logger.debug( + "Started 
rclone mount container '%s' with command='%s' (inspect=%s)", + self.r_clone_container_name, + self.command, + container_inspect, ) - # pid is printed as the first line of the output - self.pid = command_result.strip().split("\n")[0] - _logger.debug("Started rclone mount with pid=%s", self.pid) async def stop(self): - if self.pid is None: - return + assert self._r_clone_container is not None # nosec + assert self._r_clone_network is not None # nosec + + await self._r_clone_container.stop() + + await self._r_clone_network.disconnect({"Container": _get_self_container_id()}) + await self._r_clone_network.delete() + + await self._r_clone_container.delete() + + await self._cleanup_stack.aclose() - # since the process could have failed to start or failed shortly after - # starting the pid mind not be corresponding to a running process - # and will raise an error - with log_catch(_logger, reraise=False): - await self._run_in_container( - command=_get_command__sigterm_process(self.pid) - ) + +def _get_mount_id(local_mount_path: Path) -> MountId: + return f"{local_mount_path}".replace("/", "_") + + +_COMMAND_TEMPLATE: Final[str] = dedent( + """ +cat < /tmp/rclone.conf +{r_clone_config_content} +EOF + +{r_clone_command} +""" +) def _get_rclone_mount_command( - config_file_path: str, + r_clone_config_content: str, remote_path: StorageFileID, local_mount_path: Path, vfs_cache_path: Path, @@ -126,12 +241,14 @@ def _get_rclone_mount_command( rc_user: str, rc_password: str, ) -> str: + # jupyter gid and uid form the user inside + uid = 1000 + gid = 100 escaped_remote_path = f"{remote_path}".lstrip("/") - command: list[str] = [ + command_array: list[str] = [ "rclone", "--config", - config_file_path, - f"--log-file=/tmp/rclone-debug{uuid4()}.log", # TODO: maybe it is possible to make a reproducible path insteaa of random for simpler access to logs? 
+ "/tmp/rclone.conf", # noqa: S108 "-vv", "mount", f"{CONFIG_KEY}:{escaped_remote_path}", @@ -149,8 +266,18 @@ def _get_rclone_mount_command( f"--rc-user='{rc_user}'", f"--rc-pass='{rc_password}'", "--allow-non-empty", + "--allow-other", + "--uid", + f"{uid}", + "--gid", + f"{gid}", ] - return " ".join(command) + r_clone_command = " ".join(command_array) + + return _COMMAND_TEMPLATE.format( + r_clone_config_content=r_clone_config_content, + r_clone_command=r_clone_command, + ) class MountActivity(BaseModel): @@ -163,21 +290,23 @@ def __init__( self, remote_control_port: PortInt, r_clone_mount_settings: RCloneMountSettings, + remote_control_host: str, + rc_user: str, + rc_password: str, *, update_handler: Callable[[MountActivity], Awaitable[None]], - remote_control_host: str = _DEFAULT_REMOTE_CONTROL_HOST, update_interval: timedelta = _DEFAULT_UPDATE_INTERVAL, r_clone_client_timeout: timedelta = _DEFAULT_R_CLONE_CLIENT_REQUEST_TIMEOUT, ) -> None: self._r_clone_mount_settings = r_clone_mount_settings self._update_interval_seconds = update_interval.total_seconds() self._r_clone_client_timeout = r_clone_client_timeout + self._rc_user = rc_user + self._rc_password = rc_password self._update_handler = update_handler self._rc_host = remote_control_host self._rc_port = remote_control_port - self.rc_user = f"{uuid4()}" - self.rc_password = f"{uuid4()}" self._cleanup_stack = AsyncExitStack() self._client: AsyncClient | None = None @@ -206,8 +335,10 @@ def _base_url(self) -> str: async def _request(self, method: str, path: str) -> Any: assert self._client is not None # nosec + request_url = f"{self._base_url}/{path}" + _logger.debug("Sending '%s %s' request", method, request_url) response = await self._client.request( - method, f"{self._base_url}/{path}", auth=(self.rc_user, self.rc_password) + method, request_url, auth=(self._rc_user, self._rc_password) ) response.raise_for_status() result = response.json() @@ -293,6 +424,7 @@ async def _() -> None: class TrackedMount: def 
__init__( self, + node_id: NodeID, r_clone_settings: RCloneSettings, remote_type: MountRemoteType, *, @@ -300,28 +432,34 @@ def __init__( remote_path: StorageFileID, local_mount_path: Path, vfs_cache_path: Path, + handler_get_bind_path: GetBindPathProtocol, mount_activity_update_interval: timedelta = _DEFAULT_MOUNT_ACTIVITY_UPDATE_INTERVAL, ) -> None: + self.node_id = node_id self.r_clone_settings = r_clone_settings self.mount_type = remote_type self.rc_port = rc_port self.remote_path = remote_path self.local_mount_path = local_mount_path self.vfs_cache_path = vfs_cache_path + self.rc_user = f"{uuid4()}" + self.rc_password = f"{uuid4()}" + self.handler_get_bind_path = handler_get_bind_path - self.rc_interface = RCloneRCInterfaceClient( - remote_control_port=rc_port, - r_clone_mount_settings=r_clone_settings.R_CLONE_MOUNT_SETTINGS, - update_handler=self._progress_handler, - ) self._last_mount_activity: MountActivity | None = None self._last_mount_activity_update: datetime = datetime.fromtimestamp(0, UTC) self._mount_activity_update_interval = mount_activity_update_interval # used internally to handle the mount command - self._daemon_manager: DaemonProcessManager | None = None + self._container_manager: ContainerManager | None = None + self._rc_interface: RCloneRCInterfaceClient | None = None self._cleanup_stack = AsyncExitStack() + @property + def rc_interface(self) -> RCloneRCInterfaceClient: + assert self._rc_interface is not None # nosec + return self._rc_interface + async def _progress_handler(self, mount_activity: MountActivity) -> None: now = datetime.now(UTC) @@ -345,39 +483,56 @@ async def teardown(self) -> None: await self.stop_mount() async def start_mount(self) -> None: - if self._daemon_manager is not None: - raise _TrackedMountAlreadyStartedError( - pid=self._daemon_manager.pid, command=self._daemon_manager.command + if self._container_manager is not None: + raise _ContainerAlreadyStartedError( + container=self._container_manager.r_clone_container_name, 
+ command=self._container_manager.command, ) - config_file_path = await self._cleanup_stack.enter_async_context( - config_file(get_config_content(self.r_clone_settings, self.mount_type)) + r_clone_config_content = get_config_content( + self.r_clone_settings, self.mount_type ) - self._daemon_manager = DaemonProcessManager( - command=_get_rclone_mount_command( - config_file_path=config_file_path, - remote_path=f"{self.r_clone_settings.R_CLONE_S3.S3_BUCKET_NAME}/{self.remote_path}", - local_mount_path=self.local_mount_path, - vfs_cache_path=self.vfs_cache_path, - rc_addr=f"0.0.0.0:{self.rc_port}", - rc_user=self.rc_interface.rc_user, - rc_password=self.rc_interface.rc_password, - ) + if self.r_clone_settings.R_CLONE_MOUNT_SETTINGS.R_CLONE_VERSION is None: + msg = "R_CLONE_VERSION setting is not set" + raise RuntimeError(msg) + + self._container_manager = ContainerManager( + node_id=self.node_id, + r_clone_version=self.r_clone_settings.R_CLONE_MOUNT_SETTINGS.R_CLONE_VERSION, + remote_control_port=self.rc_port, + local_mount_path=self.local_mount_path, + r_clone_config_content=r_clone_config_content, + remote_path=f"{self.r_clone_settings.R_CLONE_S3.S3_BUCKET_NAME}/{self.remote_path}", + vfs_cache_path=self.vfs_cache_path, + rc_user=self.rc_user, + rc_password=self.rc_password, + handler_get_bind_path=self.handler_get_bind_path, ) - await self._daemon_manager.start() + + self._rc_interface: RCloneRCInterfaceClient | None = RCloneRCInterfaceClient( + remote_control_port=self.rc_port, + r_clone_mount_settings=self.r_clone_settings.R_CLONE_MOUNT_SETTINGS, + remote_control_host=self._container_manager.r_clone_container_name, + rc_user=self.rc_user, + rc_password=self.rc_password, + update_handler=self._progress_handler, + ) + + await self._container_manager.start() await self.rc_interface.setup() await self.rc_interface.wait_for_interface_to_be_ready() async def stop_mount(self) -> None: - if self._daemon_manager is None: + if self._container_manager is None: return await 
self.rc_interface.wait_for_all_transfers_to_complete() await self.rc_interface.teardown() + self._rc_interface = None - await self._daemon_manager.stop() - self._daemon_manager = None + await self._container_manager.stop() + self._container_manager = None await self._cleanup_stack.aclose() @@ -389,50 +544,56 @@ def __init__(self, r_clone_settings: RCloneSettings) -> None: self.r_clone_settings.R_CLONE_MOUNT_SETTINGS.R_CLONE_MOUNT_VFS_CACHE_PATH ) - self._started_mounts: dict[str, TrackedMount] = {} - - @staticmethod - def _get_mount_id(local_mount_path: Path) -> str: - return f"{local_mount_path}".replace("/", "_") + self._started_mounts: dict[MountId, TrackedMount] = {} async def start_mount( self, + node_id: NodeID, remote_type: MountRemoteType, remote_path: StorageFileID, local_mount_path: Path, + handler_get_bind_path: GetBindPathProtocol, vfs_cache_path_overwrite: Path | None = None, ) -> None: - with log_context( - _logger, - logging.INFO, - f"mounting {local_mount_path=} from {remote_path=}", - log_duration=True, - ): - mount_id = self._get_mount_id(local_mount_path) - if mount_id in self._started_mounts: - tracked_mount = self._started_mounts[mount_id] - raise MountAlreadyStartedError(local_mount_path=local_mount_path) - - vfs_cache_path = ( - vfs_cache_path_overwrite or self._common_vfs_cache_path - ) / mount_id - vfs_cache_path.mkdir(parents=True, exist_ok=True) - - free_port = await asyncio.get_running_loop().run_in_executor( - None, unused_port - ) - - tracked_mount = TrackedMount( - self.r_clone_settings, - remote_type, - rc_port=free_port, - remote_path=remote_path, - local_mount_path=local_mount_path, - vfs_cache_path=vfs_cache_path, - ) - await tracked_mount.start_mount() - - self._started_mounts[mount_id] = tracked_mount + try: + with log_context( + _logger, + logging.INFO, + f"mounting {local_mount_path=} from {remote_path=}", + log_duration=True, + ): + mount_id = _get_mount_id(local_mount_path) + if mount_id in self._started_mounts: + 
tracked_mount = self._started_mounts[mount_id] + raise MountAlreadyStartedError(local_mount_path=local_mount_path) + + vfs_cache_path = ( + vfs_cache_path_overwrite or self._common_vfs_cache_path + ) / mount_id + vfs_cache_path.mkdir(parents=True, exist_ok=True) + + free_port = await asyncio.get_running_loop().run_in_executor( + None, unused_port + ) + + tracked_mount = TrackedMount( + node_id, + self.r_clone_settings, + remote_type, + rc_port=free_port, + remote_path=remote_path, + local_mount_path=local_mount_path, + vfs_cache_path=vfs_cache_path, + handler_get_bind_path=handler_get_bind_path, + ) + await tracked_mount.start_mount() + + self._started_mounts[mount_id] = tracked_mount + except Exception: + _logger.exception("SOMETHING WENT WRONG WAITING HERE FOR DEBUGGING") + await asyncio.sleep(100000) # let rclone write logs + + raise async def wait_for_transfers_to_complete(self, local_mount_path: Path) -> None: with log_context( @@ -441,18 +602,22 @@ async def wait_for_transfers_to_complete(self, local_mount_path: Path) -> None: f"wait for transfers to complete {local_mount_path=}", log_duration=True, ): - mount_id = self._get_mount_id(local_mount_path) + mount_id = _get_mount_id(local_mount_path) if mount_id not in self._started_mounts: raise MountNotStartedError(local_mount_path=local_mount_path) tracked_mount = self._started_mounts[mount_id] await tracked_mount.rc_interface.wait_for_all_transfers_to_complete() + async def was_mount_started(self, local_mount_path: Path) -> bool: + mount_id = _get_mount_id(local_mount_path) + return mount_id in self._started_mounts + async def stop_mount(self, local_mount_path: Path) -> None: with log_context( _logger, logging.INFO, f"unmounting {local_mount_path=}", log_duration=True ): - mount_id = self._get_mount_id(local_mount_path) + mount_id = _get_mount_id(local_mount_path) if mount_id not in self._started_mounts: raise MountNotStartedError(local_mount_path=local_mount_path) From a64f72ac198a321f3107f988a1819b1058d9f385 
Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 12 Dec 2025 10:46:14 +0100 Subject: [PATCH 23/79] added s3 entry to view contents --- .../src/simcore_sdk/node_data/data_manager.py | 6 ++++- .../node_ports_common/filemanager.py | 26 +++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py b/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py index 290f7464e69a..6eda097a07fd 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py @@ -196,7 +196,6 @@ async def _stop_mount( async def push( # pylint: disable=too-many-arguments # noqa: PLR0913 - product_name: ProductName, user_id: UserID, project_id: ProjectID, node_uuid: NodeID, @@ -283,6 +282,11 @@ async def _start_mount_if_required( return s3_object = __create_s3_object_key(project_id, node_id, destination_path) + + await filemanager.create_r_clone_mounted_directory_entry( + user_id=user_id, s3_object=s3_object, store_id=SIMCORE_LOCATION + ) + await mount_manager.start_mount( node_id, MountRemoteType.S3, diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/filemanager.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/filemanager.py index 60f44f7a7e65..d04ae37888d4 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/filemanager.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/filemanager.py @@ -88,6 +88,32 @@ async def get_download_link_from_s3( return URL(f"{file_link}") +async def create_r_clone_mounted_directory_entry( + *, + user_id: UserID, + s3_object: StorageFileID, + store_id: LocationID | None, +) -> None: + _, upload_links = await get_upload_links_from_s3( + user_id=user_id, + store_name=None, + store_id=store_id, + s3_object=s3_object, + client_session=None, + link_type=LinkType.S3, + file_size=ByteSize(0), + is_directory=True, + sha256_checksum=None, + ) + 
async with ClientSessionContextManager(None) as session: + await _filemanager_utils.complete_upload( + session, + upload_links.links.complete_upload, + [], + is_directory=True, + ) + + async def get_upload_links_from_s3( *, user_id: UserID, From d63b13b8c0f0e43c7bd4eefd4f96a493c710d3e5 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 12 Dec 2025 10:46:39 +0100 Subject: [PATCH 24/79] removed unused --- .../modules/long_running_tasks.py | 1 - 1 file changed, 1 deletion(-) diff --git a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py index e790e3249f75..115bbd9a43cb 100644 --- a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py +++ b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py @@ -457,7 +457,6 @@ async def _save_state_folder( ) -> None: assert settings.DY_SIDECAR_PRODUCT_NAME is not None # nosec await data_manager.push( - product_name=settings.DY_SIDECAR_PRODUCT_NAME, user_id=settings.DY_SIDECAR_USER_ID, project_id=settings.DY_SIDECAR_PROJECT_ID, node_uuid=settings.DY_SIDECAR_NODE_ID, From 5211629f7f1be25e23726790e1ef2a4dfe803c63 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 12 Dec 2025 11:55:19 +0100 Subject: [PATCH 25/79] added predicatable rclone mount names --- .../src/simcore_sdk/node_data/data_manager.py | 21 +++++--- .../node_ports_common/r_clone_mount/_core.py | 51 ++++++++++++------- .../modules/long_running_tasks.py | 12 +++-- 3 files changed, 57 insertions(+), 27 deletions(-) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py b/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py index 6eda097a07fd..3d2eaf3fc1c6 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py @@ -7,7 +7,7 @@ from 
models_library.projects_nodes_io import NodeID, StorageFileID from models_library.service_settings_labels import LegacyState from models_library.users import UserID -from pydantic import TypeAdapter +from pydantic import NonNegativeInt, TypeAdapter from servicelib.archiving_utils import unarchive_dir from servicelib.logging_utils import log_context from servicelib.progress_bar import ProgressBarData @@ -189,10 +189,10 @@ async def _delete_legacy_archive( async def _stop_mount( - mount_manager: RCloneMountManager, destination_path: Path + mount_manager: RCloneMountManager, destination_path: Path, index: NonNegativeInt ) -> None: - await mount_manager.wait_for_transfers_to_complete(destination_path) - await mount_manager.stop_mount(destination_path) + await mount_manager.wait_for_transfers_to_complete(destination_path, index) + await mount_manager.stop_mount(destination_path, index) async def push( # pylint: disable=too-many-arguments # noqa: PLR0913 @@ -200,6 +200,7 @@ async def push( # pylint: disable=too-many-arguments # noqa: PLR0913 project_id: ProjectID, node_uuid: NodeID, source_path: Path, + index: NonNegativeInt, *, io_log_redirect_cb: LogRedirectCB, r_clone_settings: RCloneSettings, @@ -211,8 +212,8 @@ async def push( # pylint: disable=too-many-arguments # noqa: PLR0913 ) -> None: """pushes and removes the legacy archive if present""" - if await mount_manager.was_mount_started(source_path): - await _stop_mount(mount_manager, source_path) + if await mount_manager.was_mount_started(source_path, index): + await _stop_mount(mount_manager, source_path, index) else: await _push_directory( user_id=user_id, @@ -269,6 +270,7 @@ async def _start_mount_if_required( project_id: ProjectID, node_id: NodeID, destination_path: Path, + index: NonNegativeInt, handler_get_bind_path: GetBindPathProtocol, ) -> None: group_extra_properties = await DBManager( @@ -292,6 +294,7 @@ async def _start_mount_if_required( MountRemoteType.S3, s3_object, destination_path, + index, 
handler_get_bind_path=handler_get_bind_path, ) @@ -302,6 +305,7 @@ async def pull( # pylint: disable=too-many-arguments # noqa: PLR0913 project_id: ProjectID, node_uuid: NodeID, destination_path: Path, + index: NonNegativeInt, *, io_log_redirect_cb: LogRedirectCB, r_clone_settings: RCloneSettings, @@ -350,6 +354,7 @@ async def pull( # pylint: disable=too-many-arguments # noqa: PLR0913 project_id, node_uuid, destination_path, + index, handler_get_bind_path, ) return @@ -379,6 +384,7 @@ async def pull( # pylint: disable=too-many-arguments # noqa: PLR0913 project_id, node_uuid, destination_path, + index, handler_get_bind_path, ) return @@ -391,6 +397,7 @@ async def pull( # pylint: disable=too-many-arguments # noqa: PLR0913 is_archive=False, ) if state_directory_exists: + # TODO: no more pullig here just mounting! await _pull_directory( user_id=user_id, project_id=project_id, @@ -408,6 +415,7 @@ async def pull( # pylint: disable=too-many-arguments # noqa: PLR0913 project_id, node_uuid, destination_path, + index, handler_get_bind_path, ) return @@ -420,6 +428,7 @@ async def pull( # pylint: disable=too-many-arguments # noqa: PLR0913 project_id, node_uuid, destination_path, + index, handler_get_bind_path, ) _logger.debug("No content previously saved for '%s'", destination_path) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py index 56dbe5776867..4d671d7e1793 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py @@ -19,7 +19,7 @@ from models_library.basic_types import PortInt from models_library.progress_bar import ProgressReport from models_library.projects_nodes_io import NodeID, StorageFileID -from pydantic import BaseModel +from pydantic import BaseModel, NonNegativeInt from servicelib.logging_utils import log_context from 
servicelib.utils import unused_port from settings_library.r_clone import RCloneMountSettings, RCloneSettings @@ -42,7 +42,7 @@ _DEFAULT_MOUNT_ACTIVITY_UPDATE_INTERVAL: Final[timedelta] = timedelta(seconds=5) -_DOCKER_PREFIX_MOUNT: Final[str] = "rclone-mount" +_DOCKER_PREFIX_MOUNT: Final[str] = "rcm" _NOT_FOUND: Final[int] = 404 @@ -91,6 +91,7 @@ def __init__( r_clone_version: str, remote_control_port: PortInt, local_mount_path: Path, + index: NonNegativeInt, r_clone_config_content: str, remote_path: str, vfs_cache_path: Path, @@ -103,6 +104,7 @@ def __init__( self.r_clone_version = r_clone_version self.remote_control_port = remote_control_port self.local_mount_path = local_mount_path + self.index = index self.r_clone_config_content = r_clone_config_content self.handler_get_bind_path = handler_get_bind_path @@ -124,15 +126,13 @@ def __init__( @cached_property def r_clone_container_name(self) -> str: - return f"{_DOCKER_PREFIX_MOUNT}-c{self.node_id}{_get_mount_id(self.local_mount_path)}"[ - :63 - ] + mount_id = _get_mount_id(self.local_mount_path, self.index) + return f"{_DOCKER_PREFIX_MOUNT}-c-{self.node_id}{mount_id}"[:63] @cached_property def _r_clone_network_name(self) -> str: - return f"{_DOCKER_PREFIX_MOUNT}-n{self.node_id}{_get_mount_id(self.local_mount_path)}"[ - :63 - ] + mount_id = _get_mount_id(self.local_mount_path, self.index) + return f"{_DOCKER_PREFIX_MOUNT}-n-{self.node_id}{mount_id}"[:63] @property def _aiodocker_client(self) -> aiodocker.Docker: @@ -181,6 +181,7 @@ async def start(self): "HostConfig": { "NetworkMode": self._r_clone_network_name, "Binds": [], + # TODO: mount the VFS cache directory somewhere to have better performance "Mounts": [await self.handler_get_bind_path(self.local_mount_path)], "Devices": [ { @@ -217,8 +218,9 @@ async def stop(self): await self._cleanup_stack.aclose() -def _get_mount_id(local_mount_path: Path) -> MountId: - return f"{local_mount_path}".replace("/", "_") +def _get_mount_id(local_mount_path: Path, index: 
NonNegativeInt) -> MountId: + # reversing string to avoid collisions + return f"{index}{local_mount_path}".replace("/", "_")[::-1] _COMMAND_TEMPLATE: Final[str] = dedent( @@ -365,7 +367,11 @@ async def _monitor(self) -> None: mount_activity = MountActivity( transferring=( { - x["name"]: ProgressReport(actual_value=x["percentage"] / 100) + x["name"]: ProgressReport( + actual_value=( + x["percentage"] / 100 if "percentage" in x else 0.0 + ) + ) for x in core_stats["transferring"] } if "transferring" in core_stats @@ -431,6 +437,7 @@ def __init__( rc_port: PortInt, remote_path: StorageFileID, local_mount_path: Path, + index: NonNegativeInt, vfs_cache_path: Path, handler_get_bind_path: GetBindPathProtocol, mount_activity_update_interval: timedelta = _DEFAULT_MOUNT_ACTIVITY_UPDATE_INTERVAL, @@ -441,6 +448,7 @@ def __init__( self.rc_port = rc_port self.remote_path = remote_path self.local_mount_path = local_mount_path + self.index = index self.vfs_cache_path = vfs_cache_path self.rc_user = f"{uuid4()}" self.rc_password = f"{uuid4()}" @@ -502,6 +510,7 @@ async def start_mount(self) -> None: r_clone_version=self.r_clone_settings.R_CLONE_MOUNT_SETTINGS.R_CLONE_VERSION, remote_control_port=self.rc_port, local_mount_path=self.local_mount_path, + index=self.index, r_clone_config_content=r_clone_config_content, remote_path=f"{self.r_clone_settings.R_CLONE_S3.S3_BUCKET_NAME}/{self.remote_path}", vfs_cache_path=self.vfs_cache_path, @@ -552,6 +561,7 @@ async def start_mount( remote_type: MountRemoteType, remote_path: StorageFileID, local_mount_path: Path, + index: NonNegativeInt, handler_get_bind_path: GetBindPathProtocol, vfs_cache_path_overwrite: Path | None = None, ) -> None: @@ -562,7 +572,7 @@ async def start_mount( f"mounting {local_mount_path=} from {remote_path=}", log_duration=True, ): - mount_id = _get_mount_id(local_mount_path) + mount_id = _get_mount_id(local_mount_path, index) if mount_id in self._started_mounts: tracked_mount = self._started_mounts[mount_id] raise 
MountAlreadyStartedError(local_mount_path=local_mount_path) @@ -583,6 +593,7 @@ async def start_mount( rc_port=free_port, remote_path=remote_path, local_mount_path=local_mount_path, + index=index, vfs_cache_path=vfs_cache_path, handler_get_bind_path=handler_get_bind_path, ) @@ -595,29 +606,33 @@ async def start_mount( raise - async def wait_for_transfers_to_complete(self, local_mount_path: Path) -> None: + async def wait_for_transfers_to_complete( + self, local_mount_path: Path, index: NonNegativeInt + ) -> None: with log_context( _logger, logging.INFO, f"wait for transfers to complete {local_mount_path=}", log_duration=True, ): - mount_id = _get_mount_id(local_mount_path) + mount_id = _get_mount_id(local_mount_path, index) if mount_id not in self._started_mounts: raise MountNotStartedError(local_mount_path=local_mount_path) tracked_mount = self._started_mounts[mount_id] await tracked_mount.rc_interface.wait_for_all_transfers_to_complete() - async def was_mount_started(self, local_mount_path: Path) -> bool: - mount_id = _get_mount_id(local_mount_path) + async def was_mount_started( + self, local_mount_path: Path, index: NonNegativeInt + ) -> bool: + mount_id = _get_mount_id(local_mount_path, index) return mount_id in self._started_mounts - async def stop_mount(self, local_mount_path: Path) -> None: + async def stop_mount(self, local_mount_path: Path, index: NonNegativeInt) -> None: with log_context( _logger, logging.INFO, f"unmounting {local_mount_path=}", log_duration=True ): - mount_id = _get_mount_id(local_mount_path) + mount_id = _get_mount_id(local_mount_path, index) if mount_id not in self._started_mounts: raise MountNotStartedError(local_mount_path=local_mount_path) diff --git a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py index 115bbd9a43cb..04ea6cdaa576 100644 --- 
a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py +++ b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py @@ -11,7 +11,7 @@ from models_library.generated_models.docker_rest_api import ContainerState from models_library.rabbitmq_messages import ProgressType, SimcorePlatformStatus from models_library.service_settings_labels import LegacyState -from pydantic import PositiveInt +from pydantic import NonNegativeInt, PositiveInt from servicelib.fastapi import long_running_tasks from servicelib.file_utils import log_directory_changes from servicelib.logging_utils import log_context @@ -351,6 +351,7 @@ async def _restore_state_folder( settings: ApplicationSettings, progress_bar: ProgressBarData, state_path: Path, + index: NonNegativeInt, mounted_volumes: MountedVolumes, ) -> None: async def _resolve_volume_path(path: Path) -> dict: @@ -380,6 +381,7 @@ async def _resolve_volume_path(path: Path) -> dict: project_id=settings.DY_SIDECAR_PROJECT_ID, node_uuid=settings.DY_SIDECAR_NODE_ID, destination_path=Path(state_path), + index=index, io_log_redirect_cb=functools.partial( post_sidecar_log_message, app, log_level=logging.INFO ), @@ -431,9 +433,10 @@ async def restore_user_services_state_paths( settings=settings, progress_bar=root_progress, state_path=path, + index=k, mounted_volumes=mounted_volumes, ) - for path in mounted_volumes.disk_state_paths_iter() + for k, path in enumerate(mounted_volumes.disk_state_paths_iter()) ), max_concurrency=CONCURRENCY_STATE_SAVE_RESTORE, reraise=True, # this should raise if there is an issue @@ -453,6 +456,7 @@ async def _save_state_folder( settings: ApplicationSettings, progress_bar: ProgressBarData, state_path: Path, + index: NonNegativeInt, mounted_volumes: MountedVolumes, ) -> None: assert settings.DY_SIDECAR_PRODUCT_NAME is not None # nosec @@ -461,6 +465,7 @@ async def _save_state_folder( project_id=settings.DY_SIDECAR_PROJECT_ID, 
node_uuid=settings.DY_SIDECAR_NODE_ID, source_path=state_path, + index=index, r_clone_settings=settings.DY_SIDECAR_R_CLONE_SETTINGS, exclude_patterns=mounted_volumes.state_exclude, io_log_redirect_cb=functools.partial( @@ -502,9 +507,10 @@ async def save_user_services_state_paths( settings=settings, progress_bar=root_progress, state_path=state_path, + index=k, mounted_volumes=mounted_volumes, ) - for state_path in state_paths + for k, state_path in enumerate(state_paths) ], max_concurrency=CONCURRENCY_STATE_SAVE_RESTORE, ) From 9f3007745dd81535f196dfeb546932cdbd7dbec1 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 12 Dec 2025 12:21:12 +0100 Subject: [PATCH 26/79] refacotred logic --- .../src/simcore_sdk/node_data/data_manager.py | 78 ++++++++++--------- 1 file changed, 41 insertions(+), 37 deletions(-) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py b/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py index 3d2eaf3fc1c6..5723368b361d 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py @@ -262,25 +262,27 @@ async def push( # pylint: disable=too-many-arguments # noqa: PLR0913 ) +async def _requires_r_clone_mounting( + application_name: str, user_id: UserID, product_name: ProductName +) -> bool: + group_extra_properties = await DBManager( + application_name=application_name + ).get_group_extra_properties(user_id=user_id, product_name=product_name) + return group_extra_properties.use_r_clone_mounting is True + + async def _start_mount_if_required( mount_manager: RCloneMountManager, - application_name: str, - product_name: ProductName, user_id: UserID, project_id: ProjectID, node_id: NodeID, destination_path: Path, index: NonNegativeInt, handler_get_bind_path: GetBindPathProtocol, + *, + use_r_clone_mount: bool, ) -> None: - group_extra_properties = await DBManager( - application_name=application_name - 
).get_group_extra_properties(user_id=user_id, product_name=product_name) - - _logger.debug("group_extra_properties=%s", group_extra_properties) - - if group_extra_properties.use_r_clone_mounting is False: - _logger.debug("RClone mounting not required") + if not use_r_clone_mount: return s3_object = __create_s3_object_key(project_id, node_id, destination_path) @@ -317,6 +319,10 @@ async def pull( # pylint: disable=too-many-arguments # noqa: PLR0913 ) -> None: """restores the state folder""" + use_r_clone_mount = await _requires_r_clone_mounting( + application_name, user_id, product_name + ) + if legacy_state and legacy_state.new_state_path == destination_path: _logger.info( "trying to restore from legacy_state=%s, destination_path=%s", @@ -348,14 +354,13 @@ async def pull( # pylint: disable=too-many-arguments # noqa: PLR0913 ) await _start_mount_if_required( mount_manager, - application_name, - product_name, user_id, project_id, node_uuid, destination_path, index, handler_get_bind_path, + use_r_clone_mount=use_r_clone_mount, ) return @@ -378,14 +383,13 @@ async def pull( # pylint: disable=too-many-arguments # noqa: PLR0913 ) await _start_mount_if_required( mount_manager, - application_name, - product_name, user_id, project_id, node_uuid, destination_path, index, handler_get_bind_path, + use_r_clone_mount=use_r_clone_mount, ) return @@ -397,38 +401,38 @@ async def pull( # pylint: disable=too-many-arguments # noqa: PLR0913 is_archive=False, ) if state_directory_exists: - # TODO: no more pullig here just mounting! 
- await _pull_directory( - user_id=user_id, - project_id=project_id, - node_uuid=node_uuid, - destination_path=destination_path, - io_log_redirect_cb=io_log_redirect_cb, - r_clone_settings=r_clone_settings, - progress_bar=progress_bar, - ) - await _start_mount_if_required( - mount_manager, - application_name, - product_name, - user_id, - project_id, - node_uuid, - destination_path, - index, - handler_get_bind_path, - ) + if use_r_clone_mount: + await _start_mount_if_required( + mount_manager, + user_id, + project_id, + node_uuid, + destination_path, + index, + handler_get_bind_path, + use_r_clone_mount=use_r_clone_mount, + ) + else: + await _pull_directory( + user_id=user_id, + project_id=project_id, + node_uuid=node_uuid, + destination_path=destination_path, + io_log_redirect_cb=io_log_redirect_cb, + r_clone_settings=r_clone_settings, + progress_bar=progress_bar, + ) + return await _start_mount_if_required( mount_manager, - application_name, - product_name, user_id, project_id, node_uuid, destination_path, index, handler_get_bind_path, + use_r_clone_mount=use_r_clone_mount, ) _logger.debug("No content previously saved for '%s'", destination_path) From 1341290117de30bde9aebb36bc61a0a439b60c8d Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 12 Dec 2025 13:19:39 +0100 Subject: [PATCH 27/79] does not require a special user --- .../simcore_sdk/node_ports_common/r_clone_mount/_core.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py index 4d671d7e1793..8b556edcfba0 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py @@ -243,9 +243,6 @@ def _get_rclone_mount_command( rc_user: str, rc_password: str, ) -> str: - # jupyter gid and uid form the user inside - uid = 1000 - gid = 100 
escaped_remote_path = f"{remote_path}".lstrip("/") command_array: list[str] = [ "rclone", @@ -269,10 +266,6 @@ def _get_rclone_mount_command( f"--rc-pass='{rc_password}'", "--allow-non-empty", "--allow-other", - "--uid", - f"{uid}", - "--gid", - f"{gid}", ] r_clone_command = " ".join(command_array) From 75a4c310fe1be3446dcd8cf503cc2882a3972804 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 12 Dec 2025 13:27:04 +0100 Subject: [PATCH 28/79] cleanup --- .../node_ports_common/r_clone_mount/_core.py | 137 +++++++++--------- 1 file changed, 66 insertions(+), 71 deletions(-) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py index 8b556edcfba0..0e5df484dea2 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py @@ -46,7 +46,7 @@ _NOT_FOUND: Final[int] = 404 -type MountId = str +type _MountId = str class _BaseRcloneMountError(OsparcErrorMixin, RuntimeError): @@ -218,12 +218,12 @@ async def stop(self): await self._cleanup_stack.aclose() -def _get_mount_id(local_mount_path: Path, index: NonNegativeInt) -> MountId: - # reversing string to avoid collisions +def _get_mount_id(local_mount_path: Path, index: NonNegativeInt) -> _MountId: + # unique reproducible id for this mount return f"{index}{local_mount_path}".replace("/", "_")[::-1] -_COMMAND_TEMPLATE: Final[str] = dedent( +_R_CLONE_MOUNT_TEMPLATE: Final[str] = dedent( """ cat < /tmp/rclone.conf {r_clone_config_content} @@ -244,32 +244,32 @@ def _get_rclone_mount_command( rc_password: str, ) -> str: escaped_remote_path = f"{remote_path}".lstrip("/") - command_array: list[str] = [ - "rclone", - "--config", - "/tmp/rclone.conf", # noqa: S108 - "-vv", - "mount", - f"{CONFIG_KEY}:{escaped_remote_path}", - f"{local_mount_path}", - "--vfs-cache-mode full", - "--vfs-write-back", - "1s", # 
write-back delay TODO: could be part of the settings? - "--vfs-fast-fingerprint", # recommended for s3 backend TODO: could be part of the settings? - "--no-modtime", # don't read/write the modification time TODO: could be part of the settings? - "--cache-dir", - f"{vfs_cache_path}", - "--rc", - f"--rc-addr={rc_addr}", - "--rc-enable-metrics", - f"--rc-user='{rc_user}'", - f"--rc-pass='{rc_password}'", - "--allow-non-empty", - "--allow-other", - ] - r_clone_command = " ".join(command_array) - - return _COMMAND_TEMPLATE.format( + r_clone_command = " ".join( + [ + "rclone", + "--config", + "/tmp/rclone.conf", # noqa: S108 + "-vv", + "mount", + f"{CONFIG_KEY}:{escaped_remote_path}", + f"{local_mount_path}", + "--vfs-cache-mode full", + "--vfs-write-back", + "5s", # write-back delay TODO: could be part of the settings? + "--vfs-fast-fingerprint", # recommended for s3 backend TODO: could be part of the settings? + "--no-modtime", # don't read/write the modification time TODO: could be part of the settings? 
+ "--cache-dir", + f"{vfs_cache_path}", + "--rc", + f"--rc-addr={rc_addr}", + "--rc-enable-metrics", + f"--rc-user='{rc_user}'", + f"--rc-pass='{rc_password}'", + "--allow-non-empty", + "--allow-other", + ] + ) + return _R_CLONE_MOUNT_TEMPLATE.format( r_clone_config_content=r_clone_config_content, r_clone_command=r_clone_command, ) @@ -546,7 +546,7 @@ def __init__(self, r_clone_settings: RCloneSettings) -> None: self.r_clone_settings.R_CLONE_MOUNT_SETTINGS.R_CLONE_MOUNT_VFS_CACHE_PATH ) - self._started_mounts: dict[MountId, TrackedMount] = {} + self._started_mounts: dict[_MountId, TrackedMount] = {} async def start_mount( self, @@ -558,46 +558,40 @@ async def start_mount( handler_get_bind_path: GetBindPathProtocol, vfs_cache_path_overwrite: Path | None = None, ) -> None: - try: - with log_context( - _logger, - logging.INFO, - f"mounting {local_mount_path=} from {remote_path=}", - log_duration=True, - ): - mount_id = _get_mount_id(local_mount_path, index) - if mount_id in self._started_mounts: - tracked_mount = self._started_mounts[mount_id] - raise MountAlreadyStartedError(local_mount_path=local_mount_path) - - vfs_cache_path = ( - vfs_cache_path_overwrite or self._common_vfs_cache_path - ) / mount_id - vfs_cache_path.mkdir(parents=True, exist_ok=True) - - free_port = await asyncio.get_running_loop().run_in_executor( - None, unused_port - ) - - tracked_mount = TrackedMount( - node_id, - self.r_clone_settings, - remote_type, - rc_port=free_port, - remote_path=remote_path, - local_mount_path=local_mount_path, - index=index, - vfs_cache_path=vfs_cache_path, - handler_get_bind_path=handler_get_bind_path, - ) - await tracked_mount.start_mount() - - self._started_mounts[mount_id] = tracked_mount - except Exception: - _logger.exception("SOMETHING WENT WRONG WAITING HERE FOR DEBUGGING") - await asyncio.sleep(100000) # let rclone write logs - - raise + with log_context( + _logger, + logging.INFO, + f"mounting {local_mount_path=} from {remote_path=}", + log_duration=True, + 
): + mount_id = _get_mount_id(local_mount_path, index) + if mount_id in self._started_mounts: + tracked_mount = self._started_mounts[mount_id] + raise MountAlreadyStartedError(local_mount_path=local_mount_path) + + vfs_cache_path = ( + vfs_cache_path_overwrite or self._common_vfs_cache_path + ) / mount_id + vfs_cache_path.mkdir(parents=True, exist_ok=True) + + free_port = await asyncio.get_running_loop().run_in_executor( + None, unused_port + ) + + tracked_mount = TrackedMount( + node_id, + self.r_clone_settings, + remote_type, + rc_port=free_port, + remote_path=remote_path, + local_mount_path=local_mount_path, + index=index, + vfs_cache_path=vfs_cache_path, + handler_get_bind_path=handler_get_bind_path, + ) + await tracked_mount.start_mount() + + self._started_mounts[mount_id] = tracked_mount async def wait_for_transfers_to_complete( self, local_mount_path: Path, index: NonNegativeInt @@ -627,6 +621,7 @@ async def stop_mount(self, local_mount_path: Path, index: NonNegativeInt) -> Non ): mount_id = _get_mount_id(local_mount_path, index) if mount_id not in self._started_mounts: + # TODO: check if this is running on docker, then shutdown -> otherwise sidecar will break raise MountNotStartedError(local_mount_path=local_mount_path) tracked_mount = self._started_mounts[mount_id] From f3b118377ca52189953dd7441f83bad5cb1ee62c Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 12 Dec 2025 13:30:26 +0100 Subject: [PATCH 29/79] refactor interface --- .../simcore-sdk/src/simcore_sdk/node_data/data_manager.py | 6 +++--- .../simcore_sdk/node_ports_common/r_clone_mount/_core.py | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py b/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py index 5723368b361d..e02c4e90a798 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py @@ -292,11 +292,11 @@ 
async def _start_mount_if_required( ) await mount_manager.start_mount( - node_id, - MountRemoteType.S3, - s3_object, destination_path, index, + node_id=node_id, + remote_type=MountRemoteType.S3, + remote_path=s3_object, handler_get_bind_path=handler_get_bind_path, ) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py index 0e5df484dea2..e4990ee1a048 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py @@ -550,11 +550,12 @@ def __init__(self, r_clone_settings: RCloneSettings) -> None: async def start_mount( self, + local_mount_path: Path, + index: NonNegativeInt, + *, node_id: NodeID, remote_type: MountRemoteType, remote_path: StorageFileID, - local_mount_path: Path, - index: NonNegativeInt, handler_get_bind_path: GetBindPathProtocol, vfs_cache_path_overwrite: Path | None = None, ) -> None: From 1d10e021ff5decb9da706d75cced2021e774f687 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 12 Dec 2025 14:41:30 +0100 Subject: [PATCH 30/79] extended settings --- .../src/settings_library/r_clone.py | 70 ++++++++++++++----- 1 file changed, 54 insertions(+), 16 deletions(-) diff --git a/packages/settings-library/src/settings_library/r_clone.py b/packages/settings-library/src/settings_library/r_clone.py index 5ac2ac22fab0..15897b5f82a2 100644 --- a/packages/settings-library/src/settings_library/r_clone.py +++ b/packages/settings-library/src/settings_library/r_clone.py @@ -1,7 +1,7 @@ from datetime import timedelta from enum import StrEnum from pathlib import Path -from typing import Annotated +from typing import Annotated, Final from common_library.pydantic_validators import validate_numeric_string_as_timedelta from pydantic import Field, NonNegativeInt @@ -9,6 +9,9 @@ from .base import BaseCustomSettings from .s3 import S3Settings 
+DEFAULT_VFS_CACHE_PATH: Final[Path] = Path("/vfs-caching")
+DEFAULT_VFS_CACHE_MAX_SIZE: Final[str] = "500G"
+
 
 class S3Provider(StrEnum):
     AWS = "AWS"
@@ -18,8 +21,6 @@ class S3Provider(StrEnum):
 
 class RCloneMountSettings(BaseCustomSettings):
-    """all settings related to mounting go here"""
-
     R_CLONE_MOUNT_TRANSFERS_COMPLETED_TIMEOUT: Annotated[
         timedelta,
         Field(
@@ -27,28 +28,65 @@ class RCloneMountSettings(BaseCustomSettings):
         ),
     ] = timedelta(minutes=60)
 
-    R_CLONE_MOUNT_VFS_CACHE_PATH: Annotated[
-        Path,
-        Field(
-            description="common directory where all vfs-caches will be mounted to",
-        ),
-    ] = Path(
-        "/tmp/vfs-caching"  # noqa: S108
+    _validate_r_clone_mount_transfers_completed_timeout = (
+        validate_numeric_string_as_timedelta(
+            "R_CLONE_MOUNT_TRANSFERS_COMPLETED_TIMEOUT"
+        )
     )
 
+    # CONTAINER
+
     R_CLONE_VERSION: Annotated[
         str | None,
         Field(
             pattern=r"^\d+\.\d+\.\d+$",
-            description="version of rclone to use for the mounts",
+            description="version of rclone for the container image",
         ),
     ] = None
 
-    _validate_r_clone_mount_transfers_completed_timeout = (
-        validate_numeric_string_as_timedelta(
-            "R_CLONE_MOUNT_TRANSFERS_COMPLETED_TIMEOUT"
-        )
-    )
+    R_CLONE_CONFIG_FILE_PATH: Annotated[
+        Path,
+        Field(
+            description="path inside the container where the rclone config file is located",
+        ),
+    ] = Path("/tmp/rclone.conf")
+
+    # CLI command `rclone mount`
+
+    R_CLONE_MOUNT_VFS_CACHE_PATH: Annotated[
+        Path,
+        Field(
+            description="`--cache-dir X`: sets the path to use for vfs cache",
+        ),
+    ] = DEFAULT_VFS_CACHE_PATH
+
+    R_CLONE_MOUNT_VFS_CACHE_MAX_SIZE: Annotated[
+        str,
+        Field(
+            description="`--vfs-cache-max-size X`: sets the maximum size of the vfs cache",
+        ),
+    ] = DEFAULT_VFS_CACHE_MAX_SIZE
+
+    R_CLONE_MOUNT_VFS_WRITE_BACK: Annotated[
+        str,
+        Field(
+            description="`--vfs-write-back X`: sets the time to wait before writing back data to the remote",
+        ),
+    ] = "5s"
+
+    R_CLONE_MOUNT_VFS_FAST_FINGERPRINT: Annotated[
+        bool,
+        Field(
+            description="whether to use 
`--vfs-fast-fingerprint` option", + ), + ] = True + + R_CLONE_MOUNT_NO_MODTIME: Annotated[ + bool, + Field( + description="whether to use `--no-modtime` option", + ), + ] = True class RCloneSettings(BaseCustomSettings): From 22d9f15cf700540c436045a452f759a966f12288 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 12 Dec 2025 14:43:23 +0100 Subject: [PATCH 31/79] mount vfs cache volume --- .../docker_service_specs/sidecar.py | 12 ++++++ .../modules/dynamic_sidecar/volumes.py | 43 ++++++++++++++++--- 2 files changed, 48 insertions(+), 7 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_service_specs/sidecar.py b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_service_specs/sidecar.py index 35b95f2db84f..4221974ce66e 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_service_specs/sidecar.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_service_specs/sidecar.py @@ -322,6 +322,18 @@ async def _get_mounts( ) ) + if scheduler_data.paths_mapping.state_paths: + mounts.append( + DynamicSidecarVolumesPathsResolver.mount_vfs_cache( + swarm_stack_name=dynamic_services_scheduler_settings.SWARM_STACK_NAME, + node_uuid=scheduler_data.node_uuid, + service_run_id=scheduler_data.run_id, + project_id=scheduler_data.project_id, + user_id=scheduler_data.user_id, + has_quota_support=has_quota_support, + ) + ) + if dynamic_sidecar_path := dynamic_sidecar_settings.DYNAMIC_SIDECAR_MOUNT_PATH_DEV: # Settings validators guarantees that this never happens in production mode assert ( diff --git a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/volumes.py b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/volumes.py index 71630b814cb4..fcfd5ebcbef6 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/volumes.py 
+++ b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/volumes.py @@ -20,8 +20,18 @@ WRITE_SIZE, AwsEfsSettings, ) +from settings_library.r_clone import DEFAULT_VFS_CACHE_MAX_SIZE, DEFAULT_VFS_CACHE_PATH -DY_SIDECAR_SHARED_STORE_PATH: Final[Path] = Path("/shared-store") +_BASE_PATH: Path = Path("/dy-volumes") +# below are subfolders in `_BASE_PATH` +_DY_SIDECAR_SUBFOLDER_SHARED_STORE: Final[Path] = Path("/shared-store") +_DY_SIDECAR_SUBFOLDER_VFS_CACHE: Final[Path] = DEFAULT_VFS_CACHE_PATH + + +# DEFAULT LIMITS +_LIMIT_SHARED_STORE: Final[str] = "1M" +_LIMIT_VFS_CACHE: Final[str] = DEFAULT_VFS_CACHE_MAX_SIZE +_LIMIT_USER_PREFERENCES: Final[str] = "10M" def _get_efs_volume_driver_config( @@ -42,12 +52,10 @@ def _get_efs_volume_driver_config( class DynamicSidecarVolumesPathsResolver: - BASE_PATH: Path = Path("/dy-volumes") - @classmethod def target(cls, path: Path) -> str: """Returns a folder path within `/dy-volumes` folder""" - target_path = cls.BASE_PATH / path.relative_to("/") + target_path = _BASE_PATH / path.relative_to("/") return f"{target_path}" @classmethod @@ -128,12 +136,33 @@ def mount_shared_store( ) -> dict[str, Any]: return cls.mount_entry( swarm_stack_name=swarm_stack_name, - path=DY_SIDECAR_SHARED_STORE_PATH, + path=_DY_SIDECAR_SUBFOLDER_SHARED_STORE, + node_uuid=node_uuid, + service_run_id=service_run_id, + project_id=project_id, + user_id=user_id, + volume_size_limit=_LIMIT_SHARED_STORE if has_quota_support else None, + ) + + @classmethod + def mount_vfs_cache( + cls, + service_run_id: ServiceRunID, + node_uuid: NodeID, + project_id: ProjectID, + user_id: UserID, + swarm_stack_name: str, + *, + has_quota_support: bool, + ) -> dict[str, Any]: + return cls.mount_entry( + swarm_stack_name=swarm_stack_name, + path=_DY_SIDECAR_SUBFOLDER_VFS_CACHE, node_uuid=node_uuid, service_run_id=service_run_id, project_id=project_id, user_id=user_id, - volume_size_limit="1M" if has_quota_support else None, + 
volume_size_limit=_LIMIT_VFS_CACHE if has_quota_support else None, ) @classmethod @@ -158,7 +187,7 @@ def mount_user_preferences( # NOTE: the contents of this volume will be zipped and much # be at most `_MAX_PREFERENCES_TOTAL_SIZE`, this 10M accounts # for files and data that can be compressed a lot - volume_size_limit="10M" if has_quota_support else None, + volume_size_limit=_LIMIT_USER_PREFERENCES if has_quota_support else None, ) @classmethod From c6b7db2ac764e1b13458c2965212b41a83050d0d Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 12 Dec 2025 15:48:02 +0100 Subject: [PATCH 32/79] renamed --- packages/settings-library/src/settings_library/r_clone.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/settings-library/src/settings_library/r_clone.py b/packages/settings-library/src/settings_library/r_clone.py index 15897b5f82a2..1587a437b21c 100644 --- a/packages/settings-library/src/settings_library/r_clone.py +++ b/packages/settings-library/src/settings_library/r_clone.py @@ -9,7 +9,7 @@ from .base import BaseCustomSettings from .s3 import S3Settings -DEFAULT_VFS_CACHE_PATH: Final[Path] = Path("/vfs-caching") +DEFAULT_VFS_CACHE_PATH: Final[Path] = Path("/vfs-cache") DEFAULT_VFS_CACHE_MAX_SIZE: Final[str] = "500G" From 71722748980cde9cb88a414acbe6a7fea14f75f6 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 12 Dec 2025 15:48:43 +0100 Subject: [PATCH 33/79] fixed volume backup --- .../services/volumes_manager.py | 35 +++++++++-- .../scheduler/_core/_events_utils.py | 63 +++++++------------ 2 files changed, 51 insertions(+), 47 deletions(-) diff --git a/services/agent/src/simcore_service_agent/services/volumes_manager.py b/services/agent/src/simcore_service_agent/services/volumes_manager.py index 1ef6ef1d0cbd..6dcfac4c349f 100644 --- a/services/agent/src/simcore_service_agent/services/volumes_manager.py +++ b/services/agent/src/simcore_service_agent/services/volumes_manager.py @@ -16,15 +16,24 @@ from 
servicelib.rabbitmq.rpc_interfaces.agent.errors import ( NoServiceVolumesFoundRPCError, ) +from settings_library.r_clone import DEFAULT_VFS_CACHE_PATH from tenacity import AsyncRetrying, before_sleep_log, stop_after_delay, wait_fixed from ..core.settings import ApplicationSettings -from .docker_utils import get_unused_dynamc_sidecar_volumes, remove_volume +from .docker_utils import get_unused_dynamic_sidecar_volumes, remove_volume _logger = logging.getLogger(__name__) _WAIT_FOR_UNUSED_SERVICE_VOLUMES: Final[timedelta] = timedelta(minutes=1) +_VOLUMES_TO_NEVER_BACKUP: Final[set[str]] = { + "stupni", # inputs -> usually all services use this name + "stuptuo", # outputs -> can be regenerated, usually all services use this name + "erots-derahs", # shared-store -> defined by the dynamic-sidecar + f"{DEFAULT_VFS_CACHE_PATH}".strip("/")[::-1], # vfs-cache + "secnereferP", # Preferences -> usually defined by the user this is the one we use in the only service that supports if for now +} + @dataclass class VolumesManager( # pylint:disable=too-many-instance-attributes @@ -68,7 +77,7 @@ async def shutdown(self) -> None: async def _bookkeeping_task(self) -> None: with log_context(_logger, logging.DEBUG, "volume bookkeeping"): - current_unused_volumes = await get_unused_dynamc_sidecar_volumes( + current_unused_volumes = await get_unused_dynamic_sidecar_volumes( self.docker ) old_unused_volumes = set(self._unused_volumes.keys()) @@ -86,6 +95,12 @@ async def _bookkeeping_task(self) -> None: async def _remove_volume_safe( self, *, volume_name: str, requires_backup: bool ) -> None: + # overwrite backup policy if volume does not require backup + for x in _VOLUMES_TO_NEVER_BACKUP: + if f"_{x}_" in volume_name: + requires_backup = False + break + # NOTE: to avoid race conditions only one volume can be removed # also avoids issues with accessing the docker API in parallel async with self.removal_lock: @@ -100,10 +115,10 @@ async def _periodic_volume_cleanup_task(self) -> None: with 
log_context(_logger, logging.DEBUG, "volume cleanup"): volumes_to_remove: set[str] = set() for volume_name, inactive_since in self._unused_volumes.items(): - volume_inactive_sicne = ( + volume_inactive_since = ( arrow.utcnow().datetime - inactive_since ).total_seconds() - if volume_inactive_sicne > self.remove_volumes_inactive_for: + if volume_inactive_since > self.remove_volumes_inactive_for: volumes_to_remove.add(volume_name) for volume in volumes_to_remove: @@ -123,7 +138,7 @@ async def _wait_for_service_volumes_to_become_unused( before_sleep=before_sleep_log(_logger, logging.DEBUG), ): with attempt: - current_unused_volumes = await get_unused_dynamc_sidecar_volumes( + current_unused_volumes = await get_unused_dynamic_sidecar_volumes( self.docker ) @@ -142,6 +157,10 @@ async def _wait_for_service_volumes_to_become_unused( return service_volumes async def remove_service_volumes(self, node_id: NodeID) -> None: + """ + Cleanup after each sidecar was shut down removing all volumes it created with + a backup since it already did that + """ # bookkept volumes might not be up to date service_volumes = await self._wait_for_service_volumes_to_become_unused(node_id) _logger.debug( @@ -157,8 +176,12 @@ async def remove_service_volumes(self, node_id: NodeID) -> None: ) async def remove_all_volumes(self) -> None: + """ + Should be called by autoscaling to ensure no data is lost + If a volume is found it's data has to be backed up + """ # bookkept volumes might not be up to date - current_unused_volumes = await get_unused_dynamc_sidecar_volumes(self.docker) + current_unused_volumes = await get_unused_dynamic_sidecar_volumes(self.docker) with log_context(_logger, logging.INFO, "remove all volumes"): for volume in current_unused_volumes: diff --git a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/scheduler/_core/_events_utils.py b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/scheduler/_core/_events_utils.py index 
3f83d2e5f5c5..5427952ff607 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/scheduler/_core/_events_utils.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/scheduler/_core/_events_utils.py @@ -6,10 +6,9 @@ from common_library.json_serialization import json_loads from fastapi import FastAPI -from models_library.api_schemas_long_running_tasks.base import ProgressPercent from models_library.products import ProductName from models_library.projects_networks import ProjectsNetworks -from models_library.projects_nodes_io import NodeID, NodeIDStr +from models_library.projects_nodes_io import NodeID from models_library.rabbitmq_messages import InstrumentationRabbitMessage from models_library.rpc.webserver.auth.api_keys import generate_unique_api_key from models_library.service_settings_labels import SimcoreServiceLabels @@ -211,17 +210,11 @@ async def service_remove_sidecar_proxy_docker_networks_and_volumes( app: FastAPI, node_uuid: NodeID, swarm_stack_name: str, - set_were_state_and_outputs_saved: bool | None = None, ) -> None: scheduler_data: SchedulerData = _get_scheduler_data(app, node_uuid) rabbit_rpc_client: RabbitMQRPCClient = app.state.rabbitmq_rpc_client - if set_were_state_and_outputs_saved is not None: - scheduler_data.dynamic_sidecar.were_state_and_outputs_saved = True - - await task_progress.update( - message="removing dynamic sidecar stack", percent=ProgressPercent(0.1) - ) + await task_progress.update(message="removing dynamic sidecar stack", percent=0.1) await remove_dynamic_sidecar_stack( node_uuid=scheduler_data.node_uuid, @@ -235,42 +228,32 @@ async def service_remove_sidecar_proxy_docker_networks_and_volumes( node_id=scheduler_data.node_uuid, ) - await task_progress.update(message="removing network", percent=ProgressPercent(0.2)) + await task_progress.update(message="removing network", percent=0.2) await remove_dynamic_sidecar_network(scheduler_data.dynamic_sidecar_network_name) 
- if scheduler_data.dynamic_sidecar.were_state_and_outputs_saved: - if scheduler_data.dynamic_sidecar.docker_node_id is None: - _logger.warning( - "Skipped volume removal for %s, since a docker_node_id was not found.", - scheduler_data.node_uuid, - ) - else: - # Remove all dy-sidecar associated volumes from node - await task_progress.update( - message="removing volumes", percent=ProgressPercent(0.3) - ) - with log_context(_logger, logging.DEBUG, f"removing volumes '{node_uuid}'"): - try: - await remove_volumes_without_backup_for_service( - rabbit_rpc_client, - docker_node_id=scheduler_data.dynamic_sidecar.docker_node_id, - swarm_stack_name=swarm_stack_name, - node_id=scheduler_data.node_uuid, - ) - except ( - NoServiceVolumesFoundRPCError, - RemoteMethodNotRegisteredError, # happens when autoscaling node was removed - ) as e: - _logger.info("Could not remove volumes, because: '%s'", e) + if scheduler_data.dynamic_sidecar.docker_node_id: + # Remove all dy-sidecar associated volumes from node + await task_progress.update(message="removing volumes", percent=0.3) + with log_context(_logger, logging.DEBUG, f"removing volumes '{node_uuid}'"): + try: + await remove_volumes_without_backup_for_service( + rabbit_rpc_client, + docker_node_id=scheduler_data.dynamic_sidecar.docker_node_id, + swarm_stack_name=swarm_stack_name, + node_id=scheduler_data.node_uuid, + ) + except ( + NoServiceVolumesFoundRPCError, + RemoteMethodNotRegisteredError, # happens when autoscaling node was removed + ) as e: + _logger.info("Could not remove volumes, because: '%s'", e) _logger.debug( "Removed dynamic-sidecar services and crated container for '%s'", scheduler_data.service_name, ) - await task_progress.update( - message="removing project networks", percent=ProgressPercent(0.8) - ) + await task_progress.update(message="removing project networks", percent=0.8) used_projects_networks = await get_projects_networks_containers( project_id=scheduler_data.project_id ) @@ -290,9 +273,7 @@ async def 
service_remove_sidecar_proxy_docker_networks_and_volumes( await _cleanup_long_running_tasks(app, scheduler_data.run_id) - await task_progress.update( - message="finished removing resources", percent=ProgressPercent(1) - ) + await task_progress.update(message="finished removing resources", percent=1) async def _cleanup_long_running_tasks( @@ -452,7 +433,7 @@ async def attach_project_networks(app: FastAPI, scheduler_data: SchedulerData) - network_name, container_aliases, ) in projects_networks.networks_with_aliases.items(): - network_alias = container_aliases.get(NodeIDStr(scheduler_data.node_uuid)) + network_alias = container_aliases.get(f"{scheduler_data.node_uuid}") if network_alias is not None: await sidecars_client.attach_service_containers_to_project_network( dynamic_sidecar_endpoint=dynamic_sidecar_endpoint, From 97a9c80a03c2024daff1f20c9945d566cfa54eb8 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 12 Dec 2025 15:49:05 +0100 Subject: [PATCH 34/79] rename logic --- .../simcore-sdk/src/simcore_sdk/node_data/data_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py b/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py index e02c4e90a798..3d0e30e2796b 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py @@ -192,7 +192,7 @@ async def _stop_mount( mount_manager: RCloneMountManager, destination_path: Path, index: NonNegativeInt ) -> None: await mount_manager.wait_for_transfers_to_complete(destination_path, index) - await mount_manager.stop_mount(destination_path, index) + await mount_manager.ensure_unmounted(destination_path, index) async def push( # pylint: disable=too-many-arguments # noqa: PLR0913 @@ -291,7 +291,7 @@ async def _start_mount_if_required( user_id=user_id, s3_object=s3_object, store_id=SIMCORE_LOCATION ) - await mount_manager.start_mount( + await 
mount_manager.ensure_mounted( destination_path, index, node_id=node_id, From 824051409a579acd957f1f6a875a092e21e583d7 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 12 Dec 2025 16:10:56 +0100 Subject: [PATCH 35/79] refactored handlers --- .../src/simcore_sdk/node_data/data_manager.py | 8 +++ .../modules/long_running_tasks.py | 53 ++++++++++++------- 2 files changed, 41 insertions(+), 20 deletions(-) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py b/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py index 3d0e30e2796b..0f293da90225 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py @@ -19,6 +19,7 @@ from ..node_ports_common.file_io_utils import LogRedirectCB from ..node_ports_common.r_clone_mount import ( GetBindPathProtocol, + MountActivityProtocol, MountRemoteType, RCloneMountManager, ) @@ -279,6 +280,7 @@ async def _start_mount_if_required( destination_path: Path, index: NonNegativeInt, handler_get_bind_path: GetBindPathProtocol, + handler_mount_activity: MountActivityProtocol, *, use_r_clone_mount: bool, ) -> None: @@ -298,6 +300,7 @@ async def _start_mount_if_required( remote_type=MountRemoteType.S3, remote_path=s3_object, handler_get_bind_path=handler_get_bind_path, + handler_mount_activity=handler_mount_activity, ) @@ -316,6 +319,7 @@ async def pull( # pylint: disable=too-many-arguments # noqa: PLR0913 application_name: str, mount_manager: RCloneMountManager, handler_get_bind_path: GetBindPathProtocol, + handler_mount_activity: MountActivityProtocol, ) -> None: """restores the state folder""" @@ -360,6 +364,7 @@ async def pull( # pylint: disable=too-many-arguments # noqa: PLR0913 destination_path, index, handler_get_bind_path, + handler_mount_activity, use_r_clone_mount=use_r_clone_mount, ) return @@ -389,6 +394,7 @@ async def pull( # pylint: disable=too-many-arguments # noqa: PLR0913 destination_path, index, 
handler_get_bind_path, + handler_mount_activity, use_r_clone_mount=use_r_clone_mount, ) return @@ -410,6 +416,7 @@ async def pull( # pylint: disable=too-many-arguments # noqa: PLR0913 destination_path, index, handler_get_bind_path, + handler_mount_activity, use_r_clone_mount=use_r_clone_mount, ) else: @@ -433,6 +440,7 @@ async def pull( # pylint: disable=too-many-arguments # noqa: PLR0913 destination_path, index, handler_get_bind_path, + handler_mount_activity, use_r_clone_mount=use_r_clone_mount, ) _logger.debug("No content previously saved for '%s'", destination_path) diff --git a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py index 04ea6cdaa576..7963c7020152 100644 --- a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py +++ b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py @@ -19,6 +19,7 @@ from servicelib.progress_bar import ProgressBarData from servicelib.utils import logged_gather from simcore_sdk.node_data import data_manager +from simcore_sdk.node_ports_common.r_clone_mount import MountActivity from tenacity import retry from tenacity.before_sleep import before_sleep_log from tenacity.retry import retry_if_result @@ -345,6 +346,34 @@ def _get_legacy_state_with_dy_volumes_path( ) +async def _handler_get_bind_path( + settings: ApplicationSettings, mounted_volumes: MountedVolumes, path: Path +) -> dict: + not_dy_volume = path.relative_to(settings.DYNAMIC_SIDECAR_DY_VOLUMES_MOUNT_DIR) + matcher = f":/{not_dy_volume}" + + async for entry in mounted_volumes.iter_state_paths_to_docker_volumes( + settings.DY_SIDECAR_RUN_ID + ): + if entry.endswith(matcher): + mount_str = entry.replace(f"/{not_dy_volume}", f"{path}") + source, target = mount_str.split(":") + return { + "Type": "bind", + "Source": source, + "Target": target, + "BindOptions": 
{"Propagation": "rshared"}, + } + + msg = f"Could not resolve volume path for {path}" + raise RuntimeError(msg) + + +async def _handler_mount_activity(state_path: Path, activity: MountActivity) -> None: + # in the future this should go to the fornted + _logger.info("Mount activity for '%s': %s", state_path, activity) + + async def _restore_state_folder( app: FastAPI, *, @@ -354,25 +383,6 @@ async def _restore_state_folder( index: NonNegativeInt, mounted_volumes: MountedVolumes, ) -> None: - async def _resolve_volume_path(path: Path) -> dict: - not_dy_volume = path.relative_to(settings.DYNAMIC_SIDECAR_DY_VOLUMES_MOUNT_DIR) - matcher = f":/{not_dy_volume}" - - async for entry in mounted_volumes.iter_state_paths_to_docker_volumes( - settings.DY_SIDECAR_RUN_ID - ): - if entry.endswith(matcher): - mount_str = entry.replace(f"/{not_dy_volume}", f"{path}") - source, target = mount_str.split(":") - return { - "Type": "bind", - "Source": source, - "Target": target, - "BindOptions": {"Propagation": "rshared"}, - } - - msg = f"Could not resolve volume path for {path}" - raise RuntimeError(msg) assert settings.DY_SIDECAR_PRODUCT_NAME is not None # nosec await data_manager.pull( @@ -390,7 +400,10 @@ async def _resolve_volume_path(path: Path) -> dict: legacy_state=_get_legacy_state_with_dy_volumes_path(settings), application_name=f"{APP_NAME}-{settings.DY_SIDECAR_NODE_ID}", mount_manager=get_r_clone_mount_manager(app), - handler_get_bind_path=_resolve_volume_path, + handler_get_bind_path=functools.partial( + _handler_get_bind_path, settings, mounted_volumes + ), + handler_mount_activity=_handler_mount_activity, ) From 8dcaa69b9728f93c9d98c5f91052a736058f2b76 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 12 Dec 2025 16:11:15 +0100 Subject: [PATCH 36/79] refactor --- .../simcore_sdk/node_ports_common/r_clone_mount/__init__.py | 4 ++++ .../node_ports_common/r_clone_mount/_config_provider.py | 1 + 2 files changed, 5 insertions(+) diff --git 
a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/__init__.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/__init__.py
index 187621da2d39..49265d39269f 100644
--- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/__init__.py
+++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/__init__.py
@@ -1,6 +1,8 @@
 from ._config_provider import MountRemoteType
 from ._core import (
     GetBindPathProtocol,
+    MountActivity,
+    MountActivityProtocol,
     MountAlreadyStartedError,
     MountNotStartedError,
     RCloneMountManager,
@@ -8,6 +10,8 @@
 
 __all__: tuple[str, ...] = (
     "GetBindPathProtocol",
+    "MountActivity",
+    "MountActivityProtocol",
     "MountAlreadyStartedError",
     "MountNotStartedError",
     "MountRemoteType",
diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_config_provider.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_config_provider.py
index 7610234ed5a0..cc893b0e787d 100644
--- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_config_provider.py
+++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_config_provider.py
@@ -9,6 +9,7 @@
 
 class MountRemoteType(Enum):
     S3 = auto()
+    # NOTE: oauth authorization pattern needs to be set up for non S3 providers
 
 
 def get_config_content(

From 4f2d0d051525459e7060ffa56a565440fbf8dc08 Mon Sep 17 00:00:00 2001
From: Andrei Neagu
Date: Fri, 12 Dec 2025 16:31:02 +0100
Subject: [PATCH 37/79] vfs cache volume added

---
 .../modules/long_running_tasks.py | 56 ++++++++++++++-----
 .../modules/mounted_fs.py | 5 ++
 2 files changed, 47 insertions(+), 14 deletions(-)

diff --git a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py
index 7963c7020152..ee3c9ab75ed7 100644
--- 
a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py +++ b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py @@ -18,6 +18,7 @@ from servicelib.long_running_tasks.task import TaskProtocol, TaskRegistry from servicelib.progress_bar import ProgressBarData from servicelib.utils import logged_gather +from settings_library.r_clone import DEFAULT_VFS_CACHE_PATH from simcore_sdk.node_data import data_manager from simcore_sdk.node_ports_common.r_clone_mount import MountActivity from tenacity import retry @@ -346,27 +347,54 @@ def _get_legacy_state_with_dy_volumes_path( ) +_EXPECTED_BIND_PATHS_COUNT: Final[NonNegativeInt] = 2 + + async def _handler_get_bind_path( - settings: ApplicationSettings, mounted_volumes: MountedVolumes, path: Path -) -> dict: - not_dy_volume = path.relative_to(settings.DYNAMIC_SIDECAR_DY_VOLUMES_MOUNT_DIR) - matcher = f":/{not_dy_volume}" + settings: ApplicationSettings, mounted_volumes: MountedVolumes, state_path: Path +) -> list: + vfs_cache_path = f"{mounted_volumes.vfs_cache_path}" + vfs_source, vfs_target = vfs_cache_path.replace( + f"/{DEFAULT_VFS_CACHE_PATH}", + f"{settings.DYNAMIC_SIDECAR_DY_VOLUMES_MOUNT_DIR}{DEFAULT_VFS_CACHE_PATH}", + ).split(":") + + bind_paths: list[dict] = [ + # TODO: verify this is correct, path might be slightly off + { + "Type": "bind", + "Source": vfs_source, + "Target": vfs_target, + "BindOptions": {"Propagation": "rshared"}, + } + ] + + state_path_no_dy_volume = state_path.relative_to( + settings.DYNAMIC_SIDECAR_DY_VOLUMES_MOUNT_DIR + ) + matcher = f":/{state_path_no_dy_volume}" async for entry in mounted_volumes.iter_state_paths_to_docker_volumes( settings.DY_SIDECAR_RUN_ID ): if entry.endswith(matcher): - mount_str = entry.replace(f"/{not_dy_volume}", f"{path}") + mount_str = entry.replace(f"/{state_path_no_dy_volume}", f"{state_path}") source, target = mount_str.split(":") - return { - "Type": "bind", - "Source": source, - 
"Target": target, - "BindOptions": {"Propagation": "rshared"}, - } + bind_paths.append( + { + "Type": "bind", + "Source": source, + "Target": target, + "BindOptions": {"Propagation": "rshared"}, + } + ) + break + + if len(bind_paths) != _EXPECTED_BIND_PATHS_COUNT: + msg = f"Could not resolve volume path for {state_path}" + raise RuntimeError(msg) - msg = f"Could not resolve volume path for {path}" - raise RuntimeError(msg) + return bind_paths async def _handler_mount_activity(state_path: Path, activity: MountActivity) -> None: @@ -400,7 +428,7 @@ async def _restore_state_folder( legacy_state=_get_legacy_state_with_dy_volumes_path(settings), application_name=f"{APP_NAME}-{settings.DY_SIDECAR_NODE_ID}", mount_manager=get_r_clone_mount_manager(app), - handler_get_bind_path=functools.partial( + handler_get_bind_paths=functools.partial( _handler_get_bind_path, settings, mounted_volumes ), handler_mount_activity=_handler_mount_activity, diff --git a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/mounted_fs.py b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/mounted_fs.py index eeedd4d16173..3654ebaa2e49 100644 --- a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/mounted_fs.py +++ b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/mounted_fs.py @@ -7,6 +7,7 @@ from models_library.projects_nodes_io import NodeID from models_library.services import ServiceRunID from servicelib.docker_constants import PREFIX_DYNAMIC_SIDECAR_VOLUMES +from settings_library.r_clone import DEFAULT_VFS_CACHE_PATH from ..core.docker_utils import get_volume_by_label from ..core.settings import ApplicationSettings @@ -97,6 +98,10 @@ def disk_inputs_path(self) -> Path: def disk_outputs_path(self) -> Path: return _ensure_path(self._dy_volumes / self.outputs_path.relative_to("/")) + @cached_property + def vfs_cache_path(self) -> Path: + return _ensure_path(self._dy_volumes / DEFAULT_VFS_CACHE_PATH.relative_to("/")) + 
def disk_state_paths_iter(self) -> Iterator[Path]: for state_path in self.state_paths: yield _ensure_path(self._dy_volumes / state_path.relative_to("/")) From 08c7478c5d501c5785022cb9500d85b619f91003 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 12 Dec 2025 16:33:15 +0100 Subject: [PATCH 38/79] palce --- .../src/simcore_sdk/node_data/data_manager.py | 16 +-- .../r_clone_mount/__init__.py | 4 +- .../node_ports_common/r_clone_mount/_core.py | 119 +++++++++--------- 3 files changed, 72 insertions(+), 67 deletions(-) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py b/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py index 0f293da90225..0a829ce609b8 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py @@ -18,7 +18,7 @@ from ..node_ports_common.dbmanager import DBManager from ..node_ports_common.file_io_utils import LogRedirectCB from ..node_ports_common.r_clone_mount import ( - GetBindPathProtocol, + GetBindPathsProtocol, MountActivityProtocol, MountRemoteType, RCloneMountManager, @@ -279,7 +279,7 @@ async def _start_mount_if_required( node_id: NodeID, destination_path: Path, index: NonNegativeInt, - handler_get_bind_path: GetBindPathProtocol, + handler_get_bind_paths: GetBindPathsProtocol, handler_mount_activity: MountActivityProtocol, *, use_r_clone_mount: bool, @@ -299,7 +299,7 @@ async def _start_mount_if_required( node_id=node_id, remote_type=MountRemoteType.S3, remote_path=s3_object, - handler_get_bind_path=handler_get_bind_path, + handler_get_bind_paths=handler_get_bind_paths, handler_mount_activity=handler_mount_activity, ) @@ -318,7 +318,7 @@ async def pull( # pylint: disable=too-many-arguments # noqa: PLR0913 legacy_state: LegacyState | None, application_name: str, mount_manager: RCloneMountManager, - handler_get_bind_path: GetBindPathProtocol, + handler_get_bind_paths: GetBindPathsProtocol, handler_mount_activity: 
MountActivityProtocol, ) -> None: """restores the state folder""" @@ -363,7 +363,7 @@ async def pull( # pylint: disable=too-many-arguments # noqa: PLR0913 node_uuid, destination_path, index, - handler_get_bind_path, + handler_get_bind_paths, handler_mount_activity, use_r_clone_mount=use_r_clone_mount, ) @@ -393,7 +393,7 @@ async def pull( # pylint: disable=too-many-arguments # noqa: PLR0913 node_uuid, destination_path, index, - handler_get_bind_path, + handler_get_bind_paths, handler_mount_activity, use_r_clone_mount=use_r_clone_mount, ) @@ -415,7 +415,7 @@ async def pull( # pylint: disable=too-many-arguments # noqa: PLR0913 node_uuid, destination_path, index, - handler_get_bind_path, + handler_get_bind_paths, handler_mount_activity, use_r_clone_mount=use_r_clone_mount, ) @@ -439,7 +439,7 @@ async def pull( # pylint: disable=too-many-arguments # noqa: PLR0913 node_uuid, destination_path, index, - handler_get_bind_path, + handler_get_bind_paths, handler_mount_activity, use_r_clone_mount=use_r_clone_mount, ) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/__init__.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/__init__.py index 49265d39269f..7b9238522c46 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/__init__.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/__init__.py @@ -1,6 +1,6 @@ from ._config_provider import MountRemoteType from ._core import ( - GetBindPathProtocol, + GetBindPathsProtocol, MountActivity, MountActivityProtocol, MountAlreadyStartedError, @@ -9,7 +9,7 @@ ) __all__: tuple[str, ...] 
= ( - "GetBindPathProtocol", + "GetBindPathsProtocol", "MountActivity", "MountActivityProtocol", "MountAlreadyStartedError", diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py index e4990ee1a048..22bb7c178551 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py @@ -38,7 +38,7 @@ _MAX_WAIT_RC_HTTP_INTERFACE_READY: Final[timedelta] = timedelta(seconds=10) _DEFAULT_UPDATE_INTERVAL: Final[timedelta] = timedelta(seconds=1) -_DEFAULT_R_CLONE_CLIENT_REQUEST_TIMEOUT: Final[timedelta] = timedelta(seconds=5) +_DEFAULT_R_CLONE_CLIENT_REQUEST_TIMEOUT: Final[timedelta] = timedelta(seconds=20) _DEFAULT_MOUNT_ACTIVITY_UPDATE_INTERVAL: Final[timedelta] = timedelta(seconds=5) @@ -80,39 +80,47 @@ def _get_self_container_id() -> str: return os.environ["HOSTNAME"] -class GetBindPathProtocol(Protocol): - async def __call__(self, path: Path) -> dict: ... +class MountActivity(BaseModel): + transferring: dict[str, ProgressReport] + queued: list[str] + + +class GetBindPathsProtocol(Protocol): + async def __call__(self, state_path: Path) -> list: ... + + +class MountActivityProtocol(Protocol): + async def __call__(self, state_path: Path, activity: MountActivity) -> None: ... 
class ContainerManager: def __init__( self, + mount_settings: RCloneMountSettings, node_id: NodeID, - r_clone_version: str, remote_control_port: PortInt, local_mount_path: Path, index: NonNegativeInt, r_clone_config_content: str, remote_path: str, - vfs_cache_path: Path, rc_user: str, rc_password: str, *, - handler_get_bind_path: GetBindPathProtocol, + handler_get_bind_paths: GetBindPathsProtocol, ) -> None: + self.mount_settings = mount_settings self.node_id = node_id - self.r_clone_version = r_clone_version self.remote_control_port = remote_control_port self.local_mount_path = local_mount_path self.index = index self.r_clone_config_content = r_clone_config_content - self.handler_get_bind_path = handler_get_bind_path + self.handler_get_bind_paths = handler_get_bind_paths self.command = _get_rclone_mount_command( + mount_settings=mount_settings, r_clone_config_content=r_clone_config_content, remote_path=remote_path, local_mount_path=self.local_mount_path, - vfs_cache_path=vfs_cache_path, rc_addr=f"0.0.0.0:{remote_control_port}", rc_user=rc_user, rc_password=rc_password, @@ -171,18 +179,13 @@ async def start(self): # create rclone container attached to the network self._r_clone_container = await self._aiodocker_client.containers.run( config={ - "Image": f"rclone/rclone:{self.r_clone_version}", - "Entrypoint": [ - "/bin/sh", - "-c", - f"{self.command} && sleep 100000 || sleep 100000000 ", - ], + "Image": f"rclone/rclone:{self.mount_settings.R_CLONE_VERSION}", + "Entrypoint": ["/bin/sh", "-c", f"{self.command}"], "ExposedPorts": {f"{self.remote_control_port}/tcp": {}}, "HostConfig": { "NetworkMode": self._r_clone_network_name, "Binds": [], - # TODO: mount the VFS cache directory somewhere to have better performance - "Mounts": [await self.handler_get_bind_path(self.local_mount_path)], + "Mounts": await self.handler_get_bind_paths(self.local_mount_path), "Devices": [ { "PathOnHost": "/dev/fuse", @@ -225,7 +228,7 @@ def _get_mount_id(local_mount_path: Path, index: 
NonNegativeInt) -> _MountId: _R_CLONE_MOUNT_TEMPLATE: Final[str] = dedent( """ -cat < /tmp/rclone.conf +cat < {r_clone_config_path} {r_clone_config_content} EOF @@ -235,10 +238,10 @@ def _get_mount_id(local_mount_path: Path, index: NonNegativeInt) -> _MountId: def _get_rclone_mount_command( + mount_settings: RCloneMountSettings, r_clone_config_content: str, remote_path: StorageFileID, local_mount_path: Path, - vfs_cache_path: Path, rc_addr: str, rc_user: str, rc_password: str, @@ -248,18 +251,24 @@ def _get_rclone_mount_command( [ "rclone", "--config", - "/tmp/rclone.conf", # noqa: S108 + f"{mount_settings.R_CLONE_CONFIG_FILE_PATH}", "-vv", "mount", f"{CONFIG_KEY}:{escaped_remote_path}", f"{local_mount_path}", "--vfs-cache-mode full", "--vfs-write-back", - "5s", # write-back delay TODO: could be part of the settings? - "--vfs-fast-fingerprint", # recommended for s3 backend TODO: could be part of the settings? - "--no-modtime", # don't read/write the modification time TODO: could be part of the settings? 
+ mount_settings.R_CLONE_MOUNT_VFS_WRITE_BACK, + "--vfs-cache-max-size", + mount_settings.R_CLONE_MOUNT_VFS_CACHE_MAX_SIZE, + ( + "--vfs-fast-fingerprint" + if mount_settings.R_CLONE_MOUNT_VFS_CACHE_MAX_SIZE + else "" + ), + ("--no-modtime" if mount_settings.R_CLONE_MOUNT_NO_MODTIME else ""), "--cache-dir", - f"{vfs_cache_path}", + f"{mount_settings.R_CLONE_MOUNT_VFS_CACHE_PATH}", "--rc", f"--rc-addr={rc_addr}", "--rc-enable-metrics", @@ -270,16 +279,12 @@ def _get_rclone_mount_command( ] ) return _R_CLONE_MOUNT_TEMPLATE.format( + r_clone_config_path=mount_settings.R_CLONE_CONFIG_FILE_PATH, r_clone_config_content=r_clone_config_content, r_clone_command=r_clone_command, ) -class MountActivity(BaseModel): - transferring: dict[str, ProgressReport] - queued: list[str] - - class RCloneRCInterfaceClient: def __init__( self, @@ -431,8 +436,8 @@ def __init__( remote_path: StorageFileID, local_mount_path: Path, index: NonNegativeInt, - vfs_cache_path: Path, - handler_get_bind_path: GetBindPathProtocol, + handler_get_bind_paths: GetBindPathsProtocol, + handler_mount_activity: MountActivityProtocol, mount_activity_update_interval: timedelta = _DEFAULT_MOUNT_ACTIVITY_UPDATE_INTERVAL, ) -> None: self.node_id = node_id @@ -442,10 +447,10 @@ def __init__( self.remote_path = remote_path self.local_mount_path = local_mount_path self.index = index - self.vfs_cache_path = vfs_cache_path self.rc_user = f"{uuid4()}" self.rc_password = f"{uuid4()}" - self.handler_get_bind_path = handler_get_bind_path + self.handler_get_bind_paths = handler_get_bind_paths + self.handler_mount_activity = handler_mount_activity self._last_mount_activity: MountActivity | None = None self._last_mount_activity_update: datetime = datetime.fromtimestamp(0, UTC) @@ -473,12 +478,7 @@ async def _progress_handler(self, mount_activity: MountActivity) -> None: self._last_mount_activity = mount_activity self._last_mount_activity_update = now - # NOTE: this could also be useful if pushed to the UI - _logger.info( - 
"Activity for '%s': %s", - self.local_mount_path, - self._last_mount_activity, - ) + await self.handler_mount_activity(self.local_mount_path, mount_activity) async def teardown(self) -> None: await self.stop_mount() @@ -499,17 +499,16 @@ async def start_mount(self) -> None: raise RuntimeError(msg) self._container_manager = ContainerManager( + mount_settings=self.r_clone_settings.R_CLONE_MOUNT_SETTINGS, node_id=self.node_id, - r_clone_version=self.r_clone_settings.R_CLONE_MOUNT_SETTINGS.R_CLONE_VERSION, remote_control_port=self.rc_port, local_mount_path=self.local_mount_path, index=self.index, r_clone_config_content=r_clone_config_content, remote_path=f"{self.r_clone_settings.R_CLONE_S3.S3_BUCKET_NAME}/{self.remote_path}", - vfs_cache_path=self.vfs_cache_path, rc_user=self.rc_user, rc_password=self.rc_password, - handler_get_bind_path=self.handler_get_bind_path, + handler_get_bind_paths=self.handler_get_bind_paths, ) self._rc_interface: RCloneRCInterfaceClient | None = RCloneRCInterfaceClient( @@ -542,13 +541,11 @@ async def stop_mount(self) -> None: class RCloneMountManager: def __init__(self, r_clone_settings: RCloneSettings) -> None: self.r_clone_settings = r_clone_settings - self._common_vfs_cache_path = ( - self.r_clone_settings.R_CLONE_MOUNT_SETTINGS.R_CLONE_MOUNT_VFS_CACHE_PATH - ) + # TODO: make this stateless and go via aiodocker to avoid issues when restartign the container self._started_mounts: dict[_MountId, TrackedMount] = {} - async def start_mount( + async def ensure_mounted( self, local_mount_path: Path, index: NonNegativeInt, @@ -556,9 +553,10 @@ async def start_mount( node_id: NodeID, remote_type: MountRemoteType, remote_path: StorageFileID, - handler_get_bind_path: GetBindPathProtocol, - vfs_cache_path_overwrite: Path | None = None, + handler_get_bind_paths: GetBindPathsProtocol, + handler_mount_activity: MountActivityProtocol, ) -> None: + # TODO: rename to ENSURE MOUNT EXISTS with log_context( _logger, logging.INFO, @@ -570,11 +568,6 @@ async 
def start_mount( tracked_mount = self._started_mounts[mount_id] raise MountAlreadyStartedError(local_mount_path=local_mount_path) - vfs_cache_path = ( - vfs_cache_path_overwrite or self._common_vfs_cache_path - ) / mount_id - vfs_cache_path.mkdir(parents=True, exist_ok=True) - free_port = await asyncio.get_running_loop().run_in_executor( None, unused_port ) @@ -587,8 +580,8 @@ async def start_mount( remote_path=remote_path, local_mount_path=local_mount_path, index=index, - vfs_cache_path=vfs_cache_path, - handler_get_bind_path=handler_get_bind_path, + handler_get_bind_paths=handler_get_bind_paths, + handler_mount_activity=handler_mount_activity, ) await tracked_mount.start_mount() @@ -597,6 +590,8 @@ async def start_mount( async def wait_for_transfers_to_complete( self, local_mount_path: Path, index: NonNegativeInt ) -> None: + # if mount is not present it just returns immediately + with log_context( _logger, logging.INFO, @@ -613,10 +608,14 @@ async def wait_for_transfers_to_complete( async def was_mount_started( self, local_mount_path: Path, index: NonNegativeInt ) -> bool: + # checks if mount is present or not mount_id = _get_mount_id(local_mount_path, index) return mount_id in self._started_mounts - async def stop_mount(self, local_mount_path: Path, index: NonNegativeInt) -> None: + async def ensure_unmounted( + self, local_mount_path: Path, index: NonNegativeInt + ) -> None: + # TODO: rename to ENSURE mount does not exist with log_context( _logger, logging.INFO, f"unmounting {local_mount_path=}", log_duration=True ): @@ -629,6 +628,7 @@ async def stop_mount(self, local_mount_path: Path, index: NonNegativeInt) -> Non await tracked_mount.stop_mount() async def setup(self) -> None: + # TODO: add a process which ensures that the mounts keep running -> register some local data to restart the mount process if it dies (even on accident manually) pass async def teardown(self) -> None: @@ -639,4 +639,9 @@ async def teardown(self) -> None: self._started_mounts.clear() 
-# TODO: oauth atuthorization pattern needs to be setup for non S3 providers +# NOTES: +# There are multiple layers in place here +# - docker api to create/remove containers and networks +# - rclone container management +# - rclone process status management via its rc http interface +# - mounts management From 9ab9a4a9b8da0c8f0c0375128fedc9fe9f1e51bd Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 12 Dec 2025 16:37:58 +0100 Subject: [PATCH 39/79] reordered --- .../node_ports_common/r_clone_mount/_core.py | 139 +++++++++--------- 1 file changed, 69 insertions(+), 70 deletions(-) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py index 22bb7c178551..55bc061ce72c 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py @@ -48,6 +48,74 @@ type _MountId = str +_R_CLONE_MOUNT_TEMPLATE: Final[str] = dedent( + """ +cat < {r_clone_config_path} +{r_clone_config_content} +EOF + +{r_clone_command} +""" +) + + +def _get_rclone_mount_command( + mount_settings: RCloneMountSettings, + r_clone_config_content: str, + remote_path: StorageFileID, + local_mount_path: Path, + remote_control_port: PortInt, + rc_user: str, + rc_password: str, +) -> str: + escaped_remote_path = f"{remote_path}".lstrip("/") + r_clone_command = " ".join( + [ + "rclone", + "--config", + f"{mount_settings.R_CLONE_CONFIG_FILE_PATH}", + "-vv", + "mount", + f"{CONFIG_KEY}:{escaped_remote_path}", + f"{local_mount_path}", + "--vfs-cache-mode full", + "--vfs-write-back", + mount_settings.R_CLONE_MOUNT_VFS_WRITE_BACK, + "--vfs-cache-max-size", + mount_settings.R_CLONE_MOUNT_VFS_CACHE_MAX_SIZE, + ( + "--vfs-fast-fingerprint" + if mount_settings.R_CLONE_MOUNT_VFS_CACHE_MAX_SIZE + else "" + ), + ("--no-modtime" if mount_settings.R_CLONE_MOUNT_NO_MODTIME else ""), + "--cache-dir", 
+ f"{mount_settings.R_CLONE_MOUNT_VFS_CACHE_PATH}", + "--rc", + f"--rc-addr=0.0.0.0:{remote_control_port}", + "--rc-enable-metrics", + f"--rc-user='{rc_user}'", + f"--rc-pass='{rc_password}'", + "--allow-non-empty", + "--allow-other", + ] + ) + return _R_CLONE_MOUNT_TEMPLATE.format( + r_clone_config_path=mount_settings.R_CLONE_CONFIG_FILE_PATH, + r_clone_config_content=r_clone_config_content, + r_clone_command=r_clone_command, + ) + + +def _get_self_container_id() -> str: + # in docker the hostname is the container id + return os.environ["HOSTNAME"] + + +def _get_mount_id(local_mount_path: Path, index: NonNegativeInt) -> _MountId: + # unique reproducible id for this mount + return f"{index}{local_mount_path}".replace("/", "_")[::-1] + class _BaseRcloneMountError(OsparcErrorMixin, RuntimeError): pass @@ -75,11 +143,6 @@ class MountNotStartedError(_BaseRcloneMountError): msg_template: str = "Mount not started for local path='{local_mount_path}'" -def _get_self_container_id() -> str: - # in docker the hostname is the container id - return os.environ["HOSTNAME"] - - class MountActivity(BaseModel): transferring: dict[str, ProgressReport] queued: list[str] @@ -121,7 +184,7 @@ def __init__( r_clone_config_content=r_clone_config_content, remote_path=remote_path, local_mount_path=self.local_mount_path, - rc_addr=f"0.0.0.0:{remote_control_port}", + remote_control_port=remote_control_port, rc_user=rc_user, rc_password=rc_password, ) @@ -221,70 +284,6 @@ async def stop(self): await self._cleanup_stack.aclose() -def _get_mount_id(local_mount_path: Path, index: NonNegativeInt) -> _MountId: - # unique reproducible id for this mount - return f"{index}{local_mount_path}".replace("/", "_")[::-1] - - -_R_CLONE_MOUNT_TEMPLATE: Final[str] = dedent( - """ -cat < {r_clone_config_path} -{r_clone_config_content} -EOF - -{r_clone_command} -""" -) - - -def _get_rclone_mount_command( - mount_settings: RCloneMountSettings, - r_clone_config_content: str, - remote_path: StorageFileID, - 
local_mount_path: Path, - rc_addr: str, - rc_user: str, - rc_password: str, -) -> str: - escaped_remote_path = f"{remote_path}".lstrip("/") - r_clone_command = " ".join( - [ - "rclone", - "--config", - f"{mount_settings.R_CLONE_CONFIG_FILE_PATH}", - "-vv", - "mount", - f"{CONFIG_KEY}:{escaped_remote_path}", - f"{local_mount_path}", - "--vfs-cache-mode full", - "--vfs-write-back", - mount_settings.R_CLONE_MOUNT_VFS_WRITE_BACK, - "--vfs-cache-max-size", - mount_settings.R_CLONE_MOUNT_VFS_CACHE_MAX_SIZE, - ( - "--vfs-fast-fingerprint" - if mount_settings.R_CLONE_MOUNT_VFS_CACHE_MAX_SIZE - else "" - ), - ("--no-modtime" if mount_settings.R_CLONE_MOUNT_NO_MODTIME else ""), - "--cache-dir", - f"{mount_settings.R_CLONE_MOUNT_VFS_CACHE_PATH}", - "--rc", - f"--rc-addr={rc_addr}", - "--rc-enable-metrics", - f"--rc-user='{rc_user}'", - f"--rc-pass='{rc_password}'", - "--allow-non-empty", - "--allow-other", - ] - ) - return _R_CLONE_MOUNT_TEMPLATE.format( - r_clone_config_path=mount_settings.R_CLONE_CONFIG_FILE_PATH, - r_clone_config_content=r_clone_config_content, - r_clone_command=r_clone_command, - ) - - class RCloneRCInterfaceClient: def __init__( self, From ee1797ff37f114c8755646c37cc9fd05e95c18fb Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Mon, 15 Dec 2025 09:03:42 +0100 Subject: [PATCH 40/79] renamed properly --- .../services/docker_utils.py | 4 +-- .../tests/unit/test_services_docker_utils.py | 8 +++--- .../unit/test_services_volumes_manager.py | 28 +++++++++---------- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/services/agent/src/simcore_service_agent/services/docker_utils.py b/services/agent/src/simcore_service_agent/services/docker_utils.py index 1390a5b12df2..ef41a263172b 100644 --- a/services/agent/src/simcore_service_agent/services/docker_utils.py +++ b/services/agent/src/simcore_service_agent/services/docker_utils.py @@ -34,12 +34,12 @@ def _reverse_string(to_reverse: str) -> str: def _does_volume_require_backup(volume_name: str) -> 
bool: # from `dyv_1726228407_891aa1a7-eb31-459f-8aed-8c902f5f5fb0_dd84f39e-7154-4a13-ba1d-50068d723104_stupni_www_` - # retruns `stupni_www_` + # returns `stupni_www_` inverse_name_part = volume_name[CHARS_IN_VOLUME_NAME_BEFORE_DIR_NAME:] return not inverse_name_part.startswith(_VOLUMES_NOT_TO_BACKUP) -async def get_unused_dynamc_sidecar_volumes(docker: Docker) -> set[str]: +async def get_unused_dynamic_sidecar_volumes(docker: Docker) -> set[str]: """Returns all volumes unused by sidecars""" volumes = await docker.volumes.list() all_volumes: set[str] = {volume["Name"] for volume in volumes["Volumes"]} diff --git a/services/agent/tests/unit/test_services_docker_utils.py b/services/agent/tests/unit/test_services_docker_utils.py index f4a19c9b9aa0..c6c1e4f59f33 100644 --- a/services/agent/tests/unit/test_services_docker_utils.py +++ b/services/agent/tests/unit/test_services_docker_utils.py @@ -17,7 +17,7 @@ _VOLUMES_NOT_TO_BACKUP, _does_volume_require_backup, _reverse_string, - get_unused_dynamc_sidecar_volumes, + get_unused_dynamic_sidecar_volumes, get_volume_details, remove_volume, ) @@ -78,7 +78,7 @@ async def test_doclker_utils_workflow( ) created_volumes.update(created_volume) - volumes = await get_unused_dynamc_sidecar_volumes(volumes_manager_docker_client) + volumes = await get_unused_dynamic_sidecar_volumes(volumes_manager_docker_client) assert volumes == created_volumes, ( "Most likely you have a dirty working state, please check " "that there are no previous docker volumes named `dyv_...` " @@ -114,12 +114,12 @@ async def test_doclker_utils_workflow( count_vloumes_to_backup if requires_backup else 0 ) - volumes = await get_unused_dynamc_sidecar_volumes(volumes_manager_docker_client) + volumes = await get_unused_dynamic_sidecar_volumes(volumes_manager_docker_client) assert len(volumes) == 0 @pytest.mark.parametrize("requires_backup", [True, False]) -async def test_remove_misisng_volume_does_not_raise_error( +async def 
test_remove_missing_volume_does_not_raise_error( requires_backup: bool, initialized_app: FastAPI, volumes_manager_docker_client: Docker, diff --git a/services/agent/tests/unit/test_services_volumes_manager.py b/services/agent/tests/unit/test_services_volumes_manager.py index 5fae32710dfe..17d190961bb3 100644 --- a/services/agent/tests/unit/test_services_volumes_manager.py +++ b/services/agent/tests/unit/test_services_volumes_manager.py @@ -43,7 +43,7 @@ def add_unused_volumes_for_service(self, node_id: NodeID) -> None: def remove_volume(self, volume_name: str) -> None: self.volumes.remove(volume_name) - def get_unused_dynamc_sidecar_volumes(self) -> set[str]: + def get_unused_dynamic_sidecar_volumes(self) -> set[str]: return deepcopy(self.volumes) @@ -58,8 +58,8 @@ async def _remove_volume( ) -> None: proxy.remove_volume(volume_name) - async def _get_unused_dynamc_sidecar_volumes(app: FastAPI) -> set[str]: - return proxy.get_unused_dynamc_sidecar_volumes() + async def _get_unused_dynamic_sidecar_volumes(app: FastAPI) -> set[str]: + return proxy.get_unused_dynamic_sidecar_volumes() mocker.patch( "simcore_service_agent.services.volumes_manager.remove_volume", @@ -67,8 +67,8 @@ async def _get_unused_dynamc_sidecar_volumes(app: FastAPI) -> set[str]: ) mocker.patch( - "simcore_service_agent.services.volumes_manager.get_unused_dynamc_sidecar_volumes", - side_effect=_get_unused_dynamc_sidecar_volumes, + "simcore_service_agent.services.volumes_manager.get_unused_dynamic_sidecar_volumes", + side_effect=_get_unused_dynamic_sidecar_volumes, ) return proxy @@ -105,13 +105,13 @@ async def test_volumes_manager_remove_all_volumes( mock_docker_utils.add_unused_volumes_for_service(uuid4()) assert spy_remove_volume.call_count == 0 assert ( - len(mock_docker_utils.get_unused_dynamc_sidecar_volumes()) + len(mock_docker_utils.get_unused_dynamic_sidecar_volumes()) == len(VOLUMES_TO_CREATE) * service_count ) await volumes_manager.remove_all_volumes() assert spy_remove_volume.call_count 
== len(VOLUMES_TO_CREATE) * service_count - assert len(mock_docker_utils.get_unused_dynamc_sidecar_volumes()) == 0 + assert len(mock_docker_utils.get_unused_dynamic_sidecar_volumes()) == 0 async def test_volumes_manager_remove_service_volumes( @@ -121,22 +121,22 @@ async def test_volumes_manager_remove_service_volumes( ): assert spy_remove_volume.call_count == 0 mock_docker_utils.add_unused_volumes_for_service(uuid4()) - node_id_to_remvoe = uuid4() - mock_docker_utils.add_unused_volumes_for_service(node_id_to_remvoe) + node_id_to_remove = uuid4() + mock_docker_utils.add_unused_volumes_for_service(node_id_to_remove) assert spy_remove_volume.call_count == 0 assert ( - len(mock_docker_utils.get_unused_dynamc_sidecar_volumes()) + len(mock_docker_utils.get_unused_dynamic_sidecar_volumes()) == len(VOLUMES_TO_CREATE) * 2 ) - await volumes_manager.remove_service_volumes(node_id_to_remvoe) + await volumes_manager.remove_service_volumes(node_id_to_remove) assert spy_remove_volume.call_count == len(VOLUMES_TO_CREATE) - unused_volumes = mock_docker_utils.get_unused_dynamc_sidecar_volumes() + unused_volumes = mock_docker_utils.get_unused_dynamic_sidecar_volumes() assert len(unused_volumes) == len(VOLUMES_TO_CREATE) for volume_name in unused_volumes: - assert f"{node_id_to_remvoe}" not in volume_name + assert f"{node_id_to_remove}" not in volume_name @pytest.fixture @@ -184,4 +184,4 @@ async def _run_volumes_clennup() -> None: with attempt: await _run_volumes_clennup() assert spy_remove_volume.call_count == len(VOLUMES_TO_CREATE) - assert len(mock_docker_utils.get_unused_dynamc_sidecar_volumes()) == 0 + assert len(mock_docker_utils.get_unused_dynamic_sidecar_volumes()) == 0 From 80c4681a99cc72e70d2a0df6217dab23283c1580 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Mon, 15 Dec 2025 10:50:15 +0100 Subject: [PATCH 41/79] fixed wrong path --- packages/settings-library/src/settings_library/r_clone.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git 
a/packages/settings-library/src/settings_library/r_clone.py b/packages/settings-library/src/settings_library/r_clone.py index 1587a437b21c..c5f9babfc2a1 100644 --- a/packages/settings-library/src/settings_library/r_clone.py +++ b/packages/settings-library/src/settings_library/r_clone.py @@ -49,7 +49,9 @@ class RCloneMountSettings(BaseCustomSettings): Field( description="path inside the container where the rclone config file is located", ), - ] = Path("/tmo/rclone.conf") + ] = Path( + "/tmp/rclone.conf" # noqa: S108 + ) # CLI command `rclone mount` From 9fd8e06c5eb4bb1e0246a865b689a28c5ef522db Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Mon, 15 Dec 2025 10:51:04 +0100 Subject: [PATCH 42/79] added required paths --- .../modules/mounted_fs.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/mounted_fs.py b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/mounted_fs.py index 3654ebaa2e49..a01fe80067ad 100644 --- a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/mounted_fs.py +++ b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/mounted_fs.py @@ -74,6 +74,13 @@ def volume_name_outputs(self) -> str: f"_{_name_from_full_path(self.outputs_path)[::-1]}" ) + @cached_property + def volume_name_vfs_cache(self) -> str: + return ( + f"{PREFIX_DYNAMIC_SIDECAR_VOLUMES}_{self.service_run_id}_{self.node_id}" + f"_{_name_from_full_path(DEFAULT_VFS_CACHE_PATH)[::-1]}" + ) + @cached_property def volume_user_preferences(self) -> str | None: if self.user_preferences_path is None: @@ -141,6 +148,12 @@ async def get_outputs_docker_volume(self, service_run_id: ServiceRunID) -> str: ) return f"{bind_path}:{self.outputs_path}" + async def get_vfs_cache_docker_volume(self, service_run_id: ServiceRunID) -> str: + bind_path: Path = await self._get_bind_path_from_label( + self.volume_name_vfs_cache, service_run_id + ) + return 
f"{bind_path}:{self.vfs_cache_path}" + async def get_user_preferences_path_volume( self, service_run_id: ServiceRunID ) -> str | None: From 3d19f28fe61a09b545e148da4d597d9d342d64db Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Mon, 15 Dec 2025 10:51:24 +0100 Subject: [PATCH 43/79] fixed broken mount point --- .../modules/long_running_tasks.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py index ee3c9ab75ed7..b0682bfb241d 100644 --- a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py +++ b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py @@ -18,7 +18,6 @@ from servicelib.long_running_tasks.task import TaskProtocol, TaskRegistry from servicelib.progress_bar import ProgressBarData from servicelib.utils import logged_gather -from settings_library.r_clone import DEFAULT_VFS_CACHE_PATH from simcore_sdk.node_data import data_manager from simcore_sdk.node_ports_common.r_clone_mount import MountActivity from tenacity import retry @@ -353,14 +352,17 @@ def _get_legacy_state_with_dy_volumes_path( async def _handler_get_bind_path( settings: ApplicationSettings, mounted_volumes: MountedVolumes, state_path: Path ) -> list: - vfs_cache_path = f"{mounted_volumes.vfs_cache_path}" - vfs_source, vfs_target = vfs_cache_path.replace( - f"/{DEFAULT_VFS_CACHE_PATH}", - f"{settings.DYNAMIC_SIDECAR_DY_VOLUMES_MOUNT_DIR}{DEFAULT_VFS_CACHE_PATH}", + vfs_cache_path = await mounted_volumes.get_vfs_cache_docker_volume( + settings.DY_SIDECAR_RUN_ID + ) + + vfs_source, vfs_target = ( + f"{vfs_cache_path}".replace( + f"{settings.DYNAMIC_SIDECAR_DY_VOLUMES_MOUNT_DIR}", "" + ) ).split(":") bind_paths: list[dict] = [ - # TODO: verify this is correct, path might be slightly off { "Type": "bind", 
"Source": vfs_source, From d47e315f050be01aa26e14e0e8628271a0181fe5 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Mon, 15 Dec 2025 15:24:57 +0100 Subject: [PATCH 44/79] extracted docker utils --- .../r_clone_mount/__init__.py | 2 +- .../node_ports_common/r_clone_mount/_core.py | 130 ++++++------------ .../r_clone_mount/_docker_utils.py | 123 +++++++++++++++++ .../r_clone_mount/_models.py | 6 + 4 files changed, 170 insertions(+), 91 deletions(-) create mode 100644 packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_docker_utils.py create mode 100644 packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_models.py diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/__init__.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/__init__.py index 7b9238522c46..fd0b78d63b40 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/__init__.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/__init__.py @@ -1,12 +1,12 @@ from ._config_provider import MountRemoteType from ._core import ( - GetBindPathsProtocol, MountActivity, MountActivityProtocol, MountAlreadyStartedError, MountNotStartedError, RCloneMountManager, ) +from ._models import GetBindPathsProtocol __all__: tuple[str, ...] 
= ( "GetBindPathsProtocol", diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py index 55bc061ce72c..ff93d2fdc4f4 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py @@ -10,10 +10,7 @@ from typing import Any, Final, Protocol from uuid import uuid4 -import aiodocker import httpx -from aiodocker.containers import DockerContainer -from aiodocker.networks import DockerNetwork from common_library.errors_classes import OsparcErrorMixin from httpx import AsyncClient from models_library.basic_types import PortInt @@ -31,7 +28,9 @@ wait_fixed, ) +from . import _docker_utils from ._config_provider import CONFIG_KEY, MountRemoteType, get_config_content +from ._models import GetBindPathsProtocol _logger = logging.getLogger(__name__) @@ -148,10 +147,6 @@ class MountActivity(BaseModel): queued: list[str] -class GetBindPathsProtocol(Protocol): - async def __call__(self, state_path: Path) -> list: ... - - class MountActivityProtocol(Protocol): async def __call__(self, state_path: Path, activity: MountActivity) -> None: ... 
@@ -189,12 +184,6 @@ def __init__( rc_password=rc_password, ) - self._cleanup_stack = AsyncExitStack() - self._client: aiodocker.Docker | None = None - - self._r_clone_container: DockerContainer | None = None - self._r_clone_network: DockerNetwork | None = None - @cached_property def r_clone_container_name(self) -> str: mount_id = _get_mount_id(self.local_mount_path, self.index) @@ -205,83 +194,38 @@ def _r_clone_network_name(self) -> str: mount_id = _get_mount_id(self.local_mount_path, self.index) return f"{_DOCKER_PREFIX_MOUNT}-c-{self.node_id}{mount_id}"[:63] - @property - def _aiodocker_client(self) -> aiodocker.Docker: - assert self._client is not None # nosec - return self._client - - async def start(self): - self._client = await self._cleanup_stack.enter_async_context(aiodocker.Docker()) - # TODO: toss away docker session when done with it do not maintain object in memory to avoid issues - # better more robust way of doing it - - try: - existing_container = await self._aiodocker_client.containers.get( - self.r_clone_container_name + async def create(self): + async with _docker_utils.get_or_crate_docker_session(None) as client: + await _docker_utils.remove_container_if_exists( + client, self.r_clone_container_name ) - await existing_container.delete(force=True) - except aiodocker.exceptions.DockerError as e: - if e.status != _NOT_FOUND: - raise - - try: - existing_network = DockerNetwork( - self._aiodocker_client, self._r_clone_network_name + await _docker_utils.remove_network_if_exists( + client, self.r_clone_container_name + ) + await _docker_utils.create_network_and_connect_sidecar_container( + client, self._r_clone_network_name ) - await existing_network.show() - await existing_network.delete() - except aiodocker.exceptions.DockerError as e: - if e.status != _NOT_FOUND: - raise - - self._r_clone_network = await self._aiodocker_client.networks.create( - {"Name": self._r_clone_network_name, "Attachable": True} - ) - await 
self._r_clone_network.connect({"Container": _get_self_container_id()}) - - # create rclone container attached to the network - self._r_clone_container = await self._aiodocker_client.containers.run( - config={ - "Image": f"rclone/rclone:{self.mount_settings.R_CLONE_VERSION}", - "Entrypoint": ["/bin/sh", "-c", f"{self.command}"], - "ExposedPorts": {f"{self.remote_control_port}/tcp": {}}, - "HostConfig": { - "NetworkMode": self._r_clone_network_name, - "Binds": [], - "Mounts": await self.handler_get_bind_paths(self.local_mount_path), - "Devices": [ - { - "PathOnHost": "/dev/fuse", - "PathInContainer": "/dev/fuse", - "CgroupPermissions": "rwm", - } - ], - "CapAdd": ["SYS_ADMIN"], - "SecurityOpt": ["apparmor:unconfined", "seccomp:unconfined"], - }, - }, - name=self.r_clone_container_name, - ) - container_inspect = await self._r_clone_container.show() - _logger.debug( - "Started rclone mount container '%s' with command='%s' (inspect=%s)", - self.r_clone_container_name, - self.command, - container_inspect, - ) - - async def stop(self): - assert self._r_clone_container is not None # nosec - assert self._r_clone_network is not None # nosec - - await self._r_clone_container.stop() - - await self._r_clone_network.disconnect({"Container": _get_self_container_id()}) - await self._r_clone_network.delete() - await self._r_clone_container.delete() + assert self.mount_settings.R_CLONE_VERSION is not None # nosec + await _docker_utils.create_r_clone_container( + client, + self.r_clone_container_name, + self.command, + r_clone_version=self.mount_settings.R_CLONE_VERSION, + remote_control_port=self.remote_control_port, + r_clone_network_name=self._r_clone_network_name, + local_mount_path=self.local_mount_path, + handler_get_bind_paths=self.handler_get_bind_paths, + ) - await self._cleanup_stack.aclose() + async def remove(self): + async with _docker_utils.get_or_crate_docker_session(None) as client: + await _docker_utils.remove_container_if_exists( + client, 
self.r_clone_container_name + ) + await _docker_utils.remove_network_if_exists( + client, self.r_clone_container_name + ) class RCloneRCInterfaceClient: @@ -519,7 +463,7 @@ async def start_mount(self) -> None: update_handler=self._progress_handler, ) - await self._container_manager.start() + await self._container_manager.create() await self.rc_interface.setup() await self.rc_interface.wait_for_interface_to_be_ready() @@ -531,7 +475,7 @@ async def stop_mount(self) -> None: await self.rc_interface.teardown() self._rc_interface = None - await self._container_manager.stop() + await self._container_manager.remove() self._container_manager = None await self._cleanup_stack.aclose() @@ -540,6 +484,9 @@ async def stop_mount(self) -> None: class RCloneMountManager: def __init__(self, r_clone_settings: RCloneSettings) -> None: self.r_clone_settings = r_clone_settings + if self.r_clone_settings.R_CLONE_MOUNT_SETTINGS.R_CLONE_VERSION is None: + msg = "R_CLONE_VERSION setting is not set" + raise RuntimeError(msg) # TODO: make this stateless and go via aiodocker to avoid issues when restartign the container self._started_mounts: dict[_MountId, TrackedMount] = {} @@ -555,7 +502,9 @@ async def ensure_mounted( handler_get_bind_paths: GetBindPathsProtocol, handler_mount_activity: MountActivityProtocol, ) -> None: - # TODO: rename to ENSURE MOUNT EXISTS + # check if rlcone mount exists + # + with log_context( _logger, logging.INFO, @@ -614,7 +563,7 @@ async def was_mount_started( async def ensure_unmounted( self, local_mount_path: Path, index: NonNegativeInt ) -> None: - # TODO: rename to ENSURE mount does not exist + # make sure this is done using stateless docker api calls with log_context( _logger, logging.INFO, f"unmounting {local_mount_path=}", log_duration=True ): @@ -628,6 +577,7 @@ async def ensure_unmounted( async def setup(self) -> None: # TODO: add a process which ensures that the mounts keep running -> register some local data to restart the mount process if it dies (even 
on accident manually) + pass async def teardown(self) -> None: diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_docker_utils.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_docker_utils.py new file mode 100644 index 000000000000..a9063779ad8c --- /dev/null +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_docker_utils.py @@ -0,0 +1,123 @@ +import logging +import os +from collections.abc import AsyncIterator +from contextlib import asynccontextmanager +from pathlib import Path +from typing import Final + +from aiodocker import Docker +from aiodocker.exceptions import DockerError +from aiodocker.networks import DockerNetwork +from models_library.basic_types import PortInt + +from ._models import GetBindPathsProtocol + +_logger = logging.getLogger(__name__) + +_NOT_FOUND: Final[int] = 404 +_INTERNAL_SERVER_ERROR: Final[int] = 500 + + +def _get_self_container_id() -> str: + # in docker the hostname is the container id + return os.environ["HOSTNAME"] + + +@asynccontextmanager +async def get_or_crate_docker_session(docker: Docker | None) -> AsyncIterator[Docker]: + if docker is not None: + yield docker + return + + async with Docker() as client: + yield client + + +async def create_r_clone_container( + docker: Docker | None, + container_name: str, + command: str, + *, + r_clone_version: str, + remote_control_port: PortInt, + r_clone_network_name: str, + local_mount_path: Path, + handler_get_bind_paths: GetBindPathsProtocol, +) -> None: + async with get_or_crate_docker_session(docker) as client: + # create rclone container attached to the network + r_clone_container = await client.containers.run( + config={ + "Image": f"rclone/rclone:{r_clone_version}", + "Entrypoint": ["/bin/sh", "-c", f"{command}"], + "ExposedPorts": {f"{remote_control_port}/tcp": {}}, + "HostConfig": { + "NetworkMode": r_clone_network_name, + "Binds": [], + "Mounts": await handler_get_bind_paths(local_mount_path), + 
"Devices": [ + { + "PathOnHost": "/dev/fuse", + "PathInContainer": "/dev/fuse", + "CgroupPermissions": "rwm", + } + ], + "CapAdd": ["SYS_ADMIN"], + "SecurityOpt": ["apparmor:unconfined", "seccomp:unconfined"], + }, + }, + name=container_name, + ) + container_inspect = await r_clone_container.show() + _logger.debug( + "Started rclone mount container '%s' with command='%s' (inspect=%s)", + container_name, + command, + container_inspect, + ) + + +async def create_network_and_connect_sidecar_container( + docker: Docker | None, network_name: str +) -> None: + async with get_or_crate_docker_session(docker) as client: + r_clone_network = await client.networks.create( + {"Name": network_name, "Attachable": True} + ) + await r_clone_network.connect({"Container": _get_self_container_id()}) + + +async def remove_container_if_exists( + docker: Docker | None, container_name: str +) -> None: + async with get_or_crate_docker_session(docker) as client: + try: + existing_container = await client.containers.get(container_name) + await existing_container.delete(force=True) + except DockerError as e: + if e.status != _NOT_FOUND: + raise + + +async def remove_network_if_exists(docker: Docker | None, network_name: str) -> None: + async with get_or_crate_docker_session(docker) as client: + existing_network = DockerNetwork(client, network_name) + + try: + await existing_network.disconnect({"Container": _get_self_container_id()}) + except DockerError as e: + if ( + not ( + e.status == _INTERNAL_SERVER_ERROR + and "is not connected to network" in e.message + ) + and e.status != _NOT_FOUND + ): + raise + + try: + await existing_network.show() + await existing_network.delete() + except DockerError as e: + if e.status != _NOT_FOUND: + raise diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_models.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_models.py new file mode 100644 index 000000000000..711449992786 --- /dev/null +++ 
b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_models.py @@ -0,0 +1,6 @@ +from pathlib import Path +from typing import Protocol + + +class GetBindPathsProtocol(Protocol): + async def __call__(self, state_path: Path) -> list: ... From 438bfb998fffa2e470111c4a10471bfe183e173f Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Mon, 15 Dec 2025 15:33:06 +0100 Subject: [PATCH 45/79] refactor --- .../node_ports_common/r_clone_mount/_core.py | 71 ++++++++----------- .../r_clone_mount/_docker_utils.py | 2 +- 2 files changed, 29 insertions(+), 44 deletions(-) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py index ff93d2fdc4f4..0284d3321ae4 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py @@ -1,6 +1,5 @@ import asyncio import logging -import os from collections.abc import Awaitable, Callable from contextlib import AsyncExitStack from datetime import UTC, datetime, timedelta @@ -106,11 +105,6 @@ def _get_rclone_mount_command( ) -def _get_self_container_id() -> str: - # in docker the hostname is the container id - return os.environ["HOSTNAME"] - - def _get_mount_id(local_mount_path: Path, index: NonNegativeInt) -> _MountId: # unique reproducible id for this mount return f"{index}{local_mount_path}".replace("/", "_")[::-1] @@ -172,17 +166,11 @@ def __init__( self.local_mount_path = local_mount_path self.index = index self.r_clone_config_content = r_clone_config_content - self.handler_get_bind_paths = handler_get_bind_paths + self.remote_path = remote_path + self.rc_user = rc_user + self.rc_password = rc_password - self.command = _get_rclone_mount_command( - mount_settings=mount_settings, - r_clone_config_content=r_clone_config_content, - remote_path=remote_path, - local_mount_path=self.local_mount_path, - 
remote_control_port=remote_control_port, - rc_user=rc_user, - rc_password=rc_password, - ) + self.handler_get_bind_paths = handler_get_bind_paths @cached_property def r_clone_container_name(self) -> str: @@ -210,7 +198,15 @@ async def create(self): await _docker_utils.create_r_clone_container( client, self.r_clone_container_name, - self.command, + command=_get_rclone_mount_command( + mount_settings=self.mount_settings, + r_clone_config_content=self.r_clone_config_content, + remote_path=self.remote_path, + local_mount_path=self.local_mount_path, + remote_control_port=self.remote_control_port, + rc_user=self.rc_user, + rc_password=self.rc_password, + ), r_clone_version=self.mount_settings.R_CLONE_VERSION, remote_control_port=self.remote_control_port, r_clone_network_name=self._r_clone_network_name, @@ -400,7 +396,21 @@ def __init__( self._mount_activity_update_interval = mount_activity_update_interval # used internally to handle the mount command - self._container_manager: ContainerManager | None = None + self._container_manager = ContainerManager( + mount_settings=self.r_clone_settings.R_CLONE_MOUNT_SETTINGS, + node_id=self.node_id, + remote_control_port=self.rc_port, + local_mount_path=self.local_mount_path, + index=self.index, + r_clone_config_content=get_config_content( + self.r_clone_settings, self.mount_type + ), + remote_path=f"{self.r_clone_settings.R_CLONE_S3.S3_BUCKET_NAME}/{self.remote_path}", + rc_user=self.rc_user, + rc_password=self.rc_password, + handler_get_bind_paths=self.handler_get_bind_paths, + ) + self._rc_interface: RCloneRCInterfaceClient | None = None self._cleanup_stack = AsyncExitStack() @@ -427,33 +437,11 @@ async def teardown(self) -> None: await self.stop_mount() async def start_mount(self) -> None: - if self._container_manager is not None: - raise _ContainerAlreadyStartedError( - container=self._container_manager.r_clone_container_name, - command=self._container_manager.command, - ) - - r_clone_config_content = get_config_content( - 
self.r_clone_settings, self.mount_type - ) if self.r_clone_settings.R_CLONE_MOUNT_SETTINGS.R_CLONE_VERSION is None: msg = "R_CLONE_VERSION setting is not set" raise RuntimeError(msg) - self._container_manager = ContainerManager( - mount_settings=self.r_clone_settings.R_CLONE_MOUNT_SETTINGS, - node_id=self.node_id, - remote_control_port=self.rc_port, - local_mount_path=self.local_mount_path, - index=self.index, - r_clone_config_content=r_clone_config_content, - remote_path=f"{self.r_clone_settings.R_CLONE_S3.S3_BUCKET_NAME}/{self.remote_path}", - rc_user=self.rc_user, - rc_password=self.rc_password, - handler_get_bind_paths=self.handler_get_bind_paths, - ) - self._rc_interface: RCloneRCInterfaceClient | None = RCloneRCInterfaceClient( remote_control_port=self.rc_port, r_clone_mount_settings=self.r_clone_settings.R_CLONE_MOUNT_SETTINGS, @@ -468,15 +456,12 @@ async def start_mount(self) -> None: await self.rc_interface.wait_for_interface_to_be_ready() async def stop_mount(self) -> None: - if self._container_manager is None: - return await self.rc_interface.wait_for_all_transfers_to_complete() await self.rc_interface.teardown() self._rc_interface = None await self._container_manager.remove() - self._container_manager = None await self._cleanup_stack.aclose() diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_docker_utils.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_docker_utils.py index a9063779ad8c..24b3e819370a 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_docker_utils.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_docker_utils.py @@ -36,8 +36,8 @@ async def get_or_crate_docker_session(docker: Docker | None) -> AsyncIterator[Do async def create_r_clone_container( docker: Docker | None, container_name: str, - command: str, *, + command: str, r_clone_version: str, remote_control_port: PortInt, r_clone_network_name: str, From 
cace2f080856c66276bddc32d7095491d0bf7ea7 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Mon, 15 Dec 2025 15:51:52 +0100 Subject: [PATCH 46/79] rename --- .../node_ports_common/r_clone_mount/_core.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py index 0284d3321ae4..105b84406796 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py @@ -184,12 +184,15 @@ def _r_clone_network_name(self) -> str: async def create(self): async with _docker_utils.get_or_crate_docker_session(None) as client: + # ensure nothing was left from previous runs await _docker_utils.remove_container_if_exists( client, self.r_clone_container_name ) await _docker_utils.remove_network_if_exists( client, self.r_clone_container_name ) + + # create network + container and connect to sidecar await _docker_utils.create_network_and_connect_sidecar_container( client, self._r_clone_network_name ) @@ -251,19 +254,19 @@ def __init__( self._client: AsyncClient | None = None self._continue_running: bool = True - self._transfer_monitor: asyncio.Task | None = None + self._mount_activity_task: asyncio.Task | None = None async def setup(self) -> None: self._client = await self._cleanup_stack.enter_async_context( AsyncClient(timeout=self._r_clone_client_timeout.total_seconds()) ) - self._transfer_monitor = asyncio.create_task(self._monitor()) + self._mount_activity_task = asyncio.create_task(self._mount_activity_worker()) async def teardown(self) -> None: - if self._transfer_monitor is not None: + if self._mount_activity_task is not None: self._continue_running = False - await self._transfer_monitor - self._transfer_monitor = None + await self._mount_activity_task + self._mount_activity_task = None await 
self._cleanup_stack.aclose() @@ -293,7 +296,7 @@ async def _post_vfs_queue(self) -> dict: async def _rc_noop(self) -> dict: return await self._request("POST", "rc/noop") - async def _monitor(self) -> None: + async def _mount_activity_worker(self) -> None: while self._continue_running: await asyncio.sleep(self._update_interval_seconds) From 917286e1da528751f435be8bf0a9c8f3c5920da3 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Mon, 15 Dec 2025 16:02:25 +0100 Subject: [PATCH 47/79] add log level --- services/dynamic-sidecar/docker/boot.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/dynamic-sidecar/docker/boot.sh b/services/dynamic-sidecar/docker/boot.sh index 39ed82bc802a..d6bc4dc4d570 100755 --- a/services/dynamic-sidecar/docker/boot.sh +++ b/services/dynamic-sidecar/docker/boot.sh @@ -49,7 +49,7 @@ SERVER_LOG_LEVEL=$(echo "${APP_LOG_LEVEL}" | tr '[:upper:]' '[:lower:]') echo "$INFO" "Log-level app/server: $APP_LOG_LEVEL/$SERVER_LOG_LEVEL" R_CLONE_VERSION=$(rclone version | head -n1 | awk '{print $2}' | sed 's/^v//') && \ - echo "R_CLONE_VERSION=${R_CLONE_VERSION}" && \ + echo "$INFO" "R_CLONE_VERSION=${R_CLONE_VERSION}" && \ export R_CLONE_VERSION if [ "${SC_BOOT_MODE}" = "debug" ]; then From 7a1db83b17d12c2d13ab109c7b1f34a737bd6f29 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Mon, 15 Dec 2025 16:02:58 +0100 Subject: [PATCH 48/79] refactor --- .../node_ports_common/r_clone_mount/_core.py | 50 +++++++------------ 1 file changed, 19 insertions(+), 31 deletions(-) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py index 105b84406796..bd7173fc18a8 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py @@ -145,7 +145,7 @@ class MountActivityProtocol(Protocol): async def __call__(self, state_path: 
Path, activity: MountActivity) -> None: ... -class ContainerManager: +class ContainerManager: # stateless def __init__( self, mount_settings: RCloneMountSettings, @@ -227,7 +227,7 @@ async def remove(self): ) -class RCloneRCInterfaceClient: +class RCloneRCHttpClient: # HAS STATE def __init__( self, remote_control_port: PortInt, @@ -297,6 +297,7 @@ async def _rc_noop(self) -> dict: return await self._request("POST", "rc/noop") async def _mount_activity_worker(self) -> None: + # TODO: extract logic from interface while self._continue_running: await asyncio.sleep(self._update_interval_seconds) @@ -367,7 +368,7 @@ async def _() -> None: await _() -class TrackedMount: +class TrackedMount: # HAS STATE -> links RCClone it's OK def __init__( self, node_id: NodeID, @@ -414,13 +415,14 @@ def __init__( handler_get_bind_paths=self.handler_get_bind_paths, ) - self._rc_interface: RCloneRCInterfaceClient | None = None - self._cleanup_stack = AsyncExitStack() - - @property - def rc_interface(self) -> RCloneRCInterfaceClient: - assert self._rc_interface is not None # nosec - return self._rc_interface + self.rc_http_client = RCloneRCHttpClient( + remote_control_port=self.rc_port, + r_clone_mount_settings=self.r_clone_settings.R_CLONE_MOUNT_SETTINGS, + remote_control_host=self._container_manager.r_clone_container_name, + rc_user=self.rc_user, + rc_password=self.rc_password, + update_handler=self._progress_handler, + ) async def _progress_handler(self, mount_activity: MountActivity) -> None: now = datetime.now(UTC) @@ -436,38 +438,24 @@ async def _progress_handler(self, mount_activity: MountActivity) -> None: await self.handler_mount_activity(self.local_mount_path, mount_activity) - async def teardown(self) -> None: - await self.stop_mount() - async def start_mount(self) -> None: if self.r_clone_settings.R_CLONE_MOUNT_SETTINGS.R_CLONE_VERSION is None: msg = "R_CLONE_VERSION setting is not set" raise RuntimeError(msg) - self._rc_interface: RCloneRCInterfaceClient | None = 
RCloneRCInterfaceClient( - remote_control_port=self.rc_port, - r_clone_mount_settings=self.r_clone_settings.R_CLONE_MOUNT_SETTINGS, - remote_control_host=self._container_manager.r_clone_container_name, - rc_user=self.rc_user, - rc_password=self.rc_password, - update_handler=self._progress_handler, - ) - await self._container_manager.create() - await self.rc_interface.setup() - await self.rc_interface.wait_for_interface_to_be_ready() + + await self.rc_http_client.setup() + await self.rc_http_client.wait_for_interface_to_be_ready() async def stop_mount(self) -> None: - await self.rc_interface.wait_for_all_transfers_to_complete() - await self.rc_interface.teardown() - self._rc_interface = None + await self.rc_http_client.wait_for_all_transfers_to_complete() + await self.rc_http_client.teardown() await self._container_manager.remove() - await self._cleanup_stack.aclose() - class RCloneMountManager: def __init__(self, r_clone_settings: RCloneSettings) -> None: @@ -539,7 +527,7 @@ async def wait_for_transfers_to_complete( raise MountNotStartedError(local_mount_path=local_mount_path) tracked_mount = self._started_mounts[mount_id] - await tracked_mount.rc_interface.wait_for_all_transfers_to_complete() + await tracked_mount.rc_http_client.wait_for_all_transfers_to_complete() async def was_mount_started( self, local_mount_path: Path, index: NonNegativeInt @@ -571,7 +559,7 @@ async def setup(self) -> None: async def teardown(self) -> None: # shutdown still ongoing mounts await asyncio.gather( - *[mount.teardown() for mount in self._started_mounts.values()] + *[mount.stop_mount() for mount in self._started_mounts.values()] ) self._started_mounts.clear() From 53eda18b6c6fe1ef7d6984b76e18605f82aa1778 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Mon, 15 Dec 2025 16:11:09 +0100 Subject: [PATCH 49/79] refactor --- .../node_ports_common/r_clone_mount/_core.py | 56 +++++++++---------- 1 file changed, 26 insertions(+), 30 deletions(-) diff --git 
a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py index bd7173fc18a8..7b0407440f74 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py @@ -1,7 +1,6 @@ import asyncio import logging from collections.abc import Awaitable, Callable -from contextlib import AsyncExitStack from datetime import UTC, datetime, timedelta from functools import cached_property from pathlib import Path @@ -250,16 +249,14 @@ def __init__( self._rc_host = remote_control_host self._rc_port = remote_control_port - self._cleanup_stack = AsyncExitStack() - self._client: AsyncClient | None = None - self._continue_running: bool = True self._mount_activity_task: asyncio.Task | None = None + @property + def _client(self) -> AsyncClient: + return AsyncClient(timeout=self._r_clone_client_timeout.total_seconds()) + async def setup(self) -> None: - self._client = await self._cleanup_stack.enter_async_context( - AsyncClient(timeout=self._r_clone_client_timeout.total_seconds()) - ) self._mount_activity_task = asyncio.create_task(self._mount_activity_worker()) async def teardown(self) -> None: @@ -268,8 +265,6 @@ async def teardown(self) -> None: await self._mount_activity_task self._mount_activity_task = None - await self._cleanup_stack.aclose() - @property def _base_url(self) -> str: return f"http://{self._rc_host}:{self._rc_port}" @@ -296,31 +291,32 @@ async def _post_vfs_queue(self) -> dict: async def _rc_noop(self) -> dict: return await self._request("POST", "rc/noop") + async def get_mount_activity(self) -> MountActivity: + core_stats, vfs_queue = await asyncio.gather( + self._post_core_stats(), self._post_vfs_queue() + ) + + return MountActivity( + transferring=( + { + x["name"]: ProgressReport( + actual_value=( + x["percentage"] / 100 if "percentage" in x else 0.0 + ) + ) + 
for x in core_stats["transferring"] + } + if "transferring" in core_stats + else {} + ), + queued=[x["name"] for x in vfs_queue["queue"]], + ) + async def _mount_activity_worker(self) -> None: # TODO: extract logic from interface while self._continue_running: await asyncio.sleep(self._update_interval_seconds) - - core_stats, vfs_queue = await asyncio.gather( - self._post_core_stats(), self._post_vfs_queue() - ) - - mount_activity = MountActivity( - transferring=( - { - x["name"]: ProgressReport( - actual_value=( - x["percentage"] / 100 if "percentage" in x else 0.0 - ) - ) - for x in core_stats["transferring"] - } - if "transferring" in core_stats - else {} - ), - queued=[x["name"] for x in vfs_queue["queue"]], - ) - + mount_activity = await self.get_mount_activity() await self._update_handler(mount_activity) @retry( From 44e10585e589a9b7238cc49f6a3fbed7f521f923 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Mon, 15 Dec 2025 18:05:14 +0100 Subject: [PATCH 50/79] restructured --- .../src/simcore_sdk/node_data/data_manager.py | 3 +- .../node_ports_common/r_clone_mount/_core.py | 170 ++++++++---------- 2 files changed, 80 insertions(+), 93 deletions(-) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py b/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py index 0a829ce609b8..9511a9984437 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_data/data_manager.py @@ -192,7 +192,6 @@ async def _delete_legacy_archive( async def _stop_mount( mount_manager: RCloneMountManager, destination_path: Path, index: NonNegativeInt ) -> None: - await mount_manager.wait_for_transfers_to_complete(destination_path, index) await mount_manager.ensure_unmounted(destination_path, index) @@ -213,7 +212,7 @@ async def push( # pylint: disable=too-many-arguments # noqa: PLR0913 ) -> None: """pushes and removes the legacy archive if present""" - if await 
mount_manager.was_mount_started(source_path, index): + if mount_manager.is_mount_tracked(source_path, index): await _stop_mount(mount_manager, source_path, index) else: await _push_directory( diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py index 7b0407440f74..67d1b6683a34 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py @@ -9,12 +9,14 @@ from uuid import uuid4 import httpx +from common_library.async_tools import cancel_wait_task from common_library.errors_classes import OsparcErrorMixin from httpx import AsyncClient from models_library.basic_types import PortInt from models_library.progress_bar import ProgressReport from models_library.projects_nodes_io import NodeID, StorageFileID from pydantic import BaseModel, NonNegativeInt +from servicelib.background_task import create_periodic_task from servicelib.logging_utils import log_context from servicelib.utils import unused_port from settings_library.r_clone import RCloneMountSettings, RCloneSettings @@ -144,7 +146,7 @@ class MountActivityProtocol(Protocol): async def __call__(self, state_path: Path, activity: MountActivity) -> None: ... 
-class ContainerManager: # stateless +class StatelessContainerManager: # stateless def __init__( self, mount_settings: RCloneMountSettings, @@ -226,7 +228,7 @@ async def remove(self): ) -class RCloneRCHttpClient: # HAS STATE +class StatelessRCloneRCHttpClient: # HAS STATE def __init__( self, remote_control_port: PortInt, @@ -249,36 +251,23 @@ def __init__( self._rc_host = remote_control_host self._rc_port = remote_control_port - self._continue_running: bool = True - self._mount_activity_task: asyncio.Task | None = None - - @property - def _client(self) -> AsyncClient: - return AsyncClient(timeout=self._r_clone_client_timeout.total_seconds()) - - async def setup(self) -> None: - self._mount_activity_task = asyncio.create_task(self._mount_activity_worker()) - - async def teardown(self) -> None: - if self._mount_activity_task is not None: - self._continue_running = False - await self._mount_activity_task - self._mount_activity_task = None - @property def _base_url(self) -> str: return f"http://{self._rc_host}:{self._rc_port}" async def _request(self, method: str, path: str) -> Any: - assert self._client is not None # nosec - request_url = f"{self._base_url}/{path}" _logger.debug("Sending '%s %s' request", method, request_url) - response = await self._client.request( - method, request_url, auth=(self._rc_user, self._rc_password) - ) - response.raise_for_status() - result = response.json() + + async with AsyncClient( + timeout=self._r_clone_client_timeout.total_seconds() + ) as client: + response = await client.request( + method, request_url, auth=(self._rc_user, self._rc_password) + ) + response.raise_for_status() + result = response.json() + _logger.debug("'%s %s' replied with: %s", method, path, result) return result @@ -312,13 +301,6 @@ async def get_mount_activity(self) -> MountActivity: queued=[x["name"] for x in vfs_queue["queue"]], ) - async def _mount_activity_worker(self) -> None: - # TODO: extract logic from interface - while self._continue_running: - await 
asyncio.sleep(self._update_interval_seconds) - mount_activity = await self.get_mount_activity() - await self._update_handler(mount_activity) - @retry( wait=wait_fixed(1), stop=stop_after_delay(_MAX_WAIT_RC_HTTP_INTERFACE_READY.total_seconds()), @@ -329,6 +311,13 @@ async def _mount_activity_worker(self) -> None: async def wait_for_interface_to_be_ready(self) -> None: await self._rc_noop() + async def is_responsive(self) -> bool: + try: + await self._rc_noop() + return True + except httpx.HTTPError: + return False + async def wait_for_all_transfers_to_complete(self) -> None: """ Should be waited before closing the mount @@ -364,7 +353,7 @@ async def _() -> None: await _() -class TrackedMount: # HAS STATE -> links RCClone it's OK +class TrackedMount: def __init__( self, node_id: NodeID, @@ -394,9 +383,10 @@ def __init__( self._last_mount_activity: MountActivity | None = None self._last_mount_activity_update: datetime = datetime.fromtimestamp(0, UTC) self._mount_activity_update_interval = mount_activity_update_interval + self._task_mount_activity: asyncio.Task[None] | None = None # used internally to handle the mount command - self._container_manager = ContainerManager( + self._container_manager = StatelessContainerManager( mount_settings=self.r_clone_settings.R_CLONE_MOUNT_SETTINGS, node_id=self.node_id, remote_control_port=self.rc_port, @@ -411,16 +401,16 @@ def __init__( handler_get_bind_paths=self.handler_get_bind_paths, ) - self.rc_http_client = RCloneRCHttpClient( + self._rc_http_client = StatelessRCloneRCHttpClient( remote_control_port=self.rc_port, r_clone_mount_settings=self.r_clone_settings.R_CLONE_MOUNT_SETTINGS, remote_control_host=self._container_manager.r_clone_container_name, rc_user=self.rc_user, rc_password=self.rc_password, - update_handler=self._progress_handler, + update_handler=self._handler_mount_activity, ) - async def _progress_handler(self, mount_activity: MountActivity) -> None: + async def _handler_mount_activity(self, mount_activity: 
MountActivity) -> None: now = datetime.now(UTC) enough_time_passed = ( @@ -434,23 +424,33 @@ async def _progress_handler(self, mount_activity: MountActivity) -> None: await self.handler_mount_activity(self.local_mount_path, mount_activity) - async def start_mount(self) -> None: - - if self.r_clone_settings.R_CLONE_MOUNT_SETTINGS.R_CLONE_VERSION is None: - msg = "R_CLONE_VERSION setting is not set" - raise RuntimeError(msg) + async def _worker_mount_activity(self) -> None: + mount_activity = await self._rc_http_client.get_mount_activity() + await self._handler_mount_activity(mount_activity) + async def start_mount(self) -> None: await self._container_manager.create() - await self.rc_http_client.setup() - await self.rc_http_client.wait_for_interface_to_be_ready() + await self._rc_http_client.wait_for_interface_to_be_ready() - async def stop_mount(self) -> None: + self._task_mount_activity = create_periodic_task( + self._worker_mount_activity, + interval=self._mount_activity_update_interval, + task_name=f"rclone-mount-activity-{_get_mount_id(self.local_mount_path, self.index)}", + ) - await self.rc_http_client.wait_for_all_transfers_to_complete() - await self.rc_http_client.teardown() + async def stop_mount(self) -> None: + await self._rc_http_client.wait_for_all_transfers_to_complete() await self._container_manager.remove() + if self._task_mount_activity is not None: + await cancel_wait_task(self._task_mount_activity) + + async def wait_for_all_transfers_to_complete(self) -> None: + await self._rc_http_client.wait_for_all_transfers_to_complete() + + async def is_responsive(self) -> bool: + return await self._rc_http_client.is_responsive() class RCloneMountManager: @@ -460,8 +460,8 @@ def __init__(self, r_clone_settings: RCloneSettings) -> None: msg = "R_CLONE_VERSION setting is not set" raise RuntimeError(msg) - # TODO: make this stateless and go via aiodocker to avoid issues when restartign the container - self._started_mounts: dict[_MountId, TrackedMount] = {} + 
self._tracked_mounts: dict[_MountId, TrackedMount] = {} + self._task_ensure_mounts_working: asyncio.Task[None] | None = None async def ensure_mounted( self, @@ -484,8 +484,8 @@ async def ensure_mounted( log_duration=True, ): mount_id = _get_mount_id(local_mount_path, index) - if mount_id in self._started_mounts: - tracked_mount = self._started_mounts[mount_id] + if mount_id in self._tracked_mounts: + tracked_mount = self._tracked_mounts[mount_id] raise MountAlreadyStartedError(local_mount_path=local_mount_path) free_port = await asyncio.get_running_loop().run_in_executor( @@ -505,32 +505,12 @@ async def ensure_mounted( ) await tracked_mount.start_mount() - self._started_mounts[mount_id] = tracked_mount + self._tracked_mounts[mount_id] = tracked_mount - async def wait_for_transfers_to_complete( - self, local_mount_path: Path, index: NonNegativeInt - ) -> None: - # if mount is not present it just returns immediately - - with log_context( - _logger, - logging.INFO, - f"wait for transfers to complete {local_mount_path=}", - log_duration=True, - ): - mount_id = _get_mount_id(local_mount_path, index) - if mount_id not in self._started_mounts: - raise MountNotStartedError(local_mount_path=local_mount_path) - - tracked_mount = self._started_mounts[mount_id] - await tracked_mount.rc_http_client.wait_for_all_transfers_to_complete() - - async def was_mount_started( - self, local_mount_path: Path, index: NonNegativeInt - ) -> bool: - # checks if mount is present or not + def is_mount_tracked(self, local_mount_path: Path, index: NonNegativeInt) -> bool: + """True if if a mount is being tracked""" mount_id = _get_mount_id(local_mount_path, index) - return mount_id in self._started_mounts + return mount_id in self._tracked_mounts async def ensure_unmounted( self, local_mount_path: Path, index: NonNegativeInt @@ -540,29 +520,37 @@ async def ensure_unmounted( _logger, logging.INFO, f"unmounting {local_mount_path=}", log_duration=True ): mount_id = _get_mount_id(local_mount_path, 
index) - if mount_id not in self._started_mounts: - # TODO: check if this is running on docker, then shutdown -> otherwise sidecar will break - raise MountNotStartedError(local_mount_path=local_mount_path) + tracked_mount = self._tracked_mounts[mount_id] + + await tracked_mount.wait_for_all_transfers_to_complete() - tracked_mount = self._started_mounts[mount_id] await tracked_mount.stop_mount() - async def setup(self) -> None: - # TODO: add a process which ensures that the mounts keep running -> register some local data to restart the mount process if it dies (even on accident manually) + async def _worker_ensure_mounts_working(self) -> None: + with log_context(_logger, logging.DEBUG, "Ensuring rclone mounts are working"): + for mount in self._tracked_mounts.values(): + if not await mount.is_responsive(): + with log_context( + _logger, + logging.WARNING, + f"RClone mount for local path='{mount.local_mount_path}' is not responsive, restarting it", + ): + await mount.stop_mount() + await mount.start_mount() - pass + async def setup(self) -> None: + self._task_ensure_mounts_working = create_periodic_task( + self._worker_ensure_mounts_working, + interval=timedelta(seconds=10), + task_name="rclone-mount-ensure-mounts-working", + ) async def teardown(self) -> None: # shutdown still ongoing mounts await asyncio.gather( - *[mount.stop_mount() for mount in self._started_mounts.values()] + *[mount.stop_mount() for mount in self._tracked_mounts.values()] ) - self._started_mounts.clear() - + self._tracked_mounts.clear() -# NOTES: -# There are multiple layers in place here -# - docker api to create/remove containers and networks -# - rclone container management -# - rclone process status management via its rc http interface -# - mounts management + if self._task_ensure_mounts_working is not None: + await cancel_wait_task(self._task_ensure_mounts_working) From 30c5135d183e702702aedaa407385905e64da62f Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Tue, 16 Dec 2025 11:43:07 
+0100 Subject: [PATCH 51/79] added shutdown upon error with rclone --- .../node_ports_common/r_clone_mount/_core.py | 55 ++++++++++++++++--- .../modules/r_clone_mount_manager.py | 43 ++++++++++++++- 2 files changed, 89 insertions(+), 9 deletions(-) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py index 67d1b6683a34..7d1f5179208e 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py @@ -43,17 +43,31 @@ _DOCKER_PREFIX_MOUNT: Final[str] = "rcm" -_NOT_FOUND: Final[int] = 404 - type _MountId = str _R_CLONE_MOUNT_TEMPLATE: Final[str] = dedent( """ +set -e + +MOUNT_POINT='{local_mount_path}' + +cleanup() {{ + echo 'STARTED CLEANUP...' + umount -f "$MOUNT_POINT" || true + echo 'FINISHED CLEANUP' +}} +trap cleanup SIGTERM SIGINT + cat < {r_clone_config_path} {r_clone_config_content} EOF -{r_clone_command} +{r_clone_command} 2>&1 & + +RCLONE_PID=$! +wait "$RCLONE_PID" +echo "rclone exited, running cleanup (if not already triggered)..." +cleanup """ ) @@ -103,6 +117,7 @@ def _get_rclone_mount_command( r_clone_config_path=mount_settings.R_CLONE_CONFIG_FILE_PATH, r_clone_config_content=r_clone_config_content, r_clone_command=r_clone_command, + local_mount_path=local_mount_path, ) @@ -146,6 +161,10 @@ class MountActivityProtocol(Protocol): async def __call__(self, state_path: Path, activity: MountActivity) -> None: ... +class ShutdownHandlerProtocol(Protocol): + async def __call__(self) -> None: ... 
+ + class StatelessContainerManager: # stateless def __init__( self, @@ -439,8 +458,9 @@ async def start_mount(self) -> None: task_name=f"rclone-mount-activity-{_get_mount_id(self.local_mount_path, self.index)}", ) - async def stop_mount(self) -> None: - await self._rc_http_client.wait_for_all_transfers_to_complete() + async def stop_mount(self, *, skip_transfer_wait: bool = False) -> None: + if not skip_transfer_wait: + await self._rc_http_client.wait_for_all_transfers_to_complete() await self._container_manager.remove() if self._task_mount_activity is not None: @@ -454,8 +474,14 @@ async def is_responsive(self) -> bool: class RCloneMountManager: - def __init__(self, r_clone_settings: RCloneSettings) -> None: + def __init__( + self, + r_clone_settings: RCloneSettings, + *, + request_shutdown_handler: ShutdownHandlerProtocol, + ) -> None: self.r_clone_settings = r_clone_settings + self.request_shutdown_handler = request_shutdown_handler if self.r_clone_settings.R_CLONE_MOUNT_SETTINGS.R_CLONE_VERSION is None: msg = "R_CLONE_VERSION setting is not set" raise RuntimeError(msg) @@ -527,16 +553,29 @@ async def ensure_unmounted( await tracked_mount.stop_mount() async def _worker_ensure_mounts_working(self) -> None: + mount_restored = False with log_context(_logger, logging.DEBUG, "Ensuring rclone mounts are working"): for mount in self._tracked_mounts.values(): if not await mount.is_responsive(): with log_context( _logger, logging.WARNING, - f"RClone mount for local path='{mount.local_mount_path}' is not responsive, restarting it", + f"Restoring mount for path='{mount.local_mount_path}'", ): - await mount.stop_mount() + await mount.stop_mount(skip_transfer_wait=True) await mount.start_mount() + mount_restored = True + + if mount_restored: + with log_context( + _logger, + logging.WARNING, + "Requesting service shutdown due to mount restoration", + ): + # NOTE: since the mount is bind mounted, we ensure that it restarts properly + # then we shutdown the service since the 
user service will have an out of date + # FUSE mount. + await self.request_shutdown_handler() async def setup(self) -> None: self._task_ensure_mounts_working = create_periodic_task( diff --git a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/r_clone_mount_manager.py b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/r_clone_mount_manager.py index 6507ae4a54f8..5c77b111af98 100644 --- a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/r_clone_mount_manager.py +++ b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/r_clone_mount_manager.py @@ -1,8 +1,48 @@ +import logging +from functools import partial + from fastapi import FastAPI +from models_library.api_schemas_dynamic_scheduler.dynamic_services import ( + DynamicServiceStop, +) +from servicelib.logging_utils import log_context +from servicelib.rabbitmq.rpc_interfaces.dynamic_scheduler.services import ( + stop_dynamic_service, +) from simcore_sdk.node_ports_common.r_clone_mount import RCloneMountManager +from ..core.rabbitmq import get_rabbitmq_rpc_client, post_sidecar_log_message from ..core.settings import ApplicationSettings +_logger = logging.getLogger(__file__) + + +async def _request_shutdown(app: FastAPI) -> None: + settings: ApplicationSettings = app.state.settings + client = get_rabbitmq_rpc_client(app) + + with log_context( + _logger, logging.INFO, "requesting service shutdown from dynamic-scheduler" + ): + await stop_dynamic_service( + client, + dynamic_service_stop=DynamicServiceStop( + user_id=settings.DY_SIDECAR_USER_ID, + project_id=settings.DY_SIDECAR_PROJECT_ID, + node_id=settings.DY_SIDECAR_NODE_ID, + simcore_user_agent="", + save_state=True, + ), + ) + await post_sidecar_log_message( + app, + ( + "Your service was closed due to an issue that would create unexpected behavior. " + "No data was lost. Thank you for your understanding." 
+ ), + log_level=logging.WARNING, + ) + def setup_r_clone_mount_manager(app: FastAPI): settings: ApplicationSettings = app.state.settings @@ -10,7 +50,8 @@ def setup_r_clone_mount_manager(app: FastAPI): async def _on_startup() -> None: app.state.r_clone_mount_manager = r_clone_mount_manager = RCloneMountManager( - settings.DY_SIDECAR_R_CLONE_SETTINGS + settings.DY_SIDECAR_R_CLONE_SETTINGS, + request_shutdown_handler=partial(_request_shutdown, app), ) await r_clone_mount_manager.setup() From 1db6cd682bab752b73d9323c9d4fba973678a348 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Tue, 16 Dec 2025 11:48:09 +0100 Subject: [PATCH 52/79] moved internals --- .../r_clone_mount/__init__.py | 10 +++++-- .../node_ports_common/r_clone_mount/_core.py | 30 +++++-------------- .../r_clone_mount/_models.py | 16 ++++++++++ 3 files changed, 31 insertions(+), 25 deletions(-) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/__init__.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/__init__.py index fd0b78d63b40..f7952f6448fe 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/__init__.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/__init__.py @@ -1,12 +1,15 @@ from ._config_provider import MountRemoteType from ._core import ( - MountActivity, - MountActivityProtocol, MountAlreadyStartedError, MountNotStartedError, RCloneMountManager, ) -from ._models import GetBindPathsProtocol +from ._models import ( + GetBindPathsProtocol, + MountActivity, + MountActivityProtocol, + ShutdownHandlerProtocol, +) __all__: tuple[str, ...] 
= ( "GetBindPathsProtocol", @@ -16,4 +19,5 @@ "MountNotStartedError", "MountRemoteType", "RCloneMountManager", + "ShutdownHandlerProtocol", ) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py index 7d1f5179208e..dad432786294 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py @@ -5,7 +5,7 @@ from functools import cached_property from pathlib import Path from textwrap import dedent -from typing import Any, Final, Protocol +from typing import Any, Final from uuid import uuid4 import httpx @@ -15,7 +15,7 @@ from models_library.basic_types import PortInt from models_library.progress_bar import ProgressReport from models_library.projects_nodes_io import NodeID, StorageFileID -from pydantic import BaseModel, NonNegativeInt +from pydantic import NonNegativeInt from servicelib.background_task import create_periodic_task from servicelib.logging_utils import log_context from servicelib.utils import unused_port @@ -30,7 +30,12 @@ from . 
import _docker_utils from ._config_provider import CONFIG_KEY, MountRemoteType, get_config_content -from ._models import GetBindPathsProtocol +from ._models import ( + GetBindPathsProtocol, + MountActivity, + MountActivityProtocol, + ShutdownHandlerProtocol, +) _logger = logging.getLogger(__name__) @@ -130,12 +135,6 @@ class _BaseRcloneMountError(OsparcErrorMixin, RuntimeError): pass -class _ContainerAlreadyStartedError(_BaseRcloneMountError): - msg_template: str = ( - "Mount process already stareted via container='{container}' with command='{command}'" - ) - - class _WaitingForTransfersToCompleteError(_BaseRcloneMountError): msg_template: str = "Waiting for all transfers to complete" @@ -152,19 +151,6 @@ class MountNotStartedError(_BaseRcloneMountError): msg_template: str = "Mount not started for local path='{local_mount_path}'" -class MountActivity(BaseModel): - transferring: dict[str, ProgressReport] - queued: list[str] - - -class MountActivityProtocol(Protocol): - async def __call__(self, state_path: Path, activity: MountActivity) -> None: ... - - -class ShutdownHandlerProtocol(Protocol): - async def __call__(self) -> None: ... - - class StatelessContainerManager: # stateless def __init__( self, diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_models.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_models.py index 711449992786..e47a72a1d1c8 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_models.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_models.py @@ -1,6 +1,22 @@ from pathlib import Path from typing import Protocol +from models_library.progress_bar import ProgressReport +from pydantic import BaseModel + + +class MountActivity(BaseModel): + transferring: dict[str, ProgressReport] + queued: list[str] + class GetBindPathsProtocol(Protocol): async def __call__(self, state_path: Path) -> list: ... 
+ + +class MountActivityProtocol(Protocol): + async def __call__(self, state_path: Path, activity: MountActivity) -> None: ... + + +class ShutdownHandlerProtocol(Protocol): + async def __call__(self) -> None: ... From 3d43274dec57ffbd61093754b88a72e94db33701 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Tue, 16 Dec 2025 11:52:39 +0100 Subject: [PATCH 53/79] refactor --- .../node_ports_common/r_clone_mount/_core.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py index dad432786294..f7493e716bc7 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py @@ -151,8 +151,8 @@ class MountNotStartedError(_BaseRcloneMountError): msg_template: str = "Mount not started for local path='{local_mount_path}'" -class StatelessContainerManager: # stateless - def __init__( +class StatelessContainerManager: # pylint:disable=too-many-instance-attributes + def __init__( # pylint:disable=too-many-arguments self, mount_settings: RCloneMountSettings, node_id: NodeID, @@ -233,7 +233,7 @@ async def remove(self): ) -class StatelessRCloneRCHttpClient: # HAS STATE +class StatelessRCloneRCHttpClient: def __init__( self, remote_control_port: PortInt, @@ -358,8 +358,8 @@ async def _() -> None: await _() -class TrackedMount: - def __init__( +class TrackedMount: # pylint:disable=too-many-instance-attributes + def __init__( # pylint:disable=too-many-arguments self, node_id: NodeID, r_clone_settings: RCloneSettings, @@ -486,9 +486,6 @@ async def ensure_mounted( handler_get_bind_paths: GetBindPathsProtocol, handler_mount_activity: MountActivityProtocol, ) -> None: - # check if rlcone mount exists - # - with log_context( _logger, logging.INFO, @@ -527,7 +524,6 @@ def is_mount_tracked(self, 
local_mount_path: Path, index: NonNegativeInt) -> boo async def ensure_unmounted( self, local_mount_path: Path, index: NonNegativeInt ) -> None: - # make sure this is done using stateless docker api calls with log_context( _logger, logging.INFO, f"unmounting {local_mount_path=}", log_duration=True ): From 573a0f88d1da0eb912b5dce0da4210b98047b1c8 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Tue, 16 Dec 2025 12:44:01 +0100 Subject: [PATCH 54/79] fixed migration --- ...> e4db35fe8054_added_use_r_clone_mounting_field.py} | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) rename packages/postgres-database/src/simcore_postgres_database/migration/versions/{611b7fa01f1c_added_use_r_clone_mounting_field.py => e4db35fe8054_added_use_r_clone_mounting_field.py} (82%) diff --git a/packages/postgres-database/src/simcore_postgres_database/migration/versions/611b7fa01f1c_added_use_r_clone_mounting_field.py b/packages/postgres-database/src/simcore_postgres_database/migration/versions/e4db35fe8054_added_use_r_clone_mounting_field.py similarity index 82% rename from packages/postgres-database/src/simcore_postgres_database/migration/versions/611b7fa01f1c_added_use_r_clone_mounting_field.py rename to packages/postgres-database/src/simcore_postgres_database/migration/versions/e4db35fe8054_added_use_r_clone_mounting_field.py index fa93517b25fc..c15a2cd96fbb 100644 --- a/packages/postgres-database/src/simcore_postgres_database/migration/versions/611b7fa01f1c_added_use_r_clone_mounting_field.py +++ b/packages/postgres-database/src/simcore_postgres_database/migration/versions/e4db35fe8054_added_use_r_clone_mounting_field.py @@ -1,8 +1,8 @@ """added use_r_clone_mounting field -Revision ID: 611b7fa01f1c -Revises: a85557c02d71 -Create Date: 2025-12-10 12:40:46.573251+00:00 +Revision ID: e4db35fe8054 +Revises: ce69cc44246a +Create Date: 2025-12-16 11:43:36.941571+00:00 """ @@ -10,8 +10,8 @@ from alembic import op # revision identifiers, used by Alembic. 
-revision = "611b7fa01f1c" -down_revision = "a85557c02d71" +revision = "e4db35fe8054" +down_revision = "ce69cc44246a" branch_labels = None depends_on = None From 2fa41ae1c621201ea32d10a66aca0184ab35912a Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Tue, 16 Dec 2025 12:48:32 +0100 Subject: [PATCH 55/79] refactor --- .../r_clone_mount/__init__.py | 12 ++-- .../node_ports_common/r_clone_mount/_core.py | 57 +++++++------------ .../r_clone_mount/_errors.py | 17 ++++++ .../r_clone_mount/_models.py | 2 +- 4 files changed, 42 insertions(+), 46 deletions(-) create mode 100644 packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_errors.py diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/__init__.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/__init__.py index f7952f6448fe..505e51bb31d4 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/__init__.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/__init__.py @@ -1,14 +1,11 @@ from ._config_provider import MountRemoteType -from ._core import ( - MountAlreadyStartedError, - MountNotStartedError, - RCloneMountManager, -) +from ._core import RCloneMountManager +from ._errors import MountAlreadyStartedError from ._models import ( GetBindPathsProtocol, MountActivity, MountActivityProtocol, - ShutdownHandlerProtocol, + RequestShutdownProtocol, ) __all__: tuple[str, ...] 
= ( @@ -16,8 +13,7 @@ "MountActivity", "MountActivityProtocol", "MountAlreadyStartedError", - "MountNotStartedError", "MountRemoteType", "RCloneMountManager", - "ShutdownHandlerProtocol", + "RequestShutdownProtocol", ) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py index f7493e716bc7..f8768aa27843 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py @@ -8,10 +8,8 @@ from typing import Any, Final from uuid import uuid4 -import httpx from common_library.async_tools import cancel_wait_task -from common_library.errors_classes import OsparcErrorMixin -from httpx import AsyncClient +from httpx import AsyncClient, HTTPError from models_library.basic_types import PortInt from models_library.progress_bar import ProgressReport from models_library.projects_nodes_io import NodeID, StorageFileID @@ -30,11 +28,16 @@ from . 
import _docker_utils from ._config_provider import CONFIG_KEY, MountRemoteType, get_config_content +from ._errors import ( + MountAlreadyStartedError, + WaitingForQueueToBeEmptyError, + WaitingForTransfersToCompleteError, +) from ._models import ( GetBindPathsProtocol, MountActivity, MountActivityProtocol, - ShutdownHandlerProtocol, + RequestShutdownProtocol, ) _logger = logging.getLogger(__name__) @@ -127,31 +130,11 @@ def _get_rclone_mount_command( def _get_mount_id(local_mount_path: Path, index: NonNegativeInt) -> _MountId: - # unique reproducible id for this mount + # unique reproducible id for the mount return f"{index}{local_mount_path}".replace("/", "_")[::-1] -class _BaseRcloneMountError(OsparcErrorMixin, RuntimeError): - pass - - -class _WaitingForTransfersToCompleteError(_BaseRcloneMountError): - msg_template: str = "Waiting for all transfers to complete" - - -class _WaitingForQueueToBeEmptyError(_BaseRcloneMountError): - msg_template: str = "Waiting for VFS queue to be empty: queue={queue}" - - -class MountAlreadyStartedError(_BaseRcloneMountError): - msg_template: str = "Mount already started for local path='{local_mount_path}'" - - -class MountNotStartedError(_BaseRcloneMountError): - msg_template: str = "Mount not started for local path='{local_mount_path}'" - - -class StatelessContainerManager: # pylint:disable=too-many-instance-attributes +class ContainerManager: # pylint:disable=too-many-instance-attributes def __init__( # pylint:disable=too-many-arguments self, mount_settings: RCloneMountSettings, @@ -233,7 +216,7 @@ async def remove(self): ) -class StatelessRCloneRCHttpClient: +class RemoteControlHttpClient: def __init__( self, remote_control_port: PortInt, @@ -310,7 +293,7 @@ async def get_mount_activity(self) -> MountActivity: wait=wait_fixed(1), stop=stop_after_delay(_MAX_WAIT_RC_HTTP_INTERFACE_READY.total_seconds()), reraise=True, - retry=retry_if_exception_type(httpx.HTTPError), + retry=retry_if_exception_type(HTTPError), 
before_sleep=before_sleep_log(_logger, logging.WARNING), ) async def wait_for_interface_to_be_ready(self) -> None: @@ -320,7 +303,7 @@ async def is_responsive(self) -> bool: try: await self._rc_noop() return True - except httpx.HTTPError: + except HTTPError: return False async def wait_for_all_transfers_to_complete(self) -> None: @@ -336,7 +319,7 @@ async def wait_for_all_transfers_to_complete(self) -> None: ), reraise=True, retry=retry_if_exception_type( - (_WaitingForQueueToBeEmptyError, _WaitingForTransfersToCompleteError) + (WaitingForQueueToBeEmptyError, WaitingForTransfersToCompleteError) ), before_sleep=before_sleep_log(_logger, logging.WARNING), ) @@ -349,11 +332,11 @@ async def _() -> None: core_stats["transfers"] != core_stats["totalTransfers"] or "transferring" in core_stats ): - raise _WaitingForTransfersToCompleteError + raise WaitingForTransfersToCompleteError queue = vfs_queue["queue"] if len(queue) != 0: - raise _WaitingForQueueToBeEmptyError(queue=queue) + raise WaitingForQueueToBeEmptyError(queue=queue) await _() @@ -391,7 +374,7 @@ def __init__( # pylint:disable=too-many-arguments self._task_mount_activity: asyncio.Task[None] | None = None # used internally to handle the mount command - self._container_manager = StatelessContainerManager( + self._container_manager = ContainerManager( mount_settings=self.r_clone_settings.R_CLONE_MOUNT_SETTINGS, node_id=self.node_id, remote_control_port=self.rc_port, @@ -406,7 +389,7 @@ def __init__( # pylint:disable=too-many-arguments handler_get_bind_paths=self.handler_get_bind_paths, ) - self._rc_http_client = StatelessRCloneRCHttpClient( + self._rc_http_client = RemoteControlHttpClient( remote_control_port=self.rc_port, r_clone_mount_settings=self.r_clone_settings.R_CLONE_MOUNT_SETTINGS, remote_control_host=self._container_manager.r_clone_container_name, @@ -464,10 +447,10 @@ def __init__( self, r_clone_settings: RCloneSettings, *, - request_shutdown_handler: ShutdownHandlerProtocol, + handler_request_shutdown: 
RequestShutdownProtocol, ) -> None: self.r_clone_settings = r_clone_settings - self.request_shutdown_handler = request_shutdown_handler + self.handler_request_shutdown = handler_request_shutdown if self.r_clone_settings.R_CLONE_MOUNT_SETTINGS.R_CLONE_VERSION is None: msg = "R_CLONE_VERSION setting is not set" raise RuntimeError(msg) @@ -557,7 +540,7 @@ async def _worker_ensure_mounts_working(self) -> None: # NOTE: since the mount is bind mounted, we ensure that it restarts properly # then we shutdown the service since the user service will have an out of date # FUSE mount. - await self.request_shutdown_handler() + await self.handler_request_shutdown() async def setup(self) -> None: self._task_ensure_mounts_working = create_periodic_task( diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_errors.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_errors.py new file mode 100644 index 000000000000..7d4aafc7c0cd --- /dev/null +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_errors.py @@ -0,0 +1,17 @@ +from common_library.errors_classes import OsparcErrorMixin + + +class _BaseRcloneMountError(OsparcErrorMixin, RuntimeError): + pass + + +class WaitingForTransfersToCompleteError(_BaseRcloneMountError): + msg_template: str = "Waiting for all transfers to complete" + + +class WaitingForQueueToBeEmptyError(_BaseRcloneMountError): + msg_template: str = "Waiting for VFS queue to be empty: queue={queue}" + + +class MountAlreadyStartedError(_BaseRcloneMountError): + msg_template: str = "Mount already started for local path='{local_mount_path}'" diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_models.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_models.py index e47a72a1d1c8..87f113beceaa 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_models.py +++ 
b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_models.py @@ -18,5 +18,5 @@ class MountActivityProtocol(Protocol): async def __call__(self, state_path: Path, activity: MountActivity) -> None: ... -class ShutdownHandlerProtocol(Protocol): +class RequestShutdownProtocol(Protocol): async def __call__(self) -> None: ... From 84a177a33c3e9b2a052a1e40a06b0f0223743a58 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Tue, 16 Dec 2025 12:48:40 +0100 Subject: [PATCH 56/79] renamed --- .../modules/r_clone_mount_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/r_clone_mount_manager.py b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/r_clone_mount_manager.py index 5c77b111af98..7d021cf9b30a 100644 --- a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/r_clone_mount_manager.py +++ b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/r_clone_mount_manager.py @@ -51,7 +51,7 @@ async def _on_startup() -> None: app.state.r_clone_mount_manager = r_clone_mount_manager = RCloneMountManager( settings.DY_SIDECAR_R_CLONE_SETTINGS, - request_shutdown_handler=partial(_request_shutdown, app), + handler_request_shutdown=partial(_request_shutdown, app), ) await r_clone_mount_manager.setup() From 6f495be334add728b8735f3caedade0a6e1cb8c2 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Tue, 16 Dec 2025 12:50:39 +0100 Subject: [PATCH 57/79] refactor --- .../modules/r_clone_mount_manager.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/r_clone_mount_manager.py b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/r_clone_mount_manager.py index 7d021cf9b30a..1e36894ec5ad 100644 --- a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/r_clone_mount_manager.py +++ 
b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/r_clone_mount_manager.py @@ -17,12 +17,12 @@ _logger = logging.getLogger(__file__) -async def _request_shutdown(app: FastAPI) -> None: +async def _handle_shutdown_request(app: FastAPI) -> None: settings: ApplicationSettings = app.state.settings client = get_rabbitmq_rpc_client(app) with log_context( - _logger, logging.INFO, "requesting service shutdown from dynamic-scheduler" + _logger, logging.INFO, "requesting service shutdown via dynamic-scheduler" ): await stop_dynamic_service( client, @@ -51,7 +51,7 @@ async def _on_startup() -> None: app.state.r_clone_mount_manager = r_clone_mount_manager = RCloneMountManager( settings.DY_SIDECAR_R_CLONE_SETTINGS, - handler_request_shutdown=partial(_request_shutdown, app), + handler_request_shutdown=partial(_handle_shutdown_request, app), ) await r_clone_mount_manager.setup() From 84c981de07bbf20cc73180361254445947726034 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Wed, 17 Dec 2025 11:07:08 +0100 Subject: [PATCH 58/79] maybe acceptable settings --- .../src/settings_library/r_clone.py | 98 ++++++++++++++++++- .../simcore_sdk/node_ports_common/r_clone.py | 21 +--- .../node_ports_common/r_clone_mount/_core.py | 61 +++++++++--- 3 files changed, 140 insertions(+), 40 deletions(-) diff --git a/packages/settings-library/src/settings_library/r_clone.py b/packages/settings-library/src/settings_library/r_clone.py index c5f9babfc2a1..54f2c2593f5a 100644 --- a/packages/settings-library/src/settings_library/r_clone.py +++ b/packages/settings-library/src/settings_library/r_clone.py @@ -10,7 +10,11 @@ from .s3 import S3Settings DEFAULT_VFS_CACHE_PATH: Final[Path] = Path("/vfs-cache") -DEFAULT_VFS_CACHE_MAX_SIZE: Final[str] = "500G" +DEFAULT_VFS_CACHE_MAX_SIZE: Final[str] = "10G" + + +_TRANSFER_COUNT: Final[NonNegativeInt] = 30 +_TPS_PER_TRANSFER: Final[NonNegativeInt] = 7 class S3Provider(StrEnum): @@ -53,6 +57,13 @@ class RCloneMountSettings(BaseCustomSettings): 
"/tmp/rclone.conf" # noqa: S108 ) + R_CLONE_MOUNT_SHOW_DEBUG_LOGS: Annotated[ + bool, + Field( + description="whether to enable debug logs for the rclone mount command", + ), + ] = False + # CLI command `rclone mount` R_CLONE_MOUNT_VFS_CACHE_PATH: Annotated[ @@ -62,6 +73,13 @@ class RCloneMountSettings(BaseCustomSettings): ), ] = DEFAULT_VFS_CACHE_PATH + R_CLONE_VFS_READ_AHEAD: Annotated[ + str, + Field( + description="`--vfs-read-ahead X`: sets the read ahead buffer size", + ), + ] = "16M" + R_CLONE_MOUNT_VFS_CACHE_MAX_SIZE: Annotated[ str, Field( @@ -69,6 +87,20 @@ class RCloneMountSettings(BaseCustomSettings): ), ] = DEFAULT_VFS_CACHE_MAX_SIZE + R_CLONE_MOUNT_VFS_CACHE_MIN_FREE_SPACE: Annotated[ + str, + Field( + description="`--vfs-cache-min-free-space X`: sets the minimum free space to keep on disk", + ), + ] = "5G" + + R_CLONE_CACHE_POLL_INTERVAL: Annotated[ + str, + Field( + description="`--vfs-cache-poll-interval X`: sets the interval to poll the vfs cache", + ), + ] = "5m" + R_CLONE_MOUNT_VFS_WRITE_BACK: Annotated[ str, Field( @@ -90,6 +122,37 @@ class RCloneMountSettings(BaseCustomSettings): ), ] = True + R_CLONE_DIR_CACHE_TIME: Annotated[ + str, + Field( + description="`--dir-cache-time X`: sets the time to cache directory and file information", + ), + ] = "10m" + + R_CLONE_ATTR_TIMEOUT: Annotated[ + str, + Field( + description="`--attr-timeout X`: sets the time to cache file attributes", + ), + ] = "1m" + + R_CLONE_TPSLIMIT: Annotated[ + NonNegativeInt, + Field( + description="`--tpslimit X`: sets the transactions per second limit", + ), + ] = ( + _TRANSFER_COUNT * _TPS_PER_TRANSFER + ) + R_CLONE_TPSLIMIT_BURST: Annotated[ + NonNegativeInt, + Field( + description="`--tpslimit-burst X`: sets the burst limit for transactions per second", + ), + ] = ( + _TRANSFER_COUNT * _TPS_PER_TRANSFER * 2 + ) + class RCloneSettings(BaseCustomSettings): R_CLONE_S3: Annotated[ @@ -101,7 +164,8 @@ class RCloneSettings(BaseCustomSettings): # SEE 
https://rclone.org/docs/#transfers-n NonNegativeInt, Field(description="`--transfers X`: sets the amount of parallel transfers"), - ] = 5 + ] = _TRANSFER_COUNT + R_CLONE_OPTION_RETRIES: Annotated[ # SEE https://rclone.org/docs/#retries-int NonNegativeInt, @@ -120,19 +184,19 @@ class RCloneSettings(BaseCustomSettings): Field( description="`--checkers X`: sets the number checkers", ), - ] = 32 + ] = 8 R_CLONE_S3_UPLOAD_CONCURRENCY: Annotated[ NonNegativeInt, Field( description="`--s3-upload-concurrency X`: sets the number of concurrent uploads to S3", ), - ] = 8 + ] = _TRANSFER_COUNT R_CLONE_CHUNK_SIZE: Annotated[ str, Field(description="`--s3-chunk-size X`: sets the chunk size for S3"), - ] = "128M" + ] = "64M" R_CLONE_ORDER_BY: Annotated[ str, @@ -144,3 +208,27 @@ class RCloneSettings(BaseCustomSettings): R_CLONE_MOUNT_SETTINGS: RCloneMountSettings = Field( json_schema_extra={"auto_default_from_env": True} ) + + +def get_rclone_common_optimizations(r_clone_settings: RCloneSettings) -> list[str]: + # TODO: move to settings is better + return [ + "--retries", + f"{r_clone_settings.R_CLONE_OPTION_RETRIES}", + "--transfers", + f"{r_clone_settings.R_CLONE_OPTION_TRANSFERS}", + # below two options reduce to a minimum the memory footprint + # https://forum.rclone.org/t/how-to-set-a-memory-limit/10230/4 + "--buffer-size", # docs https://rclone.org/docs/#buffer-size-size + r_clone_settings.R_CLONE_OPTION_BUFFER_SIZE, + "--checkers", + f"{r_clone_settings.R_CLONE_OPTION_CHECKERS}", + "--s3-upload-concurrency", + f"{r_clone_settings.R_CLONE_S3_UPLOAD_CONCURRENCY}", + "--s3-chunk-size", + r_clone_settings.R_CLONE_CHUNK_SIZE, + # handles the order of file upload + "--order-by", + r_clone_settings.R_CLONE_ORDER_BY, + "--fast-list", + ] diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone.py index ea49122d1d86..d1023cc37824 100644 --- 
a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone.py @@ -12,7 +12,7 @@ from servicelib.progress_bar import ProgressBarData from servicelib.r_clone_utils import config_file from servicelib.utils import logged_gather -from settings_library.r_clone import RCloneSettings +from settings_library.r_clone import RCloneSettings, get_rclone_common_optimizations from settings_library.utils_r_clone import get_s3_r_clone_config from ._utils import BaseLogParser @@ -186,24 +186,7 @@ async def _sync_sources( "rclone", "--config", config_file_name, - "--retries", - f"{r_clone_settings.R_CLONE_OPTION_RETRIES}", - "--transfers", - f"{r_clone_settings.R_CLONE_OPTION_TRANSFERS}", - # below two options reduce to a minimum the memory footprint - # https://forum.rclone.org/t/how-to-set-a-memory-limit/10230/4 - "--buffer-size", # docs https://rclone.org/docs/#buffer-size-size - r_clone_settings.R_CLONE_OPTION_BUFFER_SIZE, - "--checkers", - f"{r_clone_settings.R_CLONE_OPTION_CHECKERS}", - "--s3-upload-concurrency", - f"{r_clone_settings.R_CLONE_S3_UPLOAD_CONCURRENCY}", - "--s3-chunk-size", - r_clone_settings.R_CLONE_CHUNK_SIZE, - # handles the order of file upload - "--order-by", - r_clone_settings.R_CLONE_ORDER_BY, - "--fast-list", + *get_rclone_common_optimizations(r_clone_settings), "--use-json-log", # frequent polling for faster progress updates "--stats", diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py index f8768aa27843..f30c50df0772 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py @@ -15,9 +15,13 @@ from models_library.projects_nodes_io import NodeID, StorageFileID from pydantic import NonNegativeInt from servicelib.background_task import 
create_periodic_task -from servicelib.logging_utils import log_context +from servicelib.logging_utils import log_catch, log_context from servicelib.utils import unused_port -from settings_library.r_clone import RCloneMountSettings, RCloneSettings +from settings_library.r_clone import ( + RCloneMountSettings, + RCloneSettings, + get_rclone_common_optimizations, +) from tenacity import ( before_sleep_log, retry, @@ -70,6 +74,8 @@ {r_clone_config_content} EOF +echo "Start command: {r_clone_command}" + {r_clone_command} 2>&1 & RCLONE_PID=$! @@ -81,7 +87,7 @@ def _get_rclone_mount_command( - mount_settings: RCloneMountSettings, + r_clone_settings: RCloneSettings, r_clone_config_content: str, remote_path: StorageFileID, local_mount_path: Path, @@ -89,29 +95,48 @@ def _get_rclone_mount_command( rc_user: str, rc_password: str, ) -> str: + mount_settings = r_clone_settings.R_CLONE_MOUNT_SETTINGS escaped_remote_path = f"{remote_path}".lstrip("/") + r_clone_command = " ".join( [ "rclone", "--config", f"{mount_settings.R_CLONE_CONFIG_FILE_PATH}", - "-vv", + ("-vv" if mount_settings.R_CLONE_MOUNT_SHOW_DEBUG_LOGS else ""), "mount", f"{CONFIG_KEY}:{escaped_remote_path}", f"{local_mount_path}", - "--vfs-cache-mode full", - "--vfs-write-back", - mount_settings.R_CLONE_MOUNT_VFS_WRITE_BACK, + # VFS + "--vfs-cache-mode", + "full", + "--vfs-read-ahead", + mount_settings.R_CLONE_VFS_READ_AHEAD, "--vfs-cache-max-size", mount_settings.R_CLONE_MOUNT_VFS_CACHE_MAX_SIZE, + "--vfs-cache-min-free-space", + mount_settings.R_CLONE_MOUNT_VFS_CACHE_MIN_FREE_SPACE, + "--vfs-cache-poll-interval", + mount_settings.R_CLONE_CACHE_POLL_INTERVAL, + "--vfs-write-back", + mount_settings.R_CLONE_MOUNT_VFS_WRITE_BACK, ( "--vfs-fast-fingerprint" - if mount_settings.R_CLONE_MOUNT_VFS_CACHE_MAX_SIZE + if mount_settings.R_CLONE_MOUNT_VFS_FAST_FINGERPRINT else "" ), - ("--no-modtime" if mount_settings.R_CLONE_MOUNT_NO_MODTIME else ""), "--cache-dir", f"{mount_settings.R_CLONE_MOUNT_VFS_CACHE_PATH}", + 
"--dir-cache-time", + mount_settings.R_CLONE_DIR_CACHE_TIME, + "--attr-timeout", + mount_settings.R_CLONE_ATTR_TIMEOUT, + "--tpslimit", + f"{mount_settings.R_CLONE_TPSLIMIT}", + "--tpslimit-burst", + f"{mount_settings.R_CLONE_TPSLIMIT_BURST}", + ("--no-modtime" if mount_settings.R_CLONE_MOUNT_NO_MODTIME else ""), + # REMOTE CONTROL "--rc", f"--rc-addr=0.0.0.0:{remote_control_port}", "--rc-enable-metrics", @@ -119,6 +144,7 @@ def _get_rclone_mount_command( f"--rc-pass='{rc_password}'", "--allow-non-empty", "--allow-other", + *get_rclone_common_optimizations(r_clone_settings), ] ) return _R_CLONE_MOUNT_TEMPLATE.format( @@ -137,7 +163,7 @@ def _get_mount_id(local_mount_path: Path, index: NonNegativeInt) -> _MountId: class ContainerManager: # pylint:disable=too-many-instance-attributes def __init__( # pylint:disable=too-many-arguments self, - mount_settings: RCloneMountSettings, + r_clone_settings: RCloneSettings, node_id: NodeID, remote_control_port: PortInt, local_mount_path: Path, @@ -149,7 +175,7 @@ def __init__( # pylint:disable=too-many-arguments *, handler_get_bind_paths: GetBindPathsProtocol, ) -> None: - self.mount_settings = mount_settings + self.r_clone_settings = r_clone_settings self.node_id = node_id self.remote_control_port = remote_control_port self.local_mount_path = local_mount_path @@ -186,12 +212,14 @@ async def create(self): client, self._r_clone_network_name ) - assert self.mount_settings.R_CLONE_VERSION is not None # nosec + assert ( + self.r_clone_settings.R_CLONE_MOUNT_SETTINGS.R_CLONE_VERSION is not None + ) # nosec await _docker_utils.create_r_clone_container( client, self.r_clone_container_name, command=_get_rclone_mount_command( - mount_settings=self.mount_settings, + r_clone_settings=self.r_clone_settings, r_clone_config_content=self.r_clone_config_content, remote_path=self.remote_path, local_mount_path=self.local_mount_path, @@ -199,7 +227,7 @@ async def create(self): rc_user=self.rc_user, rc_password=self.rc_password, ), - 
r_clone_version=self.mount_settings.R_CLONE_VERSION, + r_clone_version=self.r_clone_settings.R_CLONE_MOUNT_SETTINGS.R_CLONE_VERSION, remote_control_port=self.remote_control_port, r_clone_network_name=self._r_clone_network_name, local_mount_path=self.local_mount_path, @@ -375,7 +403,7 @@ def __init__( # pylint:disable=too-many-arguments # used internally to handle the mount command self._container_manager = ContainerManager( - mount_settings=self.r_clone_settings.R_CLONE_MOUNT_SETTINGS, + r_clone_settings=self.r_clone_settings, node_id=self.node_id, remote_control_port=self.rc_port, local_mount_path=self.local_mount_path, @@ -414,7 +442,8 @@ async def _handler_mount_activity(self, mount_activity: MountActivity) -> None: async def _worker_mount_activity(self) -> None: mount_activity = await self._rc_http_client.get_mount_activity() - await self._handler_mount_activity(mount_activity) + with log_catch(logger=_logger, reraise=False): + await self._handler_mount_activity(mount_activity) async def start_mount(self) -> None: await self._container_manager.create() From c48fa55da6cb42f3c424ff9990861f609c24fe16 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Wed, 17 Dec 2025 11:24:00 +0100 Subject: [PATCH 59/79] bumped rclone version --- scripts/install_rclone.bash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/install_rclone.bash b/scripts/install_rclone.bash index 31ec36302a9b..a4dc7fde5cdd 100755 --- a/scripts/install_rclone.bash +++ b/scripts/install_rclone.bash @@ -9,7 +9,7 @@ set -o nounset # abort on unbound variable set -o pipefail # don't hide errors within pipes IFS=$'\n\t' -R_CLONE_VERSION="1.70.3" +R_CLONE_VERSION="1.72.1" TARGETARCH="${TARGETARCH:-amd64}" echo "platform ${TARGETARCH}" From 4cfcb451aebe5957a4bf054120634b0bfda7f4ce Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Wed, 17 Dec 2025 11:30:43 +0100 Subject: [PATCH 60/79] refactor --- .../settings-library/src/settings_library/r_clone.py | 12 +++++++++++- 
.../node_ports_common/r_clone_mount/_core.py | 9 +++++---- .../node_ports_common/r_clone_mount/_docker_utils.py | 6 ++++++ 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/packages/settings-library/src/settings_library/r_clone.py b/packages/settings-library/src/settings_library/r_clone.py index 54f2c2593f5a..e5198b370754 100644 --- a/packages/settings-library/src/settings_library/r_clone.py +++ b/packages/settings-library/src/settings_library/r_clone.py @@ -4,7 +4,7 @@ from typing import Annotated, Final from common_library.pydantic_validators import validate_numeric_string_as_timedelta -from pydantic import Field, NonNegativeInt +from pydantic import ByteSize, Field, NonNegativeInt, TypeAdapter from .base import BaseCustomSettings from .s3 import S3Settings @@ -16,6 +16,8 @@ _TRANSFER_COUNT: Final[NonNegativeInt] = 30 _TPS_PER_TRANSFER: Final[NonNegativeInt] = 7 +_ONE_NANO_CPU: Final[NonNegativeInt] = int(1e9) + class S3Provider(StrEnum): AWS = "AWS" @@ -64,6 +66,14 @@ class RCloneMountSettings(BaseCustomSettings): ), ] = False + R_CLONE_MEMORY_LIMIT: Annotated[ + ByteSize, Field(description="memory limit for the rclone mount container") + ] = TypeAdapter(ByteSize).validate_python("1GiB") + + R_CLONE_NANO_CPUS: Annotated[ + NonNegativeInt, Field(description="CPU limit for the rclone mount container") + ] = (1 * _ONE_NANO_CPU) + # CLI command `rclone mount` R_CLONE_MOUNT_VFS_CACHE_PATH: Annotated[ diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py index f30c50df0772..38317026acec 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py @@ -212,9 +212,8 @@ async def create(self): client, self._r_clone_network_name ) - assert ( - self.r_clone_settings.R_CLONE_MOUNT_SETTINGS.R_CLONE_VERSION is not None - ) # nosec + 
mount_settings = self.r_clone_settings.R_CLONE_MOUNT_SETTINGS + assert mount_settings.R_CLONE_VERSION is not None # nosec await _docker_utils.create_r_clone_container( client, self.r_clone_container_name, @@ -227,10 +226,12 @@ async def create(self): rc_user=self.rc_user, rc_password=self.rc_password, ), - r_clone_version=self.r_clone_settings.R_CLONE_MOUNT_SETTINGS.R_CLONE_VERSION, + r_clone_version=mount_settings.R_CLONE_VERSION, remote_control_port=self.remote_control_port, r_clone_network_name=self._r_clone_network_name, local_mount_path=self.local_mount_path, + memory_limit=mount_settings.R_CLONE_MEMORY_LIMIT, + nano_cpus=mount_settings.R_CLONE_NANO_CPUS, handler_get_bind_paths=self.handler_get_bind_paths, ) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_docker_utils.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_docker_utils.py index 24b3e819370a..500304744c27 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_docker_utils.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_docker_utils.py @@ -9,6 +9,7 @@ from aiodocker.exceptions import DockerError from aiodocker.networks import DockerNetwork from models_library.basic_types import PortInt +from pydantic import ByteSize, NonNegativeInt from ._models import GetBindPathsProtocol @@ -42,6 +43,8 @@ async def create_r_clone_container( remote_control_port: PortInt, r_clone_network_name: str, local_mount_path: Path, + memory_limit: ByteSize, + nano_cpus: NonNegativeInt, handler_get_bind_paths: GetBindPathsProtocol, ) -> None: async with get_or_crate_docker_session(docker) as client: @@ -64,6 +67,9 @@ async def create_r_clone_container( ], "CapAdd": ["SYS_ADMIN"], "SecurityOpt": ["apparmor:unconfined", "seccomp:unconfined"], + "Memory": memory_limit, + "MemorySwap": memory_limit, + "NanoCpus": nano_cpus, }, }, name=container_name, From 331a0917626c1897641bcb013c8b61c1a8219ab8 Mon Sep 17 00:00:00 
2001 From: Andrei Neagu Date: Wed, 17 Dec 2025 11:33:59 +0100 Subject: [PATCH 61/79] rename --- .../src/settings_library/r_clone.py | 12 ++++++------ .../node_ports_common/r_clone_mount/_core.py | 19 +++++++++++-------- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/packages/settings-library/src/settings_library/r_clone.py b/packages/settings-library/src/settings_library/r_clone.py index e5198b370754..5a8b3f4654cc 100644 --- a/packages/settings-library/src/settings_library/r_clone.py +++ b/packages/settings-library/src/settings_library/r_clone.py @@ -30,7 +30,7 @@ class RCloneMountSettings(BaseCustomSettings): R_CLONE_MOUNT_TRANSFERS_COMPLETED_TIMEOUT: Annotated[ timedelta, Field( - description="max amount of time to wait when closing the rclone mount", + description="max amount of time to wait for rclone mount command to finish", ), ] = timedelta(minutes=60) @@ -42,7 +42,7 @@ class RCloneMountSettings(BaseCustomSettings): # CONTAINER - R_CLONE_VERSION: Annotated[ + R_CLONE_CONTAINER_VERSION: Annotated[ str | None, Field( pattern=r"^\d+\.\d+\.\d+$", @@ -50,7 +50,7 @@ class RCloneMountSettings(BaseCustomSettings): ), ] = None - R_CLONE_CONFIG_FILE_PATH: Annotated[ + R_CLONE_CONTAINER_CONFIG_FILE_PATH: Annotated[ Path, Field( description="path inside the container where the rclone config file is located", @@ -59,18 +59,18 @@ class RCloneMountSettings(BaseCustomSettings): "/tmp/rclone.conf" # noqa: S108 ) - R_CLONE_MOUNT_SHOW_DEBUG_LOGS: Annotated[ + R_CLONE_CONTAINER_MOUNT_SHOW_DEBUG_LOGS: Annotated[ bool, Field( description="whether to enable debug logs for the rclone mount command", ), ] = False - R_CLONE_MEMORY_LIMIT: Annotated[ + R_CLONE_CONTAINER_MEMORY_LIMIT: Annotated[ ByteSize, Field(description="memory limit for the rclone mount container") ] = TypeAdapter(ByteSize).validate_python("1GiB") - R_CLONE_NANO_CPUS: Annotated[ + R_CLONE_CONTAINER_NANO_CPUS: Annotated[ NonNegativeInt, Field(description="CPU limit for the rclone mount container") ] 
= (1 * _ONE_NANO_CPU) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py index 38317026acec..f9d488057fb1 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py @@ -102,8 +102,8 @@ def _get_rclone_mount_command( [ "rclone", "--config", - f"{mount_settings.R_CLONE_CONFIG_FILE_PATH}", - ("-vv" if mount_settings.R_CLONE_MOUNT_SHOW_DEBUG_LOGS else ""), + f"{mount_settings.R_CLONE_CONTAINER_CONFIG_FILE_PATH}", + ("-vv" if mount_settings.R_CLONE_CONTAINER_MOUNT_SHOW_DEBUG_LOGS else ""), "mount", f"{CONFIG_KEY}:{escaped_remote_path}", f"{local_mount_path}", @@ -148,7 +148,7 @@ def _get_rclone_mount_command( ] ) return _R_CLONE_MOUNT_TEMPLATE.format( - r_clone_config_path=mount_settings.R_CLONE_CONFIG_FILE_PATH, + r_clone_config_path=mount_settings.R_CLONE_CONTAINER_CONFIG_FILE_PATH, r_clone_config_content=r_clone_config_content, r_clone_command=r_clone_command, local_mount_path=local_mount_path, @@ -213,7 +213,7 @@ async def create(self): ) mount_settings = self.r_clone_settings.R_CLONE_MOUNT_SETTINGS - assert mount_settings.R_CLONE_VERSION is not None # nosec + assert mount_settings.R_CLONE_CONTAINER_VERSION is not None # nosec await _docker_utils.create_r_clone_container( client, self.r_clone_container_name, @@ -226,12 +226,12 @@ async def create(self): rc_user=self.rc_user, rc_password=self.rc_password, ), - r_clone_version=mount_settings.R_CLONE_VERSION, + r_clone_version=mount_settings.R_CLONE_CONTAINER_VERSION, remote_control_port=self.remote_control_port, r_clone_network_name=self._r_clone_network_name, local_mount_path=self.local_mount_path, - memory_limit=mount_settings.R_CLONE_MEMORY_LIMIT, - nano_cpus=mount_settings.R_CLONE_NANO_CPUS, + memory_limit=mount_settings.R_CLONE_CONTAINER_MEMORY_LIMIT, + 
nano_cpus=mount_settings.R_CLONE_CONTAINER_NANO_CPUS, handler_get_bind_paths=self.handler_get_bind_paths, ) @@ -481,7 +481,10 @@ def __init__( ) -> None: self.r_clone_settings = r_clone_settings self.handler_request_shutdown = handler_request_shutdown - if self.r_clone_settings.R_CLONE_MOUNT_SETTINGS.R_CLONE_VERSION is None: + if ( + self.r_clone_settings.R_CLONE_MOUNT_SETTINGS.R_CLONE_CONTAINER_VERSION + is None + ): msg = "R_CLONE_VERSION setting is not set" raise RuntimeError(msg) From 08b328d85335755b214d5d637f3340641897ed98 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Wed, 17 Dec 2025 12:18:32 +0100 Subject: [PATCH 62/79] fixed version --- scripts/install_rclone.bash | 2 +- services/dynamic-sidecar/docker/boot.sh | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/install_rclone.bash b/scripts/install_rclone.bash index a4dc7fde5cdd..1bd530ba862c 100755 --- a/scripts/install_rclone.bash +++ b/scripts/install_rclone.bash @@ -9,7 +9,7 @@ set -o nounset # abort on unbound variable set -o pipefail # don't hide errors within pipes IFS=$'\n\t' -R_CLONE_VERSION="1.72.1" +R_CLONE_VERSION="1.72.0" TARGETARCH="${TARGETARCH:-amd64}" echo "platform ${TARGETARCH}" diff --git a/services/dynamic-sidecar/docker/boot.sh b/services/dynamic-sidecar/docker/boot.sh index d6bc4dc4d570..cf552e3f1c34 100755 --- a/services/dynamic-sidecar/docker/boot.sh +++ b/services/dynamic-sidecar/docker/boot.sh @@ -48,9 +48,9 @@ DYNAMIC_SIDECAR_REMOTE_DEBUGGING_PORT=${DYNAMIC_SIDECAR_REMOTE_DEBUGGING_PORT:-3 SERVER_LOG_LEVEL=$(echo "${APP_LOG_LEVEL}" | tr '[:upper:]' '[:lower:]') echo "$INFO" "Log-level app/server: $APP_LOG_LEVEL/$SERVER_LOG_LEVEL" -R_CLONE_VERSION=$(rclone version | head -n1 | awk '{print $2}' | sed 's/^v//') && \ - echo "$INFO" "R_CLONE_VERSION=${R_CLONE_VERSION}" && \ - export R_CLONE_VERSION +R_CLONE_CONTAINER_VERSION=$(rclone version | head -n1 | awk '{print $2}' | sed 's/^v//') && \ + echo "$INFO" 
"R_CLONE_CONTAINER_VERSION=${R_CLONE_CONTAINER_VERSION}" && \ + export R_CLONE_CONTAINER_VERSION if [ "${SC_BOOT_MODE}" = "debug" ]; then reload_dir_packages=$(fdfind src /devel/packages --exec echo '--reload-dir {} ' | tr '\n' ' ') From 135cd7c649dc8bc8b22d84c6de86a251475ffc39 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Thu, 18 Dec 2025 12:04:20 +0100 Subject: [PATCH 63/79] working settings --- .../src/settings_library/r_clone.py | 54 ++++++++++--------- .../node_ports_common/r_clone_mount/_core.py | 15 ++---- .../docker_service_specs/sidecar.py | 2 +- 3 files changed, 34 insertions(+), 37 deletions(-) diff --git a/packages/settings-library/src/settings_library/r_clone.py b/packages/settings-library/src/settings_library/r_clone.py index 5a8b3f4654cc..8dd191b29667 100644 --- a/packages/settings-library/src/settings_library/r_clone.py +++ b/packages/settings-library/src/settings_library/r_clone.py @@ -10,10 +10,10 @@ from .s3 import S3Settings DEFAULT_VFS_CACHE_PATH: Final[Path] = Path("/vfs-cache") -DEFAULT_VFS_CACHE_MAX_SIZE: Final[str] = "10G" +DEFAULT_VFS_CACHE_MAX_SIZE: Final[str] = "500G" -_TRANSFER_COUNT: Final[NonNegativeInt] = 30 +_TRANSFER_COUNT: Final[NonNegativeInt] = 15 _TPS_PER_TRANSFER: Final[NonNegativeInt] = 7 _ONE_NANO_CPU: Final[NonNegativeInt] = int(1e9) @@ -68,7 +68,7 @@ class RCloneMountSettings(BaseCustomSettings): R_CLONE_CONTAINER_MEMORY_LIMIT: Annotated[ ByteSize, Field(description="memory limit for the rclone mount container") - ] = TypeAdapter(ByteSize).validate_python("1GiB") + ] = TypeAdapter(ByteSize).validate_python("2GiB") R_CLONE_CONTAINER_NANO_CPUS: Annotated[ NonNegativeInt, Field(description="CPU limit for the rclone mount container") @@ -109,33 +109,19 @@ class RCloneMountSettings(BaseCustomSettings): Field( description="`--vfs-cache-poll-interval X`: sets the interval to poll the vfs cache", ), - ] = "5m" + ] = "1m" R_CLONE_MOUNT_VFS_WRITE_BACK: Annotated[ str, Field( description="`--vfs-write-back X`: sets the time to 
wait before writing back data to the remote", ), - ] = "5s" - - R_CLONE_MOUNT_VFS_FAST_FINGERPRINT: Annotated[ - bool, - Field( - description="whether to use `--vfs-fast-fingerprint` option", - ), - ] = True - - R_CLONE_MOUNT_NO_MODTIME: Annotated[ - bool, - Field( - description="whether to use `--no-modtime` option", - ), - ] = True + ] = "10s" R_CLONE_DIR_CACHE_TIME: Annotated[ str, Field( - description="`--dir-cache-time X`: sets the time to cache directory and file information", + description="`--dir-cache-time X`: time before directory is uploaded from remote if changed", ), ] = "10m" @@ -163,6 +149,13 @@ class RCloneMountSettings(BaseCustomSettings): _TRANSFER_COUNT * _TPS_PER_TRANSFER * 2 ) + R_CLONE_MAX_BUFFER_MEMORY: Annotated[ + str, + Field( + description="`--max-buffer-memory X`: sets the maximum buffer memory for rclone", + ), + ] = "16M" + class RCloneSettings(BaseCustomSettings): R_CLONE_S3: Annotated[ @@ -181,13 +174,21 @@ class RCloneSettings(BaseCustomSettings): NonNegativeInt, Field(description="`--retries X`: times to retry each individual transfer"), ] = 3 - R_CLONE_OPTION_BUFFER_SIZE: Annotated[ + + R_CLONE_OPTION_RETRIES_SLEEP: Annotated[ + str, + Field( + description="`--retries-sleep X`: max time to sleep between retries (caps exponential backoff)" + ), + ] = "30s" + + R_CLONE_BUFFER_SIZE: Annotated[ # SEE https://rclone.org/docs/#buffer-size-size str, Field( description="`--buffer-size X`: sets the amount of RAM to use for each individual transfer", ), - ] = "16M" + ] = "8M" R_CLONE_OPTION_CHECKERS: Annotated[ NonNegativeInt, @@ -201,12 +202,12 @@ class RCloneSettings(BaseCustomSettings): Field( description="`--s3-upload-concurrency X`: sets the number of concurrent uploads to S3", ), - ] = _TRANSFER_COUNT + ] = 5 R_CLONE_CHUNK_SIZE: Annotated[ str, Field(description="`--s3-chunk-size X`: sets the chunk size for S3"), - ] = "64M" + ] = "16M" R_CLONE_ORDER_BY: Annotated[ str, @@ -221,16 +222,17 @@ class RCloneSettings(BaseCustomSettings): 
def get_rclone_common_optimizations(r_clone_settings: RCloneSettings) -> list[str]: - # TODO: move to settings is better return [ "--retries", f"{r_clone_settings.R_CLONE_OPTION_RETRIES}", + "--retries-sleep", + r_clone_settings.R_CLONE_OPTION_RETRIES_SLEEP, "--transfers", f"{r_clone_settings.R_CLONE_OPTION_TRANSFERS}", # below two options reduce to a minimum the memory footprint # https://forum.rclone.org/t/how-to-set-a-memory-limit/10230/4 "--buffer-size", # docs https://rclone.org/docs/#buffer-size-size - r_clone_settings.R_CLONE_OPTION_BUFFER_SIZE, + r_clone_settings.R_CLONE_BUFFER_SIZE, "--checkers", f"{r_clone_settings.R_CLONE_OPTION_CHECKERS}", "--s3-upload-concurrency", diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py index f9d488057fb1..bb3a972a51cc 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py @@ -118,13 +118,9 @@ def _get_rclone_mount_command( mount_settings.R_CLONE_MOUNT_VFS_CACHE_MIN_FREE_SPACE, "--vfs-cache-poll-interval", mount_settings.R_CLONE_CACHE_POLL_INTERVAL, + "--write-back-cache", "--vfs-write-back", mount_settings.R_CLONE_MOUNT_VFS_WRITE_BACK, - ( - "--vfs-fast-fingerprint" - if mount_settings.R_CLONE_MOUNT_VFS_FAST_FINGERPRINT - else "" - ), "--cache-dir", f"{mount_settings.R_CLONE_MOUNT_VFS_CACHE_PATH}", "--dir-cache-time", @@ -135,7 +131,9 @@ def _get_rclone_mount_command( f"{mount_settings.R_CLONE_TPSLIMIT}", "--tpslimit-burst", f"{mount_settings.R_CLONE_TPSLIMIT_BURST}", - ("--no-modtime" if mount_settings.R_CLONE_MOUNT_NO_MODTIME else ""), + "--no-modtime", + "--max-buffer-memory", + mount_settings.R_CLONE_MAX_BUFFER_MEMORY, # REMOTE CONTROL "--rc", f"--rc-addr=0.0.0.0:{remote_control_port}", @@ -283,10 +281,7 @@ async def _request(self, method: str, path: str) -> Any: method, request_url, 
auth=(self._rc_user, self._rc_password) ) response.raise_for_status() - result = response.json() - - _logger.debug("'%s %s' replied with: %s", method, path, result) - return result + return response.json() async def _post_core_stats(self) -> dict: return await self._request("POST", "core/stats") diff --git a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_service_specs/sidecar.py b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_service_specs/sidecar.py index 2f9548f513a2..4f6c92935b8e 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_service_specs/sidecar.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_service_specs/sidecar.py @@ -147,7 +147,7 @@ def _get_environment_variables( "R_CLONE_PROVIDER": r_clone_settings.R_CLONE_PROVIDER, "R_CLONE_OPTION_TRANSFERS": f"{r_clone_settings.R_CLONE_OPTION_TRANSFERS}", "R_CLONE_OPTION_RETRIES": f"{r_clone_settings.R_CLONE_OPTION_RETRIES}", - "R_CLONE_OPTION_BUFFER_SIZE": r_clone_settings.R_CLONE_OPTION_BUFFER_SIZE, + "R_CLONE_OPTION_BUFFER_SIZE": r_clone_settings.R_CLONE_BUFFER_SIZE, "RABBIT_HOST": f"{rabbit_settings.RABBIT_HOST}", "RABBIT_PASSWORD": f"{rabbit_settings.RABBIT_PASSWORD.get_secret_value()}", "RABBIT_PORT": f"{rabbit_settings.RABBIT_PORT}", From 0985d7a8320dfe425074e32fb993ea44e55d70ed Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 19 Dec 2025 07:11:47 +0100 Subject: [PATCH 64/79] sensible settings and reverted settings layout --- .../src/settings_library/r_clone.py | 140 ++++++++++-------- .../simcore_sdk/node_ports_common/r_clone.py | 23 ++- .../node_ports_common/r_clone_mount/_core.py | 76 +++++----- 3 files changed, 144 insertions(+), 95 deletions(-) diff --git a/packages/settings-library/src/settings_library/r_clone.py b/packages/settings-library/src/settings_library/r_clone.py index 8dd191b29667..545a9a7724be 100644 --- 
a/packages/settings-library/src/settings_library/r_clone.py +++ b/packages/settings-library/src/settings_library/r_clone.py @@ -14,7 +14,7 @@ _TRANSFER_COUNT: Final[NonNegativeInt] = 15 -_TPS_PER_TRANSFER: Final[NonNegativeInt] = 7 +_TPSLIMIT: Final[NonNegativeInt] = 2000 _ONE_NANO_CPU: Final[NonNegativeInt] = int(1e9) @@ -41,16 +41,7 @@ class RCloneMountSettings(BaseCustomSettings): ) # CONTAINER - - R_CLONE_CONTAINER_VERSION: Annotated[ - str | None, - Field( - pattern=r"^\d+\.\d+\.\d+$", - description="version of rclone for the container image", - ), - ] = None - - R_CLONE_CONTAINER_CONFIG_FILE_PATH: Annotated[ + R_CLONE_MOUNT_CONTAINER_CONFIG_FILE_PATH: Annotated[ Path, Field( description="path inside the container where the rclone config file is located", @@ -59,18 +50,18 @@ class RCloneMountSettings(BaseCustomSettings): "/tmp/rclone.conf" # noqa: S108 ) - R_CLONE_CONTAINER_MOUNT_SHOW_DEBUG_LOGS: Annotated[ + R_CLONE_MOUNT_CONTAINER_SHOW_DEBUG_LOGS: Annotated[ bool, Field( description="whether to enable debug logs for the rclone mount command", ), ] = False - R_CLONE_CONTAINER_MEMORY_LIMIT: Annotated[ + R_CLONE_MOUNT_CONTAINER_MEMORY_LIMIT: Annotated[ ByteSize, Field(description="memory limit for the rclone mount container") ] = TypeAdapter(ByteSize).validate_python("2GiB") - R_CLONE_CONTAINER_NANO_CPUS: Annotated[ + R_CLONE_MOUNT_CONTAINER_NANO_CPUS: Annotated[ NonNegativeInt, Field(description="CPU limit for the rclone mount container") ] = (1 * _ONE_NANO_CPU) @@ -83,7 +74,7 @@ class RCloneMountSettings(BaseCustomSettings): ), ] = DEFAULT_VFS_CACHE_PATH - R_CLONE_VFS_READ_AHEAD: Annotated[ + R_CLONE_MOUNT_VFS_READ_AHEAD: Annotated[ str, Field( description="`--vfs-read-ahead X`: sets the read ahead buffer size", @@ -104,7 +95,7 @@ class RCloneMountSettings(BaseCustomSettings): ), ] = "5G" - R_CLONE_CACHE_POLL_INTERVAL: Annotated[ + R_CLONE_MOUNT_CACHE_POLL_INTERVAL: Annotated[ str, Field( description="`--vfs-cache-poll-interval X`: sets the interval to poll 
the vfs cache", @@ -118,44 +109,92 @@ class RCloneMountSettings(BaseCustomSettings): ), ] = "10s" - R_CLONE_DIR_CACHE_TIME: Annotated[ + R_CLONE_MOUNT_DIR_CACHE_TIME: Annotated[ str, Field( description="`--dir-cache-time X`: time before directory is uploaded from remote if changed", ), ] = "10m" - R_CLONE_ATTR_TIMEOUT: Annotated[ + R_CLONE_MOUNT_ATTR_TIMEOUT: Annotated[ str, Field( description="`--attr-timeout X`: sets the time to cache file attributes", ), ] = "1m" - R_CLONE_TPSLIMIT: Annotated[ + R_CLONE_MOUNT_TPSLIMIT: Annotated[ NonNegativeInt, Field( description="`--tpslimit X`: sets the transactions per second limit", ), - ] = ( - _TRANSFER_COUNT * _TPS_PER_TRANSFER - ) - R_CLONE_TPSLIMIT_BURST: Annotated[ + ] = _TPSLIMIT + R_CLONE_MOUNT_TPSLIMIT_BURST: Annotated[ NonNegativeInt, Field( description="`--tpslimit-burst X`: sets the burst limit for transactions per second", ), ] = ( - _TRANSFER_COUNT * _TPS_PER_TRANSFER * 2 + _TPSLIMIT * 2 ) - R_CLONE_MAX_BUFFER_MEMORY: Annotated[ + R_CLONE_MOUNT_MAX_BUFFER_MEMORY: Annotated[ str, Field( description="`--max-buffer-memory X`: sets the maximum buffer memory for rclone", ), ] = "16M" + R_CLONE_MOUNT_RETRIES: Annotated[ + NonNegativeInt, + Field( + description="`--retries X`: sets the number of retries for rclone mount command", + ), + ] = 3 + + R_CLONE_MOUNT_RETRIES_SLEEP: Annotated[ + str, + Field( + description="`--retries-sleep X`: sets the maximum sleep time between retries", + ), + ] = "30s" + R_CLONE_MOUNT_TRANSFERS: Annotated[ + NonNegativeInt, + Field( + description="`--transfers X`: sets the number of parallel transfers for rclone mount command", + ), + ] = 15 + R_CLONE_MOUNT_BUFFER_SIZE: Annotated[ + str, + Field( + description="`--buffer-size X`: sets the buffer size for rclone mount command", + ), + ] = "16M" + R_CLONE_MOUNT_CHECKERS: Annotated[ + NonNegativeInt, + Field( + description="`--checkers X`: sets the number of checkers for rclone mount command", + ), + ] = 8 + 
R_CLONE_MOUNT_S3_UPLOAD_CONCURRENCY: Annotated[ + NonNegativeInt, + Field( + description="`--s3-upload-concurrency X`: sets the number of concurrent uploads to S3", + ), + ] = 5 + R_CLONE_MOUNT_S3_CHUNK_SIZE: Annotated[ + str, + Field( + description="`--s3-chunk-size X`: sets the chunk size for S3", + ), + ] = "16M" + R_CLONE_MOUNT_ORDER_BY: Annotated[ + str, + Field( + description="`--order-by X`: sets the order of file upload, e.g., 'size,mixed'", + ), + ] = "size,mixed" + class RCloneSettings(BaseCustomSettings): R_CLONE_S3: Annotated[ @@ -163,11 +202,23 @@ class RCloneSettings(BaseCustomSettings): ] R_CLONE_PROVIDER: S3Provider + R_CLONE_VERSION: Annotated[ + str | None, + Field( + pattern=r"^\d+\.\d+\.\d+$", + description="version of rclone for the container image", + ), + ] = None + + R_CLONE_MOUNT_SETTINGS: RCloneMountSettings = Field( + json_schema_extra={"auto_default_from_env": True} + ) + R_CLONE_OPTION_TRANSFERS: Annotated[ # SEE https://rclone.org/docs/#transfers-n NonNegativeInt, Field(description="`--transfers X`: sets the amount of parallel transfers"), - ] = _TRANSFER_COUNT + ] = 5 R_CLONE_OPTION_RETRIES: Annotated[ # SEE https://rclone.org/docs/#retries-int @@ -175,22 +226,22 @@ class RCloneSettings(BaseCustomSettings): Field(description="`--retries X`: times to retry each individual transfer"), ] = 3 - R_CLONE_OPTION_RETRIES_SLEEP: Annotated[ + R_CLONE_RETRIES_SLEEP: Annotated[ str, Field( description="`--retries-sleep X`: max time to sleep between retries (caps exponential backoff)" ), ] = "30s" - R_CLONE_BUFFER_SIZE: Annotated[ + R_CLONE_OPTION_BUFFER_SIZE: Annotated[ # SEE https://rclone.org/docs/#buffer-size-size str, Field( description="`--buffer-size X`: sets the amount of RAM to use for each individual transfer", ), - ] = "8M" + ] = "16M" - R_CLONE_OPTION_CHECKERS: Annotated[ + R_CLONE_CHECKERS: Annotated[ NonNegativeInt, Field( description="`--checkers X`: sets the number checkers", @@ -215,32 +266,3 @@ class 
RCloneSettings(BaseCustomSettings): description="`--order-by X`: sets the order of file upload, e.g., 'size,mixed'", ), ] = "size,mixed" - - R_CLONE_MOUNT_SETTINGS: RCloneMountSettings = Field( - json_schema_extra={"auto_default_from_env": True} - ) - - -def get_rclone_common_optimizations(r_clone_settings: RCloneSettings) -> list[str]: - return [ - "--retries", - f"{r_clone_settings.R_CLONE_OPTION_RETRIES}", - "--retries-sleep", - r_clone_settings.R_CLONE_OPTION_RETRIES_SLEEP, - "--transfers", - f"{r_clone_settings.R_CLONE_OPTION_TRANSFERS}", - # below two options reduce to a minimum the memory footprint - # https://forum.rclone.org/t/how-to-set-a-memory-limit/10230/4 - "--buffer-size", # docs https://rclone.org/docs/#buffer-size-size - r_clone_settings.R_CLONE_BUFFER_SIZE, - "--checkers", - f"{r_clone_settings.R_CLONE_OPTION_CHECKERS}", - "--s3-upload-concurrency", - f"{r_clone_settings.R_CLONE_S3_UPLOAD_CONCURRENCY}", - "--s3-chunk-size", - r_clone_settings.R_CLONE_CHUNK_SIZE, - # handles the order of file upload - "--order-by", - r_clone_settings.R_CLONE_ORDER_BY, - "--fast-list", - ] diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone.py index d1023cc37824..eb4f8e38a744 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone.py @@ -12,7 +12,7 @@ from servicelib.progress_bar import ProgressBarData from servicelib.r_clone_utils import config_file from servicelib.utils import logged_gather -from settings_library.r_clone import RCloneSettings, get_rclone_common_optimizations +from settings_library.r_clone import RCloneSettings from settings_library.utils_r_clone import get_s3_r_clone_config from ._utils import BaseLogParser @@ -186,7 +186,26 @@ async def _sync_sources( "rclone", "--config", config_file_name, - *get_rclone_common_optimizations(r_clone_settings), + "--retries", + 
f"{r_clone_settings.R_CLONE_OPTION_RETRIES}", + "--retries-sleep", + r_clone_settings.R_CLONE_RETRIES_SLEEP, + "--transfers", + f"{r_clone_settings.R_CLONE_OPTION_TRANSFERS}", + # below two options reduce to a minimum the memory footprint + # https://forum.rclone.org/t/how-to-set-a-memory-limit/10230/4 + "--buffer-size", # docs https://rclone.org/docs/#buffer-size-size + r_clone_settings.R_CLONE_OPTION_BUFFER_SIZE, + "--checkers", + f"{r_clone_settings.R_CLONE_CHECKERS}", + "--s3-upload-concurrency", + f"{r_clone_settings.R_CLONE_S3_UPLOAD_CONCURRENCY}", + "--s3-chunk-size", + r_clone_settings.R_CLONE_CHUNK_SIZE, + # handles the order of file upload + "--order-by", + r_clone_settings.R_CLONE_ORDER_BY, + "--fast-list", "--use-json-log", # frequent polling for faster progress updates "--stats", diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py index bb3a972a51cc..c51f3e3f0ed3 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py @@ -17,11 +17,7 @@ from servicelib.background_task import create_periodic_task from servicelib.logging_utils import log_catch, log_context from servicelib.utils import unused_port -from settings_library.r_clone import ( - RCloneMountSettings, - RCloneSettings, - get_rclone_common_optimizations, -) +from settings_library.r_clone import RCloneMountSettings, RCloneSettings from tenacity import ( before_sleep_log, retry, @@ -87,7 +83,7 @@ def _get_rclone_mount_command( - r_clone_settings: RCloneSettings, + mount_settings: RCloneMountSettings, r_clone_config_content: str, remote_path: StorageFileID, local_mount_path: Path, @@ -95,15 +91,14 @@ def _get_rclone_mount_command( rc_user: str, rc_password: str, ) -> str: - mount_settings = r_clone_settings.R_CLONE_MOUNT_SETTINGS escaped_remote_path = 
f"{remote_path}".lstrip("/") r_clone_command = " ".join( [ "rclone", "--config", - f"{mount_settings.R_CLONE_CONTAINER_CONFIG_FILE_PATH}", - ("-vv" if mount_settings.R_CLONE_CONTAINER_MOUNT_SHOW_DEBUG_LOGS else ""), + f"{mount_settings.R_CLONE_MOUNT_CONTAINER_CONFIG_FILE_PATH}", + ("-vv" if mount_settings.R_CLONE_MOUNT_CONTAINER_SHOW_DEBUG_LOGS else ""), "mount", f"{CONFIG_KEY}:{escaped_remote_path}", f"{local_mount_path}", @@ -111,29 +106,46 @@ def _get_rclone_mount_command( "--vfs-cache-mode", "full", "--vfs-read-ahead", - mount_settings.R_CLONE_VFS_READ_AHEAD, + mount_settings.R_CLONE_MOUNT_VFS_READ_AHEAD, "--vfs-cache-max-size", mount_settings.R_CLONE_MOUNT_VFS_CACHE_MAX_SIZE, "--vfs-cache-min-free-space", mount_settings.R_CLONE_MOUNT_VFS_CACHE_MIN_FREE_SPACE, "--vfs-cache-poll-interval", - mount_settings.R_CLONE_CACHE_POLL_INTERVAL, + mount_settings.R_CLONE_MOUNT_CACHE_POLL_INTERVAL, "--write-back-cache", "--vfs-write-back", mount_settings.R_CLONE_MOUNT_VFS_WRITE_BACK, "--cache-dir", f"{mount_settings.R_CLONE_MOUNT_VFS_CACHE_PATH}", "--dir-cache-time", - mount_settings.R_CLONE_DIR_CACHE_TIME, + mount_settings.R_CLONE_MOUNT_DIR_CACHE_TIME, "--attr-timeout", - mount_settings.R_CLONE_ATTR_TIMEOUT, + mount_settings.R_CLONE_MOUNT_ATTR_TIMEOUT, "--tpslimit", - f"{mount_settings.R_CLONE_TPSLIMIT}", + f"{mount_settings.R_CLONE_MOUNT_TPSLIMIT}", "--tpslimit-burst", - f"{mount_settings.R_CLONE_TPSLIMIT_BURST}", + f"{mount_settings.R_CLONE_MOUNT_TPSLIMIT_BURST}", "--no-modtime", "--max-buffer-memory", - mount_settings.R_CLONE_MAX_BUFFER_MEMORY, + mount_settings.R_CLONE_MOUNT_MAX_BUFFER_MEMORY, + # TRANSFERS + "--retries", + f"{mount_settings.R_CLONE_MOUNT_RETRIES}", + "--retries-sleep", + mount_settings.R_CLONE_MOUNT_RETRIES_SLEEP, + "--transfers", + f"{mount_settings.R_CLONE_MOUNT_TRANSFERS}", + "--buffer-size", + mount_settings.R_CLONE_MOUNT_BUFFER_SIZE, + "--checkers", + f"{mount_settings.R_CLONE_MOUNT_CHECKERS}", + "--s3-upload-concurrency", + 
f"{mount_settings.R_CLONE_MOUNT_S3_UPLOAD_CONCURRENCY}", + "--s3-chunk-size", + mount_settings.R_CLONE_MOUNT_S3_CHUNK_SIZE, + "--order-by", + mount_settings.R_CLONE_MOUNT_ORDER_BY, # REMOTE CONTROL "--rc", f"--rc-addr=0.0.0.0:{remote_control_port}", @@ -142,11 +154,10 @@ def _get_rclone_mount_command( f"--rc-pass='{rc_password}'", "--allow-non-empty", "--allow-other", - *get_rclone_common_optimizations(r_clone_settings), ] ) return _R_CLONE_MOUNT_TEMPLATE.format( - r_clone_config_path=mount_settings.R_CLONE_CONTAINER_CONFIG_FILE_PATH, + r_clone_config_path=mount_settings.R_CLONE_MOUNT_CONTAINER_CONFIG_FILE_PATH, r_clone_config_content=r_clone_config_content, r_clone_command=r_clone_command, local_mount_path=local_mount_path, @@ -210,13 +221,13 @@ async def create(self): client, self._r_clone_network_name ) + assert self.r_clone_settings.R_CLONE_VERSION is not None # nosec mount_settings = self.r_clone_settings.R_CLONE_MOUNT_SETTINGS - assert mount_settings.R_CLONE_CONTAINER_VERSION is not None # nosec await _docker_utils.create_r_clone_container( client, self.r_clone_container_name, command=_get_rclone_mount_command( - r_clone_settings=self.r_clone_settings, + mount_settings=mount_settings, r_clone_config_content=self.r_clone_config_content, remote_path=self.remote_path, local_mount_path=self.local_mount_path, @@ -224,12 +235,12 @@ async def create(self): rc_user=self.rc_user, rc_password=self.rc_password, ), - r_clone_version=mount_settings.R_CLONE_CONTAINER_VERSION, + r_clone_version=self.r_clone_settings.R_CLONE_VERSION, remote_control_port=self.remote_control_port, r_clone_network_name=self._r_clone_network_name, local_mount_path=self.local_mount_path, - memory_limit=mount_settings.R_CLONE_CONTAINER_MEMORY_LIMIT, - nano_cpus=mount_settings.R_CLONE_CONTAINER_NANO_CPUS, + memory_limit=mount_settings.R_CLONE_MOUNT_CONTAINER_MEMORY_LIMIT, + nano_cpus=mount_settings.R_CLONE_MOUNT_CONTAINER_NANO_CPUS, handler_get_bind_paths=self.handler_get_bind_paths, ) @@ -247,7 
+258,7 @@ class RemoteControlHttpClient: def __init__( self, remote_control_port: PortInt, - r_clone_mount_settings: RCloneMountSettings, + mount_settings: RCloneMountSettings, remote_control_host: str, rc_user: str, rc_password: str, @@ -256,7 +267,7 @@ def __init__( update_interval: timedelta = _DEFAULT_UPDATE_INTERVAL, r_clone_client_timeout: timedelta = _DEFAULT_R_CLONE_CLIENT_REQUEST_TIMEOUT, ) -> None: - self._r_clone_mount_settings = r_clone_mount_settings + self.mount_settings = mount_settings self._update_interval_seconds = update_interval.total_seconds() self._r_clone_client_timeout = r_clone_client_timeout self._rc_user = rc_user @@ -339,7 +350,7 @@ async def wait_for_all_transfers_to_complete(self) -> None: @retry( wait=wait_fixed(1), stop=stop_after_delay( - self._r_clone_mount_settings.R_CLONE_MOUNT_TRANSFERS_COMPLETED_TIMEOUT.total_seconds() + self.mount_settings.R_CLONE_MOUNT_TRANSFERS_COMPLETED_TIMEOUT.total_seconds() ), reraise=True, retry=retry_if_exception_type( @@ -415,7 +426,7 @@ def __init__( # pylint:disable=too-many-arguments self._rc_http_client = RemoteControlHttpClient( remote_control_port=self.rc_port, - r_clone_mount_settings=self.r_clone_settings.R_CLONE_MOUNT_SETTINGS, + mount_settings=self.r_clone_settings.R_CLONE_MOUNT_SETTINGS, remote_control_host=self._container_manager.r_clone_container_name, rc_user=self.rc_user, rc_password=self.rc_password, @@ -476,10 +487,7 @@ def __init__( ) -> None: self.r_clone_settings = r_clone_settings self.handler_request_shutdown = handler_request_shutdown - if ( - self.r_clone_settings.R_CLONE_MOUNT_SETTINGS.R_CLONE_CONTAINER_VERSION - is None - ): + if r_clone_settings.R_CLONE_VERSION is None: msg = "R_CLONE_VERSION setting is not set" raise RuntimeError(msg) @@ -578,11 +586,11 @@ async def setup(self) -> None: ) async def teardown(self) -> None: + if self._task_ensure_mounts_working is not None: + await cancel_wait_task(self._task_ensure_mounts_working) + # shutdown still ongoing mounts await 
asyncio.gather( *[mount.stop_mount() for mount in self._tracked_mounts.values()] ) self._tracked_mounts.clear() - - if self._task_ensure_mounts_working is not None: - await cancel_wait_task(self._task_ensure_mounts_working) From fd363da1a526535fa8250c61eae44dadd233c6c4 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 19 Dec 2025 07:12:12 +0100 Subject: [PATCH 65/79] removed unused --- .../core/dynamic_services_settings/sidecar.py | 40 ++----------------- 1 file changed, 3 insertions(+), 37 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/core/dynamic_services_settings/sidecar.py b/services/director-v2/src/simcore_service_director_v2/core/dynamic_services_settings/sidecar.py index 3edfb6cf8b03..91c8d224e5c1 100644 --- a/services/director-v2/src/simcore_service_director_v2/core/dynamic_services_settings/sidecar.py +++ b/services/director-v2/src/simcore_service_director_v2/core/dynamic_services_settings/sidecar.py @@ -1,6 +1,5 @@ import logging import warnings -from enum import Enum from pathlib import Path from typing import Annotated @@ -10,10 +9,10 @@ ensure_unique_dict_values_validator, ensure_unique_list_values_validator, ) -from pydantic import AliasChoices, Field, PositiveInt, ValidationInfo, field_validator +from pydantic import AliasChoices, Field, ValidationInfo, field_validator from settings_library.base import BaseCustomSettings from settings_library.efs import AwsEfsSettings -from settings_library.r_clone import RCloneSettings as SettingsLibraryRCloneSettings +from settings_library.r_clone import RCloneSettings from settings_library.utils_logging import MixinLoggingSettings from settings_library.utils_service import DEFAULT_FASTAPI_PORT @@ -22,39 +21,6 @@ _logger = logging.getLogger(__name__) -class VFSCacheMode(str, Enum): - __slots__ = () - - OFF = "off" - MINIMAL = "minimal" - WRITES = "writes" - FULL = "full" - - -class RCloneSettings(SettingsLibraryRCloneSettings): - R_CLONE_DIR_CACHE_TIME_SECONDS: PositiveInt = 
Field( - 10, - description="time to cache directory entries for", - ) - R_CLONE_POLL_INTERVAL_SECONDS: PositiveInt = Field( - 9, - description="time to wait between polling for changes", - ) - R_CLONE_VFS_CACHE_MODE: VFSCacheMode = Field( - VFSCacheMode.MINIMAL, # SEE https://rclone.org/commands/rclone_mount/#vfs-file-caching - description="VFS operation mode, defines how and when the disk cache is synced", - ) - - @field_validator("R_CLONE_POLL_INTERVAL_SECONDS") - @classmethod - def enforce_r_clone_requirement(cls, v: int, info: ValidationInfo) -> PositiveInt: - dir_cache_time = info.data["R_CLONE_DIR_CACHE_TIME_SECONDS"] - if v >= dir_cache_time: - msg = f"R_CLONE_POLL_INTERVAL_SECONDS={v} must be lower than R_CLONE_DIR_CACHE_TIME_SECONDS={dir_cache_time}" - raise ValueError(msg) - return v - - - class PlacementSettings(BaseCustomSettings): # This is just a service placement constraint, see # https://docs.docker.com/engine/swarm/services/#control-service-placement. @@ -173,7 +139,7 @@ class DynamicSidecarSettings(BaseCustomSettings, MixinLoggingSettings): DYNAMIC_SIDECAR_EXPOSE_PORT: bool = Field( default=False, - description="Publishes the service on localhost for debuging and testing [DEVELOPMENT ONLY]" + description="Publishes the service on localhost for debugging and testing [DEVELOPMENT ONLY]" "Can be used to access swagger doc from the host as http://127.0.0.1:30023/dev/doc " "where 30023 is the host published port", validate_default=True, From ef3e0683485a3754fe05ffd4a27e03153629b21c Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 19 Dec 2025 07:12:44 +0100 Subject: [PATCH 66/79] mount settings transferred via labels --- .env-devel | 1 + services/director-v2/.env-devel | 1 + .../modules/dynamic_sidecar/docker_service_specs/sidecar.py | 3 ++- services/docker-compose.yml | 1 + services/dynamic-sidecar/docker/boot.sh | 6 +++--- 5 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.env-devel b/.env-devel index 74a73cab85b9..bd7c08edb55a 100644
--- a/.env-devel +++ b/.env-devel @@ -242,6 +242,7 @@ RESOURCE_USAGE_TRACKER_TRACING={} R_CLONE_OPTION_BUFFER_SIZE=16M R_CLONE_OPTION_RETRIES=3 R_CLONE_OPTION_TRANSFERS=5 +R_CLONE_MOUNT_SETTINGS={} R_CLONE_PROVIDER=MINIO # simcore-user used in docker images diff --git a/services/director-v2/.env-devel b/services/director-v2/.env-devel index 83b9a460ac08..6d806d9d46b9 100644 --- a/services/director-v2/.env-devel +++ b/services/director-v2/.env-devel @@ -61,6 +61,7 @@ R_CLONE_PROVIDER=MINIO R_CLONE_OPTION_TRANSFERS=5 R_CLONE_OPTION_RETRIES=3 R_CLONE_OPTION_BUFFER_SIZE=16M +R_CLONE_MOUNT_SETTINGS={} TRACING_OBSERVABILITY_BACKEND_ENDPOINT=http://jaeger:9411 TRAEFIK_SIMCORE_ZONE=internal_simcore_stack diff --git a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_service_specs/sidecar.py b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_service_specs/sidecar.py index 4f6c92935b8e..9f7e3cc880c9 100644 --- a/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_service_specs/sidecar.py +++ b/services/director-v2/src/simcore_service_director_v2/modules/dynamic_sidecar/docker_service_specs/sidecar.py @@ -147,7 +147,8 @@ def _get_environment_variables( "R_CLONE_PROVIDER": r_clone_settings.R_CLONE_PROVIDER, "R_CLONE_OPTION_TRANSFERS": f"{r_clone_settings.R_CLONE_OPTION_TRANSFERS}", "R_CLONE_OPTION_RETRIES": f"{r_clone_settings.R_CLONE_OPTION_RETRIES}", - "R_CLONE_OPTION_BUFFER_SIZE": r_clone_settings.R_CLONE_BUFFER_SIZE, + "R_CLONE_OPTION_BUFFER_SIZE": r_clone_settings.R_CLONE_OPTION_BUFFER_SIZE, + "R_CLONE_MOUNT_SETTINGS": r_clone_settings.R_CLONE_MOUNT_SETTINGS.model_dump_json(), "RABBIT_HOST": f"{rabbit_settings.RABBIT_HOST}", "RABBIT_PASSWORD": f"{rabbit_settings.RABBIT_PASSWORD.get_secret_value()}", "RABBIT_PORT": f"{rabbit_settings.RABBIT_PORT}", diff --git a/services/docker-compose.yml b/services/docker-compose.yml index 3fe911982ca9..32f556a79e44 100644 --- 
a/services/docker-compose.yml +++ b/services/docker-compose.yml @@ -417,6 +417,7 @@ services: R_CLONE_OPTION_RETRIES: ${R_CLONE_OPTION_RETRIES} R_CLONE_OPTION_TRANSFERS: ${R_CLONE_OPTION_TRANSFERS} R_CLONE_PROVIDER: ${R_CLONE_PROVIDER} + R_CLONE_MOUNT_SETTINGS: ${R_CLONE_MOUNT_SETTINGS} EFS_DNS_NAME: ${EFS_DNS_NAME} EFS_MOUNTED_PATH: ${EFS_MOUNTED_PATH} diff --git a/services/dynamic-sidecar/docker/boot.sh b/services/dynamic-sidecar/docker/boot.sh index cf552e3f1c34..d6bc4dc4d570 100755 --- a/services/dynamic-sidecar/docker/boot.sh +++ b/services/dynamic-sidecar/docker/boot.sh @@ -48,9 +48,9 @@ DYNAMIC_SIDECAR_REMOTE_DEBUGGING_PORT=${DYNAMIC_SIDECAR_REMOTE_DEBUGGING_PORT:-3 SERVER_LOG_LEVEL=$(echo "${APP_LOG_LEVEL}" | tr '[:upper:]' '[:lower:]') echo "$INFO" "Log-level app/server: $APP_LOG_LEVEL/$SERVER_LOG_LEVEL" -R_CLONE_CONTAINER_VERSION=$(rclone version | head -n1 | awk '{print $2}' | sed 's/^v//') && \ - echo "$INFO" "R_CLONE_CONTAINER_VERSION=${R_CLONE_CONTAINER_VERSION}" && \ - export R_CLONE_CONTAINER_VERSION +R_CLONE_VERSION=$(rclone version | head -n1 | awk '{print $2}' | sed 's/^v//') && \ + echo "$INFO" "R_CLONE_VERSION=${R_CLONE_VERSION}" && \ + export R_CLONE_VERSION if [ "${SC_BOOT_MODE}" = "debug" ]; then reload_dir_packages=$(fdfind src /devel/packages --exec echo '--reload-dir {} ' | tr '\n' ' ') From 52b95682afd4cdfd40ab853f4ffa42afdd111261 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 19 Dec 2025 07:13:05 +0100 Subject: [PATCH 67/79] printable report --- .../modules/long_running_tasks.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py index b0682bfb241d..91cc630cc776 100644 --- a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py +++ 
b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py @@ -401,7 +401,12 @@ async def _handler_get_bind_path( async def _handler_mount_activity(state_path: Path, activity: MountActivity) -> None: # in the future this should go to the fornted - _logger.info("Mount activity for '%s': %s", state_path, activity) + _logger.info( + "Mount activity for '%s': [queued=%s] %s", + state_path, + len(activity.queued), + activity.transferring, + ) async def _restore_state_folder( From eca069b9fa6770f0cdaaa934aeb4ca9647559041 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 19 Dec 2025 07:25:34 +0100 Subject: [PATCH 68/79] refactor --- .../modules/long_running_tasks.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py index 91cc630cc776..9546af32d247 100644 --- a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py +++ b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py @@ -400,13 +400,15 @@ async def _handler_get_bind_path( async def _handler_mount_activity(state_path: Path, activity: MountActivity) -> None: - # in the future this should go to the fornted - _logger.info( - "Mount activity for '%s': [queued=%s] %s", - state_path, - len(activity.queued), - activity.transferring, - ) + waiting_in_queue = len(activity.transferring) - len(activity.queued) + + # TODO: this object should be pushed to the FE in the future + activity_summary = { + "path": state_path, + "waiting_in_queue": waiting_in_queue, + "transferring": activity.transferring, + } + _logger.info("activity_summary=%s", activity_summary) async def _restore_state_folder( From 49d82a0f86ee18b7ed22ce9544a412796ffe96f4 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 19 Dec 2025 
07:28:33 +0100 Subject: [PATCH 69/79] added dependency --- packages/simcore-sdk/requirements/_base.in | 1 + packages/simcore-sdk/requirements/_base.txt | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/packages/simcore-sdk/requirements/_base.in b/packages/simcore-sdk/requirements/_base.in index 4ce6caec6571..ba257f7663d0 100644 --- a/packages/simcore-sdk/requirements/_base.in +++ b/packages/simcore-sdk/requirements/_base.in @@ -11,6 +11,7 @@ aiocache +aiodocker aiofiles aiohttp httpx diff --git a/packages/simcore-sdk/requirements/_base.txt b/packages/simcore-sdk/requirements/_base.txt index c161d302f34b..7b2c72c20900 100644 --- a/packages/simcore-sdk/requirements/_base.txt +++ b/packages/simcore-sdk/requirements/_base.txt @@ -7,7 +7,9 @@ aiocache==0.12.3 aiodebug==2.3.0 # via -r requirements/../../../packages/service-library/requirements/_base.in aiodocker==0.24.0 - # via -r requirements/../../../packages/service-library/requirements/_base.in + # via + # -r requirements/../../../packages/service-library/requirements/_base.in + # -r requirements/_base.in aiofiles==24.1.0 # via # -r requirements/../../../packages/service-library/requirements/_base.in From 392685c65e854017c359fc941f829c058542f155 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 19 Dec 2025 07:35:15 +0100 Subject: [PATCH 70/79] updated tests --- services/director-v2/tests/unit/test_core_settings.py | 2 +- ...test_modules_dynamic_sidecar_docker_service_specs_sidecar.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/services/director-v2/tests/unit/test_core_settings.py b/services/director-v2/tests/unit/test_core_settings.py index 9b35f54957ac..1c1e557d6bff 100644 --- a/services/director-v2/tests/unit/test_core_settings.py +++ b/services/director-v2/tests/unit/test_core_settings.py @@ -26,7 +26,7 @@ def _get_backend_type_options() -> set[str]: def test_supported_backends_did_not_change() -> None: - _EXPECTED = {"AWS", "CEPH", "MINIO"} + _EXPECTED = {"AWS", 
"CEPH", "MINIO", "AWS_MOTO"} assert _get_backend_type_options() == _EXPECTED, ( "Backend configuration change, please code support for " "it in volumes_resolver -> _get_s3_volume_driver_config. " diff --git a/services/director-v2/tests/unit/test_modules_dynamic_sidecar_docker_service_specs_sidecar.py b/services/director-v2/tests/unit/test_modules_dynamic_sidecar_docker_service_specs_sidecar.py index da645555f4ce..eb4c04dca65a 100644 --- a/services/director-v2/tests/unit/test_modules_dynamic_sidecar_docker_service_specs_sidecar.py +++ b/services/director-v2/tests/unit/test_modules_dynamic_sidecar_docker_service_specs_sidecar.py @@ -44,6 +44,7 @@ "POSTGRES_PASSWORD", "POSTGRES_PORT", "POSTGRES_USER", + "R_CLONE_MOUNT_SETTINGS", "R_CLONE_OPTION_BUFFER_SIZE", "R_CLONE_OPTION_RETRIES", "R_CLONE_OPTION_TRANSFERS", From c3078945539a53600417ec4706f419f647c90016 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 19 Dec 2025 07:41:39 +0100 Subject: [PATCH 71/79] refactor --- .../r_clone_mount/_container.py | 362 +++++++++++++++++ .../node_ports_common/r_clone_mount/_core.py | 364 +----------------- .../r_clone_mount/_models.py | 2 + .../node_ports_common/r_clone_mount/_utils.py | 10 + 4 files changed, 385 insertions(+), 353 deletions(-) create mode 100644 packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_container.py create mode 100644 packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_utils.py diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_container.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_container.py new file mode 100644 index 000000000000..4ce1775a4e47 --- /dev/null +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_container.py @@ -0,0 +1,362 @@ +import asyncio +import logging +from collections.abc import Awaitable, Callable +from datetime import timedelta +from functools import cached_property +from pathlib import Path +from textwrap 
import dedent +from typing import Any, Final + +from httpx import AsyncClient, HTTPError +from models_library.basic_types import PortInt +from models_library.progress_bar import ProgressReport +from models_library.projects_nodes_io import NodeID, StorageFileID +from pydantic import NonNegativeInt +from settings_library.r_clone import RCloneMountSettings, RCloneSettings +from tenacity import ( + before_sleep_log, + retry, + retry_if_exception_type, + stop_after_delay, + wait_fixed, +) + +from . import _docker_utils +from ._config_provider import CONFIG_KEY +from ._errors import ( + WaitingForQueueToBeEmptyError, + WaitingForTransfersToCompleteError, +) +from ._models import ( + GetBindPathsProtocol, + MountActivity, +) +from ._utils import get_mount_id + +_logger = logging.getLogger(__name__) + + +_MAX_WAIT_RC_HTTP_INTERFACE_READY: Final[timedelta] = timedelta(seconds=10) +_DEFAULT_UPDATE_INTERVAL: Final[timedelta] = timedelta(seconds=1) +_DEFAULT_R_CLONE_CLIENT_REQUEST_TIMEOUT: Final[timedelta] = timedelta(seconds=20) + + +_DOCKER_PREFIX_MOUNT: Final[str] = "rcm" + + +_R_CLONE_MOUNT_TEMPLATE: Final[str] = dedent( + """ +set -e + +MOUNT_POINT='{local_mount_path}' + +cleanup() {{ + echo 'STARTED CLEANUP...' + umount -f "$MOUNT_POINT" || true + echo 'FINISHED CLEANUP' +}} +trap cleanup SIGTERM SIGINT + +cat < {r_clone_config_path} +{r_clone_config_content} +EOF + +echo "Start command: {r_clone_command}" + +{r_clone_command} 2>&1 & + +RCLONE_PID=$! +wait "$RCLONE_PID" +echo "rclone exited, running cleanup (if not already triggered)..." 
+cleanup +""" +) + + +def _get_rclone_mount_command( + mount_settings: RCloneMountSettings, + r_clone_config_content: str, + remote_path: StorageFileID, + local_mount_path: Path, + remote_control_port: PortInt, + rc_user: str, + rc_password: str, +) -> str: + escaped_remote_path = f"{remote_path}".lstrip("/") + + r_clone_command = " ".join( + [ + "rclone", + "--config", + f"{mount_settings.R_CLONE_MOUNT_CONTAINER_CONFIG_FILE_PATH}", + ("-vv" if mount_settings.R_CLONE_MOUNT_CONTAINER_SHOW_DEBUG_LOGS else ""), + "mount", + f"{CONFIG_KEY}:{escaped_remote_path}", + f"{local_mount_path}", + # VFS + "--vfs-cache-mode", + "full", + "--vfs-read-ahead", + mount_settings.R_CLONE_MOUNT_VFS_READ_AHEAD, + "--vfs-cache-max-size", + mount_settings.R_CLONE_MOUNT_VFS_CACHE_MAX_SIZE, + "--vfs-cache-min-free-space", + mount_settings.R_CLONE_MOUNT_VFS_CACHE_MIN_FREE_SPACE, + "--vfs-cache-poll-interval", + mount_settings.R_CLONE_MOUNT_CACHE_POLL_INTERVAL, + "--write-back-cache", + "--vfs-write-back", + mount_settings.R_CLONE_MOUNT_VFS_WRITE_BACK, + "--cache-dir", + f"{mount_settings.R_CLONE_MOUNT_VFS_CACHE_PATH}", + "--dir-cache-time", + mount_settings.R_CLONE_MOUNT_DIR_CACHE_TIME, + "--attr-timeout", + mount_settings.R_CLONE_MOUNT_ATTR_TIMEOUT, + "--tpslimit", + f"{mount_settings.R_CLONE_MOUNT_TPSLIMIT}", + "--tpslimit-burst", + f"{mount_settings.R_CLONE_MOUNT_TPSLIMIT_BURST}", + "--no-modtime", + "--max-buffer-memory", + mount_settings.R_CLONE_MOUNT_MAX_BUFFER_MEMORY, + # TRANSFERS + "--retries", + f"{mount_settings.R_CLONE_MOUNT_RETRIES}", + "--retries-sleep", + mount_settings.R_CLONE_MOUNT_RETRIES_SLEEP, + "--transfers", + f"{mount_settings.R_CLONE_MOUNT_TRANSFERS}", + "--buffer-size", + mount_settings.R_CLONE_MOUNT_BUFFER_SIZE, + "--checkers", + f"{mount_settings.R_CLONE_MOUNT_CHECKERS}", + "--s3-upload-concurrency", + f"{mount_settings.R_CLONE_MOUNT_S3_UPLOAD_CONCURRENCY}", + "--s3-chunk-size", + mount_settings.R_CLONE_MOUNT_S3_CHUNK_SIZE, + "--order-by", + 
mount_settings.R_CLONE_MOUNT_ORDER_BY, + # REMOTE CONTROL + "--rc", + f"--rc-addr=0.0.0.0:{remote_control_port}", + "--rc-enable-metrics", + f"--rc-user='{rc_user}'", + f"--rc-pass='{rc_password}'", + "--allow-non-empty", + "--allow-other", + ] + ) + return _R_CLONE_MOUNT_TEMPLATE.format( + r_clone_config_path=mount_settings.R_CLONE_MOUNT_CONTAINER_CONFIG_FILE_PATH, + r_clone_config_content=r_clone_config_content, + r_clone_command=r_clone_command, + local_mount_path=local_mount_path, + ) + + +class ContainerManager: # pylint:disable=too-many-instance-attributes + def __init__( # pylint:disable=too-many-arguments + self, + r_clone_settings: RCloneSettings, + node_id: NodeID, + remote_control_port: PortInt, + local_mount_path: Path, + index: NonNegativeInt, + r_clone_config_content: str, + remote_path: str, + rc_user: str, + rc_password: str, + *, + handler_get_bind_paths: GetBindPathsProtocol, + ) -> None: + self.r_clone_settings = r_clone_settings + self.node_id = node_id + self.remote_control_port = remote_control_port + self.local_mount_path = local_mount_path + self.index = index + self.r_clone_config_content = r_clone_config_content + self.remote_path = remote_path + self.rc_user = rc_user + self.rc_password = rc_password + + self.handler_get_bind_paths = handler_get_bind_paths + + @cached_property + def r_clone_container_name(self) -> str: + mount_id = get_mount_id(self.local_mount_path, self.index) + return f"{_DOCKER_PREFIX_MOUNT}-c-{self.node_id}{mount_id}"[:63] + + @cached_property + def _r_clone_network_name(self) -> str: + mount_id = get_mount_id(self.local_mount_path, self.index) + return f"{_DOCKER_PREFIX_MOUNT}-c-{self.node_id}{mount_id}"[:63] + + async def create(self): + async with _docker_utils.get_or_crate_docker_session(None) as client: + # ensure nothing was left from previous runs + await _docker_utils.remove_container_if_exists( + client, self.r_clone_container_name + ) + await _docker_utils.remove_network_if_exists( + client, 
self.r_clone_container_name + ) + + # create network + container and connect to sidecar + await _docker_utils.create_network_and_connect_sidecar_container( + client, self._r_clone_network_name + ) + + assert self.r_clone_settings.R_CLONE_VERSION is not None # nosec + mount_settings = self.r_clone_settings.R_CLONE_MOUNT_SETTINGS + await _docker_utils.create_r_clone_container( + client, + self.r_clone_container_name, + command=_get_rclone_mount_command( + mount_settings=mount_settings, + r_clone_config_content=self.r_clone_config_content, + remote_path=self.remote_path, + local_mount_path=self.local_mount_path, + remote_control_port=self.remote_control_port, + rc_user=self.rc_user, + rc_password=self.rc_password, + ), + r_clone_version=self.r_clone_settings.R_CLONE_VERSION, + remote_control_port=self.remote_control_port, + r_clone_network_name=self._r_clone_network_name, + local_mount_path=self.local_mount_path, + memory_limit=mount_settings.R_CLONE_MOUNT_CONTAINER_MEMORY_LIMIT, + nano_cpus=mount_settings.R_CLONE_MOUNT_CONTAINER_NANO_CPUS, + handler_get_bind_paths=self.handler_get_bind_paths, + ) + + async def remove(self): + async with _docker_utils.get_or_crate_docker_session(None) as client: + await _docker_utils.remove_container_if_exists( + client, self.r_clone_container_name + ) + await _docker_utils.remove_network_if_exists( + client, self.r_clone_container_name + ) + + +class RemoteControlHttpClient: + def __init__( + self, + remote_control_port: PortInt, + mount_settings: RCloneMountSettings, + remote_control_host: str, + rc_user: str, + rc_password: str, + *, + update_handler: Callable[[MountActivity], Awaitable[None]], + update_interval: timedelta = _DEFAULT_UPDATE_INTERVAL, + r_clone_client_timeout: timedelta = _DEFAULT_R_CLONE_CLIENT_REQUEST_TIMEOUT, + ) -> None: + self.mount_settings = mount_settings + self._update_interval_seconds = update_interval.total_seconds() + self._r_clone_client_timeout = r_clone_client_timeout + self._rc_user = rc_user + 
self._rc_password = rc_password + self._update_handler = update_handler + + self._rc_host = remote_control_host + self._rc_port = remote_control_port + + @property + def _base_url(self) -> str: + return f"http://{self._rc_host}:{self._rc_port}" + + async def _request(self, method: str, path: str) -> Any: + request_url = f"{self._base_url}/{path}" + _logger.debug("Sending '%s %s' request", method, request_url) + + async with AsyncClient( + timeout=self._r_clone_client_timeout.total_seconds() + ) as client: + response = await client.request( + method, request_url, auth=(self._rc_user, self._rc_password) + ) + response.raise_for_status() + return response.json() + + async def _post_core_stats(self) -> dict: + return await self._request("POST", "core/stats") + + async def _post_vfs_queue(self) -> dict: + return await self._request("POST", "vfs/queue") + + async def _rc_noop(self) -> dict: + return await self._request("POST", "rc/noop") + + async def get_mount_activity(self) -> MountActivity: + core_stats, vfs_queue = await asyncio.gather( + self._post_core_stats(), self._post_vfs_queue() + ) + + return MountActivity( + transferring=( + { + x["name"]: ProgressReport( + actual_value=( + x["percentage"] / 100 if "percentage" in x else 0.0 + ) + ) + for x in core_stats["transferring"] + } + if "transferring" in core_stats + else {} + ), + queued=[x["name"] for x in vfs_queue["queue"]], + ) + + @retry( + wait=wait_fixed(1), + stop=stop_after_delay(_MAX_WAIT_RC_HTTP_INTERFACE_READY.total_seconds()), + reraise=True, + retry=retry_if_exception_type(HTTPError), + before_sleep=before_sleep_log(_logger, logging.WARNING), + ) + async def wait_for_interface_to_be_ready(self) -> None: + await self._rc_noop() + + async def is_responsive(self) -> bool: + try: + await self._rc_noop() + return True + except HTTPError: + return False + + async def wait_for_all_transfers_to_complete(self) -> None: + """ + Should be waited before closing the mount + to ensure all data is transferred to 
remote. + """ + + @retry( + wait=wait_fixed(1), + stop=stop_after_delay( + self.mount_settings.R_CLONE_MOUNT_TRANSFERS_COMPLETED_TIMEOUT.total_seconds() + ), + reraise=True, + retry=retry_if_exception_type( + (WaitingForQueueToBeEmptyError, WaitingForTransfersToCompleteError) + ), + before_sleep=before_sleep_log(_logger, logging.WARNING), + ) + async def _() -> None: + core_stats, vfs_queue = await asyncio.gather( + self._post_core_stats(), self._post_vfs_queue() + ) + + if ( + core_stats["transfers"] != core_stats["totalTransfers"] + or "transferring" in core_stats + ): + raise WaitingForTransfersToCompleteError + + queue = vfs_queue["queue"] + if len(queue) != 0: + raise WaitingForQueueToBeEmptyError(queue=queue) + + await _() diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py index c51f3e3f0ed3..660916e25e02 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py @@ -1,380 +1,38 @@ import asyncio import logging -from collections.abc import Awaitable, Callable from datetime import UTC, datetime, timedelta -from functools import cached_property from pathlib import Path -from textwrap import dedent -from typing import Any, Final +from typing import Final from uuid import uuid4 from common_library.async_tools import cancel_wait_task -from httpx import AsyncClient, HTTPError from models_library.basic_types import PortInt -from models_library.progress_bar import ProgressReport from models_library.projects_nodes_io import NodeID, StorageFileID from pydantic import NonNegativeInt from servicelib.background_task import create_periodic_task from servicelib.logging_utils import log_catch, log_context from servicelib.utils import unused_port -from settings_library.r_clone import RCloneMountSettings, RCloneSettings -from tenacity import ( - 
before_sleep_log, - retry, - retry_if_exception_type, - stop_after_delay, - wait_fixed, -) +from settings_library.r_clone import RCloneSettings -from . import _docker_utils -from ._config_provider import CONFIG_KEY, MountRemoteType, get_config_content +from ._config_provider import MountRemoteType, get_config_content +from ._container import ContainerManager, RemoteControlHttpClient from ._errors import ( MountAlreadyStartedError, - WaitingForQueueToBeEmptyError, - WaitingForTransfersToCompleteError, ) from ._models import ( GetBindPathsProtocol, MountActivity, MountActivityProtocol, + MountId, RequestShutdownProtocol, ) +from ._utils import get_mount_id _logger = logging.getLogger(__name__) -_MAX_WAIT_RC_HTTP_INTERFACE_READY: Final[timedelta] = timedelta(seconds=10) -_DEFAULT_UPDATE_INTERVAL: Final[timedelta] = timedelta(seconds=1) -_DEFAULT_R_CLONE_CLIENT_REQUEST_TIMEOUT: Final[timedelta] = timedelta(seconds=20) - _DEFAULT_MOUNT_ACTIVITY_UPDATE_INTERVAL: Final[timedelta] = timedelta(seconds=5) -_DOCKER_PREFIX_MOUNT: Final[str] = "rcm" - -type _MountId = str - -_R_CLONE_MOUNT_TEMPLATE: Final[str] = dedent( - """ -set -e - -MOUNT_POINT='{local_mount_path}' - -cleanup() {{ - echo 'STARTED CLEANUP...' - umount -f "$MOUNT_POINT" || true - echo 'FINISHED CLEANUP' -}} -trap cleanup SIGTERM SIGINT - -cat < {r_clone_config_path} -{r_clone_config_content} -EOF - -echo "Start command: {r_clone_command}" - -{r_clone_command} 2>&1 & - -RCLONE_PID=$! -wait "$RCLONE_PID" -echo "rclone exited, running cleanup (if not already triggered)..." 
-cleanup -""" -) - - -def _get_rclone_mount_command( - mount_settings: RCloneMountSettings, - r_clone_config_content: str, - remote_path: StorageFileID, - local_mount_path: Path, - remote_control_port: PortInt, - rc_user: str, - rc_password: str, -) -> str: - escaped_remote_path = f"{remote_path}".lstrip("/") - - r_clone_command = " ".join( - [ - "rclone", - "--config", - f"{mount_settings.R_CLONE_MOUNT_CONTAINER_CONFIG_FILE_PATH}", - ("-vv" if mount_settings.R_CLONE_MOUNT_CONTAINER_SHOW_DEBUG_LOGS else ""), - "mount", - f"{CONFIG_KEY}:{escaped_remote_path}", - f"{local_mount_path}", - # VFS - "--vfs-cache-mode", - "full", - "--vfs-read-ahead", - mount_settings.R_CLONE_MOUNT_VFS_READ_AHEAD, - "--vfs-cache-max-size", - mount_settings.R_CLONE_MOUNT_VFS_CACHE_MAX_SIZE, - "--vfs-cache-min-free-space", - mount_settings.R_CLONE_MOUNT_VFS_CACHE_MIN_FREE_SPACE, - "--vfs-cache-poll-interval", - mount_settings.R_CLONE_MOUNT_CACHE_POLL_INTERVAL, - "--write-back-cache", - "--vfs-write-back", - mount_settings.R_CLONE_MOUNT_VFS_WRITE_BACK, - "--cache-dir", - f"{mount_settings.R_CLONE_MOUNT_VFS_CACHE_PATH}", - "--dir-cache-time", - mount_settings.R_CLONE_MOUNT_DIR_CACHE_TIME, - "--attr-timeout", - mount_settings.R_CLONE_MOUNT_ATTR_TIMEOUT, - "--tpslimit", - f"{mount_settings.R_CLONE_MOUNT_TPSLIMIT}", - "--tpslimit-burst", - f"{mount_settings.R_CLONE_MOUNT_TPSLIMIT_BURST}", - "--no-modtime", - "--max-buffer-memory", - mount_settings.R_CLONE_MOUNT_MAX_BUFFER_MEMORY, - # TRANSFERS - "--retries", - f"{mount_settings.R_CLONE_MOUNT_RETRIES}", - "--retries-sleep", - mount_settings.R_CLONE_MOUNT_RETRIES_SLEEP, - "--transfers", - f"{mount_settings.R_CLONE_MOUNT_TRANSFERS}", - "--buffer-size", - mount_settings.R_CLONE_MOUNT_BUFFER_SIZE, - "--checkers", - f"{mount_settings.R_CLONE_MOUNT_CHECKERS}", - "--s3-upload-concurrency", - f"{mount_settings.R_CLONE_MOUNT_S3_UPLOAD_CONCURRENCY}", - "--s3-chunk-size", - mount_settings.R_CLONE_MOUNT_S3_CHUNK_SIZE, - "--order-by", - 
mount_settings.R_CLONE_MOUNT_ORDER_BY, - # REMOTE CONTROL - "--rc", - f"--rc-addr=0.0.0.0:{remote_control_port}", - "--rc-enable-metrics", - f"--rc-user='{rc_user}'", - f"--rc-pass='{rc_password}'", - "--allow-non-empty", - "--allow-other", - ] - ) - return _R_CLONE_MOUNT_TEMPLATE.format( - r_clone_config_path=mount_settings.R_CLONE_MOUNT_CONTAINER_CONFIG_FILE_PATH, - r_clone_config_content=r_clone_config_content, - r_clone_command=r_clone_command, - local_mount_path=local_mount_path, - ) - - -def _get_mount_id(local_mount_path: Path, index: NonNegativeInt) -> _MountId: - # unique reproducible id for the mount - return f"{index}{local_mount_path}".replace("/", "_")[::-1] - - -class ContainerManager: # pylint:disable=too-many-instance-attributes - def __init__( # pylint:disable=too-many-arguments - self, - r_clone_settings: RCloneSettings, - node_id: NodeID, - remote_control_port: PortInt, - local_mount_path: Path, - index: NonNegativeInt, - r_clone_config_content: str, - remote_path: str, - rc_user: str, - rc_password: str, - *, - handler_get_bind_paths: GetBindPathsProtocol, - ) -> None: - self.r_clone_settings = r_clone_settings - self.node_id = node_id - self.remote_control_port = remote_control_port - self.local_mount_path = local_mount_path - self.index = index - self.r_clone_config_content = r_clone_config_content - self.remote_path = remote_path - self.rc_user = rc_user - self.rc_password = rc_password - - self.handler_get_bind_paths = handler_get_bind_paths - - @cached_property - def r_clone_container_name(self) -> str: - mount_id = _get_mount_id(self.local_mount_path, self.index) - return f"{_DOCKER_PREFIX_MOUNT}-c-{self.node_id}{mount_id}"[:63] - - @cached_property - def _r_clone_network_name(self) -> str: - mount_id = _get_mount_id(self.local_mount_path, self.index) - return f"{_DOCKER_PREFIX_MOUNT}-c-{self.node_id}{mount_id}"[:63] - - async def create(self): - async with _docker_utils.get_or_crate_docker_session(None) as client: - # ensure nothing was 
left from previous runs - await _docker_utils.remove_container_if_exists( - client, self.r_clone_container_name - ) - await _docker_utils.remove_network_if_exists( - client, self.r_clone_container_name - ) - - # create network + container and connect to sidecar - await _docker_utils.create_network_and_connect_sidecar_container( - client, self._r_clone_network_name - ) - - assert self.r_clone_settings.R_CLONE_VERSION is not None # nosec - mount_settings = self.r_clone_settings.R_CLONE_MOUNT_SETTINGS - await _docker_utils.create_r_clone_container( - client, - self.r_clone_container_name, - command=_get_rclone_mount_command( - mount_settings=mount_settings, - r_clone_config_content=self.r_clone_config_content, - remote_path=self.remote_path, - local_mount_path=self.local_mount_path, - remote_control_port=self.remote_control_port, - rc_user=self.rc_user, - rc_password=self.rc_password, - ), - r_clone_version=self.r_clone_settings.R_CLONE_VERSION, - remote_control_port=self.remote_control_port, - r_clone_network_name=self._r_clone_network_name, - local_mount_path=self.local_mount_path, - memory_limit=mount_settings.R_CLONE_MOUNT_CONTAINER_MEMORY_LIMIT, - nano_cpus=mount_settings.R_CLONE_MOUNT_CONTAINER_NANO_CPUS, - handler_get_bind_paths=self.handler_get_bind_paths, - ) - - async def remove(self): - async with _docker_utils.get_or_crate_docker_session(None) as client: - await _docker_utils.remove_container_if_exists( - client, self.r_clone_container_name - ) - await _docker_utils.remove_network_if_exists( - client, self.r_clone_container_name - ) - - -class RemoteControlHttpClient: - def __init__( - self, - remote_control_port: PortInt, - mount_settings: RCloneMountSettings, - remote_control_host: str, - rc_user: str, - rc_password: str, - *, - update_handler: Callable[[MountActivity], Awaitable[None]], - update_interval: timedelta = _DEFAULT_UPDATE_INTERVAL, - r_clone_client_timeout: timedelta = _DEFAULT_R_CLONE_CLIENT_REQUEST_TIMEOUT, - ) -> None: - 
self.mount_settings = mount_settings - self._update_interval_seconds = update_interval.total_seconds() - self._r_clone_client_timeout = r_clone_client_timeout - self._rc_user = rc_user - self._rc_password = rc_password - self._update_handler = update_handler - - self._rc_host = remote_control_host - self._rc_port = remote_control_port - - @property - def _base_url(self) -> str: - return f"http://{self._rc_host}:{self._rc_port}" - - async def _request(self, method: str, path: str) -> Any: - request_url = f"{self._base_url}/{path}" - _logger.debug("Sending '%s %s' request", method, request_url) - - async with AsyncClient( - timeout=self._r_clone_client_timeout.total_seconds() - ) as client: - response = await client.request( - method, request_url, auth=(self._rc_user, self._rc_password) - ) - response.raise_for_status() - return response.json() - - async def _post_core_stats(self) -> dict: - return await self._request("POST", "core/stats") - - async def _post_vfs_queue(self) -> dict: - return await self._request("POST", "vfs/queue") - - async def _rc_noop(self) -> dict: - return await self._request("POST", "rc/noop") - - async def get_mount_activity(self) -> MountActivity: - core_stats, vfs_queue = await asyncio.gather( - self._post_core_stats(), self._post_vfs_queue() - ) - - return MountActivity( - transferring=( - { - x["name"]: ProgressReport( - actual_value=( - x["percentage"] / 100 if "percentage" in x else 0.0 - ) - ) - for x in core_stats["transferring"] - } - if "transferring" in core_stats - else {} - ), - queued=[x["name"] for x in vfs_queue["queue"]], - ) - - @retry( - wait=wait_fixed(1), - stop=stop_after_delay(_MAX_WAIT_RC_HTTP_INTERFACE_READY.total_seconds()), - reraise=True, - retry=retry_if_exception_type(HTTPError), - before_sleep=before_sleep_log(_logger, logging.WARNING), - ) - async def wait_for_interface_to_be_ready(self) -> None: - await self._rc_noop() - - async def is_responsive(self) -> bool: - try: - await self._rc_noop() - return True - 
except HTTPError: - return False - - async def wait_for_all_transfers_to_complete(self) -> None: - """ - Should be waited before closing the mount - to ensure all data is transferred to remote. - """ - - @retry( - wait=wait_fixed(1), - stop=stop_after_delay( - self.mount_settings.R_CLONE_MOUNT_TRANSFERS_COMPLETED_TIMEOUT.total_seconds() - ), - reraise=True, - retry=retry_if_exception_type( - (WaitingForQueueToBeEmptyError, WaitingForTransfersToCompleteError) - ), - before_sleep=before_sleep_log(_logger, logging.WARNING), - ) - async def _() -> None: - core_stats, vfs_queue = await asyncio.gather( - self._post_core_stats(), self._post_vfs_queue() - ) - - if ( - core_stats["transfers"] != core_stats["totalTransfers"] - or "transferring" in core_stats - ): - raise WaitingForTransfersToCompleteError - - queue = vfs_queue["queue"] - if len(queue) != 0: - raise WaitingForQueueToBeEmptyError(queue=queue) - - await _() - class TrackedMount: # pylint:disable=too-many-instance-attributes def __init__( # pylint:disable=too-many-arguments @@ -460,7 +118,7 @@ async def start_mount(self) -> None: self._task_mount_activity = create_periodic_task( self._worker_mount_activity, interval=self._mount_activity_update_interval, - task_name=f"rclone-mount-activity-{_get_mount_id(self.local_mount_path, self.index)}", + task_name=f"rclone-mount-activity-{get_mount_id(self.local_mount_path, self.index)}", ) async def stop_mount(self, *, skip_transfer_wait: bool = False) -> None: @@ -491,7 +149,7 @@ def __init__( msg = "R_CLONE_VERSION setting is not set" raise RuntimeError(msg) - self._tracked_mounts: dict[_MountId, TrackedMount] = {} + self._tracked_mounts: dict[MountId, TrackedMount] = {} self._task_ensure_mounts_working: asyncio.Task[None] | None = None async def ensure_mounted( @@ -511,7 +169,7 @@ async def ensure_mounted( f"mounting {local_mount_path=} from {remote_path=}", log_duration=True, ): - mount_id = _get_mount_id(local_mount_path, index) + mount_id = 
get_mount_id(local_mount_path, index) if mount_id in self._tracked_mounts: tracked_mount = self._tracked_mounts[mount_id] raise MountAlreadyStartedError(local_mount_path=local_mount_path) @@ -537,7 +195,7 @@ async def ensure_mounted( def is_mount_tracked(self, local_mount_path: Path, index: NonNegativeInt) -> bool: """True if if a mount is being tracked""" - mount_id = _get_mount_id(local_mount_path, index) + mount_id = get_mount_id(local_mount_path, index) return mount_id in self._tracked_mounts async def ensure_unmounted( @@ -546,7 +204,7 @@ async def ensure_unmounted( with log_context( _logger, logging.INFO, f"unmounting {local_mount_path=}", log_duration=True ): - mount_id = _get_mount_id(local_mount_path, index) + mount_id = get_mount_id(local_mount_path, index) tracked_mount = self._tracked_mounts[mount_id] await tracked_mount.wait_for_all_transfers_to_complete() diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_models.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_models.py index 87f113beceaa..de3f4fa56026 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_models.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_models.py @@ -4,6 +4,8 @@ from models_library.progress_bar import ProgressReport from pydantic import BaseModel +type MountId = str + class MountActivity(BaseModel): transferring: dict[str, ProgressReport] diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_utils.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_utils.py new file mode 100644 index 000000000000..0c864322fe26 --- /dev/null +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_utils.py @@ -0,0 +1,10 @@ +from pathlib import Path + +from pydantic import NonNegativeInt + +from ._models import MountId + + +def get_mount_id(local_mount_path: Path, index: NonNegativeInt) -> MountId: + # unique 
reproducible id for the mount + return f"{index}{local_mount_path}".replace("/", "_")[::-1] From e1465d24cc34cc55e60d22260d1b00f93a870906 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 19 Dec 2025 07:43:15 +0100 Subject: [PATCH 72/79] rename --- .../src/simcore_sdk/node_ports_common/r_clone_mount/__init__.py | 2 +- .../node_ports_common/r_clone_mount/{_core.py => _manager.py} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/{_core.py => _manager.py} (100%) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/__init__.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/__init__.py index 505e51bb31d4..cb49732258b3 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/__init__.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/__init__.py @@ -1,6 +1,6 @@ from ._config_provider import MountRemoteType -from ._core import RCloneMountManager from ._errors import MountAlreadyStartedError +from ._manager import RCloneMountManager from ._models import ( GetBindPathsProtocol, MountActivity, diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_manager.py similarity index 100% rename from packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_core.py rename to packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_manager.py From b330de6d77558ab0b9867442280481476ed8ab0f Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 19 Dec 2025 07:46:36 +0100 Subject: [PATCH 73/79] no negative numbers --- .../modules/long_running_tasks.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py 
b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py index 9546af32d247..a1789cdf5edd 100644 --- a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py +++ b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py @@ -400,12 +400,10 @@ async def _handler_get_bind_path( async def _handler_mount_activity(state_path: Path, activity: MountActivity) -> None: - waiting_in_queue = len(activity.transferring) - len(activity.queued) - # TODO: this object should be pushed to the FE in the future activity_summary = { "path": state_path, - "waiting_in_queue": waiting_in_queue, + "queued": len(activity.queued), "transferring": activity.transferring, } _logger.info("activity_summary=%s", activity_summary) From fe7744dfd75bd456a704022f1b26c2297c447dfd Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 19 Dec 2025 07:50:02 +0100 Subject: [PATCH 74/79] refactor --- .../r_clone_mount/__init__.py | 2 ++ .../r_clone_mount/_models.py | 4 +++- .../modules/long_running_tasks.py | 20 ++++++++++++------- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/__init__.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/__init__.py index cb49732258b3..18d360c189a7 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/__init__.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/__init__.py @@ -6,6 +6,7 @@ MountActivity, MountActivityProtocol, RequestShutdownProtocol, + Transferring, ) __all__: tuple[str, ...] 
= ( @@ -16,4 +17,5 @@ "MountRemoteType", "RCloneMountManager", "RequestShutdownProtocol", + "Transferring", ) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_models.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_models.py index de3f4fa56026..a4809587e9a1 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_models.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_models.py @@ -6,9 +6,11 @@ type MountId = str +type Transferring = dict[str, ProgressReport] + class MountActivity(BaseModel): - transferring: dict[str, ProgressReport] + transferring: Transferring queued: list[str] diff --git a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py index a1789cdf5edd..776d2d12563c 100644 --- a/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py +++ b/services/dynamic-sidecar/src/simcore_service_dynamic_sidecar/modules/long_running_tasks.py @@ -2,6 +2,7 @@ import logging from collections.abc import AsyncGenerator from contextlib import asynccontextmanager +from dataclasses import dataclass from pathlib import Path from typing import Any, Final @@ -19,7 +20,7 @@ from servicelib.progress_bar import ProgressBarData from servicelib.utils import logged_gather from simcore_sdk.node_data import data_manager -from simcore_sdk.node_ports_common.r_clone_mount import MountActivity +from simcore_sdk.node_ports_common.r_clone_mount import MountActivity, Transferring from tenacity import retry from tenacity.before_sleep import before_sleep_log from tenacity.retry import retry_if_result @@ -399,14 +400,19 @@ async def _handler_get_bind_path( return bind_paths +@dataclass +class MountActivitySummary: + path: Path + queued: int + transferring: Transferring + + async def _handler_mount_activity(state_path: 
Path, activity: MountActivity) -> None: # TODO: this object should be pushed to the FE in the future - activity_summary = { - "path": state_path, - "queued": len(activity.queued), - "transferring": activity.transferring, - } - _logger.info("activity_summary=%s", activity_summary) + summary = MountActivitySummary( + path=state_path, queued=len(activity.queued), transferring=activity.transferring + ) + _logger.info("Mount activity %s", summary) async def _restore_state_folder( From 90ff87ee959d8f864365f63b65417a233d3f9ddb Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 19 Dec 2025 08:50:51 +0100 Subject: [PATCH 75/79] removed unused --- .../r_clone_mount/_container.py | 3 - .../r_clone_mount/_manager.py | 59 +++++++++---------- 2 files changed, 28 insertions(+), 34 deletions(-) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_container.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_container.py index 4ce1775a4e47..252177dfd94d 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_container.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_container.py @@ -1,6 +1,5 @@ import asyncio import logging -from collections.abc import Awaitable, Callable from datetime import timedelta from functools import cached_property from pathlib import Path @@ -249,7 +248,6 @@ def __init__( rc_user: str, rc_password: str, *, - update_handler: Callable[[MountActivity], Awaitable[None]], update_interval: timedelta = _DEFAULT_UPDATE_INTERVAL, r_clone_client_timeout: timedelta = _DEFAULT_R_CLONE_CLIENT_REQUEST_TIMEOUT, ) -> None: @@ -258,7 +256,6 @@ def __init__( self._r_clone_client_timeout = r_clone_client_timeout self._rc_user = rc_user self._rc_password = rc_password - self._update_handler = update_handler self._rc_host = remote_control_host self._rc_port = remote_control_port diff --git 
a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_manager.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_manager.py index 660916e25e02..c76c2d693b9d 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_manager.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_manager.py @@ -34,12 +34,12 @@ _DEFAULT_MOUNT_ACTIVITY_UPDATE_INTERVAL: Final[timedelta] = timedelta(seconds=5) -class TrackedMount: # pylint:disable=too-many-instance-attributes - def __init__( # pylint:disable=too-many-arguments +class _TrackedMount: + def __init__( self, node_id: NodeID, r_clone_settings: RCloneSettings, - remote_type: MountRemoteType, + mount_remote_type: MountRemoteType, *, rc_port: PortInt, remote_path: StorageFileID, @@ -49,49 +49,47 @@ def __init__( # pylint:disable=too-many-arguments handler_mount_activity: MountActivityProtocol, mount_activity_update_interval: timedelta = _DEFAULT_MOUNT_ACTIVITY_UPDATE_INTERVAL, ) -> None: - self.node_id = node_id - self.r_clone_settings = r_clone_settings - self.mount_type = remote_type - self.rc_port = rc_port self.remote_path = remote_path self.local_mount_path = local_mount_path self.index = index - self.rc_user = f"{uuid4()}" - self.rc_password = f"{uuid4()}" - self.handler_get_bind_paths = handler_get_bind_paths - self.handler_mount_activity = handler_mount_activity + + self._handler_mount_activity = handler_mount_activity + self._mount_activity_update_interval = mount_activity_update_interval self._last_mount_activity: MountActivity | None = None self._last_mount_activity_update: datetime = datetime.fromtimestamp(0, UTC) - self._mount_activity_update_interval = mount_activity_update_interval self._task_mount_activity: asyncio.Task[None] | None = None + rc_user = f"{uuid4()}" + rc_password = f"{uuid4()}" + # used internally to handle the mount command self._container_manager = ContainerManager( - r_clone_settings=self.r_clone_settings, - 
node_id=self.node_id, - remote_control_port=self.rc_port, + r_clone_settings=r_clone_settings, + node_id=node_id, + remote_control_port=rc_port, local_mount_path=self.local_mount_path, index=self.index, r_clone_config_content=get_config_content( - self.r_clone_settings, self.mount_type + r_clone_settings, mount_remote_type ), - remote_path=f"{self.r_clone_settings.R_CLONE_S3.S3_BUCKET_NAME}/{self.remote_path}", - rc_user=self.rc_user, - rc_password=self.rc_password, - handler_get_bind_paths=self.handler_get_bind_paths, + remote_path=f"{r_clone_settings.R_CLONE_S3.S3_BUCKET_NAME}/{self.remote_path}", + rc_user=rc_user, + rc_password=rc_password, + handler_get_bind_paths=handler_get_bind_paths, ) self._rc_http_client = RemoteControlHttpClient( - remote_control_port=self.rc_port, - mount_settings=self.r_clone_settings.R_CLONE_MOUNT_SETTINGS, + remote_control_port=rc_port, + mount_settings=r_clone_settings.R_CLONE_MOUNT_SETTINGS, remote_control_host=self._container_manager.r_clone_container_name, - rc_user=self.rc_user, - rc_password=self.rc_password, - update_handler=self._handler_mount_activity, + rc_user=rc_user, + rc_password=rc_password, ) - async def _handler_mount_activity(self, mount_activity: MountActivity) -> None: + async def _update_and_notify_mount_activity( + self, mount_activity: MountActivity + ) -> None: now = datetime.now(UTC) enough_time_passed = ( @@ -103,12 +101,12 @@ async def _handler_mount_activity(self, mount_activity: MountActivity) -> None: self._last_mount_activity = mount_activity self._last_mount_activity_update = now - await self.handler_mount_activity(self.local_mount_path, mount_activity) + await self._handler_mount_activity(self.local_mount_path, mount_activity) async def _worker_mount_activity(self) -> None: mount_activity = await self._rc_http_client.get_mount_activity() with log_catch(logger=_logger, reraise=False): - await self._handler_mount_activity(mount_activity) + await self._update_and_notify_mount_activity(mount_activity) 
async def start_mount(self) -> None: await self._container_manager.create() @@ -149,7 +147,7 @@ def __init__( msg = "R_CLONE_VERSION setting is not set" raise RuntimeError(msg) - self._tracked_mounts: dict[MountId, TrackedMount] = {} + self._tracked_mounts: dict[MountId, _TrackedMount] = {} self._task_ensure_mounts_working: asyncio.Task[None] | None = None async def ensure_mounted( @@ -178,7 +176,7 @@ async def ensure_mounted( None, unused_port ) - tracked_mount = TrackedMount( + tracked_mount = _TrackedMount( node_id, self.r_clone_settings, remote_type, @@ -194,7 +192,6 @@ async def ensure_mounted( self._tracked_mounts[mount_id] = tracked_mount def is_mount_tracked(self, local_mount_path: Path, index: NonNegativeInt) -> bool: - """True if if a mount is being tracked""" mount_id = get_mount_id(local_mount_path, index) return mount_id in self._tracked_mounts From 64244e4bb25451e84af668a8a4c19d4bff3c3f28 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 19 Dec 2025 09:05:34 +0100 Subject: [PATCH 76/79] refactor --- .../r_clone_mount/_container.py | 38 +++++++------------ .../r_clone_mount/_docker_utils.py | 4 +- .../r_clone_mount/_manager.py | 12 +++--- 3 files changed, 22 insertions(+), 32 deletions(-) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_container.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_container.py index 252177dfd94d..fe911ec3d1c9 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_container.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_container.py @@ -77,7 +77,7 @@ def _get_rclone_mount_command( r_clone_config_content: str, remote_path: StorageFileID, local_mount_path: Path, - remote_control_port: PortInt, + rc_port: PortInt, rc_user: str, rc_password: str, ) -> str: @@ -138,7 +138,7 @@ def _get_rclone_mount_command( mount_settings.R_CLONE_MOUNT_ORDER_BY, # REMOTE CONTROL "--rc", - 
f"--rc-addr=0.0.0.0:{remote_control_port}", + f"--rc-addr=0.0.0.0:{rc_port}", "--rc-enable-metrics", f"--rc-user='{rc_user}'", f"--rc-pass='{rc_password}'", @@ -159,7 +159,7 @@ def __init__( # pylint:disable=too-many-arguments self, r_clone_settings: RCloneSettings, node_id: NodeID, - remote_control_port: PortInt, + rc_port: PortInt, local_mount_path: Path, index: NonNegativeInt, r_clone_config_content: str, @@ -171,7 +171,7 @@ def __init__( # pylint:disable=too-many-arguments ) -> None: self.r_clone_settings = r_clone_settings self.node_id = node_id - self.remote_control_port = remote_control_port + self.rc_port = rc_port self.local_mount_path = local_mount_path self.index = index self.r_clone_config_content = r_clone_config_content @@ -216,12 +216,12 @@ async def create(self): r_clone_config_content=self.r_clone_config_content, remote_path=self.remote_path, local_mount_path=self.local_mount_path, - remote_control_port=self.remote_control_port, + rc_port=self.rc_port, rc_user=self.rc_user, rc_password=self.rc_password, ), r_clone_version=self.r_clone_settings.R_CLONE_VERSION, - remote_control_port=self.remote_control_port, + rc_port=self.rc_port, r_clone_network_name=self._r_clone_network_name, local_mount_path=self.local_mount_path, memory_limit=mount_settings.R_CLONE_MOUNT_CONTAINER_MEMORY_LIMIT, @@ -242,27 +242,21 @@ async def remove(self): class RemoteControlHttpClient: def __init__( self, - remote_control_port: PortInt, - mount_settings: RCloneMountSettings, - remote_control_host: str, + rc_host: str, + rc_port: PortInt, rc_user: str, rc_password: str, *, + transfers_completed_timeout: timedelta, update_interval: timedelta = _DEFAULT_UPDATE_INTERVAL, r_clone_client_timeout: timedelta = _DEFAULT_R_CLONE_CLIENT_REQUEST_TIMEOUT, ) -> None: - self.mount_settings = mount_settings + self.transfers_completed_timeout = transfers_completed_timeout self._update_interval_seconds = update_interval.total_seconds() self._r_clone_client_timeout = r_clone_client_timeout - 
self._rc_user = rc_user - self._rc_password = rc_password - self._rc_host = remote_control_host - self._rc_port = remote_control_port - - @property - def _base_url(self) -> str: - return f"http://{self._rc_host}:{self._rc_port}" + self._base_url = f"http://{rc_host}:{rc_port}" + self._auth = (rc_user, rc_password) async def _request(self, method: str, path: str) -> Any: request_url = f"{self._base_url}/{path}" @@ -271,9 +265,7 @@ async def _request(self, method: str, path: str) -> Any: async with AsyncClient( timeout=self._r_clone_client_timeout.total_seconds() ) as client: - response = await client.request( - method, request_url, auth=(self._rc_user, self._rc_password) - ) + response = await client.request(method, request_url, auth=self._auth) response.raise_for_status() return response.json() @@ -332,9 +324,7 @@ async def wait_for_all_transfers_to_complete(self) -> None: @retry( wait=wait_fixed(1), - stop=stop_after_delay( - self.mount_settings.R_CLONE_MOUNT_TRANSFERS_COMPLETED_TIMEOUT.total_seconds() - ), + stop=stop_after_delay(self.transfers_completed_timeout.total_seconds()), reraise=True, retry=retry_if_exception_type( (WaitingForQueueToBeEmptyError, WaitingForTransfersToCompleteError) diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_docker_utils.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_docker_utils.py index 500304744c27..22dcb14c00a5 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_docker_utils.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_docker_utils.py @@ -40,7 +40,7 @@ async def create_r_clone_container( *, command: str, r_clone_version: str, - remote_control_port: PortInt, + rc_port: PortInt, r_clone_network_name: str, local_mount_path: Path, memory_limit: ByteSize, @@ -53,7 +53,7 @@ async def create_r_clone_container( config={ "Image": f"rclone/rclone:{r_clone_version}", "Entrypoint": ["/bin/sh", "-c", f"{command}"], - 
"ExposedPorts": {f"{remote_control_port}/tcp": {}}, + "ExposedPorts": {f"{rc_port}/tcp": {}}, "HostConfig": { "NetworkMode": r_clone_network_name, "Binds": [], diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_manager.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_manager.py index c76c2d693b9d..b15f80d13a80 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_manager.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_manager.py @@ -34,8 +34,8 @@ _DEFAULT_MOUNT_ACTIVITY_UPDATE_INTERVAL: Final[timedelta] = timedelta(seconds=5) -class _TrackedMount: - def __init__( +class _TrackedMount: # pylint:disable=too-many-instance-attributes + def __init__( # pylint:disable=too-many-arguments self, node_id: NodeID, r_clone_settings: RCloneSettings, @@ -67,7 +67,7 @@ def __init__( self._container_manager = ContainerManager( r_clone_settings=r_clone_settings, node_id=node_id, - remote_control_port=rc_port, + rc_port=rc_port, local_mount_path=self.local_mount_path, index=self.index, r_clone_config_content=get_config_content( @@ -80,11 +80,11 @@ def __init__( ) self._rc_http_client = RemoteControlHttpClient( - remote_control_port=rc_port, - mount_settings=r_clone_settings.R_CLONE_MOUNT_SETTINGS, - remote_control_host=self._container_manager.r_clone_container_name, + rc_port=rc_port, + rc_host=self._container_manager.r_clone_container_name, rc_user=rc_user, rc_password=rc_password, + transfers_completed_timeout=r_clone_settings.R_CLONE_MOUNT_SETTINGS.R_CLONE_MOUNT_TRANSFERS_COMPLETED_TIMEOUT, ) async def _update_and_notify_mount_activity( From 899e17b3110b3bbc820c5a5d5a077bfbaebba5eb Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 19 Dec 2025 10:44:59 +0100 Subject: [PATCH 77/79] added base mount test --- .../r_clone_mount/_container.py | 9 +- .../r_clone_mount/_docker_utils.py | 67 +++-- .../r_clone_mount/_manager.py | 5 +- 
.../test_node_ports_common_r_clone_mount.py | 250 ++++++++++++++++++ 4 files changed, 304 insertions(+), 27 deletions(-) create mode 100644 packages/simcore-sdk/tests/unit/test_node_ports_common_r_clone_mount.py diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_container.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_container.py index fe911ec3d1c9..8ca4a8d5fd45 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_container.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_container.py @@ -255,9 +255,14 @@ def __init__( self._update_interval_seconds = update_interval.total_seconds() self._r_clone_client_timeout = r_clone_client_timeout - self._base_url = f"http://{rc_host}:{rc_port}" + self.rc_host = rc_host + self.rc_port = rc_port self._auth = (rc_user, rc_password) + @property + def _base_url(self) -> float: + return f"http://{self.rc_host}:{self.rc_port}" + async def _request(self, method: str, path: str) -> Any: request_url = f"{self._base_url}/{path}" _logger.debug("Sending '%s %s' request", method, request_url) @@ -307,7 +312,7 @@ async def get_mount_activity(self) -> MountActivity: before_sleep=before_sleep_log(_logger, logging.WARNING), ) async def wait_for_interface_to_be_ready(self) -> None: - await self._rc_noop() + await self._post_vfs_queue() async def is_responsive(self) -> bool: try: diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_docker_utils.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_docker_utils.py index 22dcb14c00a5..185eb20e8414 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_docker_utils.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_docker_utils.py @@ -8,6 +8,7 @@ from aiodocker import Docker from aiodocker.exceptions import DockerError from aiodocker.networks import DockerNetwork +from 
aiodocker.types import JSONObject from models_library.basic_types import PortInt from pydantic import ByteSize, NonNegativeInt @@ -34,6 +35,40 @@ async def get_or_crate_docker_session(docker: Docker | None) -> AsyncIterator[Do yield client +async def _get_config( + command: str, + r_clone_version: str, + rc_port: PortInt, + r_clone_network_name: str, + local_mount_path: Path, + memory_limit: ByteSize, + nano_cpus: NonNegativeInt, + handler_get_bind_paths: GetBindPathsProtocol, +) -> JSONObject: + return { + "Image": f"rclone/rclone:{r_clone_version}", + "Entrypoint": ["/bin/sh", "-c", f"{command}"], + "ExposedPorts": {f"{rc_port}/tcp": {}}, + "HostConfig": { + "NetworkMode": r_clone_network_name, + "Binds": [], + "Mounts": await handler_get_bind_paths(local_mount_path), + "Devices": [ + { + "PathOnHost": "/dev/fuse", + "PathInContainer": "/dev/fuse", + "CgroupPermissions": "rwm", + } + ], + "CapAdd": ["SYS_ADMIN"], + "SecurityOpt": ["apparmor:unconfined", "seccomp:unconfined"], + "Memory": memory_limit, + "MemorySwap": memory_limit, + "NanoCpus": nano_cpus, + }, + } + + async def create_r_clone_container( docker: Docker | None, container_name: str, @@ -50,28 +85,16 @@ async def create_r_clone_container( async with get_or_crate_docker_session(docker) as client: # create rclone container attached to the network r_clone_container = await client.containers.run( - config={ - "Image": f"rclone/rclone:{r_clone_version}", - "Entrypoint": ["/bin/sh", "-c", f"{command}"], - "ExposedPorts": {f"{rc_port}/tcp": {}}, - "HostConfig": { - "NetworkMode": r_clone_network_name, - "Binds": [], - "Mounts": await handler_get_bind_paths(local_mount_path), - "Devices": [ - { - "PathOnHost": "/dev/fuse", - "PathInContainer": "/dev/fuse", - "CgroupPermissions": "rwm", - } - ], - "CapAdd": ["SYS_ADMIN"], - "SecurityOpt": ["apparmor:unconfined", "seccomp:unconfined"], - "Memory": memory_limit, - "MemorySwap": memory_limit, - "NanoCpus": nano_cpus, - }, - }, + config=await _get_config( + 
command, + r_clone_version, + rc_port, + r_clone_network_name, + local_mount_path, + memory_limit, + nano_cpus, + handler_get_bind_paths, + ), name=container_name, ) container_inspect = await r_clone_container.show() diff --git a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_manager.py b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_manager.py index b15f80d13a80..9a3cce416ecc 100644 --- a/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_manager.py +++ b/packages/simcore-sdk/src/simcore_sdk/node_ports_common/r_clone_mount/_manager.py @@ -187,9 +187,8 @@ async def ensure_mounted( handler_get_bind_paths=handler_get_bind_paths, handler_mount_activity=handler_mount_activity, ) - await tracked_mount.start_mount() - self._tracked_mounts[mount_id] = tracked_mount + await tracked_mount.start_mount() def is_mount_tracked(self, local_mount_path: Path, index: NonNegativeInt) -> bool: mount_id = get_mount_id(local_mount_path, index) @@ -202,7 +201,7 @@ async def ensure_unmounted( _logger, logging.INFO, f"unmounting {local_mount_path=}", log_duration=True ): mount_id = get_mount_id(local_mount_path, index) - tracked_mount = self._tracked_mounts[mount_id] + tracked_mount = self._tracked_mounts.pop(mount_id) await tracked_mount.wait_for_all_transfers_to_complete() diff --git a/packages/simcore-sdk/tests/unit/test_node_ports_common_r_clone_mount.py b/packages/simcore-sdk/tests/unit/test_node_ports_common_r_clone_mount.py new file mode 100644 index 000000000000..fc5c355873e3 --- /dev/null +++ b/packages/simcore-sdk/tests/unit/test_node_ports_common_r_clone_mount.py @@ -0,0 +1,250 @@ +# pylint: disable=protected-access +# pylint: disable=redefined-outer-name +# pylint: disable=unused-argument + +import contextlib +import re +from collections.abc import AsyncIterator, Iterator +from pathlib import Path + +import aiodocker +import pytest +from _pytest._py.path import LocalPath +from aiodocker.types import JSONObject 
+from faker import Faker +from models_library.api_schemas_storage.storage_schemas import S3BucketName +from models_library.basic_types import PortInt +from models_library.projects_nodes_io import NodeID, StorageFileID +from moto.server import ThreadedMotoServer +from pydantic import ByteSize, NonNegativeInt, TypeAdapter +from pytest_mock import MockerFixture +from pytest_simcore.helpers.monkeypatch_envs import EnvVarsDict, setenvs_from_dict +from servicelib.logging_utils import _dampen_noisy_loggers +from settings_library.r_clone import DEFAULT_VFS_CACHE_PATH, RCloneSettings +from simcore_sdk.node_ports_common.r_clone_mount import ( + GetBindPathsProtocol, + MountActivity, + MountRemoteType, + RCloneMountManager, +) +from simcore_sdk.node_ports_common.r_clone_mount._container import ( + RemoteControlHttpClient, +) +from simcore_sdk.node_ports_common.r_clone_mount._docker_utils import ( + _get_config as original_get_config, +) + +_dampen_noisy_loggers(("botocore", "aiobotocore", "aioboto3", "moto.server")) + + +@pytest.fixture +def bucket_name() -> S3BucketName: + return TypeAdapter(S3BucketName).validate_python("osparc-data") + + +@pytest.fixture +def r_clone_version(package_dir: Path) -> str: + install_rclone_bash = ( + (package_dir / ".." / ".." / ".." 
/ "..").resolve() + / "scripts" + / "install_rclone.bash" + ) + assert install_rclone_bash.exists() + + match = re.search(r'R_CLONE_VERSION="([\d.]+)"', install_rclone_bash.read_text()) + assert match + return match.group(1) + + +@pytest.fixture +def mock_environment( + monkeypatch: pytest.MonkeyPatch, bucket_name: S3BucketName, r_clone_version: str +) -> EnvVarsDict: + return setenvs_from_dict( + monkeypatch, + { + "R_CLONE_PROVIDER": "AWS_MOTO", + "S3_ENDPOINT": "http://127.0.0.1:5000", + "S3_ACCESS_KEY": "test", + "S3_BUCKET_NAME": bucket_name, + "S3_SECRET_KEY": "test", + "S3_REGION": "us-east-1", + "R_CLONE_VERSION": r_clone_version, + "R_CLONE_MOUNT_CONTAINER_SHOW_DEBUG_LOGS": "1", + }, + ) + + +@pytest.fixture +def r_clone_settings(mock_environment: EnvVarsDict) -> RCloneSettings: + return RCloneSettings.create_from_envs() + + +@pytest.fixture +async def r_clone_mount_manager( + r_clone_settings: RCloneSettings, +) -> AsyncIterator[RCloneMountManager]: + + # TODO: maybe put this into a fixture + async def do_nothing() -> None: + pass + + manager = RCloneMountManager(r_clone_settings, handler_request_shutdown=do_nothing) + await manager.setup() + + yield manager + + await manager.teardown() + + +@pytest.fixture +def local_mount_path(tmpdir: LocalPath) -> Path: + local_mount_path = Path(tmpdir) / "local_mount_path" + local_mount_path.mkdir(parents=True, exist_ok=True) + return local_mount_path + + +@pytest.fixture +def vfs_cache_path(tmpdir: LocalPath) -> Path: + vfs_cache_path = Path(tmpdir) / "vfs_cache_path" + vfs_cache_path.mkdir(parents=True, exist_ok=True) + return vfs_cache_path + + +@pytest.fixture +def index() -> int: + return 0 + + +@pytest.fixture +def remote_path(faker: Faker) -> StorageFileID: + return TypeAdapter(StorageFileID).validate_python( + f"{faker.uuid4()}/{faker.uuid4()}/mounted-path" + ) + + +@pytest.fixture +def node_id(faker: Faker) -> NodeID: + return faker.uuid4(cast_to=None) + + +@pytest.fixture +def moto_server() -> 
Iterator[None]: + """Start moto S3 server on port 5000""" + server = ThreadedMotoServer(port="5000") + server.start() + yield None + server.stop() + + +@pytest.fixture +async def mocked_self_container(mocker: MockerFixture) -> AsyncIterator[None]: + # start the simplest lightweight container that sleeps forever + async with aiodocker.Docker() as client: + container = await client.containers.run( + config={"Image": "alpine:latest", "Cmd": ["sleep", "infinity"]} + ) + + mocker.patch( + "simcore_sdk.node_ports_common.r_clone_mount._docker_utils._get_self_container_id", + return_value=container.id, + ) + + yield None + + # remove started container + with contextlib.suppress(aiodocker.exceptions.DockerError): + await container.delete(force=True) + + +@pytest.fixture +async def mocked_r_clone_container_config(mocker: MockerFixture) -> None: + + async def _patched_get_config( + command: str, + r_clone_version: str, + rc_port: PortInt, + r_clone_network_name: str, + local_mount_path: Path, + memory_limit: ByteSize, + nano_cpus: NonNegativeInt, + handler_get_bind_paths: GetBindPathsProtocol, + ) -> JSONObject: + config = await original_get_config( + command, + r_clone_version, + rc_port, + r_clone_network_name, + local_mount_path, + memory_limit, + nano_cpus, + handler_get_bind_paths, + ) + # Add port forwarding to access from host + config["HostConfig"]["PortBindings"] = { + f"{rc_port}/tcp": [{"HostPort": str(rc_port)}] + } + config["HostConfig"]["NetworkMode"] = "host" + return config + + mocker.patch( + "simcore_sdk.node_ports_common.r_clone_mount._docker_utils._get_config", + side_effect=_patched_get_config, + ) + + # Patch the rc_host to use localhost instead of container name + + original_init = RemoteControlHttpClient.__init__ + + def _patched_init(self, rc_host: str, rc_port: PortInt, *args, **kwargs) -> None: + # Replace container hostname with localhost for host access + original_init(self, "localhost", rc_port, *args, **kwargs) + + mocker.patch.object( + 
RemoteControlHttpClient, + "__init__", + _patched_init, + ) + + +async def _handle_mount_activity(state_path: Path, activity: MountActivity) -> None: + print(f"⏳ {state_path=} {activity=}") + + +async def test_manager( + moto_server: None, + mocked_r_clone_container_config: None, + mocked_self_container: None, + r_clone_mount_manager: RCloneMountManager, + node_id: NodeID, + remote_path: StorageFileID, + local_mount_path: Path, + vfs_cache_path: Path, + index: int, +) -> None: + + async def _get_bind_paths_protocol(state_path: Path) -> list[Path]: + # no need to add bind mount vfs cache for testing + return [ + {"Type": "bind", "Source": f"{state_path}", "Target": f"{state_path}"}, + { + "Type": "bind", + "Source": f"{vfs_cache_path}", + "Target": f"{DEFAULT_VFS_CACHE_PATH}", + "BindOptions": {"Propagation": "rshared"}, + }, + ] + + await r_clone_mount_manager.ensure_mounted( + local_mount_path=local_mount_path, + remote_type=MountRemoteType.S3, + remote_path=remote_path, + node_id=node_id, + index=index, + handler_get_bind_paths=_get_bind_paths_protocol, + handler_mount_activity=_handle_mount_activity, + ) + + await r_clone_mount_manager.ensure_unmounted( + local_mount_path=local_mount_path, index=index + ) From 6d38fbd100485b5e1839ed578a0d3aa1de1c1bc0 Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 19 Dec 2025 11:09:33 +0100 Subject: [PATCH 78/79] base working test --- .../unit/test_node_ports_common_r_clone_mount.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/packages/simcore-sdk/tests/unit/test_node_ports_common_r_clone_mount.py b/packages/simcore-sdk/tests/unit/test_node_ports_common_r_clone_mount.py index fc5c355873e3..aefb72e01e7d 100644 --- a/packages/simcore-sdk/tests/unit/test_node_ports_common_r_clone_mount.py +++ b/packages/simcore-sdk/tests/unit/test_node_ports_common_r_clone_mount.py @@ -1,7 +1,6 @@ # pylint: disable=protected-access # pylint: disable=redefined-outer-name # pylint: disable=unused-argument 
- import contextlib import re from collections.abc import AsyncIterator, Iterator @@ -130,7 +129,6 @@ def node_id(faker: Faker) -> NodeID: @pytest.fixture def moto_server() -> Iterator[None]: - """Start moto S3 server on port 5000""" server = ThreadedMotoServer(port="5000") server.start() yield None @@ -159,7 +157,6 @@ async def mocked_self_container(mocker: MockerFixture) -> AsyncIterator[None]: @pytest.fixture async def mocked_r_clone_container_config(mocker: MockerFixture) -> None: - async def _patched_get_config( command: str, r_clone_version: str, @@ -193,18 +190,13 @@ async def _patched_get_config( ) # Patch the rc_host to use localhost instead of container name - original_init = RemoteControlHttpClient.__init__ def _patched_init(self, rc_host: str, rc_port: PortInt, *args, **kwargs) -> None: # Replace container hostname with localhost for host access original_init(self, "localhost", rc_port, *args, **kwargs) - mocker.patch.object( - RemoteControlHttpClient, - "__init__", - _patched_init, - ) + mocker.patch.object(RemoteControlHttpClient, "__init__", _patched_init) async def _handle_mount_activity(state_path: Path, activity: MountActivity) -> None: @@ -224,7 +216,6 @@ async def test_manager( ) -> None: async def _get_bind_paths_protocol(state_path: Path) -> list[Path]: - # no need to add bind mount vfs cache for testing return [ {"Type": "bind", "Source": f"{state_path}", "Target": f"{state_path}"}, { @@ -248,3 +239,8 @@ async def _get_bind_paths_protocol(state_path: Path) -> list[Path]: await r_clone_mount_manager.ensure_unmounted( local_mount_path=local_mount_path, index=index ) + + +# TODO: test to check that the container is up +# TODO: test to check that when container stopped event to kill app is called +# TODO: CHECK that content form folder is uploaded to S3 in the expected paths From daf5a8a449a00c7869b528bfd7693814434142ad Mon Sep 17 00:00:00 2001 From: Andrei Neagu Date: Fri, 19 Dec 2025 11:50:12 +0100 Subject: [PATCH 79/79] current tests --- 
.../test_node_ports_common_r_clone_mount.py | 157 +++++++++++++++++- 1 file changed, 151 insertions(+), 6 deletions(-) diff --git a/packages/simcore-sdk/tests/unit/test_node_ports_common_r_clone_mount.py b/packages/simcore-sdk/tests/unit/test_node_ports_common_r_clone_mount.py index aefb72e01e7d..15e4fecb59c4 100644 --- a/packages/simcore-sdk/tests/unit/test_node_ports_common_r_clone_mount.py +++ b/packages/simcore-sdk/tests/unit/test_node_ports_common_r_clone_mount.py @@ -2,14 +2,20 @@ # pylint: disable=redefined-outer-name # pylint: disable=unused-argument import contextlib +import os import re -from collections.abc import AsyncIterator, Iterator +from collections.abc import AsyncIterable, AsyncIterator, Iterator from pathlib import Path +from typing import cast +import aioboto3 import aiodocker +import aiofiles import pytest from _pytest._py.path import LocalPath +from aiobotocore.session import ClientCreatorContext from aiodocker.types import JSONObject +from botocore.client import Config from faker import Faker from models_library.api_schemas_storage.storage_schemas import S3BucketName from models_library.basic_types import PortInt @@ -18,6 +24,7 @@ from pydantic import ByteSize, NonNegativeInt, TypeAdapter from pytest_mock import MockerFixture from pytest_simcore.helpers.monkeypatch_envs import EnvVarsDict, setenvs_from_dict +from servicelib.file_utils import create_sha256_checksum from servicelib.logging_utils import _dampen_noisy_loggers from settings_library.r_clone import DEFAULT_VFS_CACHE_PATH, RCloneSettings from simcore_sdk.node_ports_common.r_clone_mount import ( @@ -32,6 +39,7 @@ from simcore_sdk.node_ports_common.r_clone_mount._docker_utils import ( _get_config as original_get_config, ) +from types_aiobotocore_s3 import S3Client _dampen_noisy_loggers(("botocore", "aiobotocore", "aioboto3", "moto.server")) @@ -79,6 +87,30 @@ def r_clone_settings(mock_environment: EnvVarsDict) -> RCloneSettings: return RCloneSettings.create_from_envs() 
+@pytest.fixture +async def s3_client( + r_clone_settings: RCloneSettings, bucket_name: S3BucketName +) -> AsyncIterable[S3Client]: + s3_settings = r_clone_settings.R_CLONE_S3 + session = aioboto3.Session() + session_client = session.client( + "s3", + endpoint_url=f"{s3_settings.S3_ENDPOINT}".replace("moto", "localhost"), + aws_access_key_id=s3_settings.S3_ACCESS_KEY, + aws_secret_access_key=s3_settings.S3_SECRET_KEY, + region_name=s3_settings.S3_REGION, + config=Config(signature_version="s3v4"), + ) + assert isinstance(session_client, ClientCreatorContext) # nosec + async with session_client as client: + client = cast(S3Client, client) + + # Create the bucket + await client.create_bucket(Bucket=bucket_name) + + yield client + + @pytest.fixture async def r_clone_mount_manager( r_clone_settings: RCloneSettings, @@ -203,21 +235,107 @@ async def _handle_mount_activity(state_path: Path, activity: MountActivity) -> N print(f"⏳ {state_path=} {activity=}") +async def _create_random_binary_file( + file_path: Path, + file_size: ByteSize, + chunk_size: int = TypeAdapter(ByteSize).validate_python("1mib"), +) -> None: + """Create a random binary file of specified size.""" + async with aiofiles.open(file_path, mode="wb") as file: + bytes_written = 0 + while bytes_written < file_size: + remaining_bytes = file_size - bytes_written + current_chunk_size = min(chunk_size, remaining_bytes) + await file.write(os.urandom(current_chunk_size)) + bytes_written += current_chunk_size + assert bytes_written == file_size + + +async def _create_file_of_size( + target_dir: Path, *, name: str, file_size: ByteSize +) -> Path: + """Create a single file with random content of specified size.""" + file_path = target_dir / name + if not file_path.parent.exists(): + file_path.parent.mkdir(parents=True, exist_ok=True) + + await _create_random_binary_file(file_path, file_size) + assert file_path.exists() + assert file_path.stat().st_size == file_size + return file_path + + +async def 
_create_files_in_dir( + target_dir: Path, file_count: int, file_size: ByteSize +) -> set[str]: + """Create multiple random files in a directory.""" + files = [] + for i in range(file_count): + file_path = await _create_file_of_size( + target_dir, name=f"file_{i}.bin", file_size=file_size + ) + files.append(file_path) + return {x.name for x in files} + + +async def _get_file_checksums_from_local( + local_path: Path, +) -> dict[Path, str]: + """Get SHA256 checksums of all files in a directory.""" + checksums = {} + for dirpath, _, filenames in os.walk(local_path): + for filename in filenames: + file_path = Path(dirpath) / filename + relative_path = file_path.relative_to(local_path) + + async with aiofiles.open(file_path, "rb") as file: + checksum = await create_sha256_checksum(file) + + checksums[relative_path] = checksum + return checksums + + +async def _get_file_checksums_from_s3( + s3_client: S3Client, bucket_name: S3BucketName, remote_path: StorageFileID +) -> dict[Path, str]: + response = await s3_client.list_objects_v2( + Bucket=bucket_name, Prefix=f"{remote_path}" + ) + + checksums = {} + for obj in response.get("Contents", []): + key = obj["Key"] + file_response = await s3_client.get_object(Bucket=bucket_name, Key=key) + checksum = await create_sha256_checksum(file_response["Body"]) + relative_path = Path(key).relative_to(Path(remote_path)) + checksums[relative_path] = checksum + + return checksums + + async def test_manager( moto_server: None, mocked_r_clone_container_config: None, mocked_self_container: None, r_clone_mount_manager: RCloneMountManager, + r_clone_settings: RCloneSettings, + bucket_name: S3BucketName, node_id: NodeID, remote_path: StorageFileID, local_mount_path: Path, vfs_cache_path: Path, index: int, + s3_client: S3Client, ) -> None: async def _get_bind_paths_protocol(state_path: Path) -> list[Path]: return [ - {"Type": "bind", "Source": f"{state_path}", "Target": f"{state_path}"}, + { + "Type": "bind", + "Source": f"{state_path}", + 
"Target": f"{state_path}", + "BindOptions": {"Propagation": "rshared"}, + }, { "Type": "bind", "Source": f"{vfs_cache_path}", @@ -236,11 +354,38 @@ async def _get_bind_paths_protocol(state_path: Path) -> list[Path]: handler_mount_activity=_handle_mount_activity, ) + # create random test files + file_count = 5 + file_size = TypeAdapter(ByteSize).validate_python("100kb") + created_files = await _create_files_in_dir(local_mount_path, file_count, file_size) + assert len(created_files) == file_count + + # get checksums of local files before unmounting + local_checksums = await _get_file_checksums_from_local(local_mount_path) + assert len(local_checksums) == file_count + + # wait for rclone to complete all transfers + for mount in r_clone_mount_manager._tracked_mounts.values(): # noqa: SLF001 + await mount.wait_for_all_transfers_to_complete() + + # verify data is in S3 with matching checksums and filenames + s3_checksums = await _get_file_checksums_from_s3( + s3_client, bucket_name, remote_path + ) + + # compare checksums and filenames + assert len(s3_checksums) == len(local_checksums), "File count mismatch" + assert set(s3_checksums.keys()) == set(local_checksums.keys()), "Filename mismatch" + + for file_path, local_checksum in local_checksums.items(): + s3_checksum = s3_checksums[file_path] + assert ( + local_checksum == s3_checksum + ), f"Checksum mismatch for {file_path}: local={local_checksum}, s3={s3_checksum}" + await r_clone_mount_manager.ensure_unmounted( local_mount_path=local_mount_path, index=index ) - -# TODO: test to check that the container is up -# TODO: test to check that when container stopped event to kill app is called -# TODO: CHECK that content form folder is uploaded to S3 in the expected paths + # bind to a different directory and ensure the same content is presnet there as well + # refactor a bit how the files are generated, some more randomnes in sizes, i want to be ranom in range of files and of sizes