From 6bcd724afb82472524a3e5f96a2b0f1351235425 Mon Sep 17 00:00:00 2001 From: Tasko Olevski Date: Wed, 19 Nov 2025 16:20:32 +0100 Subject: [PATCH 01/10] feat: add envidat data connectors support --- .../data_connectors/blueprints.py | 2 +- .../data_connectors/constants.py | 7 ++ .../data_connectors/core.py | 102 +++++++++++++++--- .../data_connectors/schema_org_dataset.py | 92 ++++++++++++++++ .../renku_data_services/storage/constants.py | 6 ++ .../storage/rclone_patches.py | 28 ++++- 6 files changed, 219 insertions(+), 18 deletions(-) create mode 100644 components/renku_data_services/data_connectors/constants.py create mode 100644 components/renku_data_services/data_connectors/schema_org_dataset.py create mode 100644 components/renku_data_services/storage/constants.py diff --git a/components/renku_data_services/data_connectors/blueprints.py b/components/renku_data_services/data_connectors/blueprints.py index bb6cddf35..62cfbc0b8 100644 --- a/components/renku_data_services/data_connectors/blueprints.py +++ b/components/renku_data_services/data_connectors/blueprints.py @@ -100,7 +100,7 @@ def post(self) -> BlueprintFactoryResponse: async def _post( _: Request, user: base_models.APIUser, body: apispec.DataConnectorPost, validator: RCloneValidator ) -> JSONResponse: - data_connector = validate_unsaved_data_connector(body, validator=validator) + data_connector = await validate_unsaved_data_connector(body, validator=validator) result = await self.data_connector_repo.insert_namespaced_data_connector( user=user, data_connector=data_connector ) diff --git a/components/renku_data_services/data_connectors/constants.py b/components/renku_data_services/data_connectors/constants.py new file mode 100644 index 000000000..d83ea9bc5 --- /dev/null +++ b/components/renku_data_services/data_connectors/constants.py @@ -0,0 +1,7 @@ +"""Constants for data connectors.""" + +from typing import Final + +from renku_data_services.storage.constants import ENVIDAT_V1_PROVIDER + +ALLOWED_GLOBAL_DATA_CONNECTOR_PROVIDERS: Final[list[str]] = ["doi", ENVIDAT_V1_PROVIDER] diff --git a/components/renku_data_services/data_connectors/core.py b/components/renku_data_services/data_connectors/core.py index 9117fc623..edced7f68 100644 --- a/components/renku_data_services/data_connectors/core.py +++ b/components/renku_data_services/data_connectors/core.py @@ -7,6 +7,7 @@ from html.parser import HTMLParser from typing import Any +import httpx from pydantic import ValidationError as PydanticValidationError from renku_data_services import base_models, errors @@ -15,9 +16,11 @@ NamespacePath, ProjectPath, ) -from renku_data_services.data_connectors import apispec, models +from renku_data_services.data_connectors import apispec, models, schema_org_dataset +from renku_data_services.data_connectors.constants import ALLOWED_GLOBAL_DATA_CONNECTOR_PROVIDERS from renku_data_services.data_connectors.doi.metadata import get_dataset_metadata from renku_data_services.storage import models as storage_models +from renku_data_services.storage.constants import ENVIDAT_V1_PROVIDER from renku_data_services.storage.rclone import RCloneValidator from renku_data_services.utils.core import get_openbis_pat @@ -43,7 +46,7 @@ def dump_storage_with_sensitive_fields( return body -def validate_unsaved_storage( +async def validate_unsaved_storage( storage: apispec.CloudStorageCorePost | apispec.CloudStorageUrlV2, validator: RCloneValidator ) -> models.CloudStorageCore: """Validate the storage configuration of an unsaved data connector.""" @@ -61,6 +64,10 @@ def 
validate_unsaved_storage( ) configuration = cloud_storage.configuration.config source_path = cloud_storage.source_path + elif storage.storage_type == ENVIDAT_V1_PROVIDER: + converted_storage = await convert_envidat_v1_data_connector_to_s3(storage) + configuration = converted_storage.configuration + source_path = converted_storage.source_path else: configuration = storage.configuration source_path = storage.source_path @@ -76,13 +83,13 @@ def validate_unsaved_storage( ) -def validate_unsaved_data_connector( +async def validate_unsaved_data_connector( body: apispec.DataConnectorPost, validator: RCloneValidator ) -> models.UnsavedDataConnector: """Validate an unsaved data connector.""" keywords = [kw.root for kw in body.keywords] if body.keywords is not None else [] - storage = validate_unsaved_storage(body.storage, validator=validator) + storage = await validate_unsaved_storage(body.storage, validator=validator) if body.namespace is None: raise NotImplementedError("Missing namespace not supported") @@ -115,20 +122,37 @@ async def prevalidate_unsaved_global_data_connector( ) -> models.UnsavedGlobalDataConnector: """Pre-validate an unsaved data connector.""" - storage = validate_unsaved_storage(body.storage, validator=validator) + storage = await validate_unsaved_storage(body.storage, validator=validator) # TODO: allow admins to create global data connectors, e.g. s3://giab - if storage.storage_type != "doi": - raise errors.ValidationError(message="Only doi storage type is allowed for global data connectors") + if storage.storage_type not in ALLOWED_GLOBAL_DATA_CONNECTOR_PROVIDERS: + raise errors.ValidationError( + message=f"Only {ALLOWED_GLOBAL_DATA_CONNECTOR_PROVIDERS} storage type is allowed for global data connectors" + ) if not storage.readonly: raise errors.ValidationError(message="Global data connectors must be read-only") - rclone_metadata = await validator.get_doi_metadata(configuration=storage.configuration) - - doi_uri = f"doi:{rclone_metadata.doi}" - slug = base_models.Slug.from_name(doi_uri).value - - # Override provider in storage config - storage.configuration["provider"] = rclone_metadata.provider + match storage.storage_type: + case "doi": + rclone_metadata = await validator.get_doi_metadata(configuration=storage.configuration) + + doi_uri = f"doi:{rclone_metadata.doi}" + slug = base_models.Slug.from_name(doi_uri).value + + # Override provider in storage config + storage.configuration["provider"] = rclone_metadata.provider + case x if x == ENVIDAT_V1_PROVIDER: + if not isinstance(body.storage, apispec.CloudStorageCorePost): + raise errors.ValidationError() + doi = body.storage.configuration.get("doi") + if not doi: + raise errors.ValidationError() + doi_uri = f"doi:{doi}" + slug = base_models.Slug.from_name(doi_uri).value + case x: + raise errors.ValidationError( + message=f"Only {ALLOWED_GLOBAL_DATA_CONNECTOR_PROVIDERS} storage type is allowed " + "for global data connectors" + ) return models.UnsavedGlobalDataConnector( name=doi_uri, @@ -157,8 +181,11 @@ async def validate_unsaved_global_data_connector( ) # Fetch DOI metadata - rclone_metadata = await validator.get_doi_metadata(configuration=data_connector.storage.configuration) - metadata = await get_dataset_metadata(rclone_metadata=rclone_metadata) + if data_connector.storage.storage_type == "doi": + rclone_metadata = await validator.get_doi_metadata(configuration=data_connector.storage.configuration) + metadata = await get_dataset_metadata(rclone_metadata=rclone_metadata) + else: + metadata = None name = 
data_connector.name description = "" @@ -378,3 +405,46 @@ def transform_secrets_for_front_end( ) break return secrets + + +async def convert_envidat_v1_data_connector_to_s3( + payload: apispec.CloudStorageCorePost, +) -> apispec.CloudStorageCorePost: + """Converts a doi-like configuration for Envidat to S3. + + If the paylaod that is passed in is not of the expected type nothing is changed + and the same payload that was passed in is returned. + """ + config = payload.configuration + if config.get("type") != ENVIDAT_V1_PROVIDER: + return payload + + doi = config.get("doi") + if not isinstance(doi, str): + raise errors.ValidationError() + if len(doi) == 0: + raise errors.ValidationError() + doi = doi.removeprefix("https://") + doi = doi.removeprefix("http://") + + new_config = payload.model_copy(deep=True) + new_config.configuration = {} + + envidat_url = "https://envidat.ch/converters-api/internal-dataset/convert/jsonld" + query_params = {"query": doi} + headers = {"accept": "application/json"} + + clnt = httpx.AsyncClient(follow_redirects=True) + async with clnt: + res = await clnt.get(envidat_url, params=query_params, headers=headers) + if res.status_code != 200: + raise errors.ProgrammingError() + dataset = schema_org_dataset.Dataset.model_validate_strings(res.text) + s3_config = schema_org_dataset.get_rclone_config( + dataset, + schema_org_dataset.DatasetProvider.envidat, + ) + new_config.configuration = dict(s3_config.rclone_config) + new_config.source_path = s3_config.path + new_config.storage_type = "s3" + return new_config diff --git a/components/renku_data_services/data_connectors/schema_org_dataset.py b/components/renku_data_services/data_connectors/schema_org_dataset.py new file mode 100644 index 000000000..a77498843 --- /dev/null +++ b/components/renku_data_services/data_connectors/schema_org_dataset.py @@ -0,0 +1,92 @@ +"""This is used by envidat and scicat to provide information about their datasets.""" + +from dataclasses import dataclass +from enum import StrEnum +from urllib.parse import parse_qs, urlparse + +from pydantic import BaseModel, ConfigDict, Field + +from renku_data_services.errors import errors + + +class Distribution(BaseModel): + """The distribution field of a schema.org dataset.""" + + model_config = ConfigDict(extra="ignore") + type: str + contentUrl: str + name: str + + +class Dataset(BaseModel): + """A very limited and partial spec of a schema.org Dataset used by Scicat and Envidat.""" + + model_config = ConfigDict(extra="ignore") + distribution: list[Distribution] = Field(default_factory=list) + + +class DatasetProvider(StrEnum): + """The provider for the dataset.""" + + envidat = "envidat" + + +@dataclass +class S3Config: + """Configuration for a location on S3 storage.""" + + rclone_config: dict[str, str] + bucket: str + prefix: str + + @property + def path(self) -> str: + """Return the path including the bucket name and the prefix.""" + return f"{self.bucket}/{self.prefix}" + + +def get_rclone_config(dataset: Dataset, provider: DatasetProvider) -> S3Config: + """Parse the dataset into an rclone configuration.""" + match provider: + case DatasetProvider.envidat: + return __get_rclone_s3_config_envidat(dataset) + # TODO: Add scicat here + case x: + raise errors.ValidationError(message=f"Got an unknown dataset provider {x}") + + +def __get_rclone_s3_config_envidat(dataset: Dataset) -> S3Config: + """Get the S3 rclone configuration and source path from a dataset returned by envidat.""" + # NOTE: The folks from Envidat assure us that the first entity in 
the list is the one we want
+    url = dataset.distribution[0].contentUrl
+    # NOTE: The folks from Envidat assure us that the URL has the following format
+    # http://<bucket>.<host>/?prefix=<prefix>
+    url_parsed = urlparse(url)
+    if not url_parsed.scheme:
+        raise errors.ValidationError(message="A scheme like http or https is needed for the S3 url.")
+    if not url_parsed.netloc:
+        raise errors.ValidationError(message="A hostname is needed for the S3 url.")
+    if not url_parsed.query:
+        raise errors.ValidationError(message="A query parameter with the path is needed for the S3 url.")
+    query_params = parse_qs(url_parsed.query)
+    prefix_list = query_params.get("prefix")
+    if prefix_list is None or len(prefix_list) == 0:
+        raise errors.ValidationError(message="The query parameter in the S3 url should contain the 'prefix' key.")
+    prefix = prefix_list[0]
+    host_split = url_parsed.netloc.split(".")
+    if len(host_split) < 2:
+        raise errors.ValidationError(
+            message="The envidat s3 url is expected to have a host name with at least two parts."
+        )
+    s3_host = ".".join(host_split[1:])
+    bucket = host_split[0]
+    prefix = "/" + prefix.strip("/")
+    return S3Config(
+        {
+            "type": "s3",
+            "provider": "Other",
+            "endpoint": f"{url_parsed.scheme}://{s3_host}",
+        },
+        bucket.strip("/"),
+        prefix,
+    )
diff --git a/components/renku_data_services/storage/constants.py b/components/renku_data_services/storage/constants.py
new file mode 100644
index 000000000..9b9f2867a
--- /dev/null
+++ b/components/renku_data_services/storage/constants.py
@@ -0,0 +1,6 @@
+"""Constants for storage."""
+
+from typing import Final
+
+ENVIDAT_V1_PROVIDER: Final[str] = "envidat_v1"
+SCICAT_V1_PROVIDER: Final[str] = "scicat_v1"
diff --git a/components/renku_data_services/storage/rclone_patches.py b/components/renku_data_services/storage/rclone_patches.py
index f081f78d3..7016c75e1 100644
--- a/components/renku_data_services/storage/rclone_patches.py
+++ b/components/renku_data_services/storage/rclone_patches.py
@@ -1,9 +1,11 @@
 """Patches to apply to phe rclone storage schema."""
 
+from collections.abc import Callable
 from copy import deepcopy
-from typing import Any, Final
+from typing import Any, Final, cast
 
 from renku_data_services import errors
+from renku_data_services.storage.constants import ENVIDAT_V1_PROVIDER, SCICAT_V1_PROVIDER
 
 BANNED_STORAGE: Final[set[str]] = {
     "alias",
@@ -321,6 +323,28 @@ def __patch_schema_add_openbis_type(spec: list[dict[str, Any]]) -> None:
     )
 
 
+def __add_custom_doi_s3_provider(name: str, description: str, prefix: str) -> Callable[[list[dict[str, Any]]], None]:
+    """This is used to add envidat and scicat as providers.
+
+    However this is not a real provider in Rclone. The data service has to intercept the request
+    and convert this provider to the proper S3 configuration where the data can be found.
+ """ + + def __patch(spec: list[dict[str, Any]]) -> None: + doi_original = find_storage(spec, "doi") + doi_new = deepcopy(doi_original) + doi_new["Description"] = description + doi_new["Name"] = name + doi_new["Prefix"] = prefix + doi_new_options = cast(list[dict[str, Any]], doi_new.get("Options", [])) + provider_ind = next((i for i, opt in enumerate(doi_new_options) if opt.get("Name") == "provider"), None) + if provider_ind is not None: + doi_new_options.pop(provider_ind) + spec.append(doi_new) + + return __patch + + def apply_patches(spec: list[dict[str, Any]]) -> None: """Apply patches to RClone schema.""" patches = [ @@ -331,6 +355,8 @@ def apply_patches(spec: list[dict[str, Any]]) -> None: __patch_schema_remove_oauth_propeties, __patch_polybox_storage, __patch_switchdrive_storage, + __add_custom_doi_s3_provider("Envidat", "Envidat data provider", ENVIDAT_V1_PROVIDER), + __add_custom_doi_s3_provider("SciCat", "SciCat data provider", SCICAT_V1_PROVIDER), __patch_schema_remove_banned_sftp_options, __patch_schema_add_openbis_type, ] From 0211c2ead5ad456b7676b9266b70849bf6d61d36 Mon Sep 17 00:00:00 2001 From: Tasko Olevski Date: Thu, 20 Nov 2025 11:21:16 +0100 Subject: [PATCH 02/10] chore: cleanup code, add tests --- .../data_connectors/blueprints.py | 2 +- .../data_connectors/core.py | 215 +++++++++++------- .../renku_data_services/data_connectors/db.py | 5 +- .../data_connectors/doi/metadata.py | 54 +++-- .../data_connectors/doi/models.py | 81 ++++++- .../schema_org.py} | 31 +-- .../data_connectors/models.py | 11 + .../renku_data_services/storage/rclone.py | 45 ++-- .../storage/rclone_patches.py | 4 +- .../data_api/test_data_connectors.py | 27 ++- .../data_connectors/test_doi.py | 42 ++++ .../data_connectors/test_metadata.py | 186 +++++++++++++++ 12 files changed, 544 insertions(+), 159 deletions(-) rename components/renku_data_services/data_connectors/{schema_org_dataset.py => doi/schema_org.py} (76%) create mode 100644 test/components/renku_data_services/data_connectors/test_doi.py create mode 100644 test/components/renku_data_services/data_connectors/test_metadata.py diff --git a/components/renku_data_services/data_connectors/blueprints.py b/components/renku_data_services/data_connectors/blueprints.py index 62cfbc0b8..46c7cddda 100644 --- a/components/renku_data_services/data_connectors/blueprints.py +++ b/components/renku_data_services/data_connectors/blueprints.py @@ -124,7 +124,7 @@ async def _post_global( ) -> JSONResponse: data_connector = await prevalidate_unsaved_global_data_connector(body, validator=validator) result, inserted = await self.data_connector_repo.insert_global_data_connector( - user=user, data_connector=data_connector, validator=validator + user=user, prevalidated_dc=data_connector, validator=validator ) return validated_json( apispec.DataConnector, diff --git a/components/renku_data_services/data_connectors/core.py b/components/renku_data_services/data_connectors/core.py index edced7f68..f7724ea58 100644 --- a/components/renku_data_services/data_connectors/core.py +++ b/components/renku_data_services/data_connectors/core.py @@ -16,12 +16,14 @@ NamespacePath, ProjectPath, ) -from renku_data_services.data_connectors import apispec, models, schema_org_dataset +from renku_data_services.data_connectors import apispec, models from renku_data_services.data_connectors.constants import ALLOWED_GLOBAL_DATA_CONNECTOR_PROVIDERS -from renku_data_services.data_connectors.doi.metadata import get_dataset_metadata +from renku_data_services.data_connectors.doi import schema_org 
+from renku_data_services.data_connectors.doi.metadata import create_envidat_metadata_url, get_dataset_metadata
+from renku_data_services.data_connectors.doi.models import DOI, SchemaOrgDataset
 from renku_data_services.storage import models as storage_models
 from renku_data_services.storage.constants import ENVIDAT_V1_PROVIDER
-from renku_data_services.storage.rclone import RCloneValidator
+from renku_data_services.storage.rclone import RCloneDOIMetadata, RCloneValidator
 from renku_data_services.utils.core import get_openbis_pat
@@ -46,41 +48,83 @@ def dump_storage_with_sensitive_fields(
     return body
 
 
-async def validate_unsaved_storage(
-    storage: apispec.CloudStorageCorePost | apispec.CloudStorageUrlV2, validator: RCloneValidator
+def validate_unsaved_storage_url(
+    storage: apispec.CloudStorageUrlV2, validator: RCloneValidator
 ) -> models.CloudStorageCore:
+    """Validate the unsaved storage when its configuration is specified as a URL."""
+    cloud_storage = storage_models.UnsavedCloudStorage.from_url(
+        project_id="FAKEPROJECTID",
+        name="fake-storage-name",
+        storage_url=storage.storage_url,
+        target_path=storage.target_path,
+        readonly=storage.readonly,
+    )
+    configuration = cloud_storage.configuration.config
+    source_path = cloud_storage.source_path
+    validator.validate(configuration)
+    return models.CloudStorageCore(
+        storage_type=configuration["type"],
+        configuration=configuration,
+        source_path=source_path,
+        target_path=storage.target_path,
+        readonly=storage.readonly,
+    )
+
+
+def validate_unsaved_storage_generic(
+    storage: apispec.CloudStorageCorePost, validator: RCloneValidator
+) -> models.CloudStorageCore:
+    """Validate the unsaved storage when its configuration is specified inline."""
+    configuration = storage.configuration
+    validator.validate(configuration)
+    storage_type = configuration.get("type")
+    if not isinstance(storage_type, str):
+        raise errors.ValidationError()
+    return models.CloudStorageCore(
+        storage_type=storage_type,
+        configuration=configuration,
+        source_path=storage.source_path,
+        target_path=storage.target_path,
+        readonly=storage.readonly,
+    )
+
+
+async def validate_unsaved_storage_doi(
+    storage: apispec.CloudStorageCorePost, validator: RCloneValidator
+) -> tuple[models.CloudStorageCore, DOI]:
     """Validate the storage configuration of an unsaved data connector."""
     configuration: dict[str, Any]
     source_path: str
-    if isinstance(storage, apispec.CloudStorageUrlV2):
-        cloud_storage = storage_models.UnsavedCloudStorage.from_url(
-            project_id="FAKEPROJECTID",
-            name="fake-storage-name",
-            storage_url=storage.storage_url,
-            target_path=storage.target_path,
-            readonly=storage.readonly,
-        )
-        configuration = cloud_storage.configuration.config
-        source_path = cloud_storage.source_path
-    elif storage.storage_type == ENVIDAT_V1_PROVIDER:
-        converted_storage = await convert_envidat_v1_data_connector_to_s3(storage)
-        configuration = converted_storage.configuration
-        source_path = converted_storage.source_path
-    else:
-        configuration = storage.configuration
-        source_path = storage.source_path
+    doi_str = storage.configuration.get("doi")
+    if not isinstance(doi_str, str):
+        raise errors.ValidationError(message="Cannot find the doi in the storage configuration")
+
+    doi = DOI(doi_str)
+    doi_host = await doi.resolve_host()
+
+    match doi_host:
+        case "envidat.ch" | "www.envidat.ch":
+            converted_storage = await convert_envidat_v1_data_connector_to_s3(storage)
+            configuration = converted_storage.configuration
+            source_path = converted_storage.source_path or "/"
storage_type = ENVIDAT_V1_PROVIDER + case _: + # Most likely supported by rclone doi provider, you have to call validator.get_doi_metadata to confirm + configuration = storage.configuration + source_path = storage.source_path or "/" + storage_type = storage.storage_type or "doi" validator.validate(configuration) return models.CloudStorageCore( - storage_type=configuration["type"], + storage_type=storage_type, configuration=configuration, source_path=source_path, target_path=storage.target_path, readonly=storage.readonly, - ) + ), doi async def validate_unsaved_data_connector( @@ -89,7 +133,15 @@ async def validate_unsaved_data_connector( """Validate an unsaved data connector.""" keywords = [kw.root for kw in body.keywords] if body.keywords is not None else [] - storage = await validate_unsaved_storage(body.storage, validator=validator) + match body.storage: + case apispec.CloudStorageCorePost() if body.storage.storage_type != "doi": + storage = validate_unsaved_storage_generic(body.storage, validator=validator) + case apispec.CloudStorageCorePost() if body.storage.storage_type == "doi": + storage, _ = await validate_unsaved_storage_doi(body.storage, validator=validator) + case apispec.CloudStorageUrlV2(): + storage = validate_unsaved_storage_url(body.storage, validator=validator) + case _: + raise errors.ValidationError(message="The data connector provided has an unknown payload format.") if body.namespace is None: raise NotImplementedError("Missing namespace not supported") @@ -119,61 +171,55 @@ async def validate_unsaved_data_connector( async def prevalidate_unsaved_global_data_connector( body: apispec.GlobalDataConnectorPost, validator: RCloneValidator -) -> models.UnsavedGlobalDataConnector: +) -> models.PrevalidatedGlobalDataConnector: """Pre-validate an unsaved data connector.""" - - storage = await validate_unsaved_storage(body.storage, validator=validator) # TODO: allow admins to create global data connectors, e.g. 
s3://giab + if isinstance(body.storage, apispec.CloudStorageUrlV2): + raise errors.ValidationError(message="Global data connectors cannot be configured via a URL.") + storage, doi = await validate_unsaved_storage_doi(body.storage, validator=validator) if storage.storage_type not in ALLOWED_GLOBAL_DATA_CONNECTOR_PROVIDERS: - raise errors.ValidationError( - message=f"Only {ALLOWED_GLOBAL_DATA_CONNECTOR_PROVIDERS} storage type is allowed for global data connectors" - ) + raise errors.ValidationError(message="Only doi storage type is allowed for global data connectors") if not storage.readonly: raise errors.ValidationError(message="Global data connectors must be read-only") - match storage.storage_type: - case "doi": - rclone_metadata = await validator.get_doi_metadata(configuration=storage.configuration) - - doi_uri = f"doi:{rclone_metadata.doi}" - slug = base_models.Slug.from_name(doi_uri).value - - # Override provider in storage config - storage.configuration["provider"] = rclone_metadata.provider - case x if x == ENVIDAT_V1_PROVIDER: - if not isinstance(body.storage, apispec.CloudStorageCorePost): - raise errors.ValidationError() - doi = body.storage.configuration.get("doi") - if not doi: - raise errors.ValidationError() - doi_uri = f"doi:{doi}" - slug = base_models.Slug.from_name(doi_uri).value - case x: - raise errors.ValidationError( - message=f"Only {ALLOWED_GLOBAL_DATA_CONNECTOR_PROVIDERS} storage type is allowed " - "for global data connectors" - ) - - return models.UnsavedGlobalDataConnector( - name=doi_uri, - slug=slug, - visibility=Visibility.PUBLIC, - created_by="", - storage=storage, - description=None, - keywords=[], + rclone_metadata: RCloneDOIMetadata | None = None + doi_uri = f"doi:{doi}" + if storage.storage_type == "doi": + # This means that the storage is most likely supported by Rclone, by calling the get_doi_metadata we confirm + rclone_metadata = await validator.get_doi_metadata(configuration=storage.configuration) + if not rclone_metadata: + raise errors.ValidationError(message="The provided DOI is not supported.") + # Override provider in storage config + storage.configuration["provider"] = rclone_metadata.provider + + slug = base_models.Slug.from_name(doi_uri).value + return models.PrevalidatedGlobalDataConnector( + data_connector=models.UnsavedGlobalDataConnector( + name=doi_uri, + slug=slug, + visibility=Visibility.PUBLIC, + created_by="", + storage=storage, + description=None, + keywords=[], + ), + doi=doi, + rclone_metadata=rclone_metadata, ) async def validate_unsaved_global_data_connector( - data_connector: models.UnsavedGlobalDataConnector, + prevalidated_dc: models.PrevalidatedGlobalDataConnector, validator: RCloneValidator, ) -> models.UnsavedGlobalDataConnector: - """Validate an unsaved data connector.""" + """Validate the data connector.""" + data_connector = prevalidated_dc.data_connector + doi = prevalidated_dc.doi + rclone_metadata = prevalidated_dc.rclone_metadata # Check that we can list the files in the DOI connection_result = await validator.test_connection( - configuration=data_connector.storage.configuration, source_path="/" + configuration=data_connector.storage.configuration, source_path=data_connector.storage.source_path or "/" ) if not connection_result.success: raise errors.ValidationError( @@ -181,9 +227,11 @@ async def validate_unsaved_global_data_connector( ) # Fetch DOI metadata - if data_connector.storage.storage_type == "doi": - rclone_metadata = await validator.get_doi_metadata(configuration=data_connector.storage.configuration) - 
metadata = await get_dataset_metadata(rclone_metadata=rclone_metadata) + if rclone_metadata: + metadata = await get_dataset_metadata(rclone_metadata.provider, rclone_metadata.metadata_url) + elif data_connector.storage.storage_type == ENVIDAT_V1_PROVIDER: + metadata_url = create_envidat_metadata_url(doi) + metadata = await get_dataset_metadata(ENVIDAT_V1_PROVIDER, metadata_url) else: metadata = None @@ -209,15 +257,18 @@ async def validate_unsaved_global_data_connector( # Assign user-friendly target_path if possible target_path = data_connector.slug - with contextlib.suppress(errors.ValidationError): - name_slug = base_models.Slug.from_name(name).value - target_path = base_models.Slug.from_name(f"{name_slug[:30]}-{target_path}").value + if metadata: + # If there is no metdata, the slug is derived from the name, and the name is the doi + # So without metadata if we extend the target_path it just repeats the slug twice + with contextlib.suppress(errors.ValidationError): + name_slug = base_models.Slug.from_name(name).value + target_path = base_models.Slug.from_name(f"{name_slug[:30]}-{target_path}").value # Override source_path and target_path storage = models.CloudStorageCore( storage_type=data_connector.storage.storage_type, configuration=data_connector.storage.configuration, - source_path="/", + source_path=data_connector.storage.source_path, target_path=target_path, readonly=data_connector.storage.readonly, ) @@ -410,22 +461,14 @@ def transform_secrets_for_front_end( async def convert_envidat_v1_data_connector_to_s3( payload: apispec.CloudStorageCorePost, ) -> apispec.CloudStorageCorePost: - """Converts a doi-like configuration for Envidat to S3. - - If the paylaod that is passed in is not of the expected type nothing is changed - and the same payload that was passed in is returned. 
- """ + """Converts a doi-like configuration for Envidat to S3.""" config = payload.configuration - if config.get("type") != ENVIDAT_V1_PROVIDER: - return payload - doi = config.get("doi") if not isinstance(doi, str): raise errors.ValidationError() if len(doi) == 0: raise errors.ValidationError() - doi = doi.removeprefix("https://") - doi = doi.removeprefix("http://") + doi = DOI(doi) new_config = payload.model_copy(deep=True) new_config.configuration = {} @@ -434,15 +477,15 @@ async def convert_envidat_v1_data_connector_to_s3( query_params = {"query": doi} headers = {"accept": "application/json"} - clnt = httpx.AsyncClient(follow_redirects=True) + clnt = httpx.AsyncClient(follow_redirects=True, timeout=5) async with clnt: res = await clnt.get(envidat_url, params=query_params, headers=headers) if res.status_code != 200: raise errors.ProgrammingError() - dataset = schema_org_dataset.Dataset.model_validate_strings(res.text) - s3_config = schema_org_dataset.get_rclone_config( + dataset = SchemaOrgDataset.model_validate_json(res.text) + s3_config = schema_org.get_rclone_config( dataset, - schema_org_dataset.DatasetProvider.envidat, + schema_org.DatasetProvider.envidat, ) new_config.configuration = dict(s3_config.rclone_config) new_config.source_path = s3_config.path diff --git a/components/renku_data_services/data_connectors/db.py b/components/renku_data_services/data_connectors/db.py index af8e16b6c..6f4445d64 100644 --- a/components/renku_data_services/data_connectors/db.py +++ b/components/renku_data_services/data_connectors/db.py @@ -370,7 +370,7 @@ async def insert_namespaced_data_connector( async def insert_global_data_connector( self, user: base_models.APIUser, - data_connector: models.UnsavedGlobalDataConnector, + prevalidated_dc: models.PrevalidatedGlobalDataConnector, validator: RCloneValidator | None, *, session: AsyncSession | None = None, @@ -381,6 +381,7 @@ async def insert_global_data_connector( if user.id is None: raise errors.UnauthorizedError(message="You do not have the required permissions for this operation.") + data_connector = prevalidated_dc.data_connector slug = data_connector.slug or base_models.Slug.from_name(data_connector.name).value existing_global_dc_stmt = select(schemas.DataConnectorORM).where(schemas.DataConnectorORM.global_slug == slug) @@ -401,7 +402,7 @@ async def insert_global_data_connector( if validator is None: raise RuntimeError("Could not validate global data connector") data_connector = await validate_unsaved_global_data_connector( - data_connector=data_connector, validator=validator + prevalidated_dc=prevalidated_dc, validator=validator ) dc = await self._insert_data_connector(user=user, data_connector=data_connector, session=session) diff --git a/components/renku_data_services/data_connectors/doi/metadata.py b/components/renku_data_services/data_connectors/doi/metadata.py index 02ad9c857..7811f5ffe 100644 --- a/components/renku_data_services/data_connectors/doi/metadata.py +++ b/components/renku_data_services/data_connectors/doi/metadata.py @@ -1,27 +1,27 @@ """Metadata handling for DOIs.""" +from urllib.parse import urlencode + import httpx from pydantic import ValidationError as PydanticValidationError from renku_data_services.data_connectors.doi import models -from renku_data_services.storage.rclone import RCloneDOIMetadata +from renku_data_services.storage.constants import ENVIDAT_V1_PROVIDER -async def get_dataset_metadata(rclone_metadata: RCloneDOIMetadata) -> models.DOIMetadata | None: +async def get_dataset_metadata(provider: str, 
metadata_url: str) -> models.DOIMetadata | None: """Retrieve DOI metadata.""" - if rclone_metadata.provider == "invenio" or rclone_metadata.provider == "zenodo": - return await _get_dataset_metadata_invenio(rclone_metadata=rclone_metadata) - if rclone_metadata.provider == "dataverse": - return await _get_dataset_metadata_dataverse(rclone_metadata=rclone_metadata) + if provider == "invenio" or provider == "zenodo": + return await _get_dataset_metadata_invenio(metadata_url) + if provider == "dataverse": + return await _get_dataset_metadata_dataverse(metadata_url) + if provider == ENVIDAT_V1_PROVIDER: + return await _get_envidat_metadata(metadata_url) return None -async def _get_dataset_metadata_invenio(rclone_metadata: RCloneDOIMetadata) -> models.DOIMetadata | None: +async def _get_dataset_metadata_invenio(metadata_url: str) -> models.DOIMetadata | None: """Retrieve DOI metadata from the InvenioRDM API.""" - metadata_url = rclone_metadata.metadata_url - if not metadata_url: - return None - async with httpx.AsyncClient(timeout=5) as client: try: res = await client.get(url=metadata_url, follow_redirects=True, headers=[("accept", "application/json")]) @@ -43,11 +43,8 @@ async def _get_dataset_metadata_invenio(rclone_metadata: RCloneDOIMetadata) -> m return models.DOIMetadata(name=name, description=description, keywords=keywords) -async def _get_dataset_metadata_dataverse(rclone_metadata: RCloneDOIMetadata) -> models.DOIMetadata | None: +async def _get_dataset_metadata_dataverse(metadata_url: str) -> models.DOIMetadata | None: """Retrieve DOI metadata from the Dataverse API.""" - metadata_url = rclone_metadata.metadata_url - if not metadata_url: - return None async with httpx.AsyncClient(timeout=5) as client: try: @@ -118,3 +115,30 @@ async def _get_dataset_metadata_dataverse(rclone_metadata: RCloneDOIMetadata) -> except PydanticValidationError: pass return models.DOIMetadata(name=name, description=description, keywords=keywords) + + +def create_envidat_metadata_url(doi: models.DOI) -> str: + """Create the metadata url for envidat from a DOI.""" + url = "https://envidat.ch/converters-api/internal-dataset/convert/jsonld" + params = urlencode({"query": doi}) + return f"{url}?{params}" + + +async def _get_envidat_metadata(metadata_url: str) -> models.DOIMetadata | None: + """Get metadata about the envidat dataset.""" + clnt = httpx.AsyncClient(follow_redirects=True, timeout=5) + headers = {"accept": "application/json"} + async with clnt: + try: + res = await clnt.get(metadata_url, headers=headers) + except httpx.HTTPError: + return None + if res.status_code != 200: + return None + try: + parsed_metadata = models.SchemaOrgDataset.model_validate_json(res.text) + except PydanticValidationError: + return None + return models.DOIMetadata( + name=parsed_metadata.name, description=parsed_metadata.description or "", keywords=parsed_metadata.keywords + ) diff --git a/components/renku_data_services/data_connectors/doi/models.py b/components/renku_data_services/data_connectors/doi/models.py index 48da936aa..ed1422f9e 100644 --- a/components/renku_data_services/data_connectors/doi/models.py +++ b/components/renku_data_services/data_connectors/doi/models.py @@ -1,9 +1,61 @@ """Models for DOIs.""" +import re from dataclasses import dataclass -from typing import Any - -from pydantic import BaseModel, Field +from typing import Any, Self +from urllib.parse import urlparse + +import httpx +from pydantic import BaseModel, ConfigDict, Field + +from renku_data_services.errors import errors + + +class DOI(str): + """A 
doi for a dataset or a similar resource.""" + + __regex = re.compile(r"^10\.\d{4,9}/\S+$", re.IGNORECASE) + + def __new__(cls, doi: str) -> Self: + """Create a new doi. + + A few cases possible: + doi:10.16904/12 + 10.16904/12 + https://www.doi.org/10.16904/12 + http://www.doi.org/10.16904/12 + http://doi.org/10.16904/12 + """ + doi_parsed = urlparse(doi) + doi_clean = doi + if doi_parsed.netloc in ["www.doi.org", "doi.org"]: + if doi_parsed.scheme not in ["https", "http"]: + raise errors.ValidationError( + message=f"Received the right doi.org host but an unexpected scheme {doi_parsed} for doi {doi}." + ) + doi_clean = doi_parsed.path.strip("/") + if doi.startswith("doi:"): + doi_clean = doi[4:] + if not doi_clean or not DOI.__regex.match(doi_clean): + raise errors.ValidationError(message=f"The provided value {doi} is not a valid doi.") + return super().__new__(cls, doi_clean) + + @property + def url(self) -> str: + """Return a proper URL from the doi.""" + return f"https://doi.org/{self}" + + async def resolve_host(self) -> str | None: + """Resolves the DOI and returns the hostname of the url where the redirect leads.""" + clnt = httpx.AsyncClient(timeout=5, follow_redirects=True) + async with clnt: + try: + res = await clnt.get(self.url) + except httpx.HTTPError: + return None + if res.status_code != 200: + return None + return res.url.host @dataclass(frozen=True, eq=True, kw_only=True) @@ -67,3 +119,26 @@ class DataverseDatasetResponse(BaseModel): status: str = Field() data: DataverseDataset | None = Field() + + +class SchemaOrgDistribution(BaseModel): + """The distribution field of a schema.org dataset.""" + + model_config = ConfigDict(extra="ignore") + type: str = Field(alias="@type") + content_url: str = Field(alias="contentUrl") + + +class SchemaOrgDataset(BaseModel): + """A very limited and partial spec of a schema.org Dataset used by Scicat and Envidat.""" + + model_config = ConfigDict(extra="ignore") + distribution: list[SchemaOrgDistribution] = Field(default_factory=list) + name: str = Field() + description: str | None = None + raw_keywords: str = Field(alias="keywords", default="") + + @property + def keywords(self) -> list[str]: + """Split the single keywords string into a list.""" + return [i.strip() for i in self.raw_keywords.split(",")] diff --git a/components/renku_data_services/data_connectors/schema_org_dataset.py b/components/renku_data_services/data_connectors/doi/schema_org.py similarity index 76% rename from components/renku_data_services/data_connectors/schema_org_dataset.py rename to components/renku_data_services/data_connectors/doi/schema_org.py index a77498843..23d8bfa8a 100644 --- a/components/renku_data_services/data_connectors/schema_org_dataset.py +++ b/components/renku_data_services/data_connectors/doi/schema_org.py @@ -4,27 +4,10 @@ from enum import StrEnum from urllib.parse import parse_qs, urlparse -from pydantic import BaseModel, ConfigDict, Field - +from renku_data_services.data_connectors.doi.models import SchemaOrgDataset from renku_data_services.errors import errors -class Distribution(BaseModel): - """The distribution field of a schema.org dataset.""" - - model_config = ConfigDict(extra="ignore") - type: str - contentUrl: str - name: str - - -class Dataset(BaseModel): - """A very limited and partial spec of a schema.org Dataset used by Scicat and Envidat.""" - - model_config = ConfigDict(extra="ignore") - distribution: list[Distribution] = Field(default_factory=list) - - class DatasetProvider(StrEnum): """The provider for the dataset.""" @@ -42,10 
+25,10 @@ class S3Config: @property def path(self) -> str: """Return the path including the bucket name and the prefix.""" - return f"{self.bucket}/{self.prefix}" + return f"/{self.bucket}{self.prefix}" -def get_rclone_config(dataset: Dataset, provider: DatasetProvider) -> S3Config: +def get_rclone_config(dataset: SchemaOrgDataset, provider: DatasetProvider) -> S3Config: """Parse the dataset into an rclone configuration.""" match provider: case DatasetProvider.envidat: @@ -55,10 +38,10 @@ def get_rclone_config(dataset: Dataset, provider: DatasetProvider) -> S3Config: raise errors.ValidationError(message=f"Got an unknown dataset provider {x}") -def __get_rclone_s3_config_envidat(dataset: Dataset) -> S3Config: +def __get_rclone_s3_config_envidat(dataset: SchemaOrgDataset) -> S3Config: """Get the S3 rclone configuration and source path from a dataset returned by envidat.""" # NOTE: The folks from Envidat assure us that the first entity in the list is the one we want - url = dataset.distribution[0].contentUrl + url = dataset.distribution[0].content_url # NOTE: The folks from Envidat assure us that the URL has the following format # http://./?prefix= url_parsed = urlparse(url) @@ -79,7 +62,7 @@ def __get_rclone_s3_config_envidat(dataset: Dataset) -> S3Config: message="The envidat s3 url is expected to have a host name with at least two parts." ) s3_host = ".".join(host_split[1:]) - bucket = host_split[0] + bucket = host_split[0].strip("/") prefix = "/" + prefix.strip("/") return S3Config( { @@ -87,6 +70,6 @@ def __get_rclone_s3_config_envidat(dataset: Dataset) -> S3Config: "provider": "Other", "endpoint": f"{url_parsed.scheme}://{s3_host}", }, - bucket.strip("/"), + bucket, prefix, ) diff --git a/components/renku_data_services/data_connectors/models.py b/components/renku_data_services/data_connectors/models.py index df0bcd880..e10084809 100644 --- a/components/renku_data_services/data_connectors/models.py +++ b/components/renku_data_services/data_connectors/models.py @@ -14,7 +14,9 @@ NamespacePath, ProjectPath, ) +from renku_data_services.data_connectors.doi.models import DOI from renku_data_services.namespace.models import GroupNamespace, ProjectNamespace, UserNamespace +from renku_data_services.storage.rclone import RCloneDOIMetadata from renku_data_services.utils.etag import compute_etag_from_fields if TYPE_CHECKING: @@ -99,6 +101,15 @@ class UnsavedGlobalDataConnector(BaseDataConnector): namespace: None = None +@dataclass(frozen=True, eq=True, kw_only=True) +class PrevalidatedGlobalDataConnector: + """Global data connector model that is unsaved but has been pre-validated.""" + + data_connector: UnsavedGlobalDataConnector + doi: DOI + rclone_metadata: RCloneDOIMetadata | None = None + + @dataclass(frozen=True, eq=True, kw_only=True) class DeletedDataConnector: """A dataconnector that has been deleted.""" diff --git a/components/renku_data_services/storage/rclone.py b/components/renku_data_services/storage/rclone.py index 385a6af58..2a3841d67 100644 --- a/components/renku_data_services/storage/rclone.py +++ b/components/renku_data_services/storage/rclone.py @@ -1,5 +1,7 @@ """Apispec schemas for storage service.""" +from __future__ import annotations + import asyncio import json import tempfile @@ -46,7 +48,7 @@ def __init__(self) -> None: logger.error("Couldn't load RClone config: %s", provider_config) raise - def validate(self, configuration: Union["RCloneConfig", dict[str, Any]], keep_sensitive: bool = False) -> None: + def validate(self, configuration: Union[RCloneConfig, dict[str, 
Any]], keep_sensitive: bool = False) -> None: """Validates an RClone config.""" provider = self.get_provider(configuration) @@ -81,7 +83,7 @@ def get_real_configuration(self, configuration: Union["RCloneConfig", dict[str, return real_config async def test_connection( - self, configuration: Union["RCloneConfig", dict[str, Any]], source_path: str + self, configuration: Union[RCloneConfig, dict[str, Any]], source_path: str ) -> ConnectionResult: """Tests connecting with an RClone config.""" try: @@ -121,21 +123,21 @@ async def test_connection( return ConnectionResult(success=success, error=error.decode()) async def obscure_config( - self, configuration: Union["RCloneConfig", dict[str, Any]] - ) -> Union["RCloneConfig", dict[str, Any]]: + self, configuration: Union[RCloneConfig, dict[str, Any]] + ) -> Union[RCloneConfig, dict[str, Any]]: """Obscure secrets in rclone config.""" provider = self.get_provider(configuration) result = await provider.obscure_password_options(configuration) return result - def remove_sensitive_options_from_config(self, configuration: Union["RCloneConfig", dict[str, Any]]) -> None: + def remove_sensitive_options_from_config(self, configuration: Union[RCloneConfig, dict[str, Any]]) -> None: """Remove sensitive fields from a config, e.g. when turning a private storage public.""" provider = self.get_provider(configuration) provider.remove_sensitive_options_from_config(configuration) - def get_provider(self, configuration: Union["RCloneConfig", dict[str, Any]]) -> "RCloneProviderSchema": + def get_provider(self, configuration: Union[RCloneConfig, dict[str, Any]]) -> RCloneProviderSchema: """Get a provider for configuration.""" storage_type = cast(str, configuration.get("type")) @@ -158,13 +160,13 @@ def asdict(self) -> list[dict[str, Any]]: return [provider.model_dump(exclude_none=True, by_alias=True) for provider in self.providers.values()] def get_private_fields( - self, configuration: Union["RCloneConfig", dict[str, Any]] - ) -> Generator["RCloneOption", None, None]: + self, configuration: Union[RCloneConfig, dict[str, Any]] + ) -> Generator[RCloneOption, None, None]: """Get private field descriptions for storage.""" provider = self.get_provider(configuration) return provider.get_private_fields(configuration) - async def get_doi_metadata(self, configuration: Union["RCloneConfig", dict[str, Any]]) -> "RCloneDOIMetadata": + async def get_doi_metadata(self, configuration: Union[RCloneConfig, dict[str, Any]]) -> RCloneDOIMetadata | None: """Returns the metadata of a DOI remote.""" provider = self.get_provider(configuration) if provider.name != "doi": @@ -192,14 +194,9 @@ async def get_doi_metadata(self, configuration: Union["RCloneConfig", dict[str, if success: metadata = RCloneDOIMetadata.model_validate_json(stdout.decode().strip()) return metadata - raise errors.ValidationError( - message=f"Could not resolve DOI {configuration.get("doi", "")} or the hosting platform is not supported", # noqa E501 - detail=f"Reason: {stderr.decode().strip()}", - ) + return None - def inject_default_values( - self, config: Union["RCloneConfig", dict[str, Any]] - ) -> Union["RCloneConfig", dict[str, Any]]: + def inject_default_values(self, config: Union[RCloneConfig, dict[str, Any]]) -> Union[RCloneConfig, dict[str, Any]]: """Adds default values for required options that are not provided in the config.""" provider = self.get_provider(config) cfg_provider: str | None = config.get("provider") @@ -220,8 +217,8 @@ def inject_default_values( @staticmethod def 
transform_polybox_switchdriver_config( - configuration: Union["RCloneConfig", dict[str, Any]], - ) -> Union["RCloneConfig", dict[str, Any]]: + configuration: Union[RCloneConfig, dict[str, Any]], + ) -> Union[RCloneConfig, dict[str, Any]]: """Transform the configuration for public access.""" storage_type = configuration.get("type") @@ -408,9 +405,7 @@ def check_unsafe_option(self, name: str) -> None: raise errors.ValidationError(message=f"The {name} option is not allowed.") return None - def validate_config( - self, configuration: Union["RCloneConfig", dict[str, Any]], keep_sensitive: bool = False - ) -> None: + def validate_config(self, configuration: Union[RCloneConfig, dict[str, Any]], keep_sensitive: bool = False) -> None: """Validate an RClone config.""" keys = set(configuration.keys()) - {"type"} provider: str | None = configuration.get("provider") @@ -446,15 +441,15 @@ def validate_config( configuration[key] = option.validate_config(value, provider=provider, keep_sensitive=keep_sensitive) - def remove_sensitive_options_from_config(self, configuration: Union["RCloneConfig", dict[str, Any]]) -> None: + def remove_sensitive_options_from_config(self, configuration: Union[RCloneConfig, dict[str, Any]]) -> None: """Remove sensitive options from configuration.""" for sensitive in self.sensitive_options: if sensitive.name in configuration: del configuration[sensitive.name] async def obscure_password_options( - self, configuration: Union["RCloneConfig", dict[str, Any]] - ) -> Union["RCloneConfig", dict[str, Any]]: + self, configuration: Union[RCloneConfig, dict[str, Any]] + ) -> Union[RCloneConfig, dict[str, Any]]: """Obscure all password options.""" for passwd in self.password_options: if val := configuration.get(passwd.name): @@ -475,7 +470,7 @@ async def obscure_password_options( return configuration def get_private_fields( - self, configuration: Union["RCloneConfig", dict[str, Any]] + self, configuration: Union[RCloneConfig, dict[str, Any]] ) -> Generator[RCloneOption, None, None]: """Get private field descriptions for storage.""" provider: str | None = configuration.get("provider") diff --git a/components/renku_data_services/storage/rclone_patches.py b/components/renku_data_services/storage/rclone_patches.py index 7016c75e1..298044c84 100644 --- a/components/renku_data_services/storage/rclone_patches.py +++ b/components/renku_data_services/storage/rclone_patches.py @@ -355,8 +355,8 @@ def apply_patches(spec: list[dict[str, Any]]) -> None: __patch_schema_remove_oauth_propeties, __patch_polybox_storage, __patch_switchdrive_storage, - __add_custom_doi_s3_provider("Envidat", "Envidat data provider", ENVIDAT_V1_PROVIDER), - __add_custom_doi_s3_provider("SciCat", "SciCat data provider", SCICAT_V1_PROVIDER), + __add_custom_doi_s3_provider("Envidat", "Envidat data provider", "doi"), + __add_custom_doi_s3_provider("SciCat", "SciCat data provider", "doi"), __patch_schema_remove_banned_sftp_options, __patch_schema_add_openbis_type, ] diff --git a/test/bases/renku_data_services/data_api/test_data_connectors.py b/test/bases/renku_data_services/data_api/test_data_connectors.py index 9f08a08cc..47e495e94 100644 --- a/test/bases/renku_data_services/data_api/test_data_connectors.py +++ b/test/bases/renku_data_services/data_api/test_data_connectors.py @@ -9,9 +9,10 @@ from renku_data_services.authz.models import Visibility from renku_data_services.base_models.core import NamespacePath, ProjectPath from renku_data_services.data_connectors import core +from renku_data_services.data_connectors.apispec import 
CloudStorageCorePost, GlobalDataConnectorPost from renku_data_services.data_connectors.doi.models import DOIMetadata from renku_data_services.namespace.models import NamespaceKind -from renku_data_services.storage.rclone import RCloneDOIMetadata +from renku_data_services.storage.rclone import RCloneDOIMetadata, RCloneValidator from renku_data_services.users.models import UserInfo from renku_data_services.utils.core import get_openbis_session_token_for_anonymous_user from test.bases.renku_data_services.data_api.utils import merge_headers @@ -2477,3 +2478,27 @@ async def _mock(*args, **kwargs) -> DOIMetadata | None: "_get_dataset_metadata_dataverse", _mock_get_dataset_metadata(_orig_get_dataset_metadata_dataverse), ) + + +async def test_validate_envidat_data_connector() -> None: + body = GlobalDataConnectorPost( + storage=CloudStorageCorePost( + storage_type="doi", + configuration={"type": "doi", "doi": "10.16904/12"}, + source_path="/", + target_path="/", + readonly=True, + ) + ) + validator = RCloneValidator() + res = await core.prevalidate_unsaved_global_data_connector(body, validator) + config = res.data_connector.storage.configuration + assert config["type"] == "s3" + assert config["provider"] == "Other" + assert config["endpoint"].find("zhdk.cloud.switch.ch") >= 0 + assert res.data_connector.storage.source_path == "/envidat-doi/10.16904_12" + res = await core.validate_unsaved_global_data_connector(res, validator) + assert res.description is not None + assert len(res.description) > 0 + assert res.keywords is not None + assert len(res.keywords) > 0 diff --git a/test/components/renku_data_services/data_connectors/test_doi.py b/test/components/renku_data_services/data_connectors/test_doi.py new file mode 100644 index 000000000..a4e0a9129 --- /dev/null +++ b/test/components/renku_data_services/data_connectors/test_doi.py @@ -0,0 +1,42 @@ +import pytest + +from renku_data_services.data_connectors.doi.models import DOI +from renku_data_services.errors import errors + + +@pytest.mark.parametrize( + ("raw_doi", "expected_value"), + [ + ("doi:10.16904/12", "10.16904/12"), + ("10.16904/12", "10.16904/12"), + ("https://www.doi.org/10.16904/12", "10.16904/12"), + ("http://www.doi.org/10.16904/12", "10.16904/12"), + ("http://doi.org/10.16904/12", "10.16904/12"), + ("http://doi.org/10.16904/12//", "10.16904/12"), + ("http://doi.org/10.16904/12/", "10.16904/12"), + ("http://doi.org/10.16904/12/?query=something#fragment", "10.16904/12"), + ("http://doi.org/10.16904/12?query=something#fragment", "10.16904/12"), + ("10.5281/zenodo.3831980", "10.5281/zenodo.3831980"), + ], +) +def test_valid_doi_parsing(raw_doi: str, expected_value: str) -> None: + assert DOI(raw_doi) == expected_value + + +@pytest.mark.parametrize( + "raw_doi", + [ + "wrong:10.16904/12", + "10.1690423423432423423423/12", + "s3://www.doi.org/10.16904/12", + "http://test.com/10.16904/12", + "bad", + "really bad", + "", + "https:10.16904/12", + "s3:10.16904/12", + ], +) +def test_invalid_doi_parsing(raw_doi: str) -> None: + with pytest.raises(errors.ValidationError): + DOI(raw_doi) diff --git a/test/components/renku_data_services/data_connectors/test_metadata.py b/test/components/renku_data_services/data_connectors/test_metadata.py new file mode 100644 index 000000000..3e76dd5b1 --- /dev/null +++ b/test/components/renku_data_services/data_connectors/test_metadata.py @@ -0,0 +1,186 @@ +from renku_data_services.data_connectors.doi.models import SchemaOrgDataset + +envidat_sample_response = """ +{ + "@context": "http://schema.org", + "@type": 
"https://schema.org/Dataset", + "url": "https://envidat.ch/#/metadata/ch2014", + "@id": "https://doi.org/10.16904/12", + "identifier": "https://doi.org/10.16904/12", + "sameAs": { + "@type": "Dataset", + "@id": "https://envidat.ch/#/metadata/c8696023-5622-481d-952a-13f88c35e9fe" + }, + "inLanguage": { + "alternateName": "eng", + "@type": "Language", + "name": "English" + }, + "publisher": { + "@type": "Organization", + "name": "EnviDat" + }, + "contentSize": "33.0 GB", + "size": "33.0 GB", + "datePublished": "2017", + "creator": [ + { + "@type": "Person", + "name": "Schoegl, Sebastian", + "givenName": "Sebastian", + "familyName": "Schoegl", + "affiliation": [ + { + "@type": "Organization", + "name": "WSL Institute for Snow and Avanche Research SLF" + } + ] + }, + { + "@type": "Person", + "name": "Marty, Christoph", + "givenName": "Christoph", + "familyName": "Marty", + "affiliation": [ + { + "@type": "Organization", + "name": "WSL Institute for Snow and Avanche Research SLF" + } + ] + }, + { + "@type": "Person", + "name": "Bavay, Mathias", + "givenName": "Mathias", + "familyName": "Bavay", + "affiliation": [ + { + "@type": "Organization", + "name": "WSL Institute for Snow and Avanche Research SLF" + }, + { + "@type": "Organization", + "name": "SLF" + } + ], + "@id": "0000-0002-5039-1578" + }, + { + "@type": "Person", + "name": "Lehning, Michael", + "givenName": "Michael", + "familyName": "Lehning", + "affiliation": [ + { + "@type": "Organization", + "name": "WSL Institute for Snow and Avanche Research SLF" + } + ] + } + ], + "author": [ + { + "@type": "Person", + "name": "Schoegl, Sebastian", + "givenName": "Sebastian", + "familyName": "Schoegl", + "affiliation": [ + { + "@type": "Organization", + "name": "WSL Institute for Snow and Avanche Research SLF" + } + ] + }, + { + "@type": "Person", + "name": "Marty, Christoph", + "givenName": "Christoph", + "familyName": "Marty", + "affiliation": [ + { + "@type": "Organization", + "name": "WSL Institute for Snow and Avanche Research SLF" + } + ] + }, + { + "@type": "Person", + "name": "Bavay, Mathias", + "givenName": "Mathias", + "familyName": "Bavay", + "affiliation": [ + { + "@type": "Organization", + "name": "WSL Institute for Snow and Avanche Research SLF" + }, + { + "@type": "Organization", + "name": "SLF" + } + ], + "@id": "0000-0002-5039-1578" + }, + { + "@type": "Person", + "name": "Lehning, Michael", + "givenName": "Michael", + "familyName": "Lehning", + "affiliation": [ + { + "@type": "Organization", + "name": "WSL Institute for Snow and Avanche Research SLF" + } + ] + } + ], + "keywords": "CH2011, CH2014, CLIMATE CHANGE, SNOW DEPTH, SNOW WATER EQUIVALENT", + "temporal": [ + "2017-02-07" + ], + "distribution": [ + { + "@type": "DataDownload", + "contentUrl": "https://envidat-doi.os.zhdk.cloud.switch.ch/?prefix=10.16904_12", + "name": "All resources in one place" + }, + { + "@type": "DataDownload", + "contentUrl": "http://ch2014-impacts.ch/", + "contentSize": 0, + "encodingFormat": "No Info", + "name": "CH2014 REPORT" + }, + { + "@type": "DataDownload", + "contentUrl": "https://envicloud.wsl.ch/#/?prefix=doi/12/ch2014/", + "contentSize": 35433480192, + "encodingFormat": "No Info", + "name": "Dataset" + }, + { + "@type": "DataDownload", + "contentUrl": "https://www.envidat.ch/dataset/c8696023-5622-481d-952a-13f88c35e9fe/resource/41555ebb-435b-40a3-b338-826e1c3172e3/download/graubunden_input.tar.bz2", + "contentSize": 1208440, + "encodingFormat": "application/x-tar", + "name": "Graubunden input" + }, + { + "@type": "DataDownload", + 
"contentUrl": "https://www.envidat.ch/dataset/c8696023-5622-481d-952a-13f88c35e9fe/resource/ad886f54-bc18-4972-9fd0-db9bda3f7dd5/download/aare_input.tar.bz2", + "contentSize": 360212, + "encodingFormat": "application/x-tar", + "name": "Aare input" + } + ], + "name": "Alpine3D simulations of future climate scenarios CH2014", + "description": "# Overview\\r\\n\\r\\nThe CH2014-Impacts initiative is a concerted national effort to describe impacts of climate change in Switzerland quantitatively, drawing on the scientific resources available in Switzerland today. The initiative links the recently developed Swiss Climate Change Scenarios CH2011 with an evolving base of quantitative impact models. The use of a common climate data set across disciplines and research groups sets a high standard of consistency and comparability of results. Impact studies explore the wide range of climatic changes in temperature and precipitation projected in CH2011 for the 21st century, which vary with the assumed global level of greenhouse gases, the time horizon, the underlying climate model, and the geographical region within Switzerland. The differences among climate projections are considered using three greenhouse gas scenarios, three future time periods in the 21st century, and three climate uncertainty levels (Figure 1). Impacts are shown with respect to the reference period 1980-2009 of CH2011, and add to any impacts that have already emerged as a result of earlier climate change.\\r\\n\\r\\n# Experimental Setup\\r\\n\\r\\nFuture snow cover changes are simulated with the physics-based model Alpine3D (Lehning et al., 2006). It is applied to two regions: The canton of Graub\u00fcnden and the Aare catchment. These domains are modeled with a Digital Elevation Model (DEM) with a resolution of 200 m \u00d7 200 m. This defines the simulation grid that has to be filled with land cover data and downscaled meteorological input data for each cell for the time period of interest at hourly resolution. The reference data set consists of automatic weather station data. All meteorological input parameters are spatially interpolated to the simulation grid. The reference period comprises only thirteen years (1999\u20132012), because the number of available high elevation weather stations for earlier times is not sufficient to achieve unbiased distribution of the observations with elevation. The model uses projected temperature and precipitation changes for all greenhouse gas scenarios (A1B, A2, and RCP3PD) and CH2011 time periods (2035, 2060, and 2085).\\r\\n\\r\\n# Data\\r\\n\\r\\nSnow cover changes are projected to be relatively small in the near term (2035) (Figure 5.1 top), in particular at higher elevations above 2000 m asl. As shown by Bavay et al. (2013) the spread in projected snow cover for this period is greater between different climate model chains (Chapter 3) than between the reference period and the model chain exhibiting the most moderate change. In the 2085 period much larger changes with the potential to fundamentally transform the snow dominated alpine area become apparent (Figure 5.1 bottom). These changes include a shortening of the snow season by 5\u20139 weeks for the A1B scenario. This is roughly equivalent to an elevation shift of 400\u2013800 m. The slight increase of winter precipitation and therefore snow fall projected in the CH2011 scenarios (with high associated uncertainty) can no longer compensate for the effect of increasing winter temperatures even at high elevations. 
In terms of Snow Water Equivalents (SWE), the projected reduction is up to two thirds toward the end of the century (2085). A continuous snow cover will be restricted to a shorter time period and/or to regions at increasingly high elevation. In Bern, for example, the number of days per year with at least 5 cm snow depth will decrease by 90% from now 20 days to only 2 days on average.\\r\\n", + "dateCreated": "2017-02-07T08:21:09.738064", + "dateModified": "2025-11-07T09:29:55.466890", + "version": "1", + "license": "https://opendefinition.org/licenses/odc-odbl" +} +""" # noqa: E501 + + +def test_envidat_metadata_parsing() -> None: + SchemaOrgDataset.model_validate_json(envidat_sample_response) From 3c6f2323490f7e73b1078176e40fa65655a4e931 Mon Sep 17 00:00:00 2001 From: Tasko Olevski Date: Fri, 28 Nov 2025 17:03:27 +0100 Subject: [PATCH 03/10] chore: enable connection testing --- components/renku_data_services/storage/rclone.py | 13 +++++++++++++ .../renku_data_services/storage/rclone_patches.py | 6 +++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/components/renku_data_services/storage/rclone.py b/components/renku_data_services/storage/rclone.py index 2a3841d67..93a5b8b4e 100644 --- a/components/renku_data_services/storage/rclone.py +++ b/components/renku_data_services/storage/rclone.py @@ -13,6 +13,7 @@ from renku_data_services import errors from renku_data_services.app_config import logging +from renku_data_services.storage.constants import ENVIDAT_V1_PROVIDER from renku_data_services.storage.rclone_patches import BANNED_SFTP_OPTIONS, BANNED_STORAGE, apply_patches logger = logging.getLogger(__name__) @@ -94,6 +95,7 @@ async def test_connection( # Obscure configuration and transform if needed obscured_config = await self.obscure_config(self.get_real_configuration(configuration)) transformed_config = self.inject_default_values(self.transform_polybox_switchdriver_config(obscured_config)) + transformed_config = self.transform_envidat_config(transformed_config) with tempfile.NamedTemporaryFile(mode="w+", delete=False, encoding="utf-8") as f: config = "\n".join(f"{k}={v}" for k, v in transformed_config.items()) @@ -254,6 +256,17 @@ def transform_polybox_switchdriver_config( return configuration + @staticmethod + def transform_envidat_config(configuration: RCloneConfig | dict[str, Any]) -> RCloneConfig | dict[str, Any]: + """Used to convert the configuration for Envidat into a real configuration.""" + storage_type = configuration.get("type") + if storage_type is None: + return configuration + if storage_type != ENVIDAT_V1_PROVIDER: + return configuration + configuration["type"] = "doi" + return configuration + class RCloneTriState(BaseModel): """Represents a Tristate of true|false|unset.""" diff --git a/components/renku_data_services/storage/rclone_patches.py b/components/renku_data_services/storage/rclone_patches.py index 298044c84..24c135fe6 100644 --- a/components/renku_data_services/storage/rclone_patches.py +++ b/components/renku_data_services/storage/rclone_patches.py @@ -5,7 +5,6 @@ from typing import Any, Final, cast from renku_data_services import errors -from renku_data_services.storage.constants import ENVIDAT_V1_PROVIDER, SCICAT_V1_PROVIDER BANNED_STORAGE: Final[set[str]] = { "alias", @@ -355,8 +354,9 @@ def apply_patches(spec: list[dict[str, Any]]) -> None: __patch_schema_remove_oauth_propeties, __patch_polybox_storage, __patch_switchdrive_storage, - __add_custom_doi_s3_provider("Envidat", "Envidat data provider", "doi"), - __add_custom_doi_s3_provider("SciCat", 
"SciCat data provider", "doi"), + # __add_custom_doi_s3_provider("Envidat", "Envidat data provider", ENVIDAT_V1_PROVIDER), + # TODO: Enable Scicat when it is ready in production + # __add_custom_doi_s3_provider("SciCat", "SciCat data provider", SCICAT_V1_PROVIDER), __patch_schema_remove_banned_sftp_options, __patch_schema_add_openbis_type, ] From 9adef08c4e8fdbbbe0f30160a66c83df4b171a5b Mon Sep 17 00:00:00 2001 From: Tasko Olevski Date: Wed, 3 Dec 2025 10:59:40 +0100 Subject: [PATCH 04/10] feat: add doi and publisher info to global dcs --- .../data_connectors/api.spec.yaml | 29 ++++++++- .../data_connectors/apispec.py | 11 +++- .../data_connectors/blueprints.py | 3 + .../data_connectors/core.py | 17 +++++- .../renku_data_services/data_connectors/db.py | 10 ++++ .../data_connectors/doi/models.py | 60 ++++++++++++++++--- .../data_connectors/models.py | 7 ++- .../data_connectors/orm.py | 7 +++ ..._add_doi_and_publisher_info_for_global_.py | 50 ++++++++++++++++ .../data_api/test_data_connectors.py | 23 +++++++ 10 files changed, 204 insertions(+), 13 deletions(-) create mode 100644 components/renku_data_services/migrations/versions/bd97866a6253_add_doi_and_publisher_info_for_global_.py diff --git a/components/renku_data_services/data_connectors/api.spec.yaml b/components/renku_data_services/data_connectors/api.spec.yaml index f5bc0094c..e3367c836 100644 --- a/components/renku_data_services/data_connectors/api.spec.yaml +++ b/components/renku_data_services/data_connectors/api.spec.yaml @@ -475,6 +475,14 @@ components: $ref: "#/components/schemas/ETag" keywords: $ref: "#/components/schemas/KeywordsList" + doi: + $ref: "#/components/schemas/DOI" + publisher_name: + type: string + description: The publisher of the dataset. + publisher_url: + type: string + description: The URL for the publisher of the dataset. required: - id - name @@ -766,7 +774,22 @@ components: type: type: string description: data type of option value. RClone has more options but they map to the ones listed here. - enum: ["int", "bool", "string", "stringArray", "Time", "Duration", "MultiEncoder", "SizeSuffix", "SpaceSepList", "CommaSepList", "Tristate", "Encoding", "Bits"] + enum: + [ + "int", + "bool", + "string", + "stringArray", + "Time", + "Duration", + "MultiEncoder", + "SizeSuffix", + "SpaceSepList", + "CommaSepList", + "Tristate", + "Encoding", + "Bits", + ] required: - name - help @@ -870,6 +893,10 @@ components: type: string description: Entity Tag example: "9EE498F9D565D0C41E511377425F32F3" + DOI: + type: string + description: "A DOI." + example: "10.16904/envidat.33" DataConnectorsGetQuery: description: Query params for data connectors get request allOf: diff --git a/components/renku_data_services/data_connectors/apispec.py b/components/renku_data_services/data_connectors/apispec.py index 895a5bef4..c5d5303ba 100644 --- a/components/renku_data_services/data_connectors/apispec.py +++ b/components/renku_data_services/data_connectors/apispec.py @@ -1,6 +1,6 @@ # generated by datamodel-codegen: # filename: api.spec.yaml -# timestamp: 2025-06-19T07:18:06+00:00 +# timestamp: 2025-12-03T09:49:11+00:00 from __future__ import annotations @@ -364,6 +364,15 @@ class DataConnector(BaseAPISpec): examples=[["project", "keywords"]], min_length=0, ) + doi: Optional[str] = Field( + None, description="A DOI.", examples=["10.16904/envidat.33"] + ) + publisher_name: Optional[str] = Field( + None, description="The publisher of the dataset." 
+ ) + publisher_url: Optional[str] = Field( + None, description="The URL for the publisher of the dataset." + ) class DataConnectorPost(BaseAPISpec): diff --git a/components/renku_data_services/data_connectors/blueprints.py b/components/renku_data_services/data_connectors/blueprints.py index 46c7cddda..35ebd0786 100644 --- a/components/renku_data_services/data_connectors/blueprints.py +++ b/components/renku_data_services/data_connectors/blueprints.py @@ -503,6 +503,9 @@ def _dump_data_connector( description=data_connector.description, etag=data_connector.etag, keywords=data_connector.keywords or [], + doi=data_connector.doi, + publisher_name=data_connector.publisher_name, + publisher_url=data_connector.publisher_url, ) return dict( id=str(data_connector.id), diff --git a/components/renku_data_services/data_connectors/core.py b/components/renku_data_services/data_connectors/core.py index f7724ea58..798fba48d 100644 --- a/components/renku_data_services/data_connectors/core.py +++ b/components/renku_data_services/data_connectors/core.py @@ -193,6 +193,7 @@ async def prevalidate_unsaved_global_data_connector( storage.configuration["provider"] = rclone_metadata.provider slug = base_models.Slug.from_name(doi_uri).value + doi_metadata = await doi.metadata() return models.PrevalidatedGlobalDataConnector( data_connector=models.UnsavedGlobalDataConnector( name=doi_uri, @@ -202,8 +203,14 @@ async def prevalidate_unsaved_global_data_connector( storage=storage, description=None, keywords=[], + doi=doi, + publisher_url=None + if doi_metadata is None or doi_metadata.publisher is None + else doi_metadata.publisher.url, + publisher_name=None + if doi_metadata is None or doi_metadata.publisher is None + else doi_metadata.publisher.name, ), - doi=doi, rclone_metadata=rclone_metadata, ) @@ -214,9 +221,12 @@ async def validate_unsaved_global_data_connector( ) -> models.UnsavedGlobalDataConnector: """Validate the data connector.""" data_connector = prevalidated_dc.data_connector - doi = prevalidated_dc.doi + doi = prevalidated_dc.data_connector.doi rclone_metadata = prevalidated_dc.rclone_metadata + if not doi: + raise errors.ValidationError(message="Global data connectors require a DOI.") + # Check that we can list the files in the DOI connection_result = await validator.test_connection( configuration=data_connector.storage.configuration, source_path=data_connector.storage.source_path or "/" @@ -281,6 +291,9 @@ async def validate_unsaved_global_data_connector( storage=storage, description=description or None, keywords=keywords, + doi=data_connector.doi, + publisher_name=data_connector.publisher_name, + publisher_url=data_connector.publisher_url, ) diff --git a/components/renku_data_services/data_connectors/db.py b/components/renku_data_services/data_connectors/db.py index 6f4445d64..0ca942904 100644 --- a/components/renku_data_services/data_connectors/db.py +++ b/components/renku_data_services/data_connectors/db.py @@ -30,6 +30,7 @@ from renku_data_services.data_connectors import apispec, models from renku_data_services.data_connectors import orm as schemas from renku_data_services.data_connectors.core import validate_unsaved_global_data_connector +from renku_data_services.data_connectors.doi.models import DOI from renku_data_services.namespace import orm as ns_schemas from renku_data_services.namespace.db import GroupRepository from renku_data_services.namespace.models import ProjectNamespace @@ -292,6 +293,9 @@ async def _insert_data_connector( slug = data_connector.slug or 
base_models.Slug.from_name(data_connector.name).value + doi: DOI | None = None + publisher_url: str | None = None + publisher_name: str | None = None if ns is not None and isinstance(data_connector, models.UnsavedDataConnector): existing_slug_stmt = ( select(ns_schemas.EntitySlugORM) @@ -313,6 +317,9 @@ async def _insert_data_connector( existing_global_dc = await session.scalar(existing_global_dc_stmt) if existing_global_dc is not None: raise errors.ConflictError(message=f"An entity with the slug '{data_connector.slug}' already exists.") + doi = data_connector.doi + publisher_name = data_connector.publisher_name + publisher_url = data_connector.publisher_url visibility_orm = ( apispec.Visibility(data_connector.visibility) @@ -331,6 +338,9 @@ async def _insert_data_connector( description=data_connector.description, keywords=data_connector.keywords, global_slug=slug if isinstance(data_connector, models.UnsavedGlobalDataConnector) else None, + doi=doi, + publisher_url=publisher_url, + publisher_name=publisher_name, ) if ns is not None: data_connector_slug = ns_schemas.EntitySlugORM.create_data_connector_slug( diff --git a/components/renku_data_services/data_connectors/doi/models.py b/components/renku_data_services/data_connectors/doi/models.py index ed1422f9e..d5a4d8fd2 100644 --- a/components/renku_data_services/data_connectors/doi/models.py +++ b/components/renku_data_services/data_connectors/doi/models.py @@ -1,15 +1,19 @@ """Models for DOIs.""" +from __future__ import annotations + import re from dataclasses import dataclass from typing import Any, Self from urllib.parse import urlparse import httpx -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, ConfigDict, Field, ValidationError from renku_data_services.errors import errors +_clnt = httpx.AsyncClient(timeout=5, follow_redirects=True) + class DOI(str): """A doi for a dataset or a similar resource.""" @@ -45,18 +49,36 @@ def url(self) -> str: """Return a proper URL from the doi.""" return f"https://doi.org/{self}" + @property + def prefix(self) -> str: + """The prefix of the doi, i.e. 
if the doi is 10.7910/DVN/XLX9F8, then the prefix is 10.7910.""" + return self.split("/")[0] + async def resolve_host(self) -> str | None: """Resolves the DOI and returns the hostname of the url where the redirect leads.""" - clnt = httpx.AsyncClient(timeout=5, follow_redirects=True) - async with clnt: - try: - res = await clnt.get(self.url) - except httpx.HTTPError: - return None + try: + res = await _clnt.get(self.url) + except httpx.HTTPError: + return None if res.status_code != 200: return None return res.url.host + async def metadata(self) -> SchemaOrgDataset | None: + """Get information about the publisher of the DOI.""" + try: + res = await _clnt.get(self.url, headers={"Accept": "application/vnd.schemaorg.ld+json"}) + except httpx.HTTPError: + return None + if res.status_code != 200: + return None + try: + output = SchemaOrgDataset.model_validate_json(res.text) + except ValidationError: + return None + else: + return output + @dataclass(frozen=True, eq=True, kw_only=True) class DOIMetadata: @@ -130,15 +152,37 @@ class SchemaOrgDistribution(BaseModel): class SchemaOrgDataset(BaseModel): - """A very limited and partial spec of a schema.org Dataset used by Scicat and Envidat.""" + """A very limited and partial spec of a schema.org Dataset used by Scicat, Envidat, doi.org.""" model_config = ConfigDict(extra="ignore") distribution: list[SchemaOrgDistribution] = Field(default_factory=list) name: str = Field() description: str | None = None raw_keywords: str = Field(alias="keywords", default="") + publisher: SchemaOrgPublisher | None = None @property def keywords(self) -> list[str]: """Split the single keywords string into a list.""" return [i.strip() for i in self.raw_keywords.split(",")] + + +class SchemaOrgPublisher(BaseModel): + """The schema.org publisher field in a dataset.""" + + model_config = ConfigDict(extra="ignore") + id: str | None = Field(alias="@id", default=None) + type: str | None = Field(alias="@type", default=None) + name: str + + @property + def url(self) -> str | None: + """Try to see if the id is a URL, and if so return it.""" + if self.id is None: + return None + parsed = urlparse(self.id) + if not parsed.scheme or not parsed.netloc: + return None + if parsed.scheme not in ["http", "https"]: + return None + return self.id.rstrip("/") diff --git a/components/renku_data_services/data_connectors/models.py b/components/renku_data_services/data_connectors/models.py index e10084809..7f212e0fd 100644 --- a/components/renku_data_services/data_connectors/models.py +++ b/components/renku_data_services/data_connectors/models.py @@ -87,6 +87,9 @@ class GlobalDataConnector(BaseDataConnector): id: ULID namespace: Final[None] = field(default=None, init=False) updated_at: datetime + publisher_name: str | None = None + publisher_url: str | None = None + doi: DOI | None = None @property def etag(self) -> str: @@ -99,6 +102,9 @@ class UnsavedGlobalDataConnector(BaseDataConnector): """Global data connector model.""" namespace: None = None + publisher_name: str | None = None + publisher_url: str | None = None + doi: DOI | None = None @dataclass(frozen=True, eq=True, kw_only=True) @@ -106,7 +112,6 @@ class PrevalidatedGlobalDataConnector: """Global data connector model that is unsaved but has been pre-validated.""" data_connector: UnsavedGlobalDataConnector - doi: DOI rclone_metadata: RCloneDOIMetadata | None = None diff --git a/components/renku_data_services/data_connectors/orm.py b/components/renku_data_services/data_connectors/orm.py index f51d54d5c..ed7581f9d 100644 --- 
a/components/renku_data_services/data_connectors/orm.py +++ b/components/renku_data_services/data_connectors/orm.py @@ -13,6 +13,7 @@ from renku_data_services.base_orm.registry import COMMON_ORM_REGISTRY from renku_data_services.data_connectors import models from renku_data_services.data_connectors.apispec import Visibility +from renku_data_services.data_connectors.doi.models import DOI from renku_data_services.project.orm import ProjectORM from renku_data_services.secrets.orm import SecretORM from renku_data_services.users.orm import UserORM @@ -97,6 +98,9 @@ class DataConnectorORM(BaseORM): init=False, viewonly=True, ) + doi: Mapped[str | None] = mapped_column(default=None, server_default=None, index=True, nullable=True) + publisher_name: Mapped[str | None] = mapped_column(default=None, server_default=None, index=True, nullable=True) + publisher_url: Mapped[str | None] = mapped_column(default=None, server_default=None, index=True, nullable=True) def dump(self) -> models.DataConnector | models.GlobalDataConnector: """Create a data connector model from the DataConnectorORM.""" @@ -112,6 +116,9 @@ def dump(self) -> models.DataConnector | models.GlobalDataConnector: storage=self._dump_storage(), description=self.description, keywords=self.keywords, + publisher_name=self.publisher_name, + publisher_url=self.publisher_url, + doi=DOI(self.doi) if self.doi is not None else None, ) elif self.slug is None: diff --git a/components/renku_data_services/migrations/versions/bd97866a6253_add_doi_and_publisher_info_for_global_.py b/components/renku_data_services/migrations/versions/bd97866a6253_add_doi_and_publisher_info_for_global_.py new file mode 100644 index 000000000..2bdc7010f --- /dev/null +++ b/components/renku_data_services/migrations/versions/bd97866a6253_add_doi_and_publisher_info_for_global_.py @@ -0,0 +1,50 @@ +"""add doi and publisher info for global data connectors + +Revision ID: bd97866a6253 +Revises: 42049656cdb8 +Create Date: 2025-12-03 09:38:17.534403 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "bd97866a6253" +down_revision = "42049656cdb8" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.add_column("data_connectors", sa.Column("doi", sa.String(), nullable=True), schema="storage") + op.add_column("data_connectors", sa.Column("publisher_name", sa.String(), nullable=True), schema="storage") + op.add_column("data_connectors", sa.Column("publisher_url", sa.String(), nullable=True), schema="storage") + op.create_index(op.f("ix_storage_data_connectors_doi"), "data_connectors", ["doi"], unique=False, schema="storage") + op.create_index( + op.f("ix_storage_data_connectors_publisher_name"), + "data_connectors", + ["publisher_name"], + unique=False, + schema="storage", + ) + op.create_index( + op.f("ix_storage_data_connectors_publisher_url"), + "data_connectors", + ["publisher_url"], + unique=False, + schema="storage", + ) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_index(op.f("ix_storage_data_connectors_publisher_url"), table_name="data_connectors", schema="storage") + op.drop_index(op.f("ix_storage_data_connectors_publisher_name"), table_name="data_connectors", schema="storage") + op.drop_index(op.f("ix_storage_data_connectors_doi"), table_name="data_connectors", schema="storage") + op.drop_column("data_connectors", "publisher_url", schema="storage") + op.drop_column("data_connectors", "publisher_name", schema="storage") + op.drop_column("data_connectors", "doi", schema="storage") + # ### end Alembic commands ### diff --git a/test/bases/renku_data_services/data_api/test_data_connectors.py b/test/bases/renku_data_services/data_api/test_data_connectors.py index 47e495e94..6aa6341ea 100644 --- a/test/bases/renku_data_services/data_api/test_data_connectors.py +++ b/test/bases/renku_data_services/data_api/test_data_connectors.py @@ -2497,8 +2497,31 @@ async def test_validate_envidat_data_connector() -> None: assert config["provider"] == "Other" assert config["endpoint"].find("zhdk.cloud.switch.ch") >= 0 assert res.data_connector.storage.source_path == "/envidat-doi/10.16904_12" + assert res.data_connector.doi is not None + assert res.data_connector.publisher_url is not None + assert res.data_connector.publisher_name is not None res = await core.validate_unsaved_global_data_connector(res, validator) assert res.description is not None assert len(res.description) > 0 assert res.keywords is not None assert len(res.keywords) > 0 + assert res.doi is not None + assert res.publisher_url is not None + assert res.publisher_name is not None + + +async def test_add_envidat_data_connector(sanic_client: SanicASGITestClient, user_headers) -> None: + payload = { + "storage": { + "configuration": {"type": "doi", "doi": "10.16904/envidat.716"}, + "source_path": "/", + "target_path": "/", + "readonly": True, + } + } + _, res = await sanic_client.post("/api/data/data_connectors/global", json=payload, headers=user_headers) + assert res.status_code == 201 + assert res.json.get("doi") is not None + assert res.json.get("publisher_name") is not None + assert res.json.get("publisher_name").lower() == "envidat" + assert res.json.get("publisher_url") is not None From 2eb656f1dbed11d83e375dcff8e1f85de56298d3 Mon Sep 17 00:00:00 2001 From: Tasko Olevski Date: Wed, 10 Dec 2025 08:47:24 +0100 Subject: [PATCH 05/10] chore: add doi and publisher name to search --- components/renku_data_services/search/db.py | 2 + .../search/solr_user_query.py | 10 +++ .../renku_data_services/search/user_query.py | 82 +++++++++++++++++++ .../search/user_query_parser.py | 6 ++ .../solr/entity_documents.py | 4 +- .../renku_data_services/solr/entity_schema.py | 14 ++++ .../renku_data_services/solr/solr_schema.py | 2 + 7 files changed, 119 insertions(+), 1 deletion(-) diff --git a/components/renku_data_services/search/db.py b/components/renku_data_services/search/db.py index 72f434ae6..f0d963d47 100644 --- a/components/renku_data_services/search/db.py +++ b/components/renku_data_services/search/db.py @@ -80,6 +80,8 @@ def _dataconnector_to_entity_doc(dc: DataConnector | GlobalDataConnector) -> Dat description=dc.description, keywords=dc.keywords if dc.keywords is not None else [], version=DocVersions.off(), + doi=dc.doi if hasattr(dc, "doi") else None, + publisherName=dc.publisher_name if hasattr(dc, "publisher_name") else None, ) diff --git a/components/renku_data_services/search/solr_user_query.py b/components/renku_data_services/search/solr_user_query.py index d9319fb25..8907a648f 100644 --- 
a/components/renku_data_services/search/solr_user_query.py +++ b/components/renku_data_services/search/solr_user_query.py @@ -18,6 +18,7 @@ Created, CreatedByIs, DirectMemberIs, + DoiIs, IdIs, InheritedMemberIs, KeywordIs, @@ -25,6 +26,7 @@ NamespaceIs, Order, OrderBy, + PublisherNameIs, RoleIs, SlugIs, SortableField, @@ -344,6 +346,14 @@ async def visit_slug_is(self, ft: SlugIs) -> None: """Process slug-is segment.""" self.__append(st.field_is_any(Fields.slug, ft.values.map(st.from_str))) + async def visit_doi_is(self, ft: DoiIs) -> None: + """Process doi-is segment.""" + self.__append(st.field_is_any(Fields.doi, ft.values.map(st.from_str))) + + async def visit_publisher_name_is(self, ft: PublisherNameIs) -> None: + """Process publisher_name-is segment.""" + self.__append(st.field_is_any(Fields.publisher_name, ft.values.map(st.from_str))) + async def visit_visibility_is(self, ft: VisibilityIs) -> None: """Process visibility-is segment.""" self.__append(st.field_is_any(Fields.visibility, ft.values.map(st.from_visibility))) diff --git a/components/renku_data_services/search/user_query.py b/components/renku_data_services/search/user_query.py index b91721a6c..a70023bce 100644 --- a/components/renku_data_services/search/user_query.py +++ b/components/renku_data_services/search/user_query.py @@ -51,6 +51,8 @@ class Field(StrEnum): namespace = "namespace" direct_member = "direct_member" inherited_member = "inherited_member" + doi = "doi" + publisher_name = "publisher_name" class Comparison(StrEnum): @@ -631,6 +633,54 @@ async def accept(self, visitor: SegmentVisitior) -> None: return await visitor.visit_role_is(self) +@dataclass +class DoiIs(FieldComparison): + """Compare the doi against a list of values.""" + + values: Nel[str] + + @property + def field(self) -> Field: + """The field name.""" + return Field.doi + + @property + def cmp(self) -> Comparison: + """The comparison operation.""" + return Comparison.is_equal + + def _render_value(self) -> str: + return self.values.mk_string(",", Helper.quote) + + async def accept(self, visitor: SegmentVisitior) -> None: + """Apply this to the visitor.""" + return await visitor.visit_doi_is(self) + + +@dataclass +class PublisherNameIs(FieldComparison): + """Compare the publisher name against a list of values.""" + + values: Nel[str] + + @property + def field(self) -> Field: + """The field name.""" + return Field.publisher_name + + @property + def cmp(self) -> Comparison: + """The comparison operation.""" + return Comparison.is_equal + + def _render_value(self) -> str: + return self.values.mk_string(",", Helper.quote) + + async def accept(self, visitor: SegmentVisitior) -> None: + """Apply this to the visitor.""" + return await visitor.visit_publisher_name_is(self) + + @dataclass class Text(SegmentBase): """A query part that is not corresponding to a specific field.""" @@ -712,6 +762,8 @@ async def accept(self, visitor: SegmentVisitior) -> None: | RoleIs | InheritedMemberIs | DirectMemberIs + | DoiIs + | PublisherNameIs ) @@ -812,6 +864,16 @@ def role_is(cls, role: Role, *args: Role) -> Segment: """Return role-is query segment.""" return RoleIs(Nel(role, list(args))) + @classmethod + def doi_is(cls, doi: str, *args: str) -> Segment: + """Return slug-is query segment.""" + return DoiIs(Nel(doi, list(args))) + + @classmethod + def publisher_name_is(cls, publisher_name: str, *args: str) -> Segment: + """Return slug-is query segment.""" + return PublisherNameIs(Nel(publisher_name, list(args))) + @dataclass class UserQuery: @@ -920,6 +982,16 @@ async def 
visit_inherited_member_is(self, ft: InheritedMemberIs) -> None: """Visit inherited-member-is.""" ... + @abstractmethod + async def visit_doi_is(self, ft: DoiIs) -> None: + """Visit doi-is.""" + ... + + @abstractmethod + async def visit_publisher_name_is(self, ft: PublisherNameIs) -> None: + """Visit doi-is.""" + ... + class UserQueryVisitor[T](SegmentVisitior): """A visitor to transform user queries.""" @@ -1001,6 +1073,16 @@ async def visit_visibility_is(self, ft: VisibilityIs) -> None: """Forwards to `visit_field_term`.""" return await self.visit_field_term(ft) + @override + async def visit_doi_is(self, ft: DoiIs) -> None: + """Forwards to `visit_field_term`.""" + return await self.visit_field_term(ft) + + @override + async def visit_publisher_name_is(self, ft: PublisherNameIs) -> None: + """Forwards to `visit_field_term`.""" + return await self.visit_field_term(ft) + class EmptyUserQueryVisitor[T](UserQueryFieldTermVisitor[T]): """A visitor with every method doing nothing. diff --git a/components/renku_data_services/search/user_query_parser.py b/components/renku_data_services/search/user_query_parser.py index 34adf8c85..4a5c39748 100644 --- a/components/renku_data_services/search/user_query_parser.py +++ b/components/renku_data_services/search/user_query_parser.py @@ -28,6 +28,7 @@ CreatedByIs, DateTimeCalc, DirectMemberIs, + DoiIs, Field, Helper, IdIs, @@ -40,6 +41,7 @@ PartialDate, PartialDateTime, PartialTime, + PublisherNameIs, RelativeDate, RoleIs, SlugIs, @@ -115,6 +117,10 @@ def _make_field_term(args: tuple[str, Nel[str]]) -> Parser: return success(NamespaceIs(values)) case Field.created_by: return success(CreatedByIs(values)) + case Field.doi: + return success(DoiIs(values)) + case Field.publisher_name: + return success(PublisherNameIs(values)) case _: return fail(f"Invalid field name: {field}") diff --git a/components/renku_data_services/solr/entity_documents.py b/components/renku_data_services/solr/entity_documents.py index cf56f53b0..7cb556c85 100644 --- a/components/renku_data_services/solr/entity_documents.py +++ b/components/renku_data_services/solr/entity_documents.py @@ -237,6 +237,8 @@ class DataConnector(EntityDoc, frozen=True): isNamespace: Annotated[Literal[False], BeforeValidator(lambda e: False)] = False namespaceDetails: ResponseBody | None = None creatorDetails: ResponseBody | None = None + publisherName: str | None = None + doi: str | None = None @property def entity_type(self) -> EntityType: @@ -287,7 +289,7 @@ def _add_tzinfo(cls, v: datetime) -> datetime: @classmethod def from_dict(cls, d: dict[str, Any]) -> DataConnector: - """Create a Project from a dictionary.""" + """Create a data connector from a dictionary.""" return DataConnector.model_validate(d) diff --git a/components/renku_data_services/solr/entity_schema.py b/components/renku_data_services/solr/entity_schema.py index 4def2d01f..092a0ed68 100644 --- a/components/renku_data_services/solr/entity_schema.py +++ b/components/renku_data_services/solr/entity_schema.py @@ -50,6 +50,10 @@ class Fields: creator_details: Final[FieldName] = FieldName("creatorDetails") namespace_details: Final[FieldName] = FieldName("namespaceDetails") + # data connector fields + doi: Final[FieldName] = FieldName("doi") + publisher_name: Final[FieldName] = FieldName("publisherName") + class Analyzers: """A collection of analyzers.""" @@ -155,4 +159,14 @@ class FieldTypes: ], requires_reindex=True, ), + SchemaMigration( + version=13, + commands=[ + AddCommand(Field.of(Fields.doi, FieldTypes.string)), + 
AddCommand(CopyFieldRule(source=Fields.doi, dest=Fields.content_all)), + AddCommand(Field.of(Fields.publisher_name, FieldTypes.string)), + AddCommand(CopyFieldRule(source=Fields.publisher_name, dest=Fields.content_all)), + ], + requires_reindex=False, + ), ] diff --git a/components/renku_data_services/solr/solr_schema.py b/components/renku_data_services/solr/solr_schema.py index e54ccdca2..916faf438 100644 --- a/components/renku_data_services/solr/solr_schema.py +++ b/components/renku_data_services/solr/solr_schema.py @@ -110,7 +110,9 @@ class FieldTypeClasses: type_float = FieldTypeClass("FloatPointField") type_double = FieldTypeClass("DoublePointField") type_text = FieldTypeClass("TextField") + """TextField gets tokenized in Solr by default in our deployment.""" type_str = FieldTypeClass("StrField") + """StrField does not get tokenized in Solr by default in our deployment.""" type_uuid = FieldTypeClass("UUIDField") type_rank = FieldTypeClass("RankField") type_date_point = FieldTypeClass("DatePointField") From 21baf9d1d2be068cc7faecce5a0bd3915d68791d Mon Sep 17 00:00:00 2001 From: Tasko Olevski Date: Fri, 12 Dec 2025 15:29:08 +0100 Subject: [PATCH 06/10] fix: case insensitive doi, publisher_name, keywds --- .../renku_data_services/solr/entity_schema.py | 24 +++++++++++++++++-- .../renku_data_services/solr/solr_schema.py | 8 +++++++ .../renku_data_services/storage/rclone.py | 4 ++-- 3 files changed, 32 insertions(+), 4 deletions(-) diff --git a/components/renku_data_services/solr/entity_schema.py b/components/renku_data_services/solr/entity_schema.py index 092a0ed68..824b068d3 100644 --- a/components/renku_data_services/solr/entity_schema.py +++ b/components/renku_data_services/solr/entity_schema.py @@ -11,6 +11,7 @@ FieldName, FieldType, Filters, + ReplaceCommand, SchemaCommand, Tokenizers, TypeName, @@ -79,6 +80,11 @@ class Analyzers: ], ) + keyword_case_insensitive: Final[Analyzer] = Analyzer( + tokenizer=Tokenizers.keyword, + filters=[Filters.lowercase], + ) + class FieldTypes: """A collection of field types.""" @@ -99,6 +105,12 @@ class FieldTypes: ) date_time: Final[FieldType] = FieldType.date_time_point(TypeName("SearchDateTime")) + keyword: Final[FieldType] = ( + FieldType.text(TypeName("Keyword")).make_stored().with_analyzer(Analyzers.keyword_case_insensitive) + ) + """keyword is a field type that is not changed at all by the tokenizer, and is stored unchanged + but is searched in case-insensitive manner. 
Note, analyzers cannot be added to StrField, so we use TextField.""" + initial_entity_schema: Final[list[SchemaCommand]] = [ AddCommand(FieldTypes.id), @@ -162,11 +174,19 @@ class FieldTypes: SchemaMigration( version=13, commands=[ - AddCommand(Field.of(Fields.doi, FieldTypes.string)), + AddCommand(FieldTypes.keyword), + AddCommand(Field.of(Fields.doi, FieldTypes.keyword)), AddCommand(CopyFieldRule(source=Fields.doi, dest=Fields.content_all)), - AddCommand(Field.of(Fields.publisher_name, FieldTypes.string)), + AddCommand(Field.of(Fields.publisher_name, FieldTypes.keyword)), AddCommand(CopyFieldRule(source=Fields.publisher_name, dest=Fields.content_all)), ], requires_reindex=False, ), + SchemaMigration( + version=14, + commands=[ + ReplaceCommand(Field.of(Fields.keywords, FieldTypes.keyword).make_multi_valued()), + ], + requires_reindex=True, + ), ] diff --git a/components/renku_data_services/solr/solr_schema.py b/components/renku_data_services/solr/solr_schema.py index 916faf438..f72c44230 100644 --- a/components/renku_data_services/solr/solr_schema.py +++ b/components/renku_data_services/solr/solr_schema.py @@ -46,6 +46,10 @@ class Tokenizers: icu: Tokenizer = Tokenizer(name="icu") openNlp: Tokenizer = Tokenizer(name="openNlp") + # The keyword tokenizer treats the entire field as a single token + # See https://solr.apache.org/guide/solr/latest/indexing-guide/tokenizers.html#keyword-tokenizer + keyword: Tokenizer = Tokenizer(name="keyword") + @final class Filter(BaseModel): @@ -156,6 +160,10 @@ def with_index_analyzer(self, a: Analyzer) -> Self: """Return a copy with index analyzers set to the given one.""" return self.model_copy(update={"indexAnalyzer": a}) + def make_stored(self) -> Self: + """Make the field "stored" so that original value of the field is stored and can be retrieved.""" + return self.model_copy(update={"stored": True}) + @classmethod def id(cls, name: TypeName) -> FieldType: """Create a field that can be used as a document id.""" diff --git a/components/renku_data_services/storage/rclone.py b/components/renku_data_services/storage/rclone.py index 93a5b8b4e..df0b437cf 100644 --- a/components/renku_data_services/storage/rclone.py +++ b/components/renku_data_services/storage/rclone.py @@ -56,7 +56,7 @@ def validate(self, configuration: Union[RCloneConfig, dict[str, Any]], keep_sens provider.validate_config(configuration, keep_sensitive=keep_sensitive) def validate_sensitive_data( - self, configuration: Union["RCloneConfig", dict[str, Any]], sensitive_data: dict[str, str] + self, configuration: Union[RCloneConfig, dict[str, Any]], sensitive_data: dict[str, str] ) -> None: """Validates whether the provided sensitive data is marked as sensitive in the rclone schema.""" sensitive_options = self.get_provider(configuration).sensitive_options @@ -68,7 +68,7 @@ def validate_sensitive_data( continue raise errors.ValidationError(message=f"The '{key}' property is not marked as sensitive.") - def get_real_configuration(self, configuration: Union["RCloneConfig", dict[str, Any]]) -> dict[str, Any]: + def get_real_configuration(self, configuration: Union[RCloneConfig, dict[str, Any]]) -> dict[str, Any]: """Converts a Renku rclone configuration to a real rclone config.""" real_config = dict(configuration) From b2adaf76f56620e4d24ab8c89654b1c754d6fe16 Mon Sep 17 00:00:00 2001 From: Tasko Olevski Date: Wed, 17 Dec 2025 15:44:52 +0100 Subject: [PATCH 07/10] chore: mock tests to succeed if envidat is unavailable --- .../data_connectors/core.py | 20 +++++++-- 
.../data_connectors/doi/schema_org.py | 4 +- .../storage/rclone_patches.py | 3 -- .../data_api/test_data_connectors.py | 45 +++++++++++++++++-- test/conftest.py | 8 ++++ .../test_metadata.py => constants.py} | 8 +--- 6 files changed, 72 insertions(+), 16 deletions(-) rename test/{components/renku_data_services/data_connectors/test_metadata.py => constants.py} (97%) diff --git a/components/renku_data_services/data_connectors/core.py b/components/renku_data_services/data_connectors/core.py index 798fba48d..630c5fd20 100644 --- a/components/renku_data_services/data_connectors/core.py +++ b/components/renku_data_services/data_connectors/core.py @@ -478,9 +478,19 @@ async def convert_envidat_v1_data_connector_to_s3( config = payload.configuration doi = config.get("doi") if not isinstance(doi, str): - raise errors.ValidationError() + if doi is None: + raise errors.ValidationError( + message="Cannot get configuration for Envidat data connector because " + "the doi is missing from the payload." + ) + raise errors.ValidationError( + message=f"Cannot get configuration for Envidat data connector because the doi '{doi}' " + "in the payload is not a string." + ) if len(doi) == 0: - raise errors.ValidationError() + raise errors.ValidationError( + message="Cannot get configuration for Envidat data connector because the doi is a string with zero length." + ) doi = DOI(doi) new_config = payload.model_copy(deep=True) @@ -494,7 +504,11 @@ async def convert_envidat_v1_data_connector_to_s3( async with clnt: res = await clnt.get(envidat_url, params=query_params, headers=headers) if res.status_code != 200: - raise errors.ProgrammingError() + raise errors.ValidationError( + message="Cannot get configuration for Envidat data connector because Envidat responded " + f"with an unexpected {res.status_code} status code at {res.url}.", + detail=f"Response from envidat: {res.text}", + ) dataset = SchemaOrgDataset.model_validate_json(res.text) s3_config = schema_org.get_rclone_config( dataset, diff --git a/components/renku_data_services/data_connectors/doi/schema_org.py b/components/renku_data_services/data_connectors/doi/schema_org.py index 23d8bfa8a..8595a5c31 100644 --- a/components/renku_data_services/data_connectors/doi/schema_org.py +++ b/components/renku_data_services/data_connectors/doi/schema_org.py @@ -2,6 +2,7 @@ from dataclasses import dataclass from enum import StrEnum +from pathlib import PurePosixPath from urllib.parse import parse_qs, urlparse from renku_data_services.data_connectors.doi.models import SchemaOrgDataset @@ -25,7 +26,8 @@ class S3Config: @property def path(self) -> str: """Return the path including the bucket name and the prefix.""" - return f"/{self.bucket}{self.prefix}" + # NOTE: PurePosixPath("/test") / "/subfolder" == /subfolder, so we still have to strip / + return (PurePosixPath("/") / PurePosixPath(self.bucket) / self.prefix.lstrip("/")).as_posix() def get_rclone_config(dataset: SchemaOrgDataset, provider: DatasetProvider) -> S3Config: diff --git a/components/renku_data_services/storage/rclone_patches.py b/components/renku_data_services/storage/rclone_patches.py index 24c135fe6..c03f4b50f 100644 --- a/components/renku_data_services/storage/rclone_patches.py +++ b/components/renku_data_services/storage/rclone_patches.py @@ -354,9 +354,6 @@ def apply_patches(spec: list[dict[str, Any]]) -> None: __patch_schema_remove_oauth_propeties, __patch_polybox_storage, __patch_switchdrive_storage, - # __add_custom_doi_s3_provider("Envidat", "Envidat data provider", ENVIDAT_V1_PROVIDER), - # TODO: 
Enable Scicat when it is ready in production - # __add_custom_doi_s3_provider("SciCat", "SciCat data provider", SCICAT_V1_PROVIDER), __patch_schema_remove_banned_sftp_options, __patch_schema_add_openbis_type, ] diff --git a/test/bases/renku_data_services/data_api/test_data_connectors.py b/test/bases/renku_data_services/data_api/test_data_connectors.py index 6aa6341ea..ccb0fe9e9 100644 --- a/test/bases/renku_data_services/data_api/test_data_connectors.py +++ b/test/bases/renku_data_services/data_api/test_data_connectors.py @@ -2480,7 +2480,40 @@ async def _mock(*args, **kwargs) -> DOIMetadata | None: ) -async def test_validate_envidat_data_connector() -> None: +def _mock_get_envidat_metadata(metadata: DOIMetadata, sanic_client: SanicASGITestClient, monkeypatch: "MonkeyPatch"): + """Mock the _get_envidat_metadata method.""" + + # The Evnidat API may be unresponsive, so we mock its response + from renku_data_services.data_connectors.doi import metadata as metadata_mod + + _orig_get_envidat_metadata = metadata_mod._get_envidat_metadata + + def _mock_get_envidat_metadata(original_fn): + async def _mock(*args, **kwargs) -> DOIMetadata | None: + fetched_metadata = await original_fn(*args, **kwargs) + if fetched_metadata is not None: + assert fetched_metadata == metadata + return fetched_metadata + + warnings.warn("Could not retrieve DOI metadata, returning saved one", stacklevel=2) + return metadata + + return _mock + + monkeypatch.setattr(metadata_mod, "_get_envidat_metadata", _mock_get_envidat_metadata(_orig_get_envidat_metadata)) + monkeypatch.setattr( + metadata_mod, + "_get_envidat_metadata", + _mock_get_envidat_metadata(_orig_get_envidat_metadata), + ) + + +async def test_validate_envidat_data_connector( + envidat_metadata: DOIMetadata, + sanic_client: SanicASGITestClient, + monkeypatch: "MonkeyPatch", +) -> None: + _mock_get_envidat_metadata(envidat_metadata, sanic_client, monkeypatch) body = GlobalDataConnectorPost( storage=CloudStorageCorePost( storage_type="doi", @@ -2510,10 +2543,16 @@ async def test_validate_envidat_data_connector() -> None: assert res.publisher_name is not None -async def test_add_envidat_data_connector(sanic_client: SanicASGITestClient, user_headers) -> None: +async def test_add_envidat_data_connector( + sanic_client: SanicASGITestClient, + user_headers: dict[str, str], + envidat_metadata: DOIMetadata, + monkeypatch: "MonkeyPatch", +) -> None: + _mock_get_envidat_metadata(envidat_metadata, sanic_client, monkeypatch) payload = { "storage": { - "configuration": {"type": "doi", "doi": "10.16904/envidat.716"}, + "configuration": {"type": "doi", "doi": "10.16904/12"}, "source_path": "/", "target_path": "/", "readonly": True, diff --git a/test/conftest.py b/test/conftest.py index 952a0e3c6..08cf4f2b1 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -29,12 +29,14 @@ from renku_data_services.app_config import logging from renku_data_services.authz.config import AuthzConfig from renku_data_services.data_api.dependencies import DependencyManager +from renku_data_services.data_connectors.doi.models import DOIMetadata, SchemaOrgDataset from renku_data_services.db_config.config import DBConfig from renku_data_services.secrets_storage_api.dependencies import DependencyManager as SecretsDependencyManager from renku_data_services.solr import entity_schema from renku_data_services.solr.solr_client import SolrClientConfig from renku_data_services.solr.solr_migrate import SchemaMigrator from renku_data_services.users import models as user_preferences_models +from test.constants 
import envidat_sample_response from test.utils import TestDependencyManager @@ -489,3 +491,9 @@ def pytest_runtest_setup(item): ), append=False, ) + + +@pytest.fixture() +def envidat_metadata() -> DOIMetadata: + md = SchemaOrgDataset.model_validate_json(envidat_sample_response) + return DOIMetadata(name=md.name, description=md.description or "", keywords=md.keywords) diff --git a/test/components/renku_data_services/data_connectors/test_metadata.py b/test/constants.py similarity index 97% rename from test/components/renku_data_services/data_connectors/test_metadata.py rename to test/constants.py index 3e76dd5b1..a75859ebf 100644 --- a/test/components/renku_data_services/data_connectors/test_metadata.py +++ b/test/constants.py @@ -1,6 +1,6 @@ -from renku_data_services.data_connectors.doi.models import SchemaOrgDataset +from typing import Final -envidat_sample_response = """ +envidat_sample_response: Final[str] = """ { "@context": "http://schema.org", "@type": "https://schema.org/Dataset", @@ -180,7 +180,3 @@ "license": "https://opendefinition.org/licenses/odc-odbl" } """ # noqa: E501 - - -def test_envidat_metadata_parsing() -> None: - SchemaOrgDataset.model_validate_json(envidat_sample_response) From 7ae72c72c010bd193c09af722a2d30bedd262c6e Mon Sep 17 00:00:00 2001 From: Tasko Olevski Date: Wed, 17 Dec 2025 16:42:14 +0100 Subject: [PATCH 08/10] squashme: make alembic history linear --- .../bd97866a6253_add_doi_and_publisher_info_for_global_.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/components/renku_data_services/migrations/versions/bd97866a6253_add_doi_and_publisher_info_for_global_.py b/components/renku_data_services/migrations/versions/bd97866a6253_add_doi_and_publisher_info_for_global_.py index 2bdc7010f..cf73a0d41 100644 --- a/components/renku_data_services/migrations/versions/bd97866a6253_add_doi_and_publisher_info_for_global_.py +++ b/components/renku_data_services/migrations/versions/bd97866a6253_add_doi_and_publisher_info_for_global_.py @@ -1,7 +1,7 @@ """add doi and publisher info for global data connectors Revision ID: bd97866a6253 -Revises: 42049656cdb8 +Revises: 780302876bce Create Date: 2025-12-03 09:38:17.534403 """ @@ -11,7 +11,7 @@ # revision identifiers, used by Alembic. 
revision = "bd97866a6253" -down_revision = "42049656cdb8" +down_revision = "780302876bce" branch_labels = None depends_on = None From 67bd42bf6e573373b7535a72362bcfa6cd8bc33d Mon Sep 17 00:00:00 2001 From: Tasko Olevski Date: Fri, 19 Dec 2025 10:16:59 +0100 Subject: [PATCH 09/10] squashme: address comments --- .../data_connectors/core.py | 19 +++++++++++-------- .../renku_data_services/search/user_query.py | 4 ++-- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/components/renku_data_services/data_connectors/core.py b/components/renku_data_services/data_connectors/core.py index 630c5fd20..3d8a106ab 100644 --- a/components/renku_data_services/data_connectors/core.py +++ b/components/renku_data_services/data_connectors/core.py @@ -267,12 +267,16 @@ async def validate_unsaved_global_data_connector( # Assign user-friendly target_path if possible target_path = data_connector.slug - if metadata: - # If there is no metdata, the slug is derived from the name, and the name is the doi - # So without metadata if we extend the target_path it just repeats the slug twice + target_path_extension: str | None = None + with contextlib.suppress(errors.ValidationError): + target_path_extension = base_models.Slug.from_name(name).value + # If we were not able to get metadata about the dataset earlier, + # the slug and the name are essentially both the same and equal to the doi. + # And if we extend the target_path in this case it just repeats the slug twice. + # That is why we do the check below to avoid this case and avoid the ugly target path. + if target_path_extension and target_path != target_path_extension: with contextlib.suppress(errors.ValidationError): - name_slug = base_models.Slug.from_name(name).value - target_path = base_models.Slug.from_name(f"{name_slug[:30]}-{target_path}").value + target_path = base_models.Slug.from_name(f"{target_path_extension[:30]}-{target_path}").value # Override source_path and target_path storage = models.CloudStorageCore( @@ -496,13 +500,12 @@ async def convert_envidat_v1_data_connector_to_s3( new_config = payload.model_copy(deep=True) new_config.configuration = {} - envidat_url = "https://envidat.ch/converters-api/internal-dataset/convert/jsonld" - query_params = {"query": doi} + envidat_url = create_envidat_metadata_url(doi) headers = {"accept": "application/json"} clnt = httpx.AsyncClient(follow_redirects=True, timeout=5) async with clnt: - res = await clnt.get(envidat_url, params=query_params, headers=headers) + res = await clnt.get(envidat_url, headers=headers) if res.status_code != 200: raise errors.ValidationError( message="Cannot get configuration for Envidat data connector because Envidat responded " diff --git a/components/renku_data_services/search/user_query.py b/components/renku_data_services/search/user_query.py index a70023bce..f695f7a3e 100644 --- a/components/renku_data_services/search/user_query.py +++ b/components/renku_data_services/search/user_query.py @@ -866,12 +866,12 @@ def role_is(cls, role: Role, *args: Role) -> Segment: @classmethod def doi_is(cls, doi: str, *args: str) -> Segment: - """Return slug-is query segment.""" + """Return doi-is query segment.""" return DoiIs(Nel(doi, list(args))) @classmethod def publisher_name_is(cls, publisher_name: str, *args: str) -> Segment: - """Return slug-is query segment.""" + """Return publisher-name-is query segment.""" return PublisherNameIs(Nel(publisher_name, list(args))) From f8442672263aff0f870d68d5af1756b4a55afd15 Mon Sep 17 00:00:00 2001 From: Tasko Olevski Date: Fri, 19 Dec 2025 
10:35:58 +0100 Subject: [PATCH 10/10] squashme: add docs and tests --- .../search/query_manual.md | 32 +++++++++++++++++++ .../search/test_user_query_parser.py | 4 +++ 2 files changed, 36 insertions(+) diff --git a/components/renku_data_services/search/query_manual.md b/components/renku_data_services/search/query_manual.md index 93c68d9e6..7d912b138 100644 --- a/components/renku_data_services/search/query_manual.md +++ b/components/renku_data_services/search/query_manual.md @@ -217,6 +217,38 @@ silently ignored. This content will be replaced by the output of the code block above. +## DOIs + +These are applicable only for data connectors created from DOIs. +The DOI value provided is not changed at all by the query parser. +Note that we store all DOI values not as full URLs +(i.e. `https://www.doi.org/10.16904/envidat.714`) but +only as the value (i.e. `10.16904/envidat.714`). Future improvements +will probably remove these limitations and handle full URLs transparently. +The search will look up this term in a case-insensitive manner. + + + + + + + +This content will be replaced by the output of the code block above. + + +## Publisher names + +These are applicable only for data connectors created from DOIs. +The search will look up this term in a case-insensitive manner. + + + + + + + +This content will be replaced by the output of the code block above. + ## Dates diff --git a/test/components/renku_data_services/search/test_user_query_parser.py b/test/components/renku_data_services/search/test_user_query_parser.py index d73b9d8dd..da2e43aa4 100644 --- a/test/components/renku_data_services/search/test_user_query_parser.py +++ b/test/components/renku_data_services/search/test_user_query_parser.py @@ -15,6 +15,7 @@ CreatedByIs, DateTimeCalc, DirectMemberIs, + DoiIs, IdIs, InheritedMemberIs, KeywordIs, @@ -25,6 +26,7 @@ PartialDate, PartialDateTime, PartialTime, + PublisherNameIs, RelativeDate, RoleIs, SlugIs, @@ -197,6 +199,8 @@ def test_field_term() -> None: assert pp.field_term.parse("direct_member:123-456") == DirectMemberIs(Nel(UserId("123-456"))) assert pp.field_term.parse("inherited_member:@john") == InheritedMemberIs(Nel(Username.from_name("john"))) assert pp.field_term.parse("inherited_member:123-456") == InheritedMemberIs(Nel(UserId("123-456"))) + assert pp.field_term.parse("doi:10.16904/envidat.714") == DoiIs(Nel("10.16904/envidat.714")) + assert pp.field_term.parse("publisher_name:EnviDat") == PublisherNameIs(Nel("EnviDat")) def test_free_text() -> None:
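# ---------------------------------------------------------------------------
# Illustrative sketch, not part of the patch series above: how the two new
# search filters documented in query_manual.md are written by users, and how
# the corresponding query segments added in this series can be built in code.
# The DOI and publisher values are the examples already used in these patches;
# whether they resolve on a given deployment is an assumption.
#
# User-facing query syntax (both filters match case-insensitively):
#
#     doi:10.16904/envidat.714
#     publisher_name:envidat
#
# Assumptions: the classmethod helpers `doi_is` / `publisher_name_is` added in
# patch 05 are taken to live on the existing `Segments` helper class in
# renku_data_services.search.user_query (the hunk context above does not show
# the enclosing class name).
from renku_data_services.search.user_query import DoiIs, PublisherNameIs, Segments

# The helpers wrap the raw values in a non-empty list, exactly like the parser
# does for "doi:..." and "publisher_name:..." terms in the tests above.
doi_filter = Segments.doi_is("10.16904/envidat.714")
publisher_filter = Segments.publisher_name_is("envidat")
assert isinstance(doi_filter, DoiIs)
assert isinstance(publisher_filter, PublisherNameIs)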