Commit c318e8c (parent: 43d6a9b)

chore: cleanup code, add tests

8 files changed: 445 additions & 92 deletions

components/renku_data_services/data_connectors/core.py

Lines changed: 77 additions & 28 deletions
@@ -15,9 +15,11 @@
     NamespacePath,
     ProjectPath,
 )
-from renku_data_services.data_connectors import apispec, models, schema_org_dataset
+from renku_data_services.data_connectors import apispec, models
 from renku_data_services.data_connectors.constants import ALLOWED_GLOBAL_DATA_CONNECTOR_PROVIDERS
-from renku_data_services.data_connectors.doi.metadata import get_dataset_metadata
+from renku_data_services.data_connectors.doi import schema_org
+from renku_data_services.data_connectors.doi.metadata import create_envidat_metadata_url, get_dataset_metadata
+from renku_data_services.data_connectors.doi.models import DOI, SchemaOrgDataset
 from renku_data_services.storage import models as storage_models
 from renku_data_services.storage.constants import ENVIDAT_V1_PROVIDER
 from renku_data_services.storage.rclone import RCloneValidator
@@ -129,29 +131,37 @@ async def prevalidate_unsaved_global_data_connector(
     if not storage.readonly:
         raise errors.ValidationError(message="Global data connectors must be read-only")
 
-    match storage.storage_type:
-        case "doi":
-            rclone_metadata = await validator.get_doi_metadata(configuration=storage.configuration)
-
-            doi_uri = f"doi:{rclone_metadata.doi}"
-            slug = base_models.Slug.from_name(doi_uri).value
-
-            # Override provider in storage config
-            storage.configuration["provider"] = rclone_metadata.provider
-        case x if x == ENVIDAT_V1_PROVIDER:
-            if not isinstance(body.storage, apispec.CloudStorageCorePost):
-                raise errors.ValidationError()
-            doi = body.storage.configuration.get("doi")
-            if not doi:
-                raise errors.ValidationError()
-            doi_uri = f"doi:{doi}"
-            slug = base_models.Slug.from_name(doi_uri).value
-        case x:
+    rclone_metadata = await validator.get_doi_metadata(configuration=storage.configuration)
+    if rclone_metadata:
+        doi_uri = f"doi:{rclone_metadata.doi}"
+
+        # Override provider in storage config
+        storage.configuration["provider"] = rclone_metadata.provider
+        doi = DOI(rclone_metadata.doi)
+    else:
+        # The storage is not supported by rclone
+        if not isinstance(body.storage, apispec.CloudStorageCorePost):
+            raise errors.ValidationError(
+                message="When the data connector is not supported by rclone we cannot parse a storage URL."
+            )
+        # Try to see if we have a different type not directly supported by rclone - from envidat for example
+        doi_str = body.storage.configuration.get("doi")
+        if not isinstance(doi_str, str):
+            raise errors.ValidationError(message="A doi could not be found in the storage configuration.")
+        doi = DOI(doi_str)
+        host = await doi.resolve_host()
+        if not host:
+            raise errors.ValidationError(message=f"The provided doi {doi} cannot be resolved.")
+        doi_uri = f"doi:{doi}"
+        if host not in ["envidat.ch", "www.envidat.ch"]:
             raise errors.ValidationError(
-                message=f"Only {ALLOWED_GLOBAL_DATA_CONNECTOR_PROVIDERS} storage type is allowed "
-                "for global data connectors"
+                message="The doi for the global data connector resolved to an unsupported host"
             )
+        # Set the storage type and re-validate
+        body.storage.storage_type = ENVIDAT_V1_PROVIDER
+        storage = await validate_unsaved_storage(body.storage, validator=validator)
 
+    slug = base_models.Slug.from_name(doi_uri).value
     return models.UnsavedGlobalDataConnector(
         name=doi_uri,
         slug=slug,
@@ -160,6 +170,7 @@ async def prevalidate_unsaved_global_data_connector(
         storage=storage,
         description=None,
         keywords=[],
+        doi=doi,
     )
 
 
@@ -181,7 +192,12 @@ async def validate_unsaved_global_data_connector(
     # Fetch DOI metadata
     if data_connector.storage.storage_type == "doi":
         rclone_metadata = await validator.get_doi_metadata(configuration=data_connector.storage.configuration)
-        metadata = await get_dataset_metadata(rclone_metadata=rclone_metadata)
+        if not rclone_metadata:
+            raise errors.ValidationError()
+        metadata = await get_dataset_metadata(data_connector.storage.storage_type, rclone_metadata.metadata_url)
+    elif data_connector.storage.storage_type == ENVIDAT_V1_PROVIDER:
+        metadata_url = create_envidat_metadata_url(data_connector.doi)
+        metadata = await get_dataset_metadata(data_connector.storage.storage_type, metadata_url)
     else:
         metadata = None
 
@@ -228,6 +244,7 @@ async def validate_unsaved_global_data_connector(
         storage=storage,
         description=description or None,
         keywords=keywords,
+        doi=data_connector.doi,
     )
 
 
@@ -371,8 +388,7 @@ async def convert_envidat_v1_data_connector_to_s3(
         raise errors.ValidationError()
     if len(doi) == 0:
         raise errors.ValidationError()
-    doi = doi.removeprefix("https://")
-    doi = doi.removeprefix("http://")
+    doi = DOI(doi)
 
     new_config = payload.model_copy(deep=True)
     new_config.configuration = {}
@@ -386,12 +402,45 @@
         res = await clnt.get(envidat_url, params=query_params, headers=headers)
         if res.status_code != 200:
             raise errors.ProgrammingError()
-    dataset = schema_org_dataset.Dataset.model_validate_strings(res.text)
-    s3_config = schema_org_dataset.get_rclone_config(
+    dataset = SchemaOrgDataset.model_validate_strings(res.text)
+    s3_config = schema_org.get_rclone_config(
         dataset,
-        schema_org_dataset.DatasetProvider.envidat,
+        schema_org.DatasetProvider.envidat,
     )
     new_config.configuration = dict(s3_config.rclone_config)
     new_config.source_path = s3_config.path
     new_config.storage_type = "s3"
     return new_config
+
+
+# async def get_metadata(
+#     configuration: storage_models.RCloneConfig | dict[str, Any], validator: RCloneValidator
+# ) -> RCloneDOIMetadata | None:
+#     """Get metadata for the dataset."""
+#     if isinstance(configuration, storage_models.RCloneConfig):
+#         return await validator.get_doi_metadata(configuration)
+#     doi = configuration.get("doi")
+#     if not doi:
+#         return None
+#     parsed_doi = urlparse(doi)
+#     if parsed_doi.scheme.decode() not in ["http", "https"]:
+#         doi = urlunparse(parsed_doi._replace(scheme=b"https")).decode()
+#     clnt = httpx.AsyncClient(follow_redirects=True)
#     async with clnt:
#         res = await clnt.get(doi)
#     if res.status_code != 200:
#         return None
#     match res.url.host:
#         case "www.envidat.ch":
#
#
# async def get_envidat_metadata(doi: DOI) -> dict | None:
#     """Get metadata about the envidat dataset, the doi should not be a url."""
#     clnt = httpx.AsyncClient()
#     url = "https://envidat.ch/converters-api/internal-dataset/convert/jsonld"
#     params = {"query": doi}
#     async with clnt:
#         res = clnt.get(url, params=params)
#     if res.status_code != 200:
#         return None
#
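With the match statement gone, prevalidation now tries rclone's DOI metadata first and, only when that fails, resolves the DOI itself to check that it redirects to EnviDat. A minimal standalone sketch of that fallback using the DOI class added in doi/models.py below; the check_envidat_doi helper and the example DOI are illustrative, not part of the commit:

import asyncio

from renku_data_services.data_connectors.doi.models import DOI


async def check_envidat_doi(raw_doi: str) -> bool:
    """Mirror the host check from prevalidate_unsaved_global_data_connector."""
    doi = DOI(raw_doi)  # normalizes "doi:...", "https://doi.org/...", and bare forms
    host = await doi.resolve_host()  # follows redirects from https://doi.org/<doi>
    return host in ["envidat.ch", "www.envidat.ch"]


# Hypothetical usage; the DOI below is made up:
print(asyncio.run(check_envidat_doi("doi:10.16904/12")))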

components/renku_data_services/data_connectors/doi/metadata.py

Lines changed: 39 additions & 15 deletions
@@ -1,27 +1,27 @@
 """Metadata handling for DOIs."""
 
+from urllib.parse import urlencode
+
 import httpx
 from pydantic import ValidationError as PydanticValidationError
 
 from renku_data_services.data_connectors.doi import models
-from renku_data_services.storage.rclone import RCloneDOIMetadata
+from renku_data_services.storage.constants import ENVIDAT_V1_PROVIDER
 
 
-async def get_dataset_metadata(rclone_metadata: RCloneDOIMetadata) -> models.DOIMetadata | None:
+async def get_dataset_metadata(provider: str, metadata_url: str) -> models.DOIMetadata | None:
     """Retrieve DOI metadata."""
-    if rclone_metadata.provider == "invenio" or rclone_metadata.provider == "zenodo":
-        return await _get_dataset_metadata_invenio(rclone_metadata=rclone_metadata)
-    if rclone_metadata.provider == "dataverse":
-        return await _get_dataset_metadata_dataverse(rclone_metadata=rclone_metadata)
+    if provider == "invenio" or provider == "zenodo":
+        return await _get_dataset_metadata_invenio(metadata_url)
+    if provider == "dataverse":
+        return await _get_dataset_metadata_dataverse(metadata_url)
+    if provider == ENVIDAT_V1_PROVIDER:
+        return await _get_envidat_metadata(metadata_url)
     return None
 
 
-async def _get_dataset_metadata_invenio(rclone_metadata: RCloneDOIMetadata) -> models.DOIMetadata | None:
+async def _get_dataset_metadata_invenio(metadata_url: str) -> models.DOIMetadata | None:
     """Retrieve DOI metadata from the InvenioRDM API."""
-    metadata_url = rclone_metadata.metadata_url
-    if not metadata_url:
-        return None
-
     async with httpx.AsyncClient(timeout=5) as client:
         try:
             res = await client.get(url=metadata_url, follow_redirects=True, headers=[("accept", "application/json")])
@@ -43,11 +43,8 @@ async def _get_dataset_metadata_invenio(rclone_metadata: RCloneDOIMetadata) -> models.DOIMetadata | None:
     return models.DOIMetadata(name=name, description=description, keywords=keywords)
 
 
-async def _get_dataset_metadata_dataverse(rclone_metadata: RCloneDOIMetadata) -> models.DOIMetadata | None:
+async def _get_dataset_metadata_dataverse(metadata_url: str) -> models.DOIMetadata | None:
     """Retrieve DOI metadata from the Dataverse API."""
-    metadata_url = rclone_metadata.metadata_url
-    if not metadata_url:
-        return None
 
     async with httpx.AsyncClient(timeout=5) as client:
         try:
@@ -118,3 +115,30 @@ async def _get_dataset_metadata_dataverse(metadata_url: str) -> models.DOIMetadata | None:
     except PydanticValidationError:
         pass
     return models.DOIMetadata(name=name, description=description, keywords=keywords)
+
+
+def create_envidat_metadata_url(doi: models.DOI) -> str:
+    """Create the metadata url for envidat from a DOI."""
+    url = "https://envidat.ch/converters-api/internal-dataset/convert/jsonld"
+    params = urlencode({"query": doi})
+    return f"{url}?{params}"
+
+
+async def _get_envidat_metadata(metadata_url: str) -> models.DOIMetadata | None:
+    """Get metadata about the envidat dataset."""
+    clnt = httpx.AsyncClient(follow_redirects=True, timeout=5)
+    headers = {"accept": "application/json"}
+    async with clnt:
+        try:
+            res = await clnt.get(metadata_url, headers=headers)
+        except httpx.HTTPError:
+            return None
+    if res.status_code != 200:
+        return None
+    try:
+        parsed_metadata = models.SchemaOrgDataset.model_validate_json(res.text)
+    except PydanticValidationError:
+        return None
+    return models.DOIMetadata(
+        name=parsed_metadata.name, description=parsed_metadata.description or "", keywords=parsed_metadata.keywords
+    )
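get_dataset_metadata now takes a provider string and an explicit metadata URL instead of an RCloneDOIMetadata object, which lets the EnviDat path build its own URL. A usage sketch; the DOI value is invented for illustration:

import asyncio

from renku_data_services.data_connectors.doi.metadata import create_envidat_metadata_url, get_dataset_metadata
from renku_data_services.data_connectors.doi.models import DOI
from renku_data_services.storage.constants import ENVIDAT_V1_PROVIDER


async def main() -> None:
    doi = DOI("10.16904/12")  # made-up DOI for illustration
    # Yields https://envidat.ch/converters-api/internal-dataset/convert/jsonld?query=10.16904%2F12
    url = create_envidat_metadata_url(doi)
    # Returns a models.DOIMetadata on success, or None on any HTTP or validation failure
    metadata = await get_dataset_metadata(ENVIDAT_V1_PROVIDER, url)
    print(metadata)


asyncio.run(main())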

components/renku_data_services/data_connectors/doi/models.py

Lines changed: 75 additions & 3 deletions
@@ -1,9 +1,58 @@
 """Models for DOIs."""
 
+import re
 from dataclasses import dataclass
-from typing import Any
-
-from pydantic import BaseModel, Field
+from typing import Any, Self
+from urllib.parse import urlparse
+
+import httpx
+from pydantic import BaseModel, ConfigDict, Field
+
+from renku_data_services.errors import errors
+
+
+class DOI(str):
+    """A doi for a dataset or a similar resource."""
+
+    __regex = re.compile(r"^10\.\d{4,9}/\S+$", re.IGNORECASE)
+
+    def __new__(cls, doi: str) -> Self:
+        """Create a new doi.
+
+        A few cases possible:
+        doi:10.16904/12
+        10.16904/12
+        https://www.doi.org/10.16904/12
+        http://www.doi.org/10.16904/12
+        http://doi.org/10.16904/12
+        """
+        doi_parsed = urlparse(doi)
+        doi_clean = doi
+        if doi_parsed.netloc in ["www.doi.org", "doi.org"]:
+            if doi_parsed.scheme not in ["https", "http"]:
+                raise errors.ValidationError(
+                    message=f"Received the right doi.org host but an unexpected scheme {doi_parsed} for doi {doi}."
+                )
+            doi_clean = doi_parsed.path.strip("/")
+        if doi.startswith("doi:"):
+            doi_clean = doi[4:]
+        if not doi_clean or not DOI.__regex.match(doi_clean):
+            raise errors.ValidationError(message=f"The provided value {doi} is not a valid doi.")
+        return super().__new__(cls, doi_clean)
+
+    @property
+    def url(self) -> str:
+        """Return a proper URL from the doi."""
+        return f"https://doi.org/{self}"
+
+    async def resolve_host(self) -> str | None:
+        """Resolves the DOI and returns the hostname of the url where the redirect leads."""
+        clnt = httpx.AsyncClient(follow_redirects=True)
+        async with clnt:
+            res = await clnt.get(self.url)
+        if res.status_code != 200:
+            return None
+        return res.url.host
 
 
 @dataclass(frozen=True, eq=True, kw_only=True)
@@ -67,3 +116,26 @@ class DataverseDatasetResponse(BaseModel):
 
     status: str = Field()
     data: DataverseDataset | None = Field()
+
+
+class SchemaOrgDistribution(BaseModel):
+    """The distribution field of a schema.org dataset."""
+
+    model_config = ConfigDict(extra="ignore")
+    type: str = Field(alias="@type")
+    content_url: str = Field(alias="contentUrl")
+
+
+class SchemaOrgDataset(BaseModel):
+    """A very limited and partial spec of a schema.org Dataset used by Scicat and Envidat."""
+
+    model_config = ConfigDict(extra="ignore")
+    distribution: list[SchemaOrgDistribution] = Field(default_factory=list)
+    name: str = Field()
+    description: str | None = None
+    raw_keywords: str = Field(alias="keywords", default="")
+
+    @property
+    def keywords(self) -> list[str]:
+        """Split the single keywords string into a list."""
+        return self.raw_keywords.split()
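The DOI constructor normalizes the accepted spellings down to the bare identifier, and SchemaOrgDataset resolves the JSON-LD aliases and splits the single keywords string on whitespace. A quick sketch of that behavior; all inputs are invented:

from renku_data_services.data_connectors.doi.models import DOI, SchemaOrgDataset

# Every accepted spelling normalizes to the bare "10.16904/12" identifier:
assert DOI("doi:10.16904/12") == "10.16904/12"
assert DOI("https://www.doi.org/10.16904/12") == "10.16904/12"
assert DOI("10.16904/12").url == "https://doi.org/10.16904/12"

# Aliases map the JSON-LD field names, and keywords is split on whitespace:
dataset = SchemaOrgDataset.model_validate_json(
    '{"name": "Example", "keywords": "snow alps",'
    ' "distribution": [{"@type": "DataDownload", "contentUrl": "http://bucket.example.org/?prefix=data/"}]}'
)
assert dataset.keywords == ["snow", "alps"]
assert dataset.distribution[0].content_url == "http://bucket.example.org/?prefix=data/"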

components/renku_data_services/data_connectors/schema_org_dataset.py renamed to components/renku_data_services/data_connectors/doi/schema_org.py

Lines changed: 4 additions & 21 deletions
@@ -4,27 +4,10 @@
 from enum import StrEnum
 from urllib.parse import parse_qs, urlparse
 
-from pydantic import BaseModel, ConfigDict, Field
-
+from renku_data_services.data_connectors.doi.models import SchemaOrgDataset
 from renku_data_services.errors import errors
 
 
-class Distribution(BaseModel):
-    """The distribution field of a schema.org dataset."""
-
-    model_config = ConfigDict(extra="ignore")
-    type: str
-    contentUrl: str
-    name: str
-
-
-class Dataset(BaseModel):
-    """A very limited and partial spec of a schema.org Dataset used by Scicat and Envidat."""
-
-    model_config = ConfigDict(extra="ignore")
-    distribution: list[Distribution] = Field(default_factory=list)
-
-
 class DatasetProvider(StrEnum):
     """The provider for the dataset."""
 
@@ -45,7 +28,7 @@ def path(self) -> str:
         return f"{self.bucket}/{self.prefix}"
 
 
-def get_rclone_config(dataset: Dataset, provider: DatasetProvider) -> S3Config:
+def get_rclone_config(dataset: SchemaOrgDataset, provider: DatasetProvider) -> S3Config:
     """Parse the dataset into an rclone configuration."""
     match provider:
         case DatasetProvider.envidat:
@@ -55,10 +38,10 @@ def get_rclone_config(dataset: Dataset, provider: DatasetProvider) -> S3Config:
             raise errors.ValidationError(message=f"Got an unknown dataset provider {x}")
 
 
-def __get_rclone_s3_config_envidat(dataset: Dataset) -> S3Config:
+def __get_rclone_s3_config_envidat(dataset: SchemaOrgDataset) -> S3Config:
     """Get the S3 rclone configuration and source path from a dataset returned by envidat."""
     # NOTE: The folks from Envidat assure us that the first entity in the list is the one we want
-    url = dataset.distribution[0].contentUrl
+    url = dataset.distribution[0].content_url
     # NOTE: The folks from Envidat assure us that the URL has the following format
     # http://<bucket-name>.<s3 domain>/?prefix=<path to files>
     url_parsed = urlparse(url)
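The renamed module keeps the EnviDat-specific parsing: bucket from the hostname, prefix from the query string. A sketch of that extraction under the documented URL shape; the real __get_rclone_s3_config_envidat body and the S3Config constructor call are not shown in this diff, and the URL is invented:

from urllib.parse import parse_qs, urlparse

# Documented shape: http://<bucket-name>.<s3 domain>/?prefix=<path to files>
url = "http://mybucket.s3.example.org/?prefix=datasets/snow/"
url_parsed = urlparse(url)
bucket = url_parsed.netloc.split(".", maxsplit=1)[0]  # "mybucket"
prefix = parse_qs(url_parsed.query)["prefix"][0]  # "datasets/snow/"
# S3Config.path would then join them: f"{bucket}/{prefix}"
print(f"{bucket}/{prefix}")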

components/renku_data_services/data_connectors/models.py

Lines changed: 2 additions & 0 deletions
@@ -14,6 +14,7 @@
     NamespacePath,
     ProjectPath,
 )
+from renku_data_services.data_connectors.doi.models import DOI
 from renku_data_services.namespace.models import GroupNamespace, ProjectNamespace, UserNamespace
 from renku_data_services.utils.etag import compute_etag_from_fields
 
@@ -97,6 +98,7 @@ class UnsavedGlobalDataConnector(BaseDataConnector):
     """Global data connector model."""
 
     namespace: None = None
+    doi: DOI
 
 
 @dataclass(frozen=True, eq=True, kw_only=True)
