     NamespacePath,
     ProjectPath,
 )
-from renku_data_services.data_connectors import apispec, models, schema_org_dataset
+from renku_data_services.data_connectors import apispec, models
 from renku_data_services.data_connectors.constants import ALLOWED_GLOBAL_DATA_CONNECTOR_PROVIDERS
-from renku_data_services.data_connectors.doi.metadata import get_dataset_metadata
+from renku_data_services.data_connectors.doi import schema_org
+from renku_data_services.data_connectors.doi.metadata import create_envidat_metadata_url, get_dataset_metadata
+from renku_data_services.data_connectors.doi.models import DOI, SchemaOrgDataset
 from renku_data_services.storage import models as storage_models
 from renku_data_services.storage.constants import ENVIDAT_V1_PROVIDER
 from renku_data_services.storage.rclone import RCloneValidator
@@ -129,29 +131,37 @@ async def prevalidate_unsaved_global_data_connector(
     if not storage.readonly:
         raise errors.ValidationError(message="Global data connectors must be read-only")

-    match storage.storage_type:
-        case "doi":
-            rclone_metadata = await validator.get_doi_metadata(configuration=storage.configuration)
-
-            doi_uri = f"doi:{rclone_metadata.doi}"
-            slug = base_models.Slug.from_name(doi_uri).value
-
-            # Override provider in storage config
-            storage.configuration["provider"] = rclone_metadata.provider
-        case x if x == ENVIDAT_V1_PROVIDER:
-            if not isinstance(body.storage, apispec.CloudStorageCorePost):
-                raise errors.ValidationError()
-            doi = body.storage.configuration.get("doi")
-            if not doi:
-                raise errors.ValidationError()
-            doi_uri = f"doi:{doi}"
-            slug = base_models.Slug.from_name(doi_uri).value
-        case x:
+    rclone_metadata = await validator.get_doi_metadata(configuration=storage.configuration)
+    if rclone_metadata:
+        doi_uri = f"doi:{rclone_metadata.doi}"
+
+        # Override provider in storage config
+        storage.configuration["provider"] = rclone_metadata.provider
+        doi = DOI(rclone_metadata.doi)
+    else:
+        # The storage is not supported by rclone
+        if not isinstance(body.storage, apispec.CloudStorageCorePost):
+            raise errors.ValidationError(
+                message="When the data connector is not supported by rclone, we cannot parse a storage URL."
+            )
+        # Try to see if we have a type that rclone does not support directly, e.g. from EnviDat
+        doi_str = body.storage.configuration.get("doi")
+        if not isinstance(doi_str, str):
+            raise errors.ValidationError(message="A DOI could not be found in the storage configuration.")
+        doi = DOI(doi_str)
+        host = await doi.resolve_host()
+        if not host:
+            raise errors.ValidationError(message=f"The provided DOI {doi} cannot be resolved.")
+        doi_uri = f"doi:{doi}"
+        if host not in ["envidat.ch", "www.envidat.ch"]:
             raise errors.ValidationError(
-                message=f"Only {ALLOWED_GLOBAL_DATA_CONNECTOR_PROVIDERS} storage type is allowed "
-                "for global data connectors"
+                message="The DOI for the global data connector resolved to an unsupported host."
             )
+        # Set the storage type and re-validate
+        body.storage.storage_type = ENVIDAT_V1_PROVIDER
+        storage = await validate_unsaved_storage(body.storage, validator=validator)

+    slug = base_models.Slug.from_name(doi_uri).value
     return models.UnsavedGlobalDataConnector(
         name=doi_uri,
         slug=slug,
@@ -160,6 +170,7 @@ async def prevalidate_unsaved_global_data_connector(
         storage=storage,
         description=None,
         keywords=[],
+        doi=doi,
     )


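The fallback branch above relies on `DOI.resolve_host()` from `renku_data_services.data_connectors.doi.models`, whose body is not part of this diff. Below is a minimal sketch of what such a resolution step could look like, assuming it simply follows the doi.org redirect chain with httpx and reports the final host; the helper name `resolve_doi_host` and its details are illustrative assumptions, not the project's implementation.

```python
from urllib.parse import urlparse

import httpx


async def resolve_doi_host(doi: str) -> str | None:
    """Follow redirects for a DOI and return the host that ends up serving the landing page."""
    # Bare DOIs (e.g. "10.1234/example") are resolved through doi.org; full URLs are used as-is.
    url = doi if urlparse(doi).scheme in ("http", "https") else f"https://doi.org/{doi}"
    async with httpx.AsyncClient(follow_redirects=True) as client:
        res = await client.get(url)
    if res.status_code != 200:
        return None
    return res.url.host
```

An EnviDat DOI would then resolve to `envidat.ch` or `www.envidat.ch`, which is exactly the host check the new code performs before switching the storage type to `ENVIDAT_V1_PROVIDER` and re-validating.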
@@ -181,7 +192,12 @@ async def validate_unsaved_global_data_connector(
     # Fetch DOI metadata
     if data_connector.storage.storage_type == "doi":
         rclone_metadata = await validator.get_doi_metadata(configuration=data_connector.storage.configuration)
-        metadata = await get_dataset_metadata(rclone_metadata=rclone_metadata)
+        if not rclone_metadata:
+            raise errors.ValidationError()
+        metadata = await get_dataset_metadata(data_connector.storage.storage_type, rclone_metadata.metadata_url)
+    elif data_connector.storage.storage_type == ENVIDAT_V1_PROVIDER:
+        metadata_url = create_envidat_metadata_url(data_connector.doi)
+        metadata = await get_dataset_metadata(data_connector.storage.storage_type, metadata_url)
     else:
         metadata = None

@@ -228,6 +244,7 @@ async def validate_unsaved_global_data_connector(
         storage=storage,
         description=description or None,
         keywords=keywords,
+        doi=data_connector.doi,
     )


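The EnviDat branch above delegates to `create_envidat_metadata_url` and `get_dataset_metadata`, neither of which is shown in this diff. The sketch below is a rough guess at the shape they might take, reusing the converters-api endpoint that appears in the commented-out draft at the bottom of this file; the `_sketch` suffixes mark these as illustrative stand-ins rather than the real signatures.

```python
from urllib.parse import quote

import httpx


def create_envidat_metadata_url_sketch(doi: str) -> str:
    """Build the EnviDat converters-api URL that returns schema.org JSON-LD for a DOI."""
    return f"https://envidat.ch/converters-api/internal-dataset/convert/jsonld?query={quote(doi, safe='')}"


async def get_dataset_metadata_sketch(metadata_url: str) -> dict | None:
    """Download the JSON-LD metadata document, returning None when the dataset cannot be found."""
    async with httpx.AsyncClient(follow_redirects=True) as client:
        res = await client.get(metadata_url)
    if res.status_code != 200:
        return None
    return res.json()
```

Whatever the real helpers return, the rest of `validate_unsaved_global_data_connector` uses the fetched metadata to fill in the connector's description and keywords, as the later hunk shows.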
@@ -371,8 +388,7 @@ async def convert_envidat_v1_data_connector_to_s3(
         raise errors.ValidationError()
     if len(doi) == 0:
         raise errors.ValidationError()
-    doi = doi.removeprefix("https://")
-    doi = doi.removeprefix("http://")
+    doi = DOI(doi)

     new_config = payload.model_copy(deep=True)
     new_config.configuration = {}
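The two `removeprefix` calls are replaced by the `DOI` value type, whose constructor is not shown here. A guess at the kind of normalization it might perform, reducing URLs and `doi:` prefixes to the bare identifier (purely illustrative; the real class may behave differently):

```python
import re


def normalize_doi(raw: str) -> str:
    """Reduce a DOI given as a URL or with a doi: prefix to the bare 10.xxxx/... identifier."""
    doi = raw.strip()
    doi = re.sub(r"^https?://", "", doi)         # drop the scheme, as the old code did
    doi = re.sub(r"^(dx\.)?doi\.org/", "", doi)  # drop the resolver host if present
    doi = re.sub(r"^doi:", "", doi)              # drop a doi: prefix
    return doi
```

Both `normalize_doi("https://doi.org/10.1234/example")` and `normalize_doi("10.1234/example")` would then yield the same identifier (the DOI itself is hypothetical).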
@@ -386,12 +402,45 @@ async def convert_envidat_v1_data_connector_to_s3(
         res = await clnt.get(envidat_url, params=query_params, headers=headers)
     if res.status_code != 200:
         raise errors.ProgrammingError()
-    dataset = schema_org_dataset.Dataset.model_validate_strings(res.text)
-    s3_config = schema_org_dataset.get_rclone_config(
+    dataset = SchemaOrgDataset.model_validate_strings(res.text)
+    s3_config = schema_org.get_rclone_config(
         dataset,
-        schema_org_dataset.DatasetProvider.envidat,
+        schema_org.DatasetProvider.envidat,
     )
     new_config.configuration = dict(s3_config.rclone_config)
     new_config.source_path = s3_config.path
     new_config.storage_type = "s3"
     return new_config
+
+
+# async def get_metadata(
+#     configuration: storage_models.RCloneConfig | dict[str, Any], validator: RCloneValidator
+# ) -> RCloneDOIMetadata | None:
+#     """Get metadata for the dataset."""
+#     if isinstance(configuration, storage_models.RCloneConfig):
+#         return await validator.get_doi_metadata(configuration)
+#     doi = configuration.get("doi")
+#     if not doi:
+#         return None
+#     parsed_doi = urlparse(doi)
+#     if parsed_doi.scheme not in ["http", "https"]:
+#         doi = urlunparse(parsed_doi._replace(scheme="https"))
+#     clnt = httpx.AsyncClient(follow_redirects=True)
+#     async with clnt:
+#         res = await clnt.get(doi)
+#     if res.status_code != 200:
+#         return None
+#     match res.url.host:
+#         case "www.envidat.ch":
+#
+#
+# async def get_envidat_metadata(doi: DOI) -> dict | None:
+#     """Get metadata about the EnviDat dataset; the DOI should not be a URL."""
+#     clnt = httpx.AsyncClient()
+#     url = "https://envidat.ch/converters-api/internal-dataset/convert/jsonld"
+#     params = {"query": doi}
+#     async with clnt:
+#         res = await clnt.get(url, params=params)
+#     if res.status_code != 200:
+#         return None
+#
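The conversion in the hunk above hands the parsed `SchemaOrgDataset` to `schema_org.get_rclone_config`, which this diff does not show. As a very rough illustration of the kind of mapping involved, the sketch below splits a path-style S3 `contentUrl` from a schema.org distribution into an endpoint plus bucket path; the dataclass, the field names, and the assumption that EnviDat publishes path-style S3 URLs are all guesses, not the project's implementation.

```python
from dataclasses import dataclass
from urllib.parse import urlparse


@dataclass
class S3RcloneConfigSketch:
    rclone_config: dict[str, str]
    path: str


def rclone_config_from_content_url(content_url: str) -> S3RcloneConfigSketch:
    """Split a path-style S3 contentUrl into an rclone s3 config and a source path."""
    parsed = urlparse(content_url)
    bucket, _, prefix = parsed.path.lstrip("/").partition("/")
    return S3RcloneConfigSketch(
        rclone_config={
            "type": "s3",
            "provider": "Other",
            "endpoint": f"{parsed.scheme}://{parsed.netloc}",
        },
        path=f"{bucket}/{prefix}" if prefix else bucket,
    )
```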