Skip to content

Commit eecf290

Browse files
committed
feat: add doi and publisher info to global dcs
1 parent 0593dad commit eecf290

File tree

10 files changed

+187
-13
lines changed

10 files changed

+187
-13
lines changed

components/renku_data_services/data_connectors/api.spec.yaml

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -475,6 +475,14 @@ components:
475475
$ref: "#/components/schemas/ETag"
476476
keywords:
477477
$ref: "#/components/schemas/KeywordsList"
478+
doi:
479+
$ref: "#/components/schemas/DOI"
480+
publisher_name:
481+
type: string
482+
description: The publisher of the dataset.
483+
publisher_url:
484+
type: string
485+
description: The URL for the publisher of the dataset.
478486
required:
479487
- id
480488
- name
@@ -766,7 +774,22 @@ components:
766774
type:
767775
type: string
768776
description: data type of option value. RClone has more options but they map to the ones listed here.
769-
enum: ["int", "bool", "string", "stringArray", "Time", "Duration", "MultiEncoder", "SizeSuffix", "SpaceSepList", "CommaSepList", "Tristate", "Encoding", "Bits"]
777+
enum:
778+
[
779+
"int",
780+
"bool",
781+
"string",
782+
"stringArray",
783+
"Time",
784+
"Duration",
785+
"MultiEncoder",
786+
"SizeSuffix",
787+
"SpaceSepList",
788+
"CommaSepList",
789+
"Tristate",
790+
"Encoding",
791+
"Bits",
792+
]
770793
required:
771794
- name
772795
- help
@@ -870,6 +893,10 @@ components:
870893
type: string
871894
description: Entity Tag
872895
example: "9EE498F9D565D0C41E511377425F32F3"
896+
DOI:
897+
type: string
898+
description: "A DOI."
899+
example: "10.16904/envidat.33"
873900
DataConnectorsGetQuery:
874901
description: Query params for data connectors get request
875902
allOf:

components/renku_data_services/data_connectors/apispec.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# generated by datamodel-codegen:
22
# filename: api.spec.yaml
3-
# timestamp: 2025-06-19T07:18:06+00:00
3+
# timestamp: 2025-12-03T09:49:11+00:00
44

55
from __future__ import annotations
66

@@ -364,6 +364,15 @@ class DataConnector(BaseAPISpec):
364364
examples=[["project", "keywords"]],
365365
min_length=0,
366366
)
367+
doi: Optional[str] = Field(
368+
None, description="A DOI.", examples=["10.16904/envidat.33"]
369+
)
370+
publisher_name: Optional[str] = Field(
371+
None, description="The publisher of the dataset."
372+
)
373+
publisher_url: Optional[str] = Field(
374+
None, description="The URL for the publisher of the dataset."
375+
)
367376

368377

369378
class DataConnectorPost(BaseAPISpec):

components/renku_data_services/data_connectors/blueprints.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -513,6 +513,9 @@ def _dump_data_connector(
513513
description=data_connector.description,
514514
etag=data_connector.etag,
515515
keywords=data_connector.keywords or [],
516+
doi=data_connector.doi,
517+
publisher_name=data_connector.publisher_name,
518+
publisher_url=data_connector.publisher_url,
516519
)
517520
return dict(
518521
id=str(data_connector.id),

components/renku_data_services/data_connectors/core.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,7 @@ async def prevalidate_unsaved_global_data_connector(
193193
storage.configuration["provider"] = rclone_metadata.provider
194194

195195
slug = base_models.Slug.from_name(doi_uri).value
196+
doi_metadata = await doi.metadata()
196197
return models.PrevalidatedGlobalDataConnector(
197198
data_connector=models.UnsavedGlobalDataConnector(
198199
name=doi_uri,
@@ -202,8 +203,14 @@ async def prevalidate_unsaved_global_data_connector(
202203
storage=storage,
203204
description=None,
204205
keywords=[],
206+
doi=doi,
207+
publisher_url=None
208+
if doi_metadata is None or doi_metadata.publisher is None
209+
else doi_metadata.publisher.url,
210+
publisher_name=None
211+
if doi_metadata is None or doi_metadata.publisher is None
212+
else doi_metadata.publisher.name,
205213
),
206-
doi=doi,
207214
rclone_metadata=rclone_metadata,
208215
)
209216

@@ -214,9 +221,12 @@ async def validate_unsaved_global_data_connector(
214221
) -> models.UnsavedGlobalDataConnector:
215222
"""Validate the data connector."""
216223
data_connector = prevalidated_dc.data_connector
217-
doi = prevalidated_dc.doi
224+
doi = prevalidated_dc.data_connector.doi
218225
rclone_metadata = prevalidated_dc.rclone_metadata
219226

227+
if not doi:
228+
raise errors.ValidationError(message="Global data connectors require a DOI.")
229+
220230
# Check that we can list the files in the DOI
221231
connection_result = await validator.test_connection(
222232
configuration=data_connector.storage.configuration, source_path=data_connector.storage.source_path or "/"
@@ -281,6 +291,9 @@ async def validate_unsaved_global_data_connector(
281291
storage=storage,
282292
description=description or None,
283293
keywords=keywords,
294+
doi=data_connector.doi,
295+
publisher_name=data_connector.publisher_name,
296+
publisher_url=data_connector.publisher_url,
284297
)
285298

286299

components/renku_data_services/data_connectors/db.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
from renku_data_services.data_connectors import apispec, models
3131
from renku_data_services.data_connectors import orm as schemas
3232
from renku_data_services.data_connectors.core import validate_unsaved_global_data_connector
33+
from renku_data_services.data_connectors.doi.models import DOI
3334
from renku_data_services.namespace import orm as ns_schemas
3435
from renku_data_services.namespace.db import GroupRepository
3536
from renku_data_services.namespace.models import ProjectNamespace
@@ -292,6 +293,9 @@ async def _insert_data_connector(
292293

293294
slug = data_connector.slug or base_models.Slug.from_name(data_connector.name).value
294295

296+
doi: DOI | None = None
297+
publisher_url: str | None = None
298+
publisher_name: str | None = None
295299
if ns is not None and isinstance(data_connector, models.UnsavedDataConnector):
296300
existing_slug_stmt = (
297301
select(ns_schemas.EntitySlugORM)
@@ -313,6 +317,9 @@ async def _insert_data_connector(
313317
existing_global_dc = await session.scalar(existing_global_dc_stmt)
314318
if existing_global_dc is not None:
315319
raise errors.ConflictError(message=f"An entity with the slug '{data_connector.slug}' already exists.")
320+
doi = data_connector.doi
321+
publisher_name = data_connector.publisher_name
322+
publisher_url = data_connector.publisher_url
316323

317324
visibility_orm = (
318325
apispec.Visibility(data_connector.visibility)
@@ -331,6 +338,9 @@ async def _insert_data_connector(
331338
description=data_connector.description,
332339
keywords=data_connector.keywords,
333340
global_slug=slug if isinstance(data_connector, models.UnsavedGlobalDataConnector) else None,
341+
doi=doi,
342+
publisher_url=publisher_url,
343+
publisher_name=publisher_name,
334344
)
335345
if ns is not None:
336346
data_connector_slug = ns_schemas.EntitySlugORM.create_data_connector_slug(

components/renku_data_services/data_connectors/doi/models.py

Lines changed: 52 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,19 @@
11
"""Models for DOIs."""
22

3+
from __future__ import annotations
4+
35
import re
46
from dataclasses import dataclass
57
from typing import Any, Self
68
from urllib.parse import urlparse
79

810
import httpx
9-
from pydantic import BaseModel, ConfigDict, Field
11+
from pydantic import BaseModel, ConfigDict, Field, ValidationError
1012

1113
from renku_data_services.errors import errors
1214

15+
_clnt = httpx.AsyncClient(timeout=5, follow_redirects=True)
16+
1317

1418
class DOI(str):
1519
"""A doi for a dataset or a similar resource."""
@@ -45,18 +49,36 @@ def url(self) -> str:
4549
"""Return a proper URL from the doi."""
4650
return f"https://doi.org/{self}"
4751

52+
@property
def prefix(self) -> str:
    """Return the part of the doi before the first slash, e.g. "10.7910" for "10.7910/DVN/XLX9F8"."""
    head, _, _ = self.partition("/")
    return head
56+
4857
async def resolve_host(self) -> str | None:
4958
"""Resolves the DOI and returns the hostname of the url where the redirect leads."""
50-
clnt = httpx.AsyncClient(timeout=5, follow_redirects=True)
51-
async with clnt:
52-
try:
53-
res = await clnt.get(self.url)
54-
except httpx.HTTPError:
55-
return None
59+
try:
60+
res = await _clnt.get(self.url)
61+
except httpx.HTTPError:
62+
return None
5663
if res.status_code != 200:
5764
return None
5865
return res.url.host
5966

67+
async def metadata(self) -> SchemaOrgDataset | None:
    """Fetch the schema.org dataset description of this DOI via content negotiation.

    Returns:
        The parsed dataset metadata, or ``None`` when the request raises an
        ``httpx.HTTPError``, the final response status is not 200, or the
        response body does not validate as a ``SchemaOrgDataset``.
    """
    accept_schema_org = {"Accept": "application/vnd.schemaorg.ld+json"}
    try:
        response = await _clnt.get(self.url, headers=accept_schema_org)
    except httpx.HTTPError:
        return None
    if response.status_code != 200:
        return None
    try:
        return SchemaOrgDataset.model_validate_json(response.text)
    except ValidationError:
        return None
81+
6082

6183
@dataclass(frozen=True, eq=True, kw_only=True)
6284
class DOIMetadata:
@@ -130,15 +152,37 @@ class SchemaOrgDistribution(BaseModel):
130152

131153

132154
class SchemaOrgDataset(BaseModel):
133-
"""A very limited and partial spec of a schema.org Dataset used by Scicat and Envidat."""
155+
"""A very limited and partial spec of a schema.org Dataset used by Scicat, Envidat, doi.org."""
134156

135157
model_config = ConfigDict(extra="ignore")
136158
distribution: list[SchemaOrgDistribution] = Field(default_factory=list)
137159
name: str = Field()
138160
description: str | None = None
139161
raw_keywords: str = Field(alias="keywords", default="")
162+
publisher: SchemaOrgPublisher | None = None
140163

141164
@property
142165
def keywords(self) -> list[str]:
143166
"""Split the single keywords string into a list."""
144167
return [i.strip() for i in self.raw_keywords.split(",")]
168+
169+
170+
class SchemaOrgPublisher(BaseModel):
    """The schema.org publisher field in a dataset.

    Only ``@id``, ``@type`` and ``name`` are read; any other keys in the
    publisher object are ignored.
    """

    model_config = ConfigDict(extra="ignore")
    id: str | None = Field(alias="@id", default=None)
    type: str | None = Field(alias="@type", default=None)
    name: str

    @property
    def url(self) -> str | None:
        """Return the ``@id`` when it is an absolute http(s) URL.

        Returns:
            The id with trailing slashes removed, or ``None`` when the id is
            missing, cannot be parsed, or is not an absolute http/https URL.
        """
        if self.id is None:
            return None
        try:
            parsed = urlparse(self.id)
        except ValueError:
            # urlparse raises on malformed netlocs (e.g. an unbalanced "[" in
            # an IPv6 literal). Treat such an id as "not a URL" instead of
            # letting the error escape from a best-effort property.
            return None
        # An empty scheme is not in the allowed set, so this single check also
        # covers the "no scheme" case.
        if parsed.scheme not in ("http", "https") or not parsed.netloc:
            return None
        return self.id.rstrip("/")

components/renku_data_services/data_connectors/models.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,9 @@ class GlobalDataConnector(BaseDataConnector):
8787
id: ULID
8888
namespace: Final[None] = field(default=None, init=False)
8989
updated_at: datetime
90+
publisher_name: str | None = None
91+
publisher_url: str | None = None
92+
doi: DOI | None = None
9093

9194
@property
9295
def etag(self) -> str:
@@ -99,14 +102,16 @@ class UnsavedGlobalDataConnector(BaseDataConnector):
99102
"""Global data connector model."""
100103

101104
namespace: None = None
105+
publisher_name: str | None = None
106+
publisher_url: str | None = None
107+
doi: DOI | None = None
102108

103109

104110
@dataclass(frozen=True, eq=True, kw_only=True)
105111
class PrevalidatedGlobalDataConnector:
106112
"""Global data connector model that is unsaved but has been pre-validated."""
107113

108114
data_connector: UnsavedGlobalDataConnector
109-
doi: DOI
110115
rclone_metadata: RCloneDOIMetadata | None = None
111116

112117

components/renku_data_services/data_connectors/orm.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from renku_data_services.base_orm.registry import COMMON_ORM_REGISTRY
1414
from renku_data_services.data_connectors import models
1515
from renku_data_services.data_connectors.apispec import Visibility
16+
from renku_data_services.data_connectors.doi.models import DOI
1617
from renku_data_services.project.orm import ProjectORM
1718
from renku_data_services.secrets.orm import SecretORM
1819
from renku_data_services.users.orm import UserORM
@@ -97,6 +98,9 @@ class DataConnectorORM(BaseORM):
9798
init=False,
9899
viewonly=True,
99100
)
101+
doi: Mapped[str | None] = mapped_column(default=None, server_default=None, index=True, nullable=True)
102+
publisher_name: Mapped[str | None] = mapped_column(default=None, server_default=None, index=True, nullable=True)
103+
publisher_url: Mapped[str | None] = mapped_column(default=None, server_default=None, index=True, nullable=True)
100104

101105
def dump(self) -> models.DataConnector | models.GlobalDataConnector:
102106
"""Create a data connector model from the DataConnectorORM."""
@@ -112,6 +116,9 @@ def dump(self) -> models.DataConnector | models.GlobalDataConnector:
112116
storage=self._dump_storage(),
113117
description=self.description,
114118
keywords=self.keywords,
119+
publisher_name=self.publisher_name,
120+
publisher_url=self.publisher_url,
121+
doi=DOI(self.doi) if self.doi is not None else None,
115122
)
116123

117124
elif self.slug is None:
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
"""add doi and publisher info for global data connectors
2+
3+
Revision ID: bd97866a6253
4+
Revises: 42049656cdb8
5+
Create Date: 2025-12-03 09:38:17.534403
6+
7+
"""
8+
9+
import sqlalchemy as sa
10+
from alembic import op
11+
12+
# revision identifiers, used by Alembic.
13+
revision = "bd97866a6253"
14+
down_revision = "42049656cdb8"
15+
branch_labels = None
16+
depends_on = None
17+
18+
19+
def upgrade() -> None:
    """Add nullable ``doi``, ``publisher_name`` and ``publisher_url`` columns and a non-unique index on each."""
    table = "data_connectors"
    schema = "storage"
    # (column name, index name) pairs; index names stay literal so they remain greppable.
    indexed_columns = (
        ("doi", "ix_storage_data_connectors_doi"),
        ("publisher_name", "ix_storage_data_connectors_publisher_name"),
        ("publisher_url", "ix_storage_data_connectors_publisher_url"),
    )

    # All columns are added first, then all indexes are created.
    for column_name, _ in indexed_columns:
        op.add_column(table, sa.Column(column_name, sa.String(), nullable=True), schema=schema)
    for column_name, index_name in indexed_columns:
        op.create_index(op.f(index_name), table, [column_name], unique=False, schema=schema)
40+
41+
42+
def downgrade() -> None:
    """Drop the three indexes and then the ``publisher_url``, ``publisher_name`` and ``doi`` columns."""
    table = "data_connectors"
    schema = "storage"

    # Indexes are dropped before the columns they cover.
    for index_name in (
        "ix_storage_data_connectors_publisher_url",
        "ix_storage_data_connectors_publisher_name",
        "ix_storage_data_connectors_doi",
    ):
        op.drop_index(op.f(index_name), table_name=table, schema=schema)
    for column_name in ("publisher_url", "publisher_name", "doi"):
        op.drop_column(table, column_name, schema=schema)

test/bases/renku_data_services/data_api/test_data_connectors.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2505,8 +2505,14 @@ async def test_validate_envidat_data_connector() -> None:
25052505
assert config["provider"] == "Other"
25062506
assert config["endpoint"].find("zhdk.cloud.switch.ch") >= 0
25072507
assert res.data_connector.storage.source_path == "/envidat-doi/10.16904_12"
2508+
assert res.data_connector.doi is not None
2509+
assert res.data_connector.publisher_url is not None
2510+
assert res.data_connector.publisher_name is not None
25082511
res = await core.validate_unsaved_global_data_connector(res, validator)
25092512
assert res.description is not None
25102513
assert len(res.description) > 0
25112514
assert res.keywords is not None
25122515
assert len(res.keywords) > 0
2516+
assert res.doi is not None
2517+
assert res.publisher_url is not None
2518+
assert res.publisher_name is not None

0 commit comments

Comments
 (0)