Skip to content

Commit 1a76542

Browse files
committed
feat: add doi and publisher info to global dcs
1 parent 621de7f commit 1a76542

File tree

10 files changed

+187
-13
lines changed

10 files changed

+187
-13
lines changed

components/renku_data_services/data_connectors/api.spec.yaml

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -475,6 +475,14 @@ components:
475475
$ref: "#/components/schemas/ETag"
476476
keywords:
477477
$ref: "#/components/schemas/KeywordsList"
478+
doi:
479+
$ref: "#/components/schemas/DOI"
480+
publisher_name:
481+
type: string
482+
description: The publisher of the dataset.
483+
publisher_url:
484+
type: string
485+
description: The URL for the publisher of the dataset.
478486
required:
479487
- id
480488
- name
@@ -766,7 +774,22 @@ components:
766774
type:
767775
type: string
768776
description: data type of option value. RClone has more options but they map to the ones listed here.
769-
enum: ["int", "bool", "string", "stringArray", "Time", "Duration", "MultiEncoder", "SizeSuffix", "SpaceSepList", "CommaSepList", "Tristate", "Encoding", "Bits"]
777+
enum:
778+
[
779+
"int",
780+
"bool",
781+
"string",
782+
"stringArray",
783+
"Time",
784+
"Duration",
785+
"MultiEncoder",
786+
"SizeSuffix",
787+
"SpaceSepList",
788+
"CommaSepList",
789+
"Tristate",
790+
"Encoding",
791+
"Bits",
792+
]
770793
required:
771794
- name
772795
- help
@@ -870,6 +893,10 @@ components:
870893
type: string
871894
description: Entity Tag
872895
example: "9EE498F9D565D0C41E511377425F32F3"
896+
DOI:
897+
type: string
898+
description: "A DOI."
899+
example: "10.16904/envidat.33"
873900
DataConnectorsGetQuery:
874901
description: Query params for data connectors get request
875902
allOf:

components/renku_data_services/data_connectors/apispec.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# generated by datamodel-codegen:
22
# filename: api.spec.yaml
3-
# timestamp: 2025-06-19T07:18:06+00:00
3+
# timestamp: 2025-12-03T09:49:11+00:00
44

55
from __future__ import annotations
66

@@ -364,6 +364,15 @@ class DataConnector(BaseAPISpec):
364364
examples=[["project", "keywords"]],
365365
min_length=0,
366366
)
367+
doi: Optional[str] = Field(
368+
None, description="A DOI.", examples=["10.16904/envidat.33"]
369+
)
370+
publisher_name: Optional[str] = Field(
371+
None, description="The publisher of the dataset."
372+
)
373+
publisher_url: Optional[str] = Field(
374+
None, description="The URL for the publisher of the dataset."
375+
)
367376

368377

369378
class DataConnectorPost(BaseAPISpec):

components/renku_data_services/data_connectors/blueprints.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -481,6 +481,9 @@ def _dump_data_connector(
481481
description=data_connector.description,
482482
etag=data_connector.etag,
483483
keywords=data_connector.keywords or [],
484+
doi=data_connector.doi,
485+
publisher_name=data_connector.publisher_name,
486+
publisher_url=data_connector.publisher_url,
484487
)
485488
return dict(
486489
id=str(data_connector.id),

components/renku_data_services/data_connectors/core.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,7 @@ async def prevalidate_unsaved_global_data_connector(
191191
storage.configuration["provider"] = rclone_metadata.provider
192192

193193
slug = base_models.Slug.from_name(doi_uri).value
194+
doi_metadata = await doi.metadata()
194195
return models.PrevalidatedGlobalDataConnector(
195196
data_connector=models.UnsavedGlobalDataConnector(
196197
name=doi_uri,
@@ -200,8 +201,14 @@ async def prevalidate_unsaved_global_data_connector(
200201
storage=storage,
201202
description=None,
202203
keywords=[],
204+
doi=doi,
205+
publisher_url=None
206+
if doi_metadata is None or doi_metadata.publisher is None
207+
else doi_metadata.publisher.url,
208+
publisher_name=None
209+
if doi_metadata is None or doi_metadata.publisher is None
210+
else doi_metadata.publisher.name,
203211
),
204-
doi=doi,
205212
rclone_metadata=rclone_metadata,
206213
)
207214

@@ -212,9 +219,12 @@ async def validate_unsaved_global_data_connector(
212219
) -> models.UnsavedGlobalDataConnector:
213220
"""Validate the data connector."""
214221
data_connector = prevalidated_dc.data_connector
215-
doi = prevalidated_dc.doi
222+
doi = prevalidated_dc.data_connector.doi
216223
rclone_metadata = prevalidated_dc.rclone_metadata
217224

225+
if not doi:
226+
raise errors.ValidationError(message="Global data connectors require a DOI.")
227+
218228
# Check that we can list the files in the DOI
219229
connection_result = await validator.test_connection(
220230
configuration=data_connector.storage.configuration, source_path=data_connector.storage.source_path or "/"
@@ -279,6 +289,9 @@ async def validate_unsaved_global_data_connector(
279289
storage=storage,
280290
description=description or None,
281291
keywords=keywords,
292+
doi=data_connector.doi,
293+
publisher_name=data_connector.publisher_name,
294+
publisher_url=data_connector.publisher_url,
282295
)
283296

284297

components/renku_data_services/data_connectors/db.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
from renku_data_services.data_connectors import apispec, models
3030
from renku_data_services.data_connectors import orm as schemas
3131
from renku_data_services.data_connectors.core import validate_unsaved_global_data_connector
32+
from renku_data_services.data_connectors.doi.models import DOI
3233
from renku_data_services.namespace import orm as ns_schemas
3334
from renku_data_services.namespace.db import GroupRepository
3435
from renku_data_services.namespace.models import ProjectNamespace
@@ -291,6 +292,9 @@ async def _insert_data_connector(
291292

292293
slug = data_connector.slug or base_models.Slug.from_name(data_connector.name).value
293294

295+
doi: DOI | None = None
296+
publisher_url: str | None = None
297+
publisher_name: str | None = None
294298
if ns is not None and isinstance(data_connector, models.UnsavedDataConnector):
295299
existing_slug_stmt = (
296300
select(ns_schemas.EntitySlugORM)
@@ -312,6 +316,9 @@ async def _insert_data_connector(
312316
existing_global_dc = await session.scalar(existing_global_dc_stmt)
313317
if existing_global_dc is not None:
314318
raise errors.ConflictError(message=f"An entity with the slug '{data_connector.slug}' already exists.")
319+
doi = data_connector.doi
320+
publisher_name = data_connector.publisher_name
321+
publisher_url = data_connector.publisher_url
315322

316323
visibility_orm = (
317324
apispec.Visibility(data_connector.visibility)
@@ -330,6 +337,9 @@ async def _insert_data_connector(
330337
description=data_connector.description,
331338
keywords=data_connector.keywords,
332339
global_slug=slug if isinstance(data_connector, models.UnsavedGlobalDataConnector) else None,
340+
doi=doi,
341+
publisher_url=publisher_url,
342+
publisher_name=publisher_name,
333343
)
334344
if ns is not None:
335345
data_connector_slug = ns_schemas.EntitySlugORM.create_data_connector_slug(

components/renku_data_services/data_connectors/doi/models.py

Lines changed: 52 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,19 @@
11
"""Models for DOIs."""
22

3+
from __future__ import annotations
4+
35
import re
46
from dataclasses import dataclass
57
from typing import Any, Self
68
from urllib.parse import urlparse
79

810
import httpx
9-
from pydantic import BaseModel, ConfigDict, Field
11+
from pydantic import BaseModel, ConfigDict, Field, ValidationError
1012

1113
from renku_data_services.errors import errors
1214

15+
# Module-level HTTP client shared by the DOI lookups in this module (timeout=5, redirects are followed).
_clnt = httpx.AsyncClient(timeout=5, follow_redirects=True)
16+
1317

1418
class DOI(str):
1519
"""A doi for a dataset or a similar resource."""
@@ -45,18 +49,36 @@ def url(self) -> str:
4549
"""Return a proper URL from the doi."""
4650
return f"https://doi.org/{self}"
4751

52+
@property
def prefix(self) -> str:
    """Return the part of the DOI before the first slash.

    For example, the prefix of ``10.7910/DVN/XLX9F8`` is ``10.7910``. A value
    that contains no slash is returned unchanged.
    """
    # partition stops at the first "/" instead of splitting the whole string.
    head, _, _ = self.partition("/")
    return head
56+
4857
async def resolve_host(self) -> str | None:
4958
"""Resolves the DOI and returns the hostname of the url where the redirect leads."""
50-
clnt = httpx.AsyncClient(timeout=5, follow_redirects=True)
51-
async with clnt:
52-
try:
53-
res = await clnt.get(self.url)
54-
except httpx.HTTPError:
55-
return None
59+
try:
60+
res = await _clnt.get(self.url)
61+
except httpx.HTTPError:
62+
return None
5663
if res.status_code != 200:
5764
return None
5865
return res.url.host
5966

67+
async def metadata(self) -> SchemaOrgDataset | None:
    """Fetch the schema.org description of this DOI, or None if it cannot be obtained.

    The DOI URL is requested with an ``Accept`` header asking for schema.org JSON-LD and the
    response body is parsed into a ``SchemaOrgDataset``. ``None`` is returned when the request
    raises ``httpx.HTTPError``, when the final status code is not 200, or when the body fails
    validation.
    """
    accept_schema_org = {"Accept": "application/vnd.schemaorg.ld+json"}
    try:
        response = await _clnt.get(self.url, headers=accept_schema_org)
    except httpx.HTTPError:
        return None
    if response.status_code != 200:
        return None
    try:
        return SchemaOrgDataset.model_validate_json(response.text)
    except ValidationError:
        return None
81+
6082

6183
@dataclass(frozen=True, eq=True, kw_only=True)
6284
class DOIMetadata:
@@ -130,15 +152,37 @@ class SchemaOrgDistribution(BaseModel):
130152

131153

132154
class SchemaOrgDataset(BaseModel):
133-
"""A very limited and partial spec of a schema.org Dataset used by Scicat and Envidat."""
155+
"""A very limited and partial spec of a schema.org Dataset used by Scicat, Envidat, doi.org."""
134156

135157
model_config = ConfigDict(extra="ignore")
136158
distribution: list[SchemaOrgDistribution] = Field(default_factory=list)
137159
name: str = Field()
138160
description: str | None = None
139161
raw_keywords: str = Field(alias="keywords", default="")
162+
publisher: SchemaOrgPublisher | None = None
140163

141164
@property
142165
def keywords(self) -> list[str]:
143166
"""Split the single keywords string into a list."""
144167
return [i.strip() for i in self.raw_keywords.split(",")]
168+
169+
170+
class SchemaOrgPublisher(BaseModel):
    """Publisher entry of a schema.org dataset; only ``@id``, ``@type`` and ``name`` are read."""

    model_config = ConfigDict(extra="ignore")
    id: str | None = Field(alias="@id", default=None)
    type: str | None = Field(alias="@type", default=None)
    name: str

    @property
    def url(self) -> str | None:
        """Return the ``@id`` without trailing slashes when it is an absolute http(s) URL, else None."""
        identifier = self.id
        if identifier is None:
            return None
        parts = urlparse(identifier)
        # A usable publisher link needs a host and must use the http or https scheme.
        is_web_url = bool(parts.netloc) and parts.scheme in ("http", "https")
        return identifier.rstrip("/") if is_web_url else None

components/renku_data_services/data_connectors/models.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,9 @@ class GlobalDataConnector(BaseDataConnector):
8787
id: ULID
8888
namespace: Final[None] = field(default=None, init=False)
8989
updated_at: datetime
90+
publisher_name: str | None = None
91+
publisher_url: str | None = None
92+
doi: DOI | None = None
9093

9194
@property
9295
def etag(self) -> str:
@@ -99,14 +102,16 @@ class UnsavedGlobalDataConnector(BaseDataConnector):
99102
"""Global data connector model."""
100103

101104
namespace: None = None
105+
publisher_name: str | None = None
106+
publisher_url: str | None = None
107+
doi: DOI | None = None
102108

103109

104110
@dataclass(frozen=True, eq=True, kw_only=True)
105111
class PrevalidatedGlobalDataConnector:
106112
"""Global data connector model that is unsaved but has been pre-validated."""
107113

108114
data_connector: UnsavedGlobalDataConnector
109-
doi: DOI
110115
rclone_metadata: RCloneDOIMetadata | None = None
111116

112117

components/renku_data_services/data_connectors/orm.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from renku_data_services.base_orm.registry import COMMON_ORM_REGISTRY
1414
from renku_data_services.data_connectors import models
1515
from renku_data_services.data_connectors.apispec import Visibility
16+
from renku_data_services.data_connectors.doi.models import DOI
1617
from renku_data_services.project.orm import ProjectORM
1718
from renku_data_services.secrets.orm import SecretORM
1819
from renku_data_services.users.orm import UserORM
@@ -97,6 +98,9 @@ class DataConnectorORM(BaseORM):
9798
init=False,
9899
viewonly=True,
99100
)
101+
doi: Mapped[str | None] = mapped_column(default=None, server_default=None, index=True, nullable=True)
102+
publisher_name: Mapped[str | None] = mapped_column(default=None, server_default=None, index=True, nullable=True)
103+
publisher_url: Mapped[str | None] = mapped_column(default=None, server_default=None, index=True, nullable=True)
100104

101105
def dump(self) -> models.DataConnector | models.GlobalDataConnector:
102106
"""Create a data connector model from the DataConnectorORM."""
@@ -112,6 +116,9 @@ def dump(self) -> models.DataConnector | models.GlobalDataConnector:
112116
storage=self._dump_storage(),
113117
description=self.description,
114118
keywords=self.keywords,
119+
publisher_name=self.publisher_name,
120+
publisher_url=self.publisher_url,
121+
doi=DOI(self.doi) if self.doi is not None else None,
115122
)
116123

117124
elif self.slug is None:
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
"""add doi and publisher info for global data connectors
2+
3+
Revision ID: bd97866a6253
4+
Revises: 42049656cdb8
5+
Create Date: 2025-12-03 09:38:17.534403
6+
7+
"""
8+
9+
import sqlalchemy as sa
10+
from alembic import op
11+
12+
# revision identifiers, used by Alembic.
13+
revision = "bd97866a6253"
14+
down_revision = "42049656cdb8"
15+
branch_labels = None
16+
depends_on = None
17+
18+
19+
def upgrade() -> None:
    """Add the nullable ``doi``, ``publisher_name`` and ``publisher_url`` string columns, then index each."""
    new_columns = ("doi", "publisher_name", "publisher_url")
    # All columns are added before any index is created, matching the autogenerated order.
    for column_name in new_columns:
        op.add_column("data_connectors", sa.Column(column_name, sa.String(), nullable=True), schema="storage")
    for column_name in new_columns:
        op.create_index(
            op.f(f"ix_storage_data_connectors_{column_name}"),
            "data_connectors",
            [column_name],
            unique=False,
            schema="storage",
        )
40+
41+
42+
def downgrade() -> None:
    """Drop the three indexes and then the three columns, newest column first."""
    columns_newest_first = ("publisher_url", "publisher_name", "doi")
    # Indexes are removed before the columns they cover.
    for column_name in columns_newest_first:
        op.drop_index(
            op.f(f"ix_storage_data_connectors_{column_name}"),
            table_name="data_connectors",
            schema="storage",
        )
    for column_name in columns_newest_first:
        op.drop_column("data_connectors", column_name, schema="storage")

test/bases/renku_data_services/data_api/test_data_connectors.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2447,8 +2447,14 @@ async def test_validate_envidat_data_connector() -> None:
24472447
assert config["provider"] == "Other"
24482448
assert config["endpoint"].find("zhdk.cloud.switch.ch") >= 0
24492449
assert res.data_connector.storage.source_path == "/envidat-doi/10.16904_12"
2450+
assert res.data_connector.doi is not None
2451+
assert res.data_connector.publisher_url is not None
2452+
assert res.data_connector.publisher_name is not None
24502453
res = await core.validate_unsaved_global_data_connector(res, validator)
24512454
assert res.description is not None
24522455
assert len(res.description) > 0
24532456
assert res.keywords is not None
24542457
assert len(res.keywords) > 0
2458+
assert res.doi is not None
2459+
assert res.publisher_url is not None
2460+
assert res.publisher_name is not None

0 commit comments

Comments
 (0)