|
1 | 1 | """Models for DOIs.""" |
2 | 2 |
|
| 3 | +from __future__ import annotations |
| 4 | + |
3 | 5 | import re |
4 | 6 | from dataclasses import dataclass |
5 | 7 | from typing import Any, Self |
6 | 8 | from urllib.parse import urlparse |
7 | 9 |
|
8 | 10 | import httpx |
9 | | -from pydantic import BaseModel, ConfigDict, Field |
| 11 | +from pydantic import BaseModel, ConfigDict, Field, ValidationError |
10 | 12 |
|
11 | 13 | from renku_data_services.errors import errors |
12 | 14 |
|
# One HTTP client shared by DOI.resolve_host and DOI.metadata so that a new client
# is not built for every lookup. Redirects are followed because doi.org answers
# with a redirect to the hosting repository.
# NOTE(review): nothing visible in this module closes the client - confirm that
# it is closed on application shutdown if that matters for the deployment.
_clnt = httpx.AsyncClient(
    follow_redirects=True,
    timeout=5,
)
| 16 | + |
13 | 17 |
|
14 | 18 | class DOI(str): |
15 | 19 | """A doi for a dataset or a similar resource.""" |
@@ -45,18 +49,36 @@ def url(self) -> str: |
45 | 49 | """Return a proper URL from the doi.""" |
46 | 50 | return f"https://doi.org/{self}" |
47 | 51 |
|
@property
def prefix(self) -> str:
    """Return everything before the first "/" of the doi.

    For example the prefix of 10.7910/DVN/XLX9F8 is 10.7910. When the doi
    contains no "/" at all the whole value is returned.
    """
    head, _, _ = self.partition("/")
    return head
| 56 | + |
async def resolve_host(self) -> str | None:
    """Resolve the DOI and return the hostname of the URL the redirects end at.

    Returns:
        The host of the final response URL, or None when the request raises
        an httpx.HTTPError or the final response status is not 200.
    """
    try:
        response = await _clnt.get(self.url)
    except httpx.HTTPError:
        return None
    return response.url.host if response.status_code == 200 else None
59 | 66 |
|
async def metadata(self) -> SchemaOrgDataset | None:
    """Fetch the schema.org dataset description of the DOI.

    The DOI url is requested with an Accept header asking for the
    application/vnd.schemaorg.ld+json representation and the response body is
    validated as a SchemaOrgDataset.

    Returns:
        The parsed dataset, or None when the request raises an
        httpx.HTTPError, the response status is not 200 or the body does not
        validate.
    """
    accept_schema_org = {"Accept": "application/vnd.schemaorg.ld+json"}
    try:
        response = await _clnt.get(self.url, headers=accept_schema_org)
    except httpx.HTTPError:
        return None
    if response.status_code != 200:
        return None
    try:
        return SchemaOrgDataset.model_validate_json(response.text)
    except ValidationError:
        return None
| 81 | + |
60 | 82 |
|
61 | 83 | @dataclass(frozen=True, eq=True, kw_only=True) |
62 | 84 | class DOIMetadata: |
@@ -130,15 +152,37 @@ class SchemaOrgDistribution(BaseModel): |
130 | 152 |
|
131 | 153 |
|
class SchemaOrgDataset(BaseModel):
    """A very limited and partial spec of a schema.org Dataset used by Scicat, Envidat, doi.org.

    Keys of the validated document that are not declared below are ignored.
    """

    model_config = ConfigDict(extra="ignore")
    distribution: list[SchemaOrgDistribution] = Field(default_factory=list)
    name: str = Field()
    description: str | None = None
    # Read from the "keywords" key of the document as one comma separated string;
    # use the `keywords` property to get the individual values.
    raw_keywords: str = Field(alias="keywords", default="")
    publisher: SchemaOrgPublisher | None = None

    @property
    def keywords(self) -> list[str]:
        """Split the single keywords string into a list of stripped, non-empty keywords.

        Blank entries are dropped, so a dataset without keywords yields an empty
        list instead of a list holding one empty string, and stray commas
        (e.g. "a,,b,") do not produce empty keywords.
        """
        stripped = (part.strip() for part in self.raw_keywords.split(","))
        return [keyword for keyword in stripped if keyword]
| 168 | + |
| 169 | + |
class SchemaOrgPublisher(BaseModel):
    """The schema.org publisher field in a dataset.

    Keys of the validated document that are not declared below are ignored.
    """

    model_config = ConfigDict(extra="ignore")
    # Populated from the JSON-LD style "@id" and "@type" keys of the document.
    id: str | None = Field(alias="@id", default=None)
    type: str | None = Field(alias="@type", default=None)
    name: str

    @property
    def url(self) -> str | None:
        """Return the id with trailing slashes removed if it is an http(s) URL.

        Returns:
            None when there is no id, when the id cannot be parsed as a URL,
            when it has no host part, or when its scheme is not http or https.
        """
        if self.id is None:
            return None
        try:
            parsed = urlparse(self.id)
        except ValueError:
            # urlparse raises on a malformed netloc (e.g. unmatched square
            # brackets); such an id is simply not a usable URL.
            return None
        if parsed.scheme not in ("http", "https") or not parsed.netloc:
            return None
        return self.id.rstrip("/")
0 commit comments