diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..1906d99 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,22 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +## [0.15] - 2025-12-18 +- Prepare new PyPI release 0.15 (skipping 0.13/0.14 as requested). +- Improve Vault authentication: host-restricted token exchange and clearer errors. +- Add tests for Vault auth behavior. +- Add docstrings to increase docstring coverage for CI. + +Note: After merging this branch, publish a PyPI release (version 0.15) so +`pip install databusclient` reflects the updated CLI behavior and bug fixes. + + +### 0.15 - Prepared release + +- Prepare PyPI release 0.15. +- Restrict Vault token exchange to known hosts and provide clearer auth errors. +- Add tests for Vault auth behavior. +- Documentation: note about Vault-hosts and `--vault-token` usage. + +(See PR and issue tracker for details.) diff --git a/README.md b/README.md index dc9991f..a4e3b6f 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,12 @@ Before using the client, install it via pip: python3 -m pip install databusclient ``` +Note: this repository prepares PyPI release `0.15`. If you previously installed `databusclient` via `pip` and observe different CLI behavior, upgrade to that release: + +```bash +python3 -m pip install --upgrade databusclient==0.15 +``` + You can then use the client in the command line: ```bash @@ -164,6 +170,8 @@ docker run --rm -v $(pwd):/data dbpedia/databus-python-client download $DOWNLOAD - If no `--localdir` is provided, the current working directory is used as base directory. The downloaded files will be stored in the working directory in a folder structure according to the Databus layout, i.e. `./$ACCOUNT/$GROUP/$ARTIFACT/$VERSION/`. 
- `--vault-token` - If the dataset/files to be downloaded require vault authentication, you need to provide a vault token with `--vault-token /path/to/vault-token.dat`. See [Registration (Access Token)](#registration-access-token) for details on how to get a vault token. + + Note: Vault tokens are only required for certain protected Databus hosts (for example: `data.dbpedia.io`, `data.dev.dbpedia.link`). The client now detects those hosts and will fail early with a clear message if a token is required but not provided. Do not pass `--vault-token` for public downloads. - `--databus-key` - If the databus is protected and needs API key authentication, you can provide the API key with `--databus-key YOUR_API_KEY`. diff --git a/databusclient/__init__.py b/databusclient/__init__.py index d15edb6..92fe8b7 100644 --- a/databusclient/__init__.py +++ b/databusclient/__init__.py @@ -1,3 +1,10 @@ +"""Top-level package for the databus Python client. + +This module exposes a small set of convenience functions and the CLI +entrypoint so the package can be used as a library or via +``python -m databusclient``. +""" + from databusclient import cli from databusclient.api.deploy import create_dataset, create_distribution, deploy @@ -5,4 +12,10 @@ def run(): + """Start the Click CLI application. + + This function is used by the ``__main__`` module and the package + entrypoint to invoke the command line interface. + """ + cli.app() diff --git a/databusclient/__main__.py b/databusclient/__main__.py index 8fe6fda..3a50f9a 100644 --- a/databusclient/__main__.py +++ b/databusclient/__main__.py @@ -1,3 +1,19 @@ +"""Module used for ``python -m databusclient`` execution. + +Runs the package's CLI application. +""" + from databusclient import cli -cli.app() + +def main(): + """Invoke the CLI application. + + Kept as a named function for easier testing and clarity. 
+ """ + + cli.app() + + +if __name__ == "__main__": + main() diff --git a/databusclient/api/delete.py b/databusclient/api/delete.py index 2ea8fb4..e96f97b 100644 --- a/databusclient/api/delete.py +++ b/databusclient/api/delete.py @@ -1,3 +1,10 @@ +"""Helpers for deleting Databus resources via the Databus HTTP API. + +This module provides utilities to delete groups, artifacts and versions on a +Databus instance using authenticated HTTP requests. The class `DeleteQueue` +also allows batching of deletions. +""" + import json from typing import List @@ -16,23 +23,43 @@ class DeleteQueue: """ def __init__(self, databus_key: str): + """Create a DeleteQueue bound to a given Databus API key. + + Args: + databus_key: API key used to authenticate deletion requests. + """ self.databus_key = databus_key self.queue: set[str] = set() def add_uri(self, databusURI: str): + """Add a single Databus URI to the deletion queue. + + The URI will be deleted when `execute()` is called. + """ self.queue.add(databusURI) def add_uris(self, databusURIs: List[str]): + """Add multiple Databus URIs to the deletion queue. + + Args: + databusURIs: Iterable of full Databus URIs. + """ for uri in databusURIs: self.queue.add(uri) def is_empty(self) -> bool: + """Return True if the queue is empty.""" return len(self.queue) == 0 def is_not_empty(self) -> bool: + """Return True if the queue contains any URIs.""" return len(self.queue) > 0 def execute(self): + """Execute all queued deletions. + + Each queued URI will be deleted using `_delete_resource`. + """ for uri in self.queue: print(f"[DELETE] {uri}") _delete_resource( diff --git a/databusclient/api/deploy.py b/databusclient/api/deploy.py index ef8ebf5..23c77ea 100644 --- a/databusclient/api/deploy.py +++ b/databusclient/api/deploy.py @@ -1,3 +1,10 @@ +"""Build and publish Databus datasets (JSON-LD) from provided metadata. 
+ +This module exposes helpers to create distribution strings, compute file +information (sha256 and size), construct dataset JSON-LD payloads and +publish them to a Databus instance using the Databus publish API. +""" + import hashlib import json from enum import Enum @@ -25,6 +32,13 @@ class DeployLogLevel(Enum): def _get_content_variants(distribution_str: str) -> Optional[Dict[str, str]]: + """Parse content-variant key/value pairs from a distribution string. + + The CLI supports passing a distribution as ``url|lang=en_type=parsed|...``. + This helper extracts the ``lang``/``type`` style key/value pairs as a + dictionary. + """ + args = distribution_str.split("|") # cv string is ALWAYS at position 1 after the URL @@ -50,6 +64,12 @@ def _get_content_variants(distribution_str: str) -> Optional[Dict[str, str]]: def _get_filetype_definition( distribution_str: str, ) -> Tuple[Optional[str], Optional[str]]: + """Extract an explicit file format and compression from a distribution string. + + Returns (file_extension, compression) where each may be ``None`` if the + format should be inferred from the URL path. + """ + file_ext = None compression = None @@ -87,6 +107,12 @@ def _get_filetype_definition( def _get_extensions(distribution_str: str) -> Tuple[str, str, str]: + """Return tuple `(extension_part, format_extension, compression)`. + + ``extension_part`` is the textual extension appended to generated + filenames (e.g. ".ttl.gz"). + """ + extension_part = "" format_extension, compression = _get_filetype_definition(distribution_str) @@ -126,6 +152,11 @@ def _get_extensions(distribution_str: str) -> Tuple[str, str, str]: def _get_file_stats(distribution_str: str) -> Tuple[Optional[str], Optional[int]]: + """Parse an optional ``sha256sum:length`` tuple from a distribution string. + + Returns (sha256sum, content_length) or (None, None) when not provided. 
+ """ + metadata_list = distribution_str.split("|")[1:] # check whether there is the shasum:length tuple separated by : if len(metadata_list) == 0 or ":" not in metadata_list[-1]: @@ -146,6 +177,12 @@ def _get_file_stats(distribution_str: str) -> Tuple[Optional[str], Optional[int] def _load_file_stats(url: str) -> Tuple[str, int]: + """Download the file at ``url`` and compute its SHA-256 and length. + + This is used as a fallback when the caller did not supply checksum/size + information in the CLI or metadata file. + """ + resp = requests.get(url, timeout=30) if resp.status_code >= 400: raise requests.exceptions.RequestException(response=resp) @@ -156,6 +193,11 @@ def _load_file_stats(url: str) -> Tuple[str, int]: def get_file_info(distribution_str: str) -> Tuple[Dict[str, str], str, str, str, int]: + """Return parsed file information for a distribution string. + + Returns a tuple `(cvs, format_extension, compression, sha256sum, size)`. + """ + cvs = _get_content_variants(distribution_str) extension_part, format_extension, compression = _get_extensions(distribution_str) diff --git a/databusclient/api/download.py b/databusclient/api/download.py index df7c53c..640cc4a 100644 --- a/databusclient/api/download.py +++ b/databusclient/api/download.py @@ -1,6 +1,7 @@ import json import os from typing import List +from urllib.parse import urlparse import requests from SPARQLWrapper import JSON, SPARQLWrapper @@ -12,6 +13,18 @@ ) +# Hosts that require Vault token based authentication. Central source of truth. +VAULT_REQUIRED_HOSTS = { + "data.dbpedia.io", + "data.dev.dbpedia.link", +} + + +class DownloadAuthError(Exception): + """Raised when an authorization problem occurs during download.""" + + + def _download_file( url, localDir, @@ -52,13 +65,23 @@ def _download_file( os.makedirs(dirpath, exist_ok=True) # Create the necessary directories # --- 1. 
Get redirect URL by requesting HEAD --- headers = {} + + # Determine hostname early and fail fast if this host requires Vault token. + # This prevents confusing 401/403 errors later and tells the user exactly + # what to do (provide --vault-token). + parsed = urlparse(url) + host = parsed.hostname + if host in VAULT_REQUIRED_HOSTS and not vault_token_file: + raise DownloadAuthError( + f"Vault token required for host '{host}', but no token was provided. Please use --vault-token." + ) + # --- 1a. public databus --- response = requests.head(url, timeout=30) # --- 1b. Databus API key required --- if response.status_code == 401: - # print(f"API key required for {url}") if not databus_key: - raise ValueError("Databus API key not given for protected download") + raise DownloadAuthError("Databus API key not given for protected download") headers = {"X-API-KEY": databus_key} response = requests.head(url, headers=headers, timeout=30) @@ -81,25 +104,54 @@ def _download_file( response = requests.get( url, headers=headers, stream=True, allow_redirects=True, timeout=30 ) - www = response.headers.get( - "WWW-Authenticate", "" - ) # Check if authentication is required + www = response.headers.get("WWW-Authenticate", "") # Check if authentication is required - # --- 3. If redirected to authentication 401 Unauthorized, get Vault token and retry --- + # --- 3. Handle authentication responses --- + # 3a. Server requests Bearer auth. Only attempt token exchange for hosts + # we explicitly consider Vault-protected (VAULT_REQUIRED_HOSTS). This avoids + # sending tokens to unrelated hosts and makes auth behavior predictable. if response.status_code == 401 and "bearer" in www.lower(): - print(f"Authentication required for {url}") - if not (vault_token_file): - raise ValueError("Vault token file not given for protected download") + # If host is not configured for Vault, do not attempt token exchange. 
+ if host not in VAULT_REQUIRED_HOSTS: + raise DownloadAuthError( + "Server requests Bearer authentication but this host is not configured for Vault token exchange." + " Try providing a databus API key with --databus-key or contact your administrator." + ) + + # Host requires Vault; ensure token file provided. + if not vault_token_file: + raise DownloadAuthError( + f"Vault token required for host '{host}', but no token was provided. Please use --vault-token." + ) - # --- 3a. Fetch Vault token --- - # TODO: cache token + # --- 3b. Fetch Vault token and retry --- + # Token exchange is potentially sensitive and should only be performed + # for known hosts. __get_vault_access__ handles reading the refresh + # token and exchanging it; errors are translated to DownloadAuthError + # for user-friendly CLI output. vault_token = __get_vault_access__(url, vault_token_file, auth_url, client_id) headers["Authorization"] = f"Bearer {vault_token}" - headers.pop("Accept-Encoding") + headers.pop("Accept-Encoding", None) - # --- 3b. Retry with token --- + # Retry with token response = requests.get(url, headers=headers, stream=True, timeout=30) + # Map common auth failures to friendly messages + if response.status_code == 401: + raise DownloadAuthError("Vault token is invalid or expired. Please generate a new token.") + if response.status_code == 403: + raise DownloadAuthError("Vault token is valid but has insufficient permissions to access this file.") + + # 3c. Generic forbidden without Bearer challenge + if response.status_code == 403: + raise DownloadAuthError("Access forbidden: your token or API key does not have permission to download this file.") + + # 3d. Generic unauthorized without Bearer + if response.status_code == 401: + raise DownloadAuthError( + "Unauthorized: access denied. Check your --databus-key or --vault-token settings." 
+ ) + try: response.raise_for_status() # Raise if still failing except requests.exceptions.HTTPError as e: diff --git a/databusclient/api/utils.py b/databusclient/api/utils.py index 7e27ff3..25c5300 100644 --- a/databusclient/api/utils.py +++ b/databusclient/api/utils.py @@ -1,3 +1,9 @@ +"""Utility helpers used by the API submodules. + +Contains small parsing helpers and HTTP helpers that are shared by +`download`, `deploy` and `delete` modules. +""" + from typing import Optional, Tuple import requests @@ -24,6 +30,12 @@ def get_databus_id_parts_from_file_url( A tuple containing (host, accountId, groupId, artifactId, versionId, fileId). Each element is a string or None if not present. """ + """Split a Databus URI into its six parts. + + The returned tuple is (host, accountId, groupId, artifactId, versionId, fileId). + Missing parts are returned as ``None``. + """ + uri = uri.removeprefix("https://").removeprefix("http://") parts = uri.strip("/").split("/") parts += [None] * (6 - len(parts)) # pad with None if less than 6 parts @@ -31,16 +43,16 @@ def get_databus_id_parts_from_file_url( def fetch_databus_jsonld(uri: str, databus_key: str | None = None) -> str: - """ - Retrieve JSON-LD representation of a databus resource. + """Fetch the JSON-LD representation of a Databus resource. - Parameters: - - uri: The full databus URI - - databus_key: Optional Databus API key for authentication on protected resources + Args: + uri: Full Databus resource URI. + databus_key: Optional API key for protected resources. Returns: - JSON-LD string representation of the databus resource. + The response body as a string containing JSON-LD. 
""" + headers = {"Accept": "application/ld+json"} if databus_key is not None: headers["X-API-KEY"] = databus_key diff --git a/databusclient/cli.py b/databusclient/cli.py index 97430f5..1a345f3 100644 --- a/databusclient/cli.py +++ b/databusclient/cli.py @@ -7,13 +7,17 @@ import databusclient.api.deploy as api_deploy from databusclient.api.delete import delete as api_delete -from databusclient.api.download import download as api_download +from databusclient.api.download import download as api_download, DownloadAuthError from databusclient.extensions import webdav @click.group() def app(): - """Databus Client CLI""" + """Databus Client CLI. + + Provides `deploy`, `download`, and `delete` commands for interacting + with the DBpedia Databus. + """ pass @@ -171,16 +175,19 @@ def download( """ Download datasets from databus, optionally using vault access if vault options are provided. """ - api_download( - localDir=localdir, - endpoint=databus, - databusURIs=databusuris, - token=vault_token, - databus_key=databus_key, - all_versions=all_versions, - auth_url=authurl, - client_id=clientid, - ) + try: + api_download( + localDir=localdir, + endpoint=databus, + databusURIs=databusuris, + token=vault_token, + databus_key=databus_key, + all_versions=all_versions, + auth_url=authurl, + client_id=clientid, + ) + except DownloadAuthError as e: + raise click.ClickException(str(e)) @app.command() diff --git a/databusclient/extensions/webdav.py b/databusclient/extensions/webdav.py index c0747f6..7981a49 100644 --- a/databusclient/extensions/webdav.py +++ b/databusclient/extensions/webdav.py @@ -1,3 +1,11 @@ +"""WebDAV/Nextcloud upload helper used by the deploy CLI. + +This module computes SHA-256 checksums and sizes for local files and uses +``rclone`` to copy files to a remote WebDAV/Nextcloud instance. The +`upload_to_webdav` function returns a list of metadata dictionaries suitable +for passing to ``deploy_from_metadata``. 
+""" + import hashlib import os import posixpath @@ -6,6 +14,14 @@ def compute_sha256_and_length(filepath): + """Compute the SHA-256 hex digest and total byte length of a file. + + Args: + filepath: Path to the file to hash. + + Returns: + Tuple of (sha256_hex, size_in_bytes). + """ sha256 = hashlib.sha256() total_length = 0 with open(filepath, "rb") as f: @@ -19,6 +35,11 @@ def compute_sha256_and_length(filepath): def get_all_files(path): + """Return a list of all files for a path. + + If `path` is a file, returns a single-element list. If it is a directory, + walks the directory recursively and returns absolute file paths. + """ if os.path.isfile(path): return [path] files = [] @@ -31,6 +52,17 @@ def get_all_files(path): def upload_to_webdav( source_paths: list[str], remote_name: str, remote_path: str, webdav_url: str ): + """Upload local files or folders to a configured rclone remote. + + Args: + source_paths: List of files or directories to upload. + remote_name: Name of the rclone remote (e.g., "nextcloud"). + remote_path: Destination path on the remote. + webdav_url: Public WebDAV URL used to construct download URLs. + + Returns: + A list of dicts with keys: ``filename``, ``checksum``, ``size``, ``url``. + """ result = [] for path in source_paths: if not os.path.exists(path): diff --git a/pyproject.toml b/pyproject.toml index 5593c74..92f479b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "databusclient" -version = "0.14" +version = "0.15" description = "A simple client for submitting, downloading, and deleting data on the DBpedia Databus" authors = ["DBpedia Association"] license = "Apache-2.0 License" diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..5f4c0a2 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,30 @@ +import sys +import types + +# Provide a lightweight fake SPARQLWrapper module for tests when not installed. 
+if "SPARQLWrapper" not in sys.modules: + mod = types.ModuleType("SPARQLWrapper") + mod.JSON = None + + class DummySPARQL: + def __init__(self, *args, **kwargs): + pass + + def setQuery(self, q): + self._q = q + + def setReturnFormat(self, f): + self._fmt = f + + def setCustomHttpHeaders(self, h): + self._headers = h + + def query(self): + class R: + def convert(self): + return {"results": {"bindings": []}} + + return R() + + mod.SPARQLWrapper = DummySPARQL + sys.modules["SPARQLWrapper"] = mod diff --git a/tests/test_download_auth.py b/tests/test_download_auth.py new file mode 100644 index 0000000..7225e08 --- /dev/null +++ b/tests/test_download_auth.py @@ -0,0 +1,104 @@ +from unittest.mock import Mock, patch + +import pytest + +import requests + +import databusclient.api.download as dl + +from databusclient.api.download import VAULT_REQUIRED_HOSTS, DownloadAuthError + + +def make_response(status=200, headers=None, content=b""): + headers = headers or {} + mock = Mock() + mock.status_code = status + mock.headers = headers + mock.content = content + + def iter_content(chunk_size): + if content: + yield content + else: + return + + mock.iter_content = lambda chunk: iter(iter_content(chunk)) + + def raise_for_status(): + if mock.status_code >= 400: + raise requests.exceptions.HTTPError() + + mock.raise_for_status = raise_for_status + return mock + + +def test_vault_host_no_token_raises(): + vault_host = next(iter(VAULT_REQUIRED_HOSTS)) + url = f"https://{vault_host}/some/protected/file.ttl" + + with pytest.raises(DownloadAuthError) as exc: + dl._download_file(url, localDir='.', vault_token_file=None) + + assert "Vault token required" in str(exc.value) + + +def test_non_vault_host_no_token_allows_download(monkeypatch): + url = "https://example.com/public/file.txt" + + resp_head = make_response(status=200, headers={}) + resp_get = make_response(status=200, headers={"content-length": "0"}, content=b"") + + with patch("requests.head", return_value=resp_head), patch( + 
"requests.get", return_value=resp_get + ): + # should not raise + dl._download_file(url, localDir='.', vault_token_file=None) + + +def test_401_after_token_exchange_reports_invalid_token(monkeypatch): + vault_host = next(iter(VAULT_REQUIRED_HOSTS)) + url = f"https://{vault_host}/protected/file.ttl" + + # initial head and get -> 401 with Bearer + resp_head = make_response(status=200, headers={}) + resp_401 = make_response(status=401, headers={"WWW-Authenticate": "Bearer realm=\"auth\""}) + + # after retry with token -> still 401 + resp_401_retry = make_response(status=401, headers={}) + + # Mock requests.get side effects: first 401 (challenge), then 401 after token + get_side_effects = [resp_401, resp_401_retry] + + # Mock token exchange responses + post_resp_1 = Mock() + post_resp_1.json.return_value = {"access_token": "ACCESS"} + post_resp_2 = Mock() + post_resp_2.json.return_value = {"access_token": "VAULT"} + + with patch("requests.head", return_value=resp_head), patch( + "requests.get", side_effect=get_side_effects + ), patch("requests.post", side_effect=[post_resp_1, post_resp_2]): + # set REFRESH_TOKEN so __get_vault_access__ doesn't try to open a file + monkeypatch.setenv("REFRESH_TOKEN", "x" * 90) + + with pytest.raises(DownloadAuthError) as exc: + dl._download_file(url, localDir='.', vault_token_file="/does/not/matter") + + assert "invalid or expired" in str(exc.value) + + +def test_403_reports_insufficient_permissions(): + vault_host = next(iter(VAULT_REQUIRED_HOSTS)) + url = f"https://{vault_host}/protected/file.ttl" + + resp_head = make_response(status=200, headers={}) + resp_403 = make_response(status=403, headers={}) + + with patch("requests.head", return_value=resp_head), patch( + "requests.get", return_value=resp_403 + ): + # provide a token path so early check does not block + with pytest.raises(DownloadAuthError) as exc: + dl._download_file(url, localDir='.', vault_token_file="/some/token/file") + + assert "permission" in str(exc.value) or 
"forbidden" in str(exc.value)