From 248ae33b8f094c361a8280b83241fc780f4629f7 Mon Sep 17 00:00:00 2001 From: notactuallyfinn Date: Fri, 6 Feb 2026 13:12:19 +0100 Subject: [PATCH 1/7] added adjusted files from feature/153-refactor-datamodel for process --- src/hermes/commands/process/base.py | 51 ++++------- src/hermes/model/merge/__init__.py | 3 + src/hermes/model/merge/action.py | 83 ++++++++++++++++++ src/hermes/model/merge/container.py | 116 +++++++++++++++++++++++++ src/hermes/model/merge/match.py | 17 ++++ src/hermes/model/merge/strategy.py | 42 +++++++++ src/hermes/model/types/ld_container.py | 6 +- 7 files changed, 279 insertions(+), 39 deletions(-) create mode 100644 src/hermes/model/merge/__init__.py create mode 100644 src/hermes/model/merge/action.py create mode 100644 src/hermes/model/merge/container.py create mode 100644 src/hermes/model/merge/match.py create mode 100644 src/hermes/model/merge/strategy.py diff --git a/src/hermes/commands/process/base.py b/src/hermes/commands/process/base.py index 9e29d1e6..83480056 100644 --- a/src/hermes/commands/process/base.py +++ b/src/hermes/commands/process/base.py @@ -5,13 +5,13 @@ # SPDX-FileContributor: Michael Meinel import argparse -import json -import sys from pydantic import BaseModel from hermes.commands.base import HermesCommand, HermesPlugin -from hermes.model.context import HermesHarvestContext, CodeMetaContext +from hermes.model.api import SoftwareMetadata +from hermes.model.context_manager import HermesContext +from hermes.model.merge.container import ld_merge_dict class HermesProcessPlugin(HermesPlugin): @@ -33,42 +33,21 @@ class HermesProcessCommand(HermesCommand): def __call__(self, args: argparse.Namespace) -> None: self.args = args - ctx = CodeMetaContext() - - if not (ctx.hermes_dir / "harvest").exists(): - self.log.error("You must run the harvest command before process") - sys.exit(1) + ctx = HermesContext() + merged_doc = ld_merge_dict([{}]) # Get all harvesters harvester_names = self.root_settings.harvest.sources - 
harvester_names.reverse() # Switch order for priority handling + ctx.prepare_step('harvest') for harvester in harvester_names: self.log.info("## Process data from %s", harvester) - - harvest_context = HermesHarvestContext(ctx, harvester, {}) - try: - harvest_context.load_cache() - # when the harvest step ran, but there is no cache file, this is a serious flaw - except FileNotFoundError: - self.log.warning("No output data from harvester %s found, skipping", harvester) - continue - - ctx.merge_from(harvest_context) - ctx.merge_contexts_from(harvest_context) - - if ctx._errors: - self.log.error('Errors during merge') - self.errors.extend(ctx._errors) - - for ep, error in ctx._errors: - self.log.info(" - %s: %s", ep.name, error) - - tags_path = ctx.get_cache('process', 'tags', create=True) - with tags_path.open('w') as tags_file: - json.dump(ctx.tags, tags_file, indent=2) - - ctx.prepare_codemeta() - - with open(ctx.get_cache("process", ctx.hermes_name, create=True), 'w') as codemeta_file: - json.dump(ctx._data, codemeta_file, indent=2) + merged_doc.update(SoftwareMetadata.load_from_cache(ctx, harvester)) + ctx.finalize_step("harvest") + + ctx.prepare_step("process") + with ctx["result"] as result_ctx: + result_ctx["codemeta"] = merged_doc.compact() + result_ctx["context"] = {"@context": merged_doc.full_context} + result_ctx["expanded"] = merged_doc.ld_value + ctx.finalize_step("process") diff --git a/src/hermes/model/merge/__init__.py b/src/hermes/model/merge/__init__.py new file mode 100644 index 00000000..1741dca8 --- /dev/null +++ b/src/hermes/model/merge/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2022 German Aerospace Center (DLR) +# +# SPDX-License-Identifier: Apache-2.0 \ No newline at end of file diff --git a/src/hermes/model/merge/action.py b/src/hermes/model/merge/action.py new file mode 100644 index 00000000..80f45591 --- /dev/null +++ b/src/hermes/model/merge/action.py @@ -0,0 +1,83 @@ +# SPDX-FileCopyrightText: 2025 German Aerospace Center 
(DLR) +# +# SPDX-License-Identifier: Apache-2.0 + +# SPDX-FileContributor: Michael Meinel + +from hermes.model.types import ld_list + + +class MergeError(ValueError): + pass + + +class MergeAction: + def merge(self, target, key, value, update): + raise NotImplementedError() + + +class Reject(MergeAction): + @classmethod + def merge(cls, target, key, value, update): + if value != update: + target.reject(key, update) + return value + + +class Replace(MergeAction): + @classmethod + def merge(cls, target, key, value, update): + if value != update: + target.replace(key, value) + return update + + +class Concat(MergeAction): + @classmethod + def merge(cls, target, key, value, update): + return cls.merge_to_list(value, update) + + @classmethod + def merge_to_list(cls, head, tail): + if not isinstance(head, (list, ld_list)): + head = [head] + if not isinstance(tail, (list, ld_list)): + head.append(tail) + else: + head.extend(tail) + return head + + +class Collect(MergeAction): + def __init__(self, match): + self.match = match + + def merge(self, target, key, value, update): + if not isinstance(value, list): + value = [value] + if not isinstance(update, list): + update = [update] + + for update_item in update: + if not any(self.match(item, update_item) for item in value): + value.append(update_item) + + if len(value) == 1: + return value[0] + else: + return value + + +class MergeSet(MergeAction): + def __init__(self, match, merge_items=True): + self.match = match + self.merge_items = merge_items + + def merge(self, target, key, value, update): + for item in update: + target_item = target.match(key[-1], item, self.match) + if target_item and self.merge_items: + target_item.update(item) + else: + value.append(item) + return value diff --git a/src/hermes/model/merge/container.py b/src/hermes/model/merge/container.py new file mode 100644 index 00000000..80395d87 --- /dev/null +++ b/src/hermes/model/merge/container.py @@ -0,0 +1,116 @@ +# SPDX-FileCopyrightText: 2025 German 
Aerospace Center (DLR) +# +# SPDX-License-Identifier: Apache-2.0 + +# SPDX-FileContributor: Michael Meinel + +from hermes.model.types import ld_context, ld_dict, ld_list + +from .strategy import CODEMETA_STRATEGY, PROV_STRATEGY, REPLACE_STRATEGY +from ..types.pyld_util import bundled_loader + + +class _ld_merge_container: + def _to_python(self, full_iri, ld_value): + value = super()._to_python(full_iri, ld_value) + if isinstance(value, ld_dict) and not isinstance(value, ld_merge_dict): + value = ld_merge_dict( + value.ld_value, + parent=value.parent, + key=value.key, + index=value.index, + context=value.context + ) + if isinstance(value, ld_list) and not isinstance(value, ld_merge_list): + value = ld_merge_list( + value.ld_value, + parent=value.parent, + key=value.key, + index=value.index, + context=value.context + ) + return value + + +class ld_merge_list(_ld_merge_container, ld_list): + def __init__(self, data, *, parent=None, key=None, index=None, context=None): + super().__init__(data, parent=parent, key=key, index=index, context=context) + + +class ld_merge_dict(_ld_merge_container, ld_dict): + def __init__(self, data, *, parent=None, key=None, index=None, context=None): + super().__init__(data, parent=parent, key=key, index=index, context=context) + + self.update_context(ld_context.HERMES_PROV_CONTEXT) + + self.strategies = {**REPLACE_STRATEGY} + self.add_strategy(CODEMETA_STRATEGY) + self.add_strategy(PROV_STRATEGY) + + def update_context(self, other_context): + if other_context: + if len(self.context) < 1 or not isinstance(self.context[-1], dict): + self.context.append({}) + + if not isinstance(other_context, list): + other_context = [other_context] + for ctx in other_context: + if isinstance(ctx, dict): + # FIXME: Shouldn't the dict be appended instead? 
+ # How it is implemented currently results in anomalies like this: + # other_context = [{"codemeta": "https://doi.org/10.5063/schema/codemeta-1.0/"}] + # self.context = [{"codemeta": "https://doi.org/10.5063/schema/codemeta-2.0/"}] + # resulting context is only [{"codemeta": "https://doi.org/10.5063/schema/codemeta-1.0/"}] + # values that start with "https://doi.org/10.5063/schema/codemeta-2.0/" can't be compacted anymore + self.context[-1].update(ctx) + elif ctx not in self.context: + self.context.insert(0, ctx) + + self.active_ctx = self.ld_proc.initial_ctx(self.context, {"documentLoader": bundled_loader}) + + def update(self, other): + if isinstance(other, ld_dict): + self.update_context(other.context) + + super().update(other) + + def add_strategy(self, strategy): + for key, value in strategy.items(): + self.strategies[key] = {**value, **self.strategies.get(key, {})} + + def __setitem__(self, key, value): + if key in self: + value = self._merge_item(key, value) + super().__setitem__(key, value) + + def match(self, key, value, match): + for index, item in enumerate(self[key]): + if match(item, value): + if isinstance(item, ld_dict) and not isinstance(item, ld_merge_dict): + item = ld_merge_dict( + item.ld_value, parent=item.parent, key=item.key, index=index, context=item.context + ) + elif isinstance(item, ld_list) and not isinstance(item, ld_merge_list): + item = ld_merge_list( + item.ld_value, parent=item.parent, key=item.key, index=index, context=item.context + ) + return item + + def _merge_item(self, key, value): + strategy = {**self.strategies[None]} + ld_types = self.data_dict.get('@type', []) + for ld_type in ld_types: + strategy.update(self.strategies.get(ld_type, {})) + + merger = strategy.get(key, strategy[None]) + return merger.merge(self, [*self.path, key], self[key], value) + + def _add_related(self, rel, key, value): + self.emplace(rel) + self[rel].append({"@type": "schema:PropertyValue", "schema:name": str(key), "schema:value": str(value)}) + + 
def reject(self, key, value): + self._add_related("hermes-rt:reject", key, value) + + def replace(self, key, value): + self._add_related("hermes-rt:replace", key, value) diff --git a/src/hermes/model/merge/match.py b/src/hermes/model/merge/match.py new file mode 100644 index 00000000..03b9f9ef --- /dev/null +++ b/src/hermes/model/merge/match.py @@ -0,0 +1,17 @@ +# SPDX-FileCopyrightText: 2025 German Aerospace Center (DLR) +# +# SPDX-License-Identifier: Apache-2.0 + +# SPDX-FileContributor: Michael Meinel + + +def match_equals(a, b): + return a == b + + +def match_keys(*keys): + def match_func(left, right): + active_keys = [key for key in keys if key in left and key in right] + pairs = [(left[key] == right[key]) for key in active_keys] + return len(active_keys) > 0 and all(pairs) + return match_func diff --git a/src/hermes/model/merge/strategy.py b/src/hermes/model/merge/strategy.py new file mode 100644 index 00000000..12681fe6 --- /dev/null +++ b/src/hermes/model/merge/strategy.py @@ -0,0 +1,42 @@ +# SPDX-FileCopyrightText: 2025 German Aerospace Center (DLR) +# +# SPDX-License-Identifier: Apache-2.0 + +# SPDX-FileContributor: Michael Meinel + +from hermes.model.types.ld_context import iri_map as iri + +from .action import Reject, Replace, Collect, Concat, MergeSet +from .match import match_equals, match_keys + + +REPLACE_STRATEGY = { + None: { + None: Replace, + "@type": Collect(match_equals), + }, +} + + +REJECT_STRATEGY = { + None: { + None: Reject, + "@type": Collect(match_equals), + }, +} + + +PROV_STRATEGY = { + None: { + iri["hermes-rt:graph"]: Concat, + iri["hermes-rt:replace"]: Concat, + iri["hermes-rt:reject"]: Concat, + }, +} + + +CODEMETA_STRATEGY = { + iri["schema:SoftwareSourceCode"]: { + iri["schema:author"]: MergeSet(match_keys('@id', iri['schema:email'])), + }, +} diff --git a/src/hermes/model/types/ld_container.py b/src/hermes/model/types/ld_container.py index a18c886d..f97868d9 100644 --- a/src/hermes/model/types/ld_container.py +++ 
b/src/hermes/model/types/ld_container.py @@ -237,7 +237,7 @@ def _to_expanded_json( # while searching build a path such that it leads from the found ld_dicts ld_value to selfs data_dict/ item_list parent = self path = [] - while parent.__class__.__name__ not in ("ld_dict", "SoftwareMetadata"): + while parent.__class__.__name__ not in ("ld_dict", "SoftwareMetadata", "ld_merge_dict"): if parent.container_type == "@list": path.extend(["@list", 0]) elif parent.container_type == "@graph": @@ -250,7 +250,7 @@ def _to_expanded_json( # if neither self nor any of its parents is a ld_dict: # create a dict with the key of the outer most parent of self and this parents ld_value as a value # this dict is stored in an ld_container and simulates the most minimal JSON-LD object possible - if parent.__class__.__name__ not in ("ld_dict", "SoftwareMetadata"): + if parent.__class__.__name__ not in ("ld_dict", "SoftwareMetadata", "ld_merge_dict"): key = self.ld_proc.expand_iri(parent.active_ctx, parent.key) parent = ld_container([{key: parent._data}]) path.append(0) @@ -277,7 +277,7 @@ def _to_expanded_json( [(new_key, temp) for new_key in temp.keys() if isinstance(temp[new_key], special_types)] ) elif isinstance(temp, ld_container): - if temp.__class__.__name__ == "ld_list" and temp.container_type == "@set": + if temp.__class__.__name__ in ("ld_list", "ld_merge_list") and temp.container_type == "@set": ref[key] = temp._data else: ref[key] = temp._data[0] From ebebca4e5099c1a856acfbf755077ca5d0a2aa45 Mon Sep 17 00:00:00 2001 From: notactuallyfinn Date: Fri, 6 Feb 2026 14:00:09 +0100 Subject: [PATCH 2/7] added first tests --- src/hermes/commands/__init__.py | 2 +- src/hermes/commands/cli.py | 4 +- test/hermes_test/model/test_api_e2e.py | 103 +++++++++++++++++++++++++ 3 files changed, 106 insertions(+), 3 deletions(-) diff --git a/src/hermes/commands/__init__.py b/src/hermes/commands/__init__.py index 278faddf..e1ddf036 100644 --- a/src/hermes/commands/__init__.py +++ 
b/src/hermes/commands/__init__.py @@ -14,6 +14,6 @@ # from hermes.commands.init.base import HermesInitCommand # from hermes.commands.curate.base import HermesCurateCommand from hermes.commands.harvest.base import HermesHarvestCommand -# from hermes.commands.process.base import HermesProcessCommand +from hermes.commands.process.base import HermesProcessCommand from hermes.commands.deposit.base import HermesDepositCommand # from hermes.commands.postprocess.base import HermesPostprocessCommand diff --git a/src/hermes/commands/cli.py b/src/hermes/commands/cli.py index 0ec2d1ae..d465f3b8 100644 --- a/src/hermes/commands/cli.py +++ b/src/hermes/commands/cli.py @@ -16,7 +16,7 @@ # from hermes.commands import (HermesHelpCommand, HermesVersionCommand, HermesCleanCommand, # HermesHarvestCommand, HermesProcessCommand, HermesCurateCommand, # HermesDepositCommand, HermesPostprocessCommand, HermesInitCommand) -from hermes.commands import HermesDepositCommand, HermesHarvestCommand +from hermes.commands import HermesDepositCommand, HermesHarvestCommand, HermesProcessCommand from hermes.commands.base import HermesCommand @@ -43,7 +43,7 @@ def main() -> None: # HermesInitCommand(parser), # HermesCleanCommand(parser), HermesHarvestCommand(parser), - # HermesProcessCommand(parser), + HermesProcessCommand(parser), # HermesCurateCommand(parser), HermesDepositCommand(parser), # HermesPostprocessCommand(parser), diff --git a/test/hermes_test/model/test_api_e2e.py b/test/hermes_test/model/test_api_e2e.py index 18dc973c..0eddc59b 100644 --- a/test/hermes_test/model/test_api_e2e.py +++ b/test/hermes_test/model/test_api_e2e.py @@ -475,3 +475,106 @@ def test_invenio_deposit(tmp_path, monkeypatch, sandbox_auth, metadata, invenio_ # TODO: compare to actually expected value assert result == invenio_metadata + + +@pytest.mark.parametrize( + "metadata_in, metadata_out", + [ + ( + { + "cff": SoftwareMetadata({ + "@type": ["http://schema.org/SoftwareSourceCode"], + "http://schema.org/description": 
[{"@value": "for testing"}], + "http://schema.org/name": [{"@value": "Test"}], + "http://schema.org/author": [{ + "@type": "http://schema.org/Person", + "http://schema.org/familyName": [{"@value": "Test"}], + "http://schema.org/givenName": [{"@value": "Testi"}] + }], + "http://schema.org/license": [{"@id": "https://spdx.org/licenses/Apache-2.0"}] + }) + }, + SoftwareMetadata({ + "@type": ["http://schema.org/SoftwareSourceCode"], + "http://schema.org/description": [{"@value": "for testing"}], + "http://schema.org/name": [{"@value": "Test"}], + "http://schema.org/author": [{ + "@type": "http://schema.org/Person", + "http://schema.org/familyName": [{"@value": "Test"}], + "http://schema.org/givenName": [{"@value": "Testi"}] + }], + "http://schema.org/license": [{"@id": "https://spdx.org/licenses/Apache-2.0"}] + }) + ), + ( + { + "cff": SoftwareMetadata({ + "@type": ["http://schema.org/SoftwareSourceCode"], + "http://schema.org/name": [{"@value": "Test"}], + "http://schema.org/author": [{ + "@type": "http://schema.org/Person", + "http://schema.org/familyName": [{"@value": "Test"}], + "http://schema.org/givenName": [{"@value": "Testi"}], + "http://schema.org/email": [{"@value": "test.testi@testis.tests"}] + }], + "http://schema.org/license": [{"@id": "https://spdx.org/licenses/Apache-2.0"}] + }), + "codemeta": SoftwareMetadata({ + "@type": ["http://schema.org/SoftwareSourceCode"], + "http://schema.org/description": [{"@value": "for testing"}], + "http://schema.org/name": [{"@value": "Test"}], + "http://schema.org/author": [{ + "@type": "http://schema.org/Person", + "http://schema.org/familyName": [{"@value": "Test"}], + "http://schema.org/givenName": [{"@value": "Testi"}], + "http://schema.org/email": [{"@value": "test.testi@testis.tests"}] + }] + }) + }, + SoftwareMetadata({ + "@type": ["http://schema.org/SoftwareSourceCode"], + "http://schema.org/description": [{"@value": "for testing"}], + "http://schema.org/name": [{"@value": "Test"}], + "http://schema.org/author": 
[{ + "@type": "http://schema.org/Person", + "http://schema.org/familyName": [{"@value": "Test"}], + "http://schema.org/givenName": [{"@value": "Testi"}], + "http://schema.org/email": [{"@value": "test.testi@testis.tests"}] + }], + "http://schema.org/license": [{"@id": "https://spdx.org/licenses/Apache-2.0"}] + }) + ) + ] +) +def test_process(tmp_path, monkeypatch, metadata_in, metadata_out): + monkeypatch.chdir(tmp_path) + + manager = context_manager.HermesContext(tmp_path) + manager.prepare_step("harvest") + for harvester, result in metadata_in.items(): + with manager[harvester] as cache: + cache["codemeta"] = result.compact() + cache["context"] = {"@context": result.full_context} + cache["expanded"] = result.ld_value + manager.finalize_step("harvest") + + config_file = tmp_path / "hermes.toml" + config_file.write_text(f"[harvest]\nsources = [{", ".join(f"\"{harvester}\"" for harvester in metadata_in)}]") + + orig_argv = sys.argv[:] + sys.argv = ["hermes", "process", "--path", str(tmp_path), "--config", str(config_file)] + result = {} + try: + monkeypatch.setattr(context_manager.HermesContext.__init__, "__defaults__", (tmp_path.cwd(),)) + cli.main() + except SystemExit as e: + if e.code != 0: + raise e + finally: + manager.prepare_step("process") + result = SoftwareMetadata.load_from_cache(manager, "result") + manager.finalize_step("process") + sys.argv = orig_argv + + assert result.ld_value == metadata_out.ld_value + assert result == metadata_out From f21df496ef85d61341dfa31ff15f4cbf54d42a87 Mon Sep 17 00:00:00 2001 From: Michael Fritzsche Date: Mon, 9 Feb 2026 09:16:05 +0100 Subject: [PATCH 3/7] (re)added version and help commands to the available commands --- src/hermes/commands/__init__.py | 6 +++--- src/hermes/commands/base.py | 21 +++++++++++++++++++++ src/hermes/commands/cli.py | 8 +++++--- 3 files changed, 29 insertions(+), 6 deletions(-) diff --git a/src/hermes/commands/__init__.py b/src/hermes/commands/__init__.py index e1ddf036..d239cb0e 100644 --- 
a/src/hermes/commands/__init__.py +++ b/src/hermes/commands/__init__.py @@ -8,9 +8,9 @@ # "unused import" errors. # flake8: noqa -# from hermes.commands.base import HermesHelpCommand -# from hermes.commands.base import HermesVersionCommand -# from hermes.commands.clean.base import HermesCleanCommand +from hermes.commands.base import HermesHelpCommand +from hermes.commands.base import HermesVersionCommand +from hermes.commands.clean.base import HermesCleanCommand # from hermes.commands.init.base import HermesInitCommand # from hermes.commands.curate.base import HermesCurateCommand from hermes.commands.harvest.base import HermesHarvestCommand diff --git a/src/hermes/commands/base.py b/src/hermes/commands/base.py index 2d182267..12e3c994 100644 --- a/src/hermes/commands/base.py +++ b/src/hermes/commands/base.py @@ -175,6 +175,7 @@ def __call__(self, command: HermesCommand) -> None: class HermesHelpSettings(BaseModel): + """Intentionally empty settings class for the help command.""" pass @@ -200,3 +201,23 @@ def __call__(self, args: argparse.Namespace) -> None: # Otherwise, simply show the general help and exit (cleanly). 
self.parser.print_help() self.parser.exit() + + +class HermesVersionSettings(BaseModel): + """Intentionally empty settings class for the version command.""" + pass + + +class HermesVersionCommand(HermesCommand): + """Show HERMES version and exit.""" + + command_name = "version" + settings_class = HermesVersionSettings + + def load_settings(self, args: argparse.Namespace): + """Pass loading settings as not necessary for this command.""" + pass + + def __call__(self, args: argparse.Namespace) -> None: + self.log.info(metadata.version("hermes")) + self.parser.exit() diff --git a/src/hermes/commands/cli.py b/src/hermes/commands/cli.py index d465f3b8..debe6f62 100644 --- a/src/hermes/commands/cli.py +++ b/src/hermes/commands/cli.py @@ -16,7 +16,9 @@ # from hermes.commands import (HermesHelpCommand, HermesVersionCommand, HermesCleanCommand, # HermesHarvestCommand, HermesProcessCommand, HermesCurateCommand, # HermesDepositCommand, HermesPostprocessCommand, HermesInitCommand) -from hermes.commands import HermesDepositCommand, HermesHarvestCommand, HermesProcessCommand +from hermes.commands import ( + HermesDepositCommand, HermesHarvestCommand, HermesHelpCommand, HermesProcessCommand, HermesVersionCommand +) from hermes.commands.base import HermesCommand @@ -38,8 +40,8 @@ def main() -> None: setting_types = {} for command in ( - # HermesHelpCommand(parser), - # HermesVersionCommand(parser), + HermesHelpCommand(parser), + HermesVersionCommand(parser), # HermesInitCommand(parser), # HermesCleanCommand(parser), HermesHarvestCommand(parser), From d4d9ca8d6e84edf137cf739483816a346139a151 Mon Sep 17 00:00:00 2001 From: Michael Fritzsche Date: Mon, 9 Feb 2026 09:16:50 +0100 Subject: [PATCH 4/7] made test for process step more complex --- test/hermes_test/model/test_api_e2e.py | 37 +++++++++++++++++--------- 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/test/hermes_test/model/test_api_e2e.py b/test/hermes_test/model/test_api_e2e.py index 0eddc59b..7a65098b 100644 
--- a/test/hermes_test/model/test_api_e2e.py +++ b/test/hermes_test/model/test_api_e2e.py @@ -511,12 +511,18 @@ def test_invenio_deposit(tmp_path, monkeypatch, sandbox_auth, metadata, invenio_ "cff": SoftwareMetadata({ "@type": ["http://schema.org/SoftwareSourceCode"], "http://schema.org/name": [{"@value": "Test"}], - "http://schema.org/author": [{ - "@type": "http://schema.org/Person", - "http://schema.org/familyName": [{"@value": "Test"}], - "http://schema.org/givenName": [{"@value": "Testi"}], - "http://schema.org/email": [{"@value": "test.testi@testis.tests"}] - }], + "http://schema.org/author": [ + { + "@type": "http://schema.org/Person", + "http://schema.org/familyName": [{"@value": "Test"}], + "http://schema.org/email": [{"@value": "test.testi@testis.tests"}] + }, + { + "@type": "http://schema.org/Person", + "http://schema.org/familyName": [{"@value": "Tester"}], + "http://schema.org/email": [{"@value": "test@tester.tests"}] + } + ], "http://schema.org/license": [{"@id": "https://spdx.org/licenses/Apache-2.0"}] }), "codemeta": SoftwareMetadata({ @@ -535,12 +541,19 @@ def test_invenio_deposit(tmp_path, monkeypatch, sandbox_auth, metadata, invenio_ "@type": ["http://schema.org/SoftwareSourceCode"], "http://schema.org/description": [{"@value": "for testing"}], "http://schema.org/name": [{"@value": "Test"}], - "http://schema.org/author": [{ - "@type": "http://schema.org/Person", - "http://schema.org/familyName": [{"@value": "Test"}], - "http://schema.org/givenName": [{"@value": "Testi"}], - "http://schema.org/email": [{"@value": "test.testi@testis.tests"}] - }], + "http://schema.org/author": [ + { + "@type": "http://schema.org/Person", + "http://schema.org/familyName": [{"@value": "Test"}], + "http://schema.org/givenName": [{"@value": "Testi"}], + "http://schema.org/email": [{"@value": "test.testi@testis.tests"}] + }, + { + "@type": "http://schema.org/Person", + "http://schema.org/familyName": [{"@value": "Tester"}], + "http://schema.org/email": [{"@value": 
"test@tester.tests"}] + } + ], "http://schema.org/license": [{"@id": "https://spdx.org/licenses/Apache-2.0"}] }) ) From 7cfa7bcc7be101dd6580ead1d933f762e768d280 Mon Sep 17 00:00:00 2001 From: Michael Fritzsche Date: Mon, 9 Feb 2026 09:18:19 +0100 Subject: [PATCH 5/7] made process step and ld_container._to_expanded_json more robust --- src/hermes/commands/process/base.py | 9 ++++++++- src/hermes/model/types/ld_container.py | 6 +++--- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/hermes/commands/process/base.py b/src/hermes/commands/process/base.py index 83480056..1aae0dab 100644 --- a/src/hermes/commands/process/base.py +++ b/src/hermes/commands/process/base.py @@ -11,6 +11,7 @@ from hermes.commands.base import HermesCommand, HermesPlugin from hermes.model.api import SoftwareMetadata from hermes.model.context_manager import HermesContext +from hermes.model.error import HermesContextError from hermes.model.merge.container import ld_merge_dict @@ -42,7 +43,13 @@ def __call__(self, args: argparse.Namespace) -> None: ctx.prepare_step('harvest') for harvester in harvester_names: self.log.info("## Process data from %s", harvester) - merged_doc.update(SoftwareMetadata.load_from_cache(ctx, harvester)) + try: + metadata = SoftwareMetadata.load_from_cache(ctx, harvester) + except HermesContextError as e: + self.log.error("Error while trying to load data from harvest plugin '%s': %s", harvester, e) + self.errors.append(e) + continue + merged_doc.update(metadata) ctx.finalize_step("harvest") ctx.prepare_step("process") diff --git a/src/hermes/model/types/ld_container.py b/src/hermes/model/types/ld_container.py index f97868d9..756f2033 100644 --- a/src/hermes/model/types/ld_container.py +++ b/src/hermes/model/types/ld_container.py @@ -237,7 +237,7 @@ def _to_expanded_json( # while searching build a path such that it leads from the found ld_dicts ld_value to selfs data_dict/ item_list parent = self path = [] - while parent.__class__.__name__ not in 
("ld_dict", "SoftwareMetadata", "ld_merge_dict"): + while not "ld_dict" in [sub_cls.__name__ for sub_cls in type(parent).mro()]: if parent.container_type == "@list": path.extend(["@list", 0]) elif parent.container_type == "@graph": @@ -250,7 +250,7 @@ def _to_expanded_json( # if neither self nor any of its parents is a ld_dict: # create a dict with the key of the outer most parent of self and this parents ld_value as a value # this dict is stored in an ld_container and simulates the most minimal JSON-LD object possible - if parent.__class__.__name__ not in ("ld_dict", "SoftwareMetadata", "ld_merge_dict"): + if not "ld_dict" in [sub_cls.__name__ for sub_cls in type(parent).mro()]: key = self.ld_proc.expand_iri(parent.active_ctx, parent.key) parent = ld_container([{key: parent._data}]) path.append(0) @@ -277,7 +277,7 @@ def _to_expanded_json( [(new_key, temp) for new_key in temp.keys() if isinstance(temp[new_key], special_types)] ) elif isinstance(temp, ld_container): - if temp.__class__.__name__ in ("ld_list", "ld_merge_list") and temp.container_type == "@set": + if "ld_list" in [sub_cls.__name__ for sub_cls in type(temp).mro()] and temp.container_type == "@set": ref[key] = temp._data else: ref[key] = temp._data[0] From 520ef39bf267643f32ab13da06d10db22a014565 Mon Sep 17 00:00:00 2001 From: Michael Fritzsche Date: Mon, 9 Feb 2026 09:26:51 +0100 Subject: [PATCH 6/7] improved flake8 rating --- src/hermes/model/merge/__init__.py | 2 +- src/hermes/model/types/ld_container.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/hermes/model/merge/__init__.py b/src/hermes/model/merge/__init__.py index 1741dca8..faf5a2f5 100644 --- a/src/hermes/model/merge/__init__.py +++ b/src/hermes/model/merge/__init__.py @@ -1,3 +1,3 @@ # SPDX-FileCopyrightText: 2022 German Aerospace Center (DLR) # -# SPDX-License-Identifier: Apache-2.0 \ No newline at end of file +# SPDX-License-Identifier: Apache-2.0 diff --git a/src/hermes/model/types/ld_container.py 
b/src/hermes/model/types/ld_container.py index 756f2033..f30a212c 100644 --- a/src/hermes/model/types/ld_container.py +++ b/src/hermes/model/types/ld_container.py @@ -237,7 +237,7 @@ def _to_expanded_json( # while searching build a path such that it leads from the found ld_dicts ld_value to selfs data_dict/ item_list parent = self path = [] - while not "ld_dict" in [sub_cls.__name__ for sub_cls in type(parent).mro()]: + while "ld_dict" not in [sub_cls.__name__ for sub_cls in type(parent).mro()]: if parent.container_type == "@list": path.extend(["@list", 0]) elif parent.container_type == "@graph": @@ -250,7 +250,7 @@ def _to_expanded_json( # if neither self nor any of its parents is a ld_dict: # create a dict with the key of the outer most parent of self and this parents ld_value as a value # this dict is stored in an ld_container and simulates the most minimal JSON-LD object possible - if not "ld_dict" in [sub_cls.__name__ for sub_cls in type(parent).mro()]: + if "ld_dict" not in [sub_cls.__name__ for sub_cls in type(parent).mro()]: key = self.ld_proc.expand_iri(parent.active_ctx, parent.key) parent = ld_container([{key: parent._data}]) path.append(0) From bcdc82124a1a6f3cacd0398bcf3a978ae8a18b57 Mon Sep 17 00:00:00 2001 From: notactuallyfinn Date: Fri, 13 Feb 2026 13:54:47 +0100 Subject: [PATCH 7/7] added lots of comments and fixed small inconsistencies --- src/hermes/commands/deposit/invenio.py | 4 +- src/hermes/model/merge/container.py | 301 +++++++++++++++++++++++-- src/hermes/model/merge/match.py | 53 ++++- src/hermes/model/types/ld_container.py | 6 +- src/hermes/model/types/ld_list.py | 4 +- test/hermes_test/model/test_api_e2e.py | 9 +- 6 files changed, 342 insertions(+), 35 deletions(-) diff --git a/src/hermes/commands/deposit/invenio.py b/src/hermes/commands/deposit/invenio.py index 3915d536..ba45c146 100644 --- a/src/hermes/commands/deposit/invenio.py +++ b/src/hermes/commands/deposit/invenio.py @@ -513,7 +513,7 @@ def _codemeta_to_invenio_deposition(self) 
-> dict: creators = [] for author in metadata.get("author", []): - if not "Person" in author.get("@type", []): + if "Person" not in author.get("@type", []): continue creator = {} if len( @@ -527,7 +527,7 @@ def _codemeta_to_invenio_deposition(self) -> dict: raise HermesValidationError(f"Author has too many family names: {author}") if len(author.get("familyName", [])) == 1: given_names_str = " ".join(author.get("givenName", [])) - name = f"{author["familyName"][0]}, {given_names_str}" + name = f"{author['familyName'][0]}, {given_names_str}" elif len(author.get("name", [])) != 1: raise HermesValidationError(f"Author has too many or no names: {author}") else: diff --git a/src/hermes/model/merge/container.py b/src/hermes/model/merge/container.py index 80395d87..ec9fedd9 100644 --- a/src/hermes/model/merge/container.py +++ b/src/hermes/model/merge/container.py @@ -3,16 +3,49 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileContributor: Michael Meinel +# SPDX-FileContributor: Michael Fritzsche -from hermes.model.types import ld_context, ld_dict, ld_list +from typing import Callable, Union +from typing_extensions import Self + +from hermes.model.merge.action import MergeAction +from hermes.model.types import ld_container, ld_context, ld_dict, ld_list +from hermes.model.types.ld_container import ( + BASIC_TYPE, EXPANDED_JSON_LD_VALUE, JSON_LD_CONTEXT_DICT, JSON_LD_VALUE, TIME_TYPE +) from .strategy import CODEMETA_STRATEGY, PROV_STRATEGY, REPLACE_STRATEGY from ..types.pyld_util import bundled_loader class _ld_merge_container: - def _to_python(self, full_iri, ld_value): + """ + Abstract base class for ld_merge_dict and ld_merge_list, + providing the merge containers with overrides of ld_container._to_python(). + See also :class:`ld_dict`, :class:`ld_list` and :class:`ld_container`. 
+ """ + + def _to_python( + self: Self, + full_iri: str, + ld_value: Union[EXPANDED_JSON_LD_VALUE, dict[str, EXPANDED_JSON_LD_VALUE], list[str], str] + ) -> Union["ld_merge_dict", "ld_merge_list", BASIC_TYPE, TIME_TYPE]: + """ + Returns a pythonized version of the given value pretending the value is in self and full_iri its key. + + :param self: the ld_container ld_value is considered to be in. + :type self: Self + :param full_iri: The expanded iri of the key of ld_value / self (later if self is not a dictionary). + :type full_iri: str + :param ld_value: The value thats pythonized value is requested. ld_value has to be valid expanded JSON-LD if it + was embeded in self._data. + :type ld_value: EXPANDED_JSON_LD_VALUE | dict[str, EXPANDED_JSON_LD_VALUE] | list[str] | str + + :return: The pythonized value of the ld_value. + :rtype: ld_merge_dict | ld_merge_list | BASIC_TYPE | TIME_TYPE + """ value = super()._to_python(full_iri, ld_value) + # replace ld_dicts with ld_merge_dicts if isinstance(value, ld_dict) and not isinstance(value, ld_merge_dict): value = ld_merge_dict( value.ld_value, @@ -21,6 +54,7 @@ def _to_python(self, full_iri, ld_value): index=value.index, context=value.context ) + # replace ld_lists with ld_merge_lists if isinstance(value, ld_list) and not isinstance(value, ld_merge_list): value = ld_merge_list( value.ld_value, @@ -33,21 +67,108 @@ def _to_python(self, full_iri, ld_value): class ld_merge_list(_ld_merge_container, ld_list): - def __init__(self, data, *, parent=None, key=None, index=None, context=None): + """ + ld_list wrapper to ensure the 'merge_container'-property does not get lost, while merging. + See also :class:`ld_list` and :class:`ld_merge_container`. 
+ """ + + def __init__( + self: "ld_merge_list", + data: Union[list[str], list[dict[str, EXPANDED_JSON_LD_VALUE]]], + *, + parent: Union[ld_container, None] = None, + key: Union[str, None] = None, + index: Union[int, None] = None, + context: Union[list[Union[str, JSON_LD_CONTEXT_DICT]], None] = None + ) -> None: + """ + Create a new ld_merge_list. + For further information on this function and the errors it throws see :meth:`ld_list.__init__`. + + :param self: The instance of ld_merge_list to be initialized. + :type self: Self + :param data: The expanded json-ld data that is mapped (must be valid for @set, @list or @graph) + :type data: list[str] | list[dict[str, BASIC_TYPE | EXPANDED_JSON_LD_VALUE]] + :param parent: parent node of this container. + :type parent: ld_container | None + :param key: key into the parent container. + :type key: str | None + :param index: index into the parent container. + :type index: int | None + :param context: local context for this container. + :type context: list[str | JSON_LD_CONTEXT_DICT] | None + + :return: + :rtype: None + """ super().__init__(data, parent=parent, key=key, index=index, context=context) class ld_merge_dict(_ld_merge_container, ld_dict): - def __init__(self, data, *, parent=None, key=None, index=None, context=None): + """ + ld_dict wrapper providing methods to merge an object of this class with an ld_dict object. + See also :class:`ld_dict` and :class:`ld_merge_container`. + + :ivar strategies: The strategies for merging different types of values in the ld_dicts. + :ivartype strategies: dict[str | None, dict[str | None, MergeAction]] + """ + + def __init__( + self: Self, + data: list[dict[str, EXPANDED_JSON_LD_VALUE]], + *, + parent: Union[ld_dict, ld_list, None] = None, + key: Union[str, None] = None, + index: Union[int, None] = None, + context: Union[list[Union[str, JSON_LD_CONTEXT_DICT]], None] = None + ) -> None: + """ + Create a new instance of an ld_merge_dict. + See also :meth:`ld_dict.__init__`. 
+ + :param self: The instance of ld_container to be initialized. + :type self: Self + :param data: The expanded json-ld data that is mapped. + :type data: EXPANDED_JSON_LD_VALUE + :param parent: parent node of this container. + :type parent: ld_dict | ld_list | None + :param key: key into the parent container. + :type key: str | None + :param index: index into the parent container. + :type index: int | None + :param context: local context for this container. + :type context: list[str | JSON_LD_CONTEXT_DICT] | None + + :return: + :rtype: None + + :raises ValueError: If the given data doesn't represent an ld_dict. + """ super().__init__(data, parent=parent, key=key, index=index, context=context) + # add provenance context self.update_context(ld_context.HERMES_PROV_CONTEXT) + # add strategies self.strategies = {**REPLACE_STRATEGY} self.add_strategy(CODEMETA_STRATEGY) self.add_strategy(PROV_STRATEGY) - def update_context(self, other_context): + def update_context( + self: Self, other_context: Union[list[Union[str, JSON_LD_CONTEXT_DICT]], None] + ) -> None: + """ + Updates self's context with other_context. + JSON-LD processing prioritizes the context values in order (first least important, last most important). + + :param self: The instance of the ld_merge_dict context is added to. + :type self: Self + :param other_context: The context object that is added to self's context. + :type other_context: list[str | JSON_LD_CONTEXT_DICT] | None + + :return: + :rtype: None + """ if other_context: if len(self.context) < 1 or not isinstance(self.context[-1], dict): self.context.append({}) @@ -56,7 +177,7 @@ def update_context(self, other_context): other_context = [other_context] for ctx in other_context: if isinstance(ctx, dict): - # FIXME: Shouldn't the dict be appended instead? + # FIXME #471: Shouldn't the dict be appended instead?
# How it is implemented currently results in anomalies like this: # other_context = [{"codemeta": "https://doi.org/10.5063/schema/codemeta-1.0/"}] # self.context = [{"codemeta": "https://doi.org/10.5063/schema/codemeta-2.0/"}] @@ -64,53 +185,187 @@ def update_context(self, other_context): # values that start with "https://doi.org/10.5063/schema/codemeta-2.0/" can't be compacted anymore self.context[-1].update(ctx) elif ctx not in self.context: + # FIXME #471: If multiple string values are in self.context, the others are preferred + # if the new one is inserted at the beginning. But with the dictionaries the order is reversed. self.context.insert(0, ctx) + # update the active context that is used for compaction/ expansion self.active_ctx = self.ld_proc.initial_ctx(self.context, {"documentLoader": bundled_loader}) - def update(self, other): + def update(self: Self, other: ld_dict) -> None: + """ + Updates/ Merges this ld_merge_dict with the given ld_dict other. + This overrides :meth:`ld_dict.update`, and may cause unexpected behavior if not used carefully. + + :param self: The ld_merge_dict that is updated with other. + :type self: Self + :param other: The ld_container that is merged into self. + :type other: ld_dict + + :return: + :rtype: None + """ + # add all new context if isinstance(other, ld_dict): self.update_context(other.context) + # add the actual values based on the MergeAction strategies + # this works implicitly because ld_dict.update invokes self.__setitem__ which is overridden by ld_merge_dict super().update(other) - def add_strategy(self, strategy): + def add_strategy(self: Self, strategy: dict[Union[str, None], dict[Union[str, None], MergeAction]]) -> None: + """ + Adds the given strategy to self.strategies. + + :param self: The ld_merge_dict the strategy is added to. + :type self: Self + :param strategy: The object describing how which object types are supposed to be merged.
+ :type strategy: dict[str | None, dict[str | None, MergeAction]] + """ for key, value in strategy.items(): self.strategies[key] = {**value, **self.strategies.get(key, {})} - def __setitem__(self, key, value): + def __setitem__(self: Self, key: str, value: Union[JSON_LD_VALUE, BASIC_TYPE, TIME_TYPE, ld_dict, ld_list]): + """ + Creates the new entry for self[key] using self.strategies on the values in self[key] and value. + Wraps :meth:`ld_dict.__setitem__`, and may cause unexpected behavior if not used carefully. + + :param self: The ld_merge_dict whose value at key gets updated/ merged with value. + :type self: Self + :param key: The key at which the value is updated/ merged in self. + :type key: str + :param value: The value that is merged into self[key]. + :type value: JSON_LD_VALUE | BASIC_TYPE | TIME_TYPE | ld_dict | ld_list + """ + # create the new item if self[key] and value have to be merged. if key in self: value = self._merge_item(key, value) + # update the entry of self[key] super().__setitem__(key, value) - def match(self, key, value, match): - for index, item in enumerate(self[key]): + def match( + self: Self, + key: str, + value: Union[BASIC_TYPE, TIME_TYPE, ld_dict, ld_list], + match: Union[ + Callable[ + [ + Union[BASIC_TYPE, TIME_TYPE, "ld_merge_dict", ld_merge_list], + Union[BASIC_TYPE, TIME_TYPE, ld_dict, ld_list] + ], + bool + ], + Callable[["ld_merge_dict", ld_dict], bool] + ] + ) -> Union[BASIC_TYPE, TIME_TYPE, "ld_merge_dict", ld_merge_list]: + """ + Returns the first item in self[key] for which match(item, value) returns true. + If no such item is found None is returned instead. + + :param self: The ld_merge_dict in whose entry for key a match for value is searched. + :type self: Self + :param key: The key to the items in self in which a match for value is searched. + :type key: str + :param value: The value a match is searched for in self[key].
+ :type value: Union[JSON_LD_VALUE, BASIC_TYPE, TIME_TYPE, ld_dict, ld_list] + :param match: The method defining if two objects are a match. + :type match: Callable[ + [ + BASIC_TYPE | TIME_TYPE | ld_merge_dict | ld_merge_list, + BASIC_TYPE | TIME_TYPE | ld_dict | ld_list + ], + bool + ] | Callable[[ld_merge_dict, ld_dict], bool] + + :return: The item in self[key] that is a match to value if one exists else None + :rtype: BASIC_TYPE | TIME_TYPE | ld_merge_dict | ld_merge_list + """ + # iterate over all items in self[key] and return the first that is a match + for item in self[key]: if match(item, value): - if isinstance(item, ld_dict) and not isinstance(item, ld_merge_dict): - item = ld_merge_dict( - item.ld_value, parent=item.parent, key=item.key, index=index, context=item.context - ) - elif isinstance(item, ld_list) and not isinstance(item, ld_merge_list): - item = ld_merge_list( - item.ld_value, parent=item.parent, key=item.key, index=index, context=item.context - ) return item - def _merge_item(self, key, value): + def _merge_item( + self: Self, key: str, value: Union[BASIC_TYPE, TIME_TYPE, ld_dict, ld_list] + ) -> Union[BASIC_TYPE, TIME_TYPE, "ld_merge_dict", ld_merge_list]: + """ + Applies the most suitable merge strategy to merge self[key] and value and then returns the result. + + :param self: The ld_merge_dict whose entry at key is to be merged with value. + :type self: Self + :param key: The key to the entry in self that is to be merged with value. + :type key: str + :param value: The value that is to be merged with self[key]. + :type value: BASIC_TYPE | TIME_TYPE | ld_dict | ld_list + + :return: The result of the merge from self[key] with value. 
+ :rtype: BASIC_TYPE | TIME_TYPE | ld_merge_dict | ld_merge_list + """ + # search for all applicable strategies strategy = {**self.strategies[None]} ld_types = self.data_dict.get('@type', []) for ld_type in ld_types: strategy.update(self.strategies.get(ld_type, {})) + # choose one merge strategy and return the item returned by following the merge strategy merger = strategy.get(key, strategy[None]) return merger.merge(self, [*self.path, key], self[key], value) - def _add_related(self, rel, key, value): + def _add_related( + self: Self, rel: str, key: str, value: Union[BASIC_TYPE, TIME_TYPE, ld_dict, ld_list] + ) -> None: + """ + Adds an entry for rel to self containing which key and value are affected. + + :param self: The ld_merge_dict the special entry is added to. + :type self: Self + :param rel: The "type" of the special entry (used as the key). + :type rel: str + :param key: The key of the affected key, value pair in self. + :type key: str + :param value: The value of the affected key, value pair in self. + :type value: BASIC_TYPE | TIME_TYPE | ld_dict | ld_list + + :return: + :rtype: None + """ + # make sure appending is possible self.emplace(rel) + # append the new entry self[rel].append({"@type": "schema:PropertyValue", "schema:name": str(key), "schema:value": str(value)}) - def reject(self, key, value): + def reject(self: Self, key: str, value: Union[BASIC_TYPE, TIME_TYPE, ld_dict, ld_list]) -> None: + """ + Adds an entry to self containing information that the key, value pair + key, value has been rejected in the merge. + For further information see :meth:`ld_merge_dict._add_related`. + + :param self: The ld_merge_dict the special entry is added to. + :type self: Self + :param key: The key of the rejected key, value pair in self. + :type key: str + :param value: The value of the rejected key, value pair in self.
+ :type value: BASIC_TYPE | TIME_TYPE | ld_dict | ld_list + + :return: + :rtype: None + """ self._add_related("hermes-rt:reject", key, value) - def replace(self, key, value): + def replace(self: Self, key: str, value: Union[BASIC_TYPE, TIME_TYPE, ld_dict, ld_list]) -> None: + """ + Adds an entry to self containing information that the key, value pair + key, value was replaced in the merge. + For further information see :meth:`ld_merge_dict._add_related`. + + :param self: The ld_merge_dict the special entry is added to. + :type self: Self + :param key: The key of the old key, value pair in self. + :type key: str + :param value: The value of the old key, value pair in self. + :type value: BASIC_TYPE | TIME_TYPE | ld_dict | ld_list + + :return: + :rtype: None + """ self._add_related("hermes-rt:replace", key, value) diff --git a/src/hermes/model/merge/match.py b/src/hermes/model/merge/match.py index 03b9f9ef..77abca35 100644 --- a/src/hermes/model/merge/match.py +++ b/src/hermes/model/merge/match.py @@ -4,14 +4,61 @@ # SPDX-FileContributor: Michael Meinel +from typing import Any, Callable -def match_equals(a, b): +from hermes.model.merge.container import ld_merge_dict +from hermes.model.types import ld_dict + + +def match_equals(a: Any, b: Any) -> bool: + """ + Wrapper method for normal == comparison. + + :param a: First item for the comparison. + :type a: Any + :param b: Second item for the comparison. + :type b: Any + + :return: Truth value of a == b. + :rtype: bool + """ return a == b -def match_keys(*keys): - def match_func(left, right): +def match_keys( + *keys: list[str] +) -> Callable[[ld_merge_dict, ld_dict], bool]: + """ + Creates a function taking two parameters that returns true + if both given parameters have at least one common key in the given list of keys + and for all common keys in the given list of keys the values of both objects are the same. + + :param keys: The list of important keys for the comparison method.
+ :type keys: list[str] + + :return: A function comparing two given objects values for the keys in keys. + :rtype: Callable[[ld_merge_dict, ld_dict], bool] + """ + + # create and return the match function using the given keys + def match_func(left: ld_merge_dict, right: ld_dict) -> bool: + """ + Compares left to right by checking if a) they have at least one common key in a predetermined list of keys and + b) testing if both objects have equal values for all common keys in the predetermined key list. + + :param left: The first object for the comparison. + :type left: ld_merge_dict + :param right: The second object for the comparison. + :type right: ld_dict + + :return: The result of the comparison. + :rtype: bool + """ + # create a list of all common important keys active_keys = [key for key in keys if key in left and key in right] + # check if both objects have the same values for all active keys pairs = [(left[key] == right[key]) for key in active_keys] + # return whether or not both objects had the same values for all active keys + # and there was at least one active key return len(active_keys) > 0 and all(pairs) return match_func diff --git a/src/hermes/model/types/ld_container.py b/src/hermes/model/types/ld_container.py index f30a212c..b2456017 100644 --- a/src/hermes/model/types/ld_container.py +++ b/src/hermes/model/types/ld_container.py @@ -176,7 +176,9 @@ def ld_value(self: Self) -> EXPANDED_JSON_LD_VALUE: return self._data def _to_python( - self: Self, full_iri: str, ld_value: Union[list, dict, str] + self: Self, + full_iri: str, + ld_value: Union[EXPANDED_JSON_LD_VALUE, dict[str, EXPANDED_JSON_LD_VALUE], list[str], str] ) -> Union["ld_container", BASIC_TYPE, TIME_TYPE]: """ Returns a pythonized version of the given value pretending the value is in self and full_iri its key. @@ -187,7 +189,7 @@ def _to_python( :type full_iri: str :param ld_value: The value thats pythonized value is requested. 
ld_value has to be valid expanded JSON-LD if it was embeded in self._data. - :type ld_value: list | dict | str + :type ld_value: EXPANDED_JSON_LD_VALUE | dict[str, EXPANDED_JSON_LD_VALUE] | list[str] | str :return: The pythonized value of the ld_value. :rtype: ld_container | BASIC_TYPE | TIME_TYPE diff --git a/src/hermes/model/types/ld_list.py b/src/hermes/model/types/ld_list.py index c4d1c450..a76db3b6 100644 --- a/src/hermes/model/types/ld_list.py +++ b/src/hermes/model/types/ld_list.py @@ -23,7 +23,7 @@ class ld_list(ld_container): """ An JSON-LD container resembling a list ("@set", "@list" or "@graph"). - See also :class:`ld_container` + See also :class:`ld_container`. :ivar container_type: The type of JSON-LD container the list is representing. ("@set", "@list", "graph") :ivartype container_type: str @@ -35,7 +35,7 @@ def __init__( self: Self, data: Union[list[str], list[dict[str, EXPANDED_JSON_LD_VALUE]]], *, - parent: Union["ld_container", None] = None, + parent: Union[ld_container, None] = None, key: Union[str, None] = None, index: Union[int, None] = None, context: Union[list[Union[str, JSON_LD_CONTEXT_DICT]], None] = None, diff --git a/test/hermes_test/model/test_api_e2e.py b/test/hermes_test/model/test_api_e2e.py index 7a65098b..f756f101 100644 --- a/test/hermes_test/model/test_api_e2e.py +++ b/test/hermes_test/model/test_api_e2e.py @@ -4,6 +4,7 @@ # SPDX-FileContributor: Michael Fritzsche +from datetime import date import json import pytest import sys @@ -422,7 +423,7 @@ def test_file_deposit(tmp_path, monkeypatch, metadata): }), { "upload_type": "software", - "publication_date": "2026-02-02", + "publication_date": date.today().isoformat(), "title": "Test", "creators": [{"name": "Test, Testi"}], "description": "for testing", @@ -445,6 +446,8 @@ def test_invenio_deposit(tmp_path, monkeypatch, sandbox_auth, metadata, invenio_ cache["codemeta"] = metadata.compact() manager.finalize_step("curate") + (tmp_path / "test.txt").write_text("Test, oh wonderful 
test!\n") + config_file = tmp_path / "hermes.toml" config_file.write_text(f"""[deposit] target = "invenio" @@ -452,7 +455,7 @@ def test_invenio_deposit(tmp_path, monkeypatch, sandbox_auth, metadata, invenio_ site_url = "https://sandbox.zenodo.org" access_right = "closed" auth_token = "{sandbox_auth}" -files = ["hermes.toml"] +files = ["test.txt"] [deposit.invenio.api_paths] licenses = "api/vocabularies/licenses" """) @@ -572,7 +575,7 @@ def test_process(tmp_path, monkeypatch, metadata_in, metadata_out): manager.finalize_step("harvest") config_file = tmp_path / "hermes.toml" - config_file.write_text(f"[harvest]\nsources = [{", ".join(f"\"{harvester}\"" for harvester in metadata_in)}]") + config_file.write_text(f"[harvest]\nsources = [{', '.join(f'\"{harvester}\"' for harvester in metadata_in)}]") orig_argv = sys.argv[:] sys.argv = ["hermes", "process", "--path", str(tmp_path), "--config", str(config_file)]