diff --git a/docs/changes/newsfragments/498.feature b/docs/changes/newsfragments/498.feature new file mode 100644 index 000000000..c7a14e19e --- /dev/null +++ b/docs/changes/newsfragments/498.feature @@ -0,0 +1 @@ +Introduce :func:`.generate_yaml` to generate feature YAML from metadata by `Synchon Mandal`_ diff --git a/docs/links.inc b/docs/links.inc index dfc113e5b..fcaf3ebad 100644 --- a/docs/links.inc +++ b/docs/links.inc @@ -13,6 +13,7 @@ .. _`INM-7`: https://www.fz-juelich.de/inm/inm-7/EN/Home/home_node.html .. _`julearn`: https://juaml.github.io/julearn .. _`junifer-data`: https://github.com/juaml/junifer-data-client +.. _`julio`: https://github.com/juaml/julio .. _`pandas`: https://pandas.pydata.org .. _`pandas.DataFrame` : https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html diff --git a/docs/using/generate_yaml.rst b/docs/using/generate_yaml.rst new file mode 100644 index 000000000..ba9dcb1f4 --- /dev/null +++ b/docs/using/generate_yaml.rst @@ -0,0 +1,56 @@ +.. include:: ../links.inc + +.. _generate_yaml: + +Generating YAML from metadata +============================= + +``junifer`` stores the pipeline metadata for a run along with the extracted feature data. +So, the metadata for all the "elements" processed with a pipeline is unique. The metadata +contains all the necessary information to recreate the configuration used for the processing. + +If one wants to generate the processing YAML, :func:`.generate_yaml` can be used for that. +The only requirement is providing the metadata which can be extracted by following the initial steps of +:ref:`analysing results `. + + +Configuration for ``julio`` +--------------------------- + +When generating a registry with `julio`_, we can configure the YAML generation process. For now, only +DataGrabbers can be configured through the use of ``_dump_exclude`` class variable, like so: + + .. 
@register_datagrabber
class MyDataGrabber(PatternDataladDataGrabber):
    """Example DataGrabber that customises the generated YAML.

    The field names collected in ``_dump_exclude`` are the ones left out
    when the DataGrabber is dumped during YAML generation.

    """

    _dump_exclude: ClassVar[set[str]] = {
        # Pattern-matching related fields
        "patterns",
        "replacements",
        "confounds_format",
        "partial_pattern_ok",
    } | {
        # Dataset location and DataLad bookkeeping fields
        "uri",
        "rootdir",
        "datadir",
        "datalad_id",
        "datalad_dirty",
        "datalad_commit_id",
    }
def _component_to_yaml(
    step: str,
    component_meta: dict,
    exclude: "set[str] | None" = None,
    use_dump_exclude: bool = False,
) -> dict[str, Any]:
    """Rebuild the YAML mapping of one pipeline component from its metadata.

    Parameters
    ----------
    step : str
        Pipeline step under which the component class is registered.
    component_meta : dict
        Metadata of the component; must contain the ``"class"`` key.
        It is copied before use and never modified.
    exclude : set of str or None, optional
        Field names to leave out of the dump (default None).
    use_dump_exclude : bool, optional
        If True, ``exclude`` is replaced by the component's
        ``_dump_exclude`` attribute when it defines one (default False).

    Returns
    -------
    dict
        The component name under ``"kind"`` followed by its dumped fields.

    Raises
    ------
    KeyError
        If ``component_meta`` has no ``"class"`` key.

    """
    # Work on a copy: popping "class" must not alter the caller's metadata,
    # otherwise a second call with the same metadata would fail
    params = dict(component_meta)
    kind = params.pop("class")
    component_cls = PipelineComponentRegistry().get_class(
        step=step, name=kind
    )
    model = component_cls.model_construct(**params)
    if use_dump_exclude:
        exclude = getattr(model, "_dump_exclude", exclude)
    return {
        "kind": kind,
        **model.model_dump(
            mode="json",
            exclude=exclude,
            exclude_defaults=True,
            exclude_none=True,
        ),
    }


def generate_yaml(meta: dict) -> "CommentedMap":
    """Generate the feature YAML from metadata.

    The sections are emitted in the order: ``workdir``, ``with``
    (if present in the metadata), ``datagrabber``, ``preprocess``
    (if present in the metadata), ``markers``, ``storage`` and ``queue``.
    The metadata passed in is not modified.

    Parameters
    ----------
    meta : dict
        Feature metadata as dictionary. The ``"datagrabber"`` and
        ``"marker"`` keys are required, as is ``"name"`` when no
        ``"queue"`` key is present.

    Returns
    -------
    ruamel.yaml.comments.CommentedMap
        Feature YAML.

    Raises
    ------
    KeyError
        If a required key is missing from ``meta`` or a component's
        metadata has no ``"class"`` key.

    """
    # NOTE(review): "workdir", the storage "uri" and the fallback queue
    # "kind" are emitted as empty strings; presumably they are meant to be
    # filled in by the user afterwards -- confirm
    config: dict[str, Any] = {"workdir": ""}
    # Add "with" section if present
    if "with" in meta:
        config["with"] = meta["with"].copy()
    # Set datagrabber; only this step honours ``_dump_exclude``
    config["datagrabber"] = _component_to_yaml(
        "datagrabber", meta["datagrabber"], use_dump_exclude=True
    )
    # Set preprocessor(s); a single preprocessor may come as a bare mapping
    if "preprocess" in meta:
        preprocess_meta = meta["preprocess"]
        if not isinstance(preprocess_meta, list):
            preprocess_meta = [preprocess_meta]
        config["preprocess"] = [
            _component_to_yaml(
                "preprocessing",
                single_meta,
                exclude={"required_data_types"},
            )
            for single_meta in preprocess_meta
        ]
    # Set marker
    config["markers"] = [_component_to_yaml("marker", meta["marker"])]
    # Set storage
    config["storage"] = {
        "kind": "HDF5FeatureStorage",
        "uri": "",
    }
    # Set queue
    if "queue" in meta:
        config["queue"] = meta["queue"].copy()
    else:
        config["queue"] = {
            "jobname": meta["name"],
            "kind": "",
        }
    # Dump and load yaml to format; the round trip yields the object that
    # the comment-setting calls below operate on
    buffer = io.StringIO()
    yaml.dump(config, stream=buffer)
    buffer.seek(0)
    document = yaml.load(buffer)
    # Add preamble; ``dt.timezone.utc`` is used instead of ``dt.UTC`` as the
    # latter only exists on Python 3.11+
    timestamp = dt.datetime.now(tz=dt.timezone.utc).strftime(
        "%Y-%m-%d %H:%M:%S"
    )
    preamble = f"Auto-generated by junifer on {timestamp} UTC\n\n"
    preamble += "".join(
        f"{package}=={version}\n"
        for package, version in meta.get("dependencies", {}).items()
    )
    document.yaml_set_start_comment(preamble)
    # Add newline between sections
    for section in document:
        document.yaml_set_comment_before_after_key(section, before="\n")
    return document
100644 --- a/junifer/configs/juseless/datagrabbers/ixi_vbm.py +++ b/junifer/configs/juseless/datagrabbers/ixi_vbm.py @@ -48,6 +48,19 @@ class JuselessDataladIXIVBM(PatternDataladDataGrabber): """ + _dump_exclude: ClassVar[set[str]] = { + "patterns", + "replacements", + "confounds_format", + "partial_pattern_ok", + "uri", + "rootdir", + "datadir", + "datalad_id", + "datalad_dirty", + "datalad_commit_id", + } + uri: AnyUrl = AnyUrl( "ria+http://cat_12.5.ds.inm7.de#b7107c52-8408-11ea-89c6-a0369f287950" ) diff --git a/junifer/configs/juseless/datagrabbers/ukb_vbm.py b/junifer/configs/juseless/datagrabbers/ukb_vbm.py index 1f05a8bb8..369b94f1c 100644 --- a/junifer/configs/juseless/datagrabbers/ukb_vbm.py +++ b/junifer/configs/juseless/datagrabbers/ukb_vbm.py @@ -6,7 +6,7 @@ # License: AGPL from pathlib import Path -from typing import Literal +from typing import ClassVar, Literal from pydantic import AnyUrl @@ -33,6 +33,19 @@ class JuselessDataladUKBVBM(PatternDataladDataGrabber): """ + _dump_exclude: ClassVar[set[str]] = { + "patterns", + "replacements", + "confounds_format", + "partial_pattern_ok", + "uri", + "rootdir", + "datadir", + "datalad_id", + "datalad_dirty", + "datalad_commit_id", + } + uri: AnyUrl = AnyUrl("ria+http://ukb.ds.inm7.de#~cat_m0wp1") rootdir: Path = Path("m0wp1") types: list[Literal[DataType.VBM_GM]] = [DataType.VBM_GM] # noqa: RUF012 diff --git a/junifer/datagrabber/aomic/id1000.py b/junifer/datagrabber/aomic/id1000.py index 2a1fd2349..e1ae2e6e1 100644 --- a/junifer/datagrabber/aomic/id1000.py +++ b/junifer/datagrabber/aomic/id1000.py @@ -7,7 +7,7 @@ # Synchon Mandal # License: AGPL -from typing import Annotated, Literal +from typing import Annotated, ClassVar, Literal from pydantic import AnyUrl, BeforeValidator @@ -52,6 +52,19 @@ class DataladAOMICID1000(PatternDataladDataGrabber): """ + _dump_exclude: ClassVar[set[str]] = { + "patterns", + "replacements", + "confounds_format", + "partial_pattern_ok", + "uri", + "rootdir", + "datadir", + 
"datalad_id", + "datalad_dirty", + "datalad_commit_id", + } + uri: AnyUrl = AnyUrl("https://github.com/OpenNeuroDatasets/ds003097.git") types: Annotated[_types | list[_types], BeforeValidator(ensure_list)] = [ # noqa: RUF012 DataType.BOLD, diff --git a/junifer/datagrabber/aomic/piop1.py b/junifer/datagrabber/aomic/piop1.py index cac65e01f..34a2d7f59 100644 --- a/junifer/datagrabber/aomic/piop1.py +++ b/junifer/datagrabber/aomic/piop1.py @@ -8,7 +8,7 @@ # License: AGPL from itertools import product -from typing import Annotated, Literal +from typing import Annotated, ClassVar, Literal from pydantic import AnyUrl, BeforeValidator @@ -66,6 +66,19 @@ class DataladAOMICPIOP1(PatternDataladDataGrabber): """ + _dump_exclude: ClassVar[set[str]] = { + "patterns", + "replacements", + "confounds_format", + "partial_pattern_ok", + "uri", + "rootdir", + "datadir", + "datalad_id", + "datalad_dirty", + "datalad_commit_id", + } + uri: AnyUrl = AnyUrl("https://github.com/OpenNeuroDatasets/ds002785") types: Annotated[_types | list[_types], BeforeValidator(ensure_list)] = [ # noqa: RUF012 DataType.BOLD, diff --git a/junifer/datagrabber/aomic/piop2.py b/junifer/datagrabber/aomic/piop2.py index 1fab2a0a8..56d8b94ef 100644 --- a/junifer/datagrabber/aomic/piop2.py +++ b/junifer/datagrabber/aomic/piop2.py @@ -8,7 +8,7 @@ # License: AGPL from itertools import product -from typing import Annotated, Literal +from typing import Annotated, ClassVar, Literal from pydantic import AnyUrl, BeforeValidator @@ -64,6 +64,19 @@ class DataladAOMICPIOP2(PatternDataladDataGrabber): """ + _dump_exclude: ClassVar[set[str]] = { + "patterns", + "replacements", + "confounds_format", + "partial_pattern_ok", + "uri", + "rootdir", + "datadir", + "datalad_id", + "datalad_dirty", + "datalad_commit_id", + } + uri: AnyUrl = AnyUrl("https://github.com/OpenNeuroDatasets/ds002790") types: Annotated[_types | list[_types], BeforeValidator(ensure_list)] = [ # noqa: RUF012 DataType.BOLD, diff --git 
a/junifer/datagrabber/dmcc13_benchmark.py b/junifer/datagrabber/dmcc13_benchmark.py index e726b97c7..0e7f54ba5 100644 --- a/junifer/datagrabber/dmcc13_benchmark.py +++ b/junifer/datagrabber/dmcc13_benchmark.py @@ -5,7 +5,7 @@ from enum import Enum from itertools import product -from typing import Annotated, Literal +from typing import Annotated, ClassVar, Literal from pydantic import AnyUrl, BeforeValidator @@ -124,6 +124,19 @@ class DMCC13Benchmark(PatternDataladDataGrabber): """ + _dump_exclude: ClassVar[set[str]] = { + "patterns", + "replacements", + "confounds_format", + "partial_pattern_ok", + "uri", + "rootdir", + "datadir", + "datalad_id", + "datalad_dirty", + "datalad_commit_id", + } + uri: AnyUrl = AnyUrl("https://github.com/OpenNeuroDatasets/ds003452.git") types: Annotated[_types | list[_types], BeforeValidator(ensure_list)] = [ # noqa: RUF012 DataType.BOLD, diff --git a/junifer/datagrabber/hcp1200/datalad_hcp1200.py b/junifer/datagrabber/hcp1200/datalad_hcp1200.py index 8a6abee6a..5ae2a1396 100644 --- a/junifer/datagrabber/hcp1200/datalad_hcp1200.py +++ b/junifer/datagrabber/hcp1200/datalad_hcp1200.py @@ -6,7 +6,7 @@ # License: AGPL from pathlib import Path -from typing import Annotated, Literal +from typing import Annotated, ClassVar, Literal from pydantic import AnyUrl, BeforeValidator @@ -50,6 +50,19 @@ class DataladHCP1200(DataladDataGrabber, HCP1200): """ + _dump_exclude: ClassVar[set[str]] = { + "patterns", + "replacements", + "confounds_format", + "partial_pattern_ok", + "uri", + "rootdir", + "datadir", + "datalad_id", + "datalad_dirty", + "datalad_commit_id", + } + uri: AnyUrl = AnyUrl( "https://github.com/datalad-datasets/" "human-connectome-project-openaccess.git" diff --git a/junifer/datagrabber/pattern.py b/junifer/datagrabber/pattern.py index 9b4bd60bf..df05841b5 100644 --- a/junifer/datagrabber/pattern.py +++ b/junifer/datagrabber/pattern.py @@ -8,6 +8,7 @@ import re from copy import deepcopy from pathlib import Path +from typing import 
ClassVar import numpy as np from aenum import Enum as AEnum @@ -82,6 +83,13 @@ class PatternDataGrabber(BaseDataGrabber, PatternValidationMixin): """ + _dump_exclude: ClassVar[set[str]] = { + "patterns", + "replacements", + "confounds_format", + "partial_pattern_ok", + } + patterns: DataGrabberPatterns = Field(frozen=True) replacements: list[str] = Field(frozen=True) confounds_format: ConfoundsFormat | None = Field(None, frozen=True) diff --git a/junifer/datagrabber/pattern_datalad.py b/junifer/datagrabber/pattern_datalad.py index 4bba19354..529660b6e 100644 --- a/junifer/datagrabber/pattern_datalad.py +++ b/junifer/datagrabber/pattern_datalad.py @@ -5,6 +5,8 @@ # Synchon Mandal # License: AGPL +from typing import ClassVar + from pydantic import ConfigDict from ..api.decorators import register_datagrabber @@ -53,6 +55,14 @@ class PatternDataladDataGrabber(DataladDataGrabber, PatternDataGrabber): """ + _dump_exclude: ClassVar[set[str]] = { + "uri", + "datadir", + "datalad_dirty", + "datalad_commit_id", + "datalad_id", + } + model_config = ConfigDict(extra="allow") def validate_datagrabber_params(self) -> None: