From 0ccae382dd0c777f73abd66c132005f2b04678ea Mon Sep 17 00:00:00 2001 From: Manuel Bellersen Date: Tue, 19 Aug 2025 23:46:57 +0200 Subject: [PATCH] GH-54: refactor(serialization): Unify prediction and feature set serialization This commit refactors the serialization of pipeline predictions to align with the existing process for feature sets. Previously, predictions (as NumPy arrays) were handled by a custom `serialize_ndarray` function and tracked using a separate `TableInformation` data class. This led to code duplication and inconsistency. The changes include: - Removing the `TableInformation` class and its related exceptions. - Deleting the `serialize_ndarray` utility function. - Updating the `serialize_predictions` function to: 1. Convert the NumPy array from `pipeline.predict()` into a `getml.DataFrame` via `pyarrow.Table`. 2. Use the existing `serialize_dataframe_or_view` function to store the prediction data and its metadata. - Simplifying the `derive_instance_with_relative_path` utility, as it now only needs to handle `DataFrameInformation`. Additionally, this commit introduces `basedpyright` into the CI pipeline for stricter static type checking, along with necessary `pyright: ignore` annotations for third-party library interactions. A minor bug where `std` in column statistics could be `None` has also been fixed. --- .github/workflows/python-tests.yml | 1 + src/getml_io/getml/getml.py | 2 +- .../metadata/container_information.py | 2 - .../metadata/dataframe_information.py | 2 +- src/getml_io/metadata/exception.py | 22 -- src/getml_io/metadata/pipeline_information.py | 10 +- src/getml_io/metadata/prediction_results.py | 4 +- src/getml_io/metadata/table_information.py | 9 - src/getml_io/metadata/utils.py | 47 +-- src/getml_io/serialize/dataframe_or_view.py | 4 +- src/getml_io/serialize/exception.py | 8 - src/getml_io/serialize/ndarray.py | 54 --- src/getml_io/serialize/pipeline.py | 31 +- tests/helpers.py | 2 +- tests/integration/assertions.py | 11 +- tests/integration/conftest.py | 2 +- tests/integration/data/getmlproject.py | 2 +- .../data/loans/expected.pipeline.json | 172 ++++---- .../data/numerical/expected.pipeline.json | 374 ++++++++++-------- .../data/robot/expected.pipeline.json | 204 +++++++++- tests/integration/helpers.py | 4 +- tests/integration/test_serialize_loans.py | 4 +- tests/integration/test_serialize_numerical.py | 4 +- tests/integration/test_serialize_robot.py | 6 +- tests/unit/conftest.py | 86 ++-- tests/unit/getml/test_project.py | 18 +- .../metadata/test_container_information.py | 4 +- .../metadata/test_pipeline_information.py | 16 +- tests/unit/metadata/test_utils.py | 69 +--- tests/unit/serialize/test_ndarray.py | 66 ---- tests/unit/serialize/test_pipeline.py | 28 +- tests/unit/test_cli.py | 4 +- tests/unit/utils/test_storage.py | 2 +- 33 files changed, 647 insertions(+), 627 deletions(-) delete mode 100644 src/getml_io/metadata/table_information.py delete mode 100644 src/getml_io/serialize/ndarray.py delete mode 100644 tests/unit/serialize/test_ndarray.py diff --git a/.github/workflows/python-tests.yml b/.github/workflows/python-tests.yml index c41573d..cfd58ab 100644 --- a/.github/workflows/python-tests.yml +++ b/.github/workflows/python-tests.yml @@ -60,6 +60,7 @@ jobs: run: | uv run ruff check --extend-ignore FIX . uv run ruff format --check . + uv run basedpyright . - name: Show TODOs run: | uv run ruff check --select FIX . || true diff --git a/src/getml_io/getml/getml.py b/src/getml_io/getml/getml.py index f63b37c..41cf1b9 100644 --- a/src/getml_io/getml/getml.py +++ b/src/getml_io/getml/getml.py @@ -29,7 +29,7 @@ def alive_getml() -> Generator[None]: getml_is_alive_on_entry = getml.communication.is_monitor_alive() if not getml_is_alive_on_entry: try: - getml.engine.launch(launch_browser=False) + getml.engine.launch(launch_browser=False) # pyright: ignore [reportUnknownMemberType] except Exception as exception: message = ( "Failed to launch getML engine. " diff --git a/src/getml_io/metadata/container_information.py b/src/getml_io/metadata/container_information.py index 441431b..9a80835 100644 --- a/src/getml_io/metadata/container_information.py +++ b/src/getml_io/metadata/container_information.py @@ -9,7 +9,6 @@ from typing_extensions import TypedDict from getml_io.metadata.dataframe_information import DataFrameInformation -from getml_io.metadata.exception import DataFrameInformationPathNotRelativeError from getml_io.metadata.utils import derive_instance_with_relative_path @@ -60,5 +59,4 @@ def _create_dataframe_information_with_relative_path( return derive_instance_with_relative_path( dataframe_information, self.path, - DataFrameInformationPathNotRelativeError, ) diff --git a/src/getml_io/metadata/dataframe_information.py b/src/getml_io/metadata/dataframe_information.py index ba25820..e82bd7f 100644 --- a/src/getml_io/metadata/dataframe_information.py +++ b/src/getml_io/metadata/dataframe_information.py @@ -21,7 +21,7 @@ class ColumnStatisticsDouble: q25: float q50: float q75: float - std: float + std: float | None null_percentage: float column_type: Literal["DOUBLE"] diff --git a/src/getml_io/metadata/exception.py b/src/getml_io/metadata/exception.py index a4508cc..903c27c 100644 --- a/src/getml_io/metadata/exception.py +++ b/src/getml_io/metadata/exception.py @@ -42,25 +42,3 @@ def __init__( path=path, base_path=base_path, ) - - -class TableInformationPathNotRelativeError(PathNotRelativeError): - """Exception raised on erroneous TableInformation path. - - Raised when the path of a TableInformation is not relative - to the given base path. - """ - - def __init__( - self, - name: str, - path: Path, - base_path: Path, - ) -> None: - """Initialize the exception with a custom message.""" - super().__init__( - item_type="TableInformation", - item_name=name, - path=path, - base_path=base_path, - ) diff --git a/src/getml_io/metadata/pipeline_information.py b/src/getml_io/metadata/pipeline_information.py index 903d38d..541d109 100644 --- a/src/getml_io/metadata/pipeline_information.py +++ b/src/getml_io/metadata/pipeline_information.py @@ -20,10 +20,6 @@ from getml_io.getml.predictors import FeatureSelector, Predictor from getml_io.getml.preprocessors import Preprocessor from getml_io.metadata.data_model_information import DataModelInformation -from getml_io.metadata.exception import ( - DataFrameInformationPathNotRelativeError, - TableInformationPathNotRelativeError, -) from getml_io.metadata.feature_sets import FeatureSets from getml_io.metadata.placeholder_information import PlaceholderInformation from getml_io.metadata.prediction_results import PredictionResults @@ -92,17 +88,15 @@ def _serialize_model(self) -> PipelineInformationDict: "id": self.id, "predictions": { name: derive_instance_with_relative_path( - table_information, + dataframe_information, self.path, - TableInformationPathNotRelativeError, ) - for name, table_information in self.predictions.items() + for name, dataframe_information in self.predictions.items() }, "feature_sets": { name: derive_instance_with_relative_path( dataframe_information, self.path, - DataFrameInformationPathNotRelativeError, ) for name, dataframe_information in self.feature_sets.items() }, diff --git a/src/getml_io/metadata/prediction_results.py b/src/getml_io/metadata/prediction_results.py index 1391c46..4debb21 100644 --- a/src/getml_io/metadata/prediction_results.py +++ b/src/getml_io/metadata/prediction_results.py @@ -1,6 +1,6 @@ from collections.abc import Mapping from typing import TypeAlias -from getml_io.metadata.table_information import TableInformation +from getml_io.metadata.dataframe_information import DataFrameInformation -PredictionResults: TypeAlias = Mapping[str, TableInformation] +PredictionResults: TypeAlias = Mapping[str, DataFrameInformation] diff --git a/src/getml_io/metadata/table_information.py b/src/getml_io/metadata/table_information.py deleted file mode 100644 index 41fa038..0000000 --- a/src/getml_io/metadata/table_information.py +++ /dev/null @@ -1,9 +0,0 @@ -from pathlib import Path - -from pydantic.dataclasses import dataclass - - -@dataclass -class TableInformation: - name: str - path: Path diff --git a/src/getml_io/metadata/utils.py b/src/getml_io/metadata/utils.py index 9f23967..3cbdc96 100644 --- a/src/getml_io/metadata/utils.py +++ b/src/getml_io/metadata/utils.py @@ -1,55 +1,38 @@ import dataclasses -from collections.abc import Callable from pathlib import Path -from typing import Protocol, TypeVar -from getml_io.metadata.exception import PathNotRelativeError - - -class InstanceProtocol(Protocol): - name: str - path: Path - - -InstanceType = TypeVar("InstanceType", bound=InstanceProtocol) -ErrorType = TypeVar("ErrorType", bound=PathNotRelativeError) -ErrorFactory = Callable[[str, Path, Path], ErrorType] +from getml_io.metadata.dataframe_information import DataFrameInformation +from getml_io.metadata.exception import ( + DataFrameInformationPathNotRelativeError, +) def derive_instance_with_relative_path( - instance: InstanceType, + dataframe_information: DataFrameInformation, base_path: Path, - error_factory: ErrorFactory[ErrorType], -) -> InstanceType: +) -> DataFrameInformation: """Derive a copy of an instance with a path relative to the given base path. Args: - instance: The instance to use as a template. + dataframe_information: The instance to use as a template. base_path: The base path to which the instance's path should be relative. - error_factory: A callable that creates an error if the path is not relative. Returns: A new instance with the path relative to the base path. Raises: - PathNotRelativeError: If the instance's path cannot be made relative - to the base path. The specific subclass raised is determined - by the `error_factory`. - TypeError: If the instance is not a dataclass. + DataFrameInformationPathNotRelativeError: If the instance's path cannot be made + relative to the base path. """ - if not dataclasses.is_dataclass(instance): - message = f"Instance must be a dataclass: {type(instance)}" - raise TypeError(message) try: return dataclasses.replace( - instance, - path=instance.path.relative_to(base_path), + dataframe_information, + path=dataframe_information.path.relative_to(base_path), ) except Exception as exception: - error = error_factory( - instance.name, - instance.path, + raise DataFrameInformationPathNotRelativeError( + dataframe_information.name, + dataframe_information.path, base_path, - ) - raise error from exception + ) from exception diff --git a/src/getml_io/serialize/dataframe_or_view.py b/src/getml_io/serialize/dataframe_or_view.py index dfa7254..c84da94 100644 --- a/src/getml_io/serialize/dataframe_or_view.py +++ b/src/getml_io/serialize/dataframe_or_view.py @@ -131,7 +131,7 @@ def _fetch_raw_summary_statistics( parquet_filepath: Path, ) -> dict[str, dict[str, str | int | float]]: with ( - duckdb.connect() as connection, + duckdb.connect() as connection, # pyright: ignore [reportUnknownMemberType] ): logger.debug( "Calculating summary statistics for Parquet '%s'", @@ -141,7 +141,7 @@ def _fetch_raw_summary_statistics( "dict[str, dict[str, str | int | float]]", cast( "object", - connection.execute( + connection.execute( # pyright: ignore [reportUnknownMemberType] SUMMARIZE_STATEMENT_TEMPLATE, [str(parquet_filepath)], ) diff --git a/src/getml_io/serialize/exception.py b/src/getml_io/serialize/exception.py index dc0737c..e61bdce 100644 --- a/src/getml_io/serialize/exception.py +++ b/src/getml_io/serialize/exception.py @@ -72,14 +72,6 @@ def __init__(self, pipeline_id: str, pipeline_json_path: Path) -> None: super().__init__("pipeline information", pipeline_id, pipeline_json_path) -class TableParquetStorageError(GetMLIOStorageError): - """Exception raised when storing Table as parquet fails.""" - - def __init__(self, name: str, path: Path) -> None: - """Initialize the exception with a custom message.""" - super().__init__("Table as parquet", name, path) - - class UnsupportedColumnStatisticsError(GetMLIOError): """Exception raised when an unsupported column statistics is encountered.""" diff --git a/src/getml_io/serialize/ndarray.py b/src/getml_io/serialize/ndarray.py deleted file mode 100644 index f8f0205..0000000 --- a/src/getml_io/serialize/ndarray.py +++ /dev/null @@ -1,54 +0,0 @@ -from pathlib import Path - -import numpy as np -import pyarrow as pa -from numpy.typing import NDArray -from pyarrow import Table, parquet - -from getml_io.serialize.exception import TableParquetStorageError -from getml_io.utils.exception import StorageDirectoryCreationError - - -def serialize_ndarray( - array: NDArray[np.float64], - target_storage_directory: Path, - name: str, -) -> Path: - """Serialize a NumPy ndarray into the target storage directory as a Parquet file. - - Args: - array: The NumPy ndarray to serialize. - target_storage_directory: The directory where the serialized ndarray - will be saved. - name: The name of the serialized ndarray. - - Returns: - Path: The path to the serialized Parquet file. - - Raises: - StorageDirectoryCreationError: If the target storage directory - cannot be created. - TableParquetStorageError: If storing the ndarray as a Parquet file fails. - - """ - try: - target_storage_directory.mkdir(parents=True, exist_ok=True) - except Exception as exception: - raise StorageDirectoryCreationError(target_storage_directory) from exception - - path = target_storage_directory / f"{name}.parquet" - table: Table = Table.from_arrays( - array.transpose(), - schema=pa.schema( - [pa.field(str(i), pa.float64()) for i in range(array.shape[1])], - ), - ) - try: - parquet.write_table( - table=table, - where=path, - ) - except Exception as exception: - raise TableParquetStorageError(name, path) from exception - - return path diff --git a/src/getml_io/serialize/pipeline.py b/src/getml_io/serialize/pipeline.py index 11af6e0..4a44146 100644 --- a/src/getml_io/serialize/pipeline.py +++ b/src/getml_io/serialize/pipeline.py @@ -3,6 +3,7 @@ from typing import cast import numpy as np +import pyarrow as pa from getml import feature_learning as getml_feature_learner from getml import predictors as getml_predictor from getml import preprocessors as getml_preprocessor @@ -47,10 +48,8 @@ PipelineInformation, ) from getml_io.metadata.prediction_results import PredictionResults -from getml_io.metadata.table_information import TableInformation from getml_io.serialize.data_model import serialize_data_model from getml_io.serialize.dataframe_or_view import serialize_dataframe_or_view -from getml_io.serialize.ndarray import serialize_ndarray from getml_io.serialize.pipeline_information import serialize_pipeline_information from getml_io.serialize.placeholder import serialize_placeholder from getml_io.utils.convert import ( @@ -151,16 +150,22 @@ def serialize_predictions( predict_storage_directory = target_storage_directory / "predictions" prediction_results: PredictionResults = {} for subset_name in assume_is_dict_str_to_dataframe_or_view(container.subsets): - prediction = pipeline.predict(container[subset_name]) - # TODO @urfoex: #54 Convert NDArray to DataFrame and use dataframe serialization - path = serialize_ndarray( - array=cast("NDArray[np.float64]", prediction), - target_storage_directory=predict_storage_directory, - name=subset_name, + prediction = cast( + "NDArray[np.float64]", + pipeline.predict(container[subset_name]), # pyright: ignore [reportUnknownMemberType] ) - prediction_results[subset_name] = TableInformation( - name=subset_name, - path=path, + prediction_table = pa.Table.from_arrays( # pyright: ignore [reportUnknownMemberType, reportUnknownVariableType] + prediction.T, + names=list(map(str, range(prediction.shape[1]))), + ) + prediction_dataframe = DataFrame.from_arrow( # pyright: ignore [reportUnknownMemberType] + prediction_table, # pyright: ignore [reportUnknownArgumentType] + f"prediction.{subset_name}", + ) + + prediction_results[subset_name] = serialize_dataframe_or_view( + prediction_dataframe, + predict_storage_directory, ) return prediction_results @@ -187,7 +192,7 @@ def serialize_feature_sets( transform_storage_directory = target_storage_directory / "feature_sets" feature_sets: FeatureSets = {} for subset_name in assume_is_dict_str_to_dataframe_or_view(container.subsets): - features = pipeline.transform( + features = pipeline.transform( # pyright: ignore [reportUnknownMemberType, reportUnknownVariableType] container[subset_name], df_name=f"features.{subset_name}", ) @@ -263,7 +268,7 @@ def serialize_predictor( return TypeAdapter(XGBoostRegressor).validate_python(predictor_as_dict) -def serialize_preprocessor( +def serialize_preprocessor( # noqa: PLR0911 preprocessor: getml_preprocessor.CategoryTrimmer | getml_preprocessor.EmailDomain | getml_preprocessor.Imputation diff --git a/tests/helpers.py b/tests/helpers.py index 834de9b..99f2aee 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -3,7 +3,7 @@ def as_magicmock( - value: Any, # noqa: ANN401 + value: Any, # noqa: ANN401 # pyright: ignore [reportAny, reportExplicitAny] ) -> unittest.mock.MagicMock: """Assume `value` conforms to the return type for static analysis. diff --git a/tests/integration/assertions.py b/tests/integration/assertions.py index 422d10d..1aad1a6 100644 --- a/tests/integration/assertions.py +++ b/tests/integration/assertions.py @@ -8,7 +8,6 @@ DataFrameInformation, ) from getml_io.metadata.pipeline_information import PipelineInformation -from getml_io.metadata.table_information import TableInformation def assert_container_parquets( @@ -168,7 +167,7 @@ def assert_pipeline_information( prediction_name, prediction, ) in expected_pipeline_information.predictions.items(): - assert_table_information( + assert_dataframe_information( pipeline_information.predictions[prediction_name], prediction, ) @@ -211,11 +210,3 @@ def assert_pipeline_information( assert pipeline_information.tags assert pipeline_information.targets == expected_pipeline_information.targets assert pipeline_information.data_model == expected_pipeline_information.data_model - - -def assert_table_information( - table_information: TableInformation, - expected_table_information: TableInformation, -) -> None: - assert table_information.name == expected_table_information.name - assert table_information.path == expected_table_information.path diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 239e4d5..6fa896c 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -34,7 +34,7 @@ def data_path() -> Path: @pytest.fixture def project_name(request: pytest.FixtureRequest) -> str: - name = cast("str", request.node.name) + name = cast("str", request.node.name) # pyright: ignore [reportUnknownMemberType] sanitized_name = re.sub(r"[^a-zA-Z0-9_-]", "_", name) name_hash = hashlib.md5(name.encode("utf-8")).hexdigest()[:6] # noqa: S324 return f"getml-io-{sanitized_name}-{name_hash}" diff --git a/tests/integration/data/getmlproject.py b/tests/integration/data/getmlproject.py index 7d66ed6..312cabf 100644 --- a/tests/integration/data/getmlproject.py +++ b/tests/integration/data/getmlproject.py @@ -107,7 +107,7 @@ def _save_project_bundle(self) -> None: self._project_name, str(self._path), ) - getml.project.save(filename=self._path) + getml.project.save(filename=self._path) # pyright: ignore [reportUnknownMemberType] @dataclass(config=ConfigDict(arbitrary_types_allowed=True)) diff --git a/tests/integration/data/loans/expected.pipeline.json b/tests/integration/data/loans/expected.pipeline.json index e224a2b..2b402a3 100644 --- a/tests/integration/data/loans/expected.pipeline.json +++ b/tests/integration/data/loans/expected.pipeline.json @@ -1,13 +1,53 @@ { - "id": "n4ARIs", + "id": "wzvjNK", "predictions": { "train": { - "name": "train", - "path": "pipeline/predictions/train.parquet" + "name": "prediction.train", + "path": "pipeline/predictions/prediction.train.parquet", + "column_profile": { + "0": { + "name": "0", + "role": "unused_float", + "statistics": { + "count": 459, + "approx_unique": 452, + "avg": 0.11984065395154968, + "min": 0.0013052262365818024, + "max": 0.9940769672393799, + "q25": 0.007235923552394119, + "q50": 0.017314163701874868, + "q75": 0.06367057869728242, + "std": 0.25014608992693693, + "null_percentage": 0.0, + "column_type": "DOUBLE", + "type": "unused_float" + } + } + } }, "test": { - "name": "test", - "path": "pipeline/predictions/test.parquet" + "name": "prediction.test", + "path": "pipeline/predictions/prediction.test.parquet", + "column_profile": { + "0": { + "name": "0", + "role": "unused_float", + "statistics": { + "count": 223, + "approx_unique": 229, + "avg": 0.10654706618594081, + "min": 0.0016005451325327158, + "max": 0.9642216563224792, + "q25": 0.008410481399753027, + "q50": 0.016846238325039547, + "q75": 0.05923604799641504, + "std": 0.2183209639358569, + "null_percentage": 0.0, + "column_type": "DOUBLE", + "type": "unused_float" + } + } + } } }, "feature_sets": { @@ -68,14 +108,14 @@ "role": "numerical", "statistics": { "count": 459, - "approx_unique": 54, - "avg": 1963.1830065359477, + "approx_unique": 40, + "avg": 769.5424836601308, "min": 0.0, - "max": 53991.0, - "q25": 145.83333333333334, - "q50": 569.3877551020407, - "q75": 900.0, - "std": 6658.363509588833, + "max": 19621.0, + "q25": 0.0, + "q50": 0.0, + "q75": 202.7777777777778, + "std": 2515.4971605816318, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -86,14 +126,14 @@ "role": "numerical", "statistics": { "count": 459, - "approx_unique": 287, - "avg": 26940800.0, + "approx_unique": 17, + "avg": 882447.0588235294, "min": 0.0, - "max": 42076800.0, - "q25": 16213200.0, - "q50": 30620865.30612245, - "q75": 39661200.0, - "std": 13345080.744145872, + "max": 68601600.0, + "q25": 0.0, + "q50": 0.0, + "q75": 0.0, + "std": 6212366.872861004, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -140,14 +180,14 @@ "role": "numerical", "statistics": { "count": 459, - "approx_unique": 128, - "avg": 72.32483660130725, + "approx_unique": 34, + "avg": 7.1725490196078425, "min": 0.0, - "max": 200.0, - "q25": 50.90277777777777, - "q50": 63.077551020408166, - "q75": 100.0, - "std": 43.688883621604056, + "max": 100.0, + "q25": 0.0, + "q50": 0.0, + "q75": 0.0, + "std": 22.558438138781643, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -194,14 +234,14 @@ "role": "numerical", "statistics": { "count": 459, - "approx_unique": 19, - "avg": 2.6230936819172115, + "approx_unique": 20, + "avg": 2.7734204793028323, "min": 0.0, "max": 95.0, "q25": 0.0, "q50": 0.0, "q75": 0.0, - "std": 12.698880328094846, + "std": 13.070720949321736, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -230,14 +270,14 @@ "role": "numerical", "statistics": { "count": 459, - "approx_unique": 22, - "avg": 0.19324618736383445, + "approx_unique": 23, + "avg": 0.1993464052287582, "min": 0.0, "max": 6.6, "q25": 0.0, "q50": 0.0, "q75": 0.0, - "std": 0.7828247239515089, + "std": 0.7869983380593814, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -357,13 +397,13 @@ "statistics": { "count": 223, "approx_unique": 30, - "avg": 1794.914798206278, + "avg": 842.9372197309417, "min": 0.0, - "max": 67376.0, - "q25": 200.0, - "q50": 466.6666666666667, - "q75": 900.0, - "std": 6927.7664625313855, + "max": 18200.0, + "q25": 0.0, + "q50": 0.0, + "q75": 144.44444444444446, + "std": 2885.3221572288658, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -374,14 +414,14 @@ "role": "numerical", "statistics": { "count": 223, - "approx_unique": 147, - "avg": 27327971.30044843, + "approx_unique": 11, + "avg": 1550550.67264574, "min": 0.0, - "max": 43804800.0, - "q25": 17558400.0, - "q50": 29433600.0, - "q75": 39708000.0, - "std": 12861334.000076162, + "max": 128390400.0, + "q25": 0.0, + "q50": 0.0, + "q75": 0.0, + "std": 10521062.635618178, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -428,14 +468,14 @@ "role": "numerical", "statistics": { "count": 223, - "approx_unique": 92, - "avg": 77.92331838565016, + "approx_unique": 13, + "avg": 3.6977578475336323, "min": 0.0, - "max": 200.0, - "q25": 52.963888888888896, - "q50": 70.53333333333333, - "q75": 100.0, - "std": 40.10804320182245, + "max": 100.0, + "q25": 0.0, + "q50": 0.0, + "q75": 0.0, + "std": 15.026950095889092, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -518,14 +558,14 @@ "role": "numerical", "statistics": { "count": 223, - "approx_unique": 16, - "avg": 0.19192825112107625, + "approx_unique": 17, + "avg": 0.20986547085201795, "min": 0.0, "max": 6.6, "q25": 0.0, "q50": 0.0, "q75": 0.0, - "std": 0.8325188926028172, + "std": 0.8666449130086721, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -591,11 +631,11 @@ "feature_learners": [ { "aggregation": [ - "MIN", + "AVG", "SUM", - "COUNT", + "MIN", "MAX", - "AVG" + "COUNT" ], "allow_sets": true, "delta_t": 0.0, @@ -609,19 +649,19 @@ "num_threads": 0, "propositionalization": { "aggregation": [ - "MIN", - "AVG", - "MODE", + "COUNT DISTINCT", "COUNT MINUS COUNT DISTINCT", - "MEDIAN", + "AVG", "SUM", + "TREND", + "MIN", "STDDEV", "LAST", - "COUNT", "MAX", "FIRST", - "COUNT DISTINCT", - "TREND" + "MEDIAN", + "MODE", + "COUNT" ], "delta_t": 0.0, "loss_function": "CrossEntropyLoss", @@ -917,4 +957,4 @@ }, "peripheral": {} } -} +} \ No newline at end of file diff --git a/tests/integration/data/numerical/expected.pipeline.json b/tests/integration/data/numerical/expected.pipeline.json index 469e1a1..d714ce1 100644 --- a/tests/integration/data/numerical/expected.pipeline.json +++ b/tests/integration/data/numerical/expected.pipeline.json @@ -1,13 +1,53 @@ { - "id": "RbdvzM", + "id": "td6iXW", "predictions": { "train": { - "name": "train", - "path": "pipeline/predictions/train.parquet" + "name": "prediction.train", + "path": "pipeline/predictions/prediction.train.parquet", + "column_profile": { + "0": { + "name": "0", + "role": "unused_float", + "statistics": { + "count": 390, + "approx_unique": 345, + "avg": 96.17355588521713, + "min": 0.19379276037216187, + "max": 154.47401428222656, + "q25": 63.90802917480469, + "q50": 113.03656747606065, + "q75": 126.34682067871094, + "std": 40.818051834384214, + "null_percentage": 0.0, + "column_type": "DOUBLE", + "type": "unused_float" + } + } + } }, "test": { - "name": "test", - "path": "pipeline/predictions/test.parquet" + "name": "prediction.test", + "path": "pipeline/predictions/prediction.test.parquet", + "column_profile": { + "0": { + "name": "0", + "role": "unused_float", + "statistics": { + "count": 110, + "approx_unique": 97, + "avg": 93.47890945889733, + "min": 1.1542826890945435, + "max": 148.96876525878906, + "q25": 62.2480583190918, + "q50": 110.78643798828125, + "q75": 124.41654968261719, + "std": 39.87365738682376, + "null_percentage": 0.0, + "column_type": "DOUBLE", + "type": "unused_float" + } + } + } } }, "feature_sets": { @@ -68,14 +108,14 @@ "role": "numerical", "statistics": { "count": 390, - "approx_unique": 325, - "avg": 96.16907668123311, + "approx_unique": 365, + "avg": 96.16907668121483, "min": 0.0, - "max": 154.95853837280958, - "q25": 63.78581572178133, - "q50": 113.27765320315866, - "q75": 126.50463010412777, - "std": 40.83179514271343, + "max": 154.95853837278798, + "q25": 63.785815721758624, + "q50": 113.27765320314755, + "q75": 126.50463010411934, + "std": 40.83179514271078, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -86,14 +126,14 @@ "role": "numerical", "statistics": { "count": 390, - "approx_unique": 460, - "avg": 0.8376766309822631, - "min": -0.005741885778443031, - "max": 2.719408433645138, - "q25": 0.6520781997438738, - "q50": 0.9700795623721068, - "q75": 1.0064016977732944, - "std": 0.2557014318224649, + "approx_unique": 341, + "avg": 0.8376766309807187, + "min": -0.005741885826104295, + "max": 2.719408433664087, + "q25": 0.6520781997282549, + "q50": 0.9700795623830403, + "q75": 1.0064016977783699, + "std": 0.25570143183522287, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -104,14 +144,14 @@ "role": "numerical", "statistics": { "count": 390, - "approx_unique": 539, - "avg": 86.38617317829309, + "approx_unique": 463, + "avg": 86.38617317829464, "min": 0.0, - "max": 139.4831117157228, - "q25": 57.23467415456723, - "q50": 101.87912254707464, - "q75": 113.6064311988338, - "std": 36.712671335094896, + "max": 139.48311171571882, + "q25": 57.23467415457054, + "q50": 101.87912254707474, + "q75": 113.60643119883412, + "std": 36.71267133509418, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -122,14 +162,14 @@ "role": "numerical", "statistics": { "count": 390, - "approx_unique": 348, - "avg": 1.4866715394762007, + "approx_unique": 353, + "avg": 1.4866715395011707, "min": 0.0, - "max": 18.07352556921219, - "q25": 0.5113719486689973, - "q50": 0.6982874132797834, - "q75": 1.482302195832012, - "std": 2.49587156776478, + "max": 18.073525569347655, + "q25": 0.511371948685024, + "q50": 0.6982874132907706, + "q75": 1.4823021958627673, + "std": 2.4958715677815673, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -140,14 +180,14 @@ "role": "numerical", "statistics": { "count": 390, - "approx_unique": 302, - "avg": 78.0638784245176, + "approx_unique": 354, + "avg": 78.06387842451866, "min": 0.0, - "max": 125.9264443718258, - "q25": 51.62534776200198, - "q50": 92.20745748713675, - "q75": 102.6979676300216, - "std": 33.2115342480785, + "max": 125.92644437182243, + "q25": 51.62534776200333, + "q50": 92.20745748713837, + "q75": 102.69796763002303, + "std": 33.211534248078635, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -158,14 +198,14 @@ "role": "numerical", "statistics": { "count": 390, - "approx_unique": 326, - "avg": 0.5609713545814653, - "min": -1.884329594891604, - "max": 7.386337074028212, - "q25": 0.31063646371654113, - "q50": 0.4926810309517353, - "q75": 0.5897466914989792, - "std": 1.074825687942176, + "approx_unique": 461, + "avg": 0.5609713545719003, + "min": -1.8843295948944159, + "max": 7.386337073984908, + "q25": 0.31063646370630504, + "q50": 0.4926810309422083, + "q75": 0.5897466914903939, + "std": 1.0748256879378195, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -176,14 +216,14 @@ "role": "numerical", "statistics": { "count": 390, - "approx_unique": 468, - "avg": 70.10412027364201, + "approx_unique": 383, + "avg": 70.10412027365707, "min": 0.0, - "max": 113.10321385165373, - "q25": 46.531787803716234, - "q50": 82.55886648683865, - "q75": 92.25996193853011, - "std": 29.748700279905826, + "max": 113.10321385165179, + "q25": 46.531787803724455, + "q50": 82.5588664868703, + "q75": 92.25996193854685, + "std": 29.74870027991256, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -194,14 +234,14 @@ "role": "numerical", "statistics": { "count": 390, - "approx_unique": 375, - "avg": 0.5140333258914743, - "min": -0.44402437982294773, - "max": 0.9493693949050362, - "q25": 0.4048085322650249, - "q50": 0.6491476769270209, - "q75": 0.6766508980437053, - "std": 0.2595060636392321, + "approx_unique": 387, + "avg": 0.5140333259704128, + "min": -0.4440243790143118, + "max": 0.9493693945162422, + "q25": 0.40480853237326986, + "q50": 0.6491476769378332, + "q75": 0.6766508980594182, + "std": 0.25950606351142164, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -212,14 +252,14 @@ "role": "numerical", "statistics": { "count": 390, - "approx_unique": 378, - "avg": 62.90616871451977, + "approx_unique": 449, + "avg": 62.90321872320113, "min": 0.0, - "max": 101.78880366744104, - "q25": 41.77890864641669, - "q50": 74.08221392645343, - "q75": 82.5708916483322, - "std": 26.691806127961343, + "max": 101.78880366743772, + "q25": 41.77890864641813, + "q50": 74.08221392645562, + "q75": 82.57089164833283, + "std": 26.68981566900884, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -230,14 +270,14 @@ "role": "numerical", "statistics": { "count": 390, - "approx_unique": 403, - "avg": 0.49645826037201785, - "min": -0.45380937173777053, - "max": 8.224653499349321, - "q25": -0.013087906368716035, - "q50": 0.16252383064546452, - "q75": 0.3213966011515323, - "std": 1.3656064778531225, + "approx_unique": 383, + "avg": 0.49645826039061436, + "min": -0.4538093717154356, + "max": 8.224653499362855, + "q25": -0.013087906352437301, + "q50": 0.16252383066178144, + "q75": 0.3213966011730961, + "std": 1.3656064778513446, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -321,13 +361,13 @@ "statistics": { "count": 110, "approx_unique": 116, - "avg": 93.45267098580021, - "min": 1.0004581336996778, - "max": 150.31561443815576, - "q25": 61.996094719141645, - "q50": 110.66337045645069, - "q75": 124.18593897749646, - "std": 39.90174967625321, + "avg": 93.45267098578368, + "min": 1.000458133699219, + "max": 150.31561443814397, + "q25": 61.99609471912092, + "q50": 110.66337045642251, + "q75": 124.18593897756794, + "std": 39.90174967625208, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -338,14 +378,14 @@ "role": "numerical", "statistics": { "count": 110, - "approx_unique": 115, - "avg": 0.8371683464674687, - "min": 0.26161883596933283, - "max": 1.3008994594133347, - "q25": 0.6463480307577066, - "q50": 0.9769794994204424, - "q75": 0.9981102813307521, - "std": 0.2269445176703966, + "approx_unique": 99, + "avg": 0.8371683464659075, + "min": 0.2616188358978256, + "max": 1.3008994594821477, + "q25": 0.6463480307323424, + "q50": 0.9769794994299659, + "q75": 0.9981102813377745, + "std": 0.22694451768807689, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -356,14 +396,14 @@ "role": "numerical", "statistics": { "count": 110, - "approx_unique": 118, - "avg": 83.92568798146631, - "min": 0.8936319178549497, - "max": 135.34260543221754, - "q25": 55.61865811398502, - "q50": 99.41231630964933, - "q75": 112.10778547546019, - "std": 35.861589885099654, + "approx_unique": 128, + "avg": 83.92568798146797, + "min": 0.8936319178550347, + "max": 135.34260543221382, + "q25": 55.618658113988545, + "q50": 99.41231630965478, + "q75": 112.10778547545671, + "std": 35.86158988509899, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -374,14 +414,14 @@ "role": "numerical", "statistics": { "count": 110, - "approx_unique": 97, - "avg": 1.3729637342160355, - "min": 0.26150392324676175, - "max": 14.741123450975595, - "q25": 0.5079183381010651, - "q50": 0.733030259298452, - "q75": 1.5598762921754463, - "std": 2.1587383317031876, + "approx_unique": 83, + "avg": 1.372963734240981, + "min": 0.26150392327196315, + "max": 14.74112345108553, + "q25": 0.5079183381167619, + "q50": 0.733030259305973, + "q75": 1.559876292211866, + "std": 2.1587383317179856, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -392,14 +432,14 @@ "role": "numerical", "statistics": { "count": 110, - "approx_unique": 88, - "avg": 75.84121197115688, - "min": 0.8078674007376023, - "max": 122.03529180787784, - "q25": 50.16882583040196, - "q50": 90.30250196991506, - "q75": 101.0109783562762, - "std": 32.43788489337715, + "approx_unique": 107, + "avg": 75.84121197115796, + "min": 0.8078674007376269, + "max": 122.03529180788084, + "q25": 50.16882583040319, + "q50": 90.30250196991355, + "q75": 101.01097835627817, + "std": 32.43788489337733, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -410,14 +450,14 @@ "role": "numerical", "statistics": { "count": 110, - "approx_unique": 106, - "avg": 0.5049229013398149, - "min": -1.0234322123252753, - "max": 6.118363465755303, - "q25": 0.14283511563729268, - "q50": 0.4803652747350423, - "q75": 0.5681416543623228, - "std": 1.0283977404635392, + "approx_unique": 97, + "avg": 0.5049229013302758, + "min": -1.0234322123289814, + "max": 6.118363465727491, + "q25": 0.14283511563451265, + "q50": 0.48036527472462065, + "q75": 0.5681416543552915, + "std": 1.0283977404603937, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -428,14 +468,14 @@ "role": "numerical", "statistics": { "count": 110, - "approx_unique": 112, - "avg": 68.11771275542432, - "min": 0.7276733948500856, - "max": 109.66181307290546, - "q25": 45.197855854491515, - "q50": 80.4634801504667, - "q75": 90.96212075430984, - "std": 29.061616382304035, + "approx_unique": 116, + "avg": 68.11771275543998, + "min": 0.727673394850341, + "max": 109.66181307292277, + "q25": 45.197855854500325, + "q50": 80.46348015049212, + "q75": 90.9621207543436, + "std": 29.061616382311644, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -446,14 +486,14 @@ "role": "numerical", "statistics": { "count": 110, - "approx_unique": 109, - "avg": 0.4979052918970571, - "min": -0.7685655004747685, - "max": 0.8965485721217097, - "q25": 0.31406675065235196, - "q50": 0.6465958189771321, - "q75": 0.675510875478117, - "std": 0.27431090429004085, + "approx_unique": 86, + "avg": 0.49790529198137257, + "min": -0.7685654995270176, + "max": 0.8965485719768319, + "q25": 0.3140667507684413, + "q50": 0.6465958189822099, + "q75": 0.6755108754950195, + "std": 0.2743109041542205, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -464,14 +504,14 @@ "role": "numerical", "statistics": { "count": 110, - "approx_unique": 94, - "avg": 61.12278600295739, - "min": 0.6534488975859385, - "max": 98.15463652420176, - "q25": 40.59131151508852, - "q50": 71.44444119849118, - "q75": 81.54953124371184, - "std": 26.07679480798417, + "approx_unique": 110, + "avg": 61.12278600295826, + "min": 0.6534488975859953, + "max": 98.15463652420087, + "q25": 40.59131151509016, + "q50": 71.44444119849189, + "q75": 81.54953124371187, + "std": 26.076794807984033, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -482,14 +522,14 @@ "role": "numerical", "statistics": { "count": 110, - "approx_unique": 96, - "avg": 0.37822210364341335, - "min": -0.9960609367144815, - "max": 6.973058442481465, - "q25": -0.02509066174406189, - "q50": 0.16454383870471806, - "q75": 0.30357637468359877, - "std": 1.1509689359232387, + "approx_unique": 120, + "avg": 0.3782221036628038, + "min": -0.9960609366987908, + "max": 6.973058442492447, + "q25": -0.025090661724871316, + "q50": 0.1645438387261663, + "q75": 0.3035763747028691, + "std": 1.1509689359209772, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -531,18 +571,18 @@ "propositionalization": { "aggregation": [ "COUNT DISTINCT", - "STDDEV", - "COUNT", - "FIRST", "COUNT MINUS COUNT DISTINCT", - "MEDIAN", "AVG", - "MAX", - "LAST", - "MIN", "SUM", + "TREND", + "MIN", + "STDDEV", + "LAST", + "MAX", + "FIRST", + "MEDIAN", "MODE", - "TREND" + "COUNT" ], "delta_t": 0.0, "loss_function": "SquareLoss", @@ -688,4 +728,4 @@ }, "peripheral": {} } -} +} \ No newline at end of file diff --git a/tests/integration/data/robot/expected.pipeline.json b/tests/integration/data/robot/expected.pipeline.json index df8eb6e..7388fa9 100644 --- a/tests/integration/data/robot/expected.pipeline.json +++ b/tests/integration/data/robot/expected.pipeline.json @@ -1,17 +1,185 @@ { - "id": "kSE7Uw", + "id": "XLdkKu", "predictions": { "train": { - "name": "train", - "path": "pipeline/predictions/train.parquet" + "name": "prediction.train", + "path": "pipeline/predictions/prediction.train.parquet", + "column_profile": { + "0": { + "name": "0", + "role": "unused_float", + "statistics": { + "count": 90, + "approx_unique": 92, + "avg": -10.733253468407526, + "min": -11.220943450927734, + "max": -10.405853271484375, + "q25": -10.853607177734375, + "q50": -10.71152925491333, + "q75": -10.611865997314453, + "std": 0.17923356795586112, + "null_percentage": 0.0, + "column_type": "DOUBLE", + "type": "unused_float" + } + }, + "1": { + "name": "1", + "role": "unused_float", + "statistics": { + "count": 90, + "approx_unique": 96, + "avg": 6.420799822277493, + "min": 5.9611358642578125, + "max": 6.9497480392456055, + "q25": 6.250061988830566, + "q50": 6.429170370101929, + "q75": 6.589086055755615, + "std": 0.23492959994799614, + "null_percentage": 0.0, + "column_type": "DOUBLE", + "type": "unused_float" + } + }, + "2": { + "name": "2", + "role": "unused_float", + "statistics": { + "count": 90, + "approx_unique": 89, + "avg": -7.648273118336996, + "min": -7.977388381958008, + "max": -7.296791076660156, + "q25": -7.702513217926025, + "q50": -7.645588159561157, + "q75": -7.6043314933776855, + "std": 0.11707979723485837, + "null_percentage": 0.0, + "column_type": "DOUBLE", + "type": "unused_float" + } + } + } }, "validation": { - "name": "validation", - "path": "pipeline/predictions/validation.parquet" + "name": "prediction.validation", + "path": "pipeline/predictions/prediction.validation.parquet", + "column_profile": { + "0": { + "name": "0", + "role": "unused_float", + "statistics": { + "count": 30, + "approx_unique": 35, + "avg": -10.93016627629598, + "min": -11.100025177001953, + "max": -10.782144546508789, + "q25": -11.029928207397461, + "q50": -10.914761066436768, + "q75": -10.841729164123535, + "std": 0.09749280830566094, + "null_percentage": 0.0, + "column_type": "DOUBLE", + "type": "unused_float" + } + }, + "1": { + "name": "1", + "role": "unused_float", + "statistics": { + "count": 30, + "approx_unique": 32, + "avg": 6.241030391057333, + "min": 5.999167442321777, + "max": 6.531711578369141, + "q25": 6.109324932098389, + "q50": 6.203312873840332, + "q75": 6.360936164855957, + "std": 0.14808194665012037, + "null_percentage": 0.0, + "column_type": "DOUBLE", + "type": "unused_float" + } + }, + "2": { + "name": "2", + "role": "unused_float", + "statistics": { + "count": 30, + "approx_unique": 30, + "avg": -7.675311549504598, + "min": -7.775376319885254, + "max": -7.590496063232422, + "q25": -7.704952239990234, + "q50": -7.686032295227051, + "q75": -7.637664794921875, + "std": 0.04378525558931166, + "null_percentage": 0.0, + "column_type": "DOUBLE", + "type": "unused_float" + } + } + } }, "test": { - "name": "test", - "path": "pipeline/predictions/test.parquet" + "name": "prediction.test", + "path": "pipeline/predictions/prediction.test.parquet", + "column_profile": { + "0": { + "name": "0", + "role": "unused_float", + "statistics": { + "count": 30, + "approx_unique": 30, + "avg": -10.998719056447348, + "min": -11.182074546813965, + "max": -10.69487476348877, + "q25": -11.061859130859375, + "q50": -11.018784046173096, + "q75": -10.933016777038574, + "std": 0.10115848786445528, + "null_percentage": 0.0, + "column_type": "DOUBLE", + "type": "unused_float" + } + }, + "1": { + "name": "1", + "role": "unused_float", + "statistics": { + "count": 30, + "approx_unique": 25, + "avg": 6.590824826558431, + "min": 6.305027008056641, + "max": 6.856055736541748, + "q25": 6.494635581970215, + "q50": 6.575701713562012, + "q75": 6.6769700050354, + "std": 0.13825323392743774, + "null_percentage": 0.0, + "column_type": "DOUBLE", + "type": "unused_float" + } + }, + "2": { + "name": "2", + "role": "unused_float", + "statistics": { + "count": 30, + "approx_unique": 27, + "avg": -7.63157286643982, + "min": -7.698906421661377, + "max": -7.564383506774902, + "q25": -7.653155326843262, + "q50": -7.634854078292847, + "q75": -7.6035566329956055, + "std": 0.034576567990887985, + "null_percentage": 0.0, + "column_type": "DOUBLE", + "type": "unused_float" + } + } + } } }, "feature_sets": { @@ -6852,19 +7020,19 @@ "num_threads": 0, "propositionalization": { "aggregation": [ - "MAX", - "SUM", + "COUNT DISTINCT", + "COUNT MINUS COUNT DISTINCT", "AVG", - "STDDEV", + "SUM", + "TREND", "MIN", - "COUNT MINUS COUNT DISTINCT", - "COUNT", + "STDDEV", "LAST", - "COUNT DISTINCT", - "MEDIAN", + "MAX", "FIRST", - "TREND", - "MODE" + "MEDIAN", + "MODE", + "COUNT" ], "delta_t": 0.0, "loss_function": "SquareLoss", @@ -6940,7 +7108,7 @@ "preprocessors": [], "share_selected_features": 0.5, "tags": [ - "container-RYJL5w" + "container-0ENVoV" ], "targets": [ "f_x", @@ -7198,4 +7366,4 @@ }, "peripheral": {} } -} +} \ No newline at end of file diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index 9aedfc4..b8208f0 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -16,7 +16,7 @@ def load_container_information( assert container_information_json_path.exists() assert container_information_json_path.is_file() - container_information_json = json.loads(container_information_json_path.read_text()) + container_information_json = json.loads(container_information_json_path.read_text()) # pyright: ignore [reportAny] container_information_json["path"] = Path("dummy.json") return ContainerInformationAdapter.validate_python(container_information_json) @@ -27,6 +27,6 @@ def load_pipeline_information( assert pipeline_information_json_path.exists() assert pipeline_information_json_path.is_file() - pipeline_information_json = json.loads(pipeline_information_json_path.read_text()) + pipeline_information_json = json.loads(pipeline_information_json_path.read_text()) # pyright: ignore [reportAny] pipeline_information_json["path"] = Path("dummy.json") return PipelineInformationAdapter.validate_python(pipeline_information_json) diff --git a/tests/integration/test_serialize_loans.py b/tests/integration/test_serialize_loans.py index c5a379d..85e1d74 100644 --- a/tests/integration/test_serialize_loans.py +++ b/tests/integration/test_serialize_loans.py @@ -93,7 +93,7 @@ def test_serialize_loans( "features.test.parquet", ], expected_predictions=[ - "train.parquet", - "test.parquet", + "prediction.train.parquet", + "prediction.test.parquet", ], ) diff --git a/tests/integration/test_serialize_numerical.py b/tests/integration/test_serialize_numerical.py index 10b82d0..15290b1 100644 --- a/tests/integration/test_serialize_numerical.py +++ b/tests/integration/test_serialize_numerical.py @@ -91,7 +91,7 @@ def test_serialize_numerical( "features.test.parquet", ], expected_predictions=[ - "train.parquet", - "test.parquet", + "prediction.train.parquet", + "prediction.test.parquet", ], ) diff --git a/tests/integration/test_serialize_robot.py b/tests/integration/test_serialize_robot.py index 39acd28..9cfb766 100644 --- a/tests/integration/test_serialize_robot.py +++ b/tests/integration/test_serialize_robot.py @@ -93,8 +93,8 @@ def test_serialize_robot( "features.validation.parquet", ], expected_predictions=[ - "train.parquet", - "test.parquet", - "validation.parquet", + "prediction.train.parquet", + "prediction.test.parquet", + "prediction.validation.parquet", ], ) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index ae80cd3..6b95f92 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -3,11 +3,12 @@ import re from collections.abc import Mapping, Sequence from pathlib import Path -from typing import Protocol +from typing import Any, Protocol import getml.data.roles as getml_roles import numpy as np import pandas as pd +import pyarrow as pa import pytest import pytest_mock from duckdb import DuckDBPyConnection @@ -38,7 +39,6 @@ JoinInformation, PlaceholderInformation, ) -from getml_io.metadata.table_information import TableInformation from getml_io.utils.convert import assume_is_str @@ -109,10 +109,10 @@ def __getitem__(self, key: str) -> Subset: ... container.train = mock_subsets["train"] container.test = mock_subsets["test"] container.validation = mock_subsets["validation"] - container.__getitem__.side_effect = lambda key: Subset( - container_id=container.id, - population=container.subsets[key].with_name(key), - peripheral=container.peripheral, + container.__getitem__.side_effect = lambda key: Subset( # pyright: ignore [reportAny, reportUnknownLambdaType] + container_id=container.id, # pyright: ignore [reportAny] + population=container.subsets[key].with_name(key), # pyright: ignore [reportAny] + peripheral=container.peripheral, # pyright: ignore [reportAny] ) return container @@ -239,8 +239,8 @@ def mock_dataframe( ] dataframe.roles = mocker.MagicMock() dataframe.roles.column = ( - lambda name: getml_roles.categorical - if name.startswith("Categorical") + lambda name: getml_roles.categorical # pyright: ignore [reportUnknownLambdaType] + if name.startswith("Categorical") # pyright: ignore [reportUnknownMemberType] else getml_roles.numerical ) return dataframe @@ -391,7 +391,7 @@ def mock_pipeline( ) -> Pipeline: pipeline = mocker.Mock() pipeline.id = "mock_pipeline_id" - pipeline.predict = lambda _: ndarray + pipeline.predict = lambda _: ndarray # pyright: ignore [reportUnknownLambdaType] def pipeline_transform(_: DataFrame | View | Subset, *, df_name: str) -> DataFrame: dataframe = copy.deepcopy(mock_dataframe) @@ -425,14 +425,6 @@ def pipeline_transform(_: DataFrame | View | Subset, *, df_name: str) -> DataFra return pipeline -@pytest.fixture -def table_information(tmp_path: Path) -> TableInformation: - return TableInformation( - name="table_name", - path=tmp_path / "table_name.parquet", - ) - - @pytest.fixture def mock_project_empty( mocker: pytest_mock.MockerFixture, @@ -445,26 +437,6 @@ def mock_project_empty( ) -@pytest.fixture -def table_information_test( - tmp_path: Path, -) -> TableInformation: - return TableInformation( - name="test", - path=tmp_path / "test.parquet", - ) - - -@pytest.fixture -def table_information_validation( - tmp_path: Path, -) -> TableInformation: - return TableInformation( - name="validation", - path=tmp_path / "validation.parquet", - ) - - @pytest.fixture def dataframe_information_features_test( tmp_path: Path, @@ -567,8 +539,8 @@ def category_trimmer() -> CategoryTrimmer: @pytest.fixture def pipeline_information( # noqa: PLR0913 tmp_path: Path, - table_information_test: TableInformation, - table_information_validation: TableInformation, + dataframe_information_test: DataFrameInformation, + dataframe_information_validation: DataFrameInformation, dataframe_information_features_test: DataFrameInformation, dataframe_information_features_validation: DataFrameInformation, data_model_information: DataModelInformation, @@ -580,18 +552,18 @@ def pipeline_information( # noqa: PLR0913 id="pipeline_id", predictions={ "test": dataclasses.replace( - table_information_test, + dataframe_information_test, path=tmp_path / "pipeline" / "predictions" - / table_information_test.path.name, + / dataframe_information_test.path.name, ), "validation": dataclasses.replace( - table_information_validation, + dataframe_information_validation, path=tmp_path / "pipeline" / "predictions" - / table_information_validation.path.name, + / dataframe_information_validation.path.name, ), }, feature_sets={ @@ -683,7 +655,7 @@ def mock_duckdb_execute( ) -> None: connection_context_manager = mocker.MagicMock(DuckDBPyConnection) connection = mocker.MagicMock(DuckDBPyConnection) - connection_context_manager.__enter__.return_value = connection + connection_context_manager.__enter__.return_value = connection # pyright: ignore [reportAny] _ = mocker.patch( "getml_io.serialize.dataframe_or_view.duckdb.connect", return_value=connection_context_manager, @@ -703,9 +675,31 @@ def mocked_df() -> pd.DataFrame: ] return generate_raw_summary_statistics_pd(current_dataframe) - mock_execution.df.side_effect = mocked_df + mock_execution.df.side_effect = mocked_df # pyright: ignore [reportAny] return mock_execution - connection.execute.side_effect = mocked_execute + connection.execute.side_effect = mocked_execute # pyright: ignore [reportAny] return mock_duckdb_execute + + +@pytest.fixture +def mock_getml_dataframe_from_array( + mocker: pytest_mock.MockerFixture, + mock_dataframe: DataFrame, +) -> None: + def mock_from_array( + table: pa.Table, # pyright: ignore [reportUnknownMemberType, reportUnknownParameterType] + name: str, + *_args: tuple[Any, ...], # pyright: ignore [reportExplicitAny] + **_kwargs: dict[str, Any], # pyright: ignore [reportExplicitAny] + ) -> DataFrame: + _ = table # pyright: ignore [reportUnknownVariableType] + dataframe = copy.deepcopy(mock_dataframe) + dataframe.name = name + return dataframe + + _ = mocker.patch( + "getml_io.serialize.pipeline.DataFrame.from_arrow", + side_effect=mock_from_array, + ) diff --git a/tests/unit/getml/test_project.py b/tests/unit/getml/test_project.py index 7a6dc88..a2fb111 100644 --- a/tests/unit/getml/test_project.py +++ b/tests/unit/getml/test_project.py @@ -59,9 +59,9 @@ def test_load_project( # Then mock_alive_getml.assert_called() - mock_alive_getml_contextmanager = as_magicmock(mock_alive_getml.return_value) - as_magicmock(mock_alive_getml_contextmanager.__enter__).assert_called() - as_magicmock(mock_alive_getml_contextmanager.__exit__).assert_called() + mock_alive_getml_contextmanager = as_magicmock(mock_alive_getml.return_value) # pyright: ignore [reportAny] + as_magicmock(mock_alive_getml_contextmanager.__enter__).assert_called() # pyright: ignore [reportAny] + as_magicmock(mock_alive_getml_contextmanager.__exit__).assert_called() # pyright: ignore [reportAny] mock_list_projects.assert_called() mock_list_running_projects.assert_called() mock_set_project.assert_called_with("project_name") @@ -121,9 +121,9 @@ def test_load_project_raises_project_not_found( assert "project_name" in str(excinfo.value) mock_alive_getml.assert_called() - mock_alive_getml_contextmanager = as_magicmock(mock_alive_getml.return_value) - as_magicmock(mock_alive_getml_contextmanager.__enter__).assert_called() - as_magicmock(mock_alive_getml_contextmanager.__exit__).assert_called() + mock_alive_getml_contextmanager = as_magicmock(mock_alive_getml.return_value) # pyright: ignore [reportAny] + as_magicmock(mock_alive_getml_contextmanager.__enter__).assert_called() # pyright: ignore [reportAny] + as_magicmock(mock_alive_getml_contextmanager.__exit__).assert_called() # pyright: ignore [reportAny] mock_list_projects.assert_called() mock_list_running_projects.assert_not_called() mock_set_project.assert_not_called() @@ -183,9 +183,9 @@ def test_load_project_raises_pipeline_not_found( assert "pipeline_id" in str(excinfo.value) mock_alive_getml.assert_called() - mock_alive_getml_contextmanager = as_magicmock(mock_alive_getml.return_value) - as_magicmock(mock_alive_getml_contextmanager.__enter__).assert_called() - as_magicmock(mock_alive_getml_contextmanager.__exit__).assert_called() + mock_alive_getml_contextmanager = as_magicmock(mock_alive_getml.return_value) # pyright: ignore [reportAny] + as_magicmock(mock_alive_getml_contextmanager.__enter__).assert_called() # pyright: ignore [reportAny] + as_magicmock(mock_alive_getml_contextmanager.__exit__).assert_called() # pyright: ignore [reportAny] mock_list_projects.assert_called() mock_list_running_projects.assert_called() mock_set_project.assert_called_with("project_name") diff --git a/tests/unit/metadata/test_container_information.py b/tests/unit/metadata/test_container_information.py index 8a601b4..0bbd005 100644 --- a/tests/unit/metadata/test_container_information.py +++ b/tests/unit/metadata/test_container_information.py @@ -13,7 +13,7 @@ def test_serialize_model_without_dataframe_information( container_information_empty: ContainerInformation, ) -> None: # When - serialized_model = TypeAdapter(ContainerInformation).dump_python( + serialized_model = TypeAdapter(ContainerInformation).dump_python( # pyright: ignore [reportAny] container_information_empty, ) # Then @@ -38,7 +38,7 @@ def test_serialize_model( container_information: ContainerInformation, ) -> None: # When - serialized_model = TypeAdapter(ContainerInformation).dump_python( + serialized_model = TypeAdapter(ContainerInformation).dump_python( # pyright: ignore [reportAny] container_information, ) diff --git a/tests/unit/metadata/test_pipeline_information.py b/tests/unit/metadata/test_pipeline_information.py index fd69582..7ec8e15 100644 --- a/tests/unit/metadata/test_pipeline_information.py +++ b/tests/unit/metadata/test_pipeline_information.py @@ -14,7 +14,7 @@ def test_serialize_model_without_transforms( pipeline_information_empty: PipelineInformation, ) -> None: # When - serialized_model = TypeAdapter(PipelineInformation).dump_python( + serialized_model = TypeAdapter(PipelineInformation).dump_python( # pyright: ignore [reportAny] pipeline_information_empty, ) @@ -68,13 +68,13 @@ def test_serialize_model( pipeline_information: PipelineInformation, ) -> None: # When - serialized_model = TypeAdapter(PipelineInformation).dump_python( + serialized_model = TypeAdapter(PipelineInformation).dump_python( # pyright: ignore [reportAny] pipeline_information, ) # Then serialized_model["feature_learners"][0]["aggregation"] = list( - serialized_model["feature_learners"][0]["aggregation"], + serialized_model["feature_learners"][0]["aggregation"], # pyright: ignore [reportAny] ) expected_serialized_pipeline_information = ( _get_expected_serialized_pipeline_information() @@ -107,12 +107,14 @@ def _get_expected_serialized_pipeline_information() -> PipelineInformationType: "id": "pipeline_id", "predictions": { "test": { - "name": "test", - "path": Path("pipeline/predictions/test.parquet"), + "name": "dataframe_test", + "path": Path("pipeline/predictions/dataframe_test.parquet"), + "column_profile": expected_column_profile, }, "validation": { - "name": "validation", - "path": Path("pipeline/predictions/validation.parquet"), + "name": "dataframe_validation", + "path": Path("pipeline/predictions/dataframe_validation.parquet"), + "column_profile": expected_column_profile, }, }, "feature_sets": { diff --git a/tests/unit/metadata/test_utils.py b/tests/unit/metadata/test_utils.py index 90dcbb5..224070c 100644 --- a/tests/unit/metadata/test_utils.py +++ b/tests/unit/metadata/test_utils.py @@ -5,98 +5,45 @@ from getml_io.metadata.dataframe_information import DataFrameInformation from getml_io.metadata.exception import ( DataFrameInformationPathNotRelativeError, - TableInformationPathNotRelativeError, ) -from getml_io.metadata.table_information import TableInformation from getml_io.metadata.utils import derive_instance_with_relative_path @pytest.mark.unit -@pytest.mark.parametrize( - ("instance_fixture", "error_factory"), - [ - ("dataframe_information", DataFrameInformationPathNotRelativeError), - ("table_information", TableInformationPathNotRelativeError), - ], -) def test_derive_instance_with_relative_path( - request: pytest.FixtureRequest, tmp_path: Path, - instance_fixture: str, - error_factory: type[ - DataFrameInformationPathNotRelativeError | TableInformationPathNotRelativeError - ], + dataframe_information: DataFrameInformation, ) -> None: # Given - instance: DataFrameInformation | TableInformation = request.getfixturevalue( - instance_fixture, - ) # When derived_instance = derive_instance_with_relative_path( - instance, + dataframe_information, tmp_path, - error_factory, ) # Then - assert isinstance(derived_instance, type(instance)) - assert derived_instance.path == Path(instance.path.name) + assert isinstance(derived_instance, DataFrameInformation) + assert derived_instance.path == Path(dataframe_information.path.name) @pytest.mark.unit -@pytest.mark.parametrize( - ("instance_fixture", "error_factory"), - [ - ("dataframe_information", DataFrameInformationPathNotRelativeError), - ("table_information", TableInformationPathNotRelativeError), - ], -) def test_derive_instance_with_relative_path_not_relative( - request: pytest.FixtureRequest, - instance_fixture: str, - error_factory: type[ - DataFrameInformationPathNotRelativeError | TableInformationPathNotRelativeError - ], + dataframe_information: DataFrameInformation, ) -> None: # Given - instance: DataFrameInformation | TableInformation = request.getfixturevalue( - instance_fixture, - ) non_relative_path = Path("/non/relative/path") # When / Then with pytest.raises( - error_factory, + DataFrameInformationPathNotRelativeError, match=( r"'.*Information' with name '.*_name' " - f"and path '{instance.path}' " + f"and path '{dataframe_information.path}' " r"is not relative to base path '/non/relative/path'." ), ): _ = derive_instance_with_relative_path( - instance, + dataframe_information, non_relative_path, - error_factory, - ) - - -@pytest.mark.unit -def test_derive_instance_with_relative_path_not_dataclass( - tmp_path: Path, -) -> None: - # Given - class NotADataclass: - name: str = "not_a_dataclass_instance" - path: Path = Path("not_a_dataclass_instance.parquet") - - # When / Then - with pytest.raises( - TypeError, - match=r"Instance must be a dataclass:", - ): - _ = derive_instance_with_relative_path( - NotADataclass(), - tmp_path, - DataFrameInformationPathNotRelativeError, ) diff --git a/tests/unit/serialize/test_ndarray.py b/tests/unit/serialize/test_ndarray.py deleted file mode 100644 index 1660f0f..0000000 --- a/tests/unit/serialize/test_ndarray.py +++ /dev/null @@ -1,66 +0,0 @@ -from pathlib import Path - -import numpy as np -import pytest -import pytest_mock -from numpy.typing import NDArray -from pyarrow import parquet - -from getml_io.serialize.exception import TableParquetStorageError -from getml_io.serialize.ndarray import serialize_ndarray -from getml_io.utils.exception import StorageDirectoryCreationError - - -@pytest.mark.unit -def test_serialize_ndarray(tmp_path: Path, ndarray: NDArray[np.float64]) -> None: - # Given - name = "test_array" - - # When - path = serialize_ndarray(ndarray, tmp_path, name) - - # Then - assert path.exists() - assert path.is_file() - assert path.name == f"{name}.parquet" - - -@pytest.mark.unit -def test_serialize_ndarray_directory_creation_error( - ndarray: NDArray[np.float64], -) -> None: - # Given - invalid_target_storage_directory = Path("/invalid/ndarrays") - name = "test_array" - - # When / Then - with pytest.raises( - StorageDirectoryCreationError, - match=r"Failed to create storage directory '/invalid/ndarrays'.", - ): - _ = serialize_ndarray(ndarray, invalid_target_storage_directory, name) - - -@pytest.mark.unit -def test_serialize_ndarray_storage_error( - tmp_path: Path, - mocker: pytest_mock.MockerFixture, - ndarray: NDArray[np.float64], -) -> None: - # Given - name = "test_array" - - mock_write_table = mocker.patch.object( - parquet, - "write_table", - side_effect=Exception("Storage error"), - ) - - # When / Then - with pytest.raises( - TableParquetStorageError, - match=(r"Failed to store Table as parquet 'test_array' at path"), - ): - _ = serialize_ndarray(ndarray, tmp_path, name) - - mock_write_table.assert_called() diff --git a/tests/unit/serialize/test_pipeline.py b/tests/unit/serialize/test_pipeline.py index 5639d6e..ad523c7 100644 --- a/tests/unit/serialize/test_pipeline.py +++ b/tests/unit/serialize/test_pipeline.py @@ -48,19 +48,24 @@ @pytest.mark.unit -def test_serialize_pipeline( +def test_serialize_pipeline( # noqa: PLR0913 tmp_path: Path, mock_pipeline: Pipeline, mock_container: Container, mock_duckdb_execute_factory: MockDuckDBExecuteFactory, mock_dataframe: DataFrame, + mock_getml_dataframe_from_array: None, ) -> None: # Given + _ = mock_getml_dataframe_from_array mock_duckdb_execute_factory( { Path("pipeline/feature_sets/features.train.parquet"): mock_dataframe, Path("pipeline/feature_sets/features.test.parquet"): mock_dataframe, Path("pipeline/feature_sets/features.validation.parquet"): mock_dataframe, + Path("pipeline/predictions/prediction.train.parquet"): mock_dataframe, + Path("pipeline/predictions/prediction.test.parquet"): mock_dataframe, + Path("pipeline/predictions/prediction.validation.parquet"): mock_dataframe, }, ) @@ -161,11 +166,24 @@ def test_serialize_feature_sets( @pytest.mark.unit -def test_serialize_predictions( +def test_serialize_predictions( # noqa: PLR0913 tmp_path: Path, mock_pipeline: Pipeline, mock_container: Container, + mock_duckdb_execute_factory: MockDuckDBExecuteFactory, + mock_dataframe: DataFrame, + mock_getml_dataframe_from_array: None, ) -> None: + # Given + _ = mock_getml_dataframe_from_array + mock_duckdb_execute_factory( + { + Path("predictions/prediction.train.parquet"): mock_dataframe, + Path("predictions/prediction.test.parquet"): mock_dataframe, + Path("predictions/prediction.validation.parquet"): mock_dataframe, + }, + ) + # When prediction_results = serialize_predictions( mock_pipeline, @@ -191,12 +209,10 @@ def _assert_predictions_valid( path: Path, ) -> None: assert predictions[subset_name] - assert predictions[subset_name].name == subset_name + assert predictions[subset_name].name == f"prediction.{subset_name}" - expected_path = path / f"{subset_name}.parquet" + expected_path = path / f"prediction.{subset_name}.parquet" assert predictions[subset_name].path == expected_path - assert expected_path.exists() - assert expected_path.is_file() def _assert_features_valid( diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py index 144cef8..3094a32 100644 --- a/tests/unit/test_cli.py +++ b/tests/unit/test_cli.py @@ -53,7 +53,7 @@ def test_serialize( DEFAULT_ROOT_STORAGE_DIRECTORY, clear_storage_directory=False, ) - assert mock_logging_basic_config.call_args.kwargs["level"] == logging.WARNING + assert mock_logging_basic_config.call_args.kwargs["level"] == logging.WARNING # pyright: ignore [reportAny] @pytest.mark.unit @@ -99,7 +99,7 @@ def test_serialize_with_clear_storage_directory( root_storage_directory, clear_storage_directory=True, ) - assert mock_logging_basic_config.call_args.kwargs["level"] == logging.INFO + assert mock_logging_basic_config.call_args.kwargs["level"] == logging.INFO # pyright: ignore [reportAny] @pytest.mark.unit diff --git a/tests/unit/utils/test_storage.py b/tests/unit/utils/test_storage.py index e7bd7ee..190afbd 100644 --- a/tests/unit/utils/test_storage.py +++ b/tests/unit/utils/test_storage.py @@ -250,7 +250,7 @@ def test_get_default_root_storage_directory( if platform_system == Windows: monkeypatch.setattr( "platformdirs.windows.get_win_folder", - lambda _: Path("~/AppData/Local").expanduser(), + lambda _: Path("~/AppData/Local").expanduser(), # pyright: ignore [reportUnknownArgumentType, reportUnknownLambdaType] ) # When