diff --git a/.github/workflows/python-tests.yml b/.github/workflows/python-tests.yml index c41573d..cfd58ab 100644 --- a/.github/workflows/python-tests.yml +++ b/.github/workflows/python-tests.yml @@ -60,6 +60,7 @@ jobs: run: | uv run ruff check --extend-ignore FIX . uv run ruff format --check . + uv run basedpyright . - name: Show TODOs run: | uv run ruff check --select FIX . || true diff --git a/src/getml_io/getml/getml.py b/src/getml_io/getml/getml.py index f63b37c..41cf1b9 100644 --- a/src/getml_io/getml/getml.py +++ b/src/getml_io/getml/getml.py @@ -29,7 +29,7 @@ def alive_getml() -> Generator[None]: getml_is_alive_on_entry = getml.communication.is_monitor_alive() if not getml_is_alive_on_entry: try: - getml.engine.launch(launch_browser=False) + getml.engine.launch(launch_browser=False) # pyright: ignore [reportUnknownMemberType] except Exception as exception: message = ( "Failed to launch getML engine. " diff --git a/src/getml_io/metadata/container_information.py b/src/getml_io/metadata/container_information.py index 441431b..9a80835 100644 --- a/src/getml_io/metadata/container_information.py +++ b/src/getml_io/metadata/container_information.py @@ -9,7 +9,6 @@ from typing_extensions import TypedDict from getml_io.metadata.dataframe_information import DataFrameInformation -from getml_io.metadata.exception import DataFrameInformationPathNotRelativeError from getml_io.metadata.utils import derive_instance_with_relative_path @@ -60,5 +59,4 @@ def _create_dataframe_information_with_relative_path( return derive_instance_with_relative_path( dataframe_information, self.path, - DataFrameInformationPathNotRelativeError, ) diff --git a/src/getml_io/metadata/dataframe_information.py b/src/getml_io/metadata/dataframe_information.py index ba25820..e82bd7f 100644 --- a/src/getml_io/metadata/dataframe_information.py +++ b/src/getml_io/metadata/dataframe_information.py @@ -21,7 +21,7 @@ class ColumnStatisticsDouble: q25: float q50: float q75: float - std: float + std: float | None null_percentage: float column_type: Literal["DOUBLE"] diff --git a/src/getml_io/metadata/exception.py b/src/getml_io/metadata/exception.py index a4508cc..903c27c 100644 --- a/src/getml_io/metadata/exception.py +++ b/src/getml_io/metadata/exception.py @@ -42,25 +42,3 @@ def __init__( path=path, base_path=base_path, ) - - -class TableInformationPathNotRelativeError(PathNotRelativeError): - """Exception raised on erroneous TableInformation path. - - Raised when the path of a TableInformation is not relative - to the given base path. - """ - - def __init__( - self, - name: str, - path: Path, - base_path: Path, - ) -> None: - """Initialize the exception with a custom message.""" - super().__init__( - item_type="TableInformation", - item_name=name, - path=path, - base_path=base_path, - ) diff --git a/src/getml_io/metadata/pipeline_information.py b/src/getml_io/metadata/pipeline_information.py index 903d38d..541d109 100644 --- a/src/getml_io/metadata/pipeline_information.py +++ b/src/getml_io/metadata/pipeline_information.py @@ -20,10 +20,6 @@ from getml_io.getml.predictors import FeatureSelector, Predictor from getml_io.getml.preprocessors import Preprocessor from getml_io.metadata.data_model_information import DataModelInformation -from getml_io.metadata.exception import ( - DataFrameInformationPathNotRelativeError, - TableInformationPathNotRelativeError, -) from getml_io.metadata.feature_sets import FeatureSets from getml_io.metadata.placeholder_information import PlaceholderInformation from getml_io.metadata.prediction_results import PredictionResults @@ -92,17 +88,15 @@ def _serialize_model(self) -> PipelineInformationDict: "id": self.id, "predictions": { name: derive_instance_with_relative_path( - table_information, + dataframe_information, self.path, - TableInformationPathNotRelativeError, ) - for name, table_information in self.predictions.items() + for name, dataframe_information in self.predictions.items() }, "feature_sets": { name: derive_instance_with_relative_path( dataframe_information, self.path, - DataFrameInformationPathNotRelativeError, ) for name, dataframe_information in self.feature_sets.items() }, diff --git a/src/getml_io/metadata/prediction_results.py b/src/getml_io/metadata/prediction_results.py index 1391c46..4debb21 100644 --- a/src/getml_io/metadata/prediction_results.py +++ b/src/getml_io/metadata/prediction_results.py @@ -1,6 +1,6 @@ from collections.abc import Mapping from typing import TypeAlias -from getml_io.metadata.table_information import TableInformation +from getml_io.metadata.dataframe_information import DataFrameInformation -PredictionResults: TypeAlias = Mapping[str, TableInformation] +PredictionResults: TypeAlias = Mapping[str, DataFrameInformation] diff --git a/src/getml_io/metadata/table_information.py b/src/getml_io/metadata/table_information.py deleted file mode 100644 index 41fa038..0000000 --- a/src/getml_io/metadata/table_information.py +++ /dev/null @@ -1,9 +0,0 @@ -from pathlib import Path - -from pydantic.dataclasses import dataclass - - -@dataclass -class TableInformation: - name: str - path: Path diff --git a/src/getml_io/metadata/utils.py b/src/getml_io/metadata/utils.py index 9f23967..3cbdc96 100644 --- a/src/getml_io/metadata/utils.py +++ b/src/getml_io/metadata/utils.py @@ -1,55 +1,38 @@ import dataclasses -from collections.abc import Callable from pathlib import Path -from typing import Protocol, TypeVar -from getml_io.metadata.exception import PathNotRelativeError - - -class InstanceProtocol(Protocol): - name: str - path: Path - - -InstanceType = TypeVar("InstanceType", bound=InstanceProtocol) -ErrorType = TypeVar("ErrorType", bound=PathNotRelativeError) -ErrorFactory = Callable[[str, Path, Path], ErrorType] +from getml_io.metadata.dataframe_information import DataFrameInformation +from getml_io.metadata.exception import ( + DataFrameInformationPathNotRelativeError, +) def derive_instance_with_relative_path( - instance: InstanceType, + dataframe_information: DataFrameInformation, base_path: Path, - error_factory: ErrorFactory[ErrorType], -) -> InstanceType: +) -> DataFrameInformation: """Derive a copy of an instance with a path relative to the given base path. Args: - instance: The instance to use as a template. + dataframe_information: The instance to use as a template. base_path: The base path to which the instance's path should be relative. - error_factory: A callable that creates an error if the path is not relative. Returns: A new instance with the path relative to the base path. Raises: - PathNotRelativeError: If the instance's path cannot be made relative - to the base path. The specific subclass raised is determined - by the `error_factory`. - TypeError: If the instance is not a dataclass. + DataFrameInformationPathNotRelativeError: If the instance's path cannot be made + relative to the base path. """ - if not dataclasses.is_dataclass(instance): - message = f"Instance must be a dataclass: {type(instance)}" - raise TypeError(message) try: return dataclasses.replace( - instance, - path=instance.path.relative_to(base_path), + dataframe_information, + path=dataframe_information.path.relative_to(base_path), ) except Exception as exception: - error = error_factory( - instance.name, - instance.path, + raise DataFrameInformationPathNotRelativeError( + dataframe_information.name, + dataframe_information.path, base_path, - ) - raise error from exception + ) from exception diff --git a/src/getml_io/serialize/dataframe_or_view.py b/src/getml_io/serialize/dataframe_or_view.py index dfa7254..c84da94 100644 --- a/src/getml_io/serialize/dataframe_or_view.py +++ b/src/getml_io/serialize/dataframe_or_view.py @@ -131,7 +131,7 @@ def _fetch_raw_summary_statistics( parquet_filepath: Path, ) -> dict[str, dict[str, str | int | float]]: with ( - duckdb.connect() as connection, + duckdb.connect() as connection, # pyright: ignore [reportUnknownMemberType] ): logger.debug( "Calculating summary statistics for Parquet '%s'", @@ -141,7 +141,7 @@ def _fetch_raw_summary_statistics( "dict[str, dict[str, str | int | float]]", cast( "object", - connection.execute( + connection.execute( # pyright: ignore [reportUnknownMemberType] SUMMARIZE_STATEMENT_TEMPLATE, [str(parquet_filepath)], ) diff --git a/src/getml_io/serialize/exception.py b/src/getml_io/serialize/exception.py index dc0737c..e61bdce 100644 --- a/src/getml_io/serialize/exception.py +++ b/src/getml_io/serialize/exception.py @@ -72,14 +72,6 @@ def __init__(self, pipeline_id: str, pipeline_json_path: Path) -> None: super().__init__("pipeline information", pipeline_id, pipeline_json_path) -class TableParquetStorageError(GetMLIOStorageError): - """Exception raised when storing Table as parquet fails.""" - - def __init__(self, name: str, path: Path) -> None: - """Initialize the exception with a custom message.""" - super().__init__("Table as parquet", name, path) - - class UnsupportedColumnStatisticsError(GetMLIOError): """Exception raised when an unsupported column statistics is encountered.""" diff --git a/src/getml_io/serialize/ndarray.py b/src/getml_io/serialize/ndarray.py deleted file mode 100644 index f8f0205..0000000 --- a/src/getml_io/serialize/ndarray.py +++ /dev/null @@ -1,54 +0,0 @@ -from pathlib import Path - -import numpy as np -import pyarrow as pa -from numpy.typing import NDArray -from pyarrow import Table, parquet - -from getml_io.serialize.exception import TableParquetStorageError -from getml_io.utils.exception import StorageDirectoryCreationError - - -def serialize_ndarray( - array: NDArray[np.float64], - target_storage_directory: Path, - name: str, -) -> Path: - """Serialize a NumPy ndarray into the target storage directory as a Parquet file. - - Args: - array: The NumPy ndarray to serialize. - target_storage_directory: The directory where the serialized ndarray - will be saved. - name: The name of the serialized ndarray. - - Returns: - Path: The path to the serialized Parquet file. - - Raises: - StorageDirectoryCreationError: If the target storage directory - cannot be created. - TableParquetStorageError: If storing the ndarray as a Parquet file fails. - - """ - try: - target_storage_directory.mkdir(parents=True, exist_ok=True) - except Exception as exception: - raise StorageDirectoryCreationError(target_storage_directory) from exception - - path = target_storage_directory / f"{name}.parquet" - table: Table = Table.from_arrays( - array.transpose(), - schema=pa.schema( - [pa.field(str(i), pa.float64()) for i in range(array.shape[1])], - ), - ) - try: - parquet.write_table( - table=table, - where=path, - ) - except Exception as exception: - raise TableParquetStorageError(name, path) from exception - - return path diff --git a/src/getml_io/serialize/pipeline.py b/src/getml_io/serialize/pipeline.py index 11af6e0..4a44146 100644 --- a/src/getml_io/serialize/pipeline.py +++ b/src/getml_io/serialize/pipeline.py @@ -3,6 +3,7 @@ from typing import cast import numpy as np +import pyarrow as pa from getml import feature_learning as getml_feature_learner from getml import predictors as getml_predictor from getml import preprocessors as getml_preprocessor @@ -47,10 +48,8 @@ PipelineInformation, ) from getml_io.metadata.prediction_results import PredictionResults -from getml_io.metadata.table_information import TableInformation from getml_io.serialize.data_model import serialize_data_model from getml_io.serialize.dataframe_or_view import serialize_dataframe_or_view -from getml_io.serialize.ndarray import serialize_ndarray from getml_io.serialize.pipeline_information import serialize_pipeline_information from getml_io.serialize.placeholder import serialize_placeholder from getml_io.utils.convert import ( @@ -151,16 +150,22 @@ def serialize_predictions( predict_storage_directory = target_storage_directory / "predictions" prediction_results: PredictionResults = {} for subset_name in assume_is_dict_str_to_dataframe_or_view(container.subsets): - prediction = pipeline.predict(container[subset_name]) - # TODO @urfoex: #54 Convert NDArray to DataFrame and use dataframe serialization - path = serialize_ndarray( - array=cast("NDArray[np.float64]", prediction), - target_storage_directory=predict_storage_directory, - name=subset_name, + prediction = cast( + "NDArray[np.float64]", + pipeline.predict(container[subset_name]), # pyright: ignore [reportUnknownMemberType] ) - prediction_results[subset_name] = TableInformation( - name=subset_name, - path=path, + prediction_table = pa.Table.from_arrays( # pyright: ignore [reportUnknownMemberType, reportUnknownVariableType] + prediction.T, + names=list(map(str, range(prediction.shape[1]))), + ) + prediction_dataframe = DataFrame.from_arrow( # pyright: ignore [reportUnknownMemberType] + prediction_table, # pyright: ignore [reportUnknownArgumentType] + f"prediction.{subset_name}", + ) + + prediction_results[subset_name] = serialize_dataframe_or_view( + prediction_dataframe, + predict_storage_directory, ) return prediction_results @@ -187,7 +192,7 @@ def serialize_feature_sets( transform_storage_directory = target_storage_directory / "feature_sets" feature_sets: FeatureSets = {} for subset_name in assume_is_dict_str_to_dataframe_or_view(container.subsets): - features = pipeline.transform( + features = pipeline.transform( # pyright: ignore [reportUnknownMemberType, reportUnknownVariableType] container[subset_name], df_name=f"features.{subset_name}", ) @@ -263,7 +268,7 @@ def serialize_predictor( return TypeAdapter(XGBoostRegressor).validate_python(predictor_as_dict) -def serialize_preprocessor( +def serialize_preprocessor( # noqa: PLR0911 preprocessor: getml_preprocessor.CategoryTrimmer | getml_preprocessor.EmailDomain | getml_preprocessor.Imputation diff --git a/tests/helpers.py b/tests/helpers.py index 834de9b..99f2aee 100644 --- a/tests/helpers.py +++ b/tests/helpers.py @@ -3,7 +3,7 @@ def as_magicmock( - value: Any, # noqa: ANN401 + value: Any, # noqa: ANN401 # pyright: ignore [reportAny, reportExplicitAny] ) -> unittest.mock.MagicMock: """Assume `value` conforms to the return type for static analysis. diff --git a/tests/integration/assertions.py b/tests/integration/assertions.py index 422d10d..1aad1a6 100644 --- a/tests/integration/assertions.py +++ b/tests/integration/assertions.py @@ -8,7 +8,6 @@ DataFrameInformation, ) from getml_io.metadata.pipeline_information import PipelineInformation -from getml_io.metadata.table_information import TableInformation def assert_container_parquets( @@ -168,7 +167,7 @@ def assert_pipeline_information( prediction_name, prediction, ) in expected_pipeline_information.predictions.items(): - assert_table_information( + assert_dataframe_information( pipeline_information.predictions[prediction_name], prediction, ) @@ -211,11 +210,3 @@ def assert_pipeline_information( assert pipeline_information.tags assert pipeline_information.targets == expected_pipeline_information.targets assert pipeline_information.data_model == expected_pipeline_information.data_model - - -def assert_table_information( - table_information: TableInformation, - expected_table_information: TableInformation, -) -> None: - assert table_information.name == expected_table_information.name - assert table_information.path == expected_table_information.path diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 239e4d5..6fa896c 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -34,7 +34,7 @@ def data_path() -> Path: @pytest.fixture def project_name(request: pytest.FixtureRequest) -> str: - name = cast("str", request.node.name) + name = cast("str", request.node.name) # pyright: ignore [reportUnknownMemberType] sanitized_name = re.sub(r"[^a-zA-Z0-9_-]", "_", name) name_hash = hashlib.md5(name.encode("utf-8")).hexdigest()[:6] # noqa: S324 return f"getml-io-{sanitized_name}-{name_hash}" diff --git a/tests/integration/data/getmlproject.py b/tests/integration/data/getmlproject.py index 7d66ed6..312cabf 100644 --- a/tests/integration/data/getmlproject.py +++ b/tests/integration/data/getmlproject.py @@ -107,7 +107,7 @@ def _save_project_bundle(self) -> None: self._project_name, str(self._path), ) - getml.project.save(filename=self._path) + getml.project.save(filename=self._path) # pyright: ignore [reportUnknownMemberType] @dataclass(config=ConfigDict(arbitrary_types_allowed=True)) diff --git a/tests/integration/data/loans/expected.pipeline.json b/tests/integration/data/loans/expected.pipeline.json index e224a2b..2b402a3 100644 --- a/tests/integration/data/loans/expected.pipeline.json +++ b/tests/integration/data/loans/expected.pipeline.json @@ -1,13 +1,53 @@ { - "id": "n4ARIs", + "id": "wzvjNK", "predictions": { "train": { - "name": "train", - "path": "pipeline/predictions/train.parquet" + "name": "prediction.train", + "path": "pipeline/predictions/prediction.train.parquet", + "column_profile": { + "0": { + "name": "0", + "role": "unused_float", + "statistics": { + "count": 459, + "approx_unique": 452, + "avg": 0.11984065395154968, + "min": 0.0013052262365818024, + "max": 0.9940769672393799, + "q25": 0.007235923552394119, + "q50": 0.017314163701874868, + "q75": 0.06367057869728242, + "std": 0.25014608992693693, + "null_percentage": 0.0, + "column_type": "DOUBLE", + "type": "unused_float" + } + } + } }, "test": { - "name": "test", - "path": "pipeline/predictions/test.parquet" + "name": "prediction.test", + "path": "pipeline/predictions/prediction.test.parquet", + "column_profile": { + "0": { + "name": "0", + "role": "unused_float", + "statistics": { + "count": 223, + "approx_unique": 229, + "avg": 0.10654706618594081, + "min": 0.0016005451325327158, + "max": 0.9642216563224792, + "q25": 0.008410481399753027, + "q50": 0.016846238325039547, + "q75": 0.05923604799641504, + "std": 0.2183209639358569, + "null_percentage": 0.0, + "column_type": "DOUBLE", + "type": "unused_float" + } + } + } } }, "feature_sets": { @@ -68,14 +108,14 @@ "role": "numerical", "statistics": { "count": 459, - "approx_unique": 54, - "avg": 1963.1830065359477, + "approx_unique": 40, + "avg": 769.5424836601308, "min": 0.0, - "max": 53991.0, - "q25": 145.83333333333334, - "q50": 569.3877551020407, - "q75": 900.0, - "std": 6658.363509588833, + "max": 19621.0, + "q25": 0.0, + "q50": 0.0, + "q75": 202.7777777777778, + "std": 2515.4971605816318, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -86,14 +126,14 @@ "role": "numerical", "statistics": { "count": 459, - "approx_unique": 287, - "avg": 26940800.0, + "approx_unique": 17, + "avg": 882447.0588235294, "min": 0.0, - "max": 42076800.0, - "q25": 16213200.0, - "q50": 30620865.30612245, - "q75": 39661200.0, - "std": 13345080.744145872, + "max": 68601600.0, + "q25": 0.0, + "q50": 0.0, + "q75": 0.0, + "std": 6212366.872861004, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -140,14 +180,14 @@ "role": "numerical", "statistics": { "count": 459, - "approx_unique": 128, - "avg": 72.32483660130725, + "approx_unique": 34, + "avg": 7.1725490196078425, "min": 0.0, - "max": 200.0, - "q25": 50.90277777777777, - "q50": 63.077551020408166, - "q75": 100.0, - "std": 43.688883621604056, + "max": 100.0, + "q25": 0.0, + "q50": 0.0, + "q75": 0.0, + "std": 22.558438138781643, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -194,14 +234,14 @@ "role": "numerical", "statistics": { "count": 459, - "approx_unique": 19, - "avg": 2.6230936819172115, + "approx_unique": 20, + "avg": 2.7734204793028323, "min": 0.0, "max": 95.0, "q25": 0.0, "q50": 0.0, "q75": 0.0, - "std": 12.698880328094846, + "std": 13.070720949321736, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -230,14 +270,14 @@ "role": "numerical", "statistics": { "count": 459, - "approx_unique": 22, - "avg": 0.19324618736383445, + "approx_unique": 23, + "avg": 0.1993464052287582, "min": 0.0, "max": 6.6, "q25": 0.0, "q50": 0.0, "q75": 0.0, - "std": 0.7828247239515089, + "std": 0.7869983380593814, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -357,13 +397,13 @@ "statistics": { "count": 223, "approx_unique": 30, - "avg": 1794.914798206278, + "avg": 842.9372197309417, "min": 0.0, - "max": 67376.0, - "q25": 200.0, - "q50": 466.6666666666667, - "q75": 900.0, - "std": 6927.7664625313855, + "max": 18200.0, + "q25": 0.0, + "q50": 0.0, + "q75": 144.44444444444446, + "std": 2885.3221572288658, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -374,14 +414,14 @@ "role": "numerical", "statistics": { "count": 223, - "approx_unique": 147, - "avg": 27327971.30044843, + "approx_unique": 11, + "avg": 1550550.67264574, "min": 0.0, - "max": 43804800.0, - "q25": 17558400.0, - "q50": 29433600.0, - "q75": 39708000.0, - "std": 12861334.000076162, + "max": 128390400.0, + "q25": 0.0, + "q50": 0.0, + "q75": 0.0, + "std": 10521062.635618178, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -428,14 +468,14 @@ "role": "numerical", "statistics": { "count": 223, - "approx_unique": 92, - "avg": 77.92331838565016, + "approx_unique": 13, + "avg": 3.6977578475336323, "min": 0.0, - "max": 200.0, - "q25": 52.963888888888896, - "q50": 70.53333333333333, - "q75": 100.0, - "std": 40.10804320182245, + "max": 100.0, + "q25": 0.0, + "q50": 0.0, + "q75": 0.0, + "std": 15.026950095889092, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -518,14 +558,14 @@ "role": "numerical", "statistics": { "count": 223, - "approx_unique": 16, - "avg": 0.19192825112107625, + "approx_unique": 17, + "avg": 0.20986547085201795, "min": 0.0, "max": 6.6, "q25": 0.0, "q50": 0.0, "q75": 0.0, - "std": 0.8325188926028172, + "std": 0.8666449130086721, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -591,11 +631,11 @@ "feature_learners": [ { "aggregation": [ - "MIN", + "AVG", "SUM", - "COUNT", + "MIN", "MAX", - "AVG" + "COUNT" ], "allow_sets": true, "delta_t": 0.0, @@ -609,19 +649,19 @@ "num_threads": 0, "propositionalization": { "aggregation": [ - "MIN", - "AVG", - "MODE", + "COUNT DISTINCT", "COUNT MINUS COUNT DISTINCT", - "MEDIAN", + "AVG", "SUM", + "TREND", + "MIN", "STDDEV", "LAST", - "COUNT", "MAX", "FIRST", - "COUNT DISTINCT", - "TREND" + "MEDIAN", + "MODE", + "COUNT" ], "delta_t": 0.0, "loss_function": "CrossEntropyLoss", @@ -917,4 +957,4 @@ }, "peripheral": {} } -} +} \ No newline at end of file diff --git a/tests/integration/data/numerical/expected.pipeline.json b/tests/integration/data/numerical/expected.pipeline.json index 469e1a1..d714ce1 100644 --- a/tests/integration/data/numerical/expected.pipeline.json +++ b/tests/integration/data/numerical/expected.pipeline.json @@ -1,13 +1,53 @@ { - "id": "RbdvzM", + "id": "td6iXW", "predictions": { "train": { - "name": "train", - "path": "pipeline/predictions/train.parquet" + "name": "prediction.train", + "path": "pipeline/predictions/prediction.train.parquet", + "column_profile": { + "0": { + "name": "0", + "role": "unused_float", + "statistics": { + "count": 390, + "approx_unique": 345, + "avg": 96.17355588521713, + "min": 0.19379276037216187, + "max": 154.47401428222656, + "q25": 63.90802917480469, + "q50": 113.03656747606065, + "q75": 126.34682067871094, + "std": 40.818051834384214, + "null_percentage": 0.0, + "column_type": "DOUBLE", + "type": "unused_float" + } + } + } }, "test": { - "name": "test", - "path": "pipeline/predictions/test.parquet" + "name": "prediction.test", + "path": "pipeline/predictions/prediction.test.parquet", + "column_profile": { + "0": { + "name": "0", + "role": "unused_float", + "statistics": { + "count": 110, + "approx_unique": 97, + "avg": 93.47890945889733, + "min": 1.1542826890945435, + "max": 148.96876525878906, + "q25": 62.2480583190918, + "q50": 110.78643798828125, + "q75": 124.41654968261719, + "std": 39.87365738682376, + "null_percentage": 0.0, + "column_type": "DOUBLE", + "type": "unused_float" + } + } + } } }, "feature_sets": { @@ -68,14 +108,14 @@ "role": "numerical", "statistics": { "count": 390, - "approx_unique": 325, - "avg": 96.16907668123311, + "approx_unique": 365, + "avg": 96.16907668121483, "min": 0.0, - "max": 154.95853837280958, - "q25": 63.78581572178133, - "q50": 113.27765320315866, - "q75": 126.50463010412777, - "std": 40.83179514271343, + "max": 154.95853837278798, + "q25": 63.785815721758624, + "q50": 113.27765320314755, + "q75": 126.50463010411934, + "std": 40.83179514271078, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -86,14 +126,14 @@ "role": "numerical", "statistics": { "count": 390, - "approx_unique": 460, - "avg": 0.8376766309822631, - "min": -0.005741885778443031, - "max": 2.719408433645138, - "q25": 0.6520781997438738, - "q50": 0.9700795623721068, - "q75": 1.0064016977732944, - "std": 0.2557014318224649, + "approx_unique": 341, + "avg": 0.8376766309807187, + "min": -0.005741885826104295, + "max": 2.719408433664087, + "q25": 0.6520781997282549, + "q50": 0.9700795623830403, + "q75": 1.0064016977783699, + "std": 0.25570143183522287, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -104,14 +144,14 @@ "role": "numerical", "statistics": { "count": 390, - "approx_unique": 539, - "avg": 86.38617317829309, + "approx_unique": 463, + "avg": 86.38617317829464, "min": 0.0, - "max": 139.4831117157228, - "q25": 57.23467415456723, - "q50": 101.87912254707464, - "q75": 113.6064311988338, - "std": 36.712671335094896, + "max": 139.48311171571882, + "q25": 57.23467415457054, + "q50": 101.87912254707474, + "q75": 113.60643119883412, + "std": 36.71267133509418, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -122,14 +162,14 @@ "role": "numerical", "statistics": { "count": 390, - "approx_unique": 348, - "avg": 1.4866715394762007, + "approx_unique": 353, + "avg": 1.4866715395011707, "min": 0.0, - "max": 18.07352556921219, - "q25": 0.5113719486689973, - "q50": 0.6982874132797834, - "q75": 1.482302195832012, - "std": 2.49587156776478, + "max": 18.073525569347655, + "q25": 0.511371948685024, + "q50": 0.6982874132907706, + "q75": 1.4823021958627673, + "std": 2.4958715677815673, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -140,14 +180,14 @@ "role": "numerical", "statistics": { "count": 390, - "approx_unique": 302, - "avg": 78.0638784245176, + "approx_unique": 354, + "avg": 78.06387842451866, "min": 0.0, - "max": 125.9264443718258, - "q25": 51.62534776200198, - "q50": 92.20745748713675, - "q75": 102.6979676300216, - "std": 33.2115342480785, + "max": 125.92644437182243, + "q25": 51.62534776200333, + "q50": 92.20745748713837, + "q75": 102.69796763002303, + "std": 33.211534248078635, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -158,14 +198,14 @@ "role": "numerical", "statistics": { "count": 390, - "approx_unique": 326, - "avg": 0.5609713545814653, - "min": -1.884329594891604, - "max": 7.386337074028212, - "q25": 0.31063646371654113, - "q50": 0.4926810309517353, - "q75": 0.5897466914989792, - "std": 1.074825687942176, + "approx_unique": 461, + "avg": 0.5609713545719003, + "min": -1.8843295948944159, + "max": 7.386337073984908, + "q25": 0.31063646370630504, + "q50": 0.4926810309422083, + "q75": 0.5897466914903939, + "std": 1.0748256879378195, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -176,14 +216,14 @@ "role": "numerical", "statistics": { "count": 390, - "approx_unique": 468, - "avg": 70.10412027364201, + "approx_unique": 383, + "avg": 70.10412027365707, "min": 0.0, - "max": 113.10321385165373, - "q25": 46.531787803716234, - "q50": 82.55886648683865, - "q75": 92.25996193853011, - "std": 29.748700279905826, + "max": 113.10321385165179, + "q25": 46.531787803724455, + "q50": 82.5588664868703, + "q75": 92.25996193854685, + "std": 29.74870027991256, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -194,14 +234,14 @@ "role": "numerical", "statistics": { "count": 390, - "approx_unique": 375, - "avg": 0.5140333258914743, - "min": -0.44402437982294773, - "max": 0.9493693949050362, - "q25": 0.4048085322650249, - "q50": 0.6491476769270209, - "q75": 0.6766508980437053, - "std": 0.2595060636392321, + "approx_unique": 387, + "avg": 0.5140333259704128, + "min": -0.4440243790143118, + "max": 0.9493693945162422, + "q25": 0.40480853237326986, + "q50": 0.6491476769378332, + "q75": 0.6766508980594182, + "std": 0.25950606351142164, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -212,14 +252,14 @@ "role": "numerical", "statistics": { "count": 390, - "approx_unique": 378, - "avg": 62.90616871451977, + "approx_unique": 449, + "avg": 62.90321872320113, "min": 0.0, - "max": 101.78880366744104, - "q25": 41.77890864641669, - "q50": 74.08221392645343, - "q75": 82.5708916483322, - "std": 26.691806127961343, + "max": 101.78880366743772, + "q25": 41.77890864641813, + "q50": 74.08221392645562, + "q75": 82.57089164833283, + "std": 26.68981566900884, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -230,14 +270,14 @@ "role": "numerical", "statistics": { "count": 390, - "approx_unique": 403, - "avg": 0.49645826037201785, - "min": -0.45380937173777053, - "max": 8.224653499349321, - "q25": -0.013087906368716035, - "q50": 0.16252383064546452, - "q75": 0.3213966011515323, - "std": 1.3656064778531225, + "approx_unique": 383, + "avg": 0.49645826039061436, + "min": -0.4538093717154356, + "max": 8.224653499362855, + "q25": -0.013087906352437301, + "q50": 0.16252383066178144, + "q75": 0.3213966011730961, + "std": 1.3656064778513446, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -321,13 +361,13 @@ "statistics": { "count": 110, "approx_unique": 116, - "avg": 93.45267098580021, - "min": 1.0004581336996778, - "max": 150.31561443815576, - "q25": 61.996094719141645, - "q50": 110.66337045645069, - "q75": 124.18593897749646, - "std": 39.90174967625321, + "avg": 93.45267098578368, + "min": 1.000458133699219, + "max": 150.31561443814397, + "q25": 61.99609471912092, + "q50": 110.66337045642251, + "q75": 124.18593897756794, + "std": 39.90174967625208, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -338,14 +378,14 @@ "role": "numerical", "statistics": { "count": 110, - "approx_unique": 115, - "avg": 0.8371683464674687, - "min": 0.26161883596933283, - "max": 1.3008994594133347, - "q25": 0.6463480307577066, - "q50": 0.9769794994204424, - "q75": 0.9981102813307521, - "std": 0.2269445176703966, + "approx_unique": 99, + "avg": 0.8371683464659075, + "min": 0.2616188358978256, + "max": 1.3008994594821477, + "q25": 0.6463480307323424, + "q50": 0.9769794994299659, + "q75": 0.9981102813377745, + "std": 0.22694451768807689, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -356,14 +396,14 @@ "role": "numerical", "statistics": { "count": 110, - "approx_unique": 118, - "avg": 83.92568798146631, - "min": 0.8936319178549497, - "max": 135.34260543221754, - "q25": 55.61865811398502, - "q50": 99.41231630964933, - "q75": 112.10778547546019, - "std": 35.861589885099654, + "approx_unique": 128, + "avg": 83.92568798146797, + "min": 0.8936319178550347, + "max": 135.34260543221382, + "q25": 55.618658113988545, + "q50": 99.41231630965478, + "q75": 112.10778547545671, + "std": 35.86158988509899, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -374,14 +414,14 @@ "role": "numerical", "statistics": { "count": 110, - "approx_unique": 97, - "avg": 1.3729637342160355, - "min": 0.26150392324676175, - "max": 14.741123450975595, - "q25": 0.5079183381010651, - "q50": 0.733030259298452, - "q75": 1.5598762921754463, - "std": 2.1587383317031876, + "approx_unique": 83, + "avg": 1.372963734240981, + "min": 0.26150392327196315, + "max": 14.74112345108553, + "q25": 0.5079183381167619, + "q50": 0.733030259305973, + "q75": 1.559876292211866, + "std": 2.1587383317179856, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -392,14 +432,14 @@ "role": "numerical", "statistics": { "count": 110, - "approx_unique": 88, - "avg": 75.84121197115688, - "min": 0.8078674007376023, - "max": 122.03529180787784, - "q25": 50.16882583040196, - "q50": 90.30250196991506, - "q75": 101.0109783562762, - "std": 32.43788489337715, + "approx_unique": 107, + "avg": 75.84121197115796, + "min": 0.8078674007376269, + "max": 122.03529180788084, + "q25": 50.16882583040319, + "q50": 90.30250196991355, + "q75": 101.01097835627817, + "std": 32.43788489337733, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -410,14 +450,14 @@ "role": "numerical", "statistics": { "count": 110, - "approx_unique": 106, - "avg": 0.5049229013398149, - "min": -1.0234322123252753, - "max": 6.118363465755303, - "q25": 0.14283511563729268, - "q50": 0.4803652747350423, - "q75": 0.5681416543623228, - "std": 1.0283977404635392, + "approx_unique": 97, + "avg": 0.5049229013302758, + "min": -1.0234322123289814, + "max": 6.118363465727491, + "q25": 0.14283511563451265, + "q50": 0.48036527472462065, + "q75": 0.5681416543552915, + "std": 1.0283977404603937, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -428,14 +468,14 @@ "role": "numerical", "statistics": { "count": 110, - "approx_unique": 112, - "avg": 68.11771275542432, - "min": 0.7276733948500856, - "max": 109.66181307290546, - "q25": 45.197855854491515, - "q50": 80.4634801504667, - "q75": 90.96212075430984, - "std": 29.061616382304035, + "approx_unique": 116, + "avg": 68.11771275543998, + "min": 0.727673394850341, + "max": 109.66181307292277, + "q25": 45.197855854500325, + "q50": 80.46348015049212, + "q75": 90.9621207543436, + "std": 29.061616382311644, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -446,14 +486,14 @@ "role": "numerical", "statistics": { "count": 110, - "approx_unique": 109, - "avg": 0.4979052918970571, - "min": -0.7685655004747685, - "max": 0.8965485721217097, - "q25": 0.31406675065235196, - "q50": 0.6465958189771321, - "q75": 0.675510875478117, - "std": 0.27431090429004085, + "approx_unique": 86, + "avg": 0.49790529198137257, + "min": -0.7685654995270176, + "max": 0.8965485719768319, + "q25": 0.3140667507684413, + "q50": 0.6465958189822099, + "q75": 0.6755108754950195, + "std": 0.2743109041542205, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -464,14 +504,14 @@ "role": "numerical", "statistics": { "count": 110, - "approx_unique": 94, - "avg": 61.12278600295739, - "min": 0.6534488975859385, - "max": 98.15463652420176, - "q25": 40.59131151508852, - "q50": 71.44444119849118, - "q75": 81.54953124371184, - "std": 26.07679480798417, + "approx_unique": 110, + "avg": 61.12278600295826, + "min": 0.6534488975859953, + "max": 98.15463652420087, + "q25": 40.59131151509016, + "q50": 71.44444119849189, + "q75": 81.54953124371187, + "std": 26.076794807984033, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -482,14 +522,14 @@ "role": "numerical", "statistics": { "count": 110, - "approx_unique": 96, - "avg": 0.37822210364341335, - "min": -0.9960609367144815, - "max": 6.973058442481465, - "q25": -0.02509066174406189, - "q50": 0.16454383870471806, - "q75": 0.30357637468359877, - "std": 1.1509689359232387, + "approx_unique": 120, + "avg": 0.3782221036628038, + "min": -0.9960609366987908, + "max": 6.973058442492447, + "q25": -0.025090661724871316, + "q50": 0.1645438387261663, + "q75": 0.3035763747028691, + "std": 1.1509689359209772, "null_percentage": 0.0, "column_type": "DOUBLE", "type": "numerical" @@ -531,18 +571,18 @@ "propositionalization": { "aggregation": [ "COUNT DISTINCT", - "STDDEV", - "COUNT", - "FIRST", "COUNT MINUS COUNT DISTINCT", - "MEDIAN", "AVG", - "MAX", - "LAST", - "MIN", "SUM", + "TREND", + "MIN", + "STDDEV", + "LAST", + "MAX", + "FIRST", + "MEDIAN", "MODE", - "TREND" + "COUNT" ], "delta_t": 0.0, "loss_function": "SquareLoss", @@ -688,4 +728,4 @@ }, "peripheral": {} } -} +} \ No newline at end of file diff --git a/tests/integration/data/robot/expected.pipeline.json b/tests/integration/data/robot/expected.pipeline.json index df8eb6e..7388fa9 100644 --- a/tests/integration/data/robot/expected.pipeline.json +++ b/tests/integration/data/robot/expected.pipeline.json @@ -1,17 +1,185 @@ { - "id": "kSE7Uw", + "id": "XLdkKu", "predictions": { "train": { - "name": "train", - "path": "pipeline/predictions/train.parquet" + "name": "prediction.train", + "path": "pipeline/predictions/prediction.train.parquet", + "column_profile": { + "0": { + "name": "0", + "role": "unused_float", + "statistics": { + "count": 90, + "approx_unique": 92, + "avg": -10.733253468407526, + "min": -11.220943450927734, + "max": -10.405853271484375, + "q25": -10.853607177734375, + "q50": -10.71152925491333, + "q75": -10.611865997314453, + "std": 0.17923356795586112, + "null_percentage": 0.0, + "column_type": "DOUBLE", + "type": "unused_float" + } + }, + "1": { + "name": "1", + "role": "unused_float", + "statistics": { + "count": 90, + "approx_unique": 96, + "avg": 6.420799822277493, + "min": 5.9611358642578125, + "max": 6.9497480392456055, + "q25": 6.250061988830566, + "q50": 6.429170370101929, + "q75": 6.589086055755615, + "std": 0.23492959994799614, + "null_percentage": 0.0, + "column_type": "DOUBLE", + "type": "unused_float" + } + }, + "2": { + "name": "2", + "role": "unused_float", + "statistics": { + "count": 90, + "approx_unique": 89, + "avg": -7.648273118336996, + "min": -7.977388381958008, + "max": -7.296791076660156, + "q25": -7.702513217926025, + "q50": -7.645588159561157, + "q75": -7.6043314933776855, + "std": 0.11707979723485837, + "null_percentage": 0.0, + "column_type": "DOUBLE", + "type": "unused_float" + } + } + } }, "validation": { - "name": "validation", - "path": "pipeline/predictions/validation.parquet" + "name": "prediction.validation", + "path": "pipeline/predictions/prediction.validation.parquet", + "column_profile": { + "0": { + "name": "0", + "role": "unused_float", + "statistics": { + "count": 30, + "approx_unique": 35, + "avg": -10.93016627629598, + "min": -11.100025177001953, + "max": -10.782144546508789, + "q25": -11.029928207397461, + "q50": -10.914761066436768, + "q75": -10.841729164123535, + "std": 0.09749280830566094, + "null_percentage": 0.0, + "column_type": "DOUBLE", + "type": "unused_float" + } + }, + "1": { + "name": "1", + "role": "unused_float", + "statistics": { + "count": 30, + "approx_unique": 32, + "avg": 6.241030391057333, + "min": 5.999167442321777, + "max": 6.531711578369141, + "q25": 6.109324932098389, + "q50": 6.203312873840332, + "q75": 6.360936164855957, + "std": 0.14808194665012037, + "null_percentage": 0.0, + "column_type": "DOUBLE", + "type": "unused_float" + } + }, + "2": { + "name": "2", + "role": "unused_float", + "statistics": { + "count": 30, + "approx_unique": 30, + "avg": -7.675311549504598, + "min": -7.775376319885254, + "max": -7.590496063232422, + "q25": -7.704952239990234, + "q50": -7.686032295227051, + "q75": -7.637664794921875, + "std": 0.04378525558931166, + "null_percentage": 0.0, + "column_type": "DOUBLE", + "type": "unused_float" + } + } + } }, "test": { - "name": "test", - "path": "pipeline/predictions/test.parquet" + "name": "prediction.test", + "path": "pipeline/predictions/prediction.test.parquet", + "column_profile": { + "0": { + "name": "0", + "role": "unused_float", + "statistics": { + "count": 30, + "approx_unique": 30, + "avg": -10.998719056447348, + "min": -11.182074546813965, + "max": -10.69487476348877, + "q25": -11.061859130859375, + "q50": -11.018784046173096, + "q75": -10.933016777038574, + "std": 0.10115848786445528, + "null_percentage": 0.0, + "column_type": "DOUBLE", + "type": "unused_float" + } + }, + "1": { + "name": "1", + "role": "unused_float", + "statistics": { + "count": 30, + "approx_unique": 25, + "avg": 6.590824826558431, + "min": 6.305027008056641, + "max": 6.856055736541748, + "q25": 6.494635581970215, + "q50": 6.575701713562012, + "q75": 6.6769700050354, + "std": 0.13825323392743774, + "null_percentage": 0.0, + "column_type": "DOUBLE", + "type": "unused_float" + } + }, + "2": { + "name": "2", + "role": "unused_float", + "statistics": { + "count": 30, + "approx_unique": 27, + "avg": -7.63157286643982, + "min": -7.698906421661377, + "max": -7.564383506774902, + "q25": -7.653155326843262, + "q50": -7.634854078292847, + "q75": -7.6035566329956055, + "std": 0.034576567990887985, + "null_percentage": 0.0, + "column_type": "DOUBLE", + "type": "unused_float" + } + } + } } }, "feature_sets": { @@ -6852,19 +7020,19 @@ "num_threads": 0, "propositionalization": { "aggregation": [ - "MAX", - "SUM", + "COUNT DISTINCT", + "COUNT MINUS COUNT DISTINCT", "AVG", - "STDDEV", + "SUM", + "TREND", "MIN", - "COUNT MINUS COUNT DISTINCT", - "COUNT", + "STDDEV", "LAST", - "COUNT DISTINCT", - "MEDIAN", + "MAX", "FIRST", - "TREND", - "MODE" + "MEDIAN", + "MODE", + "COUNT" ], "delta_t": 0.0, "loss_function": "SquareLoss", @@ -6940,7 +7108,7 @@ "preprocessors": [], "share_selected_features": 0.5, "tags": [ - "container-RYJL5w" + "container-0ENVoV" ], "targets": [ "f_x", @@ -7198,4 +7366,4 @@ }, "peripheral": {} } -} +} \ No newline at end of file diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index 9aedfc4..b8208f0 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -16,7 +16,7 @@ def load_container_information( assert container_information_json_path.exists() assert container_information_json_path.is_file() - container_information_json = json.loads(container_information_json_path.read_text()) + container_information_json = json.loads(container_information_json_path.read_text()) # pyright: ignore [reportAny] container_information_json["path"] = Path("dummy.json") return ContainerInformationAdapter.validate_python(container_information_json) @@ -27,6 +27,6 @@ def load_pipeline_information( assert pipeline_information_json_path.exists() assert pipeline_information_json_path.is_file() - pipeline_information_json = json.loads(pipeline_information_json_path.read_text()) + pipeline_information_json = json.loads(pipeline_information_json_path.read_text()) # pyright: ignore [reportAny] pipeline_information_json["path"] = Path("dummy.json") return PipelineInformationAdapter.validate_python(pipeline_information_json) diff --git a/tests/integration/test_serialize_loans.py b/tests/integration/test_serialize_loans.py index c5a379d..85e1d74 100644 --- a/tests/integration/test_serialize_loans.py +++ b/tests/integration/test_serialize_loans.py @@ -93,7 +93,7 @@ def test_serialize_loans( "features.test.parquet", ], expected_predictions=[ - "train.parquet", - "test.parquet", + "prediction.train.parquet", + "prediction.test.parquet", ], ) diff --git a/tests/integration/test_serialize_numerical.py b/tests/integration/test_serialize_numerical.py index 10b82d0..15290b1 100644 --- a/tests/integration/test_serialize_numerical.py +++ b/tests/integration/test_serialize_numerical.py @@ -91,7 +91,7 @@ def test_serialize_numerical( "features.test.parquet", ], expected_predictions=[ - "train.parquet", - "test.parquet", + "prediction.train.parquet", + "prediction.test.parquet", ], ) diff --git a/tests/integration/test_serialize_robot.py b/tests/integration/test_serialize_robot.py index 39acd28..9cfb766 100644 --- a/tests/integration/test_serialize_robot.py +++ b/tests/integration/test_serialize_robot.py @@ -93,8 +93,8 @@ def test_serialize_robot( "features.validation.parquet", ], expected_predictions=[ - "train.parquet", - "test.parquet", - "validation.parquet", + "prediction.train.parquet", + "prediction.test.parquet", + "prediction.validation.parquet", ], ) diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index ae80cd3..6b95f92 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -3,11 +3,12 @@ import re from collections.abc import Mapping, Sequence from pathlib import Path -from typing import Protocol +from typing import Any, Protocol import getml.data.roles as getml_roles import numpy as np import pandas as pd +import pyarrow as pa import pytest import pytest_mock from duckdb import DuckDBPyConnection @@ -38,7 +39,6 @@ JoinInformation, PlaceholderInformation, ) -from getml_io.metadata.table_information import TableInformation from getml_io.utils.convert import assume_is_str @@ -109,10 +109,10 @@ def __getitem__(self, key: str) -> Subset: ... container.train = mock_subsets["train"] container.test = mock_subsets["test"] container.validation = mock_subsets["validation"] - container.__getitem__.side_effect = lambda key: Subset( - container_id=container.id, - population=container.subsets[key].with_name(key), - peripheral=container.peripheral, + container.__getitem__.side_effect = lambda key: Subset( # pyright: ignore [reportAny, reportUnknownLambdaType] + container_id=container.id, # pyright: ignore [reportAny] + population=container.subsets[key].with_name(key), # pyright: ignore [reportAny] + peripheral=container.peripheral, # pyright: ignore [reportAny] ) return container @@ -239,8 +239,8 @@ def mock_dataframe( ] dataframe.roles = mocker.MagicMock() dataframe.roles.column = ( - lambda name: getml_roles.categorical - if name.startswith("Categorical") + lambda name: getml_roles.categorical # pyright: ignore [reportUnknownLambdaType] + if name.startswith("Categorical") # pyright: ignore [reportUnknownMemberType] else getml_roles.numerical ) return dataframe @@ -391,7 +391,7 @@ def mock_pipeline( ) -> Pipeline: pipeline = mocker.Mock() pipeline.id = "mock_pipeline_id" - pipeline.predict = lambda _: ndarray + pipeline.predict = lambda _: ndarray # pyright: ignore [reportUnknownLambdaType] def pipeline_transform(_: DataFrame | View | Subset, *, df_name: str) -> DataFrame: dataframe = copy.deepcopy(mock_dataframe) @@ -425,14 +425,6 @@ def pipeline_transform(_: DataFrame | View | Subset, *, df_name: str) -> DataFra return pipeline -@pytest.fixture -def table_information(tmp_path: Path) -> TableInformation: - return TableInformation( - name="table_name", - path=tmp_path / "table_name.parquet", - ) - - @pytest.fixture def mock_project_empty( mocker: pytest_mock.MockerFixture, @@ -445,26 +437,6 @@ def mock_project_empty( ) -@pytest.fixture -def table_information_test( - tmp_path: Path, -) -> TableInformation: - return TableInformation( - name="test", - path=tmp_path / "test.parquet", - ) - - -@pytest.fixture -def table_information_validation( - tmp_path: Path, -) -> TableInformation: - return TableInformation( - name="validation", - path=tmp_path / "validation.parquet", - ) - - @pytest.fixture def dataframe_information_features_test( tmp_path: Path, @@ -567,8 +539,8 @@ def category_trimmer() -> CategoryTrimmer: @pytest.fixture def pipeline_information( # noqa: PLR0913 tmp_path: Path, - table_information_test: TableInformation, - table_information_validation: TableInformation, + dataframe_information_test: DataFrameInformation, + dataframe_information_validation: DataFrameInformation, dataframe_information_features_test: DataFrameInformation, dataframe_information_features_validation: DataFrameInformation, data_model_information: DataModelInformation, @@ -580,18 +552,18 @@ def pipeline_information( # noqa: PLR0913 id="pipeline_id", predictions={ "test": dataclasses.replace( - table_information_test, + dataframe_information_test, path=tmp_path / "pipeline" / "predictions" - / table_information_test.path.name, + / dataframe_information_test.path.name, ), "validation": dataclasses.replace( - table_information_validation, + dataframe_information_validation, path=tmp_path / "pipeline" / "predictions" - / table_information_validation.path.name, + / dataframe_information_validation.path.name, ), }, feature_sets={ @@ -683,7 +655,7 @@ def mock_duckdb_execute( ) -> None: connection_context_manager = mocker.MagicMock(DuckDBPyConnection) connection = mocker.MagicMock(DuckDBPyConnection) - connection_context_manager.__enter__.return_value = connection + connection_context_manager.__enter__.return_value = connection # pyright: ignore [reportAny] _ = mocker.patch( "getml_io.serialize.dataframe_or_view.duckdb.connect", return_value=connection_context_manager, @@ -703,9 +675,31 @@ def mocked_df() -> pd.DataFrame: ] return generate_raw_summary_statistics_pd(current_dataframe) - mock_execution.df.side_effect = mocked_df + mock_execution.df.side_effect = mocked_df # pyright: ignore [reportAny] return mock_execution - connection.execute.side_effect = mocked_execute + connection.execute.side_effect = mocked_execute # pyright: ignore [reportAny] return mock_duckdb_execute + + +@pytest.fixture +def mock_getml_dataframe_from_array( + mocker: pytest_mock.MockerFixture, + mock_dataframe: DataFrame, +) -> None: + def mock_from_array( + table: pa.Table, # pyright: ignore [reportUnknownMemberType, reportUnknownParameterType] + name: str, + *_args: tuple[Any, ...], # pyright: ignore [reportExplicitAny] + **_kwargs: dict[str, Any], # pyright: ignore [reportExplicitAny] + ) -> DataFrame: + _ = table # pyright: ignore [reportUnknownVariableType] + dataframe = copy.deepcopy(mock_dataframe) + dataframe.name = name + return dataframe + + _ = mocker.patch( + "getml_io.serialize.pipeline.DataFrame.from_arrow", + side_effect=mock_from_array, + ) diff --git a/tests/unit/getml/test_project.py b/tests/unit/getml/test_project.py index 7a6dc88..a2fb111 100644 --- a/tests/unit/getml/test_project.py +++ b/tests/unit/getml/test_project.py @@ -59,9 +59,9 @@ def test_load_project( # Then mock_alive_getml.assert_called() - mock_alive_getml_contextmanager = as_magicmock(mock_alive_getml.return_value) - as_magicmock(mock_alive_getml_contextmanager.__enter__).assert_called() - as_magicmock(mock_alive_getml_contextmanager.__exit__).assert_called() + mock_alive_getml_contextmanager = as_magicmock(mock_alive_getml.return_value) # pyright: ignore [reportAny] + as_magicmock(mock_alive_getml_contextmanager.__enter__).assert_called() # pyright: ignore [reportAny] + as_magicmock(mock_alive_getml_contextmanager.__exit__).assert_called() # pyright: ignore [reportAny] mock_list_projects.assert_called() mock_list_running_projects.assert_called() mock_set_project.assert_called_with("project_name") @@ -121,9 +121,9 @@ def test_load_project_raises_project_not_found( assert "project_name" in str(excinfo.value) mock_alive_getml.assert_called() - mock_alive_getml_contextmanager = as_magicmock(mock_alive_getml.return_value) - as_magicmock(mock_alive_getml_contextmanager.__enter__).assert_called() - as_magicmock(mock_alive_getml_contextmanager.__exit__).assert_called() + mock_alive_getml_contextmanager = as_magicmock(mock_alive_getml.return_value) # pyright: ignore [reportAny] + as_magicmock(mock_alive_getml_contextmanager.__enter__).assert_called() # pyright: ignore [reportAny] + as_magicmock(mock_alive_getml_contextmanager.__exit__).assert_called() # pyright: ignore [reportAny] mock_list_projects.assert_called() mock_list_running_projects.assert_not_called() mock_set_project.assert_not_called() @@ -183,9 +183,9 @@ def test_load_project_raises_pipeline_not_found( assert "pipeline_id" in str(excinfo.value) mock_alive_getml.assert_called() - mock_alive_getml_contextmanager = as_magicmock(mock_alive_getml.return_value) - as_magicmock(mock_alive_getml_contextmanager.__enter__).assert_called() - as_magicmock(mock_alive_getml_contextmanager.__exit__).assert_called() + mock_alive_getml_contextmanager = as_magicmock(mock_alive_getml.return_value) # pyright: ignore [reportAny] + as_magicmock(mock_alive_getml_contextmanager.__enter__).assert_called() # pyright: ignore [reportAny] + as_magicmock(mock_alive_getml_contextmanager.__exit__).assert_called() # pyright: ignore [reportAny] mock_list_projects.assert_called() mock_list_running_projects.assert_called() mock_set_project.assert_called_with("project_name") diff --git a/tests/unit/metadata/test_container_information.py b/tests/unit/metadata/test_container_information.py index 8a601b4..0bbd005 100644 --- a/tests/unit/metadata/test_container_information.py +++ b/tests/unit/metadata/test_container_information.py @@ -13,7 +13,7 @@ def test_serialize_model_without_dataframe_information( container_information_empty: ContainerInformation, ) -> None: # When - serialized_model = TypeAdapter(ContainerInformation).dump_python( + serialized_model = TypeAdapter(ContainerInformation).dump_python( # pyright: ignore [reportAny] container_information_empty, ) # Then @@ -38,7 +38,7 @@ def test_serialize_model( container_information: ContainerInformation, ) -> None: # When - serialized_model = TypeAdapter(ContainerInformation).dump_python( + serialized_model = TypeAdapter(ContainerInformation).dump_python( # pyright: ignore [reportAny] container_information, ) diff --git a/tests/unit/metadata/test_pipeline_information.py b/tests/unit/metadata/test_pipeline_information.py index fd69582..7ec8e15 100644 --- a/tests/unit/metadata/test_pipeline_information.py +++ b/tests/unit/metadata/test_pipeline_information.py @@ -14,7 +14,7 @@ def test_serialize_model_without_transforms( pipeline_information_empty: PipelineInformation, ) -> None: # When - serialized_model = TypeAdapter(PipelineInformation).dump_python( + serialized_model = TypeAdapter(PipelineInformation).dump_python( # pyright: ignore [reportAny] pipeline_information_empty, ) @@ -68,13 +68,13 @@ def test_serialize_model( pipeline_information: PipelineInformation, ) -> None: # When - serialized_model = TypeAdapter(PipelineInformation).dump_python( + serialized_model = TypeAdapter(PipelineInformation).dump_python( # pyright: ignore [reportAny] pipeline_information, ) # Then serialized_model["feature_learners"][0]["aggregation"] = list( - serialized_model["feature_learners"][0]["aggregation"], + serialized_model["feature_learners"][0]["aggregation"], # pyright: ignore [reportAny] ) expected_serialized_pipeline_information = ( _get_expected_serialized_pipeline_information() @@ -107,12 +107,14 @@ def _get_expected_serialized_pipeline_information() -> PipelineInformationType: "id": "pipeline_id", "predictions": { "test": { - "name": "test", - "path": Path("pipeline/predictions/test.parquet"), + "name": "dataframe_test", + "path": Path("pipeline/predictions/dataframe_test.parquet"), + "column_profile": expected_column_profile, }, "validation": { - "name": "validation", - "path": Path("pipeline/predictions/validation.parquet"), + "name": "dataframe_validation", + "path": Path("pipeline/predictions/dataframe_validation.parquet"), + "column_profile": expected_column_profile, }, }, "feature_sets": { diff --git a/tests/unit/metadata/test_utils.py b/tests/unit/metadata/test_utils.py index 90dcbb5..224070c 100644 --- a/tests/unit/metadata/test_utils.py +++ b/tests/unit/metadata/test_utils.py @@ -5,98 +5,45 @@ from getml_io.metadata.dataframe_information import DataFrameInformation from getml_io.metadata.exception import ( DataFrameInformationPathNotRelativeError, - TableInformationPathNotRelativeError, ) -from getml_io.metadata.table_information import TableInformation from getml_io.metadata.utils import derive_instance_with_relative_path @pytest.mark.unit -@pytest.mark.parametrize( - ("instance_fixture", "error_factory"), - [ - ("dataframe_information", DataFrameInformationPathNotRelativeError), - ("table_information", TableInformationPathNotRelativeError), - ], -) def test_derive_instance_with_relative_path( - request: pytest.FixtureRequest, tmp_path: Path, - instance_fixture: str, - error_factory: type[ - DataFrameInformationPathNotRelativeError | TableInformationPathNotRelativeError - ], + dataframe_information: DataFrameInformation, ) -> None: # Given - instance: DataFrameInformation | TableInformation = request.getfixturevalue( - instance_fixture, - ) # When derived_instance = derive_instance_with_relative_path( - instance, + dataframe_information, tmp_path, - error_factory, ) # Then - assert isinstance(derived_instance, type(instance)) - assert derived_instance.path == Path(instance.path.name) + assert isinstance(derived_instance, DataFrameInformation) + assert derived_instance.path == Path(dataframe_information.path.name) @pytest.mark.unit -@pytest.mark.parametrize( - ("instance_fixture", "error_factory"), - [ - ("dataframe_information", DataFrameInformationPathNotRelativeError), - ("table_information", TableInformationPathNotRelativeError), - ], -) def test_derive_instance_with_relative_path_not_relative( - request: pytest.FixtureRequest, - instance_fixture: str, - error_factory: type[ - DataFrameInformationPathNotRelativeError | TableInformationPathNotRelativeError - ], + dataframe_information: DataFrameInformation, ) -> None: # Given - instance: DataFrameInformation | TableInformation = request.getfixturevalue( - instance_fixture, - ) non_relative_path = Path("/non/relative/path") # When / Then with pytest.raises( - error_factory, + DataFrameInformationPathNotRelativeError, match=( r"'.*Information' with name '.*_name' " - f"and path '{instance.path}' " + f"and path '{dataframe_information.path}' " r"is not relative to base path '/non/relative/path'." ), ): _ = derive_instance_with_relative_path( - instance, + dataframe_information, non_relative_path, - error_factory, - ) - - -@pytest.mark.unit -def test_derive_instance_with_relative_path_not_dataclass( - tmp_path: Path, -) -> None: - # Given - class NotADataclass: - name: str = "not_a_dataclass_instance" - path: Path = Path("not_a_dataclass_instance.parquet") - - # When / Then - with pytest.raises( - TypeError, - match=r"Instance must be a dataclass:", - ): - _ = derive_instance_with_relative_path( - NotADataclass(), - tmp_path, - DataFrameInformationPathNotRelativeError, ) diff --git a/tests/unit/serialize/test_ndarray.py b/tests/unit/serialize/test_ndarray.py deleted file mode 100644 index 1660f0f..0000000 --- a/tests/unit/serialize/test_ndarray.py +++ /dev/null @@ -1,66 +0,0 @@ -from pathlib import Path - -import numpy as np -import pytest -import pytest_mock -from numpy.typing import NDArray -from pyarrow import parquet - -from getml_io.serialize.exception import TableParquetStorageError -from getml_io.serialize.ndarray import serialize_ndarray -from getml_io.utils.exception import StorageDirectoryCreationError - - -@pytest.mark.unit -def test_serialize_ndarray(tmp_path: Path, ndarray: NDArray[np.float64]) -> None: - # Given - name = "test_array" - - # When - path = serialize_ndarray(ndarray, tmp_path, name) - - # Then - assert path.exists() - assert path.is_file() - assert path.name == f"{name}.parquet" - - -@pytest.mark.unit -def test_serialize_ndarray_directory_creation_error( - ndarray: NDArray[np.float64], -) -> None: - # Given - invalid_target_storage_directory = Path("/invalid/ndarrays") - name = "test_array" - - # When / Then - with pytest.raises( - StorageDirectoryCreationError, - match=r"Failed to create storage directory '/invalid/ndarrays'.", - ): - _ = serialize_ndarray(ndarray, invalid_target_storage_directory, name) - - -@pytest.mark.unit -def test_serialize_ndarray_storage_error( - tmp_path: Path, - mocker: pytest_mock.MockerFixture, - ndarray: NDArray[np.float64], -) -> None: - # Given - name = "test_array" - - mock_write_table = mocker.patch.object( - parquet, - "write_table", - side_effect=Exception("Storage error"), - ) - - # When / Then - with pytest.raises( - TableParquetStorageError, - match=(r"Failed to store Table as parquet 'test_array' at path"), - ): - _ = serialize_ndarray(ndarray, tmp_path, name) - - mock_write_table.assert_called() diff --git a/tests/unit/serialize/test_pipeline.py b/tests/unit/serialize/test_pipeline.py index 5639d6e..ad523c7 100644 --- a/tests/unit/serialize/test_pipeline.py +++ b/tests/unit/serialize/test_pipeline.py @@ -48,19 +48,24 @@ @pytest.mark.unit -def test_serialize_pipeline( +def test_serialize_pipeline( # noqa: PLR0913 tmp_path: Path, mock_pipeline: Pipeline, mock_container: Container, mock_duckdb_execute_factory: MockDuckDBExecuteFactory, mock_dataframe: DataFrame, + mock_getml_dataframe_from_array: None, ) -> None: # Given + _ = mock_getml_dataframe_from_array mock_duckdb_execute_factory( { Path("pipeline/feature_sets/features.train.parquet"): mock_dataframe, Path("pipeline/feature_sets/features.test.parquet"): mock_dataframe, Path("pipeline/feature_sets/features.validation.parquet"): mock_dataframe, + Path("pipeline/predictions/prediction.train.parquet"): mock_dataframe, + Path("pipeline/predictions/prediction.test.parquet"): mock_dataframe, + Path("pipeline/predictions/prediction.validation.parquet"): mock_dataframe, }, ) @@ -161,11 +166,24 @@ def test_serialize_feature_sets( @pytest.mark.unit -def test_serialize_predictions( +def test_serialize_predictions( # noqa: PLR0913 tmp_path: Path, mock_pipeline: Pipeline, mock_container: Container, + mock_duckdb_execute_factory: MockDuckDBExecuteFactory, + mock_dataframe: DataFrame, + mock_getml_dataframe_from_array: None, ) -> None: + # Given + _ = mock_getml_dataframe_from_array + mock_duckdb_execute_factory( + { + Path("predictions/prediction.train.parquet"): mock_dataframe, + Path("predictions/prediction.test.parquet"): mock_dataframe, + Path("predictions/prediction.validation.parquet"): mock_dataframe, + }, + ) + # When prediction_results = serialize_predictions( mock_pipeline, @@ -191,12 +209,10 @@ def _assert_predictions_valid( path: Path, ) -> None: assert predictions[subset_name] - assert predictions[subset_name].name == subset_name + assert predictions[subset_name].name == f"prediction.{subset_name}" - expected_path = path / f"{subset_name}.parquet" + expected_path = path / f"prediction.{subset_name}.parquet" assert predictions[subset_name].path == expected_path - assert expected_path.exists() - assert expected_path.is_file() def _assert_features_valid( diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py index 144cef8..3094a32 100644 --- a/tests/unit/test_cli.py +++ b/tests/unit/test_cli.py @@ -53,7 +53,7 @@ def test_serialize( DEFAULT_ROOT_STORAGE_DIRECTORY, clear_storage_directory=False, ) - assert mock_logging_basic_config.call_args.kwargs["level"] == logging.WARNING + assert mock_logging_basic_config.call_args.kwargs["level"] == logging.WARNING # pyright: ignore [reportAny] @pytest.mark.unit @@ -99,7 +99,7 @@ def test_serialize_with_clear_storage_directory( root_storage_directory, clear_storage_directory=True, ) - assert mock_logging_basic_config.call_args.kwargs["level"] == logging.INFO + assert mock_logging_basic_config.call_args.kwargs["level"] == logging.INFO # pyright: ignore [reportAny] @pytest.mark.unit diff --git a/tests/unit/utils/test_storage.py b/tests/unit/utils/test_storage.py index e7bd7ee..190afbd 100644 --- a/tests/unit/utils/test_storage.py +++ b/tests/unit/utils/test_storage.py @@ -250,7 +250,7 @@ def test_get_default_root_storage_directory( if platform_system == Windows: monkeypatch.setattr( "platformdirs.windows.get_win_folder", - lambda _: Path("~/AppData/Local").expanduser(), + lambda _: Path("~/AppData/Local").expanduser(), # pyright: ignore [reportUnknownArgumentType, reportUnknownLambdaType] ) # When