From c3ce193892e9f1219a2df42ffb23d582742849cf Mon Sep 17 00:00:00 2001 From: Manuel Bellersen Date: Tue, 2 Sep 2025 19:28:45 +0200 Subject: [PATCH] GH-52: feat(pipeline): Add table information to pipeline serialization Adds a new `Table` data model and integrates it into the `PipelineInformation` model. This includes: - A new serialization function `serialize_tables` to convert getML `Tables` objects. - Updates to the main `serialize_pipeline` function to include table data. - Corresponding additions to unit and integration tests to cover the new functionality. --- src/getml_io/getml/tables.py | 8 +++ src/getml_io/metadata/pipeline_information.py | 3 +- src/getml_io/serialize/pipeline.py | 26 ++++++++- .../data/loans/expected.pipeline.json | 40 +++++++++---- .../data/numerical/expected.pipeline.json | 32 ++++++++--- .../data/robot/expected.pipeline.json | 56 ++++++++++++++++--- tests/unit/conftest.py | 33 +++++++++++ .../metadata/test_pipeline_information.py | 9 +++ tests/unit/serialize/test_pipeline.py | 20 +++++++ .../serialize/test_pipeline_information.py | 1 + tests/unit/types.py | 5 +- 11 files changed, 202 insertions(+), 31 deletions(-) create mode 100644 src/getml_io/getml/tables.py diff --git a/src/getml_io/getml/tables.py b/src/getml_io/getml/tables.py new file mode 100644 index 0000000..65c68b7 --- /dev/null +++ b/src/getml_io/getml/tables.py @@ -0,0 +1,8 @@ +from pydantic import BaseModel + + +class Table(BaseModel, frozen=True): + name: str + marker: str + target: str + importance: float diff --git a/src/getml_io/metadata/pipeline_information.py b/src/getml_io/metadata/pipeline_information.py index 4bfeb5b..88f9ab9 100644 --- a/src/getml_io/metadata/pipeline_information.py +++ b/src/getml_io/metadata/pipeline_information.py @@ -16,6 +16,7 @@ from getml_io.getml.predictors import FeatureSelector, Predictor from getml_io.getml.preprocessors import Preprocessor from getml_io.getml.scores import Scores +from getml_io.getml.tables import Table from getml_io.metadata.data_model_information import DataModelInformation from getml_io.metadata.dataframe_information import DataFrameInformationByName from getml_io.metadata.placeholder_information import PlaceholderInformation @@ -47,4 +48,4 @@ class PipelineInformation(BaseModel, frozen=True): scores: Scores columns: Sequence[Column] metadata: PipelineMetaData - # tables # TODO @urfoex: #52 + tables: Sequence[Table] diff --git a/src/getml_io/serialize/pipeline.py b/src/getml_io/serialize/pipeline.py index c8b3612..5d5dec9 100644 --- a/src/getml_io/serialize/pipeline.py +++ b/src/getml_io/serialize/pipeline.py @@ -18,12 +18,14 @@ from getml.pipeline import Features as GetMLFeatures from getml.pipeline import Pipeline from getml.pipeline import Scores as GetMLScores +from getml.pipeline import Tables as GetMLTables from getml.pipeline.column import Column as GetMLColumn from getml.pipeline.metadata import AllMetadata from getml.pipeline.metadata import Metadata as GetMLMetadata from getml.pipeline.score import ClassificationScore as GetMLClassificationScore from getml.pipeline.score import RegressionScore as GetMLRegressionScore from getml.pipeline.score import Score as GetMLScore +from getml.pipeline.table import Table as GetMLTable from numpy.typing import NDArray from getml_io.getml.columns import Column @@ -60,6 +62,7 @@ TextFieldSplitter, ) from getml_io.getml.scores import ClassificationScore, RegressionScore, Scores +from getml_io.getml.tables import Table from getml_io.metadata.dataframe_information import DataFrameInformationByName from getml_io.metadata.pipeline_information import ( LossFunction, @@ -149,7 +152,7 @@ def serialize_pipeline( scores=serialize_scores(pipeline.scores), columns=serialize_columns(pipeline.columns), metadata=serialize_all_metadata(pipeline.metadata), - # tables # TODO @urfoex: #52 + tables=serialize_tables(pipeline.tables), ) pipeline_information_json_path = serialize_pipeline_information( pipeline_information=pipeline_information, @@ -472,3 +475,24 @@ def _serialize_metadata(metadata: GetMLMetadata) -> DataFrameMetaData: name=metadata.name, roles=serialize_roles(metadata.roles), ) + + +def serialize_tables(tables: GetMLTables) -> list[Table]: + """Serialize getML Tables into a list of Table objects. + + Args: + tables: The getML Tables to serialize. + + Returns: + list[Table]: The serialized Tables information. + + """ + return [ + Table( + name=table.name, + marker=table.marker, + target=table.target, + importance=table.importance, + ) + for table in cast("Sequence[GetMLTable]", tables.data) + ] diff --git a/tests/integration/data/loans/expected.pipeline.json b/tests/integration/data/loans/expected.pipeline.json index 84c5fde..ab374d1 100644 --- a/tests/integration/data/loans/expected.pipeline.json +++ b/tests/integration/data/loans/expected.pipeline.json @@ -631,9 +631,9 @@ "feature_learners": [ { "aggregation": [ + "SUM", "MIN", "COUNT", - "SUM", "MAX", "AVG" ], @@ -649,19 +649,19 @@ "num_threads": 0, "propositionalization": { "aggregation": [ - "STDDEV", - "MIN", - "COUNT DISTINCT", - "COUNT", - "MEDIAN", "MAX", "SUM", - "FIRST", + "MODE", "LAST", "TREND", - "AVG", + "MIN", "COUNT MINUS COUNT DISTINCT", - "MODE" + "COUNT", + "FIRST", + "MEDIAN", + "AVG", + "COUNT DISTINCT", + "STDDEV" ], "delta_t": 0.0, "loss_function": "CrossEntropyLoss", @@ -1365,5 +1365,25 @@ } } ] - } + }, + "tables": [ + { + "name": "meta", + "marker": "[PERIPHERAL]", + "target": "default", + "importance": 0.2731073049046048 + }, + { + "name": "trans", + "marker": "[PERIPHERAL]", + "target": "default", + "importance": 0.39642128482217825 + }, + { + "name": "population", + "marker": "[POPULATION]", + "target": "default", + "importance": 0.330471410273217 + } + ] } \ No newline at end of file diff --git a/tests/integration/data/numerical/expected.pipeline.json b/tests/integration/data/numerical/expected.pipeline.json index 2821b96..9b148de 100644 --- a/tests/integration/data/numerical/expected.pipeline.json +++ b/tests/integration/data/numerical/expected.pipeline.json @@ -570,19 +570,19 @@ "num_threads": 0, "propositionalization": { "aggregation": [ - "STDDEV", - "MIN", - "COUNT DISTINCT", - "COUNT", - "MEDIAN", "MAX", "SUM", - "FIRST", + "MODE", "LAST", "TREND", - "AVG", + "MIN", "COUNT MINUS COUNT DISTINCT", - "MODE" + "COUNT", + "FIRST", + "MEDIAN", + "AVG", + "COUNT DISTINCT", + "STDDEV" ], "delta_t": 0.0, "loss_function": "SquareLoss", @@ -915,5 +915,19 @@ } } ] - } + }, + "tables": [ + { + "name": "perph", + "marker": "[PERIPHERAL]", + "target": "targets", + "importance": 0.2364818293321066 + }, + { + "name": "population", + "marker": "[POPULATION]", + "target": "targets", + "importance": 0.7635181706678932 + } + ] } \ No newline at end of file diff --git a/tests/integration/data/robot/expected.pipeline.json b/tests/integration/data/robot/expected.pipeline.json index 4efd474..a1cb26e 100644 --- a/tests/integration/data/robot/expected.pipeline.json +++ b/tests/integration/data/robot/expected.pipeline.json @@ -7020,19 +7020,19 @@ "num_threads": 0, "propositionalization": { "aggregation": [ - "STDDEV", - "MIN", - "COUNT DISTINCT", - "COUNT", - "MEDIAN", "MAX", "SUM", - "FIRST", + "MODE", "LAST", "TREND", - "AVG", + "MIN", "COUNT MINUS COUNT DISTINCT", - "MODE" + "COUNT", + "FIRST", + "MEDIAN", + "AVG", + "COUNT DISTINCT", + "STDDEV" ], "delta_t": 0.0, "loss_function": "SquareLoss", @@ -11546,5 +11546,43 @@ } } ] - } + }, + "tables": [ + { + "name": "full", + "marker": "[PERIPHERAL]", + "target": "f_x", + "importance": 0.6157352560942454 + }, + { + "name": "population", + "marker": "[POPULATION]", + "target": "f_x", + "importance": 0.3842647439057541 + }, + { + "name": "full", + "marker": "[PERIPHERAL]", + "target": "f_y", + "importance": 0.6632312212854787 + }, + { + "name": "population", + "marker": "[POPULATION]", + "target": "f_y", + "importance": 0.33676877871452054 + }, + { + "name": "full", + "marker": "[PERIPHERAL]", + "target": "f_z", + "importance": 0.5290353310682591 + }, + { + "name": "population", + "marker": "[POPULATION]", + "target": "f_z", + "importance": 0.470964668931741 + } + ] } \ No newline at end of file diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 2a198cc..d60e3cd 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -26,12 +26,14 @@ from getml.pipeline import Features as GetMLFeatures from getml.pipeline import Pipeline from getml.pipeline import Scores as GetMLScores +from getml.pipeline import Tables as GetMLTables from getml.pipeline.columns import Column as GetMLColumn from getml.pipeline.feature import Feature as GetMLFeature from getml.pipeline.metadata import AllMetadata as GetMLAllMetadata from getml.pipeline.metadata import Metadata as GetMLMetadata from getml.pipeline.score import ClassificationScore as GetMLClassificationScore from getml.pipeline.score import RegressionScore as GetMLRegressionScore +from getml.pipeline.table import Table as GetMLTable from numpy.typing import NDArray from getml_io.getml.columns import Column @@ -48,6 +50,7 @@ from getml_io.getml.relationships import Relationship from getml_io.getml.roles import Role, Roles from getml_io.getml.scores import ClassificationScore, Scores +from getml_io.getml.tables import Table from getml_io.metadata.container_information import ContainerInformation from getml_io.metadata.data_model_information import DataModelInformation from getml_io.metadata.dataframe_information import ( @@ -445,6 +448,7 @@ def pipeline_information_empty( scores=[], columns=[], metadata=PipelineMetaData(population=None, peripheral=[]), + tables=[], ) @@ -541,6 +545,19 @@ def getml_all_metadata() -> GetMLAllMetadata: ) +@pytest.fixture +def mock_tables(mocker: pytest_mock.MockerFixture) -> GetMLTables: + table = GetMLTable( + name="test_table", + marker="test_marker", + target="target0", + importance=0.0, + ) + tables = mocker.MagicMock(spec=GetMLTables) + tables.data = [table] + return tables + + @pytest.fixture def mock_pipeline( # noqa: PLR0913 mocker: pytest_mock.MockerFixture, @@ -550,6 +567,7 @@ def mock_pipeline( # noqa: PLR0913 mock_scores_regression: GetMLScores, mock_columns: GetMLColumns, getml_all_metadata: GetMLAllMetadata, + mock_tables: GetMLTables, ) -> Pipeline: pipeline = mocker.Mock() pipeline.id = "mock_pipeline_id" @@ -588,6 +606,7 @@ def pipeline_transform(_: DataFrame | View | Subset, *, df_name: str) -> DataFra pipeline.scores = mock_scores_regression pipeline.columns = mock_columns pipeline.metadata = getml_all_metadata + pipeline.tables = mock_tables return pipeline @@ -830,6 +849,18 @@ def meta_data( ) +@pytest.fixture +def tables() -> list[Table]: + return [ + Table( + name="test_table", + marker="test_marker", + target="target0", + importance=0.0, + ), + ] + + @pytest.fixture def pipeline_information( # noqa: PLR0913 data_model_information: DataModelInformation, @@ -842,6 +873,7 @@ def pipeline_information( # noqa: PLR0913 scores: Scores, columns: Sequence[Column], meta_data: PipelineMetaData, + tables: Sequence[Table], ) -> PipelineInformation: return PipelineInformation( id="pipeline_id", @@ -864,6 +896,7 @@ def pipeline_information( # noqa: PLR0913 scores=scores, columns=columns, metadata=meta_data, + tables=tables, ) diff --git a/tests/unit/metadata/test_pipeline_information.py b/tests/unit/metadata/test_pipeline_information.py index 675ed20..0fbb0ce 100644 --- a/tests/unit/metadata/test_pipeline_information.py +++ b/tests/unit/metadata/test_pipeline_information.py @@ -65,6 +65,7 @@ def _get_expected_serialized_empty_pipeline_information() -> PipelineInformation "population": None, "peripheral": [], }, + "tables": [], } @@ -402,4 +403,12 @@ def _get_expected_serialized_pipeline_information() -> PipelineInformationType: }, ], }, + "tables": [ + { + "importance": 0.0, + "marker": "test_marker", + "name": "test_table", + "target": "target0", + }, + ], } diff --git a/tests/unit/serialize/test_pipeline.py b/tests/unit/serialize/test_pipeline.py index 6c29c21..f4c0750 100644 --- a/tests/unit/serialize/test_pipeline.py +++ b/tests/unit/serialize/test_pipeline.py @@ -10,6 +10,7 @@ from getml.pipeline import Features as GetMLFeatures from getml.pipeline import Pipeline from getml.pipeline import Scores as GetMLScores +from getml.pipeline import Tables as GetMLTables from getml.pipeline.metadata import AllMetadata as GetMLAllMetadata from getml_io.getml.columns import Column @@ -43,6 +44,7 @@ ) from getml_io.getml.roles import Roles from getml_io.getml.scores import ClassificationScore, RegressionScore +from getml_io.getml.tables import Table from getml_io.metadata.dataframe_information import DataFrameInformationByName from getml_io.metadata.pipeline_information import LossFunction from getml_io.serialize.exception import WrongPipelineScoreTypeError @@ -57,6 +59,7 @@ serialize_predictor, serialize_preprocessor, serialize_scores, + serialize_tables, ) from tests.unit.conftest import MockDuckDBExecuteFactory @@ -506,3 +509,20 @@ def test_serialize_metadata_empty() -> None: # Then assert pipeline_metadata.population is None assert pipeline_metadata.peripheral == [] + + +@pytest.mark.unit +def test_serialize_tables(mock_tables: GetMLTables) -> None: + # Given + + # When + tables = serialize_tables(mock_tables) + + # Then + assert len(tables) == 1 + assert tables[0] == Table( + name="test_table", + marker="test_marker", + target="target0", + importance=0.0, + ) diff --git a/tests/unit/serialize/test_pipeline_information.py b/tests/unit/serialize/test_pipeline_information.py index d57f335..f7fc5c1 100644 --- a/tests/unit/serialize/test_pipeline_information.py +++ b/tests/unit/serialize/test_pipeline_information.py @@ -82,6 +82,7 @@ def _get_expected_pipeline_information() -> PipelineInformationType: "population": None, "peripheral": [], }, + "tables": [], } diff --git a/tests/unit/types.py b/tests/unit/types.py index b366e42..0976455 100644 --- a/tests/unit/types.py +++ b/tests/unit/types.py @@ -49,6 +49,8 @@ str, DataFrameMetaDataType | Sequence[DataFrameMetaDataType] | None, ] +TableType = Mapping[str, str | float] +TablesType = Sequence[TableType] PipelineInformationType = Mapping[ str, str @@ -63,5 +65,6 @@ | Sequence[str] | FeaturesType | ScoresType - | PipelineMetaDataType, + | PipelineMetaDataType + | TablesType, ]