From c3ce193892e9f1219a2df42ffb23d582742849cf Mon Sep 17 00:00:00 2001
From: Manuel Bellersen <bellersenm+github@gmail.com>
Date: Tue, 2 Sep 2025 19:28:45 +0200
Subject: [PATCH] GH-52: feat(pipeline): Add table information to pipeline
 serialization

Adds a new `Table` data model and integrates it into the `PipelineInformation` model. This includes:
- A new serialization function `serialize_tables` to convert getML `Tables` objects.
- Updates to the main `serialize_pipeline` function to include table data.
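- Corresponding additions to unit and integration tests to cover the new functionality.
- Regenerated expected pipeline JSON fixtures: besides the new `tables` section, the order of the `aggregation` lists changed.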
---
 src/getml_io/getml/tables.py                  |  8 +++
 src/getml_io/metadata/pipeline_information.py |  3 +-
 src/getml_io/serialize/pipeline.py            | 26 ++++++++-
 .../data/loans/expected.pipeline.json         | 40 +++++++++----
 .../data/numerical/expected.pipeline.json     | 32 ++++++++---
 .../data/robot/expected.pipeline.json         | 56 ++++++++++++++++---
 tests/unit/conftest.py                        | 33 +++++++++++
 .../metadata/test_pipeline_information.py     |  9 +++
 tests/unit/serialize/test_pipeline.py         | 20 +++++++
 .../serialize/test_pipeline_information.py    |  1 +
 tests/unit/types.py                           |  5 +-
 11 files changed, 202 insertions(+), 31 deletions(-)
 create mode 100644 src/getml_io/getml/tables.py

diff --git a/src/getml_io/getml/tables.py b/src/getml_io/getml/tables.py
new file mode 100644
index 0000000..65c68b7
--- /dev/null
+++ b/src/getml_io/getml/tables.py
@@ -0,0 +1,8 @@
+from pydantic import BaseModel
+
+
+class Table(BaseModel, frozen=True):  # frozen: instances are immutable
+    name: str  # table name, e.g. "population"
+    marker: str  # table marker, e.g. "[POPULATION]" or "[PERIPHERAL]"
+    target: str  # name of the target the importance refers to
+    importance: float  # importance of this table for that target
diff --git a/src/getml_io/metadata/pipeline_information.py b/src/getml_io/metadata/pipeline_information.py
index 4bfeb5b..88f9ab9 100644
--- a/src/getml_io/metadata/pipeline_information.py
+++ b/src/getml_io/metadata/pipeline_information.py
@@ -16,6 +16,7 @@
 from getml_io.getml.predictors import FeatureSelector, Predictor
 from getml_io.getml.preprocessors import Preprocessor
 from getml_io.getml.scores import Scores
+from getml_io.getml.tables import Table
 from getml_io.metadata.data_model_information import DataModelInformation
 from getml_io.metadata.dataframe_information import DataFrameInformationByName
 from getml_io.metadata.placeholder_information import PlaceholderInformation
@@ -47,4 +48,4 @@ class PipelineInformation(BaseModel, frozen=True):
     scores: Scores
     columns: Sequence[Column]
     metadata: PipelineMetaData
-    # tables # TODO @urfoex: #52
+    tables: Sequence[Table]
diff --git a/src/getml_io/serialize/pipeline.py b/src/getml_io/serialize/pipeline.py
index c8b3612..5d5dec9 100644
--- a/src/getml_io/serialize/pipeline.py
+++ b/src/getml_io/serialize/pipeline.py
@@ -18,12 +18,14 @@
 from getml.pipeline import Features as GetMLFeatures
 from getml.pipeline import Pipeline
 from getml.pipeline import Scores as GetMLScores
+from getml.pipeline import Tables as GetMLTables
 from getml.pipeline.column import Column as GetMLColumn
 from getml.pipeline.metadata import AllMetadata
 from getml.pipeline.metadata import Metadata as GetMLMetadata
 from getml.pipeline.score import ClassificationScore as GetMLClassificationScore
 from getml.pipeline.score import RegressionScore as GetMLRegressionScore
 from getml.pipeline.score import Score as GetMLScore
+from getml.pipeline.table import Table as GetMLTable
 from numpy.typing import NDArray
 
 from getml_io.getml.columns import Column
@@ -60,6 +62,7 @@
     TextFieldSplitter,
 )
 from getml_io.getml.scores import ClassificationScore, RegressionScore, Scores
+from getml_io.getml.tables import Table
 from getml_io.metadata.dataframe_information import DataFrameInformationByName
 from getml_io.metadata.pipeline_information import (
     LossFunction,
@@ -149,7 +152,7 @@ def serialize_pipeline(
         scores=serialize_scores(pipeline.scores),
         columns=serialize_columns(pipeline.columns),
         metadata=serialize_all_metadata(pipeline.metadata),
-        # tables # TODO @urfoex: #52
+        tables=serialize_tables(pipeline.tables),
     )
     pipeline_information_json_path = serialize_pipeline_information(
         pipeline_information=pipeline_information,
@@ -472,3 +475,24 @@ def _serialize_metadata(metadata: GetMLMetadata) -> DataFrameMetaData:
         name=metadata.name,
         roles=serialize_roles(metadata.roles),
     )
+
+
+def serialize_tables(tables: GetMLTables) -> list[Table]:
+    """Serialize getML Tables into a list of Table objects.
+
+    Args:
+        tables: The getML Tables to serialize.
+
+    Returns:
+        list[Table]: The serialized Tables information.
+
+    """
+    return [  # one entry per element of tables.data, in the same order
+        Table(  # fields are copied one-to-one from the getML table
+            name=table.name,
+            marker=table.marker,
+            target=table.target,
+            importance=table.importance,
+        )
+        for table in cast("Sequence[GetMLTable]", tables.data)  # typing-only cast
+    ]
diff --git a/tests/integration/data/loans/expected.pipeline.json b/tests/integration/data/loans/expected.pipeline.json
index 84c5fde..ab374d1 100644
--- a/tests/integration/data/loans/expected.pipeline.json
+++ b/tests/integration/data/loans/expected.pipeline.json
@@ -631,9 +631,9 @@
   "feature_learners": [
     {
       "aggregation": [
+        "SUM",
         "MIN",
         "COUNT",
-        "SUM",
         "MAX",
         "AVG"
       ],
@@ -649,19 +649,19 @@
       "num_threads": 0,
       "propositionalization": {
         "aggregation": [
-          "STDDEV",
-          "MIN",
-          "COUNT DISTINCT",
-          "COUNT",
-          "MEDIAN",
           "MAX",
           "SUM",
-          "FIRST",
+          "MODE",
           "LAST",
           "TREND",
-          "AVG",
+          "MIN",
           "COUNT MINUS COUNT DISTINCT",
-          "MODE"
+          "COUNT",
+          "FIRST",
+          "MEDIAN",
+          "AVG",
+          "COUNT DISTINCT",
+          "STDDEV"
         ],
         "delta_t": 0.0,
         "loss_function": "CrossEntropyLoss",
@@ -1365,5 +1365,25 @@
         }
       }
     ]
-  }
+  },
+  "tables": [
+    {
+      "name": "meta",
+      "marker": "[PERIPHERAL]",
+      "target": "default",
+      "importance": 0.2731073049046048
+    },
+    {
+      "name": "trans",
+      "marker": "[PERIPHERAL]",
+      "target": "default",
+      "importance": 0.39642128482217825
+    },
+    {
+      "name": "population",
+      "marker": "[POPULATION]",
+      "target": "default",
+      "importance": 0.330471410273217
+    }
+  ]
 }
\ No newline at end of file
diff --git a/tests/integration/data/numerical/expected.pipeline.json b/tests/integration/data/numerical/expected.pipeline.json
index 2821b96..9b148de 100644
--- a/tests/integration/data/numerical/expected.pipeline.json
+++ b/tests/integration/data/numerical/expected.pipeline.json
@@ -570,19 +570,19 @@
       "num_threads": 0,
       "propositionalization": {
         "aggregation": [
-          "STDDEV",
-          "MIN",
-          "COUNT DISTINCT",
-          "COUNT",
-          "MEDIAN",
           "MAX",
           "SUM",
-          "FIRST",
+          "MODE",
           "LAST",
           "TREND",
-          "AVG",
+          "MIN",
           "COUNT MINUS COUNT DISTINCT",
-          "MODE"
+          "COUNT",
+          "FIRST",
+          "MEDIAN",
+          "AVG",
+          "COUNT DISTINCT",
+          "STDDEV"
         ],
         "delta_t": 0.0,
         "loss_function": "SquareLoss",
@@ -915,5 +915,19 @@
         }
       }
     ]
-  }
+  },
+  "tables": [
+    {
+      "name": "perph",
+      "marker": "[PERIPHERAL]",
+      "target": "targets",
+      "importance": 0.2364818293321066
+    },
+    {
+      "name": "population",
+      "marker": "[POPULATION]",
+      "target": "targets",
+      "importance": 0.7635181706678932
+    }
+  ]
 }
\ No newline at end of file
diff --git a/tests/integration/data/robot/expected.pipeline.json b/tests/integration/data/robot/expected.pipeline.json
index 4efd474..a1cb26e 100644
--- a/tests/integration/data/robot/expected.pipeline.json
+++ b/tests/integration/data/robot/expected.pipeline.json
@@ -7020,19 +7020,19 @@
       "num_threads": 0,
       "propositionalization": {
         "aggregation": [
-          "STDDEV",
-          "MIN",
-          "COUNT DISTINCT",
-          "COUNT",
-          "MEDIAN",
           "MAX",
           "SUM",
-          "FIRST",
+          "MODE",
           "LAST",
           "TREND",
-          "AVG",
+          "MIN",
           "COUNT MINUS COUNT DISTINCT",
-          "MODE"
+          "COUNT",
+          "FIRST",
+          "MEDIAN",
+          "AVG",
+          "COUNT DISTINCT",
+          "STDDEV"
         ],
         "delta_t": 0.0,
         "loss_function": "SquareLoss",
@@ -11546,5 +11546,43 @@
         }
       }
     ]
-  }
+  },
+  "tables": [
+    {
+      "name": "full",
+      "marker": "[PERIPHERAL]",
+      "target": "f_x",
+      "importance": 0.6157352560942454
+    },
+    {
+      "name": "population",
+      "marker": "[POPULATION]",
+      "target": "f_x",
+      "importance": 0.3842647439057541
+    },
+    {
+      "name": "full",
+      "marker": "[PERIPHERAL]",
+      "target": "f_y",
+      "importance": 0.6632312212854787
+    },
+    {
+      "name": "population",
+      "marker": "[POPULATION]",
+      "target": "f_y",
+      "importance": 0.33676877871452054
+    },
+    {
+      "name": "full",
+      "marker": "[PERIPHERAL]",
+      "target": "f_z",
+      "importance": 0.5290353310682591
+    },
+    {
+      "name": "population",
+      "marker": "[POPULATION]",
+      "target": "f_z",
+      "importance": 0.470964668931741
+    }
+  ]
 }
\ No newline at end of file
diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py
index 2a198cc..d60e3cd 100644
--- a/tests/unit/conftest.py
+++ b/tests/unit/conftest.py
@@ -26,12 +26,14 @@
 from getml.pipeline import Features as GetMLFeatures
 from getml.pipeline import Pipeline
 from getml.pipeline import Scores as GetMLScores
+from getml.pipeline import Tables as GetMLTables
 from getml.pipeline.columns import Column as GetMLColumn
 from getml.pipeline.feature import Feature as GetMLFeature
 from getml.pipeline.metadata import AllMetadata as GetMLAllMetadata
 from getml.pipeline.metadata import Metadata as GetMLMetadata
 from getml.pipeline.score import ClassificationScore as GetMLClassificationScore
 from getml.pipeline.score import RegressionScore as GetMLRegressionScore
+from getml.pipeline.table import Table as GetMLTable
 from numpy.typing import NDArray
 
 from getml_io.getml.columns import Column
@@ -48,6 +50,7 @@
 from getml_io.getml.relationships import Relationship
 from getml_io.getml.roles import Role, Roles
 from getml_io.getml.scores import ClassificationScore, Scores
+from getml_io.getml.tables import Table
 from getml_io.metadata.container_information import ContainerInformation
 from getml_io.metadata.data_model_information import DataModelInformation
 from getml_io.metadata.dataframe_information import (
@@ -445,6 +448,7 @@ def pipeline_information_empty(
         scores=[],
         columns=[],
         metadata=PipelineMetaData(population=None, peripheral=[]),
+        tables=[],
     )
 
 
@@ -541,6 +545,19 @@ def getml_all_metadata() -> GetMLAllMetadata:
     )
 
 
+@pytest.fixture
+def mock_tables(mocker: pytest_mock.MockerFixture) -> GetMLTables:
+    table = GetMLTable(  # a real getML Table instance, not a mock
+        name="test_table",
+        marker="test_marker",
+        target="target0",
+        importance=0.0,
+    )
+    tables = mocker.MagicMock(spec=GetMLTables)  # mock of the Tables container
+    tables.data = [table]  # `data` is the attribute serialize_tables iterates
+    return tables
+
+
 @pytest.fixture
 def mock_pipeline(  # noqa: PLR0913
     mocker: pytest_mock.MockerFixture,
@@ -550,6 +567,7 @@ def mock_pipeline(  # noqa: PLR0913
     mock_scores_regression: GetMLScores,
     mock_columns: GetMLColumns,
     getml_all_metadata: GetMLAllMetadata,
+    mock_tables: GetMLTables,
 ) -> Pipeline:
     pipeline = mocker.Mock()
     pipeline.id = "mock_pipeline_id"
@@ -588,6 +606,7 @@ def pipeline_transform(_: DataFrame | View | Subset, *, df_name: str) -> DataFra
     pipeline.scores = mock_scores_regression
     pipeline.columns = mock_columns
     pipeline.metadata = getml_all_metadata
+    pipeline.tables = mock_tables
     return pipeline
 
 
@@ -830,6 +849,18 @@ def meta_data(
     )
 
 
+@pytest.fixture
+def tables() -> list[Table]:
+    return [  # same field values as the getML table built in `mock_tables`
+        Table(
+            name="test_table",
+            marker="test_marker",
+            target="target0",
+            importance=0.0,
+        ),
+    ]
+
+
 @pytest.fixture
 def pipeline_information(  # noqa: PLR0913
     data_model_information: DataModelInformation,
@@ -842,6 +873,7 @@ def pipeline_information(  # noqa: PLR0913
     scores: Scores,
     columns: Sequence[Column],
     meta_data: PipelineMetaData,
+    tables: Sequence[Table],
 ) -> PipelineInformation:
     return PipelineInformation(
         id="pipeline_id",
@@ -864,6 +896,7 @@ def pipeline_information(  # noqa: PLR0913
         scores=scores,
         columns=columns,
         metadata=meta_data,
+        tables=tables,
     )
 
 
diff --git a/tests/unit/metadata/test_pipeline_information.py b/tests/unit/metadata/test_pipeline_information.py
index 675ed20..0fbb0ce 100644
--- a/tests/unit/metadata/test_pipeline_information.py
+++ b/tests/unit/metadata/test_pipeline_information.py
@@ -65,6 +65,7 @@ def _get_expected_serialized_empty_pipeline_information() -> PipelineInformation
             "population": None,
             "peripheral": [],
         },
+        "tables": [],
     }
 
 
@@ -402,4 +403,12 @@ def _get_expected_serialized_pipeline_information() -> PipelineInformationType:
                 },
             ],
         },
+        "tables": [
+            {
+                "importance": 0.0,
+                "marker": "test_marker",
+                "name": "test_table",
+                "target": "target0",
+            },
+        ],
     }
diff --git a/tests/unit/serialize/test_pipeline.py b/tests/unit/serialize/test_pipeline.py
index 6c29c21..f4c0750 100644
--- a/tests/unit/serialize/test_pipeline.py
+++ b/tests/unit/serialize/test_pipeline.py
@@ -10,6 +10,7 @@
 from getml.pipeline import Features as GetMLFeatures
 from getml.pipeline import Pipeline
 from getml.pipeline import Scores as GetMLScores
+from getml.pipeline import Tables as GetMLTables
 from getml.pipeline.metadata import AllMetadata as GetMLAllMetadata
 
 from getml_io.getml.columns import Column
@@ -43,6 +44,7 @@
 )
 from getml_io.getml.roles import Roles
 from getml_io.getml.scores import ClassificationScore, RegressionScore
+from getml_io.getml.tables import Table
 from getml_io.metadata.dataframe_information import DataFrameInformationByName
 from getml_io.metadata.pipeline_information import LossFunction
 from getml_io.serialize.exception import WrongPipelineScoreTypeError
@@ -57,6 +59,7 @@
     serialize_predictor,
     serialize_preprocessor,
     serialize_scores,
+    serialize_tables,
 )
 from tests.unit.conftest import MockDuckDBExecuteFactory
 
@@ -506,3 +509,20 @@ def test_serialize_metadata_empty() -> None:
     # Then
     assert pipeline_metadata.population is None
     assert pipeline_metadata.peripheral == []
+
+
+@pytest.mark.unit
+def test_serialize_tables(mock_tables: GetMLTables) -> None:
+    # Given: the `mock_tables` fixture, whose `data` holds a single getML table
+
+    # When
+    tables = serialize_tables(mock_tables)
+
+    # Then: exactly one Table comes back, carrying the same field values
+    assert len(tables) == 1
+    assert tables[0] == Table(
+        name="test_table",
+        marker="test_marker",
+        target="target0",
+        importance=0.0,
+    )
diff --git a/tests/unit/serialize/test_pipeline_information.py b/tests/unit/serialize/test_pipeline_information.py
index d57f335..f7fc5c1 100644
--- a/tests/unit/serialize/test_pipeline_information.py
+++ b/tests/unit/serialize/test_pipeline_information.py
@@ -82,6 +82,7 @@ def _get_expected_pipeline_information() -> PipelineInformationType:
             "population": None,
             "peripheral": [],
         },
+        "tables": [],
     }
 
 
diff --git a/tests/unit/types.py b/tests/unit/types.py
index b366e42..0976455 100644
--- a/tests/unit/types.py
+++ b/tests/unit/types.py
@@ -49,6 +49,8 @@
     str,
     DataFrameMetaDataType | Sequence[DataFrameMetaDataType] | None,
 ]
+TableType = Mapping[str, str | float]
+TablesType = Sequence[TableType]
 PipelineInformationType = Mapping[
     str,
     str
@@ -63,5 +65,6 @@
     | Sequence[str]
     | FeaturesType
     | ScoresType
-    | PipelineMetaDataType,
+    | PipelineMetaDataType
+    | TablesType,
 ]