diff --git a/src/getml_io/getml/metadatas.py b/src/getml_io/getml/metadatas.py new file mode 100644 index 0000000..6ebe8b6 --- /dev/null +++ b/src/getml_io/getml/metadatas.py @@ -0,0 +1,13 @@ +from pydantic import BaseModel + +from getml_io.getml.roles import Roles + + +class DataFrameMetaData(BaseModel, frozen=True): + name: str + roles: Roles + + +class PipelineMetaData(BaseModel, frozen=True): + population: DataFrameMetaData | None + peripheral: list[DataFrameMetaData] diff --git a/src/getml_io/metadata/pipeline_information.py b/src/getml_io/metadata/pipeline_information.py index 97535bb..4bfeb5b 100644 --- a/src/getml_io/metadata/pipeline_information.py +++ b/src/getml_io/metadata/pipeline_information.py @@ -12,6 +12,7 @@ from getml_io.getml.columns import Column from getml_io.getml.feature_learning import FeatureLearner from getml_io.getml.features import Features +from getml_io.getml.metadatas import PipelineMetaData from getml_io.getml.predictors import FeatureSelector, Predictor from getml_io.getml.preprocessors import Preprocessor from getml_io.getml.scores import Scores @@ -45,5 +46,5 @@ class PipelineInformation(BaseModel, frozen=True): features: Features scores: Scores columns: Sequence[Column] - # metadata # TODO @urfoex: #51 + metadata: PipelineMetaData # tables # TODO @urfoex: #52 diff --git a/src/getml_io/serialize/data_model.py b/src/getml_io/serialize/data_model.py index a2f0e23..f65b639 100644 --- a/src/getml_io/serialize/data_model.py +++ b/src/getml_io/serialize/data_model.py @@ -19,15 +19,23 @@ def serialize_data_model(data_model: DataModel) -> DataModelInformation: DataModelInformation: The serialized DataModel information. """ + peripheral = { + name: [ + serialize_placeholder(placeholder) + for placeholder in ( + [placeholders] + if isinstance(placeholders, Placeholder) + else placeholders + ) + ] + for name, placeholders in cast( + "dict[str, Placeholder | list[Placeholder]]", + data_model.peripheral, + ).items() + } return DataModelInformation( population=serialize_placeholder( data_model.population, ), - peripheral={ - name: [serialize_placeholder(placeholder) for placeholder in placeholders] - for name, placeholders in cast( - "dict[str, list[Placeholder]]", - cast("object", data_model.peripheral), - ).items() - }, + peripheral=peripheral, ) diff --git a/src/getml_io/serialize/pipeline.py b/src/getml_io/serialize/pipeline.py index 8f5ca9a..c8b3612 100644 --- a/src/getml_io/serialize/pipeline.py +++ b/src/getml_io/serialize/pipeline.py @@ -19,6 +19,8 @@ from getml.pipeline import Pipeline from getml.pipeline import Scores as GetMLScores from getml.pipeline.column import Column as GetMLColumn +from getml.pipeline.metadata import AllMetadata +from getml.pipeline.metadata import Metadata as GetMLMetadata from getml.pipeline.score import ClassificationScore as GetMLClassificationScore from getml.pipeline.score import RegressionScore as GetMLRegressionScore from getml.pipeline.score import Score as GetMLScore @@ -34,6 +36,10 @@ RelMT, ) from getml_io.getml.features import Feature, Features +from getml_io.getml.metadatas import ( + DataFrameMetaData, + PipelineMetaData, +) from getml_io.getml.predictors import ( LinearRegression, LogisticRegression, @@ -69,6 +75,7 @@ ) from getml_io.serialize.pipeline_information import serialize_pipeline_information from getml_io.serialize.placeholder import serialize_placeholder +from getml_io.serialize.roles import serialize_roles from getml_io.utils.convert import ( assume_is_dict_str_to_dataframe_or_view, ) @@ -141,7 +148,7 @@ def serialize_pipeline( features=serialize_features(pipeline.features), scores=serialize_scores(pipeline.scores), columns=serialize_columns(pipeline.columns), - # metadata # TODO @urfoex: #51 + metadata=serialize_all_metadata(pipeline.metadata), # tables # TODO @urfoex: #52 ) pipeline_information_json_path = serialize_pipeline_information( @@ -437,3 +444,31 @@ def serialize_columns(getml_columns: GetMLColumns | None) -> list[Column]: ) for column in columns ] + + +def serialize_all_metadata(all_metadata: AllMetadata | None) -> PipelineMetaData: + """Serialize getML AllMetadata into a PipelineMetaData object. + + Args: + all_metadata: The getML AllMetadata to serialize. + + Returns: + PipelineMetaData: The serialized PipelineMetaData information. + + """ + if all_metadata is None: + return PipelineMetaData(population=None, peripheral=[]) + + return PipelineMetaData( + population=_serialize_metadata(all_metadata.population), + peripheral=[ + _serialize_metadata(metadata) for metadata in all_metadata.peripheral + ], + ) + + +def _serialize_metadata(metadata: GetMLMetadata) -> DataFrameMetaData: + return DataFrameMetaData( + name=metadata.name, + roles=serialize_roles(metadata.roles), + ) diff --git a/tests/integration/data/loans/expected.pipeline.json b/tests/integration/data/loans/expected.pipeline.json index f5761b3..84c5fde 100644 --- a/tests/integration/data/loans/expected.pipeline.json +++ b/tests/integration/data/loans/expected.pipeline.json @@ -631,11 +631,11 @@ "feature_learners": [ { "aggregation": [ - "AVG", - "MAX", + "MIN", "COUNT", "SUM", - "MIN" + "MAX", + "AVG" ], "allow_sets": true, "delta_t": 0.0, @@ -649,19 +649,19 @@ "num_threads": 0, "propositionalization": { "aggregation": [ - "MODE", - "COUNT MINUS COUNT DISTINCT", "STDDEV", - "FIRST", + "MIN", + "COUNT DISTINCT", "COUNT", - "AVG", "MEDIAN", - "SUM", "MAX", + "SUM", + "FIRST", "LAST", - "COUNT DISTINCT", "TREND", - "MIN" + "AVG", + "COUNT MINUS COUNT DISTINCT", + "MODE" ], "delta_t": 0.0, "loss_function": "CrossEntropyLoss", @@ -1236,5 +1236,134 @@ "target": "default", "importance": 0.16812257116305224 } - ] + ], + "metadata": { + "population": { + "name": "train", + "roles": { + "categorical": [ + "frequency" + ], + "join_key": [ + "account_id" + ], + "numerical": [ + "duration", + "payments", + "amount" + ], + "target": [ + "default" + ], + "text": [], + "time_stamp": [ + "date_loan" + ], + "unused_float": [ + "loan_id", + "district_id" + ], + "unused_string": [ + "date_account", + "status" + ] + } + }, + "peripheral": [ + { + "name": "meta", + "roles": { + "categorical": [ + "type_disp", + "type_card", + "gender", + "A3" + ], + "join_key": [ + "account_id" + ], + "numerical": [ + "A4", + "A5", + "A6", + "A7", + "A8", + "A9", + "A10", + "A11", + "A12", + "A13", + "A14", + "A15", + "A16" + ], + "target": [], + "text": [], + "time_stamp": [], + "unused_float": [ + "disp_id", + "client_id", + "card_id", + "district_id" + ], + "unused_string": [ + "issued", + "birth_date", + "A2" + ] + } + }, + { + "name": "order", + "roles": { + "categorical": [ + "bank_to", + "k_symbol" + ], + "join_key": [ + "account_id" + ], + "numerical": [ + "amount" + ], + "target": [], + "text": [], + "time_stamp": [], + "unused_float": [ + "account_to", + "order_id" + ], + "unused_string": [] + } + }, + { + "name": "trans", + "roles": { + "categorical": [ + "type", + "k_symbol", + "bank", + "operation" + ], + "join_key": [ + "account_id" + ], + "numerical": [ + "amount", + "balance" + ], + "target": [], + "text": [], + "time_stamp": [ + "date" + ], + "unused_float": [ + "trans_id", + "account" + ], + "unused_string": [] + } + } + ] + } } \ No newline at end of file diff --git a/tests/integration/data/numerical/expected.pipeline.json b/tests/integration/data/numerical/expected.pipeline.json index 2dfc12b..2821b96 100644 --- a/tests/integration/data/numerical/expected.pipeline.json +++ b/tests/integration/data/numerical/expected.pipeline.json @@ -570,19 +570,19 @@ "num_threads": 0, "propositionalization": { "aggregation": [ - "MODE", - "COUNT MINUS COUNT DISTINCT", "STDDEV", - "FIRST", + "MIN", + "COUNT DISTINCT", "COUNT", - "AVG", "MEDIAN", - "SUM", "MAX", + "SUM", + "FIRST", "LAST", - "COUNT DISTINCT", "TREND", - "MIN" + "AVG", + "COUNT MINUS COUNT DISTINCT", + "MODE" ], "delta_t": 0.0, "loss_function": "SquareLoss", @@ -871,5 +871,49 @@ "target": "targets", "importance": 0.7610534571004377 } - ] + ], + "metadata": { + "population": { + "name": "train", + "roles": { + "categorical": [], + "join_key": [ + "join_key" + ], + "numerical": [ + "column_01" + ], + "target": [ + "targets" + ], + "text": [], + "time_stamp": [ + "time_stamp" + ], + "unused_float": [], + "unused_string": [] + } + }, + "peripheral": [ + { + "name": "perph", + "roles": { + "categorical": [], + "join_key": [ + "join_key" + ], + "numerical": [ + "column_01" + ], + "target": [], + "text": [], + "time_stamp": [ + "time_stamp" + ], + "unused_float": [], + "unused_string": [] + } + } + ] + } } \ No newline at end of file diff --git a/tests/integration/data/robot/expected.pipeline.json b/tests/integration/data/robot/expected.pipeline.json index 26cb892..4efd474 100644 --- a/tests/integration/data/robot/expected.pipeline.json +++ b/tests/integration/data/robot/expected.pipeline.json @@ -7020,19 +7020,19 @@ "num_threads": 0, "propositionalization": { "aggregation": [ - "MODE", - "COUNT MINUS COUNT DISTINCT", "STDDEV", - "FIRST", + "MIN", + "COUNT DISTINCT", "COUNT", - "AVG", "MEDIAN", - "SUM", "MAX", + "SUM", + "FIRST", "LAST", - "COUNT DISTINCT", "TREND", - "MIN" + "AVG", + "COUNT MINUS COUNT DISTINCT", + "MODE" ], "delta_t": 0.0, "loss_function": "SquareLoss", @@ -11316,5 +11316,235 @@ "target": "f_z", "importance": 0.050518213606767795 } - ] + ], + "metadata": { + "population": { + "name": "full", + "roles": { + "categorical": [], + "join_key": [], + "numerical": [ + "3", + "4", + "5", + "6", + "7", + "8", + "9", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "21", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + "31", + "32", + "33", + "34", + "35", + "36", + "37", + "38", + "39", + "40", + "41", + "42", + "43", + "44", + "45", + "46", + "47", + "48", + "49", + "50", + "51", + "52", + "53", + "54", + "55", + "56", + "57", + "58", + "59", + "60", + "61", + "62", + "63", + "64", + "65", + "66", + "67", + "68", + "69", + "70", + "71", + "72", + "73", + "74", + "75", + "76", + "77", + "78", + "79", + "80", + "81", + "82", + "83", + "84", + "85", + "86", + "98", + "99", + "100", + "101", + "102", + "103", + "104", + "105", + "106" + ], + "target": [ + "f_x", + "f_y", + "f_z" + ], + "text": [], + "time_stamp": [ + "rowid" + ], + "unused_float": [], + "unused_string": [] + } + }, + "peripheral": [ + { + "name": "full", + "roles": { + "categorical": [], + "join_key": [], + "numerical": [ + "3", + "4", + "5", + "6", + "7", + "8", + "9", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "21", + "22", + "23", + "24", + "25", + "26", + "27", + "28", + "29", + "30", + "31", + "32", + "33", + "34", + "35", + "36", + "37", + "38", + "39", + "40", + "41", + "42", + "43", + "44", + "45", + "46", + "47", + "48", + "49", + "50", + "51", + "52", + "53", + "54", + "55", + "56", + "57", + "58", + "59", + "60", + "61", + "62", + "63", + "64", + "65", + "66", + "67", + "68", + "69", + "70", + "71", + "72", + "73", + "74", + "75", + "76", + "77", + "78", + "79", + "80", + "81", + "82", + "83", + "84", + "85", + "86", + "98", + "99", + "100", + "101", + "102", + "103", + "104", + "105", + "106" + ], + "target": [ + "f_x", + "f_y", + "f_z" + ], + "text": [], + "time_stamp": [ + "rowid" + ], + "unused_float": [], + "unused_string": [] + } + } + ] + } } \ No newline at end of file diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index 53951fc..2a198cc 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -19,6 +19,7 @@ from getml import predictors as getml_predictor from getml import preprocessors as getml_preprocessor from getml.data import Container, DataFrame, Placeholder, Subset, View +from getml.data import Roles as GetMLRoles from getml.data.roles.types import Role as GetMLRole from getml.feature_learning.loss_functions import CROSSENTROPYLOSS from getml.pipeline import Columns as GetMLColumns @@ -27,6 +28,8 @@ from getml.pipeline import Scores as GetMLScores from getml.pipeline.columns import Column as GetMLColumn from getml.pipeline.feature import Feature as GetMLFeature +from getml.pipeline.metadata import AllMetadata as GetMLAllMetadata +from getml.pipeline.metadata import Metadata as GetMLMetadata from getml.pipeline.score import ClassificationScore as GetMLClassificationScore from getml.pipeline.score import RegressionScore as GetMLRegressionScore from numpy.typing import NDArray @@ -34,6 +37,10 @@ from getml_io.getml.columns import Column from getml_io.getml.feature_learning import FastProp from getml_io.getml.features import Feature, Features +from getml_io.getml.metadatas import ( + DataFrameMetaData, + PipelineMetaData, +) from getml_io.getml.predictors import LinearRegression from getml_io.getml.preprocessors import CategoryTrimmer from getml_io.getml.project import Project @@ -437,6 +444,7 @@ def pipeline_information_empty( features={}, scores=[], columns=[], + metadata=PipelineMetaData(population=None, peripheral=[]), ) @@ -517,6 +525,22 @@ def mock_columns(mocker: pytest_mock.MockerFixture) -> GetMLColumns: return columns +@pytest.fixture +def getml_all_metadata() -> GetMLAllMetadata: + return GetMLAllMetadata( + population=GetMLMetadata( + name="placeholder_population", + roles=GetMLRoles(), + ), + peripheral=[ + GetMLMetadata( + name="placeholder_peripheral", + roles=GetMLRoles(), + ), + ], + ) + + @pytest.fixture def mock_pipeline( # noqa: PLR0913 mocker: pytest_mock.MockerFixture, @@ -525,6 +549,7 @@ def mock_pipeline( # noqa: PLR0913 mock_features: GetMLFeatures, mock_scores_regression: GetMLScores, mock_columns: GetMLColumns, + getml_all_metadata: GetMLAllMetadata, ) -> Pipeline: pipeline = mocker.Mock() pipeline.id = "mock_pipeline_id" @@ -562,6 +587,7 @@ def pipeline_transform(_: DataFrame | View | Subset, *, df_name: str) -> DataFra pipeline.features = mock_features pipeline.scores = mock_scores_regression pipeline.columns = mock_columns + pipeline.metadata = getml_all_metadata return pipeline @@ -743,50 +769,84 @@ def columns() -> list[Column]: @pytest.fixture -def pipeline_information( # noqa: PLR0913 +def predictions( dataframe_information_test: DataFrameInformation, dataframe_information_validation: DataFrameInformation, + predictions_path: Path, +) -> dict[str, DataFrameInformation]: + return { + "test": dataframe_information_test.model_copy( + update={ + "path": predictions_path / dataframe_information_test.path.name, + }, + ), + "validation": dataframe_information_validation.model_copy( + update={ + "path": predictions_path / dataframe_information_validation.path.name, + }, + ), + } + + +@pytest.fixture +def feature_sets( dataframe_information_features_test: DataFrameInformation, dataframe_information_features_validation: DataFrameInformation, + feature_sets_path: Path, +) -> dict[str, DataFrameInformation]: + return { + "test": dataframe_information_features_test.model_copy( + update={ + "path": feature_sets_path + / dataframe_information_features_test.path.name, + }, + ), + "validation": dataframe_information_features_validation.model_copy( + update={ + "path": feature_sets_path + / dataframe_information_features_validation.path.name, + }, + ), + } + + +@pytest.fixture +def meta_data( + data_model_information: DataModelInformation, +) -> PipelineMetaData: + return PipelineMetaData( + population=DataFrameMetaData( + name=data_model_information.population.name, + roles=data_model_information.population.roles, + ), + peripheral=[ + DataFrameMetaData( + name=placeholder.name, + roles=placeholder.roles, + ) + for placeholders in data_model_information.peripheral.values() + for placeholder in placeholders + ], + ) + + +@pytest.fixture +def pipeline_information( # noqa: PLR0913 data_model_information: DataModelInformation, fast_prop: FastProp, linear_regression: LinearRegression, category_trimmer: CategoryTrimmer, - predictions_path: Path, - feature_sets_path: Path, + predictions: dict[str, DataFrameInformation], + feature_sets: dict[str, DataFrameInformation], features: Features, scores: Scores, columns: Sequence[Column], + meta_data: PipelineMetaData, ) -> PipelineInformation: return PipelineInformation( id="pipeline_id", - predictions={ - "test": dataframe_information_test.model_copy( - update={ - "path": predictions_path / dataframe_information_test.path.name, - }, - ), - "validation": dataframe_information_validation.model_copy( - update={ - "path": predictions_path - / dataframe_information_validation.path.name, - }, - ), - }, - feature_sets={ - "test": dataframe_information_features_test.model_copy( - update={ - "path": feature_sets_path - / dataframe_information_features_test.path.name, - }, - ), - "validation": dataframe_information_features_validation.model_copy( - update={ - "path": feature_sets_path - / dataframe_information_features_validation.path.name, - }, - ), - }, + predictions=predictions, + feature_sets=feature_sets, feature_learners=[fast_prop], feature_selectors=[linear_regression], include_categorical=False, @@ -803,6 +863,7 @@ def pipeline_information( # noqa: PLR0913 features=features, scores=scores, columns=columns, + metadata=meta_data, ) diff --git a/tests/unit/metadata/test_pipeline_information.py b/tests/unit/metadata/test_pipeline_information.py index dfa279e..675ed20 100644 --- a/tests/unit/metadata/test_pipeline_information.py +++ b/tests/unit/metadata/test_pipeline_information.py @@ -61,6 +61,10 @@ def _get_expected_serialized_empty_pipeline_information() -> PipelineInformation "features": {}, "scores": [], "columns": [], + "metadata": { + "population": None, + "peripheral": [], + }, } @@ -368,4 +372,34 @@ def _get_expected_serialized_pipeline_information() -> PipelineInformationType: "target": "target0", }, ], + "metadata": { + "population": { + "name": "placeholder_name", + "roles": { + "categorical": [], + "join_key": [], + "numerical": [], + "target": [], + "text": [], + "time_stamp": [], + "unused_float": [], + "unused_string": [], + }, + }, + "peripheral": [ + { + "name": "placeholder_name", + "roles": { + "categorical": [], + "join_key": [], + "numerical": [], + "target": [], + "text": [], + "time_stamp": [], + "unused_float": [], + "unused_string": [], + }, + }, + ], + }, } diff --git a/tests/unit/serialize/test_pipeline.py b/tests/unit/serialize/test_pipeline.py index 37be437..6c29c21 100644 --- a/tests/unit/serialize/test_pipeline.py +++ b/tests/unit/serialize/test_pipeline.py @@ -10,6 +10,7 @@ from getml.pipeline import Features as GetMLFeatures from getml.pipeline import Pipeline from getml.pipeline import Scores as GetMLScores +from getml.pipeline.metadata import AllMetadata as GetMLAllMetadata from getml_io.getml.columns import Column from getml_io.getml.feature_learning import ( @@ -20,6 +21,7 @@ Relboost, RelMT, ) +from getml_io.getml.metadatas import DataFrameMetaData from getml_io.getml.predictors import ( LinearRegression, LogisticRegression, @@ -39,11 +41,13 @@ Substring, TextFieldSplitter, ) +from getml_io.getml.roles import Roles from getml_io.getml.scores import ClassificationScore, RegressionScore from getml_io.metadata.dataframe_information import DataFrameInformationByName from getml_io.metadata.pipeline_information import LossFunction from getml_io.serialize.exception import WrongPipelineScoreTypeError from getml_io.serialize.pipeline import ( + serialize_all_metadata, serialize_columns, serialize_feature_learner, serialize_feature_sets, @@ -470,3 +474,35 @@ def test_serialize_columns_empty() -> None: # Then assert len(columns) == 0 + + +@pytest.mark.unit +def test_serialize_metadata( + getml_all_metadata: GetMLAllMetadata, + roles_empty: Roles, +) -> None: + # Given + + # When + pipeline_metadata = serialize_all_metadata(getml_all_metadata) + + # Then + assert pipeline_metadata.population == DataFrameMetaData( + name="placeholder_population", + roles=roles_empty, + ) + assert pipeline_metadata.peripheral == [ + DataFrameMetaData(name="placeholder_peripheral", roles=roles_empty), + ] + + +@pytest.mark.unit +def test_serialize_metadata_empty() -> None: + # Given + + # When + pipeline_metadata = serialize_all_metadata(None) + + # Then + assert pipeline_metadata.population is None + assert pipeline_metadata.peripheral == [] diff --git a/tests/unit/serialize/test_pipeline_information.py b/tests/unit/serialize/test_pipeline_information.py index 848d81f..d57f335 100644 --- a/tests/unit/serialize/test_pipeline_information.py +++ b/tests/unit/serialize/test_pipeline_information.py @@ -78,6 +78,10 @@ def _get_expected_pipeline_information() -> PipelineInformationType: "features": {}, "scores": [], "columns": [], + "metadata": { + "population": None, + "peripheral": [], + }, } diff --git a/tests/unit/types.py b/tests/unit/types.py index f1f266b..b366e42 100644 --- a/tests/unit/types.py +++ b/tests/unit/types.py @@ -44,6 +44,11 @@ FeaturesType = Mapping[str, FeatureType] ScoreType = Mapping[str, str | float | datetime] ScoresType = Sequence[ScoreType] +DataFrameMetaDataType = Mapping[str, str | RolesType] +PipelineMetaDataType = Mapping[ + str, + DataFrameMetaDataType | Sequence[DataFrameMetaDataType] | None, +] PipelineInformationType = Mapping[ str, str @@ -57,5 +62,6 @@ | float | Sequence[str] | FeaturesType - | ScoresType, + | ScoresType + | PipelineMetaDataType, ]