Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions src/getml_io/getml/metadatas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from pydantic import BaseModel

from getml_io.getml.roles import Roles


class DataFrameMetaData(BaseModel, frozen=True):
    """Name and roles describing a single data frame.

    The model is declared with ``frozen=True``, so pydantic rejects attribute
    assignment on instances after construction.
    """

    # Name of the data frame.
    name: str
    # Roles of the data frame, expressed with the project's Roles model.
    roles: Roles


class PipelineMetaData(BaseModel, frozen=True):
    """Metadata of the population and peripheral data frames of a pipeline.

    Neither field declares a default, so both must be passed explicitly when
    constructing an instance; ``population`` may be passed as ``None``.
    The model is declared with ``frozen=True``, so pydantic rejects attribute
    assignment on instances after construction.
    """

    # Metadata of the population data frame, or None when there is none.
    population: DataFrameMetaData | None
    # Metadata of the peripheral data frames; may be an empty list.
    peripheral: list[DataFrameMetaData]
3 changes: 2 additions & 1 deletion src/getml_io/metadata/pipeline_information.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from getml_io.getml.columns import Column
from getml_io.getml.feature_learning import FeatureLearner
from getml_io.getml.features import Features
from getml_io.getml.metadatas import PipelineMetaData
from getml_io.getml.predictors import FeatureSelector, Predictor
from getml_io.getml.preprocessors import Preprocessor
from getml_io.getml.scores import Scores
Expand Down Expand Up @@ -45,5 +46,5 @@ class PipelineInformation(BaseModel, frozen=True):
features: Features
scores: Scores
columns: Sequence[Column]
# metadata # TODO @urfoex: #51
metadata: PipelineMetaData
# tables # TODO @urfoex: #52
22 changes: 15 additions & 7 deletions src/getml_io/serialize/data_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,23 @@ def serialize_data_model(data_model: DataModel) -> DataModelInformation:
DataModelInformation: The serialized DataModel information.

"""
peripheral = {
name: [
serialize_placeholder(placeholder)
for placeholder in (
[placeholders]
if isinstance(placeholders, Placeholder)
else placeholders
)
]
for name, placeholders in cast(
"dict[str, Placeholder | list[Placeholder]]",
data_model.peripheral,
).items()
}
return DataModelInformation(
population=serialize_placeholder(
data_model.population,
),
peripheral={
name: [serialize_placeholder(placeholder) for placeholder in placeholders]
for name, placeholders in cast(
"dict[str, list[Placeholder]]",
cast("object", data_model.peripheral),
).items()
},
peripheral=peripheral,
)
37 changes: 36 additions & 1 deletion src/getml_io/serialize/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
from getml.pipeline import Pipeline
from getml.pipeline import Scores as GetMLScores
from getml.pipeline.column import Column as GetMLColumn
from getml.pipeline.metadata import AllMetadata
from getml.pipeline.metadata import Metadata as GetMLMetadata
from getml.pipeline.score import ClassificationScore as GetMLClassificationScore
from getml.pipeline.score import RegressionScore as GetMLRegressionScore
from getml.pipeline.score import Score as GetMLScore
Expand All @@ -34,6 +36,10 @@
RelMT,
)
from getml_io.getml.features import Feature, Features
from getml_io.getml.metadatas import (
DataFrameMetaData,
PipelineMetaData,
)
from getml_io.getml.predictors import (
LinearRegression,
LogisticRegression,
Expand Down Expand Up @@ -69,6 +75,7 @@
)
from getml_io.serialize.pipeline_information import serialize_pipeline_information
from getml_io.serialize.placeholder import serialize_placeholder
from getml_io.serialize.roles import serialize_roles
from getml_io.utils.convert import (
assume_is_dict_str_to_dataframe_or_view,
)
Expand Down Expand Up @@ -141,7 +148,7 @@ def serialize_pipeline(
features=serialize_features(pipeline.features),
scores=serialize_scores(pipeline.scores),
columns=serialize_columns(pipeline.columns),
# metadata # TODO @urfoex: #51
metadata=serialize_all_metadata(pipeline.metadata),
# tables # TODO @urfoex: #52
)
pipeline_information_json_path = serialize_pipeline_information(
Expand Down Expand Up @@ -437,3 +444,31 @@ def serialize_columns(getml_columns: GetMLColumns | None) -> list[Column]:
)
for column in columns
]


def serialize_all_metadata(all_metadata: AllMetadata | None) -> PipelineMetaData:
    """Convert getML AllMetadata into a PipelineMetaData object.

    Args:
        all_metadata: The getML AllMetadata to serialize, or None when no
            metadata is available.

    Returns:
        PipelineMetaData: The serialized population and peripheral metadata.
            When all_metadata is None, population is None and peripheral is
            an empty list.

    """
    # Start from the "no metadata" result and fill it in only when the
    # pipeline actually provides metadata.
    population: DataFrameMetaData | None = None
    peripheral: list[DataFrameMetaData] = []

    if all_metadata is not None:
        population = _serialize_metadata(all_metadata.population)
        peripheral = [
            _serialize_metadata(entry) for entry in all_metadata.peripheral
        ]

    return PipelineMetaData(population=population, peripheral=peripheral)


def _serialize_metadata(metadata: GetMLMetadata) -> DataFrameMetaData:
    """Convert a single getML Metadata entry into a DataFrameMetaData.

    Args:
        metadata: The getML Metadata to serialize.

    Returns:
        DataFrameMetaData: The entry's name together with its serialized roles.

    """
    frame_name = metadata.name
    frame_roles = serialize_roles(metadata.roles)
    return DataFrameMetaData(name=frame_name, roles=frame_roles)
151 changes: 140 additions & 11 deletions tests/integration/data/loans/expected.pipeline.json
Original file line number Diff line number Diff line change
Expand Up @@ -631,11 +631,11 @@
"feature_learners": [
{
"aggregation": [
"AVG",
"MAX",
"MIN",
"COUNT",
"SUM",
"MIN"
"MAX",
"AVG"
],
Comment thread
Urfoex marked this conversation as resolved.
"allow_sets": true,
"delta_t": 0.0,
Expand All @@ -649,19 +649,19 @@
"num_threads": 0,
"propositionalization": {
"aggregation": [
"MODE",
"COUNT MINUS COUNT DISTINCT",
"STDDEV",
"FIRST",
"MIN",
"COUNT DISTINCT",
"COUNT",
"AVG",
"MEDIAN",
"SUM",
"MAX",
"SUM",
"FIRST",
"LAST",
"COUNT DISTINCT",
"TREND",
"MIN"
"AVG",
"COUNT MINUS COUNT DISTINCT",
"MODE"
],
"delta_t": 0.0,
"loss_function": "CrossEntropyLoss",
Expand Down Expand Up @@ -1236,5 +1236,134 @@
"target": "default",
"importance": 0.16812257116305224
}
]
],
"metadata": {
"population": {
"name": "train",
"roles": {
"categorical": [
"frequency"
],
"join_key": [
"account_id"
],
"numerical": [
"duration",
"payments",
"amount"
],
"target": [
"default"
],
"text": [],
"time_stamp": [
"date_loan"
],
"unused_float": [
"loan_id",
"district_id"
],
"unused_string": [
"date_account",
"status"
]
}
},
"peripheral": [
{
"name": "meta",
"roles": {
"categorical": [
"type_disp",
"type_card",
"gender",
"A3"
],
"join_key": [
"account_id"
],
"numerical": [
"A4",
"A5",
"A6",
"A7",
"A8",
"A9",
"A10",
"A11",
"A12",
"A13",
"A14",
"A15",
"A16"
],
"target": [],
"text": [],
"time_stamp": [],
"unused_float": [
"disp_id",
"client_id",
"card_id",
"district_id"
],
"unused_string": [
"issued",
"birth_date",
"A2"
]
}
},
{
"name": "order",
"roles": {
"categorical": [
"bank_to",
"k_symbol"
],
"join_key": [
"account_id"
],
"numerical": [
"amount"
],
"target": [],
"text": [],
"time_stamp": [],
"unused_float": [
"account_to",
"order_id"
],
"unused_string": []
}
},
{
"name": "trans",
"roles": {
"categorical": [
"type",
"k_symbol",
"bank",
"operation"
],
"join_key": [
"account_id"
],
"numerical": [
"amount",
"balance"
],
"target": [],
"text": [],
"time_stamp": [
"date"
],
"unused_float": [
"trans_id",
"account"
],
"unused_string": []
}
}
]
}
}
60 changes: 52 additions & 8 deletions tests/integration/data/numerical/expected.pipeline.json
Original file line number Diff line number Diff line change
Expand Up @@ -570,19 +570,19 @@
"num_threads": 0,
"propositionalization": {
"aggregation": [
"MODE",
"COUNT MINUS COUNT DISTINCT",
"STDDEV",
"FIRST",
"MIN",
"COUNT DISTINCT",
"COUNT",
"AVG",
"MEDIAN",
"SUM",
"MAX",
"SUM",
"FIRST",
"LAST",
"COUNT DISTINCT",
"TREND",
"MIN"
"AVG",
"COUNT MINUS COUNT DISTINCT",
"MODE"
],
"delta_t": 0.0,
"loss_function": "SquareLoss",
Expand Down Expand Up @@ -871,5 +871,49 @@
"target": "targets",
"importance": 0.7610534571004377
}
]
],
"metadata": {
"population": {
"name": "train",
"roles": {
"categorical": [],
"join_key": [
"join_key"
],
"numerical": [
"column_01"
],
"target": [
"targets"
],
"text": [],
"time_stamp": [
"time_stamp"
],
"unused_float": [],
"unused_string": []
}
},
"peripheral": [
{
"name": "perph",
"roles": {
"categorical": [],
"join_key": [
"join_key"
],
"numerical": [
"column_01"
],
"target": [],
"text": [],
"time_stamp": [
"time_stamp"
],
"unused_float": [],
"unused_string": []
}
}
]
}
}
Loading