From b0d9b1942b37e90a1b95d339828cb3b5742b3d27 Mon Sep 17 00:00:00 2001 From: BaptisteDE Date: Tue, 13 Jan 2026 09:00:58 +0100 Subject: [PATCH 1/3] =?UTF-8?q?=F0=9F=9A=A7=20untested=20first=20version?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_processing.py | 35 ++--- tide/processing.py | 292 +++++++++++++++++++++++++++++++++------ 2 files changed, 266 insertions(+), 61 deletions(-) diff --git a/tests/test_processing.py b/tests/test_processing.py index fc44425..e55f29e 100644 --- a/tests/test_processing.py +++ b/tests/test_processing.py @@ -798,53 +798,54 @@ def test_pd_fill_gap(self): def test_combiner(self): test_df = pd.DataFrame( { - "Tin__°C__building": [10.0, 20.0, 30.0], - "Text__°C__outdoor": [-1.0, 5.0, 4.0], - "radiation__W/m2__outdoor": [50, 100, 400], - "Humidity__%HR": [10, 15, 13], - "Humidity__%HR__room1": [20, 30, 50], - "Humidity_2": [10, 15, 13], - "light__DIMENSIONLESS__building": [100, 200, 300], - "mass_flwr__m3/h__hvac": [300, 500, 600], + "Tin__°C__building__room_1": [10.0, 20.0, 30.0], + "Tin__°C__building__room_2": [20.0, 40.0, 60.0], + "Text__°C__outdoor__meteo": [-1.0, 5.0, 4.0], + "radiation__W/m2__outdoor__meteo": [50, 100, 400], + "Humidity__%HR__building__room_1": [10, 15, 13], + "Humidity__%HR__building__room_2": [20, 30, 50], + "Humidity__%HR__outdoor__meteo": [10, 15, 13], + "light__DIMENSIONLESS__building__room_1": [100, 200, 300], + "mass_flwr__m3/h__hvac__pump": [300, 500, 600], }, index=pd.date_range("2009", freq="h", periods=3, tz="UTC"), ) combiner = ExpressionCombine( columns_dict={ - "T1": "Tin__°C__building", - "T2": "Text__°C__outdoor", - "m": "mass_flwr__m3/h__hvac", + "T1": "Tin__°C__building__room_1", + "T2": "Text__°C__outdoor__meteo", + "m": "mass_flwr__m3/h__hvac__pump", }, expression="(T1 - T2) * m * 1004 * 1.204", - result_column_name="loss_ventilation__J__hvac", + result_column_name="loss_ventilation__J__building__room_1", ) res = combiner.fit_transform(test_df.copy()) check_feature_names_out(combiner, res) np.testing.assert_almost_equal( - res["loss_ventilation__J__hvac"], + res["loss_ventilation__J__building__room_1"], [3989092.8, 9066120.0, 18857529.6], decimal=1, ) combiner.set_params(drop_columns=True) res = combiner.fit_transform(test_df.copy()) - assert res.shape == (3, 6) + assert res.shape == (3, 7) check_feature_names_out(combiner, res) combiner_cond = ExpressionCombine( columns_dict={ - "T1": "Text__°C__outdoor", + "T1": "Text__°C__outdoor__meteo", }, expression="(T1 > 10) * 1", - result_column_name="where_test_01__hvac", + result_column_name="where_test_01__meteo__outdoor", ) res = combiner_cond.fit_transform(test_df.copy()) check_feature_names_out(combiner_cond, res) np.testing.assert_almost_equal( - res["where_test_01__hvac"], + res["where_test_01__meteo__outdoor"], [0, 0, 0], decimal=1, ) diff --git a/tide/processing.py b/tide/processing.py index 3ae3ffb..04848da 100644 --- a/tide/processing.py +++ b/tide/processing.py @@ -1973,15 +1973,16 @@ class ExpressionCombine(BaseProcessing): """A transformer that combines DataFrame columns using a mathematical expression. This transformer evaluates a mathematical expression using specified columns from a DataFrame, - creating a new column with the result. It supports both simple aggregations and complex - physical expressions, with the option to drop the source columns after computation. + creating new columns with the results. 
It supports broadcasting: when a tag selector matches + multiple columns, the expression is applied to each matching group. Selectors matching a + single column are broadcast across all groups. Parameters ---------- columns_dict : dict[str, str] - Dictionary mapping expression variables to DataFrame column names. + Dictionary mapping expression variables to TIDE-style column tag patterns. Keys are the variable names used in the expression, and values are the - corresponding column names in the DataFrame. + tag patterns to match columns (e.g., "Tin__°C__building"). expression : str Mathematical expression to evaluate, using variables defined in columns_dict. @@ -1989,82 +1990,285 @@ class ExpressionCombine(BaseProcessing): evaluated using pandas.eval(). result_column_name : str - Name of the new column that will contain the evaluated expression result. - Must not already exist in the DataFrame. + Base name for result columns. If multiple column groups are found, + the sub-bloc suffix from matched columns is appended. drop_columns : bool, default=False Whether to drop the source columns used in the expression after computation. - If True, only the result column and other non-source columns are kept. + + tide_request_func : callable, optional + Function to resolve tag patterns to column names. Should have signature + (data_columns, request) -> list[str]. If None, uses the global tide_request. Attributes ---------- feature_names_out_ : list[str] - List of column names in the transformed DataFrame. If drop_columns is True, - excludes the source columns used in the expression. + List of column names in the transformed DataFrame. Raises ------ ValueError + If tag patterns resolve to incompatible column counts (not 1 and not equal). If result_column_name already exists in the DataFrame. Examples -------- - >>> from tide.processing import ExpressionCombine >>> import pandas as pd - >>> # Create sample data - >>> df = pd.DataFrame( - ... { - ... "Tin__°C__building": [20, 21, 22], - ... "Text__°C__outdoor": [10, 11, 12], - ... "mass_flwr__m3/h__hvac": [1, 2, 3], - ... } - ... ) - >>> # Calculate ventilation losses + >>> df = pd.DataFrame({ + ... "Tin__°C__building__room_1": [20, 21, 22], + ... "Tin__°C__building__room_2": [25, 26, 27], + ... "Text__°C__outdoor__meteo": [10, 11, 12], + ... "mass_flwr__m3/h__hvac__pump": [1, 2, 3], + ... }) + >>> >>> combiner = ExpressionCombine( ... columns_dict={ ... "T1": "Tin__°C__building", - ... "T2": "Text__°C__outdoor", - ... "m": "mass_flwr__m3/h__hvac", + ... "T2": "Text__°C__outdoor__meteo", + ... "m": "mass_flwr__m3/h__hvac__pump", ... }, ... expression="(T1 - T2) * m * 1004 * 1.204", - ... result_column_name="loss_ventilation__J__hvac", - ... drop_columns=True, + ... result_column_name="loss_ventilation__J__building", ... 
) - >>> # Transform the data + >>> >>> result = combiner.fit_transform(df) + >>> # Creates: loss_ventilation__J__building__room_1 and + >>> # loss_ventilation__J__building__room_2 """ def __init__( - self, - columns_dict: dict[str, str], - expression: str, - result_column_name: str, - drop_columns: bool = False, + self, + columns_dict: dict[str, str], + expression: str, + result_column_name: str, + drop_columns: bool = False, ): - BaseProcessing.__init__(self, required_columns=list(columns_dict.values())) self.columns_dict = columns_dict - self.required_columns = list(columns_dict.values()) self.expression = expression self.result_column_name = result_column_name self.drop_columns = drop_columns + # Will be populated during fit + self._column_groups = None + self._result_columns = None + + # Initialize BaseProcessing with empty list, will be updated in fit + BaseProcessing.__init__(self, required_columns=[]) + + def _resolve_column_groups(self, X: pd.DataFrame) -> tuple[list[dict], list[str]]: + """Resolve tag patterns to column groups and validate broadcasting rules. + + Returns + ------- + column_groups : list[dict] + List of dicts mapping variable names to actual column names for each group. + result_columns : list[str] + List of result column names with appropriate suffixes. + """ + # Resolve each tag pattern to actual columns + resolved = {} + for var, tag_pattern in self.columns_dict.items(): + matched_cols = self.tide_request_func(X.columns, tag_pattern) + if not matched_cols: + raise ValueError( + f"Tag pattern '{tag_pattern}' for variable '{var}' " + f"matched no columns in DataFrame" + ) + resolved[var] = matched_cols + + # Determine the number of groups (max non-1 length) + counts = {var: len(cols) for var, cols in resolved.items()} + non_one_counts = [c for c in counts.values() if c > 1] + + if not non_one_counts: + # All variables match exactly 1 column + n_groups = 1 + else: + # Check all non-1 counts are equal + if len(set(non_one_counts)) > 1: + raise ValueError( + f"Incompatible column counts for broadcasting: {counts}. " + f"All counts must be 1 or equal to each other." 
+ ) + n_groups = non_one_counts[0] + + # Build column groups + column_groups = [] + result_columns = [] + + for i in range(n_groups): + group = {} + for var, cols in resolved.items(): + # Broadcast single columns across all groups + group[var] = cols[0] if len(cols) == 1 else cols[i] + column_groups.append(group) + + # Extract sub-bloc suffix from any multi-column variable + suffix = "" + for var, cols in resolved.items(): + if len(cols) > 1: + # Extract suffix after the tag pattern + col_name = cols[i] + tag_pattern = self.columns_dict[var] + # Find the suffix by removing the base pattern + parts = col_name.split("__") + pattern_parts = tag_pattern.split("__") + # The suffix is the parts after the pattern + if len(parts) > len(pattern_parts): + suffix = "__" + "__".join(parts[len(pattern_parts):]) + break + + result_columns.append(self.result_column_name + suffix) + + return column_groups, result_columns + def _fit_implementation(self, X, y=None): + """Fit the transformer by resolving column groups.""" + self._column_groups, self._result_columns = self._resolve_column_groups(X) + + # Update required columns for BaseProcessing + all_cols = set() + for group in self._column_groups: + all_cols.update(group.values()) + self.required_columns = list(all_cols) + + # Build feature_names_out_ if self.drop_columns: - self.feature_names_out_ = list(X.columns.drop(self.required_columns)) - if self.result_column_name in self.feature_names_out_: - raise ValueError( - f"label_name {self.result_column_name} already in X columns. " - f"It cannot be overwritten" + self.feature_names_out_ = [ + col for col in X.columns if col not in self.required_columns + ] + else: + self.feature_names_out_ = list(X.columns) + + # Check for conflicts + for result_col in self._result_columns: + if result_col in self.feature_names_out_: + raise ValueError( + f"Result column '{result_col}' already exists in DataFrame. " + f"It cannot be overwritten." + ) + + self.feature_names_out_.extend(self._result_columns) + + return self + + def _transform_implementation(self, X: pd.DataFrame) -> pd.DataFrame: + """Transform the DataFrame by evaluating expression for each column group.""" + # Create output DataFrame (vectorized approach) + result = X.copy() if not self.drop_columns else X[ + [col for col in X.columns if col not in self.required_columns] + ].copy() + + # Evaluate expression for each group (vectorized) + for group, result_col in zip(self._column_groups, self._result_columns): + # Build local_dict for pd.eval + local_dict = {var: X[col].values for var, col in group.items()} + + # Evaluate expression using numpy arrays for speed + result[result_col] = pd.eval( + self.expression, + local_dict=local_dict, + engine='numexpr' # Faster for numeric operations ) - self.feature_names_out_.append(self.result_column_name) - def _transform_implementation(self, X: pd.Series | pd.DataFrame): - X.loc[:, self.result_column_name] = pd.eval( - self.expression, - target=X, - local_dict={var: X[col] for var, col in self.columns_dict.items()}, - ) - return X[self.feature_names_out_] + return result[self.feature_names_out_] + +# class ExpressionCombine(BaseProcessing): +# """A transformer that combines DataFrame columns using a mathematical expression. +# +# This transformer evaluates a mathematical expression using specified columns from a DataFrame, +# creating a new column with the result. It supports both simple aggregations and complex +# physical expressions, with the option to drop the source columns after computation. 
+# +# Parameters +# ---------- +# columns_dict : dict[str, str] +# Dictionary mapping expression variables to DataFrame column names. +# Keys are the variable names used in the expression, and values are the +# corresponding column names in the DataFrame. +# +# expression : str +# Mathematical expression to evaluate, using variables defined in columns_dict. +# The expression should be a valid Python mathematical expression that can be +# evaluated using pandas.eval(). +# +# result_column_name : str +# Name of the new column that will contain the evaluated expression result. +# Must not already exist in the DataFrame. +# +# drop_columns : bool, default=False +# Whether to drop the source columns used in the expression after computation. +# If True, only the result column and other non-source columns are kept. +# +# Attributes +# ---------- +# feature_names_out_ : list[str] +# List of column names in the transformed DataFrame. If drop_columns is True, +# excludes the source columns used in the expression. +# +# Raises +# ------ +# ValueError +# If result_column_name already exists in the DataFrame. +# +# Examples +# -------- +# >>> from tide.processing import ExpressionCombine +# >>> import pandas as pd +# >>> # Create sample data +# >>> df = pd.DataFrame( +# ... { +# ... "Tin__°C__building": [20, 21, 22], +# ... "Text__°C__outdoor": [10, 11, 12], +# ... "mass_flwr__m3/h__hvac": [1, 2, 3], +# ... } +# ... ) +# >>> # Calculate ventilation losses +# >>> combiner = ExpressionCombine( +# ... columns_dict={ +# ... "T1": "Tin__°C__building", +# ... "T2": "Text__°C__outdoor", +# ... "m": "mass_flwr__m3/h__hvac", +# ... }, +# ... expression="(T1 - T2) * m * 1004 * 1.204", +# ... result_column_name="loss_ventilation__J__hvac", +# ... drop_columns=True, +# ... ) +# >>> # Transform the data +# >>> result = combiner.fit_transform(df) +# """ +# +# def __init__( +# self, +# columns_dict: dict[str, str], +# expression: str, +# result_column_name: str, +# drop_columns: bool = False, +# ): +# BaseProcessing.__init__(self, required_columns=list(columns_dict.values())) +# self.columns_dict = columns_dict +# self.required_columns = list(columns_dict.values()) +# self.expression = expression +# self.result_column_name = result_column_name +# self.drop_columns = drop_columns +# +# def _fit_implementation(self, X, y=None): +# if self.drop_columns: +# self.feature_names_out_ = list(X.columns.drop(self.required_columns)) +# if self.result_column_name in self.feature_names_out_: +# raise ValueError( +# f"label_name {self.result_column_name} already in X columns. 
" +# f"It cannot be overwritten" +# ) +# self.feature_names_out_.append(self.result_column_name) +# +# def _transform_implementation(self, X: pd.Series | pd.DataFrame): +# X.loc[:, self.result_column_name] = pd.eval( +# self.expression, +# target=X, +# local_dict={var: X[col] for var, col in self.columns_dict.items()}, +# ) +# return X[self.feature_names_out_] class FillOikoMeteo(BaseFiller, BaseOikoMeteo, BaseProcessing): From 019a02d31a64c588c3b51bd591f2a0a5662c6f9b Mon Sep 17 00:00:00 2001 From: BaptisteDE Date: Wed, 14 Jan 2026 11:19:35 +0100 Subject: [PATCH 2/3] =?UTF-8?q?=E2=9C=A8=20ExpressionCombine?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_processing.py | 23 ++++ tide/math.py | 2 +- tide/processing.py | 271 ++++++++++++++++----------------------- 3 files changed, 135 insertions(+), 161 deletions(-) diff --git a/tests/test_processing.py b/tests/test_processing.py index e55f29e..46c60a4 100644 --- a/tests/test_processing.py +++ b/tests/test_processing.py @@ -850,6 +850,29 @@ def test_combiner(self): decimal=1, ) + combiner_broadcast = ExpressionCombine( + columns_dict={ + "T_in": "Tin__°C__building", + "T_ext": "Text__°C", + "m": "mass_flwr__m3/h__hvac__pump", + }, + expression="(T_in - T_ext) * m * 1004 * 1.204", + result_column_name="loss_ventilation__J__building", + ) + + res = combiner_broadcast.fit_transform(test_df.copy()) + np.testing.assert_almost_equal( + res["loss_ventilation__J__building__room_1"], + [3989092.8, 9066120.0, 18857529.6], + decimal=1, + ) + + np.testing.assert_almost_equal( + res["loss_ventilation__J__building__room_2"], + [7615540.8, 21154280.0, 40616217.6], + decimal=1, + ) + @patch("tide.base.get_oikolab_df", side_effect=mock_get_oikolab_df) def test_fill_oiko_meteo(self, mock_get_oikolab): data = pd.read_csv( diff --git a/tide/math.py b/tide/math.py index de1d187..4b9a398 100644 --- a/tide/math.py +++ b/tide/math.py @@ -96,7 +96,7 @@ def time_integrate(data: pd.DataFrame | pd.Series) -> pd.Series: t = (data.index.view("int64") - data.index[0].value) * 1e-9 # seconds y = data.to_numpy() - result = np.trapz(y, t, axis=0) + result = np.trapezoid(y, t, axis=0) return pd.Series(result, index=data.columns) diff --git a/tide/processing.py b/tide/processing.py index 04848da..8bfb7e5 100644 --- a/tide/processing.py +++ b/tide/processing.py @@ -1972,9 +1972,10 @@ def _transform_implementation(self, X: pd.Series | pd.DataFrame): class ExpressionCombine(BaseProcessing): """A transformer that combines DataFrame columns using a mathematical expression. - This transformer evaluates a mathematical expression using specified columns from a DataFrame, - creating new columns with the results. It supports broadcasting: when a tag selector matches - multiple columns, the expression is applied to each matching group. Selectors matching a + This transformer evaluates a mathematical expression using specified columns + from a DataFrame, creating new columns with the results. + It supports broadcasting: when a tag selector matches multiple columns, + the expression is applied to each matching group. Selectors matching a single column are broadcast across all groups. Parameters @@ -1986,73 +1987,132 @@ class ExpressionCombine(BaseProcessing): expression : str Mathematical expression to evaluate, using variables defined in columns_dict. - The expression should be a valid Python mathematical expression that can be - evaluated using pandas.eval(). 
+ The expression must be a valid Python expression compatible with pandas.eval(). + Supports standard mathematical operators and numpy functions. result_column_name : str - Base name for result columns. If multiple column groups are found, - the sub-bloc suffix from matched columns is appended. + Base name for result columns following TIDE naming convention. + When multiple column groups are found, the sub-bloc suffix from matched + columns (e.g., "__room_1", "__room_2") is automatically appended. drop_columns : bool, default=False Whether to drop the source columns used in the expression after computation. - - tide_request_func : callable, optional - Function to resolve tag patterns to column names. Should have signature - (data_columns, request) -> list[str]. If None, uses the global tide_request. + If True, only the newly created result columns and other unused columns are kept. Attributes ---------- + column_groups_ : list[dict] + List of dictionaries mapping variable names to actual column names for each group. + Created during fit(). + + result_columns_ : list[str] + List of result column names with appropriate suffixes. Created during fit(). + + required_columns : list[str] + List of all source columns used in the expression. Updated during fit(). + feature_names_out_ : list[str] - List of column names in the transformed DataFrame. + List of all column names in the transformed DataFrame. + + Notes + ----- + Broadcasting Rules: + - If all tag patterns match exactly one column, a single result column is created. + - If some patterns match multiple columns, all multi-column patterns must match + the same number of columns. + - Single-column matches are broadcast (reused) across all groups. + - The sub-bloc suffix is extracted from the first multi-column variable found. + + Column Naming: + - Result columns follow the pattern: name__unit__bloc__sub-bloc + - The sub-bloc suffix is extracted by comparing matched column names with + their tag patterns. + - If no multi-column variables exist, no suffix is added. Raises ------ ValueError - If tag patterns resolve to incompatible column counts (not 1 and not equal). - If result_column_name already exists in the DataFrame. + - If a tag pattern matches no columns in the DataFrame. + - If tag patterns resolve to incompatible column counts (multiple patterns + match different numbers of columns, none being 1). + - If result_column_name (with suffix) already exists in the output DataFrame. Examples -------- + Basic usage with broadcasting: + >>> import pandas as pd - >>> df = pd.DataFrame({ - ... "Tin__°C__building__room_1": [20, 21, 22], - ... "Tin__°C__building__room_2": [25, 26, 27], - ... "Text__°C__outdoor__meteo": [10, 11, 12], - ... "mass_flwr__m3/h__hvac__pump": [1, 2, 3], - ... }) + >>> df = pd.DataFrame( + ... { + ... "Tin__°C__building__room_1": [20, 21, 22], + ... "Tin__°C__building__room_2": [25, 26, 27], + ... "Text__°C__outdoor__meteo": [10, 11, 12], + ... "mass_flwr__kg/s__hvac__pump": [0.1, 0.2, 0.3], + ... } + ... ) >>> + >>> # Calculate ventilation heat loss for each room + >>> # Text and mass_flwr are broadcast to both rooms >>> combiner = ExpressionCombine( ... columns_dict={ - ... "T1": "Tin__°C__building", - ... "T2": "Text__°C__outdoor__meteo", - ... "m": "mass_flwr__m3/h__hvac__pump", + ... "Tin": "Tin__°C__building", + ... "Text": "Text__°C__outdoor__meteo", + ... "mdot": "mass_flwr__kg/s__hvac__pump", ... }, - ... expression="(T1 - T2) * m * 1004 * 1.204", - ... result_column_name="loss_ventilation__J__building", + ... 
expression="(Tin - Text) * mdot * 1004", # Q = ΔT × ṁ × cp + ... result_column_name="Q_vent__W__building", ... ) >>> >>> result = combiner.fit_transform(df) - >>> # Creates: loss_ventilation__J__building__room_1 and - >>> # loss_ventilation__J__building__room_2 + >>> # Creates: Q_vent__W__building__room_1 and Q_vent__W__building__room_2 + + Using drop_columns to keep only results: + + >>> combiner_clean = ExpressionCombine( + ... columns_dict={ + ... "Tin": "Tin__°C__building", + ... "Text": "Text__°C__outdoor__meteo", + ... }, + ... expression="Tin - Text", + ... result_column_name="deltaT__K__building", + ... drop_columns=True, + ... ) + >>> + >>> result = combiner_clean.fit_transform(df) + >>> # Only deltaT__K__building__room_1, deltaT__K__building__room_2, + >>> # and mass_flwr__kg/s__hvac__pump remain + + Single column operation (no broadcasting): + + >>> df_simple = pd.DataFrame( + ... { + ... "P__W__hvac": [100, 200, 300], + ... "t__s__hvac": [3600, 3600, 3600], + ... } + ... ) + >>> + >>> energy_calc = ExpressionCombine( + ... columns_dict={"P": "P__W__hvac", "t": "t__s__hvac"}, + ... expression="P * t", + ... result_column_name="E__J__hvac", + ... ) + >>> + >>> result = energy_calc.fit_transform(df_simple) + >>> # Creates single column: E__J__hvac """ def __init__( - self, - columns_dict: dict[str, str], - expression: str, - result_column_name: str, - drop_columns: bool = False, + self, + columns_dict: dict[str, str], + expression: str, + result_column_name: str, + drop_columns: bool = False, ): self.columns_dict = columns_dict self.expression = expression self.result_column_name = result_column_name self.drop_columns = drop_columns - # Will be populated during fit - self._column_groups = None - self._result_columns = None - - # Initialize BaseProcessing with empty list, will be updated in fit BaseProcessing.__init__(self, required_columns=[]) def _resolve_column_groups(self, X: pd.DataFrame) -> tuple[list[dict], list[str]]: @@ -2065,10 +2125,9 @@ def _resolve_column_groups(self, X: pd.DataFrame) -> tuple[list[dict], list[str] result_columns : list[str] List of result column names with appropriate suffixes. """ - # Resolve each tag pattern to actual columns resolved = {} for var, tag_pattern in self.columns_dict.items(): - matched_cols = self.tide_request_func(X.columns, tag_pattern) + matched_cols = tide_request(X.columns, tag_pattern) if not matched_cols: raise ValueError( f"Tag pattern '{tag_pattern}' for variable '{var}' " @@ -2076,15 +2135,12 @@ def _resolve_column_groups(self, X: pd.DataFrame) -> tuple[list[dict], list[str] ) resolved[var] = matched_cols - # Determine the number of groups (max non-1 length) counts = {var: len(cols) for var, cols in resolved.items()} non_one_counts = [c for c in counts.values() if c > 1] if not non_one_counts: - # All variables match exactly 1 column n_groups = 1 else: - # Check all non-1 counts are equal if len(set(non_one_counts)) > 1: raise ValueError( f"Incompatible column counts for broadcasting: {counts}. 
" @@ -2092,7 +2148,6 @@ def _resolve_column_groups(self, X: pd.DataFrame) -> tuple[list[dict], list[str] ) n_groups = non_one_counts[0] - # Build column groups column_groups = [] result_columns = [] @@ -2107,15 +2162,12 @@ def _resolve_column_groups(self, X: pd.DataFrame) -> tuple[list[dict], list[str] suffix = "" for var, cols in resolved.items(): if len(cols) > 1: - # Extract suffix after the tag pattern col_name = cols[i] tag_pattern = self.columns_dict[var] - # Find the suffix by removing the base pattern parts = col_name.split("__") pattern_parts = tag_pattern.split("__") - # The suffix is the parts after the pattern if len(parts) > len(pattern_parts): - suffix = "__" + "__".join(parts[len(pattern_parts):]) + suffix = "__" + "__".join(parts[len(pattern_parts) :]) break result_columns.append(self.result_column_name + suffix) @@ -2124,15 +2176,14 @@ def _resolve_column_groups(self, X: pd.DataFrame) -> tuple[list[dict], list[str] def _fit_implementation(self, X, y=None): """Fit the transformer by resolving column groups.""" - self._column_groups, self._result_columns = self._resolve_column_groups(X) + self.column_groups_, self.result_columns_ = self._resolve_column_groups(X) # Update required columns for BaseProcessing all_cols = set() - for group in self._column_groups: + for group in self.column_groups_: all_cols.update(group.values()) self.required_columns = list(all_cols) - # Build feature_names_out_ if self.drop_columns: self.feature_names_out_ = [ col for col in X.columns if col not in self.required_columns @@ -2141,135 +2192,35 @@ def _fit_implementation(self, X, y=None): self.feature_names_out_ = list(X.columns) # Check for conflicts - for result_col in self._result_columns: + for result_col in self.result_columns_: if result_col in self.feature_names_out_: raise ValueError( f"Result column '{result_col}' already exists in DataFrame. " f"It cannot be overwritten." ) - self.feature_names_out_.extend(self._result_columns) + self.feature_names_out_.extend(self.result_columns_) return self def _transform_implementation(self, X: pd.DataFrame) -> pd.DataFrame: - """Transform the DataFrame by evaluating expression for each column group.""" - # Create output DataFrame (vectorized approach) - result = X.copy() if not self.drop_columns else X[ - [col for col in X.columns if col not in self.required_columns] - ].copy() - - # Evaluate expression for each group (vectorized) - for group, result_col in zip(self._column_groups, self._result_columns): + result = ( + X + if not self.drop_columns + else X[[col for col in X.columns if col not in self.required_columns]] + ) + + for group, result_col in zip(self.column_groups_, self.result_columns_): # Build local_dict for pd.eval local_dict = {var: X[col].values for var, col in group.items()} - # Evaluate expression using numpy arrays for speed - result[result_col] = pd.eval( + result.loc[:, result_col] = pd.eval( self.expression, local_dict=local_dict, - engine='numexpr' # Faster for numeric operations ) return result[self.feature_names_out_] -# class ExpressionCombine(BaseProcessing): -# """A transformer that combines DataFrame columns using a mathematical expression. -# -# This transformer evaluates a mathematical expression using specified columns from a DataFrame, -# creating a new column with the result. It supports both simple aggregations and complex -# physical expressions, with the option to drop the source columns after computation. 
-# -# Parameters -# ---------- -# columns_dict : dict[str, str] -# Dictionary mapping expression variables to DataFrame column names. -# Keys are the variable names used in the expression, and values are the -# corresponding column names in the DataFrame. -# -# expression : str -# Mathematical expression to evaluate, using variables defined in columns_dict. -# The expression should be a valid Python mathematical expression that can be -# evaluated using pandas.eval(). -# -# result_column_name : str -# Name of the new column that will contain the evaluated expression result. -# Must not already exist in the DataFrame. -# -# drop_columns : bool, default=False -# Whether to drop the source columns used in the expression after computation. -# If True, only the result column and other non-source columns are kept. -# -# Attributes -# ---------- -# feature_names_out_ : list[str] -# List of column names in the transformed DataFrame. If drop_columns is True, -# excludes the source columns used in the expression. -# -# Raises -# ------ -# ValueError -# If result_column_name already exists in the DataFrame. -# -# Examples -# -------- -# >>> from tide.processing import ExpressionCombine -# >>> import pandas as pd -# >>> # Create sample data -# >>> df = pd.DataFrame( -# ... { -# ... "Tin__°C__building": [20, 21, 22], -# ... "Text__°C__outdoor": [10, 11, 12], -# ... "mass_flwr__m3/h__hvac": [1, 2, 3], -# ... } -# ... ) -# >>> # Calculate ventilation losses -# >>> combiner = ExpressionCombine( -# ... columns_dict={ -# ... "T1": "Tin__°C__building", -# ... "T2": "Text__°C__outdoor", -# ... "m": "mass_flwr__m3/h__hvac", -# ... }, -# ... expression="(T1 - T2) * m * 1004 * 1.204", -# ... result_column_name="loss_ventilation__J__hvac", -# ... drop_columns=True, -# ... ) -# >>> # Transform the data -# >>> result = combiner.fit_transform(df) -# """ -# -# def __init__( -# self, -# columns_dict: dict[str, str], -# expression: str, -# result_column_name: str, -# drop_columns: bool = False, -# ): -# BaseProcessing.__init__(self, required_columns=list(columns_dict.values())) -# self.columns_dict = columns_dict -# self.required_columns = list(columns_dict.values()) -# self.expression = expression -# self.result_column_name = result_column_name -# self.drop_columns = drop_columns -# -# def _fit_implementation(self, X, y=None): -# if self.drop_columns: -# self.feature_names_out_ = list(X.columns.drop(self.required_columns)) -# if self.result_column_name in self.feature_names_out_: -# raise ValueError( -# f"label_name {self.result_column_name} already in X columns. " -# f"It cannot be overwritten" -# ) -# self.feature_names_out_.append(self.result_column_name) -# -# def _transform_implementation(self, X: pd.Series | pd.DataFrame): -# X.loc[:, self.result_column_name] = pd.eval( -# self.expression, -# target=X, -# local_dict={var: X[col] for var, col in self.columns_dict.items()}, -# ) -# return X[self.feature_names_out_] - class FillOikoMeteo(BaseFiller, BaseOikoMeteo, BaseProcessing): """A transformer that fills data gaps using meteorological data from the Oikolab API. 
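The broadcasting behaviour added above can be exercised end to end as follows. This is a minimal sketch based on the new docstring and on test_combiner in this series: it assumes the import path `from tide.processing import ExpressionCombine` shown in the earlier docstring, and that `tide_request` resolves the partial tag "Tin__°C__building" to both room columns.

import pandas as pd
from tide.processing import ExpressionCombine

df = pd.DataFrame(
    {
        "Tin__°C__building__room_1": [10.0, 20.0, 30.0],
        "Tin__°C__building__room_2": [20.0, 40.0, 60.0],
        "Text__°C__outdoor__meteo": [-1.0, 5.0, 4.0],
        "mass_flwr__m3/h__hvac__pump": [300, 500, 600],
    },
    index=pd.date_range("2009", freq="h", periods=3, tz="UTC"),
)

# "Tin__°C__building" matches both rooms; the two single-column selectors are
# broadcast to every group, so the expression is evaluated once per room.
combiner = ExpressionCombine(
    columns_dict={
        "T_in": "Tin__°C__building",
        "T_ext": "Text__°C__outdoor__meteo",
        "m": "mass_flwr__m3/h__hvac__pump",
    },
    expression="(T_in - T_ext) * m * 1004 * 1.204",
    result_column_name="loss_ventilation__J__building",
)

res = combiner.fit_transform(df)
# The sub-bloc suffix of each matched "Tin" column is appended to the result name,
# matching the assertions in test_combiner:
#   loss_ventilation__J__building__room_1 -> [3989092.8, 9066120.0, 18857529.6]
#   loss_ventilation__J__building__room_2 -> [7615540.8, 21154280.0, 40616217.6]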
From 78acae3306212fec21c7c43b829811acd4ac4e1f Mon Sep 17 00:00:00 2001 From: BaptisteDE Date: Wed, 14 Jan 2026 11:24:18 +0100 Subject: [PATCH 3/3] =?UTF-8?q?=F0=9F=90=9B=20trapz=20back=20for=20old=20v?= =?UTF-8?q?ersion?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tide/math.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tide/math.py b/tide/math.py index 4b9a398..de1d187 100644 --- a/tide/math.py +++ b/tide/math.py @@ -96,7 +96,7 @@ def time_integrate(data: pd.DataFrame | pd.Series) -> pd.Series: t = (data.index.view("int64") - data.index[0].value) * 1e-9 # seconds y = data.to_numpy() - result = np.trapezoid(y, t, axis=0) + result = np.trapz(y, t, axis=0) return pd.Series(result, index=data.columns)
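Patch 3 pins time_integrate back to np.trapz so the function keeps working on NumPy releases that predate the np.trapezoid rename. A hedged alternative, not part of this series, is a small compatibility shim that uses whichever name the installed NumPy provides; the helper name below is illustrative only.

import numpy as np
import pandas as pd

try:
    # NumPy >= 2.0 ships the trapezoidal integral under its new name.
    from numpy import trapezoid as _trapezoid
except ImportError:
    # Older NumPy releases only provide trapz.
    from numpy import trapz as _trapezoid


def time_integrate_compat(data: pd.DataFrame) -> pd.Series:
    """Trapezoidal time integral of each column, independent of the NumPy version."""
    # Same construction as time_integrate in tide/math.py: elapsed seconds since the first sample.
    t = (data.index.view("int64") - data.index[0].value) * 1e-9
    y = data.to_numpy()
    return pd.Series(_trapezoid(y, t, axis=0), index=data.columns)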