From 288211c276132b77920b67a16b00982bfe3c802c Mon Sep 17 00:00:00 2001 From: Ysobel Date: Mon, 24 Nov 2025 10:34:56 +1100 Subject: [PATCH 01/15] add hyperparam tuning and multi label input --- README.md | 121 ++++ configs/penguins_multilabel.yaml | 64 ++ configs/possum_multilabel.yaml | 61 ++ ecosci/data.py | 580 ++++++++++++++++- ecosci/eval.py | 1047 ++++++++++++++++++++++++++++-- ecosci/hyperopt.py | 506 +++++++++++++++ ecosci/models.py | 97 ++- ecosci/trainer.py | 336 +++++++++- run.py | 126 +++- 9 files changed, 2846 insertions(+), 92 deletions(-) create mode 100644 configs/penguins_multilabel.yaml create mode 100644 configs/possum_multilabel.yaml create mode 100644 ecosci/hyperopt.py diff --git a/README.md b/README.md index c658bd2..b44695a 100644 --- a/README.md +++ b/README.md @@ -7,8 +7,10 @@ EcoNetToolkit lets you train a shallow neural network or classical models on you - CSV input with automatic preprocessing (impute, scale, encode) - Model zoo: MLP (shallow), Random Forest, SVM, XGBoost, Logistic Regression, Linear Regression +- **Hyperparameter tuning** with grouped train/val/test splits (prevents data leakage) - Repeated training with different seeds for stable estimates - Metrics, including for unbalanced datasets (balanced accuracy, PR AUC) +- K-fold cross-validation with spatial/temporal grouping - Configure the project from a single config file ## Table of Contents @@ -26,6 +28,7 @@ EcoNetToolkit lets you train a shallow neural network or classical models on you - [Available models and key parameters](#available-models-and-key-parameters) - [Notes on metrics](#notes-on-metrics) - [Additional notes](#additional-notes) + - [Hyperparameter Tuning](#hyperparameter-tuning) - [Using your own data](#using-your-own-data) - [Testing](#testing) - [Unit Tests](#unit-tests) @@ -211,6 +214,60 @@ output: **Note:** If `output.dir` is not specified, outputs are automatically saved to `outputs//` where `` is derived from your config file name. +### Multi-output (multi-target) prediction + +EcoNetToolkit supports predicting **multiple target variables simultaneously** (multi-output learning). This is useful when you want to predict several related outcomes from the same features. + +**Example: Multi-output classification** + +```yaml +problem_type: classification + +data: + path: data/palmerpenguins_extended.csv + features: [bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g, island] + labels: [species, sex, life_stage] # Predict 3 labels simultaneously + test_size: 0.2 + val_size: 0.15 + scaling: standard + +models: + - name: random_forest + params: + n_estimators: 100 + max_depth: 15 + +training: + repetitions: 5 +``` + +**Example: Multi-output regression** + +```yaml +problem_type: regression + +data: + path: data/possum.csv + features: [hdlngth, skullw, totlngth, taill] + labels: [age, chest, belly] # Predict 3 continuous values + test_size: 0.2 + scaling: standard + +models: + - name: mlp + params: + hidden_layer_sizes: [32, 16] + max_iter: 500 +``` + +**Key points:** +- Use `labels:` (list) instead of `label:` (single string) to specify multiple targets +- For backward compatibility, `label:` still works for single-output prediction +- Multi-output metrics report mean and standard deviation across all outputs +- Some models support multi-output natively (Random Forest, MLP Regressor), others are wrapped automatically (Logistic Regression, SVM, Linear Regression) + +See `configs/penguins_multilabel.yaml` and `configs/possum_multilabel.yaml` for complete examples. 
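+
+Under the hood, the models listed above as "wrapped automatically" are handled with scikit-learn's `MultiOutputClassifier` / `MultiOutputRegressor`, which fit one copy of the base estimator per target column. The snippet below is a minimal sketch of that behaviour (illustration only; the toolkit applies the wrapper for you, so none of this code is needed in practice):
+
+```python
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+from sklearn.multioutput import MultiOutputClassifier
+
+rng = np.random.default_rng(0)
+X = rng.normal(size=(100, 4))            # 100 samples, 4 features
+y = rng.integers(0, 2, size=(100, 3))    # 3 binary targets per sample
+
+# LogisticRegression is single-output, so one copy is fitted per target column
+clf = MultiOutputClassifier(LogisticRegression(max_iter=1000))
+clf.fit(X, y)
+print(clf.predict(X[:5]).shape)          # (5, 3): one prediction per target
+```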
+ ### Available models and key parameters **MLP (Multi-Layer Perceptron)** @@ -265,6 +322,70 @@ output: - For multi-class problems, macro-averaged Precision/Recall/F1 summarise performance across all classes. - Models are ranked by Cohen's kappa (classification) or MSE (regression) to identify the best performer. +## Hyperparameter Tuning + +EcoNetToolkit includes automated hyperparameter tuning with proper train/validation/test splits to prevent data leakage. This is especially important for ecological data with spatial or temporal structure. + +**Quick Example:** + +```bash +python run.py --config configs/mangrove_tuning.yaml +``` + +**Key Features:** + +- **Grouped splits**: Assign groups (e.g., patches, sites, years) to train/val/test sets +- **Automatic search**: GridSearchCV or RandomizedSearchCV to find optimal hyperparameters +- **Multiple seeds**: Run with different random seeds for stable results +- **Proper evaluation**: Tune on train+val, evaluate on held-out test set + +**Example Config:** + +```yaml +problem_type: regression + +data: + path: data/mangrove.csv + cv_group_column: patch_id # Group by spatial patches + n_train_groups: 4 # 4 patches for training + n_val_groups: 2 # 2 patches for validation (tuning) + n_test_groups: 2 # 2 patches for test (final eval) + + labels: [NDVI] + features: [pu_x, pu_y, temperature, ...] + scaling: standard + +# Enable hyperparameter tuning +tuning: + enabled: true + search_method: random # "random" or "grid" + n_iter: 30 # Number of parameter combinations + cv_folds: 3 # CV folds during tuning + scoring: neg_mean_squared_error + n_jobs: -1 # Use all CPU cores + +# Define models and search spaces +models: + - name: random_forest + param_space: + n_estimators: [100, 200, 500, 1000] + max_depth: [10, 20, 30, 50] + min_samples_split: [2, 5, 10] + max_features: [sqrt, log2, 0.5] + +training: + repetitions: 5 + random_seed: 42 +``` + +**Outputs include:** +- Best hyperparameters for each seed +- Validation and test set performance +- Comparison plots +- Trained models with optimal parameters + +For detailed information, see [docs/HYPERPARAMETER_TUNING.md](docs/HYPERPARAMETER_TUNING.md) + ## Using your own data 1. Place your CSV file in the `data` folder. 
diff --git a/configs/penguins_multilabel.yaml b/configs/penguins_multilabel.yaml new file mode 100644 index 0000000..5452b6f --- /dev/null +++ b/configs/penguins_multilabel.yaml @@ -0,0 +1,64 @@ +# Multi-output classification example using Palmer Penguins dataset +# This config demonstrates predicting multiple labels simultaneously + +problem_type: classification # Multi-output classification + +data: + path: data/palmerpenguins_extended.csv + + # Features: Physical measurements and contextual variables + features: + - bill_length_mm # Continuous: length of bill in millimeters + - bill_depth_mm # Continuous: depth of bill in millimeters + - flipper_length_mm # Continuous: flipper length in millimeters + - body_mass_g # Continuous: body mass in grams + - island # Categorical: Biscoe, Dream, Torgensen + - year # Numeric: 2021-2025 + + # Multiple target variables to predict simultaneously + labels: + - species # Adelie, Chinstrap, Gentoo (3 classes) + - sex # male, female (2 classes) + - life_stage # adult, juvenile, chick (3 classes) + + test_size: 0.2 # 20% for testing + val_size: 0.15 # 15% of remaining for validation + random_state: 42 # For reproducibility + scaling: standard # Standardize features + impute_strategy: mean # Imputation for missing values + +# Train multiple models for comparison +models: + # Random Forest - natively supports multi-output + - name: random_forest + params: + n_estimators: 100 + max_depth: 15 + min_samples_split: 5 + min_samples_leaf: 2 + max_features: sqrt + class_weight: balanced + + # MLP - good for multi-output tasks + - name: mlp + params: + hidden_layer_sizes: [64, 32] + max_iter: 300 + early_stopping: true + validation_fraction: 0.1 + n_iter_no_change: 15 + + # Logistic Regression - wrapped with MultiOutputClassifier + - name: logistic + params: + max_iter: 1000 + class_weight: balanced + +# Training configuration +training: + repetitions: 5 # Train 5 times with different seeds + random_seed: 0 + +# Output configuration +output: + dir: outputs/penguins_multilabel diff --git a/configs/possum_multilabel.yaml b/configs/possum_multilabel.yaml new file mode 100644 index 0000000..b293047 --- /dev/null +++ b/configs/possum_multilabel.yaml @@ -0,0 +1,61 @@ +# Multi-output regression example using Possum dataset +# This config demonstrates predicting multiple continuous variables simultaneously + +problem_type: regression # Multi-output regression + +data: + path: data/possum.csv + + # Features: Various possum measurements + features: + - hdlngth # Head length + - skullw # Skull width + - totlngth # Total length + - taill # Tail length + - footlgth # Foot length + - earconch # Ear conch length + - eye # Eye measurement + + # Multiple target variables to predict simultaneously + labels: + - age # Age of the possum + - chest # Chest circumference + - belly # Belly circumference + + test_size: 0.2 # 20% for testing + val_size: 0.15 # 15% of remaining for validation + random_state: 42 # For reproducibility + scaling: standard # Standardize features + impute_strategy: mean # Imputation for missing values + +# Train multiple models for comparison +models: + # Random Forest - natively supports multi-output + - name: random_forest + params: + n_estimators: 100 + max_depth: 10 + min_samples_split: 3 + min_samples_leaf: 2 + + # MLP Regressor - natively supports multi-output + - name: mlp + params: + hidden_layer_sizes: [32, 16] + max_iter: 500 + early_stopping: true + validation_fraction: 0.15 + n_iter_no_change: 20 + + # Linear Regression - wrapped with 
MultiOutputRegressor + - name: linear + params: {} + +# Training configuration +training: + repetitions: 5 # Train 5 times with different seeds + random_seed: 0 + +# Output configuration +output: + dir: outputs/possum_multilabel diff --git a/ecosci/data.py b/ecosci/data.py index c9e705a..99f06d6 100644 --- a/ecosci/data.py +++ b/ecosci/data.py @@ -32,27 +32,66 @@ def __init__( path: str, features: Optional[List[str]] = None, label: Optional[str] = None, + labels: Optional[List[str]] = None, test_size: float = 0.2, val_size: float = 0.2, random_state: int = 0, scaling: str = "standard", impute_strategy: str = "mean", problem_type: str = "classification", + cv_group_column: Optional[str] = None, ): self.path = path self.features = features - self.label = label + # Support both single label and multiple labels + if labels is not None: + self.labels = labels if isinstance(labels, list) else [labels] + self.label = None + elif label is not None: + self.labels = [label] + self.label = label + else: + self.labels = None + self.label = None self.test_size = test_size self.val_size = val_size self.random_state = random_state self.scaling = scaling self.impute_strategy = impute_strategy self.problem_type = problem_type + self.cv_group_column = cv_group_column def load(self) -> pd.DataFrame: """Read the CSV into a DataFrame.""" df = pd.read_csv(self.path) return df + + def _get_feature_names_after_transform(self, preprocessor, numeric_cols, cat_cols): + """Get feature names after preprocessing (including one-hot encoding).""" + feature_names = [] + + # Get feature names from each transformer + for name, transformer, columns in preprocessor.transformers_: + if name == "num": + # Numeric columns keep their names + feature_names.extend(columns) + elif name == "cat": + # For categorical columns, get one-hot encoded names + try: + # Get the OneHotEncoder from the pipeline + onehot = transformer.named_steps.get("onehot") + if onehot is not None and hasattr(onehot, "get_feature_names_out"): + # Get one-hot feature names + cat_features = onehot.get_feature_names_out(columns) + feature_names.extend(cat_features) + else: + # Fallback: just use column names + feature_names.extend(columns) + except Exception: + # Fallback if something goes wrong + feature_names.extend(columns) + + return feature_names def prepare(self) -> Tuple: """Prepare arrays for modeling. 
@@ -79,22 +118,27 @@ def prepare(self) -> Tuple: df = self.load() - if self.label is None: - raise ValueError("Label column must be provided in config") + if self.labels is None: + raise ValueError("Label column(s) must be provided in config") if self.features is None: - # default: all columns except label - self.features = [c for c in df.columns if c != self.label] + # default: all columns except labels + self.features = [c for c in df.columns if c not in self.labels] X = df[self.features].copy() - y = df[self.label].copy() - - # Drop rows with NaN in target variable - mask_target = ~y.isna() - if not mask_target.all(): + + # When single label, DataFrame.iloc returns Series; when multiple, returns DataFrame + if len(self.labels) == 1: + y = df[self.labels[0]].copy() # Get as Series + mask_target = ~y.isna() + else: + y = df[self.labels].copy() # Get as DataFrame + mask_target = ~y.isna().any(axis=1) + + if mask_target.any() and not mask_target.all(): n_dropped_target = (~mask_target).sum() print( - f"Warning: Dropping {n_dropped_target} rows with NaN in target '{self.label}'" + f"Warning: Dropping {n_dropped_target} rows with NaN in target {self.labels}" ) X = X[mask_target] y = y[mask_target] @@ -115,13 +159,38 @@ def prepare(self) -> Tuple: y = y.reset_index(drop=True) # Encode labels if they are strings/categorical (needed for XGBoost and others) - if y.dtype == "object" or y.dtype.name == "category": - label_encoder = LabelEncoder() - y = label_encoder.fit_transform(y) - # Store the encoder for later use if needed - self.label_encoder = label_encoder - self.label_classes = label_encoder.classes_ + # For multi-output, encode each output separately + self.label_encoders = {} + self.label_classes_dict = {} + + if len(self.labels) == 1: + # Single output: maintain backward compatibility + if y.dtype == "object" or y.dtype.name == "category": + label_encoder = LabelEncoder() + y = label_encoder.fit_transform(y) + self.label_encoders[self.labels[0]] = label_encoder + self.label_classes_dict[self.labels[0]] = label_encoder.classes_ + # Keep backward compatibility + self.label_encoder = label_encoder + self.label_classes = label_encoder.classes_ + else: + self.label_encoder = None + self.label_classes = None else: + # Multi-output: encode each column separately + y_encoded = [] + for col in self.labels: + if y[col].dtype == "object" or y[col].dtype.name == "category": + label_encoder = LabelEncoder() + y_col_encoded = label_encoder.fit_transform(y[col]) + self.label_encoders[col] = label_encoder + self.label_classes_dict[col] = label_encoder.classes_ + else: + y_col_encoded = y[col].values + self.label_encoders[col] = None + self.label_classes_dict[col] = None + y_encoded.append(y_col_encoded) + y = np.column_stack(y_encoded) self.label_encoder = None self.label_classes = None @@ -162,13 +231,21 @@ def prepare(self) -> Tuple: preprocessor = ColumnTransformer(transformers, remainder="drop") X_proc = preprocessor.fit_transform(X) + + # Store preprocessor and generate feature names after transformation + self.preprocessor = preprocessor + self.processed_feature_names = self._get_feature_names_after_transform( + preprocessor, numeric_cols, cat_cols + ) # Check number of unique classes for stratification # Only stratify for classification problems with reasonable number of classes - if self.problem_type == "classification": + # For multi-output, we cannot use stratification directly + if self.problem_type == "classification" and len(self.labels) == 1: n_unique = len(np.unique(y)) stratify_y = y if 
n_unique <= 20 else None else: + # Cannot stratify on multi-output or regression stratify_y = None # Split: first test, then validation from remaining @@ -186,7 +263,7 @@ def prepare(self) -> Tuple: else: val_fraction = self.val_size / (1.0 - self.test_size) # Only stratify for classification problems with reasonable number of classes - if self.problem_type == "classification": + if self.problem_type == "classification" and len(self.labels) == 1: n_unique_train = len(np.unique(y_train_val)) stratify_train = y_train_val if n_unique_train <= 20 else None else: @@ -207,3 +284,468 @@ def prepare(self) -> Tuple: (None if y_val is None else np.array(y_val)), np.array(y_test), ) + + def prepare_cv_folds(self): + """Prepare k-fold cross-validation splits using group-based splitting. + + Returns + ------- + list of tuples + Each tuple contains (X_train, X_val, X_test, y_train, y_val, y_test, fold_id) + where fold_id is the test fold number. + + Notes + ----- + - Uses GroupKFold to keep all samples from the same group together in the same fold + - The group is specified by cv_group_column (e.g., 'patch_id') + - Each unique group value becomes a unit for splitting + - Number of folds = number of unique groups + """ + from sklearn.model_selection import GroupKFold + from sklearn.impute import SimpleImputer + from sklearn.preprocessing import ( + StandardScaler, + MinMaxScaler, + OneHotEncoder, + LabelEncoder, + ) + from sklearn.compose import ColumnTransformer + from sklearn.pipeline import Pipeline + import numpy as np + + if self.cv_group_column is None: + raise ValueError("cv_group_column must be specified for k-fold cross-validation") + + df = self.load() + + if self.labels is None: + raise ValueError("Label column(s) must be provided in config") + + if self.features is None: + # default: all columns except labels and group column + self.features = [c for c in df.columns if c not in self.labels and c != self.cv_group_column] + + # Get group IDs + if self.cv_group_column not in df.columns: + raise ValueError(f"Group column '{self.cv_group_column}' not found in data") + + groups = df[self.cv_group_column].copy() + + X = df[self.features].copy() + + # When single label, DataFrame.iloc returns Series; when multiple, returns DataFrame + if len(self.labels) == 1: + y = df[self.labels[0]].copy() # Get as Series + mask_target = ~y.isna() + else: + y = df[self.labels].copy() # Get as DataFrame + mask_target = ~y.isna().any(axis=1) + + if mask_target.any() and not mask_target.all(): + n_dropped_target = (~mask_target).sum() + print( + f"Warning: Dropping {n_dropped_target} rows with NaN in target {self.labels}" + ) + X = X[mask_target] + y = y[mask_target] + groups = groups[mask_target] + + # Drop rows with NaN in any features + mask_features = ~X.isna().any(axis=1) + if not mask_features.all(): + n_dropped_features = (~mask_features).sum() + cols_with_nan = X.columns[X.isna().any()].tolist() + print( + f"Warning: Dropping {n_dropped_features} additional rows with NaN in features: {cols_with_nan}" + ) + X = X[mask_features] + y = y[mask_features] + groups = groups[mask_features] + + # Reset indices after dropping rows + X = X.reset_index(drop=True) + y = y.reset_index(drop=True) + groups = groups.reset_index(drop=True) + + # Encode labels if they are strings/categorical + self.label_encoders = {} + self.label_classes_dict = {} + + if len(self.labels) == 1: + # Single output + if y.dtype == "object" or y.dtype.name == "category": + label_encoder = LabelEncoder() + y = label_encoder.fit_transform(y) + 
self.label_encoders[self.labels[0]] = label_encoder + self.label_classes_dict[self.labels[0]] = label_encoder.classes_ + self.label_encoder = label_encoder + self.label_classes = label_encoder.classes_ + else: + self.label_encoder = None + self.label_classes = None + else: + # Multi-output + y_encoded = [] + for col in self.labels: + if y[col].dtype == "object" or y[col].dtype.name == "category": + label_encoder = LabelEncoder() + y_col_encoded = label_encoder.fit_transform(y[col]) + self.label_encoders[col] = label_encoder + self.label_classes_dict[col] = label_encoder.classes_ + else: + y_col_encoded = y[col].values + self.label_encoders[col] = None + self.label_classes_dict[col] = None + y_encoded.append(y_col_encoded) + y = np.column_stack(y_encoded) + self.label_encoder = None + self.label_classes = None + + # Identify numeric vs categorical + numeric_cols = X.select_dtypes(include=["number"]).columns.tolist() + cat_cols = [c for c in X.columns if c not in numeric_cols] + + transformers = [] + + if numeric_cols: + scaler = StandardScaler() if self.scaling == "standard" else MinMaxScaler() + num_pipeline = Pipeline([("imputer", SimpleImputer(strategy=self.impute_strategy)), ("scaler", scaler)]) + transformers.append(("num", num_pipeline, numeric_cols)) + + if cat_cols: + cat_pipeline = Pipeline([ + ("imputer", SimpleImputer(strategy="most_frequent")), + ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)), + ]) + transformers.append(("cat", cat_pipeline, cat_cols)) + + # Store the preprocessor template and feature names (but don't fit on all data) + self.preprocessor_template = ColumnTransformer(transformers=transformers, remainder="drop") + + # We need to fit preprocessor on a sample to get feature names + # Use just the first group to avoid leakage + sample_preprocessor = ColumnTransformer(transformers=transformers, remainder="drop") + sample_preprocessor.fit(X) + self.processed_feature_names = self._get_feature_names_after_transform(sample_preprocessor, numeric_cols, cat_cols) + + # Determine number of folds based on unique groups + unique_groups = groups.unique() + n_folds = len(unique_groups) + + print(f"\nK-Fold Cross-Validation Setup:") + print(f" Group column: {self.cv_group_column}") + print(f" Number of unique groups: {n_folds}") + print(f" Number of folds: {n_folds}") + print(f" Samples per group: {groups.value_counts().to_dict()}\n") + + # Create k-fold splits + group_kfold = GroupKFold(n_splits=n_folds) + + fold_data = [] + # NOTE: Keep X as DataFrame for ColumnTransformer which uses column names + for fold_idx, (train_val_idx, test_idx) in enumerate(group_kfold.split(X, y, groups)): + # Get raw (unscaled) train_val and test data - keep as DataFrames + X_train_val_raw = X.iloc[train_val_idx] + y_train_val = y[train_val_idx] if isinstance(y, np.ndarray) else y.iloc[train_val_idx] + X_test_raw = X.iloc[test_idx] + y_test = y[test_idx] if isinstance(y, np.ndarray) else y.iloc[test_idx] + + # Create a NEW preprocessor for this fold (to avoid data leakage) + fold_preprocessor = ColumnTransformer(transformers=transformers, remainder="drop") + + # Fit preprocessor ONLY on train_val data (not on test!) 
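+            # Fitting here means the imputer means and scaler statistics are learned
+            # from the training portion of this fold only and are merely applied to
+            # the held-out fold below, so test samples cannot leak into preprocessing.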
+ X_train_val_proc = fold_preprocessor.fit_transform(X_train_val_raw) + + # Transform test data using the fitted preprocessor + X_test_proc = fold_preprocessor.transform(X_test_raw) + + # Further split train_val into train and val + if self.val_size <= 0: + X_train, X_val, y_train, y_val = X_train_val_proc, None, y_train_val, None + else: + # Use a fraction of train_val for validation + from sklearn.model_selection import train_test_split + val_fraction = self.val_size / (1.0 - (1.0 / n_folds)) + + # For stratification + if self.problem_type == "classification" and len(self.labels) == 1: + n_unique_train = len(np.unique(y_train_val)) + stratify_train = y_train_val if n_unique_train <= 20 else None + else: + stratify_train = None + + X_train, X_val, y_train, y_val = train_test_split( + X_train_val_proc, + y_train_val, + test_size=val_fraction, + random_state=self.random_state, + stratify=stratify_train, + ) + + fold_data.append(( + X_train, + X_val, + X_test_proc, + np.array(y_train), + (None if y_val is None else np.array(y_val)), + np.array(y_test), + fold_idx + )) + + return fold_data + + def prepare_grouped_splits( + self, + n_train_groups: int = 4, + n_val_groups: int = 2, + n_test_groups: int = 2, + ): + """Prepare train/val/test splits by assigning groups to each split. + + This method ensures no data leakage by keeping all samples from the same + group together in one split (train, val, or test). Useful for spatial or + temporal data where groups represent locations, patches, or time periods. + + Parameters + ---------- + n_train_groups : int + Number of groups to assign to training set + n_val_groups : int + Number of groups to assign to validation set (for hyperparameter tuning) + n_test_groups : int + Number of groups to assign to test set (for final evaluation) + + Returns + ------- + tuple + (X_train, X_val, X_test, y_train, y_val, y_test, group_assignments) + where group_assignments is a dict mapping split names to group IDs + + Raises + ------ + ValueError + If cv_group_column is not specified or if there aren't enough groups + + Notes + ----- + - Requires cv_group_column to be set in the data config + - Groups are shuffled before assignment to ensure randomness + - Total groups needed = n_train_groups + n_val_groups + n_test_groups + """ + from sklearn.impute import SimpleImputer + from sklearn.preprocessing import ( + StandardScaler, + MinMaxScaler, + OneHotEncoder, + LabelEncoder, + ) + from sklearn.compose import ColumnTransformer + from sklearn.pipeline import Pipeline + import numpy as np + + if self.cv_group_column is None: + raise ValueError( + "cv_group_column must be specified for grouped train/val/test splits" + ) + + df = self.load() + + if self.labels is None: + raise ValueError("Label column(s) must be provided in config") + + if self.features is None: + # default: all columns except labels and group column + self.features = [ + c for c in df.columns + if c not in self.labels and c != self.cv_group_column + ] + + # Get group IDs + if self.cv_group_column not in df.columns: + raise ValueError(f"Group column '{self.cv_group_column}' not found in data") + + groups = df[self.cv_group_column].copy() + unique_groups = groups.unique() + n_groups = len(unique_groups) + + # Validate we have enough groups + total_groups_needed = n_train_groups + n_val_groups + n_test_groups + if n_groups < total_groups_needed: + raise ValueError( + f"Not enough groups: need {total_groups_needed} " + f"({n_train_groups} train + {n_val_groups} val + {n_test_groups} test), " + f"but only 
have {n_groups} unique groups" + ) + + # Shuffle groups for random assignment + rng = np.random.RandomState(self.random_state) + shuffled_groups = rng.permutation(unique_groups) + + # Assign groups to splits + train_groups = shuffled_groups[:n_train_groups] + val_groups = shuffled_groups[n_train_groups:n_train_groups + n_val_groups] + test_groups = shuffled_groups[n_train_groups + n_val_groups:n_train_groups + n_val_groups + n_test_groups] + + group_assignments = { + 'train': train_groups.tolist(), + 'val': val_groups.tolist(), + 'test': test_groups.tolist(), + } + + print(f"\nGrouped Train/Val/Test Split Setup:") + print(f" Group column: {self.cv_group_column}") + print(f" Total unique groups: {n_groups}") + print(f" Train groups ({n_train_groups}): {train_groups.tolist()}") + print(f" Val groups ({n_val_groups}): {val_groups.tolist()}") + print(f" Test groups ({n_test_groups}): {test_groups.tolist()}") + + # Create masks for each split + train_mask = groups.isin(train_groups) + val_mask = groups.isin(val_groups) + test_mask = groups.isin(test_groups) + + print(f"\n Train samples: {train_mask.sum()}") + print(f" Val samples: {val_mask.sum()}") + print(f" Test samples: {test_mask.sum()}\n") + + # Extract features and labels + X = df[self.features].copy() + + if len(self.labels) == 1: + y = df[self.labels[0]].copy() + mask_target = ~y.isna() + else: + y = df[self.labels].copy() + mask_target = ~y.isna().any(axis=1) + + # Drop rows with NaN in target + if mask_target.any() and not mask_target.all(): + n_dropped_target = (~mask_target).sum() + print( + f"Warning: Dropping {n_dropped_target} rows with NaN in target {self.labels}" + ) + X = X[mask_target] + y = y[mask_target] + groups = groups[mask_target] + train_mask = train_mask[mask_target] + val_mask = val_mask[mask_target] + test_mask = test_mask[mask_target] + + # Drop rows with NaN in features + mask_features = ~X.isna().any(axis=1) + if not mask_features.all(): + n_dropped_features = (~mask_features).sum() + cols_with_nan = X.columns[X.isna().any()].tolist() + print( + f"Warning: Dropping {n_dropped_features} additional rows with NaN in features: {cols_with_nan}" + ) + X = X[mask_features] + y = y[mask_features] + groups = groups[mask_features] + train_mask = train_mask[mask_features] + val_mask = val_mask[mask_features] + test_mask = test_mask[mask_features] + + # Reset indices + X = X.reset_index(drop=True) + y = y.reset_index(drop=True) + groups = groups.reset_index(drop=True) + train_mask = train_mask.reset_index(drop=True) + val_mask = val_mask.reset_index(drop=True) + test_mask = test_mask.reset_index(drop=True) + + # Encode labels + self.label_encoders = {} + self.label_classes_dict = {} + + if len(self.labels) == 1: + if y.dtype == "object" or y.dtype.name == "category": + label_encoder = LabelEncoder() + y = label_encoder.fit_transform(y) + self.label_encoders[self.labels[0]] = label_encoder + self.label_classes_dict[self.labels[0]] = label_encoder.classes_ + self.label_encoder = label_encoder + self.label_classes = label_encoder.classes_ + else: + self.label_encoder = None + self.label_classes = None + else: + y_encoded = [] + for col in self.labels: + if y[col].dtype == "object" or y[col].dtype.name == "category": + label_encoder = LabelEncoder() + y_col_encoded = label_encoder.fit_transform(y[col]) + self.label_encoders[col] = label_encoder + self.label_classes_dict[col] = label_encoder.classes_ + else: + y_col_encoded = y[col].values + self.label_encoders[col] = None + self.label_classes_dict[col] = None + 
y_encoded.append(y_col_encoded) + y = np.column_stack(y_encoded) + self.label_encoder = None + self.label_classes = None + + # Build preprocessing pipeline + numeric_cols = X.select_dtypes(include=["number"]).columns.tolist() + cat_cols = [c for c in X.columns if c not in numeric_cols] + + transformers = [] + + if numeric_cols: + scaler = StandardScaler() if self.scaling == "standard" else MinMaxScaler() + num_pipeline = Pipeline([ + ("imputer", SimpleImputer(strategy=self.impute_strategy)), + ("scaler", scaler) + ]) + transformers.append(("num", num_pipeline, numeric_cols)) + + if cat_cols: + cat_pipeline = Pipeline([ + ("imputer", SimpleImputer(strategy="most_frequent")), + ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)), + ]) + transformers.append(("cat", cat_pipeline, cat_cols)) + + preprocessor = ColumnTransformer(transformers=transformers, remainder="drop") + + # Fit preprocessor on training data only (important!) + X_train_raw = X[train_mask] + preprocessor.fit(X_train_raw) + + # Transform all splits + X_proc = preprocessor.transform(X) + + # Store preprocessor and feature names + self.preprocessor = preprocessor + self.processed_feature_names = self._get_feature_names_after_transform( + preprocessor, numeric_cols, cat_cols + ) + + # Split the data + X_train = X_proc[train_mask] + X_val = X_proc[val_mask] + X_test = X_proc[test_mask] + + # y should be numpy array at this point + y_train = y[train_mask] + y_val = y[val_mask] + y_test = y[test_mask] + + # Extract group IDs for each split + groups_train = groups[train_mask].values + groups_val = groups[val_mask].values + groups_test = groups[test_mask].values + + return ( + X_train, + X_val, + X_test, + np.array(y_train), + np.array(y_val), + np.array(y_test), + group_assignments, + groups_train, + groups_val, + groups_test, + ) diff --git a/ecosci/eval.py b/ecosci/eval.py index 6e798dd..427c2e5 100644 --- a/ecosci/eval.py +++ b/ecosci/eval.py @@ -93,6 +93,106 @@ def compute_classification_metrics(y_true, y_pred, y_proba=None) -> Dict[str, An return out +def compute_multi_output_classification_metrics(y_true, y_pred, y_proba=None) -> Dict[str, Any]: + """Compute classification metrics for multi-output predictions. + + For multi-output, we compute metrics per output and aggregate. 
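+    Aggregate values are reported as ``<metric>_mean`` and ``<metric>_std`` (the
+    mean and standard deviation of each metric across the individual outputs), and
+    the full per-output metric dictionaries are kept under the ``per_output`` key.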
+ + Parameters + ---------- + y_true : array-like, shape (n_samples, n_outputs) + Ground truth labels + y_pred : array-like, shape (n_samples, n_outputs) + Predicted labels + y_proba : array-like, shape (n_samples, n_outputs, n_classes) or None + Predicted probabilities (if available) + + Returns + ------- + Dict[str, Any] + Dictionary with average metrics across outputs and per-output metrics + """ + from sklearn.metrics import ( + accuracy_score, + precision_score, + recall_score, + f1_score, + balanced_accuracy_score, + cohen_kappa_score, + ) + + n_outputs = y_true.shape[1] + out = {} + + # Compute per-output metrics + per_output_metrics = [] + for i in range(n_outputs): + y_true_i = y_true[:, i] + y_pred_i = y_pred[:, i] + y_proba_i = y_proba[i] if y_proba is not None and isinstance(y_proba, list) else None + + metrics_i = compute_classification_metrics(y_true_i, y_pred_i, y_proba_i) + per_output_metrics.append(metrics_i) + + # Aggregate metrics (average across outputs) + for key in ['accuracy', 'balanced_accuracy', 'precision', 'recall', 'f1', 'cohen_kappa']: + values = [m[key] for m in per_output_metrics if key in m and m[key] is not None] + if values: + out[f"{key}_mean"] = np.mean(values) + out[f"{key}_std"] = np.std(values) + + # Store per-output metrics + out["per_output"] = per_output_metrics + out["n_outputs"] = n_outputs + + return out + + +def compute_multi_output_regression_metrics(y_true, y_pred) -> Dict[str, Any]: + """Compute regression metrics for multi-output predictions. + + For multi-output regression, we compute metrics per output and aggregate. + + Parameters + ---------- + y_true : array-like, shape (n_samples, n_outputs) + Ground truth values + y_pred : array-like, shape (n_samples, n_outputs) + Predicted values + + Returns + ------- + Dict[str, Any] + Dictionary with average metrics across outputs and per-output metrics + """ + from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score + + n_outputs = y_true.shape[1] + out = {} + + # Compute per-output metrics + per_output_metrics = [] + for i in range(n_outputs): + y_true_i = y_true[:, i] + y_pred_i = y_pred[:, i] + + metrics_i = compute_regression_metrics(y_true_i, y_pred_i) + per_output_metrics.append(metrics_i) + + # Aggregate metrics (average across outputs) + for key in ['mse', 'rmse', 'mae', 'r2']: + values = [m[key] for m in per_output_metrics if key in m and m[key] is not None] + if values: + out[f"{key}_mean"] = np.mean(values) + out[f"{key}_std"] = np.std(values) + + # Store per-output metrics + out["per_output"] = per_output_metrics + out["n_outputs"] = n_outputs + + return out + + def compute_regression_metrics(y_true, y_pred) -> Dict[str, Any]: """Compute regression metrics. @@ -135,7 +235,7 @@ def compute_regression_metrics(y_true, y_pred) -> Dict[str, Any]: def evaluate_and_report( - results, y_test, output_dir: str = "outputs", problem_type: str = "classification" + results, y_test, output_dir: str = "outputs", problem_type: str = "classification", label_names: list = None, feature_names: list = None, X_test=None ): """Compute metrics across runs and save a compact report and plots. @@ -150,6 +250,12 @@ def evaluate_and_report( Where to save the report and plots. problem_type : str "classification" or "regression". Determines which metrics to compute. + label_names : list, optional + Names of the target variables (for multi-output). If None, will use Output 1, Output 2, etc. + feature_names : list, optional + Names of the input features. 
If None, will use Feature 0, Feature 1, etc. + X_test : array-like, optional + Test features for permutation importance. Required for models without built-in importance (e.g., MLPs). """ os.makedirs(output_dir, exist_ok=True) import pandas as pd @@ -160,6 +266,9 @@ def evaluate_and_report( model_name = results[0].get("model_name", "model") if results else "model" results = {model_name: results} + # Determine if we have multi-output + is_multi_output = len(y_test.shape) > 1 and y_test.shape[1] > 1 + # Compute metrics for each model all_summaries = {} all_dfs = {} @@ -171,10 +280,18 @@ def evaluate_and_report( y_pred = r.get("y_pred") y_proba = r.get("y_proba") - if problem_type == "regression": - metrics = compute_regression_metrics(y_test, y_pred) + if is_multi_output: + # Multi-output case + if problem_type == "regression": + metrics = compute_multi_output_regression_metrics(y_test, y_pred) + else: + metrics = compute_multi_output_classification_metrics(y_test, y_pred, y_proba) else: - metrics = compute_classification_metrics(y_test, y_pred, y_proba) + # Single output case + if problem_type == "regression": + metrics = compute_regression_metrics(y_test, y_pred) + else: + metrics = compute_classification_metrics(y_test, y_pred, y_proba) metrics["seed"] = seed metrics["model"] = model_name @@ -200,34 +317,59 @@ def evaluate_and_report( json.dump(combined_summary, f, indent=2) # Print results for each model - if problem_type == "regression": - display_cols = ["seed", "mse", "rmse", "mae", "r2"] - if "mape" in combined_df.columns: - display_cols.append("mape") - comparison_metrics = ["mse", "rmse", "mae", "r2"] - plot_metrics = ["mse", "rmse", "mae", "r2"] + if is_multi_output: + # For multi-output, show aggregated metrics + if problem_type == "regression": + display_cols = ["seed", "mse_mean", "rmse_mean", "mae_mean", "r2_mean"] + comparison_metrics = ["mse_mean", "rmse_mean", "mae_mean", "r2_mean"] + plot_metrics = ["mse_mean", "rmse_mean", "mae_mean", "r2_mean"] + else: + display_cols = [ + "seed", + "accuracy_mean", + "balanced_accuracy_mean", + "precision_mean", + "recall_mean", + "f1_mean", + "cohen_kappa_mean", + ] + comparison_metrics = [ + "accuracy_mean", + "balanced_accuracy_mean", + "f1_mean", + "cohen_kappa_mean", + ] + plot_metrics = ["accuracy_mean", "balanced_accuracy_mean", "f1_mean", "cohen_kappa_mean"] else: - display_cols = [ - "seed", - "accuracy", - "balanced_accuracy", - "precision", - "recall", - "f1", - "cohen_kappa", - ] - if "roc_auc" in combined_df.columns: - display_cols.append("roc_auc") - if "average_precision" in combined_df.columns: - display_cols.append("average_precision") - comparison_metrics = [ - "accuracy", - "balanced_accuracy", - "f1", - "cohen_kappa", - "roc_auc", - ] - plot_metrics = ["accuracy", "balanced_accuracy", "f1", "cohen_kappa"] + # Single output + if problem_type == "regression": + display_cols = ["seed", "mse", "rmse", "mae", "r2"] + if "mape" in combined_df.columns: + display_cols.append("mape") + comparison_metrics = ["mse", "rmse", "mae", "r2"] + plot_metrics = ["mse", "rmse", "mae", "r2"] + else: + display_cols = [ + "seed", + "accuracy", + "balanced_accuracy", + "precision", + "recall", + "f1", + "cohen_kappa", + ] + if "roc_auc" in combined_df.columns: + display_cols.append("roc_auc") + if "average_precision" in combined_df.columns: + display_cols.append("average_precision") + comparison_metrics = [ + "accuracy", + "balanced_accuracy", + "f1", + "cohen_kappa", + "roc_auc", + ] + plot_metrics = ["accuracy", "balanced_accuracy", "f1", 
"cohen_kappa"] for model_name, df in all_dfs.items(): print(f"\n{'='*80}") @@ -250,6 +392,48 @@ def evaluate_and_report( if len(vals) > 0: print(f"{col:20s}: {vals.mean():.4f} ± {safe_std(vals):.4f}") print(f"{'-'*80}\n") + + # For multi-output, print per-output breakdown + if is_multi_output and "per_output" in df.columns[0:1] or (len(df) > 0 and "per_output" in df.iloc[0]): + print(f"\n{'-'*80}") + print(f"PER-OUTPUT BREAKDOWN FOR {model_name.upper()}") + print(f"{'-'*80}") + + # Get per-output metrics from first seed as example structure + first_row = df.iloc[0] + if "per_output" in first_row and first_row["per_output"] is not None: + per_output_list = first_row["per_output"] + n_outputs = len(per_output_list) + + # Collect metrics across all seeds for each output + for output_idx in range(n_outputs): + # Use label name if provided, otherwise use generic Output N + if label_names and output_idx < len(label_names): + output_label = f"{label_names[output_idx]}" + else: + output_label = f"Output {output_idx + 1}" + + print(f"\n{output_label}:") + + # Aggregate metrics across seeds for this output + output_metrics = {} + for _, row in df.iterrows(): + if "per_output" in row and row["per_output"] is not None: + per_out = row["per_output"] + if output_idx < len(per_out): + for key, val in per_out[output_idx].items(): + if key not in ["confusion_matrix", "seed", "model"] and val is not None: + if key not in output_metrics: + output_metrics[key] = [] + output_metrics[key].append(val) + + # Print summary for this output + for metric_name, values in sorted(output_metrics.items()): + if len(values) > 0: + mean_val = np.mean(values) + std_val = safe_std(np.array(values)) + print(f" {metric_name:20s}: {mean_val:.4f} ± {std_val:.4f}") + print(f"{'-'*80}\n") # Model comparison table if len(all_dfs) > 1: @@ -277,12 +461,20 @@ def evaluate_and_report( # Select primary metric based on problem type # For classification: Cohen's kappa is more robust as it accounts for chance agreement # For regression: MSE is standard (lower is better) - if problem_type == "regression": - primary_metric = "mse" - lower_is_better = True + if is_multi_output: + if problem_type == "regression": + primary_metric = "mse_mean" + lower_is_better = True + else: + primary_metric = "cohen_kappa_mean" + lower_is_better = False else: - primary_metric = "cohen_kappa" - lower_is_better = False + if problem_type == "regression": + primary_metric = "mse" + lower_is_better = True + else: + primary_metric = "cohen_kappa" + lower_is_better = False # Calculate mean primary metric for each model model_scores = [] @@ -463,14 +655,14 @@ def evaluate_and_report( pass # Silently skip if comparison can't be generated # Residual plots for regression - elif problem_type == "regression": + elif problem_type == "regression" and not is_multi_output: for model_name, model_results in results.items(): try: last = model_results[-1] y_pred = last.get("y_pred") residuals = y_test - y_pred - ax1, ax2 = plt.subplots(1, 2, figsize=(12, 4)) + fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4)) # Predicted vs Actual ax1.scatter(y_test, y_pred, alpha=0.6) @@ -498,4 +690,783 @@ def evaluate_and_report( except Exception as e: print(f"Warning: Could not create residual plot for {model_name}: {e}") + # Feature importance analysis + print(f"\n{'='*80}") + print("FEATURE IMPORTANCE ANALYSIS") + print(f"{'='*80}") + + for model_name, model_results in results.items(): + try: + import joblib + from sklearn.inspection import permutation_importance + from 
sklearn.multioutput import MultiOutputClassifier, MultiOutputRegressor + + # Collect feature importances from all trained models + all_importances = [] + + for result in model_results: + model_path = result.get("model_path") + + if not model_path or not os.path.exists(model_path): + continue + + model = joblib.load(model_path) + + # Extract feature importances based on model type + feature_importances = None + importance_method = None + + # For multi-output models, need to handle differently + if isinstance(model, (MultiOutputClassifier, MultiOutputRegressor)): + # Get the base estimator + if hasattr(model.estimators_[0], 'feature_importances_'): + # Average importances across outputs + importances_list = [est.feature_importances_ for est in model.estimators_ if hasattr(est, 'feature_importances_')] + if importances_list: + feature_importances = np.mean(importances_list, axis=0) + importance_method = "built-in (averaged across outputs)" + elif hasattr(model, 'feature_importances_'): + # Tree-based models (Random Forest, XGBoost) + feature_importances = model.feature_importances_ + importance_method = "built-in (Gini/gain)" + elif hasattr(model, 'coef_'): + # Linear models (use absolute coefficient values) + coef = model.coef_ + if len(coef.shape) > 1: + # Multi-output or multi-class: average absolute coefficients + feature_importances = np.mean(np.abs(coef), axis=0) + else: + feature_importances = np.abs(coef) + importance_method = "coefficient magnitude" + + # If no built-in importance, use permutation importance (works for all models including MLPs) + if feature_importances is None and X_test is not None: + # Only print once for the first model + if len(all_importances) == 0: + print(f"\n{model_name.upper()}: Computing permutation importance across all {len(model_results)} models...") + print(f" (This measures performance drop when each feature is shuffled)") + + # Determine scoring metric based on problem type + if problem_type == "classification": + scoring = "accuracy" + else: # regression + scoring = "r2" + + # Compute permutation importance + perm_importance = permutation_importance( + model, X_test, y_test, + n_repeats=10, # Number of times to permute each feature + random_state=42, + scoring=scoring, + n_jobs=-1 # Use all available cores + ) + feature_importances = perm_importance.importances_mean + importance_method = f"permutation (based on {scoring}, averaged across seeds)" + elif feature_importances is not None and importance_method: + # Update method to indicate averaging across seeds + if len(all_importances) == 0: + importance_method = importance_method + f", averaged across {len(model_results)} seeds" + + if feature_importances is not None: + all_importances.append(feature_importances) + + # Average importances across all models + if all_importances: + # Average across all seeds + avg_feature_importances = np.mean(all_importances, axis=0) + std_feature_importances = np.std(all_importances, axis=0) + + print(f"\n{model_name.upper()}:") + print(f"{'-'*80}") + print(f"Method: {importance_method}") + print(f"Averaged across {len(all_importances)} model(s)") + + # Sort features by importance + indices = np.argsort(avg_feature_importances)[::-1] + + # Helper to get feature label + def get_feature_label(idx): + if feature_names and idx < len(feature_names): + return feature_names[idx] + return f"Feature {idx}" + + # Print top 20 features with mean ± std + print(f"\nTop 20 most important features (mean ± std):") + for i, idx in enumerate(indices[:20], 1): + importance = 
avg_feature_importances[idx] + std = std_feature_importances[idx] + feat_label = get_feature_label(idx) + print(f" {i:2d}. {feat_label:40s}: {importance:.6f} ± {std:.6f}") + + # Create feature importance plot + try: + fig, ax = plt.subplots(figsize=(10, 8)) + + # Plot top 20 features + top_n = min(20, len(avg_feature_importances)) + top_indices = indices[:top_n] + top_importances = avg_feature_importances[top_indices] + top_stds = std_feature_importances[top_indices] + + y_pos = np.arange(top_n) + ax.barh(y_pos, top_importances, xerr=top_stds, align='center', alpha=0.7, capsize=3) + ax.set_yticks(y_pos) + ax.set_yticklabels([get_feature_label(idx) for idx in top_indices]) + ax.invert_yaxis() + ax.set_xlabel('Importance') + ax.set_title(f'Top {top_n} Feature Importances - {model_name.upper()}\n({importance_method})') + ax.grid(True, alpha=0.3, axis='x') + + plt.tight_layout() + model_dir = os.path.join(output_dir, model_name) + os.makedirs(model_dir, exist_ok=True) + plt.savefig(os.path.join(model_dir, f"feature_importance_{model_name}.png"), dpi=100, bbox_inches='tight') + plt.close() + + print(f"\n Feature importance plot saved to: {model_dir}/feature_importance_{model_name}.png") + except Exception as e: + print(f" Warning: Could not create feature importance plot: {e}") + else: + print(f"\n{model_name.upper()}: Feature importance not available (no models or no test data provided)") + except Exception as e: + print(f"\nWarning: Could not extract feature importance for {model_name}: {e}") + + print(f"\n{'='*80}\n") + return combined_summary + + +def evaluate_and_report_cv( + results, output_dir: str = "outputs", problem_type: str = "classification", + label_names: list = None, feature_names: list = None, X_test=None +): + """Compute metrics for k-fold cross-validation with per-fold breakdown. + + Parameters + ---------- + results : dict + Output from Trainer.run_cv() with format {model_name: [run_results_with_fold_info]} + Each run_result contains: seed, fold, y_pred, y_test, etc. + output_dir : str + Where to save the report and plots. + problem_type : str + "classification" or "regression". + label_names : list, optional + Names of the target variables (for multi-output). + feature_names : list, optional + Names of the input features. + X_test : array-like, optional + Test features (note: in CV, each fold has different test set). 
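+
+    Examples
+    --------
+    A minimal illustration of the expected ``results`` layout (the array values
+    are placeholders; real entries come from ``Trainer.run_cv()``)::
+
+        results = {
+            "random_forest": [
+                {"seed": 0, "fold": 0, "y_test": y_te_0, "y_pred": pred_00},
+                {"seed": 1, "fold": 0, "y_test": y_te_0, "y_pred": pred_01},
+                {"seed": 0, "fold": 1, "y_test": y_te_1, "y_pred": pred_10},
+            ],
+        }
+        evaluate_and_report_cv(results, output_dir="outputs",
+                               problem_type="regression")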
+ """ + os.makedirs(output_dir, exist_ok=True) + import pandas as pd + + print(f"\n{'='*80}") + print("K-FOLD CROSS-VALIDATION RESULTS") + print(f"{'='*80}\n") + + # Determine if multi-output by checking first result + first_result = next(iter(results.values()))[0] + y_test_sample = first_result["y_test"] + is_multi_output = len(y_test_sample.shape) > 1 and y_test_sample.shape[1] > 1 + + all_summaries = {} + all_dfs = {} + + for model_name, model_results in results.items(): + # Group results by fold + folds = {} + for r in model_results: + fold_id = r.get("fold") + if fold_id not in folds: + folds[fold_id] = [] + folds[fold_id].append(r) + + n_folds = len(folds) + + print(f"\n{model_name.upper()} - {n_folds} Folds") + print(f"{'-'*80}") + + # Compute metrics for each fold + fold_summaries = [] + for fold_id in sorted(folds.keys()): + fold_results = folds[fold_id] + fold_metrics_list = [] + + for r in fold_results: + seed = r.get("seed") + y_pred = r.get("y_pred") + y_test = r.get("y_test") + y_proba = r.get("y_proba") + + if problem_type == "classification": + if is_multi_output: + metrics = compute_multi_output_classification_metrics(y_test, y_pred, y_proba) + else: + metrics = compute_classification_metrics(y_test, y_pred, y_proba) + else: # regression + if is_multi_output: + metrics = compute_multi_output_regression_metrics(y_test, y_pred) + else: + metrics = compute_regression_metrics(y_test, y_pred) + + metrics["seed"] = seed + metrics["fold"] = fold_id + fold_metrics_list.append(metrics) + + fold_summaries.append(fold_metrics_list) + + # Print per-fold metrics (averaged across seeds within each fold) + print(f"\nPer-Fold Metrics (averaged across {len(fold_results)} seeds per fold):") + + if problem_type == "classification": + if is_multi_output: + # Multi-output classification + metric_keys = ["accuracy_mean", "balanced_accuracy_mean", "f1_mean"] + else: + metric_keys = ["accuracy", "balanced_accuracy", "f1"] + else: + if is_multi_output: + # Multi-output regression + metric_keys = ["r2_mean", "rmse_mean", "mae_mean"] + else: + metric_keys = ["r2", "rmse", "mae"] + + # Print table header + print(f"\n {'Fold':<6}", end="") + for mk in metric_keys: + print(f"{mk:<20}", end="") + print() + print(f" {'-'*6}", end="") + for _ in metric_keys: + print(f"{'-'*20}", end="") + print() + + # Print per-fold metrics + for fold_id in sorted(folds.keys()): + fold_results_list = fold_summaries[fold_id] + fold_df = pd.DataFrame(fold_results_list) + + print(f" {fold_id:<6}", end="") + for mk in metric_keys: + if mk in fold_df.columns: + mean_val = fold_df[mk].mean() + std_val = fold_df[mk].std() + print(f"{mean_val:.4f} ± {std_val:.4f} ", end="") + else: + print(f"{'N/A':<20}", end="") + print() + + # Compute overall metrics (averaged across all folds and seeds) + all_fold_metrics = [m for fold_list in fold_summaries for m in fold_list] + overall_df = pd.DataFrame(all_fold_metrics) + + print(f"\n {'Overall':<6}", end="") + for mk in metric_keys: + if mk in overall_df.columns: + mean_val = overall_df[mk].mean() + std_val = overall_df[mk].std() + print(f"{mean_val:.4f} ± {std_val:.4f} ", end="") + else: + print(f"{'N/A':<20}", end="") + print("\n") + + # Save detailed report + model_dir = os.path.join(output_dir, model_name) + os.makedirs(model_dir, exist_ok=True) + report_path = os.path.join(model_dir, f"report_{model_name}_cv.json") + + with open(report_path, "w") as f: + json.dump(all_fold_metrics, f, indent=2, default=str) + + print(f" Detailed JSON report saved to: {report_path}") + + # Save 
detailed metrics as CSV (all seeds and folds) + csv_detailed_path = os.path.join(model_dir, f"metrics_{model_name}_cv_detailed.csv") + overall_df.to_csv(csv_detailed_path, index=False) + print(f" Detailed CSV metrics saved to: {csv_detailed_path}") + + # Save per-fold summary (averaged across seeds) + fold_summary_data = [] + for fold_id in sorted(folds.keys()): + fold_results_list = fold_summaries[fold_id] + fold_df = pd.DataFrame(fold_results_list) + + # Calculate mean and std for each metric + fold_summary = {"fold": fold_id} + for col in fold_df.columns: + if col not in ["seed", "fold"]: + fold_summary[f"{col}_mean"] = fold_df[col].mean() + fold_summary[f"{col}_std"] = fold_df[col].std() + fold_summary_data.append(fold_summary) + + # Add overall row + overall_summary = {"fold": "overall"} + for col in overall_df.columns: + if col not in ["seed", "fold"]: + overall_summary[f"{col}_mean"] = overall_df[col].mean() + overall_summary[f"{col}_std"] = overall_df[col].std() + fold_summary_data.append(overall_summary) + + fold_summary_df = pd.DataFrame(fold_summary_data) + csv_summary_path = os.path.join(model_dir, f"metrics_{model_name}_cv_per_fold.csv") + fold_summary_df.to_csv(csv_summary_path, index=False) + print(f" Per-fold summary CSV saved to: {csv_summary_path}") + + all_summaries[model_name] = overall_df + all_dfs[model_name] = overall_df + + # Create comparison plots across folds and models + if len(results) > 1: + print(f"\nGenerating comparison plots...") + + # Determine which metrics to plot + if problem_type == "classification": + if is_multi_output: + metrics_to_plot = {"accuracy_mean": "Accuracy", "balanced_accuracy_mean": "Balanced Accuracy", + "f1_mean": "F1 Score", "cohen_kappa_mean": "Cohen's Kappa"} + else: + metrics_to_plot = {"accuracy": "Accuracy", "balanced_accuracy": "Balanced Accuracy", + "f1": "F1 Score", "cohen_kappa": "Cohen's Kappa"} + else: + if is_multi_output: + metrics_to_plot = {"r2_mean": "R²", "rmse_mean": "RMSE", "mae_mean": "MAE", "mse_mean": "MSE"} + else: + metrics_to_plot = {"r2": "R²", "rmse": "RMSE", "mae": "MAE", "mse": "MSE"} + + for metric_key, metric_label in metrics_to_plot.items(): + try: + fig, ax = plt.subplots(figsize=(10, 6)) + + data_for_plot = [] + labels_for_plot = [] + + for model_name, df in all_dfs.items(): + if metric_key in df.columns: + data_for_plot.append(df[metric_key].values) + labels_for_plot.append(model_name.upper()) + + if data_for_plot: + bp = ax.boxplot(data_for_plot, labels=labels_for_plot, patch_artist=True) + for patch in bp["boxes"]: + patch.set_facecolor("lightblue") + + ax.set_ylabel(metric_label) + ax.set_title(f"{metric_label} Comparison - K-Fold CV") + ax.grid(True, alpha=0.3, axis='y') + plt.xticks(rotation=45, ha='right') + plt.tight_layout() + plt.savefig(os.path.join(output_dir, f"cv_comparison_{metric_key}.png")) + plt.close() + except Exception as e: + print(f" Warning: Could not create comparison plot for {metric_key}: {e}") + + print(f"\n{'='*80}\n") + + # Feature importance for CV + print(f"{'='*80}") + print("FEATURE IMPORTANCE ANALYSIS (per fold and averaged)") + print(f"{'='*80}") + + for model_name, model_results in results.items(): + try: + import joblib + from sklearn.inspection import permutation_importance + from sklearn.multioutput import MultiOutputClassifier, MultiOutputRegressor + + # Group results by fold + folds_for_fi = {} + for r in model_results: + fold_id = r.get("fold") + if fold_id not in folds_for_fi: + folds_for_fi[fold_id] = [] + folds_for_fi[fold_id].append(r) + + 
print(f"\n{model_name.upper()}: Collecting feature importances from all {len(folds_for_fi)} folds...") + + # Collect feature importance for each fold + fold_importances_dict = {} + importance_method = None + + for fold_id in sorted(folds_for_fi.keys()): + fold_results = folds_for_fi[fold_id] + fold_importances_list = [] + + for result in fold_results: + model_path = result.get("model_path") + + if not model_path or not os.path.exists(model_path): + continue + + model = joblib.load(model_path) + y_test_fold = result.get("y_test") + X_test_fold = result.get("X_test") + + # Extract feature importances based on model type + feature_importances = None + + # For multi-output models + if isinstance(model, (MultiOutputClassifier, MultiOutputRegressor)): + if hasattr(model.estimators_[0], 'feature_importances_'): + importances_list = [est.feature_importances_ for est in model.estimators_ if hasattr(est, 'feature_importances_')] + if importances_list: + feature_importances = np.mean(importances_list, axis=0) + importance_method = "built-in (averaged across outputs, seeds, and folds)" + elif hasattr(model, 'feature_importances_'): + feature_importances = model.feature_importances_ + importance_method = "built-in (Gini/gain, averaged across seeds and folds)" + elif hasattr(model, 'coef_'): + coef = model.coef_ + if len(coef.shape) > 1: + feature_importances = np.mean(np.abs(coef), axis=0) + else: + feature_importances = np.abs(coef) + importance_method = "coefficient magnitude (averaged across seeds and folds)" + + # If no built-in importance, use permutation importance (e.g., for MLPs) + if feature_importances is None and X_test_fold is not None and y_test_fold is not None: + # Only print once per fold + if len(fold_importances_list) == 0: + print(f" Fold {fold_id}: Computing permutation importance (this may take a while)...") + + # Determine scoring metric based on problem type + if problem_type == "classification": + scoring = "accuracy" + else: # regression + scoring = "r2" + + # Compute permutation importance + perm_importance = permutation_importance( + model, X_test_fold, y_test_fold, + n_repeats=10, # Number of times to permute each feature + random_state=42, + scoring=scoring, + n_jobs=-1 # Use all available cores + ) + feature_importances = perm_importance.importances_mean + importance_method = f"permutation (based on {scoring}, averaged across seeds and folds)" + + if feature_importances is not None: + fold_importances_list.append(feature_importances) + + # Average across seeds for this fold + if fold_importances_list: + fold_avg_importance = np.mean(fold_importances_list, axis=0) + fold_importances_dict[fold_id] = fold_avg_importance + + # Average importances across all folds + if fold_importances_dict: + all_fold_importances = list(fold_importances_dict.values()) + avg_feature_importances = np.mean(all_fold_importances, axis=0) + std_feature_importances = np.std(all_fold_importances, axis=0) + + print(f"\n{model_name.upper()}:") + print(f"{'-'*80}") + print(f"Method: {importance_method}") + print(f"Computed from all {len(fold_importances_dict)} folds") + + # Sort features by importance + indices = np.argsort(avg_feature_importances)[::-1] + + # Helper to get feature label + def get_feature_label(idx): + if feature_names and idx < len(feature_names): + return feature_names[idx] + return f"Feature {idx}" + + # Print top 20 features + print(f"\nTop 20 most important features (mean ± std):") + for i, idx in enumerate(indices[:20], 1): + importance = avg_feature_importances[idx] + std = 
std_feature_importances[idx] + feat_label = get_feature_label(idx) + print(f" {i:2d}. {feat_label:40s}: {importance:.6f} ± {std:.6f}") + + # Save feature importance to CSV (all features, sorted by importance) + # Include per-fold values and overall average + feature_importance_data = [] + for rank, idx in enumerate(indices, 1): + row = { + "rank": rank, + "feature": get_feature_label(idx), + "importance_mean_all_folds": avg_feature_importances[idx], + "importance_std_across_folds": std_feature_importances[idx], + "feature_index": idx + } + # Add per-fold importance + for fold_id in sorted(fold_importances_dict.keys()): + row[f"importance_fold_{fold_id}"] = fold_importances_dict[fold_id][idx] + feature_importance_data.append(row) + + fi_df = pd.DataFrame(feature_importance_data) + model_dir = os.path.join(output_dir, model_name) + os.makedirs(model_dir, exist_ok=True) + fi_csv_path = os.path.join(model_dir, f"feature_importance_{model_name}_cv_all_folds.csv") + fi_df.to_csv(fi_csv_path, index=False) + print(f"\n Feature importance CSV (all folds) saved to: {fi_csv_path}") + + # Create feature importance plot + try: + fig, ax = plt.subplots(figsize=(10, 8)) + + top_n = min(20, len(avg_feature_importances)) + top_indices = indices[:top_n] + top_importances = avg_feature_importances[top_indices] + top_stds = std_feature_importances[top_indices] + + y_pos = np.arange(top_n) + ax.barh(y_pos, top_importances, xerr=top_stds, align='center', alpha=0.7, capsize=3) + ax.set_yticks(y_pos) + ax.set_yticklabels([get_feature_label(idx) for idx in top_indices]) + ax.invert_yaxis() + ax.set_xlabel('Importance') + ax.set_title(f'Top {top_n} Feature Importances - {model_name.upper()} (All Folds)\n({importance_method})') + ax.grid(True, alpha=0.3, axis='x') + + plt.tight_layout() + model_dir = os.path.join(output_dir, model_name) + os.makedirs(model_dir, exist_ok=True) + plt.savefig(os.path.join(model_dir, f"feature_importance_{model_name}_cv.png"), dpi=100, bbox_inches='tight') + plt.close() + + print(f"\n Feature importance plot saved to: {model_dir}/feature_importance_{model_name}_cv.png") + except Exception as e: + print(f" Warning: Could not create feature importance plot: {e}") + else: + print(f"\n{model_name.upper()}: Feature importance not available") + except Exception as e: + print(f"\nWarning: Could not extract feature importance for {model_name}: {e}") + + print(f"\n{'='*80}\n") + + return all_summaries + + +def evaluate_tuning_results( + results: Dict, + y_val: np.ndarray, + y_test: np.ndarray, + output_dir: str = "outputs", + problem_type: str = "classification", + label_names: Optional[list] = None, + feature_names: Optional[list] = None, + X_val: Optional[np.ndarray] = None, + X_test: Optional[np.ndarray] = None, +): + """Evaluate and report results from hyperparameter tuning. + + This function evaluates model performance on both validation and test sets + after hyperparameter tuning. It produces reports and visualizations comparing + performance across seeds and between val/test sets. 
+ + Parameters + ---------- + results : Dict + Results dictionary from trainer.run_with_tuning() + y_val : np.ndarray + Validation set labels + y_test : np.ndarray + Test set labels + output_dir : str + Directory to save outputs + problem_type : str + "classification" or "regression" + label_names : list, optional + Names of label columns + feature_names : list, optional + Names of features (for feature importance) + X_val : np.ndarray, optional + Validation features (for feature importance) + X_test : np.ndarray, optional + Test features (for feature importance) + + Returns + ------- + dict + Summary statistics for each model + """ + import pandas as pd + + os.makedirs(output_dir, exist_ok=True) + + print(f"\n{'='*80}") + print("HYPERPARAMETER TUNING EVALUATION") + print(f"{'='*80}\n") + + all_summaries = {} + + for model_name, model_results in results.items(): + print(f"\n{model_name.upper()}") + print("-" * 60) + + # Collect metrics for each seed + val_metrics_per_seed = [] + test_metrics_per_seed = [] + best_params_per_seed = [] + best_cv_scores = [] + + for run in model_results: + seed = run['seed'] + best_params = run['best_params'] + best_cv_score = run['best_cv_score'] + + best_params_per_seed.append(best_params) + best_cv_scores.append(best_cv_score) + + # Compute validation metrics + if problem_type == "regression": + val_metrics = compute_regression_metrics( + y_val, run['y_val_pred'] + ) + else: + val_metrics = compute_classification_metrics( + y_val, run['y_val_pred'], run.get('y_val_proba') + ) + val_metrics['seed'] = seed + val_metrics_per_seed.append(val_metrics) + + # Compute test metrics + if problem_type == "regression": + test_metrics = compute_regression_metrics( + y_test, run['y_test_pred'] + ) + else: + test_metrics = compute_classification_metrics( + y_test, run['y_test_pred'], run.get('y_test_proba') + ) + test_metrics['seed'] = seed + test_metrics_per_seed.append(test_metrics) + + # Create summary DataFrames + val_df = pd.DataFrame(val_metrics_per_seed) + test_df = pd.DataFrame(test_metrics_per_seed) + + # Print summary statistics + print(f"\nValidation Set Performance (across {len(model_results)} seeds):") + if problem_type == "regression": + print(f" R² = {val_df['r2'].mean():.4f} ± {safe_std(val_df['r2']):.4f}") + print(f" RMSE = {val_df['rmse'].mean():.4f} ± {safe_std(val_df['rmse']):.4f}") + print(f" MAE = {val_df['mae'].mean():.4f} ± {safe_std(val_df['mae']):.4f}") + else: + print(f" Accuracy = {val_df['accuracy'].mean():.4f} ± {safe_std(val_df['accuracy']):.4f}") + print(f" Balanced Accuracy = {val_df['balanced_accuracy'].mean():.4f} ± {safe_std(val_df['balanced_accuracy']):.4f}") + print(f" F1 = {val_df['f1'].mean():.4f} ± {safe_std(val_df['f1']):.4f}") + + print(f"\nTest Set Performance (across {len(model_results)} seeds):") + if problem_type == "regression": + print(f" R² = {test_df['r2'].mean():.4f} ± {safe_std(test_df['r2']):.4f}") + print(f" RMSE = {test_df['rmse'].mean():.4f} ± {safe_std(test_df['rmse']):.4f}") + print(f" MAE = {test_df['mae'].mean():.4f} ± {safe_std(test_df['mae']):.4f}") + else: + print(f" Accuracy = {test_df['accuracy'].mean():.4f} ± {safe_std(test_df['accuracy']):.4f}") + print(f" Balanced Accuracy = {test_df['balanced_accuracy'].mean():.4f} ± {safe_std(test_df['balanced_accuracy']):.4f}") + print(f" F1 = {test_df['f1'].mean():.4f} ± {safe_std(test_df['f1']):.4f}") + + print(f"\nBest CV Score (during tuning): {np.mean(best_cv_scores):.4f} ± {safe_std(np.array(best_cv_scores)):.4f}") + + # Save detailed results + model_dir = 
os.path.join(output_dir, model_name) + os.makedirs(model_dir, exist_ok=True) + + # Save validation metrics + val_csv = os.path.join(model_dir, f"metrics_{model_name}_validation.csv") + val_df.to_csv(val_csv, index=False) + print(f"\n Validation metrics saved to: {val_csv}") + + # Save test metrics + test_csv = os.path.join(model_dir, f"metrics_{model_name}_test.csv") + test_df.to_csv(test_csv, index=False) + print(f" Test metrics saved to: {test_csv}") + + # Save hyperparameter summary + params_df = pd.DataFrame(best_params_per_seed) + params_csv = os.path.join(model_dir, f"best_params_{model_name}.csv") + params_df.to_csv(params_csv, index=False) + print(f" Best parameters saved to: {params_csv}") + + # Create comparison plots + try: + # Select key metrics based on problem type + if problem_type == "regression": + metrics_to_plot = ['r2', 'rmse', 'mae'] + else: + metrics_to_plot = ['accuracy', 'balanced_accuracy', 'f1'] + + fig, axes = plt.subplots(1, len(metrics_to_plot), figsize=(5*len(metrics_to_plot), 5)) + if len(metrics_to_plot) == 1: + axes = [axes] + + for idx, metric in enumerate(metrics_to_plot): + ax = axes[idx] + + # Prepare data for boxplot + data = [val_df[metric].values, test_df[metric].values] + positions = [1, 2] + labels = ['Validation', 'Test'] + + bp = ax.boxplot(data, positions=positions, labels=labels, + patch_artist=True, widths=0.6) + + # Color boxes + colors = ['lightblue', 'lightcoral'] + for patch, color in zip(bp['boxes'], colors): + patch.set_facecolor(color) + + ax.set_ylabel(metric.upper()) + ax.set_title(f'{metric.upper()} - {model_name.upper()}') + ax.grid(True, alpha=0.3, axis='y') + + plt.tight_layout() + plot_path = os.path.join(model_dir, f"val_vs_test_{model_name}.png") + plt.savefig(plot_path, dpi=100, bbox_inches='tight') + plt.close() + print(f" Comparison plot saved to: {plot_path}") + except Exception as e: + print(f" Warning: Could not create comparison plot: {e}") + + # Store summary + all_summaries[model_name] = { + 'validation': val_df, + 'test': test_df, + 'best_params': best_params_per_seed, + 'best_cv_scores': best_cv_scores, + } + + # Create summary report comparing all models + summary_report = {} + for model_name, summary in all_summaries.items(): + val_df = summary['validation'] + test_df = summary['test'] + + if problem_type == "regression": + summary_report[model_name] = { + 'val_r2_mean': float(val_df['r2'].mean()), + 'val_r2_std': float(safe_std(val_df['r2'])), + 'test_r2_mean': float(test_df['r2'].mean()), + 'test_r2_std': float(safe_std(test_df['r2'])), + 'val_rmse_mean': float(val_df['rmse'].mean()), + 'val_rmse_std': float(safe_std(val_df['rmse'])), + 'test_rmse_mean': float(test_df['rmse'].mean()), + 'test_rmse_std': float(safe_std(test_df['rmse'])), + } + else: + summary_report[model_name] = { + 'val_accuracy_mean': float(val_df['accuracy'].mean()), + 'val_accuracy_std': float(safe_std(val_df['accuracy'])), + 'test_accuracy_mean': float(test_df['accuracy'].mean()), + 'test_accuracy_std': float(safe_std(test_df['accuracy'])), + 'val_f1_mean': float(val_df['f1'].mean()), + 'val_f1_std': float(safe_std(val_df['f1'])), + 'test_f1_mean': float(test_df['f1'].mean()), + 'test_f1_std': float(safe_std(test_df['f1'])), + } + + # Save summary report + summary_path = os.path.join(output_dir, "tuning_summary_all_models.json") + with open(summary_path, 'w') as f: + json.dump(summary_report, f, indent=2) + print(f"\n{'='*60}") + print(f"Summary report saved to: {summary_path}") + print(f"{'='*60}\n") + + return all_summaries diff --git 
a/ecosci/hyperopt.py b/ecosci/hyperopt.py new file mode 100644 index 0000000..3490c65 --- /dev/null +++ b/ecosci/hyperopt.py @@ -0,0 +1,506 @@ +"""Hyperparameter optimization for different model types. + +This module provides hyperparameter tuning functionality using scikit-learn's +search methods (GridSearchCV, RandomizedSearchCV). Each model type has a dedicated +tuning function with sensible default search spaces that can be overridden via config. + +Key features: +- Separate tuning functions for each model type (MLP, RandomForest, XGBoost, SVM, Linear) +- Uses validation set for tuning, test set held out for final evaluation +- Supports both exhaustive grid search and randomized search +- Returns best model and tuning results +""" + +from typing import Any, Dict, Optional, Tuple +import numpy as np +from sklearn.model_selection import GridSearchCV, RandomizedSearchCV +from sklearn.base import BaseEstimator +import joblib +import os + + +class HyperparameterTuner: + """Hyperparameter tuning for various model types. + + Parameters + ---------- + problem_type : str + "classification" or "regression" + n_iter : int, optional + Number of iterations for RandomizedSearchCV (default: 50) + cv : int, optional + Number of cross-validation folds to use during tuning (default: 3) + scoring : str, optional + Scoring metric for tuning (default: auto-selected based on problem_type) + search_method : str, optional + Either "grid" for GridSearchCV or "random" for RandomizedSearchCV (default: "random") + random_state : int, optional + Random state for reproducibility (default: 42) + n_jobs : int, optional + Number of parallel jobs (default: -1, use all cores) + verbose : int, optional + Verbosity level (default: 1) + """ + + def __init__( + self, + problem_type: str = "regression", + n_iter: int = 50, + cv: int = 3, + scoring: Optional[str] = None, + search_method: str = "random", + random_state: int = 42, + n_jobs: int = -1, + verbose: int = 1, + ): + self.problem_type = problem_type + self.n_iter = n_iter + self.cv = cv + self.search_method = search_method + self.random_state = random_state + self.n_jobs = n_jobs + self.verbose = verbose + + # Auto-select scoring metric if not provided + if scoring is None: + if problem_type == "regression": + self.scoring = "neg_mean_squared_error" + else: + self.scoring = "accuracy" + else: + self.scoring = scoring + + def _get_search_estimator(self, model: BaseEstimator, param_space: Dict, groups: Optional[np.ndarray] = None): + """Create appropriate search estimator (Grid or Randomized). + + Parameters + ---------- + model : BaseEstimator + Base model to tune + param_space : Dict + Parameter search space + groups : np.ndarray, optional + Group labels for GroupKFold. If provided, uses GroupKFold instead of regular KFold. 
+ + Returns + ------- + search estimator + GridSearchCV or RandomizedSearchCV configured appropriately + """ + from sklearn.model_selection import GroupKFold + + # Determine CV strategy + if groups is not None: + # Use GroupKFold to respect group structure + cv_strategy = GroupKFold(n_splits=self.cv) + else: + # Use default KFold (integer cv value) + cv_strategy = self.cv + + if self.search_method == "grid": + return GridSearchCV( + model, + param_space, + cv=cv_strategy, + scoring=self.scoring, + n_jobs=self.n_jobs, + verbose=self.verbose, + refit=True, + ) + else: + return RandomizedSearchCV( + model, + param_space, + n_iter=self.n_iter, + cv=cv_strategy, + scoring=self.scoring, + n_jobs=self.n_jobs, + verbose=self.verbose, + refit=True, + random_state=self.random_state, + ) + + def tune_mlp( + self, + X_train: np.ndarray, + y_train: np.ndarray, + param_space: Optional[Dict] = None, + groups: Optional[np.ndarray] = None, + ) -> Tuple[BaseEstimator, Dict]: + """Tune MLP hyperparameters. + + Parameters + ---------- + X_train : np.ndarray + Training features + y_train : np.ndarray + Training labels + param_space : Dict, optional + Custom parameter search space. If None, uses sensible defaults. + groups : np.ndarray, optional + Group labels for GroupKFold cross-validation + + Returns + ------- + best_model : BaseEstimator + Trained model with best hyperparameters + results : Dict + Dictionary containing best parameters, best score, and CV results + """ + from sklearn.neural_network import MLPRegressor, MLPClassifier + + # Default parameter space + if param_space is None: + param_space = { + 'hidden_layer_sizes': [ + (16,), (32,), (64,), + (16, 8), (32, 16), (64, 32), + (32, 16, 8), (64, 32, 16), + ], + 'activation': ['relu', 'tanh'], + 'alpha': [0.0001, 0.001, 0.01, 0.1], + 'learning_rate_init': [0.0001, 0.001, 0.01], + 'batch_size': [8, 16, 32, 64], + 'early_stopping': [True], + 'validation_fraction': [0.1], + 'n_iter_no_change': [10, 20], + 'max_iter': [1000], + } + + # Create base model + if self.problem_type == "regression": + base_model = MLPRegressor(random_state=self.random_state, verbose=False) + else: + base_model = MLPClassifier(random_state=self.random_state, verbose=False) + + # Perform search + search = self._get_search_estimator(base_model, param_space, groups=groups) + search.fit(X_train, y_train, groups=groups) + + results = { + 'best_params': search.best_params_, + 'best_score': search.best_score_, + 'cv_results': search.cv_results_, + } + + return search.best_estimator_, results + + def tune_random_forest( + self, + X_train: np.ndarray, + y_train: np.ndarray, + param_space: Optional[Dict] = None, + groups: Optional[np.ndarray] = None, + ) -> Tuple[BaseEstimator, Dict]: + """Tune Random Forest hyperparameters. + + Parameters + ---------- + X_train : np.ndarray + Training features + y_train : np.ndarray + Training labels + param_space : Dict, optional + Custom parameter search space. If None, uses sensible defaults. 
+ groups : np.ndarray, optional + Group labels for GroupKFold cross-validation + + Returns + ------- + best_model : BaseEstimator + Trained model with best hyperparameters + results : Dict + Dictionary containing best parameters, best score, and CV results + """ + from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier + + # Default parameter space + if param_space is None: + param_space = { + 'n_estimators': [100, 200, 500, 1000], + 'max_depth': [5, 10, 20, 30, 50, None], + 'min_samples_split': [2, 5, 10, 20], + 'min_samples_leaf': [1, 2, 5, 10], + 'max_features': ['sqrt', 'log2', 0.5], + 'bootstrap': [True, False], + } + + # Create base model + if self.problem_type == "regression": + base_model = RandomForestRegressor(random_state=self.random_state, n_jobs=1) + else: + base_model = RandomForestClassifier(random_state=self.random_state, n_jobs=1) + + # Perform search + search = self._get_search_estimator(base_model, param_space, groups=groups) + search.fit(X_train, y_train, groups=groups) + + results = { + 'best_params': search.best_params_, + 'best_score': search.best_score_, + 'cv_results': search.cv_results_, + } + + return search.best_estimator_, results + + def tune_xgboost( + self, + X_train: np.ndarray, + y_train: np.ndarray, + param_space: Optional[Dict] = None, + groups: Optional[np.ndarray] = None, + ) -> Tuple[BaseEstimator, Dict]: + """Tune XGBoost hyperparameters. + + Parameters + ---------- + X_train : np.ndarray + Training features + y_train : np.ndarray + Training labels + param_space : Dict, optional + Custom parameter search space. If None, uses sensible defaults. + groups : np.ndarray, optional + Group labels for GroupKFold cross-validation + + Returns + ------- + best_model : BaseEstimator + Trained model with best hyperparameters + results : Dict + Dictionary containing best parameters, best score, and CV results + """ + try: + from xgboost import XGBRegressor, XGBClassifier + except ImportError: + raise ImportError("xgboost is required: install via pip install xgboost") + + # Default parameter space + if param_space is None: + param_space = { + 'n_estimators': [100, 200, 500], + 'max_depth': [3, 5, 7, 10], + 'learning_rate': [0.01, 0.05, 0.1, 0.2], + 'subsample': [0.6, 0.8, 1.0], + 'colsample_bytree': [0.6, 0.8, 1.0], + 'min_child_weight': [1, 3, 5], + 'reg_alpha': [0, 0.01, 0.1, 1.0], + 'reg_lambda': [0.1, 1.0, 10.0], + } + + # Create base model + if self.problem_type == "regression": + base_model = XGBRegressor(random_state=self.random_state, n_jobs=1) + else: + base_model = XGBClassifier( + random_state=self.random_state, + eval_metric='logloss', + n_jobs=1 + ) + + # Perform search + search = self._get_search_estimator(base_model, param_space, groups=groups) + search.fit(X_train, y_train, groups=groups) + + results = { + 'best_params': search.best_params_, + 'best_score': search.best_score_, + 'cv_results': search.cv_results_, + } + + return search.best_estimator_, results + + def tune_svm( + self, + X_train: np.ndarray, + y_train: np.ndarray, + param_space: Optional[Dict] = None, + groups: Optional[np.ndarray] = None, + ) -> Tuple[BaseEstimator, Dict]: + """Tune SVM hyperparameters. + + Parameters + ---------- + X_train : np.ndarray + Training features + y_train : np.ndarray + Training labels + param_space : Dict, optional + Custom parameter search space. If None, uses sensible defaults. 
+ groups : np.ndarray, optional + Group labels for GroupKFold cross-validation + + Returns + ------- + best_model : BaseEstimator + Trained model with best hyperparameters + results : Dict + Dictionary containing best parameters, best score, and CV results + """ + from sklearn.svm import SVR, SVC + + # Default parameter space + if param_space is None: + if self.problem_type == "regression": + param_space = { + 'kernel': ['rbf', 'poly', 'linear'], + 'C': [0.1, 1, 10, 100], + 'epsilon': [0.01, 0.1, 0.5, 1.0], + 'gamma': ['scale', 'auto', 0.001, 0.01, 0.1], + } + else: + param_space = { + 'kernel': ['rbf', 'poly', 'linear'], + 'C': [0.1, 1, 10, 100], + 'gamma': ['scale', 'auto', 0.001, 0.01, 0.1], + } + + # Create base model + if self.problem_type == "regression": + base_model = SVR() + else: + base_model = SVC(probability=True, random_state=self.random_state) + + # Perform search + search = self._get_search_estimator(base_model, param_space, groups=groups) + search.fit(X_train, y_train, groups=groups) + + results = { + 'best_params': search.best_params_, + 'best_score': search.best_score_, + 'cv_results': search.cv_results_, + } + + return search.best_estimator_, results + + def tune_linear( + self, + X_train: np.ndarray, + y_train: np.ndarray, + param_space: Optional[Dict] = None, + groups: Optional[np.ndarray] = None, + ) -> Tuple[BaseEstimator, Dict]: + """Tune Linear model hyperparameters. + + For linear regression, there are few hyperparameters to tune, + but we can tune regularization if using Ridge/Lasso. + + Parameters + ---------- + X_train : np.ndarray + Training features + y_train : np.ndarray + Training labels + param_space : Dict, optional + Custom parameter search space. If None, uses Ridge regression with alpha tuning. + groups : np.ndarray, optional + Group labels for GroupKFold cross-validation + + Returns + ------- + best_model : BaseEstimator + Trained model with best hyperparameters + results : Dict + Dictionary containing best parameters, best score, and CV results + """ + from sklearn.linear_model import Ridge, Lasso, ElasticNet, LogisticRegression + + # Default parameter space - use Ridge for regression, LogisticRegression for classification + if param_space is None: + if self.problem_type == "regression": + # Use Ridge with alpha tuning + param_space = { + 'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0], + } + base_model = Ridge(random_state=self.random_state) + else: + # Use LogisticRegression with C and penalty tuning + param_space = { + 'C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0], + 'penalty': ['l1', 'l2', 'elasticnet'], + 'solver': ['saga'], # saga supports all penalties + 'l1_ratio': [0.1, 0.5, 0.9], # for elasticnet + } + base_model = LogisticRegression( + random_state=self.random_state, + max_iter=1000 + ) + else: + # Use provided param_space and determine model type from it + if self.problem_type == "regression": + base_model = Ridge(random_state=self.random_state) + else: + base_model = LogisticRegression( + random_state=self.random_state, + max_iter=1000 + ) + + # Perform search + search = self._get_search_estimator(base_model, param_space, groups=groups) + search.fit(X_train, y_train, groups=groups) + + results = { + 'best_params': search.best_params_, + 'best_score': search.best_score_, + 'cv_results': search.cv_results_, + } + + return search.best_estimator_, results + + def tune_model( + self, + model_name: str, + X_train: np.ndarray, + y_train: np.ndarray, + param_space: Optional[Dict] = None, + groups: Optional[np.ndarray] = None, + ) -> 
Tuple[BaseEstimator, Dict]: + """Tune hyperparameters for specified model type. + + Parameters + ---------- + model_name : str + Name of model to tune ('mlp', 'random_forest', 'xgboost', 'svm', 'linear') + X_train : np.ndarray + Training features + y_train : np.ndarray + Training labels + param_space : Dict, optional + Custom parameter search space. If None, uses sensible defaults. + groups : np.ndarray, optional + Group labels for GroupKFold cross-validation + + Returns + ------- + best_model : BaseEstimator + Trained model with best hyperparameters + results : Dict + Dictionary containing best parameters, best score, and CV results + """ + model_name_lower = model_name.lower() + + if model_name_lower == "mlp": + return self.tune_mlp(X_train, y_train, param_space, groups=groups) + elif model_name_lower == "random_forest": + return self.tune_random_forest(X_train, y_train, param_space, groups=groups) + elif model_name_lower == "xgboost": + return self.tune_xgboost(X_train, y_train, param_space, groups=groups) + elif model_name_lower == "svm": + return self.tune_svm(X_train, y_train, param_space, groups=groups) + elif model_name_lower in ["linear", "logistic"]: + return self.tune_linear(X_train, y_train, param_space, groups=groups) + else: + raise ValueError(f"Unknown model name for tuning: {model_name}") + + def save_tuning_results(self, results: Dict, output_path: str): + """Save tuning results to disk. + + Parameters + ---------- + results : Dict + Results dictionary from tuning + output_path : str + Path to save results (as joblib file) + """ + os.makedirs(os.path.dirname(output_path), exist_ok=True) + joblib.dump(results, output_path) diff --git a/ecosci/models.py b/ecosci/models.py index 1bdfe62..ab23751 100644 --- a/ecosci/models.py +++ b/ecosci/models.py @@ -19,7 +19,10 @@ class ModelZoo: @staticmethod def get_model( - name: str, problem_type: str = "classification", params: Dict[str, Any] = None + name: str, + problem_type: str = "classification", + params: Dict[str, Any] = None, + n_outputs: int = 1 ): """Return a model instance by name using params dict. @@ -31,6 +34,8 @@ def get_model( "classification" or "regression". Determines which estimator is used. params : Dict[str, Any] Hyperparameters forwarded to the underlying estimator. + n_outputs : int + Number of output targets. If > 1, wraps the model in MultiOutput wrapper. 
""" params = params or {} @@ -39,13 +44,14 @@ def get_model( if problem_type == "classification": # shallow MLP: single hidden layer default - return MLPClassifier( + base_model = MLPClassifier( hidden_layer_sizes=tuple(params.get("hidden_layer_sizes", (32,))), max_iter=params.get("max_iter", 200), early_stopping=params.get("early_stopping", True), validation_fraction=params.get("validation_fraction", 0.1), n_iter_no_change=params.get("n_iter_no_change", 10), random_state=params.get("random_state", 0), + verbose=params.get("verbose", False), # Disable verbose output by default **{ k: v for k, v in params.items() @@ -57,10 +63,13 @@ def get_model( "validation_fraction", "n_iter_no_change", "random_state", + "verbose", ] }, ) + return ModelZoo.wrap_for_multioutput(base_model, problem_type, n_outputs) else: + # MLPRegressor natively supports multi-output return MLPRegressor( hidden_layer_sizes=tuple(params.get("hidden_layer_sizes", (32,))), max_iter=params.get("max_iter", 200), @@ -68,6 +77,7 @@ def get_model( validation_fraction=params.get("validation_fraction", 0.1), n_iter_no_change=params.get("n_iter_no_change", 10), random_state=params.get("random_state", 0), + verbose=params.get("verbose", False), # Disable verbose output by default **{ k: v for k, v in params.items() @@ -79,6 +89,7 @@ def get_model( "validation_fraction", "n_iter_no_change", "random_state", + "verbose", ] }, ) @@ -86,6 +97,7 @@ def get_model( if name.lower() == "random_forest": from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor + # Random Forest natively supports multi-output if problem_type == "classification": return RandomForestClassifier( random_state=params.get("random_state", 0), @@ -101,14 +113,16 @@ def get_model( from sklearn.svm import SVC, SVR if problem_type == "classification": - return SVC( + base_model = SVC( probability=True, random_state=params.get("random_state", 0), **{k: v for k, v in params.items() if k != "random_state"}, ) + return ModelZoo.wrap_for_multioutput(base_model, problem_type, n_outputs) else: # SVR doesn't support random_state parameter - return SVR(**{k: v for k, v in params.items() if k != "random_state"}) + base_model = SVR(**{k: v for k, v in params.items() if k != "random_state"}) + return ModelZoo.wrap_for_multioutput(base_model, problem_type, n_outputs) if name.lower() == "xgboost": try: @@ -125,14 +139,16 @@ def get_model( params_filtered = { k: v for k, v in params.items() if k != "eval_metric" } - return XGBClassifier(eval_metric=eval_metric, **params_filtered) + base_model = XGBClassifier(eval_metric=eval_metric, **params_filtered) + return ModelZoo.wrap_for_multioutput(base_model, problem_type, n_outputs) else: - return XGBRegressor(**params) + base_model = XGBRegressor(**params) + return ModelZoo.wrap_for_multioutput(base_model, problem_type, n_outputs) if name.lower() == "logistic": from sklearn.linear_model import LogisticRegression - return LogisticRegression( + base_model = LogisticRegression( random_state=params.get("random_state", 0), max_iter=params.get("max_iter", 1000), **{ @@ -141,18 +157,75 @@ def get_model( if k not in ["random_state", "max_iter"] }, ) + return ModelZoo.wrap_for_multioutput(base_model, problem_type, n_outputs) if name.lower() == "linear": - from sklearn.linear_model import LinearRegression + from sklearn.linear_model import LinearRegression, Ridge if problem_type == "classification": raise ValueError( "Linear model is only for regression. Use 'logistic' for classification." 
) - # LinearRegression doesn't support random_state - return LinearRegression( - **{k: v for k, v in params.items() if k != "random_state"} - ) + # If alpha is specified, use Ridge regression (supports regularization) + # Otherwise use plain LinearRegression + if "alpha" in params: + base_model = Ridge( + random_state=params.get("random_state", 0), + **{k: v for k, v in params.items() if k != "random_state"} + ) + else: + # LinearRegression doesn't support random_state or alpha + base_model = LinearRegression( + **{k: v for k, v in params.items() if k not in ["random_state", "alpha"]} + ) + + # Wrap in MultiOutputRegressor if needed + if n_outputs > 1: + from sklearn.multioutput import MultiOutputRegressor + return MultiOutputRegressor(base_model) + return base_model raise ValueError(f"Unknown model name: {name}") + + @staticmethod + def wrap_for_multioutput(model, problem_type: str, n_outputs: int): + """Wrap a model for multi-output prediction if necessary. + + Parameters + ---------- + model : estimator + Base sklearn-compatible model. + problem_type : str + "classification" or "regression". + n_outputs : int + Number of output targets. + + Returns + ------- + estimator + The model, wrapped in MultiOutput if n_outputs > 1. + """ + if n_outputs <= 1: + return model + + # Some models natively support multi-output + native_multioutput = [ + 'RandomForestClassifier', 'RandomForestRegressor', + 'ExtraTreesClassifier', 'ExtraTreesRegressor', + 'DecisionTreeClassifier', 'DecisionTreeRegressor', + 'KNeighborsClassifier', 'KNeighborsRegressor', + 'MLPRegressor', # MLP Regressor supports multi-output + ] + + model_class = model.__class__.__name__ + if model_class in native_multioutput: + return model + + # Wrap in MultiOutput wrapper + if problem_type == "classification": + from sklearn.multioutput import MultiOutputClassifier + return MultiOutputClassifier(model) + else: + from sklearn.multioutput import MultiOutputRegressor + return MultiOutputRegressor(model) diff --git a/ecosci/trainer.py b/ecosci/trainer.py index 49f58fa..521f673 100644 --- a/ecosci/trainer.py +++ b/ecosci/trainer.py @@ -5,14 +5,16 @@ - Uses scikit-learn's built-in early stopping for the MLP if enabled in config - Saves each trained model as a `.joblib` file - Returns predictions per run so the evaluator can compute metrics +- Supports hyperparameter tuning with train/val/test splits """ -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional from tqdm import tqdm import numpy as np import random import joblib import os +import json class Trainer: @@ -67,6 +69,12 @@ def run(self, cfg: Dict[str, Any], X_train, X_val, X_test, y_train, y_val, y_tes seeds_list = [base_seed + i for i in range(repetitions)] all_results = {} + + # Determine number of outputs from y_train shape + if len(y_train.shape) == 1: + n_outputs = 1 + else: + n_outputs = y_train.shape[1] # Train each model type for model_idx, model_cfg in enumerate(models_cfg): @@ -84,7 +92,7 @@ def run(self, cfg: Dict[str, Any], X_train, X_val, X_test, y_train, y_val, y_tes if "random_state" in mparams_local or True: mparams_local.setdefault("random_state", s) - model = self.model_factory(mname, self.problem_type, mparams_local) + model = self.model_factory(mname, self.problem_type, mparams_local, n_outputs) # fit: sklearn models have fit(X, y). For MLP, early_stopping is # handled internally if enabled via params in the YAML config. 
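[Editor's note, not part of the patch: to make the multi-output wrapping introduced above concrete, here is a minimal usage sketch against the patched `ModelZoo` API; the synthetic arrays and parameter values are illustrative only.]

```python
import numpy as np
from sklearn.multioutput import MultiOutputClassifier

from ecosci.models import ModelZoo

X = np.random.randn(50, 4)
Y = np.random.randint(0, 2, size=(50, 2))  # two binary targets

# Logistic regression has no native multi-output support, so get_model wraps it.
clf = ModelZoo.get_model("logistic", "classification", {"random_state": 0}, n_outputs=2)
assert isinstance(clf, MultiOutputClassifier)

# Random forest predicts multiple outputs natively and is returned unwrapped.
rf = ModelZoo.get_model("random_forest", "classification", {"random_state": 0}, n_outputs=2)
assert not isinstance(rf, MultiOutputClassifier)

clf.fit(X, Y)
print(clf.predict(X[:3]).shape)  # (3, 2): one prediction column per label
```
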
@@ -130,3 +138,327 @@ def run(self, cfg: Dict[str, Any], X_train, X_val, X_test, y_train, y_val, y_tes all_results[mname] = model_results return all_results + + def run_cv(self, cfg: Dict[str, Any], fold_data_list): + """Run k-fold cross-validation training. + + Parameters + ---------- + cfg : Dict[str, Any] + Configuration dictionary + fold_data_list : list of tuples + List of (X_train, X_val, X_test, y_train, y_val, y_test, fold_id) tuples + + Returns + ------- + dict + Results organized as {model_name: [run_results_with_fold_info]} + """ + models_cfg = cfg.get("models", []) + + seeds = cfg.get("training", {}).get("seeds") + repetitions = cfg.get("training", {}).get("repetitions", 1) + base_seed = cfg.get("training", {}).get("random_seed", 0) + + seeds_list = [] + if seeds: + seeds_list = seeds + else: + seeds_list = [base_seed + i for i in range(repetitions)] + + all_results = {} + + # Train each model type + for model_idx, model_cfg in enumerate(models_cfg): + mname = model_cfg.get("name", "mlp") + mparams = model_cfg.get("params", {}) + + model_results = [] + + # Iterate over each fold + for fold_idx, (X_train, X_val, X_test, y_train, y_val, y_test, fold_id) in enumerate(fold_data_list): + # Determine number of outputs + if len(y_train.shape) == 1: + n_outputs = 1 + else: + n_outputs = y_train.shape[1] + + # Run each seed for this fold + for s in tqdm(seeds_list, desc=f"{mname.upper()} - Fold {fold_id+1}/{len(fold_data_list)}", unit="seed"): + self._set_seed(s) + + # ensure model has random_state where appropriate + mparams_local = dict(mparams) + if "random_state" in mparams_local or True: + mparams_local.setdefault("random_state", s) + + model = self.model_factory(mname, self.problem_type, mparams_local, n_outputs) + + # fit + if ( + X_val is not None + and hasattr(model, "partial_fit") + and cfg.get("training", {}).get("use_partial_fit", False) + ): + model.partial_fit(X_train, y_train) + else: + model.fit(X_train, y_train) + + y_pred = model.predict(X_test) + y_proba = None + + # Only try to get probabilities for classification problems + if self.problem_type == "classification" and hasattr(model, "predict_proba"): + try: + y_proba = model.predict_proba(X_test) + except Exception as e: + pass + + # save model for this run in model-specific subfolder + model_dir = os.path.join(self.output_dir, mname, f"fold{fold_id}") + os.makedirs(model_dir, exist_ok=True) + fname = os.path.join(model_dir, f"model_{mname}_fold{fold_id}_seed{s}.joblib") + joblib.dump(model, fname) + + model_results.append( + { + "seed": s, + "fold": fold_id, + "model_name": mname, + "model_path": fname, + "y_pred": y_pred, + "y_proba": y_proba, + "y_test": y_test, # Include y_test for fold-specific evaluation + "X_test": X_test, # Include X_test for permutation importance + } + ) + + all_results[mname] = model_results + + return all_results + + def run_with_tuning( + self, + cfg: Dict[str, Any], + X_train: np.ndarray, + X_val: np.ndarray, + X_test: np.ndarray, + y_train: np.ndarray, + y_val: np.ndarray, + y_test: np.ndarray, + group_assignments: Optional[Dict] = None, + groups_train: Optional[np.ndarray] = None, + groups_val: Optional[np.ndarray] = None, + ): + """Run training with hyperparameter tuning. + + This method performs hyperparameter optimization on the training + validation + sets, then evaluates the best model on the held-out test set. Multiple seeds + are used for stable results. 
+ + Parameters + ---------- + cfg : Dict[str, Any] + Configuration dictionary with tuning settings + X_train : np.ndarray + Training features + X_val : np.ndarray + Validation features (used during hyperparameter tuning) + X_test : np.ndarray + Test features (held out for final evaluation) + y_train : np.ndarray + Training labels + y_val : np.ndarray + Validation labels + y_test : np.ndarray + Test labels + group_assignments : Dict, optional + Dictionary mapping split names to group IDs (for documentation) + groups_train : np.ndarray, optional + Group IDs for training samples + groups_val : np.ndarray, optional + Group IDs for validation samples + + Returns + ------- + dict + Results organized as {model_name: [run_results_with_tuning_info]} + """ + from ecosci.hyperopt import HyperparameterTuner + + models_cfg = cfg.get("models", []) + tuning_cfg = cfg.get("tuning", {}) + + # Get seeds for stable results + seeds = cfg.get("training", {}).get("seeds") + repetitions = cfg.get("training", {}).get("repetitions", 1) + base_seed = cfg.get("training", {}).get("random_seed", 0) + + if seeds: + seeds_list = seeds + else: + seeds_list = [base_seed + i for i in range(repetitions)] + + # Tuning configuration + search_method = tuning_cfg.get("search_method", "random") + n_iter = tuning_cfg.get("n_iter", 50) + cv_folds = tuning_cfg.get("cv_folds", 3) + scoring = tuning_cfg.get("scoring") + n_jobs = tuning_cfg.get("n_jobs", -1) + verbose = tuning_cfg.get("verbose", 1) + + all_results = {} + + # Train each model type + for model_idx, model_cfg in enumerate(models_cfg): + mname = model_cfg.get("name", "mlp") + + # Get hyperparameter search space from config + param_space = model_cfg.get("param_space") + + model_results = [] + + print(f"\n{'='*60}") + print(f"Tuning {mname.upper()}") + print(f"{'='*60}") + print(f"Metric: {scoring} ({'lower is better' if 'neg_' in str(scoring) else 'higher is better'})") + print(f"Search method: {search_method}") + print(f"CV folds: {cv_folds}") + if search_method == "random": + print(f"Iterations: {n_iter}") + + # Combine train and val for tuning (GridSearchCV will split internally) + X_train_val = np.vstack([X_train, X_val]) + y_train_val = np.concatenate([y_train, y_val]) + + # Combine groups for grouped CV during tuning (if groups are provided) + groups_train_val = None + if groups_train is not None and groups_val is not None: + groups_train_val = np.concatenate([groups_train, groups_val]) + + # Track best params across seeds + best_params_per_seed = [] + + for seed_idx, s in enumerate(tqdm(seeds_list, desc=f"Seeds", unit="seed")): + self._set_seed(s) + + # Create tuner for this seed + tuner = HyperparameterTuner( + problem_type=self.problem_type, + n_iter=n_iter, + cv=cv_folds, + scoring=scoring, + search_method=search_method, + random_state=s, + n_jobs=n_jobs, + verbose=verbose if seed_idx == 0 else 0, # Only verbose for first seed + ) + + # Tune hyperparameters + print(f"\n Seed {s}: Tuning hyperparameters...") + best_model, tuning_results = tuner.tune_model( + mname, X_train_val, y_train_val, param_space, groups=groups_train_val + ) + + best_params = tuning_results['best_params'] + best_score = tuning_results['best_score'] + best_params_per_seed.append(best_params) + + # Make the score more interpretable + if self.problem_type == "regression": + # For regression, the score is typically neg_mean_squared_error + if best_score < 0: + mse = -best_score + rmse = np.sqrt(mse) + print(f" Seed {s}: Best CV score: MSE={mse:.4f}, RMSE={rmse:.4f} (lower is better)") + else: + 
# If positive (e.g., R²), show as-is + print(f" Seed {s}: Best CV score: {best_score:.4f}") + else: + # For classification, usually accuracy or similar (higher is better) + print(f" Seed {s}: Best CV score: {best_score:.4f}") + print(f" Seed {s}: Best params: {best_params}") + + # Retrain on train set only (optional, but good practice) + # The best_model is already trained on train_val, but we can + # retrain on just train to match the typical workflow + from ecosci.models import ModelZoo + + # Determine number of outputs + if len(y_train.shape) == 1: + n_outputs = 1 + else: + n_outputs = y_train.shape[1] + + # Create new model with best params and train on train set + final_model = ModelZoo.get_model( + mname, + self.problem_type, + best_params, + n_outputs + ) + final_model.fit(X_train, y_train) + + # Evaluate on validation and test sets + y_val_pred = final_model.predict(X_val) + y_test_pred = final_model.predict(X_test) + + y_val_proba = None + y_test_proba = None + if self.problem_type == "classification" and hasattr(final_model, "predict_proba"): + try: + y_val_proba = final_model.predict_proba(X_val) + y_test_proba = final_model.predict_proba(X_test) + except Exception as e: + pass + + # Save model + model_dir = os.path.join(self.output_dir, mname) + os.makedirs(model_dir, exist_ok=True) + model_fname = os.path.join(model_dir, f"model_{mname}_seed{s}.joblib") + joblib.dump(final_model, model_fname) + + # Save tuning results + tuning_fname = os.path.join(model_dir, f"tuning_results_{mname}_seed{s}.json") + tuning_summary = { + 'best_params': best_params, + 'best_cv_score': float(best_score), + 'seed': s, + 'cv_folds': cv_folds, + 'search_method': search_method, + } + with open(tuning_fname, 'w') as f: + json.dump(tuning_summary, f, indent=2) + + model_results.append({ + 'seed': s, + 'model_name': mname, + 'model_path': model_fname, + 'best_params': best_params, + 'best_cv_score': best_score, + 'y_val_pred': y_val_pred, + 'y_val_proba': y_val_proba, + 'y_test_pred': y_test_pred, + 'y_test_proba': y_test_proba, + 'tuning_results_path': tuning_fname, + }) + + # Save summary of best params across all seeds + summary_path = os.path.join(model_dir, f"tuning_summary_{mname}.json") + summary = { + 'model_name': mname, + 'n_seeds': len(seeds_list), + 'seeds': seeds_list, + 'best_params_per_seed': best_params_per_seed, + 'search_method': search_method, + 'cv_folds': cv_folds, + 'group_assignments': group_assignments, + } + with open(summary_path, 'w') as f: + json.dump(summary, f, indent=2) + + print(f"\n Saved tuning summary to: {summary_path}") + + all_results[mname] = model_results + + return all_results diff --git a/run.py b/run.py index 9525f51..149472d 100644 --- a/run.py +++ b/run.py @@ -9,7 +9,7 @@ from ecosci.data import CSVDataLoader from ecosci.models import ModelZoo from ecosci.trainer import Trainer -from ecosci.eval import evaluate_and_report +from ecosci.eval import evaluate_and_report, evaluate_and_report_cv parser = argparse.ArgumentParser(description="Train a model using a YAML config.") parser.add_argument("--config", required=True, help="Path to YAML config file") @@ -26,18 +26,21 @@ # Load and prepare data data_cfg = cfg.get("data", {}) problem_type = cfg.get("problem_type", "classification") +cv_group_column = data_cfg.get("cv_group_column") + loader = CSVDataLoader( path=data_cfg.get("path"), features=data_cfg.get("features"), label=data_cfg.get("label"), + labels=data_cfg.get("labels"), test_size=data_cfg.get("test_size", 0.2), val_size=data_cfg.get("val_size", 0.2), 
random_state=data_cfg.get("random_state", 0), scaling=data_cfg.get("scaling", "standard"), impute_strategy=data_cfg.get("impute_strategy", "mean"), problem_type=problem_type, + cv_group_column=cv_group_column, ) -X_train, X_val, X_test, y_train, y_val, y_test = loader.prepare() # Get output directory output_dir = cfg.get("output", {}).get("dir", "outputs") @@ -48,26 +51,107 @@ problem_type=cfg.get("problem_type", "classification"), output_dir=output_dir, ) -results = trainer.run(cfg, X_train, X_val, X_test, y_train, y_val, y_test) -# Evaluate -summary = evaluate_and_report( - results, - y_test, - output_dir=output_dir, - problem_type=problem_type, -) +# Check if hyperparameter tuning is enabled +tuning_enabled = cfg.get("tuning", {}).get("enabled", False) + +# Determine which mode to use +if tuning_enabled and cv_group_column is not None: + # Hyperparameter tuning mode with grouped train/val/test splits + n_train_groups = data_cfg.get("n_train_groups", 4) + n_val_groups = data_cfg.get("n_val_groups", 2) + n_test_groups = data_cfg.get("n_test_groups", 2) + + print(f"\n{'='*70}") + print(f"Hyperparameter Tuning Mode") + print(f"{'='*70}") + print(f"Using grouped train/val/test splits with group column: {cv_group_column}") + print(f" Train groups: {n_train_groups}") + print(f" Val groups: {n_val_groups}") + print(f" Test groups: {n_test_groups}") + print(f"{'='*70}\n") + + # Prepare grouped splits + (X_train, X_val, X_test, y_train, y_val, y_test, group_assignments, + groups_train, groups_val, groups_test) = \ + loader.prepare_grouped_splits(n_train_groups, n_val_groups, n_test_groups) + + # Run training with hyperparameter tuning + results = trainer.run_with_tuning( + cfg, X_train, X_val, X_test, y_train, y_val, y_test, group_assignments, + groups_train, groups_val + ) + + # Evaluate on both validation and test sets + # For tuning mode, we want to see performance on both val and test + from ecosci.eval import evaluate_tuning_results + summary = evaluate_tuning_results( + results, + y_val, + y_test, + output_dir=output_dir, + problem_type=problem_type, + label_names=loader.labels if hasattr(loader, 'labels') else None, + feature_names=loader.processed_feature_names if hasattr(loader, 'processed_feature_names') else None, + X_val=X_val, + X_test=X_test, + ) + +elif cv_group_column is not None: + # K-fold cross-validation mode (no tuning) + print(f"Running k-fold cross-validation using group column: {cv_group_column}") + fold_data_list = loader.prepare_cv_folds() + results = trainer.run_cv(cfg, fold_data_list) + + # Evaluate with CV-specific reporting + summary = evaluate_and_report_cv( + results, + output_dir=output_dir, + problem_type=problem_type, + label_names=loader.labels if hasattr(loader, 'labels') else None, + feature_names=loader.processed_feature_names if hasattr(loader, 'processed_feature_names') else None, + ) +else: + # Regular train/test split (no tuning, no CV) + X_train, X_val, X_test, y_train, y_val, y_test = loader.prepare() + results = trainer.run(cfg, X_train, X_val, X_test, y_train, y_val, y_test) + + # Evaluate + summary = evaluate_and_report( + results, + y_test, + output_dir=output_dir, + problem_type=problem_type, + label_names=loader.labels if hasattr(loader, 'labels') else None, + feature_names=loader.processed_feature_names if hasattr(loader, 'processed_feature_names') else None, + X_test=X_test, + ) # Print quick summary -if problem_type == "regression": - r2s = [r.get("r2") for r in summary if "r2" in r] - if r2s: - print(f"\nMean R²: {np.mean(r2s):.3f}") - rmses 
= [r.get("rmse") for r in summary if "rmse" in r] - if rmses: - print(f"Mean RMSE: {np.mean(rmses):.3f}") +if tuning_enabled and cv_group_column is not None: + # For tuning mode + print(f"\n{'='*70}") + print(f"Hyperparameter Tuning Complete!") + print(f"{'='*70}") + print(f"Results saved to: {output_dir}/") + print(f" - Best hyperparameters per seed") + print(f" - Validation and test set predictions") + print(f" - Model checkpoints") + print(f"{'='*70}\n") +elif cv_group_column is not None: + # For CV, summary is a dict of DataFrames + print(f"\nDone. See {output_dir}/ for full CV reports and plots.") else: - accs = [r.get("accuracy") for r in summary if "accuracy" in r] - if accs: - print(f"\nMean accuracy: {np.mean(accs):.3f}") -print(f"Done. See {output_dir}/ for full report and plots.") + # For regular split, summary is a list of dicts + if problem_type == "regression": + r2s = [r.get("r2") for r in summary if "r2" in r] + if r2s: + print(f"\nMean R²: {np.mean(r2s):.3f}") + rmses = [r.get("rmse") for r in summary if "rmse" in r] + if rmses: + print(f"Mean RMSE: {np.mean(rmses):.3f}") + else: + accs = [r.get("accuracy") for r in summary if "accuracy" in r] + if accs: + print(f"\nMean accuracy: {np.mean(accs):.3f}") + print(f"Done. See {output_dir}/ for full report and plots.") From cf4df3f6612921e2eea88e10428e2f5b2e50a6a5 Mon Sep 17 00:00:00 2001 From: Ysobel Date: Mon, 24 Nov 2025 10:35:04 +1100 Subject: [PATCH 02/15] update tests --- tests/test_data.py | 74 ++++++++++++++++++++++ tests/test_eval.py | 84 +++++++++++++++++++++++++ tests/test_integration.py | 129 ++++++++++++++++++++++++++++++++++++++ tests/test_models.py | 57 +++++++++++++++++ 4 files changed, 344 insertions(+) diff --git a/tests/test_data.py b/tests/test_data.py index 3efe992..8facab5 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -80,3 +80,77 @@ def test_random_state_makes_splits_reproducible(sample_csv): np.testing.assert_array_equal(X_train1, X_train2) np.testing.assert_array_equal(y_train1, y_train2) + + +@pytest.fixture +def multi_label_csv(): + """Create a temporary CSV with multiple labels.""" + data = pd.DataFrame( + { + "num1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0] * 5, + "num2": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] * 5, + "label1": [0, 1, 0, 1, 0, 1, 0, 1, 0, 1] * 5, + "label2": [1, 0, 1, 0, 1, 0, 1, 0, 1, 0] * 5, + } + ) + with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f: + data.to_csv(f, index=False) + return f.name + + +def test_multi_label_loader_creates_splits(multi_label_csv): + """Test that data loader handles multiple labels.""" + loader = CSVDataLoader( + path=multi_label_csv, + features=["num1", "num2"], + labels=["label1", "label2"], + test_size=0.2, + val_size=0.2, + random_state=42, + ) + X_train, X_val, X_test, y_train, y_val, y_test = loader.prepare() + + # Verify we got all the data split properly + total_samples = len(y_train) + len(y_val) + len(y_test) + assert total_samples == 50 + + # Check that y has 2 outputs + assert y_train.shape[1] == 2 + assert y_test.shape[1] == 2 + assert y_val.shape[1] == 2 + + +def test_single_label_via_labels_param(sample_csv): + """Test that passing single label via labels param works.""" + loader = CSVDataLoader( + path=sample_csv, + features=["num1", "num2"], + labels=["label"], # Single label as list + test_size=0.2, + val_size=0.2, + random_state=42, + ) + X_train, X_val, X_test, y_train, y_val, y_test = loader.prepare() + + # Should work like single label (1D array) + assert 
len(y_train.shape) == 1 or (len(y_train.shape) == 2 and y_train.shape[1] == 1) + assert len(X_train) == len(y_train) + + +def test_multi_label_regression(multi_label_csv): + """Test multi-label with regression problem type.""" + loader = CSVDataLoader( + path=multi_label_csv, + features=["num1", "num2"], + labels=["label1", "label2"], + test_size=0.2, + val_size=0.2, + random_state=42, + problem_type="regression", + ) + X_train, X_val, X_test, y_train, y_val, y_test = loader.prepare() + + # Verify shape + assert y_train.shape[1] == 2 + assert not np.isnan(X_train).any() + assert not np.isnan(y_train).any() diff --git a/tests/test_eval.py b/tests/test_eval.py index 066d9e9..bdd6424 100644 --- a/tests/test_eval.py +++ b/tests/test_eval.py @@ -212,3 +212,87 @@ def test_evaluate_and_report_regression_multiple_seeds(temp_output_dir): assert all("r2" in m for m in summary) # Report is now saved in model-specific subfolder assert os.path.exists(os.path.join(temp_output_dir, "model", "report_model.json")) + + +# Multi-output tests +def test_multi_output_classification_metrics(): + """Test multi-output classification metrics computation.""" + from ecosci.eval import compute_multi_output_classification_metrics + + # Multi-output: predicting 2 outputs + y_true = np.array([[0, 1], [1, 0], [0, 1], [1, 0], [0, 1], [1, 0]]) + y_pred = np.array([[0, 1], [1, 0], [0, 0], [1, 1], [0, 1], [1, 0]]) + + metrics = compute_multi_output_classification_metrics(y_true, y_pred) + + assert "accuracy_mean" in metrics + assert "balanced_accuracy_mean" in metrics + assert "f1_mean" in metrics + assert "n_outputs" in metrics + assert metrics["n_outputs"] == 2 + assert "per_output" in metrics + assert len(metrics["per_output"]) == 2 + + +def test_multi_output_regression_metrics(): + """Test multi-output regression metrics computation.""" + from ecosci.eval import compute_multi_output_regression_metrics + + # Multi-output: predicting 3 outputs + y_true = np.array([[1.0, 2.0, 3.0], [2.0, 3.0, 4.0], [3.0, 4.0, 5.0], [4.0, 5.0, 6.0]]) + y_pred = np.array([[1.1, 2.1, 3.1], [2.1, 3.1, 4.1], [2.9, 3.9, 4.9], [4.1, 5.1, 6.1]]) + + metrics = compute_multi_output_regression_metrics(y_true, y_pred) + + assert "mse_mean" in metrics + assert "rmse_mean" in metrics + assert "mae_mean" in metrics + assert "r2_mean" in metrics + assert "n_outputs" in metrics + assert metrics["n_outputs"] == 3 + assert "per_output" in metrics + assert len(metrics["per_output"]) == 3 + + +def test_evaluate_and_report_multi_output_classification(temp_output_dir): + """Test evaluation for multi-output classification.""" + y_true = np.array([[0, 1], [1, 0], [0, 1], [1, 0]] * 10) + + results = [ + { + "seed": 42, + "y_pred": np.array([[0, 1], [1, 0], [0, 0], [1, 1]] * 10), + "y_proba": None, + "model_name": "model", + } + ] + + summary = evaluate_and_report(results, y_true, temp_output_dir, problem_type="classification") + + assert isinstance(summary, list) + assert len(summary) > 0 + assert "accuracy_mean" in summary[0] + assert "n_outputs" in summary[0] + + +def test_evaluate_and_report_multi_output_regression(temp_output_dir): + """Test evaluation for multi-output regression.""" + np.random.seed(42) + y_true = np.random.randn(30, 3) # 3 outputs + + results = [ + { + "seed": 42, + "y_pred": y_true + np.random.randn(30, 3) * 0.1, + "y_proba": None, + "model_name": "model", + } + ] + + summary = evaluate_and_report(results, y_true, temp_output_dir, problem_type="regression") + + assert isinstance(summary, list) + assert len(summary) > 0 + assert "mse_mean" in 
summary[0] + assert "n_outputs" in summary[0] + assert summary[0]["n_outputs"] == 3 diff --git a/tests/test_integration.py b/tests/test_integration.py index 1e297c0..b18be70 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -315,3 +315,132 @@ def test_regression_pipeline_svm(regression_csv, temp_output_dir): assert len(results["svm"]) == 1 assert "y_pred" in results["svm"][0] assert len(results["svm"][0]["y_pred"]) == len(y_test) + + +# Multi-output integration tests +@pytest.fixture +def multi_output_csv(): + """Create a temporary CSV with multiple labels.""" + np.random.seed(42) + n = 100 + data = { + "num1": np.random.randn(n), + "num2": np.random.randn(n) + 1, + "label1": np.random.choice([0, 1], n), + "label2": np.random.choice([0, 1, 2], n), + } + df = pd.DataFrame(data) + with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f: + df.to_csv(f, index=False) + return f.name + + +def test_multi_output_classification_pipeline(multi_output_csv, temp_output_dir): + """Test complete pipeline for multi-output classification.""" + loader = CSVDataLoader( + path=multi_output_csv, + features=["num1", "num2"], + labels=["label1", "label2"], + test_size=0.2, + val_size=0.2, + random_state=42, + problem_type="classification", + ) + X_train, X_val, X_test, y_train, y_val, y_test = loader.prepare() + + # Verify multi-output shape + assert y_train.shape[1] == 2 + assert y_test.shape[1] == 2 + + cfg = { + "problem_type": "classification", + "models": [ + {"name": "random_forest", "params": {"n_estimators": 50, "random_state": 42}}, + {"name": "logistic", "params": {"max_iter": 500, "random_state": 42}}, + ], + "training": {"repetitions": 2, "random_seed": 0}, + "output_dir": temp_output_dir, + } + + trainer = Trainer( + ModelZoo.get_model, problem_type="classification", output_dir=temp_output_dir + ) + results = trainer.run(cfg, X_train, X_val, X_test, y_train, y_val, y_test) + + # Verify predictions have correct shape + assert results["random_forest"][0]["y_pred"].shape == y_test.shape + assert results["logistic"][0]["y_pred"].shape == y_test.shape + + summary = evaluate_and_report( + results, y_test, temp_output_dir, problem_type="classification" + ) + + # Check that we got multi-output metrics + assert "accuracy_mean" in summary[0] + assert "n_outputs" in summary[0] + assert summary[0]["n_outputs"] == 2 + + +def test_multi_output_regression_pipeline(temp_output_dir): + """Test complete pipeline for multi-output regression.""" + np.random.seed(42) + n = 100 + data = { + "num1": np.random.randn(n), + "num2": np.random.randn(n) + 1, + "target1": np.random.randn(n) * 2, + "target2": np.random.randn(n) * 3 + 5, + } + df = pd.DataFrame(data) + + with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f: + df.to_csv(f, index=False) + csv_path = f.name + + loader = CSVDataLoader( + path=csv_path, + features=["num1", "num2"], + labels=["target1", "target2"], + test_size=0.2, + val_size=0.2, + random_state=42, + problem_type="regression", + ) + X_train, X_val, X_test, y_train, y_val, y_test = loader.prepare() + + # Verify multi-output shape + assert y_train.shape[1] == 2 + assert y_test.shape[1] == 2 + + cfg = { + "problem_type": "regression", + "models": [ + {"name": "random_forest", "params": {"n_estimators": 50, "random_state": 42}}, + {"name": "mlp", "params": {"hidden_layer_sizes": [16], "max_iter": 200, "random_state": 42}}, + ], + "training": {"repetitions": 2, "random_seed": 0}, + "output_dir": temp_output_dir, + } + + trainer = Trainer( 
+ ModelZoo.get_model, problem_type="regression", output_dir=temp_output_dir + ) + + import warnings + from sklearn.exceptions import ConvergenceWarning + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=ConvergenceWarning) + results = trainer.run(cfg, X_train, X_val, X_test, y_train, y_val, y_test) + + # Verify predictions have correct shape + assert results["random_forest"][0]["y_pred"].shape == y_test.shape + assert results["mlp"][0]["y_pred"].shape == y_test.shape + + summary = evaluate_and_report( + results, y_test, temp_output_dir, problem_type="regression" + ) + + # Check that we got multi-output metrics + assert "mse_mean" in summary[0] + assert "n_outputs" in summary[0] + assert summary[0]["n_outputs"] == 2 diff --git a/tests/test_models.py b/tests/test_models.py index 794e2ed..4e6d621 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -141,3 +141,60 @@ def test_svm_regression(): params = model.get_params() assert params["C"] == 1.0 assert params["kernel"] == "rbf" + + +# Multi-output tests +def test_multi_output_classification_wrapping(): + """Test that models are wrapped for multi-output classification.""" + from sklearn.multioutput import MultiOutputClassifier + + # Logistic regression should be wrapped + model = ModelZoo.get_model( + "logistic", problem_type="classification", params={"random_state": 42}, n_outputs=2 + ) + assert isinstance(model, MultiOutputClassifier) + + # Random Forest should NOT be wrapped (native support) + model_rf = ModelZoo.get_model( + "random_forest", problem_type="classification", params={"random_state": 42}, n_outputs=2 + ) + assert not isinstance(model_rf, MultiOutputClassifier) + + +def test_multi_output_regression_wrapping(): + """Test that models are wrapped for multi-output regression.""" + from sklearn.multioutput import MultiOutputRegressor + + # Linear regression should be wrapped + model = ModelZoo.get_model( + "linear", problem_type="regression", params={}, n_outputs=2 + ) + assert isinstance(model, MultiOutputRegressor) + + # Random Forest should NOT be wrapped (native support) + model_rf = ModelZoo.get_model( + "random_forest", problem_type="regression", params={"random_state": 42}, n_outputs=2 + ) + assert not isinstance(model_rf, MultiOutputRegressor) + + # MLP Regressor should NOT be wrapped (native support) + model_mlp = ModelZoo.get_model( + "mlp", problem_type="regression", params={"random_state": 42}, n_outputs=2 + ) + assert not isinstance(model_mlp, MultiOutputRegressor) + + +def test_single_output_not_wrapped(): + """Test that single output models are not wrapped.""" + from sklearn.multioutput import MultiOutputClassifier, MultiOutputRegressor + + # Single output should not be wrapped + model_cls = ModelZoo.get_model( + "logistic", problem_type="classification", params={"random_state": 42}, n_outputs=1 + ) + assert not isinstance(model_cls, MultiOutputClassifier) + + model_reg = ModelZoo.get_model( + "linear", problem_type="regression", params={}, n_outputs=1 + ) + assert not isinstance(model_reg, MultiOutputRegressor) From 40c77717f7a69b3994fb2b0c27aab92fcad3024a Mon Sep 17 00:00:00 2001 From: Ysobel Date: Mon, 24 Nov 2025 10:35:14 +1100 Subject: [PATCH 03/15] add hyperparam tuning test --- test_tuning.py | 290 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 290 insertions(+) create mode 100755 test_tuning.py diff --git a/test_tuning.py b/test_tuning.py new file mode 100755 index 0000000..10d5629 --- /dev/null +++ b/test_tuning.py @@ -0,0 +1,290 @@ +#!/usr/bin/env 
python3 +"""Quick test script to validate hyperparameter tuning functionality.""" + +import numpy as np +import pandas as pd +import tempfile +import os +import sys + +def create_test_data(): + """Create a small synthetic dataset with groups for testing.""" + np.random.seed(42) + + n_samples = 200 + n_features = 5 + n_groups = 8 + + # Create features + X = np.random.randn(n_samples, n_features) + + # Create groups (e.g., patches) + groups = np.repeat(range(n_groups), n_samples // n_groups) + + # Create target with some signal + y = X[:, 0] * 2 + X[:, 1] - X[:, 2] * 0.5 + np.random.randn(n_samples) * 0.5 + + # Create DataFrame + df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(n_features)]) + df['target'] = y + df['group_id'] = groups + + return df + +def test_grouped_splits(): + """Test grouped train/val/test splits.""" + print("\n" + "="*60) + print("Testing Grouped Splits") + print("="*60) + + from ecosci.data import CSVDataLoader + + # Create test data + df = create_test_data() + + # Save to temporary file + with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f: + df.to_csv(f.name, index=False) + temp_file = f.name + + try: + # Create loader + loader = CSVDataLoader( + path=temp_file, + features=[f'feature_{i}' for i in range(5)], + labels=['target'], + cv_group_column='group_id', + problem_type='regression', + ) + + # Test grouped splits + (X_train, X_val, X_test, y_train, y_val, y_test, group_assignments) = \ + loader.prepare_grouped_splits(n_train_groups=4, n_val_groups=2, n_test_groups=2) + + print(f"\n✓ Successfully created grouped splits") + print(f" Train shape: {X_train.shape}") + print(f" Val shape: {X_val.shape}") + print(f" Test shape: {X_test.shape}") + print(f" Group assignments: {group_assignments}") + + # Verify no overlap + assert X_train.shape[0] > 0, "Train set is empty" + assert X_val.shape[0] > 0, "Val set is empty" + assert X_test.shape[0] > 0, "Test set is empty" + + print("✓ All splits are non-empty") + + return True + except Exception as e: + print(f"✗ Error: {e}") + import traceback + traceback.print_exc() + return False + finally: + os.unlink(temp_file) + +def test_hyperparameter_tuner(): + """Test hyperparameter tuner.""" + print("\n" + "="*60) + print("Testing Hyperparameter Tuner") + print("="*60) + + from ecosci.hyperopt import HyperparameterTuner + + # Create simple test data + np.random.seed(42) + X_train = np.random.randn(100, 5) + y_train = X_train[:, 0] * 2 + np.random.randn(100) * 0.5 + + try: + # Test MLP tuning + print("\nTesting MLP tuning...") + tuner = HyperparameterTuner( + problem_type='regression', + n_iter=3, # Small for testing + cv=2, # Small for testing + verbose=0, + ) + + param_space = { + 'hidden_layer_sizes': [(16,), (32,)], + 'alpha': [0.001, 0.01], + 'max_iter': [100], + } + + best_model, results = tuner.tune_mlp(X_train, y_train, param_space) + + print(f"✓ MLP tuning successful") + print(f" Best params: {results['best_params']}") + print(f" Best score: {results['best_score']:.4f}") + + # Test Random Forest tuning + print("\nTesting Random Forest tuning...") + param_space = { + 'n_estimators': [10, 20], + 'max_depth': [3, 5], + } + + best_model, results = tuner.tune_random_forest(X_train, y_train, param_space) + + print(f"✓ Random Forest tuning successful") + print(f" Best params: {results['best_params']}") + print(f" Best score: {results['best_score']:.4f}") + + return True + except Exception as e: + print(f"✗ Error: {e}") + import traceback + traceback.print_exc() + return False + +def 
test_integration(): + """Test full integration with config.""" + print("\n" + "="*60) + print("Testing Full Integration") + print("="*60) + + # Create test data + df = create_test_data() + + # Save to temporary file + with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f: + df.to_csv(f.name, index=False) + temp_file = f.name + + # Create temporary output directory + temp_output_dir = tempfile.mkdtemp() + + try: + # Create a minimal config + config = { + 'problem_type': 'regression', + 'data': { + 'path': temp_file, + 'features': [f'feature_{i}' for i in range(5)], + 'labels': ['target'], + 'cv_group_column': 'group_id', + 'n_train_groups': 4, + 'n_val_groups': 2, + 'n_test_groups': 2, + 'scaling': 'standard', + }, + 'tuning': { + 'enabled': True, + 'search_method': 'random', + 'n_iter': 2, # Very small for testing + 'cv_folds': 2, + 'verbose': 0, + }, + 'models': [ + { + 'name': 'linear', + 'param_space': { + 'alpha': [0.1, 1.0], + } + } + ], + 'training': { + 'repetitions': 2, # Small for testing + 'random_seed': 42, + }, + 'output': { + 'dir': temp_output_dir, + } + } + + from ecosci.data import CSVDataLoader + from ecosci.trainer import Trainer + from ecosci.models import ModelZoo + + # Load data - extract n_*_groups before passing to loader + n_train_groups = config['data'].pop('n_train_groups') + n_val_groups = config['data'].pop('n_val_groups') + n_test_groups = config['data'].pop('n_test_groups') + + loader = CSVDataLoader(**config['data'], problem_type=config['problem_type']) + (X_train, X_val, X_test, y_train, y_val, y_test, group_assignments) = \ + loader.prepare_grouped_splits( + n_train_groups=n_train_groups, + n_val_groups=n_val_groups, + n_test_groups=n_test_groups + ) + + print(f"✓ Data loaded successfully") + + # Create trainer + trainer = Trainer( + ModelZoo.get_model, + problem_type=config['problem_type'], + output_dir=config['output']['dir'], + ) + + # Run training with tuning + print("\nRunning training with tuning...") + results = trainer.run_with_tuning( + config, X_train, X_val, X_test, y_train, y_val, y_test, group_assignments + ) + + print(f"✓ Training completed successfully") + print(f" Models trained: {list(results.keys())}") + + # Check outputs + for model_name in results.keys(): + model_dir = os.path.join(temp_output_dir, model_name) + assert os.path.exists(model_dir), f"Model directory not found: {model_dir}" + + # Check for key files + files = os.listdir(model_dir) + assert any('model_' in f for f in files), "No model files found" + assert any('tuning_results_' in f for f in files), "No tuning results found" + + print(f"✓ All expected output files created") + + return True + except Exception as e: + print(f"✗ Error: {e}") + import traceback + traceback.print_exc() + return False + finally: + os.unlink(temp_file) + # Cleanup temp output dir + import shutil + shutil.rmtree(temp_output_dir, ignore_errors=True) + +def main(): + """Run all tests.""" + print("\n" + "="*60) + print("HYPERPARAMETER TUNING VALIDATION TESTS") + print("="*60) + + tests = [ + ("Grouped Splits", test_grouped_splits), + ("Hyperparameter Tuner", test_hyperparameter_tuner), + ("Full Integration", test_integration), + ] + + results = {} + for test_name, test_func in tests: + results[test_name] = test_func() + + # Summary + print("\n" + "="*60) + print("TEST SUMMARY") + print("="*60) + + for test_name, passed in results.items(): + status = "✓ PASSED" if passed else "✗ FAILED" + print(f"{test_name}: {status}") + + all_passed = all(results.values()) + + if all_passed: + 
print("\n✓ All tests passed!") + return 0 + else: + print("\n✗ Some tests failed") + return 1 + +if __name__ == "__main__": + sys.exit(main()) From ad49102ac0f8ad3d1ce64f9d163046c3015fa210 Mon Sep 17 00:00:00 2001 From: Ysobel Date: Mon, 24 Nov 2025 12:38:49 +1100 Subject: [PATCH 04/15] update eval --- ecosci/__init__.py | 2 +- ecosci/eval.py | 1472 ----------------------- ecosci/evaluation/__init__.py | 59 + ecosci/evaluation/feature_importance.py | 341 ++++++ ecosci/evaluation/metrics.py | 239 ++++ ecosci/evaluation/plotting.py | 427 +++++++ ecosci/evaluation/reporting.py | 766 ++++++++++++ 7 files changed, 1833 insertions(+), 1473 deletions(-) delete mode 100644 ecosci/eval.py create mode 100644 ecosci/evaluation/__init__.py create mode 100644 ecosci/evaluation/feature_importance.py create mode 100644 ecosci/evaluation/metrics.py create mode 100644 ecosci/evaluation/plotting.py create mode 100644 ecosci/evaluation/reporting.py diff --git a/ecosci/__init__.py b/ecosci/__init__.py index 5f8a414..08d3f21 100644 --- a/ecosci/__init__.py +++ b/ecosci/__init__.py @@ -4,6 +4,6 @@ from .data import CSVDataLoader from .models import ModelZoo from .trainer import Trainer -from .eval import evaluate_and_report +from .evaluation import evaluate_and_report __all__ = ["load_config", "CSVDataLoader", "ModelZoo", "Trainer", "evaluate_and_report"] diff --git a/ecosci/eval.py b/ecosci/eval.py deleted file mode 100644 index 427c2e5..0000000 --- a/ecosci/eval.py +++ /dev/null @@ -1,1472 +0,0 @@ -"""Evaluation and reporting utilities. - -Focus on metrics that make sense for unbalanced datasets: -- Balanced accuracy (accounts for class imbalance) -- Precision/Recall/F1 (macro for multi-class) -- ROC-AUC and Average Precision (PR AUC) when probabilities are available - -Outputs: -- `report.json`: list of per-seed metrics -- `metric_*.png`: quick boxplots across seeds -- `confusion_matrix.png`: confusion matrix heatmap (last run) -- `pr_curve.png`: precision-recall curve if binary and probabilities exist -""" - -from typing import Dict, Any, Optional -import os -import json -import numpy as np -import matplotlib.pyplot as plt -import seaborn as sns - - -def safe_std(vals): - """Calculate std, returning 0.0 instead of nan when there's only 1 value.""" - if len(vals) <= 1: - return 0.0 - return vals.std() - - -def compute_classification_metrics(y_true, y_pred, y_proba=None) -> Dict[str, Any]: - from sklearn.metrics import ( - accuracy_score, - precision_score, - recall_score, - f1_score, - roc_auc_score, - confusion_matrix, - balanced_accuracy_score, - average_precision_score, - cohen_kappa_score, - ) - - out = {} - out["accuracy"] = accuracy_score(y_true, y_pred) - out["balanced_accuracy"] = balanced_accuracy_score(y_true, y_pred) - out["precision"] = precision_score( - y_true, - y_pred, - average="binary" if len(np.unique(y_true)) == 2 else "macro", - zero_division=0, - ) - out["recall"] = recall_score( - y_true, - y_pred, - average="binary" if len(np.unique(y_true)) == 2 else "macro", - zero_division=0, - ) - out["f1"] = f1_score( - y_true, - y_pred, - average="binary" if len(np.unique(y_true)) == 2 else "macro", - zero_division=0, - ) - out["cohen_kappa"] = cohen_kappa_score(y_true, y_pred) - out["confusion_matrix"] = confusion_matrix(y_true, y_pred).tolist() - - if y_proba is not None: - try: - # Binary classification: use positive class probabilities - if y_proba.ndim == 2 and y_proba.shape[1] == 2: - out["roc_auc"] = roc_auc_score(y_true, y_proba[:, 1]) - out["average_precision"] = 
average_precision_score( - y_true, y_proba[:, 1] - ) - # Multiclass: use OVR strategy - elif y_proba.ndim == 2 and y_proba.shape[1] > 2: - out["roc_auc"] = roc_auc_score(y_true, y_proba, multi_class="ovr") - out["average_precision"] = average_precision_score( - y_true, y_proba, average="macro" - ) - # 1D probabilities (rare, but handle it) - else: - out["roc_auc"] = roc_auc_score(y_true, y_proba) - out["average_precision"] = average_precision_score(y_true, y_proba) - except Exception as e: - print(f"Warning: Could not compute ROC-AUC/AP: {e}") - out["roc_auc"] = None - out["average_precision"] = None - else: - out["roc_auc"] = None - out["average_precision"] = None - - return out - - -def compute_multi_output_classification_metrics(y_true, y_pred, y_proba=None) -> Dict[str, Any]: - """Compute classification metrics for multi-output predictions. - - For multi-output, we compute metrics per output and aggregate. - - Parameters - ---------- - y_true : array-like, shape (n_samples, n_outputs) - Ground truth labels - y_pred : array-like, shape (n_samples, n_outputs) - Predicted labels - y_proba : array-like, shape (n_samples, n_outputs, n_classes) or None - Predicted probabilities (if available) - - Returns - ------- - Dict[str, Any] - Dictionary with average metrics across outputs and per-output metrics - """ - from sklearn.metrics import ( - accuracy_score, - precision_score, - recall_score, - f1_score, - balanced_accuracy_score, - cohen_kappa_score, - ) - - n_outputs = y_true.shape[1] - out = {} - - # Compute per-output metrics - per_output_metrics = [] - for i in range(n_outputs): - y_true_i = y_true[:, i] - y_pred_i = y_pred[:, i] - y_proba_i = y_proba[i] if y_proba is not None and isinstance(y_proba, list) else None - - metrics_i = compute_classification_metrics(y_true_i, y_pred_i, y_proba_i) - per_output_metrics.append(metrics_i) - - # Aggregate metrics (average across outputs) - for key in ['accuracy', 'balanced_accuracy', 'precision', 'recall', 'f1', 'cohen_kappa']: - values = [m[key] for m in per_output_metrics if key in m and m[key] is not None] - if values: - out[f"{key}_mean"] = np.mean(values) - out[f"{key}_std"] = np.std(values) - - # Store per-output metrics - out["per_output"] = per_output_metrics - out["n_outputs"] = n_outputs - - return out - - -def compute_multi_output_regression_metrics(y_true, y_pred) -> Dict[str, Any]: - """Compute regression metrics for multi-output predictions. - - For multi-output regression, we compute metrics per output and aggregate. 
- - Parameters - ---------- - y_true : array-like, shape (n_samples, n_outputs) - Ground truth values - y_pred : array-like, shape (n_samples, n_outputs) - Predicted values - - Returns - ------- - Dict[str, Any] - Dictionary with average metrics across outputs and per-output metrics - """ - from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score - - n_outputs = y_true.shape[1] - out = {} - - # Compute per-output metrics - per_output_metrics = [] - for i in range(n_outputs): - y_true_i = y_true[:, i] - y_pred_i = y_pred[:, i] - - metrics_i = compute_regression_metrics(y_true_i, y_pred_i) - per_output_metrics.append(metrics_i) - - # Aggregate metrics (average across outputs) - for key in ['mse', 'rmse', 'mae', 'r2']: - values = [m[key] for m in per_output_metrics if key in m and m[key] is not None] - if values: - out[f"{key}_mean"] = np.mean(values) - out[f"{key}_std"] = np.std(values) - - # Store per-output metrics - out["per_output"] = per_output_metrics - out["n_outputs"] = n_outputs - - return out - - -def compute_regression_metrics(y_true, y_pred) -> Dict[str, Any]: - """Compute regression metrics. - - Parameters - ---------- - y_true : array-like - Ground truth target values. - y_pred : array-like - Predicted target values. - - Returns - ------- - Dict[str, Any] - Dictionary containing: - - mse: Mean Squared Error - - rmse: Root Mean Squared Error - - mae: Mean Absolute Error - - r2: R-squared (coefficient of determination) - - mape: Mean Absolute Percentage Error (if no zeros in y_true) - """ - from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score - - out = {} - out["mse"] = mean_squared_error(y_true, y_pred) - out["rmse"] = np.sqrt(out["mse"]) - out["mae"] = mean_absolute_error(y_true, y_pred) - out["r2"] = r2_score(y_true, y_pred) - - # MAPE - only compute if no zeros in y_true - try: - if not np.any(y_true == 0): - mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100 - out["mape"] = mape - else: - out["mape"] = None - except Exception: - out["mape"] = None - - return out - - -def evaluate_and_report( - results, y_test, output_dir: str = "outputs", problem_type: str = "classification", label_names: list = None, feature_names: list = None, X_test=None -): - """Compute metrics across runs and save a compact report and plots. - - Parameters - ---------- - results : dict or list - If dict: Output from Trainer.run() with multiple models {model_name: [run_results]} - If list: Single model results (backward compatibility) - y_test : array-like - Ground-truth labels (classification) or values (regression) for the test set. - output_dir : str - Where to save the report and plots. - problem_type : str - "classification" or "regression". Determines which metrics to compute. - label_names : list, optional - Names of the target variables (for multi-output). If None, will use Output 1, Output 2, etc. - feature_names : list, optional - Names of the input features. If None, will use Feature 0, Feature 1, etc. - X_test : array-like, optional - Test features for permutation importance. Required for models without built-in importance (e.g., MLPs). 
- """ - os.makedirs(output_dir, exist_ok=True) - import pandas as pd - - # Handle backward compatibility: convert list to dict - if isinstance(results, list): - # Assume single model (old format) - model_name = results[0].get("model_name", "model") if results else "model" - results = {model_name: results} - - # Determine if we have multi-output - is_multi_output = len(y_test.shape) > 1 and y_test.shape[1] > 1 - - # Compute metrics for each model - all_summaries = {} - all_dfs = {} - - for model_name, model_results in results.items(): - summary = [] - for r in model_results: - seed = r.get("seed") - y_pred = r.get("y_pred") - y_proba = r.get("y_proba") - - if is_multi_output: - # Multi-output case - if problem_type == "regression": - metrics = compute_multi_output_regression_metrics(y_test, y_pred) - else: - metrics = compute_multi_output_classification_metrics(y_test, y_pred, y_proba) - else: - # Single output case - if problem_type == "regression": - metrics = compute_regression_metrics(y_test, y_pred) - else: - metrics = compute_classification_metrics(y_test, y_pred, y_proba) - - metrics["seed"] = seed - metrics["model"] = model_name - summary.append(metrics) - - all_summaries[model_name] = summary - all_dfs[model_name] = pd.DataFrame(summary) - - # Save individual model reports in model-specific subfolders - for model_name, summary in all_summaries.items(): - model_dir = os.path.join(output_dir, model_name) - os.makedirs(model_dir, exist_ok=True) - report_path = os.path.join(model_dir, f"report_{model_name}.json") - with open(report_path, "w") as f: - json.dump(summary, f, indent=2) - - # Combine all results for comparison - combined_df = pd.concat(all_dfs.values(), ignore_index=True) - combined_summary = [item for sublist in all_summaries.values() for item in sublist] - - # Save combined report - with open(os.path.join(output_dir, "report_all_models.json"), "w") as f: - json.dump(combined_summary, f, indent=2) - - # Print results for each model - if is_multi_output: - # For multi-output, show aggregated metrics - if problem_type == "regression": - display_cols = ["seed", "mse_mean", "rmse_mean", "mae_mean", "r2_mean"] - comparison_metrics = ["mse_mean", "rmse_mean", "mae_mean", "r2_mean"] - plot_metrics = ["mse_mean", "rmse_mean", "mae_mean", "r2_mean"] - else: - display_cols = [ - "seed", - "accuracy_mean", - "balanced_accuracy_mean", - "precision_mean", - "recall_mean", - "f1_mean", - "cohen_kappa_mean", - ] - comparison_metrics = [ - "accuracy_mean", - "balanced_accuracy_mean", - "f1_mean", - "cohen_kappa_mean", - ] - plot_metrics = ["accuracy_mean", "balanced_accuracy_mean", "f1_mean", "cohen_kappa_mean"] - else: - # Single output - if problem_type == "regression": - display_cols = ["seed", "mse", "rmse", "mae", "r2"] - if "mape" in combined_df.columns: - display_cols.append("mape") - comparison_metrics = ["mse", "rmse", "mae", "r2"] - plot_metrics = ["mse", "rmse", "mae", "r2"] - else: - display_cols = [ - "seed", - "accuracy", - "balanced_accuracy", - "precision", - "recall", - "f1", - "cohen_kappa", - ] - if "roc_auc" in combined_df.columns: - display_cols.append("roc_auc") - if "average_precision" in combined_df.columns: - display_cols.append("average_precision") - comparison_metrics = [ - "accuracy", - "balanced_accuracy", - "f1", - "cohen_kappa", - "roc_auc", - ] - plot_metrics = ["accuracy", "balanced_accuracy", "f1", "cohen_kappa"] - - for model_name, df in all_dfs.items(): - print(f"\n{'='*80}") - print(f"RESULTS FOR MODEL: {model_name.upper()}") - print(f"{'='*80}") - 
print( - df[display_cols].to_string( - index=False, - float_format=lambda x: f"{x:.4f}" if not pd.isna(x) else "N/A", - ) - ) - - print(f"\n{'-'*80}") - print(f"SUMMARY STATISTICS FOR {model_name.upper()} (mean ± std)") - print(f"{'-'*80}") - metric_cols = [c for c in display_cols if c != "seed"] - for col in metric_cols: - if col in df.columns: - vals = df[col].dropna() - if len(vals) > 0: - print(f"{col:20s}: {vals.mean():.4f} ± {safe_std(vals):.4f}") - print(f"{'-'*80}\n") - - # For multi-output, print per-output breakdown - if is_multi_output and "per_output" in df.columns[0:1] or (len(df) > 0 and "per_output" in df.iloc[0]): - print(f"\n{'-'*80}") - print(f"PER-OUTPUT BREAKDOWN FOR {model_name.upper()}") - print(f"{'-'*80}") - - # Get per-output metrics from first seed as example structure - first_row = df.iloc[0] - if "per_output" in first_row and first_row["per_output"] is not None: - per_output_list = first_row["per_output"] - n_outputs = len(per_output_list) - - # Collect metrics across all seeds for each output - for output_idx in range(n_outputs): - # Use label name if provided, otherwise use generic Output N - if label_names and output_idx < len(label_names): - output_label = f"{label_names[output_idx]}" - else: - output_label = f"Output {output_idx + 1}" - - print(f"\n{output_label}:") - - # Aggregate metrics across seeds for this output - output_metrics = {} - for _, row in df.iterrows(): - if "per_output" in row and row["per_output"] is not None: - per_out = row["per_output"] - if output_idx < len(per_out): - for key, val in per_out[output_idx].items(): - if key not in ["confusion_matrix", "seed", "model"] and val is not None: - if key not in output_metrics: - output_metrics[key] = [] - output_metrics[key].append(val) - - # Print summary for this output - for metric_name, values in sorted(output_metrics.items()): - if len(values) > 0: - mean_val = np.mean(values) - std_val = safe_std(np.array(values)) - print(f" {metric_name:20s}: {mean_val:.4f} ± {std_val:.4f}") - print(f"{'-'*80}\n") - - # Model comparison table - if len(all_dfs) > 1: - print(f"\n{'='*80}") - print("MODEL COMPARISON (mean ± std)") - print(f"{'='*80}") - comparison_data = [] - for model_name, df in all_dfs.items(): - row = {"Model": model_name} - for col in comparison_metrics: - if col in df.columns: - vals = df[col].dropna() - if len(vals) > 0: - row[col] = f"{vals.mean():.4f} ± {safe_std(vals):.4f}" - else: - row[col] = "N/A" - comparison_data.append(row) - - comparison_df = pd.DataFrame(comparison_data) - print(comparison_df.to_string(index=False)) - print(f"{'='*80}\n") - - # Determine best and second-best models - # For regression: lower MSE is better - # Select primary metric based on problem type - # For classification: Cohen's kappa is more robust as it accounts for chance agreement - # For regression: MSE is standard (lower is better) - if is_multi_output: - if problem_type == "regression": - primary_metric = "mse_mean" - lower_is_better = True - else: - primary_metric = "cohen_kappa_mean" - lower_is_better = False - else: - if problem_type == "regression": - primary_metric = "mse" - lower_is_better = True - else: - primary_metric = "cohen_kappa" - lower_is_better = False - - # Calculate mean primary metric for each model - model_scores = [] - for model_name, df in all_dfs.items(): - if primary_metric in df.columns: - vals = df[primary_metric].dropna() - if len(vals) > 0: - model_scores.append( - { - "model": model_name, - "mean": vals.mean(), - "std": safe_std(vals), - } - ) - - if 
len(model_scores) > 0: - # Sort by primary metric - model_scores.sort(key=lambda x: x["mean"], reverse=not lower_is_better) - - print(f"\n{'='*80}") - print("MODEL RANKING") - print(f"{'='*80}") - print( - f"Ranked by: {primary_metric.replace('_', ' ').upper()} ({'Lower is better' if lower_is_better else 'Higher is better'})" - ) - print(f"{'-'*80}") - - # Create ranking table with all metrics - ranking_data = [] - for rank, score in enumerate(model_scores, 1): - model_name = score["model"] - model_df = all_dfs[model_name] - - row = { - "Rank": f"{rank}.", - "Model": model_name.upper(), - primary_metric: f"{score['mean']:.4f} ± {score['std']:.4f}", - } - - # Add other metrics - metric_cols = [ - c for c in display_cols if c != "seed" and c != primary_metric - ] - for col in metric_cols: - if col in model_df.columns: - vals = model_df[col].dropna() - if len(vals) > 0: - row[col] = f"{vals.mean():.4f} ± {safe_std(vals):.4f}" - - ranking_data.append(row) - - # Print ranking table - ranking_df = pd.DataFrame(ranking_data) - print(ranking_df.to_string(index=False)) - print(f"{'='*80}\n") - - # Plot comparison across models - - if len(all_dfs) > 1: - # Multi-model comparison plots - for m in plot_metrics: - if m in combined_df.columns: - plt.figure(figsize=(max(6, len(all_dfs) * 1.5), 4)) - sns.boxplot(data=combined_df, x="model", y=m) - plt.title(f"{m.replace('_', ' ').title()} - Model Comparison") - plt.xlabel("Model") - plt.ylabel(m.replace("_", " ").title()) - plt.xticks(rotation=45, ha="right") - plt.tight_layout() - plt.savefig(os.path.join(output_dir, f"comparison_{m}.png")) - plt.close() - else: - # Single model plots (backward compatibility) - for m in plot_metrics: - if m in combined_df.columns: - plt.figure(figsize=(4, 3)) - sns.boxplot(data=combined_df, y=m) - plt.title(m.replace("_", " ").title()) - plt.tight_layout() - plt.savefig(os.path.join(output_dir, f"metric_{m}.png")) - plt.close() - - # Confusion matrices for each model (last run) - classification only - if problem_type == "classification": - for model_name, model_results in results.items(): - try: - from sklearn.metrics import confusion_matrix - - last = model_results[-1] - cm = confusion_matrix(y_test, last.get("y_pred")) - plt.figure(figsize=(5, 4)) - sns.heatmap(cm, annot=True, fmt="d", cmap="Blues") - plt.title(f"Confusion Matrix - {model_name.upper()} (last run)") - plt.xlabel("Predicted") - plt.ylabel("True") - plt.tight_layout() - model_dir = os.path.join(output_dir, model_name) - os.makedirs(model_dir, exist_ok=True) - plt.savefig( - os.path.join(model_dir, f"confusion_matrix_{model_name}.png") - ) - plt.close() - except Exception as e: - print( - f"Warning: Could not create confusion matrix for {model_name}: {e}" - ) - - # Precision-Recall curves for each model - classification only - if problem_type == "classification": - for model_name, model_results in results.items(): - try: - from sklearn.metrics import ( - precision_recall_curve, - average_precision_score, - ) - - last = model_results[-1] - y_proba = last.get("y_proba") - if y_proba is not None: - # if proba has shape (n,2), take positive class - if getattr(y_proba, "ndim", 1) == 2 and y_proba.shape[1] >= 2: - pos_scores = y_proba[:, 1] - else: - pos_scores = y_proba - precision, recall, _ = precision_recall_curve(y_test, pos_scores) - ap = average_precision_score(y_test, pos_scores) - plt.figure(figsize=(5, 4)) - plt.plot(recall, precision, label=f"AP={ap:.3f}") - plt.xlabel("Recall") - plt.ylabel("Precision") - plt.title(f"Precision-Recall - 
{model_name.upper()} (last run)") - plt.legend() - plt.tight_layout() - model_dir = os.path.join(output_dir, model_name) - os.makedirs(model_dir, exist_ok=True) - plt.savefig(os.path.join(model_dir, f"pr_curve_{model_name}.png")) - plt.close() - except Exception as e: - pass # Silently skip if PR curve can't be generated - - # Combined PR curve for model comparison (binary classification only) - if len(results) > 1: - try: - from sklearn.metrics import ( - precision_recall_curve, - average_precision_score, - ) - - plt.figure(figsize=(7, 5)) - for model_name, model_results in results.items(): - last = model_results[-1] - y_proba = last.get("y_proba") - if y_proba is not None: - if getattr(y_proba, "ndim", 1) == 2 and y_proba.shape[1] >= 2: - pos_scores = y_proba[:, 1] - else: - pos_scores = y_proba - precision, recall, _ = precision_recall_curve( - y_test, pos_scores - ) - ap = average_precision_score(y_test, pos_scores) - plt.plot( - recall, - precision, - label=f"{model_name} (AP={ap:.3f})", - linewidth=2, - ) - - plt.xlabel("Recall") - plt.ylabel("Precision") - plt.title("Precision-Recall Comparison (last run)") - plt.legend() - plt.grid(True, alpha=0.3) - plt.tight_layout() - plt.savefig(os.path.join(output_dir, "pr_curve_comparison.png")) - plt.close() - except Exception as e: - pass # Silently skip if comparison can't be generated - - # Residual plots for regression - elif problem_type == "regression" and not is_multi_output: - for model_name, model_results in results.items(): - try: - last = model_results[-1] - y_pred = last.get("y_pred") - residuals = y_test - y_pred - - fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4)) - - # Predicted vs Actual - ax1.scatter(y_test, y_pred, alpha=0.6) - min_val = min(y_test.min(), y_pred.min()) - max_val = max(y_test.max(), y_pred.max()) - ax1.plot([min_val, max_val], [min_val, max_val], "r--", lw=2) - ax1.set_xlabel("Actual") - ax1.set_ylabel("Predicted") - ax1.set_title(f"Predicted vs Actual - {model_name.upper()}") - ax1.grid(True, alpha=0.3) - - # Residual plot - ax2.scatter(y_pred, residuals, alpha=0.6) - ax2.axhline(y=0, color="r", linestyle="--", lw=2) - ax2.set_xlabel("Predicted") - ax2.set_ylabel("Residuals") - ax2.set_title(f"Residual Plot - {model_name.upper()}") - ax2.grid(True, alpha=0.3) - - plt.tight_layout() - model_dir = os.path.join(output_dir, model_name) - os.makedirs(model_dir, exist_ok=True) - plt.savefig(os.path.join(model_dir, f"residual_plot_{model_name}.png")) - plt.close() - except Exception as e: - print(f"Warning: Could not create residual plot for {model_name}: {e}") - - # Feature importance analysis - print(f"\n{'='*80}") - print("FEATURE IMPORTANCE ANALYSIS") - print(f"{'='*80}") - - for model_name, model_results in results.items(): - try: - import joblib - from sklearn.inspection import permutation_importance - from sklearn.multioutput import MultiOutputClassifier, MultiOutputRegressor - - # Collect feature importances from all trained models - all_importances = [] - - for result in model_results: - model_path = result.get("model_path") - - if not model_path or not os.path.exists(model_path): - continue - - model = joblib.load(model_path) - - # Extract feature importances based on model type - feature_importances = None - importance_method = None - - # For multi-output models, need to handle differently - if isinstance(model, (MultiOutputClassifier, MultiOutputRegressor)): - # Get the base estimator - if hasattr(model.estimators_[0], 'feature_importances_'): - # Average importances across outputs - 
importances_list = [est.feature_importances_ for est in model.estimators_ if hasattr(est, 'feature_importances_')] - if importances_list: - feature_importances = np.mean(importances_list, axis=0) - importance_method = "built-in (averaged across outputs)" - elif hasattr(model, 'feature_importances_'): - # Tree-based models (Random Forest, XGBoost) - feature_importances = model.feature_importances_ - importance_method = "built-in (Gini/gain)" - elif hasattr(model, 'coef_'): - # Linear models (use absolute coefficient values) - coef = model.coef_ - if len(coef.shape) > 1: - # Multi-output or multi-class: average absolute coefficients - feature_importances = np.mean(np.abs(coef), axis=0) - else: - feature_importances = np.abs(coef) - importance_method = "coefficient magnitude" - - # If no built-in importance, use permutation importance (works for all models including MLPs) - if feature_importances is None and X_test is not None: - # Only print once for the first model - if len(all_importances) == 0: - print(f"\n{model_name.upper()}: Computing permutation importance across all {len(model_results)} models...") - print(f" (This measures performance drop when each feature is shuffled)") - - # Determine scoring metric based on problem type - if problem_type == "classification": - scoring = "accuracy" - else: # regression - scoring = "r2" - - # Compute permutation importance - perm_importance = permutation_importance( - model, X_test, y_test, - n_repeats=10, # Number of times to permute each feature - random_state=42, - scoring=scoring, - n_jobs=-1 # Use all available cores - ) - feature_importances = perm_importance.importances_mean - importance_method = f"permutation (based on {scoring}, averaged across seeds)" - elif feature_importances is not None and importance_method: - # Update method to indicate averaging across seeds - if len(all_importances) == 0: - importance_method = importance_method + f", averaged across {len(model_results)} seeds" - - if feature_importances is not None: - all_importances.append(feature_importances) - - # Average importances across all models - if all_importances: - # Average across all seeds - avg_feature_importances = np.mean(all_importances, axis=0) - std_feature_importances = np.std(all_importances, axis=0) - - print(f"\n{model_name.upper()}:") - print(f"{'-'*80}") - print(f"Method: {importance_method}") - print(f"Averaged across {len(all_importances)} model(s)") - - # Sort features by importance - indices = np.argsort(avg_feature_importances)[::-1] - - # Helper to get feature label - def get_feature_label(idx): - if feature_names and idx < len(feature_names): - return feature_names[idx] - return f"Feature {idx}" - - # Print top 20 features with mean ± std - print(f"\nTop 20 most important features (mean ± std):") - for i, idx in enumerate(indices[:20], 1): - importance = avg_feature_importances[idx] - std = std_feature_importances[idx] - feat_label = get_feature_label(idx) - print(f" {i:2d}. 
{feat_label:40s}: {importance:.6f} ± {std:.6f}") - - # Create feature importance plot - try: - fig, ax = plt.subplots(figsize=(10, 8)) - - # Plot top 20 features - top_n = min(20, len(avg_feature_importances)) - top_indices = indices[:top_n] - top_importances = avg_feature_importances[top_indices] - top_stds = std_feature_importances[top_indices] - - y_pos = np.arange(top_n) - ax.barh(y_pos, top_importances, xerr=top_stds, align='center', alpha=0.7, capsize=3) - ax.set_yticks(y_pos) - ax.set_yticklabels([get_feature_label(idx) for idx in top_indices]) - ax.invert_yaxis() - ax.set_xlabel('Importance') - ax.set_title(f'Top {top_n} Feature Importances - {model_name.upper()}\n({importance_method})') - ax.grid(True, alpha=0.3, axis='x') - - plt.tight_layout() - model_dir = os.path.join(output_dir, model_name) - os.makedirs(model_dir, exist_ok=True) - plt.savefig(os.path.join(model_dir, f"feature_importance_{model_name}.png"), dpi=100, bbox_inches='tight') - plt.close() - - print(f"\n Feature importance plot saved to: {model_dir}/feature_importance_{model_name}.png") - except Exception as e: - print(f" Warning: Could not create feature importance plot: {e}") - else: - print(f"\n{model_name.upper()}: Feature importance not available (no models or no test data provided)") - except Exception as e: - print(f"\nWarning: Could not extract feature importance for {model_name}: {e}") - - print(f"\n{'='*80}\n") - - return combined_summary - - -def evaluate_and_report_cv( - results, output_dir: str = "outputs", problem_type: str = "classification", - label_names: list = None, feature_names: list = None, X_test=None -): - """Compute metrics for k-fold cross-validation with per-fold breakdown. - - Parameters - ---------- - results : dict - Output from Trainer.run_cv() with format {model_name: [run_results_with_fold_info]} - Each run_result contains: seed, fold, y_pred, y_test, etc. - output_dir : str - Where to save the report and plots. - problem_type : str - "classification" or "regression". - label_names : list, optional - Names of the target variables (for multi-output). - feature_names : list, optional - Names of the input features. - X_test : array-like, optional - Test features (note: in CV, each fold has different test set). 
- """ - os.makedirs(output_dir, exist_ok=True) - import pandas as pd - - print(f"\n{'='*80}") - print("K-FOLD CROSS-VALIDATION RESULTS") - print(f"{'='*80}\n") - - # Determine if multi-output by checking first result - first_result = next(iter(results.values()))[0] - y_test_sample = first_result["y_test"] - is_multi_output = len(y_test_sample.shape) > 1 and y_test_sample.shape[1] > 1 - - all_summaries = {} - all_dfs = {} - - for model_name, model_results in results.items(): - # Group results by fold - folds = {} - for r in model_results: - fold_id = r.get("fold") - if fold_id not in folds: - folds[fold_id] = [] - folds[fold_id].append(r) - - n_folds = len(folds) - - print(f"\n{model_name.upper()} - {n_folds} Folds") - print(f"{'-'*80}") - - # Compute metrics for each fold - fold_summaries = [] - for fold_id in sorted(folds.keys()): - fold_results = folds[fold_id] - fold_metrics_list = [] - - for r in fold_results: - seed = r.get("seed") - y_pred = r.get("y_pred") - y_test = r.get("y_test") - y_proba = r.get("y_proba") - - if problem_type == "classification": - if is_multi_output: - metrics = compute_multi_output_classification_metrics(y_test, y_pred, y_proba) - else: - metrics = compute_classification_metrics(y_test, y_pred, y_proba) - else: # regression - if is_multi_output: - metrics = compute_multi_output_regression_metrics(y_test, y_pred) - else: - metrics = compute_regression_metrics(y_test, y_pred) - - metrics["seed"] = seed - metrics["fold"] = fold_id - fold_metrics_list.append(metrics) - - fold_summaries.append(fold_metrics_list) - - # Print per-fold metrics (averaged across seeds within each fold) - print(f"\nPer-Fold Metrics (averaged across {len(fold_results)} seeds per fold):") - - if problem_type == "classification": - if is_multi_output: - # Multi-output classification - metric_keys = ["accuracy_mean", "balanced_accuracy_mean", "f1_mean"] - else: - metric_keys = ["accuracy", "balanced_accuracy", "f1"] - else: - if is_multi_output: - # Multi-output regression - metric_keys = ["r2_mean", "rmse_mean", "mae_mean"] - else: - metric_keys = ["r2", "rmse", "mae"] - - # Print table header - print(f"\n {'Fold':<6}", end="") - for mk in metric_keys: - print(f"{mk:<20}", end="") - print() - print(f" {'-'*6}", end="") - for _ in metric_keys: - print(f"{'-'*20}", end="") - print() - - # Print per-fold metrics - for fold_id in sorted(folds.keys()): - fold_results_list = fold_summaries[fold_id] - fold_df = pd.DataFrame(fold_results_list) - - print(f" {fold_id:<6}", end="") - for mk in metric_keys: - if mk in fold_df.columns: - mean_val = fold_df[mk].mean() - std_val = fold_df[mk].std() - print(f"{mean_val:.4f} ± {std_val:.4f} ", end="") - else: - print(f"{'N/A':<20}", end="") - print() - - # Compute overall metrics (averaged across all folds and seeds) - all_fold_metrics = [m for fold_list in fold_summaries for m in fold_list] - overall_df = pd.DataFrame(all_fold_metrics) - - print(f"\n {'Overall':<6}", end="") - for mk in metric_keys: - if mk in overall_df.columns: - mean_val = overall_df[mk].mean() - std_val = overall_df[mk].std() - print(f"{mean_val:.4f} ± {std_val:.4f} ", end="") - else: - print(f"{'N/A':<20}", end="") - print("\n") - - # Save detailed report - model_dir = os.path.join(output_dir, model_name) - os.makedirs(model_dir, exist_ok=True) - report_path = os.path.join(model_dir, f"report_{model_name}_cv.json") - - with open(report_path, "w") as f: - json.dump(all_fold_metrics, f, indent=2, default=str) - - print(f" Detailed JSON report saved to: {report_path}") - - # Save 
detailed metrics as CSV (all seeds and folds) - csv_detailed_path = os.path.join(model_dir, f"metrics_{model_name}_cv_detailed.csv") - overall_df.to_csv(csv_detailed_path, index=False) - print(f" Detailed CSV metrics saved to: {csv_detailed_path}") - - # Save per-fold summary (averaged across seeds) - fold_summary_data = [] - for fold_id in sorted(folds.keys()): - fold_results_list = fold_summaries[fold_id] - fold_df = pd.DataFrame(fold_results_list) - - # Calculate mean and std for each metric - fold_summary = {"fold": fold_id} - for col in fold_df.columns: - if col not in ["seed", "fold"]: - fold_summary[f"{col}_mean"] = fold_df[col].mean() - fold_summary[f"{col}_std"] = fold_df[col].std() - fold_summary_data.append(fold_summary) - - # Add overall row - overall_summary = {"fold": "overall"} - for col in overall_df.columns: - if col not in ["seed", "fold"]: - overall_summary[f"{col}_mean"] = overall_df[col].mean() - overall_summary[f"{col}_std"] = overall_df[col].std() - fold_summary_data.append(overall_summary) - - fold_summary_df = pd.DataFrame(fold_summary_data) - csv_summary_path = os.path.join(model_dir, f"metrics_{model_name}_cv_per_fold.csv") - fold_summary_df.to_csv(csv_summary_path, index=False) - print(f" Per-fold summary CSV saved to: {csv_summary_path}") - - all_summaries[model_name] = overall_df - all_dfs[model_name] = overall_df - - # Create comparison plots across folds and models - if len(results) > 1: - print(f"\nGenerating comparison plots...") - - # Determine which metrics to plot - if problem_type == "classification": - if is_multi_output: - metrics_to_plot = {"accuracy_mean": "Accuracy", "balanced_accuracy_mean": "Balanced Accuracy", - "f1_mean": "F1 Score", "cohen_kappa_mean": "Cohen's Kappa"} - else: - metrics_to_plot = {"accuracy": "Accuracy", "balanced_accuracy": "Balanced Accuracy", - "f1": "F1 Score", "cohen_kappa": "Cohen's Kappa"} - else: - if is_multi_output: - metrics_to_plot = {"r2_mean": "R²", "rmse_mean": "RMSE", "mae_mean": "MAE", "mse_mean": "MSE"} - else: - metrics_to_plot = {"r2": "R²", "rmse": "RMSE", "mae": "MAE", "mse": "MSE"} - - for metric_key, metric_label in metrics_to_plot.items(): - try: - fig, ax = plt.subplots(figsize=(10, 6)) - - data_for_plot = [] - labels_for_plot = [] - - for model_name, df in all_dfs.items(): - if metric_key in df.columns: - data_for_plot.append(df[metric_key].values) - labels_for_plot.append(model_name.upper()) - - if data_for_plot: - bp = ax.boxplot(data_for_plot, labels=labels_for_plot, patch_artist=True) - for patch in bp["boxes"]: - patch.set_facecolor("lightblue") - - ax.set_ylabel(metric_label) - ax.set_title(f"{metric_label} Comparison - K-Fold CV") - ax.grid(True, alpha=0.3, axis='y') - plt.xticks(rotation=45, ha='right') - plt.tight_layout() - plt.savefig(os.path.join(output_dir, f"cv_comparison_{metric_key}.png")) - plt.close() - except Exception as e: - print(f" Warning: Could not create comparison plot for {metric_key}: {e}") - - print(f"\n{'='*80}\n") - - # Feature importance for CV - print(f"{'='*80}") - print("FEATURE IMPORTANCE ANALYSIS (per fold and averaged)") - print(f"{'='*80}") - - for model_name, model_results in results.items(): - try: - import joblib - from sklearn.inspection import permutation_importance - from sklearn.multioutput import MultiOutputClassifier, MultiOutputRegressor - - # Group results by fold - folds_for_fi = {} - for r in model_results: - fold_id = r.get("fold") - if fold_id not in folds_for_fi: - folds_for_fi[fold_id] = [] - folds_for_fi[fold_id].append(r) - - 
print(f"\n{model_name.upper()}: Collecting feature importances from all {len(folds_for_fi)} folds...") - - # Collect feature importance for each fold - fold_importances_dict = {} - importance_method = None - - for fold_id in sorted(folds_for_fi.keys()): - fold_results = folds_for_fi[fold_id] - fold_importances_list = [] - - for result in fold_results: - model_path = result.get("model_path") - - if not model_path or not os.path.exists(model_path): - continue - - model = joblib.load(model_path) - y_test_fold = result.get("y_test") - X_test_fold = result.get("X_test") - - # Extract feature importances based on model type - feature_importances = None - - # For multi-output models - if isinstance(model, (MultiOutputClassifier, MultiOutputRegressor)): - if hasattr(model.estimators_[0], 'feature_importances_'): - importances_list = [est.feature_importances_ for est in model.estimators_ if hasattr(est, 'feature_importances_')] - if importances_list: - feature_importances = np.mean(importances_list, axis=0) - importance_method = "built-in (averaged across outputs, seeds, and folds)" - elif hasattr(model, 'feature_importances_'): - feature_importances = model.feature_importances_ - importance_method = "built-in (Gini/gain, averaged across seeds and folds)" - elif hasattr(model, 'coef_'): - coef = model.coef_ - if len(coef.shape) > 1: - feature_importances = np.mean(np.abs(coef), axis=0) - else: - feature_importances = np.abs(coef) - importance_method = "coefficient magnitude (averaged across seeds and folds)" - - # If no built-in importance, use permutation importance (e.g., for MLPs) - if feature_importances is None and X_test_fold is not None and y_test_fold is not None: - # Only print once per fold - if len(fold_importances_list) == 0: - print(f" Fold {fold_id}: Computing permutation importance (this may take a while)...") - - # Determine scoring metric based on problem type - if problem_type == "classification": - scoring = "accuracy" - else: # regression - scoring = "r2" - - # Compute permutation importance - perm_importance = permutation_importance( - model, X_test_fold, y_test_fold, - n_repeats=10, # Number of times to permute each feature - random_state=42, - scoring=scoring, - n_jobs=-1 # Use all available cores - ) - feature_importances = perm_importance.importances_mean - importance_method = f"permutation (based on {scoring}, averaged across seeds and folds)" - - if feature_importances is not None: - fold_importances_list.append(feature_importances) - - # Average across seeds for this fold - if fold_importances_list: - fold_avg_importance = np.mean(fold_importances_list, axis=0) - fold_importances_dict[fold_id] = fold_avg_importance - - # Average importances across all folds - if fold_importances_dict: - all_fold_importances = list(fold_importances_dict.values()) - avg_feature_importances = np.mean(all_fold_importances, axis=0) - std_feature_importances = np.std(all_fold_importances, axis=0) - - print(f"\n{model_name.upper()}:") - print(f"{'-'*80}") - print(f"Method: {importance_method}") - print(f"Computed from all {len(fold_importances_dict)} folds") - - # Sort features by importance - indices = np.argsort(avg_feature_importances)[::-1] - - # Helper to get feature label - def get_feature_label(idx): - if feature_names and idx < len(feature_names): - return feature_names[idx] - return f"Feature {idx}" - - # Print top 20 features - print(f"\nTop 20 most important features (mean ± std):") - for i, idx in enumerate(indices[:20], 1): - importance = avg_feature_importances[idx] - std = 
std_feature_importances[idx] - feat_label = get_feature_label(idx) - print(f" {i:2d}. {feat_label:40s}: {importance:.6f} ± {std:.6f}") - - # Save feature importance to CSV (all features, sorted by importance) - # Include per-fold values and overall average - feature_importance_data = [] - for rank, idx in enumerate(indices, 1): - row = { - "rank": rank, - "feature": get_feature_label(idx), - "importance_mean_all_folds": avg_feature_importances[idx], - "importance_std_across_folds": std_feature_importances[idx], - "feature_index": idx - } - # Add per-fold importance - for fold_id in sorted(fold_importances_dict.keys()): - row[f"importance_fold_{fold_id}"] = fold_importances_dict[fold_id][idx] - feature_importance_data.append(row) - - fi_df = pd.DataFrame(feature_importance_data) - model_dir = os.path.join(output_dir, model_name) - os.makedirs(model_dir, exist_ok=True) - fi_csv_path = os.path.join(model_dir, f"feature_importance_{model_name}_cv_all_folds.csv") - fi_df.to_csv(fi_csv_path, index=False) - print(f"\n Feature importance CSV (all folds) saved to: {fi_csv_path}") - - # Create feature importance plot - try: - fig, ax = plt.subplots(figsize=(10, 8)) - - top_n = min(20, len(avg_feature_importances)) - top_indices = indices[:top_n] - top_importances = avg_feature_importances[top_indices] - top_stds = std_feature_importances[top_indices] - - y_pos = np.arange(top_n) - ax.barh(y_pos, top_importances, xerr=top_stds, align='center', alpha=0.7, capsize=3) - ax.set_yticks(y_pos) - ax.set_yticklabels([get_feature_label(idx) for idx in top_indices]) - ax.invert_yaxis() - ax.set_xlabel('Importance') - ax.set_title(f'Top {top_n} Feature Importances - {model_name.upper()} (All Folds)\n({importance_method})') - ax.grid(True, alpha=0.3, axis='x') - - plt.tight_layout() - model_dir = os.path.join(output_dir, model_name) - os.makedirs(model_dir, exist_ok=True) - plt.savefig(os.path.join(model_dir, f"feature_importance_{model_name}_cv.png"), dpi=100, bbox_inches='tight') - plt.close() - - print(f"\n Feature importance plot saved to: {model_dir}/feature_importance_{model_name}_cv.png") - except Exception as e: - print(f" Warning: Could not create feature importance plot: {e}") - else: - print(f"\n{model_name.upper()}: Feature importance not available") - except Exception as e: - print(f"\nWarning: Could not extract feature importance for {model_name}: {e}") - - print(f"\n{'='*80}\n") - - return all_summaries - - -def evaluate_tuning_results( - results: Dict, - y_val: np.ndarray, - y_test: np.ndarray, - output_dir: str = "outputs", - problem_type: str = "classification", - label_names: Optional[list] = None, - feature_names: Optional[list] = None, - X_val: Optional[np.ndarray] = None, - X_test: Optional[np.ndarray] = None, -): - """Evaluate and report results from hyperparameter tuning. - - This function evaluates model performance on both validation and test sets - after hyperparameter tuning. It produces reports and visualizations comparing - performance across seeds and between val/test sets. 
- - Parameters - ---------- - results : Dict - Results dictionary from trainer.run_with_tuning() - y_val : np.ndarray - Validation set labels - y_test : np.ndarray - Test set labels - output_dir : str - Directory to save outputs - problem_type : str - "classification" or "regression" - label_names : list, optional - Names of label columns - feature_names : list, optional - Names of features (for feature importance) - X_val : np.ndarray, optional - Validation features (for feature importance) - X_test : np.ndarray, optional - Test features (for feature importance) - - Returns - ------- - dict - Summary statistics for each model - """ - import pandas as pd - - os.makedirs(output_dir, exist_ok=True) - - print(f"\n{'='*80}") - print("HYPERPARAMETER TUNING EVALUATION") - print(f"{'='*80}\n") - - all_summaries = {} - - for model_name, model_results in results.items(): - print(f"\n{model_name.upper()}") - print("-" * 60) - - # Collect metrics for each seed - val_metrics_per_seed = [] - test_metrics_per_seed = [] - best_params_per_seed = [] - best_cv_scores = [] - - for run in model_results: - seed = run['seed'] - best_params = run['best_params'] - best_cv_score = run['best_cv_score'] - - best_params_per_seed.append(best_params) - best_cv_scores.append(best_cv_score) - - # Compute validation metrics - if problem_type == "regression": - val_metrics = compute_regression_metrics( - y_val, run['y_val_pred'] - ) - else: - val_metrics = compute_classification_metrics( - y_val, run['y_val_pred'], run.get('y_val_proba') - ) - val_metrics['seed'] = seed - val_metrics_per_seed.append(val_metrics) - - # Compute test metrics - if problem_type == "regression": - test_metrics = compute_regression_metrics( - y_test, run['y_test_pred'] - ) - else: - test_metrics = compute_classification_metrics( - y_test, run['y_test_pred'], run.get('y_test_proba') - ) - test_metrics['seed'] = seed - test_metrics_per_seed.append(test_metrics) - - # Create summary DataFrames - val_df = pd.DataFrame(val_metrics_per_seed) - test_df = pd.DataFrame(test_metrics_per_seed) - - # Print summary statistics - print(f"\nValidation Set Performance (across {len(model_results)} seeds):") - if problem_type == "regression": - print(f" R² = {val_df['r2'].mean():.4f} ± {safe_std(val_df['r2']):.4f}") - print(f" RMSE = {val_df['rmse'].mean():.4f} ± {safe_std(val_df['rmse']):.4f}") - print(f" MAE = {val_df['mae'].mean():.4f} ± {safe_std(val_df['mae']):.4f}") - else: - print(f" Accuracy = {val_df['accuracy'].mean():.4f} ± {safe_std(val_df['accuracy']):.4f}") - print(f" Balanced Accuracy = {val_df['balanced_accuracy'].mean():.4f} ± {safe_std(val_df['balanced_accuracy']):.4f}") - print(f" F1 = {val_df['f1'].mean():.4f} ± {safe_std(val_df['f1']):.4f}") - - print(f"\nTest Set Performance (across {len(model_results)} seeds):") - if problem_type == "regression": - print(f" R² = {test_df['r2'].mean():.4f} ± {safe_std(test_df['r2']):.4f}") - print(f" RMSE = {test_df['rmse'].mean():.4f} ± {safe_std(test_df['rmse']):.4f}") - print(f" MAE = {test_df['mae'].mean():.4f} ± {safe_std(test_df['mae']):.4f}") - else: - print(f" Accuracy = {test_df['accuracy'].mean():.4f} ± {safe_std(test_df['accuracy']):.4f}") - print(f" Balanced Accuracy = {test_df['balanced_accuracy'].mean():.4f} ± {safe_std(test_df['balanced_accuracy']):.4f}") - print(f" F1 = {test_df['f1'].mean():.4f} ± {safe_std(test_df['f1']):.4f}") - - print(f"\nBest CV Score (during tuning): {np.mean(best_cv_scores):.4f} ± {safe_std(np.array(best_cv_scores)):.4f}") - - # Save detailed results - model_dir = 
os.path.join(output_dir, model_name) - os.makedirs(model_dir, exist_ok=True) - - # Save validation metrics - val_csv = os.path.join(model_dir, f"metrics_{model_name}_validation.csv") - val_df.to_csv(val_csv, index=False) - print(f"\n Validation metrics saved to: {val_csv}") - - # Save test metrics - test_csv = os.path.join(model_dir, f"metrics_{model_name}_test.csv") - test_df.to_csv(test_csv, index=False) - print(f" Test metrics saved to: {test_csv}") - - # Save hyperparameter summary - params_df = pd.DataFrame(best_params_per_seed) - params_csv = os.path.join(model_dir, f"best_params_{model_name}.csv") - params_df.to_csv(params_csv, index=False) - print(f" Best parameters saved to: {params_csv}") - - # Create comparison plots - try: - # Select key metrics based on problem type - if problem_type == "regression": - metrics_to_plot = ['r2', 'rmse', 'mae'] - else: - metrics_to_plot = ['accuracy', 'balanced_accuracy', 'f1'] - - fig, axes = plt.subplots(1, len(metrics_to_plot), figsize=(5*len(metrics_to_plot), 5)) - if len(metrics_to_plot) == 1: - axes = [axes] - - for idx, metric in enumerate(metrics_to_plot): - ax = axes[idx] - - # Prepare data for boxplot - data = [val_df[metric].values, test_df[metric].values] - positions = [1, 2] - labels = ['Validation', 'Test'] - - bp = ax.boxplot(data, positions=positions, labels=labels, - patch_artist=True, widths=0.6) - - # Color boxes - colors = ['lightblue', 'lightcoral'] - for patch, color in zip(bp['boxes'], colors): - patch.set_facecolor(color) - - ax.set_ylabel(metric.upper()) - ax.set_title(f'{metric.upper()} - {model_name.upper()}') - ax.grid(True, alpha=0.3, axis='y') - - plt.tight_layout() - plot_path = os.path.join(model_dir, f"val_vs_test_{model_name}.png") - plt.savefig(plot_path, dpi=100, bbox_inches='tight') - plt.close() - print(f" Comparison plot saved to: {plot_path}") - except Exception as e: - print(f" Warning: Could not create comparison plot: {e}") - - # Store summary - all_summaries[model_name] = { - 'validation': val_df, - 'test': test_df, - 'best_params': best_params_per_seed, - 'best_cv_scores': best_cv_scores, - } - - # Create summary report comparing all models - summary_report = {} - for model_name, summary in all_summaries.items(): - val_df = summary['validation'] - test_df = summary['test'] - - if problem_type == "regression": - summary_report[model_name] = { - 'val_r2_mean': float(val_df['r2'].mean()), - 'val_r2_std': float(safe_std(val_df['r2'])), - 'test_r2_mean': float(test_df['r2'].mean()), - 'test_r2_std': float(safe_std(test_df['r2'])), - 'val_rmse_mean': float(val_df['rmse'].mean()), - 'val_rmse_std': float(safe_std(val_df['rmse'])), - 'test_rmse_mean': float(test_df['rmse'].mean()), - 'test_rmse_std': float(safe_std(test_df['rmse'])), - } - else: - summary_report[model_name] = { - 'val_accuracy_mean': float(val_df['accuracy'].mean()), - 'val_accuracy_std': float(safe_std(val_df['accuracy'])), - 'test_accuracy_mean': float(test_df['accuracy'].mean()), - 'test_accuracy_std': float(safe_std(test_df['accuracy'])), - 'val_f1_mean': float(val_df['f1'].mean()), - 'val_f1_std': float(safe_std(val_df['f1'])), - 'test_f1_mean': float(test_df['f1'].mean()), - 'test_f1_std': float(safe_std(test_df['f1'])), - } - - # Save summary report - summary_path = os.path.join(output_dir, "tuning_summary_all_models.json") - with open(summary_path, 'w') as f: - json.dump(summary_report, f, indent=2) - print(f"\n{'='*60}") - print(f"Summary report saved to: {summary_path}") - print(f"{'='*60}\n") - - return all_summaries diff --git 
a/ecosci/evaluation/__init__.py b/ecosci/evaluation/__init__.py new file mode 100644 index 0000000..040b9a6 --- /dev/null +++ b/ecosci/evaluation/__init__.py @@ -0,0 +1,59 @@ +"""Evaluation module for EcoNetToolkit. + +This module provides comprehensive evaluation and reporting capabilities +for machine learning models, including: +- Metrics computation for classification and regression +- Feature importance analysis +- Visualisation and plotting +- Cross-validation results reporting +- Hyperparameter tuning evaluation + +The module is organised into submodules for better maintainability: +- metrics: Core metric computation functions +- plotting: Visualisation utilities +- feature_importance: Feature importance extraction and analysis +- reporting: Main evaluation and reporting orchestration + +For backward compatibility, key functions are re-exported from the main module. +""" + +# Re-export key functions for backward compatibility +from .metrics import ( + safe_std, + compute_classification_metrics, + compute_regression_metrics, + compute_multi_output_classification_metrics, + compute_multi_output_regression_metrics, +) + +from .reporting import ( + evaluate_and_report, + evaluate_and_report_cv, + evaluate_tuning_results, +) + +# Make submodules available +from . import metrics +from . import plotting +from . import feature_importance +from . import reporting + +__all__ = [ + # Core functions + 'evaluate_and_report', + 'evaluate_and_report_cv', + 'evaluate_tuning_results', + + # Metric functions + 'safe_std', + 'compute_classification_metrics', + 'compute_regression_metrics', + 'compute_multi_output_classification_metrics', + 'compute_multi_output_regression_metrics', + + # Submodules + 'metrics', + 'plotting', + 'feature_importance', + 'reporting', +] diff --git a/ecosci/evaluation/feature_importance.py b/ecosci/evaluation/feature_importance.py new file mode 100644 index 0000000..31ba369 --- /dev/null +++ b/ecosci/evaluation/feature_importance.py @@ -0,0 +1,341 @@ +"""Feature importance extraction and analysis. + +This module contains functions for extracting and analyzing feature importance +from trained models, including built-in importance (tree models, linear models) +and permutation importance (for models like MLPs). +""" + +from typing import Dict, Optional, List +import os +import numpy as np +import pandas as pd + +from .metrics import safe_std +from .plotting import plot_feature_importance + + +def extract_feature_importance( + results: Dict, + X_test, + y_test, + output_dir: str, + problem_type: str, + feature_names: Optional[List[str]] = None +): + """Extract and report feature importance for all models. 
+ + Parameters + ---------- + results : Dict + Results dictionary {model_name: [run_results]} + X_test : array-like + Test features + y_test : array-like + Test labels/values + output_dir : str + Directory to save outputs + problem_type : str + "classification" or "regression" + feature_names : Optional[List[str]] + Names of features + """ + import joblib + from sklearn.inspection import permutation_importance + from sklearn.multioutput import MultiOutputClassifier, MultiOutputRegressor + + print(f"\n{'='*80}") + print("FEATURE IMPORTANCE ANALYSIS") + print(f"{'='*80}") + + for model_name, model_results in results.items(): + try: + # Collect feature importances from all trained models + all_importances = [] + + for result in model_results: + model_path = result.get("model_path") + + if not model_path or not os.path.exists(model_path): + continue + + model = joblib.load(model_path) + + # Extract feature importances based on model type + feature_importances = None + importance_method = None + + # For multi-output models, need to handle differently + if isinstance(model, (MultiOutputClassifier, MultiOutputRegressor)): + # Get the base estimator + if hasattr(model.estimators_[0], 'feature_importances_'): + # Average importances across outputs + importances_list = [est.feature_importances_ for est in model.estimators_ if hasattr(est, 'feature_importances_')] + if importances_list: + feature_importances = np.mean(importances_list, axis=0) + importance_method = "built-in (averaged across outputs)" + elif hasattr(model, 'feature_importances_'): + # Tree-based models (Random Forest, XGBoost) + feature_importances = model.feature_importances_ + importance_method = "built-in (Gini/gain)" + elif hasattr(model, 'coef_'): + # Linear models (use absolute coefficient values) + coef = model.coef_ + if len(coef.shape) > 1: + # Multi-output or multi-class: average absolute coefficients + feature_importances = np.mean(np.abs(coef), axis=0) + else: + feature_importances = np.abs(coef) + importance_method = "coefficient magnitude" + + # If no built-in importance, use permutation importance (works for all models including MLPs) + if feature_importances is None and X_test is not None: + # Only print once for the first model + if len(all_importances) == 0: + print(f"\n{model_name.upper()}: Computing permutation importance across all {len(model_results)} models...") + print(f" (This measures performance drop when each feature is shuffled)") + + # Determine scoring metric based on problem type + if problem_type == "classification": + scoring = "accuracy" + else: # regression + scoring = "r2" + + # Compute permutation importance + perm_importance = permutation_importance( + model, X_test, y_test, + n_repeats=10, # Number of times to permute each feature + random_state=42, + scoring=scoring, + n_jobs=-1 # Use all available cores + ) + feature_importances = perm_importance.importances_mean + importance_method = f"permutation (based on {scoring}, averaged across seeds)" + elif feature_importances is not None and importance_method: + # Update method to indicate averaging across seeds + if len(all_importances) == 0: + importance_method = importance_method + f", averaged across {len(model_results)} seeds" + + if feature_importances is not None: + all_importances.append(feature_importances) + + # Average importances across all models + if all_importances: + # Average across all seeds + avg_feature_importances = np.mean(all_importances, axis=0) + std_feature_importances = np.std(all_importances, axis=0) + + 
print(f"\n{model_name.upper()}:") + print(f"{'-'*80}") + print(f"Method: {importance_method}") + print(f"Averaged across {len(all_importances)} model(s)") + + # Sort features by importance + indices = np.argsort(avg_feature_importances)[::-1] + + # Helper to get feature label + def get_feature_label(idx): + if feature_names and idx < len(feature_names): + return feature_names[idx] + return f"Feature {idx}" + + # Print top 20 features with mean ± std + print(f"\nTop 20 most important features (mean ± std):") + for i, idx in enumerate(indices[:20], 1): + importance = avg_feature_importances[idx] + std = std_feature_importances[idx] + feat_label = get_feature_label(idx) + print(f" {i:2d}. {feat_label:40s}: {importance:.6f} ± {std:.6f}") + + # Create feature importance plot + plot_feature_importance( + avg_feature_importances, + std_feature_importances, + feature_names, + model_name, + output_dir, + importance_method, + top_n=20 + ) + else: + print(f"\n{model_name.upper()}: Feature importance not available (no models or no test data provided)") + except Exception as e: + print(f"\nWarning: Could not extract feature importance for {model_name}: {e}") + + print(f"\n{'='*80}\n") + + +def extract_cv_feature_importance( + results: Dict, + output_dir: str, + problem_type: str, + feature_names: Optional[List[str]] = None +): + """Extract and report feature importance for cross-validation results. + + Parameters + ---------- + results : Dict + Results dictionary {model_name: [run_results_with_fold_info]} + output_dir : str + Directory to save outputs + problem_type : str + "classification" or "regression" + feature_names : Optional[List[str]] + Names of features + """ + import joblib + from sklearn.inspection import permutation_importance + from sklearn.multioutput import MultiOutputClassifier, MultiOutputRegressor + + print(f"{'='*80}") + print("FEATURE IMPORTANCE ANALYSIS (per fold and averaged)") + print(f"{'='*80}") + + for model_name, model_results in results.items(): + try: + # Group results by fold + folds_for_fi = {} + for r in model_results: + fold_id = r.get("fold") + if fold_id not in folds_for_fi: + folds_for_fi[fold_id] = [] + folds_for_fi[fold_id].append(r) + + print(f"\n{model_name.upper()}: Collecting feature importances from all {len(folds_for_fi)} folds...") + + # Collect feature importance for each fold + fold_importances_dict = {} + importance_method = None + + for fold_id in sorted(folds_for_fi.keys()): + fold_results = folds_for_fi[fold_id] + fold_importances_list = [] + + for result in fold_results: + model_path = result.get("model_path") + + if not model_path or not os.path.exists(model_path): + continue + + model = joblib.load(model_path) + y_test_fold = result.get("y_test") + X_test_fold = result.get("X_test") + + # Extract feature importances based on model type + feature_importances = None + + # For multi-output models + if isinstance(model, (MultiOutputClassifier, MultiOutputRegressor)): + if hasattr(model.estimators_[0], 'feature_importances_'): + importances_list = [est.feature_importances_ for est in model.estimators_ if hasattr(est, 'feature_importances_')] + if importances_list: + feature_importances = np.mean(importances_list, axis=0) + importance_method = "built-in (averaged across outputs, seeds, and folds)" + elif hasattr(model, 'feature_importances_'): + feature_importances = model.feature_importances_ + importance_method = "built-in (Gini/gain, averaged across seeds and folds)" + elif hasattr(model, 'coef_'): + coef = model.coef_ + if len(coef.shape) > 1: + 
feature_importances = np.mean(np.abs(coef), axis=0) + else: + feature_importances = np.abs(coef) + importance_method = "coefficient magnitude (averaged across seeds and folds)" + + # If no built-in importance, use permutation importance (e.g., for MLPs) + if feature_importances is None and X_test_fold is not None and y_test_fold is not None: + # Only print once per fold + if len(fold_importances_list) == 0: + print(f" Fold {fold_id}: Computing permutation importance (this may take a while)...") + + # Determine scoring metric based on problem type + if problem_type == "classification": + scoring = "accuracy" + else: # regression + scoring = "r2" + + # Compute permutation importance + perm_importance = permutation_importance( + model, X_test_fold, y_test_fold, + n_repeats=10, # Number of times to permute each feature + random_state=42, + scoring=scoring, + n_jobs=-1 # Use all available cores + ) + feature_importances = perm_importance.importances_mean + importance_method = f"permutation (based on {scoring}, averaged across seeds and folds)" + + if feature_importances is not None: + fold_importances_list.append(feature_importances) + + # Average across seeds for this fold + if fold_importances_list: + fold_avg_importance = np.mean(fold_importances_list, axis=0) + fold_importances_dict[fold_id] = fold_avg_importance + + # Average importances across all folds + if fold_importances_dict: + all_fold_importances = list(fold_importances_dict.values()) + avg_feature_importances = np.mean(all_fold_importances, axis=0) + std_feature_importances = np.std(all_fold_importances, axis=0) + + print(f"\n{model_name.upper()}:") + print(f"{'-'*80}") + print(f"Method: {importance_method}") + print(f"Computed from all {len(fold_importances_dict)} folds") + + # Sort features by importance + indices = np.argsort(avg_feature_importances)[::-1] + + # Helper to get feature label + def get_feature_label(idx): + if feature_names and idx < len(feature_names): + return feature_names[idx] + return f"Feature {idx}" + + # Print top 20 features + print(f"\nTop 20 most important features (mean ± std):") + for i, idx in enumerate(indices[:20], 1): + importance = avg_feature_importances[idx] + std = std_feature_importances[idx] + feat_label = get_feature_label(idx) + print(f" {i:2d}. 
{feat_label:40s}: {importance:.6f} ± {std:.6f}") + + # Save feature importance to CSV (all features, sorted by importance) + feature_importance_data = [] + for rank, idx in enumerate(indices, 1): + row = { + "rank": rank, + "feature": get_feature_label(idx), + "importance_mean_all_folds": avg_feature_importances[idx], + "importance_std_across_folds": std_feature_importances[idx], + "feature_index": idx + } + # Add per-fold importance + for fold_id in sorted(fold_importances_dict.keys()): + row[f"importance_fold_{fold_id}"] = fold_importances_dict[fold_id][idx] + feature_importance_data.append(row) + + fi_df = pd.DataFrame(feature_importance_data) + model_dir = os.path.join(output_dir, model_name) + os.makedirs(model_dir, exist_ok=True) + fi_csv_path = os.path.join(model_dir, f"feature_importance_{model_name}_cv_all_folds.csv") + fi_df.to_csv(fi_csv_path, index=False) + print(f"\n Feature importance CSV (all folds) saved to: {fi_csv_path}") + + # Create feature importance plot + plot_feature_importance( + avg_feature_importances, + std_feature_importances, + feature_names, + f"{model_name}_cv", + output_dir, + importance_method, + top_n=20 + ) + else: + print(f"\n{model_name.upper()}: Feature importance not available") + except Exception as e: + print(f"\nWarning: Could not extract feature importance for {model_name}: {e}") + + print(f"\n{'='*80}\n") diff --git a/ecosci/evaluation/metrics.py b/ecosci/evaluation/metrics.py new file mode 100644 index 0000000..9b6b49b --- /dev/null +++ b/ecosci/evaluation/metrics.py @@ -0,0 +1,239 @@ +"""Metrics computation for classification and regression tasks. + +This module contains functions for computing various performance metrics +for both single-output and multi-output problems. +""" + +from typing import Dict, Any +import numpy as np + + +def safe_std(vals): + """Calculate std, returning 0.0 instead of nan when there's only 1 value.""" + if len(vals) <= 1: + return 0.0 + return vals.std() + + +def compute_classification_metrics(y_true, y_pred, y_proba=None) -> Dict[str, Any]: + """Compute classification metrics. + + Parameters + ---------- + y_true : array-like + Ground truth labels. + y_pred : array-like + Predicted labels. + y_proba : array-like, optional + Predicted probabilities (if available). + + Returns + ------- + Dict[str, Any] + Dictionary containing accuracy, balanced_accuracy, precision, recall, + f1, cohen_kappa, confusion_matrix, and optionally roc_auc and average_precision. 
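+
+    Examples
+    --------
+    A doctest-style sketch using made-up binary labels, purely for
+    illustration (the numbers are not taken from any project dataset):
+
+    >>> metrics = compute_classification_metrics([0, 1, 1, 0], [0, 1, 0, 0])
+    >>> print(round(metrics["accuracy"], 2))
+    0.75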
+ """ + from sklearn.metrics import ( + accuracy_score, + precision_score, + recall_score, + f1_score, + roc_auc_score, + confusion_matrix, + balanced_accuracy_score, + average_precision_score, + cohen_kappa_score, + ) + + out = {} + out["accuracy"] = accuracy_score(y_true, y_pred) + out["balanced_accuracy"] = balanced_accuracy_score(y_true, y_pred) + out["precision"] = precision_score( + y_true, + y_pred, + average="binary" if len(np.unique(y_true)) == 2 else "macro", + zero_division=0, + ) + out["recall"] = recall_score( + y_true, + y_pred, + average="binary" if len(np.unique(y_true)) == 2 else "macro", + zero_division=0, + ) + out["f1"] = f1_score( + y_true, + y_pred, + average="binary" if len(np.unique(y_true)) == 2 else "macro", + zero_division=0, + ) + out["cohen_kappa"] = cohen_kappa_score(y_true, y_pred) + out["confusion_matrix"] = confusion_matrix(y_true, y_pred).tolist() + + if y_proba is not None: + try: + # Binary classification: use positive class probabilities + if y_proba.ndim == 2 and y_proba.shape[1] == 2: + out["roc_auc"] = roc_auc_score(y_true, y_proba[:, 1]) + out["average_precision"] = average_precision_score( + y_true, y_proba[:, 1] + ) + # Multiclass: use OVR strategy + elif y_proba.ndim == 2 and y_proba.shape[1] > 2: + out["roc_auc"] = roc_auc_score(y_true, y_proba, multi_class="ovr") + out["average_precision"] = average_precision_score( + y_true, y_proba, average="macro" + ) + # 1D probabilities (rare, but handle it) + else: + out["roc_auc"] = roc_auc_score(y_true, y_proba) + out["average_precision"] = average_precision_score(y_true, y_proba) + except Exception as e: + print(f"Warning: Could not compute ROC-AUC/AP: {e}") + out["roc_auc"] = None + out["average_precision"] = None + else: + out["roc_auc"] = None + out["average_precision"] = None + + return out + + +def compute_multi_output_classification_metrics(y_true, y_pred, y_proba=None) -> Dict[str, Any]: + """Compute classification metrics for multi-output predictions. + + For multi-output, we compute metrics per output and aggregate. + + Parameters + ---------- + y_true : array-like, shape (n_samples, n_outputs) + Ground truth labels + y_pred : array-like, shape (n_samples, n_outputs) + Predicted labels + y_proba : array-like, shape (n_samples, n_outputs, n_classes) or None + Predicted probabilities (if available) + + Returns + ------- + Dict[str, Any] + Dictionary with average metrics across outputs and per-output metrics + """ + from sklearn.metrics import ( + accuracy_score, + precision_score, + recall_score, + f1_score, + balanced_accuracy_score, + cohen_kappa_score, + ) + + n_outputs = y_true.shape[1] + out = {} + + # Compute per-output metrics + per_output_metrics = [] + for i in range(n_outputs): + y_true_i = y_true[:, i] + y_pred_i = y_pred[:, i] + y_proba_i = y_proba[i] if y_proba is not None and isinstance(y_proba, list) else None + + metrics_i = compute_classification_metrics(y_true_i, y_pred_i, y_proba_i) + per_output_metrics.append(metrics_i) + + # Aggregate metrics (average across outputs) + for key in ['accuracy', 'balanced_accuracy', 'precision', 'recall', 'f1', 'cohen_kappa']: + values = [m[key] for m in per_output_metrics if key in m and m[key] is not None] + if values: + out[f"{key}_mean"] = np.mean(values) + out[f"{key}_std"] = np.std(values) + + # Store per-output metrics + out["per_output"] = per_output_metrics + out["n_outputs"] = n_outputs + + return out + + +def compute_regression_metrics(y_true, y_pred) -> Dict[str, Any]: + """Compute regression metrics. 
+ + Parameters + ---------- + y_true : array-like + Ground truth target values. + y_pred : array-like + Predicted target values. + + Returns + ------- + Dict[str, Any] + Dictionary containing: + - mse: Mean Squared Error + - rmse: Root Mean Squared Error + - mae: Mean Absolute Error + - r2: R-squared (coefficient of determination) + - mape: Mean Absolute Percentage Error (if no zeros in y_true) + """ + from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score + + out = {} + out["mse"] = mean_squared_error(y_true, y_pred) + out["rmse"] = np.sqrt(out["mse"]) + out["mae"] = mean_absolute_error(y_true, y_pred) + out["r2"] = r2_score(y_true, y_pred) + + # MAPE - only compute if no zeros in y_true + try: + if not np.any(y_true == 0): + mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100 + out["mape"] = mape + else: + out["mape"] = None + except Exception: + out["mape"] = None + + return out + + +def compute_multi_output_regression_metrics(y_true, y_pred) -> Dict[str, Any]: + """Compute regression metrics for multi-output predictions. + + For multi-output regression, we compute metrics per output and aggregate. + + Parameters + ---------- + y_true : array-like, shape (n_samples, n_outputs) + Ground truth values + y_pred : array-like, shape (n_samples, n_outputs) + Predicted values + + Returns + ------- + Dict[str, Any] + Dictionary with average metrics across outputs and per-output metrics + """ + from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score + + n_outputs = y_true.shape[1] + out = {} + + # Compute per-output metrics + per_output_metrics = [] + for i in range(n_outputs): + y_true_i = y_true[:, i] + y_pred_i = y_pred[:, i] + + metrics_i = compute_regression_metrics(y_true_i, y_pred_i) + per_output_metrics.append(metrics_i) + + # Aggregate metrics (average across outputs) + for key in ['mse', 'rmse', 'mae', 'r2']: + values = [m[key] for m in per_output_metrics if key in m and m[key] is not None] + if values: + out[f"{key}_mean"] = np.mean(values) + out[f"{key}_std"] = np.std(values) + + # Store per-output metrics + out["per_output"] = per_output_metrics + out["n_outputs"] = n_outputs + + return out diff --git a/ecosci/evaluation/plotting.py b/ecosci/evaluation/plotting.py new file mode 100644 index 0000000..ca0a1b0 --- /dev/null +++ b/ecosci/evaluation/plotting.py @@ -0,0 +1,427 @@ +"""Plotting and visualisation utilities for evaluation results. + +This module contains functions for creating various plots and visualisations +for model evaluation, including confusion matrices, PR curves, residual plots, +and feature importance plots. +""" + +from typing import Dict, Any, Optional, List +import os +import numpy as np +import matplotlib.pyplot as plt +import seaborn as sns +import pandas as pd + + +def plot_metric_comparisons( + combined_df: pd.DataFrame, + plot_metrics: List[str], + output_dir: str, + all_dfs: Optional[Dict] = None +): + """Create boxplot comparisons for metrics across models or seeds. 
+ + Parameters + ---------- + combined_df : pd.DataFrame + Combined DataFrame with all results + plot_metrics : List[str] + List of metric names to plot + output_dir : str + Directory to save plots + all_dfs : Optional[Dict] + Dictionary of DataFrames per model (for multi-model comparison) + """ + if all_dfs and len(all_dfs) > 1: + # Multi-model comparison plots + for m in plot_metrics: + if m in combined_df.columns: + plt.figure(figsize=(max(6, len(all_dfs) * 1.5), 4)) + sns.boxplot(data=combined_df, x="model", y=m) + plt.title(f"{m.replace('_', ' ').title()} - Model Comparison") + plt.xlabel("Model") + plt.ylabel(m.replace("_", " ").title()) + plt.xticks(rotation=45, ha="right") + plt.tight_layout() + plt.savefig(os.path.join(output_dir, f"comparison_{m}.png")) + plt.close() + else: + # Single model plots (backward compatibility) + for m in plot_metrics: + if m in combined_df.columns: + plt.figure(figsize=(4, 3)) + sns.boxplot(data=combined_df, y=m) + plt.title(m.replace("_", " ").title()) + plt.tight_layout() + plt.savefig(os.path.join(output_dir, f"metric_{m}.png")) + plt.close() + + +def plot_confusion_matrices( + results: Dict, + y_test, + output_dir: str +): + """Create confusion matrix heatmaps for classification models. + + Parameters + ---------- + results : Dict + Results dictionary {model_name: [run_results]} + y_test : array-like + Test labels + output_dir : str + Directory to save plots + """ + from sklearn.metrics import confusion_matrix + + for model_name, model_results in results.items(): + try: + last = model_results[-1] + cm = confusion_matrix(y_test, last.get("y_pred")) + plt.figure(figsize=(5, 4)) + sns.heatmap(cm, annot=True, fmt="d", cmap="Blues") + plt.title(f"Confusion Matrix - {model_name.upper()} (last run)") + plt.xlabel("Predicted") + plt.ylabel("True") + plt.tight_layout() + model_dir = os.path.join(output_dir, model_name) + os.makedirs(model_dir, exist_ok=True) + plt.savefig( + os.path.join(model_dir, f"confusion_matrix_{model_name}.png") + ) + plt.close() + except Exception as e: + print( + f"Warning: Could not create confusion matrix for {model_name}: {e}" + ) + + +def plot_pr_curves( + results: Dict, + y_test, + output_dir: str +): + """Create precision-recall curves for classification models. 
+ + Parameters + ---------- + results : Dict + Results dictionary {model_name: [run_results]} + y_test : array-like + Test labels + output_dir : str + Directory to save plots + """ + from sklearn.metrics import ( + precision_recall_curve, + average_precision_score, + ) + + # Individual PR curves for each model + for model_name, model_results in results.items(): + try: + last = model_results[-1] + y_proba = last.get("y_proba") + if y_proba is not None: + # if proba has shape (n,2), take positive class + if getattr(y_proba, "ndim", 1) == 2 and y_proba.shape[1] >= 2: + pos_scores = y_proba[:, 1] + else: + pos_scores = y_proba + precision, recall, _ = precision_recall_curve(y_test, pos_scores) + ap = average_precision_score(y_test, pos_scores) + plt.figure(figsize=(5, 4)) + plt.plot(recall, precision, label=f"AP={ap:.3f}") + plt.xlabel("Recall") + plt.ylabel("Precision") + plt.title(f"Precision-Recall - {model_name.upper()} (last run)") + plt.legend() + plt.tight_layout() + model_dir = os.path.join(output_dir, model_name) + os.makedirs(model_dir, exist_ok=True) + plt.savefig(os.path.join(model_dir, f"pr_curve_{model_name}.png")) + plt.close() + except Exception as e: + pass # Silently skip if PR curve can't be generated + + # Combined PR curve for model comparison (binary classification only) + if len(results) > 1: + try: + plt.figure(figsize=(7, 5)) + for model_name, model_results in results.items(): + last = model_results[-1] + y_proba = last.get("y_proba") + if y_proba is not None: + if getattr(y_proba, "ndim", 1) == 2 and y_proba.shape[1] >= 2: + pos_scores = y_proba[:, 1] + else: + pos_scores = y_proba + precision, recall, _ = precision_recall_curve( + y_test, pos_scores + ) + ap = average_precision_score(y_test, pos_scores) + plt.plot( + recall, + precision, + label=f"{model_name} (AP={ap:.3f})", + linewidth=2, + ) + + plt.xlabel("Recall") + plt.ylabel("Precision") + plt.title("Precision-Recall Comparison (last run)") + plt.legend() + plt.grid(True, alpha=0.3) + plt.tight_layout() + plt.savefig(os.path.join(output_dir, "pr_curve_comparison.png")) + plt.close() + except Exception as e: + pass # Silently skip if comparison can't be generated + + +def plot_residuals( + results: Dict, + y_test, + output_dir: str +): + """Create residual plots for regression models. 
+
+    Parameters
+    ----------
+    results : Dict
+        Results dictionary {model_name: [run_results]}
+    y_test : array-like
+        Test values
+    output_dir : str
+        Directory to save plots
+    """
+    for model_name, model_results in results.items():
+        try:
+            last = model_results[-1]
+            y_pred = last.get("y_pred")
+            residuals = y_test - y_pred
+
+            fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
+
+            # Predicted vs Actual
+            ax1.scatter(y_test, y_pred, alpha=0.6)
+            min_val = min(y_test.min(), y_pred.min())
+            max_val = max(y_test.max(), y_pred.max())
+            ax1.plot([min_val, max_val], [min_val, max_val], "r--", lw=2)
+            ax1.set_xlabel("Actual")
+            ax1.set_ylabel("Predicted")
+            ax1.set_title(f"Predicted vs Actual - {model_name.upper()}")
+            ax1.grid(True, alpha=0.3)
+
+            # Residual plot
+            ax2.scatter(y_pred, residuals, alpha=0.6)
+            ax2.axhline(y=0, color="r", linestyle="--", lw=2)
+            ax2.set_xlabel("Predicted")
+            ax2.set_ylabel("Residuals")
+            ax2.set_title(f"Residual Plot - {model_name.upper()}")
+            ax2.grid(True, alpha=0.3)
+
+            plt.tight_layout()
+            model_dir = os.path.join(output_dir, model_name)
+            os.makedirs(model_dir, exist_ok=True)
+            plt.savefig(os.path.join(model_dir, f"residual_plot_{model_name}.png"))
+            plt.close()
+        except Exception as e:
+            print(f"Warning: Could not create residual plot for {model_name}: {e}")
+
+
+def plot_feature_importance(
+    feature_importances: np.ndarray,
+    feature_std: np.ndarray,
+    feature_names: Optional[List[str]],
+    model_name: str,
+    output_dir: str,
+    importance_method: str,
+    top_n: int = 20
+):
+    """Create feature importance bar plot.
+
+    Parameters
+    ----------
+    feature_importances : np.ndarray
+        Array of feature importance values
+    feature_std : np.ndarray
+        Array of standard deviations for feature importances
+    feature_names : Optional[List[str]]
+        Names of features
+    model_name : str
+        Name of the model
+    output_dir : str
+        Directory to save plot
+    importance_method : str
+        Description of the method used to compute importance
+    top_n : int
+        Number of top features to display
+    """
+    try:
+        # Sort features by importance
+        indices = np.argsort(feature_importances)[::-1]
+
+        # Helper to get feature label
+        def get_feature_label(idx):
+            if feature_names and idx < len(feature_names):
+                return feature_names[idx]
+            return f"Feature {idx}"
+
+        fig, ax = plt.subplots(figsize=(10, 8))
+
+        # Plot top N features
+        top_n = min(top_n, len(feature_importances))
+        top_indices = indices[:top_n]
+        top_importances = feature_importances[top_indices]
+        top_stds = feature_std[top_indices]
+
+        y_pos = np.arange(top_n)
+        ax.barh(y_pos, top_importances, xerr=top_stds, align='center', alpha=0.7, capsize=3)
+        ax.set_yticks(y_pos)
+        ax.set_yticklabels([get_feature_label(idx) for idx in top_indices])
+        ax.invert_yaxis()
+        ax.set_xlabel('Importance')
+        ax.set_title(f'Top {top_n} Feature Importances - {model_name.upper()}\n({importance_method})')
+        ax.grid(True, alpha=0.3, axis='x')
+
+        plt.tight_layout()
+        model_dir = os.path.join(output_dir, model_name)
+        os.makedirs(model_dir, exist_ok=True)
+        plt.savefig(os.path.join(model_dir, f"feature_importance_{model_name}.png"), dpi=100, bbox_inches='tight')
+        plt.close()
+
+        print(f"\n Feature importance plot saved to: {model_dir}/feature_importance_{model_name}.png")
+    except Exception as e:
+        print(f" Warning: Could not create feature importance plot: {e}")
+
+
+def plot_cv_comparison(
+    all_dfs: Dict[str, pd.DataFrame],
+    output_dir: str,
+    problem_type: str,
+    is_multi_output: bool
+):
+    """Create comparison plots across folds and models for cross-validation.
+
+    Parameters
+    ----------
+    all_dfs : Dict[str, pd.DataFrame]
+        Dictionary of DataFrames per model
+    output_dir : str
+        Directory to save plots
+    problem_type : str
+        "classification" or "regression"
+    is_multi_output : bool
+        Whether this is a multi-output problem
+    """
+    if len(all_dfs) <= 1:
+        return
+
+    print(f"\nGenerating comparison plots...")
+
+    # Determine which metrics to plot
+    if problem_type == "classification":
+        if is_multi_output:
+            metrics_to_plot = {"accuracy_mean": "Accuracy", "balanced_accuracy_mean": "Balanced Accuracy",
+                               "f1_mean": "F1 Score", "cohen_kappa_mean": "Cohen's Kappa"}
+        else:
+            metrics_to_plot = {"accuracy": "Accuracy", "balanced_accuracy": "Balanced Accuracy",
+                               "f1": "F1 Score", "cohen_kappa": "Cohen's Kappa"}
+    else:
+        if is_multi_output:
+            metrics_to_plot = {"r2_mean": "R²", "rmse_mean": "RMSE", "mae_mean": "MAE", "mse_mean": "MSE"}
+        else:
+            metrics_to_plot = {"r2": "R²", "rmse": "RMSE", "mae": "MAE", "mse": "MSE"}
+
+    for metric_key, metric_label in metrics_to_plot.items():
+        try:
+            fig, ax = plt.subplots(figsize=(10, 6))
+
+            data_for_plot = []
+            labels_for_plot = []
+
+            for model_name, df in all_dfs.items():
+                if metric_key in df.columns:
+                    data_for_plot.append(df[metric_key].values)
+                    labels_for_plot.append(model_name.upper())
+
+            if data_for_plot:
+                bp = ax.boxplot(data_for_plot, labels=labels_for_plot, patch_artist=True)
+                for patch in bp["boxes"]:
+                    patch.set_facecolor("lightblue")
+
+            ax.set_ylabel(metric_label)
+            ax.set_title(f"{metric_label} Comparison - K-Fold CV")
+            ax.grid(True, alpha=0.3, axis='y')
+            plt.xticks(rotation=45, ha='right')
+            plt.tight_layout()
+            plt.savefig(os.path.join(output_dir, f"cv_comparison_{metric_key}.png"))
+            plt.close()
+        except Exception as e:
+            print(f" Warning: Could not create comparison plot for {metric_key}: {e}")
+
+
+def plot_validation_vs_test(
+    val_df: pd.DataFrame,
+    test_df: pd.DataFrame,
+    model_name: str,
+    output_dir: str,
+    problem_type: str
+):
+    """Create validation vs test comparison plots for hyperparameter tuning.
+ + Parameters + ---------- + val_df : pd.DataFrame + Validation metrics DataFrame + test_df : pd.DataFrame + Test metrics DataFrame + model_name : str + Name of the model + output_dir : str + Directory to save plots + problem_type : str + "classification" or "regression" + """ + try: + # Select key metrics based on problem type + if problem_type == "regression": + metrics_to_plot = ['r2', 'rmse', 'mae'] + else: + metrics_to_plot = ['accuracy', 'balanced_accuracy', 'f1'] + + fig, axes = plt.subplots(1, len(metrics_to_plot), figsize=(5*len(metrics_to_plot), 5)) + if len(metrics_to_plot) == 1: + axes = [axes] + + for idx, metric in enumerate(metrics_to_plot): + ax = axes[idx] + + # Prepare data for boxplot + data = [val_df[metric].values, test_df[metric].values] + positions = [1, 2] + labels = ['Validation', 'Test'] + + bp = ax.boxplot(data, positions=positions, labels=labels, + patch_artist=True, widths=0.6) + + # Colour boxes + colours = ['lightblue', 'lightcoral'] + for patch, colour in zip(bp['boxes'], colours): + patch.set_facecolor(colour) + + ax.set_ylabel(metric.upper()) + ax.set_title(f'{metric.upper()} - {model_name.upper()}') + ax.grid(True, alpha=0.3, axis='y') + + plt.tight_layout() + model_dir = os.path.join(output_dir, model_name) + os.makedirs(model_dir, exist_ok=True) + plot_path = os.path.join(model_dir, f"val_vs_test_{model_name}.png") + plt.savefig(plot_path, dpi=100, bbox_inches='tight') + plt.close() + print(f" Comparison plot saved to: {plot_path}") + except Exception as e: + print(f" Warning: Could not create comparison plot: {e}") diff --git a/ecosci/evaluation/reporting.py b/ecosci/evaluation/reporting.py new file mode 100644 index 0000000..ad9f67b --- /dev/null +++ b/ecosci/evaluation/reporting.py @@ -0,0 +1,766 @@ +"""Reporting and evaluation utilities. + +This module contains the main evaluation and reporting functions that orchestrate +metrics computation, feature importance analysis, and plot generation. + +Focus on metrics that make sense for unbalanced datasets: +- Balanced accuracy (accounts for class imbalance) +- Precision/Recall/F1 (macro for multi-class) +- ROC-AUC and Average Precision (PR AUC) when probabilities are available + +Outputs: +- `report.json`: list of per-seed metrics +- `metric_*.png`: quick boxplots across seeds +- `confusion_matrix.png`: confusion matrix heatmap (last run) +- `pr_curve.png`: precision-recall curve if binary and probabilities exist +""" + +from typing import Dict, Any, Optional +import os +import json +import numpy as np +import pandas as pd + +from .metrics import ( + safe_std, + compute_classification_metrics, + compute_regression_metrics, + compute_multi_output_classification_metrics, + compute_multi_output_regression_metrics, +) +from .plotting import ( + plot_metric_comparisons, + plot_confusion_matrices, + plot_pr_curves, + plot_residuals, + plot_feature_importance, + plot_cv_comparison, + plot_validation_vs_test, +) +from .feature_importance import ( + extract_feature_importance, + extract_cv_feature_importance, +) + + +def evaluate_and_report( + results, y_test, output_dir: str = "outputs", problem_type: str = "classification", + label_names: list = None, feature_names: list = None, X_test=None +): + """Compute metrics across runs and save a compact report and plots. 
+ + Parameters + ---------- + results : dict or list + If dict: Output from Trainer.run() with multiple models {model_name: [run_results]} + If list: Single model results (backward compatibility) + y_test : array-like + Ground-truth labels (classification) or values (regression) for the test set. + output_dir : str + Where to save the report and plots. + problem_type : str + "classification" or "regression". Determines which metrics to compute. + label_names : list, optional + Names of the target variables (for multi-output). If None, will use Output 1, Output 2, etc. + feature_names : list, optional + Names of the input features. If None, will use Feature 0, Feature 1, etc. + X_test : array-like, optional + Test features for permutation importance. Required for models without built-in importance (e.g., MLPs). + """ + os.makedirs(output_dir, exist_ok=True) + + # Handle backward compatibility: convert list to dict + if isinstance(results, list): + # Assume single model (old format) + model_name = results[0].get("model_name", "model") if results else "model" + results = {model_name: results} + + # Determine if we have multi-output + is_multi_output = len(y_test.shape) > 1 and y_test.shape[1] > 1 + + # Compute metrics for each model + all_summaries = {} + all_dfs = {} + + for model_name, model_results in results.items(): + summary = [] + for r in model_results: + seed = r.get("seed") + y_pred = r.get("y_pred") + y_proba = r.get("y_proba") + + if is_multi_output: + # Multi-output case + if problem_type == "regression": + metrics = compute_multi_output_regression_metrics(y_test, y_pred) + else: + metrics = compute_multi_output_classification_metrics(y_test, y_pred, y_proba) + else: + # Single output case + if problem_type == "regression": + metrics = compute_regression_metrics(y_test, y_pred) + else: + metrics = compute_classification_metrics(y_test, y_pred, y_proba) + + metrics["seed"] = seed + metrics["model"] = model_name + summary.append(metrics) + + all_summaries[model_name] = summary + all_dfs[model_name] = pd.DataFrame(summary) + + # Save individual model reports in model-specific subfolders + for model_name, summary in all_summaries.items(): + model_dir = os.path.join(output_dir, model_name) + os.makedirs(model_dir, exist_ok=True) + report_path = os.path.join(model_dir, f"report_{model_name}.json") + with open(report_path, "w") as f: + json.dump(summary, f, indent=2) + + # Combine all results for comparison + combined_df = pd.concat(all_dfs.values(), ignore_index=True) + combined_summary = [item for sublist in all_summaries.values() for item in sublist] + + # Save combined report + with open(os.path.join(output_dir, "report_all_models.json"), "w") as f: + json.dump(combined_summary, f, indent=2) + + # Print results for each model + display_cols, comparison_metrics, plot_metrics = _get_metric_columns( + problem_type, is_multi_output, combined_df + ) + + _print_model_results(all_dfs, display_cols, is_multi_output, label_names) + + if len(all_dfs) > 1: + _print_model_comparison(all_dfs, comparison_metrics) + _print_model_ranking(all_dfs, display_cols, problem_type, is_multi_output) + + # Create plots + plot_metric_comparisons(combined_df, plot_metrics, output_dir, all_dfs) + + if problem_type == "classification": + plot_confusion_matrices(results, y_test, output_dir) + plot_pr_curves(results, y_test, output_dir) + elif problem_type == "regression" and not is_multi_output: + plot_residuals(results, y_test, output_dir) + + # Feature importance analysis + extract_feature_importance(results, 
X_test, y_test, output_dir, problem_type, feature_names) + + return combined_summary + + +def evaluate_and_report_cv( + results, output_dir: str = "outputs", problem_type: str = "classification", + label_names: list = None, feature_names: list = None, X_test=None +): + """Compute metrics for k-fold cross-validation with per-fold breakdown. + + Parameters + ---------- + results : dict + Output from Trainer.run_cv() with format {model_name: [run_results_with_fold_info]} + Each run_result contains: seed, fold, y_pred, y_test, etc. + output_dir : str + Where to save the report and plots. + problem_type : str + "classification" or "regression". + label_names : list, optional + Names of the target variables (for multi-output). + feature_names : list, optional + Names of the input features. + X_test : array-like, optional + Test features (note: in CV, each fold has different test set). + """ + os.makedirs(output_dir, exist_ok=True) + + print(f"\n{'='*80}") + print("K-FOLD CROSS-VALIDATION RESULTS") + print(f"{'='*80}\n") + + # Determine if multi-output by checking first result + first_result = next(iter(results.values()))[0] + y_test_sample = first_result["y_test"] + is_multi_output = len(y_test_sample.shape) > 1 and y_test_sample.shape[1] > 1 + + all_summaries = {} + all_dfs = {} + + for model_name, model_results in results.items(): + # Group results by fold + folds = {} + for r in model_results: + fold_id = r.get("fold") + if fold_id not in folds: + folds[fold_id] = [] + folds[fold_id].append(r) + + n_folds = len(folds) + + print(f"\n{model_name.upper()} - {n_folds} Folds") + print(f"{'-'*80}") + + # Compute metrics for each fold + fold_summaries = [] + for fold_id in sorted(folds.keys()): + fold_results = folds[fold_id] + fold_metrics_list = [] + + for r in fold_results: + seed = r.get("seed") + y_pred = r.get("y_pred") + y_test = r.get("y_test") + y_proba = r.get("y_proba") + + if problem_type == "classification": + if is_multi_output: + metrics = compute_multi_output_classification_metrics(y_test, y_pred, y_proba) + else: + metrics = compute_classification_metrics(y_test, y_pred, y_proba) + else: # regression + if is_multi_output: + metrics = compute_multi_output_regression_metrics(y_test, y_pred) + else: + metrics = compute_regression_metrics(y_test, y_pred) + + metrics["seed"] = seed + metrics["fold"] = fold_id + fold_metrics_list.append(metrics) + + fold_summaries.append(fold_metrics_list) + + # Print per-fold metrics + metric_keys = _get_cv_metric_keys(problem_type, is_multi_output) + _print_cv_fold_results(folds, fold_summaries, metric_keys) + + # Compute overall metrics (averaged across all folds and seeds) + all_fold_metrics = [m for fold_list in fold_summaries for m in fold_list] + overall_df = pd.DataFrame(all_fold_metrics) + + # Print overall metrics + print(f"\n {'Overall':<6}", end="") + for mk in metric_keys: + if mk in overall_df.columns: + mean_val = overall_df[mk].mean() + std_val = overall_df[mk].std() + print(f"{mean_val:.4f} ± {std_val:.4f} ", end="") + else: + print(f"{'N/A':<20}", end="") + print("\n") + + # Save reports and metrics + _save_cv_reports(model_name, all_fold_metrics, fold_summaries, folds, overall_df, output_dir) + + all_summaries[model_name] = overall_df + all_dfs[model_name] = overall_df + + # Create comparison plots across folds and models + plot_cv_comparison(all_dfs, output_dir, problem_type, is_multi_output) + + print(f"\n{'='*80}\n") + + # Feature importance for CV + extract_cv_feature_importance(results, output_dir, problem_type, feature_names) + + 
return all_summaries + + +def evaluate_tuning_results( + results: Dict, + y_val: np.ndarray, + y_test: np.ndarray, + output_dir: str = "outputs", + problem_type: str = "classification", + label_names: Optional[list] = None, + feature_names: Optional[list] = None, + X_val: Optional[np.ndarray] = None, + X_test: Optional[np.ndarray] = None, +): + """Evaluate and report results from hyperparameter tuning. + + This function evaluates model performance on both validation and test sets + after hyperparameter tuning. It produces reports and visualisations comparing + performance across seeds and between val/test sets. + + Parameters + ---------- + results : Dict + Results dictionary from trainer.run_with_tuning() + y_val : np.ndarray + Validation set labels + y_test : np.ndarray + Test set labels + output_dir : str + Directory to save outputs + problem_type : str + "classification" or "regression" + label_names : list, optional + Names of label columns + feature_names : list, optional + Names of features (for feature importance) + X_val : np.ndarray, optional + Validation features (for feature importance) + X_test : np.ndarray, optional + Test features (for feature importance) + + Returns + ------- + dict + Summary statistics for each model + """ + os.makedirs(output_dir, exist_ok=True) + + print(f"\n{'='*80}") + print("HYPERPARAMETER TUNING EVALUATION") + print(f"{'='*80}\n") + + all_summaries = {} + + for model_name, model_results in results.items(): + print(f"\n{model_name.upper()}") + print("-" * 60) + + # Collect metrics for each seed + val_metrics_per_seed = [] + test_metrics_per_seed = [] + best_params_per_seed = [] + best_cv_scores = [] + + for run in model_results: + seed = run['seed'] + best_params = run['best_params'] + best_cv_score = run['best_cv_score'] + + best_params_per_seed.append(best_params) + best_cv_scores.append(best_cv_score) + + # Compute validation metrics + if problem_type == "regression": + val_metrics = compute_regression_metrics( + y_val, run['y_val_pred'] + ) + else: + val_metrics = compute_classification_metrics( + y_val, run['y_val_pred'], run.get('y_val_proba') + ) + val_metrics['seed'] = seed + val_metrics_per_seed.append(val_metrics) + + # Compute test metrics + if problem_type == "regression": + test_metrics = compute_regression_metrics( + y_test, run['y_test_pred'] + ) + else: + test_metrics = compute_classification_metrics( + y_test, run['y_test_pred'], run.get('y_test_proba') + ) + test_metrics['seed'] = seed + test_metrics_per_seed.append(test_metrics) + + # Create summary DataFrames + val_df = pd.DataFrame(val_metrics_per_seed) + test_df = pd.DataFrame(test_metrics_per_seed) + + # Print summary statistics + _print_tuning_summary(val_df, test_df, best_cv_scores, problem_type, len(model_results)) + + # Save detailed results + _save_tuning_results(model_name, val_df, test_df, best_params_per_seed, output_dir) + + # Create comparison plots + plot_validation_vs_test(val_df, test_df, model_name, output_dir, problem_type) + + # Store summary + all_summaries[model_name] = { + 'validation': val_df, + 'test': test_df, + 'best_params': best_params_per_seed, + 'best_cv_scores': best_cv_scores, + } + + # Create summary report comparing all models + _save_tuning_summary_report(all_summaries, output_dir, problem_type) + + return all_summaries + + +# Helper functions + +def _get_metric_columns(problem_type, is_multi_output, combined_df): + """Determine which metric columns to display based on problem type.""" + if is_multi_output: + if problem_type == "regression": + 
display_cols = ["seed", "mse_mean", "rmse_mean", "mae_mean", "r2_mean"] + comparison_metrics = ["mse_mean", "rmse_mean", "mae_mean", "r2_mean"] + plot_metrics = ["mse_mean", "rmse_mean", "mae_mean", "r2_mean"] + else: + display_cols = [ + "seed", "accuracy_mean", "balanced_accuracy_mean", + "precision_mean", "recall_mean", "f1_mean", "cohen_kappa_mean", + ] + comparison_metrics = [ + "accuracy_mean", "balanced_accuracy_mean", "f1_mean", "cohen_kappa_mean", + ] + plot_metrics = ["accuracy_mean", "balanced_accuracy_mean", "f1_mean", "cohen_kappa_mean"] + else: + if problem_type == "regression": + display_cols = ["seed", "mse", "rmse", "mae", "r2"] + if "mape" in combined_df.columns: + display_cols.append("mape") + comparison_metrics = ["mse", "rmse", "mae", "r2"] + plot_metrics = ["mse", "rmse", "mae", "r2"] + else: + display_cols = [ + "seed", "accuracy", "balanced_accuracy", + "precision", "recall", "f1", "cohen_kappa", + ] + if "roc_auc" in combined_df.columns: + display_cols.append("roc_auc") + if "average_precision" in combined_df.columns: + display_cols.append("average_precision") + comparison_metrics = [ + "accuracy", "balanced_accuracy", "f1", "cohen_kappa", "roc_auc", + ] + plot_metrics = ["accuracy", "balanced_accuracy", "f1", "cohen_kappa"] + + return display_cols, comparison_metrics, plot_metrics + + +def _print_model_results(all_dfs, display_cols, is_multi_output, label_names): + """Print results for each model.""" + for model_name, df in all_dfs.items(): + print(f"\n{'='*80}") + print(f"RESULTS FOR MODEL: {model_name.upper()}") + print(f"{'='*80}") + print( + df[display_cols].to_string( + index=False, + float_format=lambda x: f"{x:.4f}" if not pd.isna(x) else "N/A", + ) + ) + + print(f"\n{'-'*80}") + print(f"SUMMARY STATISTICS FOR {model_name.upper()} (mean ± std)") + print(f"{'-'*80}") + metric_cols = [c for c in display_cols if c != "seed"] + for col in metric_cols: + if col in df.columns: + vals = df[col].dropna() + if len(vals) > 0: + print(f"{col:20s}: {vals.mean():.4f} ± {safe_std(vals):.4f}") + print(f"{'-'*80}\n") + + # For multi-output, print per-output breakdown + if is_multi_output and len(df) > 0 and "per_output" in df.iloc[0]: + _print_per_output_breakdown(df, label_names, model_name) + + +def _print_per_output_breakdown(df, label_names, model_name): + """Print per-output breakdown for multi-output problems.""" + print(f"\n{'-'*80}") + print(f"PER-OUTPUT BREAKDOWN FOR {model_name.upper()}") + print(f"{'-'*80}") + + first_row = df.iloc[0] + if "per_output" in first_row and first_row["per_output"] is not None: + per_output_list = first_row["per_output"] + n_outputs = len(per_output_list) + + # Collect metrics across all seeds for each output + for output_idx in range(n_outputs): + # Use label name if provided, otherwise use generic Output N + if label_names and output_idx < len(label_names): + output_label = f"{label_names[output_idx]}" + else: + output_label = f"Output {output_idx + 1}" + + print(f"\n{output_label}:") + + # Aggregate metrics across seeds for this output + output_metrics = {} + for _, row in df.iterrows(): + if "per_output" in row and row["per_output"] is not None: + per_out = row["per_output"] + if output_idx < len(per_out): + for key, val in per_out[output_idx].items(): + if key not in ["confusion_matrix", "seed", "model"] and val is not None: + if key not in output_metrics: + output_metrics[key] = [] + output_metrics[key].append(val) + + # Print summary for this output + for metric_name, values in sorted(output_metrics.items()): + if len(values) > 
0: + mean_val = np.mean(values) + std_val = safe_std(np.array(values)) + print(f" {metric_name:20s}: {mean_val:.4f} ± {std_val:.4f}") + print(f"{'-'*80}\n") + + +def _print_model_comparison(all_dfs, comparison_metrics): + """Print model comparison table.""" + print(f"\n{'='*80}") + print("MODEL COMPARISON (mean ± std)") + print(f"{'='*80}") + comparison_data = [] + for model_name, df in all_dfs.items(): + row = {"Model": model_name} + for col in comparison_metrics: + if col in df.columns: + vals = df[col].dropna() + if len(vals) > 0: + row[col] = f"{vals.mean():.4f} ± {safe_std(vals):.4f}" + else: + row[col] = "N/A" + comparison_data.append(row) + + comparison_df = pd.DataFrame(comparison_data) + print(comparison_df.to_string(index=False)) + print(f"{'='*80}\n") + + +def _print_model_ranking(all_dfs, display_cols, problem_type, is_multi_output): + """Print model ranking based on primary metric.""" + # Determine primary metric + if is_multi_output: + if problem_type == "regression": + primary_metric = "mse_mean" + lower_is_better = True + else: + primary_metric = "cohen_kappa_mean" + lower_is_better = False + else: + if problem_type == "regression": + primary_metric = "mse" + lower_is_better = True + else: + primary_metric = "cohen_kappa" + lower_is_better = False + + # Calculate mean primary metric for each model + model_scores = [] + for model_name, df in all_dfs.items(): + if primary_metric in df.columns: + vals = df[primary_metric].dropna() + if len(vals) > 0: + model_scores.append({ + "model": model_name, + "mean": vals.mean(), + "std": safe_std(vals), + }) + + if len(model_scores) > 0: + # Sort by primary metric + model_scores.sort(key=lambda x: x["mean"], reverse=not lower_is_better) + + print(f"\n{'='*80}") + print("MODEL RANKING") + print(f"{'='*80}") + print( + f"Ranked by: {primary_metric.replace('_', ' ').upper()} " + f"({'Lower is better' if lower_is_better else 'Higher is better'})" + ) + print(f"{'-'*80}") + + # Create ranking table with all metrics + ranking_data = [] + for rank, score in enumerate(model_scores, 1): + model_name = score["model"] + model_df = all_dfs[model_name] + + row = { + "Rank": f"{rank}.", + "Model": model_name.upper(), + primary_metric: f"{score['mean']:.4f} ± {score['std']:.4f}", + } + + # Add other metrics + metric_cols = [c for c in display_cols if c != "seed" and c != primary_metric] + for col in metric_cols: + if col in model_df.columns: + vals = model_df[col].dropna() + if len(vals) > 0: + row[col] = f"{vals.mean():.4f} ± {safe_std(vals):.4f}" + + ranking_data.append(row) + + # Print ranking table + ranking_df = pd.DataFrame(ranking_data) + print(ranking_df.to_string(index=False)) + print(f"{'='*80}\n") + + +def _get_cv_metric_keys(problem_type, is_multi_output): + """Get metric keys for CV results.""" + if problem_type == "classification": + if is_multi_output: + return ["accuracy_mean", "balanced_accuracy_mean", "f1_mean"] + else: + return ["accuracy", "balanced_accuracy", "f1"] + else: + if is_multi_output: + return ["r2_mean", "rmse_mean", "mae_mean"] + else: + return ["r2", "rmse", "mae"] + + +def _print_cv_fold_results(folds, fold_summaries, metric_keys): + """Print per-fold CV results.""" + print(f"\nPer-Fold Metrics (averaged across seeds per fold):") + + # Print table header + print(f"\n {'Fold':<6}", end="") + for mk in metric_keys: + print(f"{mk:<20}", end="") + print() + print(f" {'-'*6}", end="") + for _ in metric_keys: + print(f"{'-'*20}", end="") + print() + + # Print per-fold metrics + for fold_id in sorted(folds.keys()): + 
fold_results_list = fold_summaries[fold_id] + fold_df = pd.DataFrame(fold_results_list) + + print(f" {fold_id:<6}", end="") + for mk in metric_keys: + if mk in fold_df.columns: + mean_val = fold_df[mk].mean() + std_val = fold_df[mk].std() + print(f"{mean_val:.4f} ± {std_val:.4f} ", end="") + else: + print(f"{'N/A':<20}", end="") + print() + + +def _save_cv_reports(model_name, all_fold_metrics, fold_summaries, folds, overall_df, output_dir): + """Save CV reports and metrics to files.""" + model_dir = os.path.join(output_dir, model_name) + os.makedirs(model_dir, exist_ok=True) + + # Save detailed report + report_path = os.path.join(model_dir, f"report_{model_name}_cv.json") + with open(report_path, "w") as f: + json.dump(all_fold_metrics, f, indent=2, default=str) + print(f" Detailed JSON report saved to: {report_path}") + + # Save detailed metrics as CSV (all seeds and folds) + csv_detailed_path = os.path.join(model_dir, f"metrics_{model_name}_cv_detailed.csv") + overall_df.to_csv(csv_detailed_path, index=False) + print(f" Detailed CSV metrics saved to: {csv_detailed_path}") + + # Save per-fold summary (averaged across seeds) + fold_summary_data = [] + for fold_id in sorted(folds.keys()): + fold_results_list = fold_summaries[fold_id] + fold_df = pd.DataFrame(fold_results_list) + + # Calculate mean and std for each metric + fold_summary = {"fold": fold_id} + for col in fold_df.columns: + if col not in ["seed", "fold"]: + fold_summary[f"{col}_mean"] = fold_df[col].mean() + fold_summary[f"{col}_std"] = fold_df[col].std() + fold_summary_data.append(fold_summary) + + # Add overall row + overall_summary = {"fold": "overall"} + for col in overall_df.columns: + if col not in ["seed", "fold"]: + overall_summary[f"{col}_mean"] = overall_df[col].mean() + overall_summary[f"{col}_std"] = overall_df[col].std() + fold_summary_data.append(overall_summary) + + fold_summary_df = pd.DataFrame(fold_summary_data) + csv_summary_path = os.path.join(model_dir, f"metrics_{model_name}_cv_per_fold.csv") + fold_summary_df.to_csv(csv_summary_path, index=False) + print(f" Per-fold summary CSV saved to: {csv_summary_path}") + + +def _print_tuning_summary(val_df, test_df, best_cv_scores, problem_type, n_seeds): + """Print summary of hyperparameter tuning results.""" + print(f"\nValidation Set Performance (across {n_seeds} seeds):") + if problem_type == "regression": + print(f" R² = {val_df['r2'].mean():.4f} ± {safe_std(val_df['r2']):.4f}") + print(f" RMSE = {val_df['rmse'].mean():.4f} ± {safe_std(val_df['rmse']):.4f}") + print(f" MAE = {val_df['mae'].mean():.4f} ± {safe_std(val_df['mae']):.4f}") + else: + print(f" Accuracy = {val_df['accuracy'].mean():.4f} ± {safe_std(val_df['accuracy']):.4f}") + print(f" Balanced Accuracy = {val_df['balanced_accuracy'].mean():.4f} ± {safe_std(val_df['balanced_accuracy']):.4f}") + print(f" F1 = {val_df['f1'].mean():.4f} ± {safe_std(val_df['f1']):.4f}") + + print(f"\nTest Set Performance (across {n_seeds} seeds):") + if problem_type == "regression": + print(f" R² = {test_df['r2'].mean():.4f} ± {safe_std(test_df['r2']):.4f}") + print(f" RMSE = {test_df['rmse'].mean():.4f} ± {safe_std(test_df['rmse']):.4f}") + print(f" MAE = {test_df['mae'].mean():.4f} ± {safe_std(test_df['mae']):.4f}") + else: + print(f" Accuracy = {test_df['accuracy'].mean():.4f} ± {safe_std(test_df['accuracy']):.4f}") + print(f" Balanced Accuracy = {test_df['balanced_accuracy'].mean():.4f} ± {safe_std(test_df['balanced_accuracy']):.4f}") + print(f" F1 = {test_df['f1'].mean():.4f} ± {safe_std(test_df['f1']):.4f}") + + 
print(f"\nBest CV Score (during tuning): {np.mean(best_cv_scores):.4f} ± {safe_std(np.array(best_cv_scores)):.4f}") + + +def _save_tuning_results(model_name, val_df, test_df, best_params_per_seed, output_dir): + """Save hyperparameter tuning results to files.""" + model_dir = os.path.join(output_dir, model_name) + os.makedirs(model_dir, exist_ok=True) + + # Save validation metrics + val_csv = os.path.join(model_dir, f"metrics_{model_name}_validation.csv") + val_df.to_csv(val_csv, index=False) + print(f"\n Validation metrics saved to: {val_csv}") + + # Save test metrics + test_csv = os.path.join(model_dir, f"metrics_{model_name}_test.csv") + test_df.to_csv(test_csv, index=False) + print(f" Test metrics saved to: {test_csv}") + + # Save hyperparameter summary + params_df = pd.DataFrame(best_params_per_seed) + params_csv = os.path.join(model_dir, f"best_params_{model_name}.csv") + params_df.to_csv(params_csv, index=False) + print(f" Best parameters saved to: {params_csv}") + + +def _save_tuning_summary_report(all_summaries, output_dir, problem_type): + """Save summary report comparing all models.""" + summary_report = {} + for model_name, summary in all_summaries.items(): + val_df = summary['validation'] + test_df = summary['test'] + + if problem_type == "regression": + summary_report[model_name] = { + 'val_r2_mean': float(val_df['r2'].mean()), + 'val_r2_std': float(safe_std(val_df['r2'])), + 'test_r2_mean': float(test_df['r2'].mean()), + 'test_r2_std': float(safe_std(test_df['r2'])), + 'val_rmse_mean': float(val_df['rmse'].mean()), + 'val_rmse_std': float(safe_std(val_df['rmse'])), + 'test_rmse_mean': float(test_df['rmse'].mean()), + 'test_rmse_std': float(safe_std(test_df['rmse'])), + } + else: + summary_report[model_name] = { + 'val_accuracy_mean': float(val_df['accuracy'].mean()), + 'val_accuracy_std': float(safe_std(val_df['accuracy'])), + 'test_accuracy_mean': float(test_df['accuracy'].mean()), + 'test_accuracy_std': float(safe_std(test_df['accuracy'])), + 'val_f1_mean': float(val_df['f1'].mean()), + 'val_f1_std': float(safe_std(val_df['f1'])), + 'test_f1_mean': float(test_df['f1'].mean()), + 'test_f1_std': float(safe_std(test_df['f1'])), + } + + # Save summary report + summary_path = os.path.join(output_dir, "tuning_summary_all_models.json") + with open(summary_path, 'w') as f: + json.dump(summary_report, f, indent=2) + print(f"\n{'='*60}") + print(f"Summary report saved to: {summary_path}") + print(f"{'='*60}\n") From 394b94a8d6ed2dcdca7026befd7e5d78bc0fc249 Mon Sep 17 00:00:00 2001 From: Ysobel Date: Mon, 24 Nov 2025 12:39:07 +1100 Subject: [PATCH 05/15] cleaning --- README.md | 2 +- ecosci/hyperopt.py | 2 +- ecosci/trainer.py | 6 +++--- run.py | 4 ++-- tests/test_eval.py | 12 ++++++------ test_tuning.py => tests/test_hyperopt.py | 0 tests/test_integration.py | 2 +- 7 files changed, 14 insertions(+), 14 deletions(-) rename test_tuning.py => tests/test_hyperopt.py (100%) diff --git a/README.md b/README.md index b44695a..f36d463 100644 --- a/README.md +++ b/README.md @@ -442,7 +442,7 @@ python run.py --config configs/penguins_config.yaml python run.py --config configs/possum_config.yaml ``` -These demonstrate that the toolkit works correctly for both problem types and generates appropriate metrics and visualizations. +These demonstrate that the toolkit works correctly for both problem types and generates appropriate metrics and visualisations. 
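+
+The evaluation helpers used by these demos can also be called directly. A minimal
+sketch with toy values (illustrative only; the numbers are not taken from the bundled datasets):
+
+```python
+import numpy as np
+from ecosci.evaluation import compute_regression_metrics
+
+# Toy ground truth and predictions, purely for illustration
+y_true = np.array([1.0, 2.0, 3.0, 4.0])
+y_pred = np.array([1.1, 1.9, 3.2, 3.8])
+print(compute_regression_metrics(y_true, y_pred)["rmse"])
+```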
## Troubleshooting diff --git a/ecosci/hyperopt.py b/ecosci/hyperopt.py index 3490c65..685c2cb 100644 --- a/ecosci/hyperopt.py +++ b/ecosci/hyperopt.py @@ -1,4 +1,4 @@ -"""Hyperparameter optimization for different model types. +"""Hyperparameter optimisation for different model types. This module provides hyperparameter tuning functionality using scikit-learn's search methods (GridSearchCV, RandomizedSearchCV). Each model type has a dedicated diff --git a/ecosci/trainer.py b/ecosci/trainer.py index 521f673..df19a55 100644 --- a/ecosci/trainer.py +++ b/ecosci/trainer.py @@ -152,7 +152,7 @@ def run_cv(self, cfg: Dict[str, Any], fold_data_list): Returns ------- dict - Results organized as {model_name: [run_results_with_fold_info]} + Results organised as {model_name: [run_results_with_fold_info]} """ models_cfg = cfg.get("models", []) @@ -252,7 +252,7 @@ def run_with_tuning( ): """Run training with hyperparameter tuning. - This method performs hyperparameter optimization on the training + validation + This method performs hyperparameter optimisation on the training + validation sets, then evaluates the best model on the held-out test set. Multiple seeds are used for stable results. @@ -282,7 +282,7 @@ def run_with_tuning( Returns ------- dict - Results organized as {model_name: [run_results_with_tuning_info]} + Results organised as {model_name: [run_results_with_tuning_info]} """ from ecosci.hyperopt import HyperparameterTuner diff --git a/run.py b/run.py index 149472d..c77ffd5 100644 --- a/run.py +++ b/run.py @@ -9,7 +9,7 @@ from ecosci.data import CSVDataLoader from ecosci.models import ModelZoo from ecosci.trainer import Trainer -from ecosci.eval import evaluate_and_report, evaluate_and_report_cv +from ecosci.evaluation import evaluate_and_report, evaluate_and_report_cv parser = argparse.ArgumentParser(description="Train a model using a YAML config.") parser.add_argument("--config", required=True, help="Path to YAML config file") @@ -84,7 +84,7 @@ # Evaluate on both validation and test sets # For tuning mode, we want to see performance on both val and test - from ecosci.eval import evaluate_tuning_results + from ecosci.evaluation import evaluate_tuning_results summary = evaluate_tuning_results( results, y_val, diff --git a/tests/test_eval.py b/tests/test_eval.py index bdd6424..8cdc67c 100644 --- a/tests/test_eval.py +++ b/tests/test_eval.py @@ -7,7 +7,7 @@ import json import os -from ecosci.eval import compute_classification_metrics, evaluate_and_report +from ecosci.evaluation import compute_classification_metrics, evaluate_and_report @pytest.fixture @@ -118,7 +118,7 @@ def test_evaluate_and_report_multiple_seeds_computes_stats(temp_output_dir): # Regression tests def test_regression_metrics_computed_correctly(): """Test that regression metrics are computed correctly.""" - from ecosci.eval import compute_regression_metrics + from ecosci.evaluation import compute_regression_metrics y_true = np.array([1.0, 2.0, 3.0, 4.0, 5.0]) y_pred = np.array([1.1, 2.2, 2.9, 4.1, 4.8]) @@ -142,7 +142,7 @@ def test_regression_metrics_computed_correctly(): def test_regression_metrics_perfect_prediction(): """Test regression metrics for perfect predictions.""" - from ecosci.eval import compute_regression_metrics + from ecosci.evaluation import compute_regression_metrics y_true = np.array([1.0, 2.0, 3.0, 4.0, 5.0]) y_pred = y_true.copy() @@ -158,7 +158,7 @@ def test_regression_metrics_perfect_prediction(): def test_regression_metrics_with_zeros(): """Test that MAPE is None when y_true contains zeros.""" - from 
ecosci.eval import compute_regression_metrics + from ecosci.evaluation import compute_regression_metrics y_true = np.array([0.0, 1.0, 2.0, 3.0]) y_pred = np.array([0.1, 1.1, 2.1, 3.1]) @@ -217,7 +217,7 @@ def test_evaluate_and_report_regression_multiple_seeds(temp_output_dir): # Multi-output tests def test_multi_output_classification_metrics(): """Test multi-output classification metrics computation.""" - from ecosci.eval import compute_multi_output_classification_metrics + from ecosci.evaluation import compute_multi_output_classification_metrics # Multi-output: predicting 2 outputs y_true = np.array([[0, 1], [1, 0], [0, 1], [1, 0], [0, 1], [1, 0]]) @@ -236,7 +236,7 @@ def test_multi_output_classification_metrics(): def test_multi_output_regression_metrics(): """Test multi-output regression metrics computation.""" - from ecosci.eval import compute_multi_output_regression_metrics + from ecosci.evaluation import compute_multi_output_regression_metrics # Multi-output: predicting 3 outputs y_true = np.array([[1.0, 2.0, 3.0], [2.0, 3.0, 4.0], [3.0, 4.0, 5.0], [4.0, 5.0, 6.0]]) diff --git a/test_tuning.py b/tests/test_hyperopt.py similarity index 100% rename from test_tuning.py rename to tests/test_hyperopt.py diff --git a/tests/test_integration.py b/tests/test_integration.py index b18be70..edb1e51 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -12,7 +12,7 @@ from ecosci.data import CSVDataLoader from ecosci.models import ModelZoo from ecosci.trainer import Trainer -from ecosci.eval import evaluate_and_report +from ecosci.evaluation import evaluate_and_report @pytest.fixture From c6315a09ef4cca8496568c23f067a36345ed286e Mon Sep 17 00:00:00 2001 From: Ysobel Sims <35280100+ysims@users.noreply.github.com> Date: Mon, 24 Nov 2025 12:49:49 +1100 Subject: [PATCH 06/15] Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- ecosci/evaluation/plotting.py | 6 +++--- ecosci/evaluation/reporting.py | 3 +-- ecosci/hyperopt.py | 2 +- ecosci/trainer.py | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/ecosci/evaluation/plotting.py b/ecosci/evaluation/plotting.py index ca0a1b0..337a23d 100644 --- a/ecosci/evaluation/plotting.py +++ b/ecosci/evaluation/plotting.py @@ -5,7 +5,7 @@ and feature importance plots. 
""" -from typing import Dict, Any, Optional, List +from typing import Dict, Optional, List import os import numpy as np import matplotlib.pyplot as plt @@ -202,7 +202,7 @@ def plot_residuals( y_pred = last.get("y_pred") residuals = y_test - y_pred - fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4)) + (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4)) # Predicted vs Actual ax1.scatter(y_test, y_pred, alpha=0.6) @@ -278,7 +278,7 @@ def get_feature_label(idx): top_stds = feature_std[top_indices] y_pos = np.arange(top_n) - ax.barh(y_pos, top_importances, xerr=top_stds, align='centre', alpha=0.7, capsize=3) + ax.barh(y_pos, top_importances, xerr=top_stds, align='center', alpha=0.7, capsize=3) ax.set_yticks(y_pos) ax.set_yticklabels([get_feature_label(idx) for idx in top_indices]) ax.invert_yaxis() diff --git a/ecosci/evaluation/reporting.py b/ecosci/evaluation/reporting.py index ad9f67b..84b3caa 100644 --- a/ecosci/evaluation/reporting.py +++ b/ecosci/evaluation/reporting.py @@ -15,7 +15,7 @@ - `pr_curve.png`: precision-recall curve if binary and probabilities exist """ -from typing import Dict, Any, Optional +from typing import Dict, Optional import os import json import numpy as np @@ -33,7 +33,6 @@ plot_confusion_matrices, plot_pr_curves, plot_residuals, - plot_feature_importance, plot_cv_comparison, plot_validation_vs_test, ) diff --git a/ecosci/hyperopt.py b/ecosci/hyperopt.py index 685c2cb..818c7b5 100644 --- a/ecosci/hyperopt.py +++ b/ecosci/hyperopt.py @@ -11,7 +11,7 @@ - Returns best model and tuning results """ -from typing import Any, Dict, Optional, Tuple +from typing import Dict, Optional, Tuple import numpy as np from sklearn.model_selection import GridSearchCV, RandomizedSearchCV from sklearn.base import BaseEstimator diff --git a/ecosci/trainer.py b/ecosci/trainer.py index df19a55..49e983b 100644 --- a/ecosci/trainer.py +++ b/ecosci/trainer.py @@ -8,7 +8,7 @@ - Supports hyperparameter tuning with train/val/test splits """ -from typing import Any, Dict, List, Optional +from typing import Any, Dict, Optional from tqdm import tqdm import numpy as np import random From a7a9bfc3da4b85351fac1ff1d883693fafac5873 Mon Sep 17 00:00:00 2001 From: Ysobel Sims <35280100+ysims@users.noreply.github.com> Date: Mon, 24 Nov 2025 12:50:05 +1100 Subject: [PATCH 07/15] Update ecosci/evaluation/feature_importance.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- ecosci/evaluation/feature_importance.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ecosci/evaluation/feature_importance.py b/ecosci/evaluation/feature_importance.py index 31ba369..75da92e 100644 --- a/ecosci/evaluation/feature_importance.py +++ b/ecosci/evaluation/feature_importance.py @@ -10,7 +10,6 @@ import numpy as np import pandas as pd -from .metrics import safe_std from .plotting import plot_feature_importance From 76a3dac62e337a621cdf87f39ae45086c93bda87 Mon Sep 17 00:00:00 2001 From: Ysobel Sims <35280100+ysims@users.noreply.github.com> Date: Mon, 24 Nov 2025 12:50:19 +1100 Subject: [PATCH 08/15] Update tests/test_hyperopt.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- tests/test_hyperopt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_hyperopt.py b/tests/test_hyperopt.py index 10d5629..72374c5 100755 --- a/tests/test_hyperopt.py +++ b/tests/test_hyperopt.py @@ -58,7 +58,7 @@ def test_grouped_splits(): ) # Test grouped splits - (X_train, X_val, X_test, y_train, y_val, y_test, group_assignments) = \ + (X_train, X_val, X_test, y_train, y_val, 
y_test, group_assignments, _, _, _) = \ loader.prepare_grouped_splits(n_train_groups=4, n_val_groups=2, n_test_groups=2) print(f"\n✓ Successfully created grouped splits") From c98a98e7ff9466d8c280d99887ef24d7c76492cf Mon Sep 17 00:00:00 2001 From: Ysobel Date: Mon, 24 Nov 2025 12:52:52 +1100 Subject: [PATCH 09/15] explanation --- ecosci/trainer.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/ecosci/trainer.py b/ecosci/trainer.py index 49e983b..0eca02f 100644 --- a/ecosci/trainer.py +++ b/ecosci/trainer.py @@ -212,7 +212,10 @@ def run_cv(self, cfg: Dict[str, Any], fold_data_list): try: y_proba = model.predict_proba(X_test) except Exception as e: - pass + # predict_proba may fail for some classifiers (e.g., SVC without probability=True) + # or when the model has issues with the data. This is not critical for training, + # but we silently continue without probabilities for this fold. + y_proba = None # save model for this run in model-specific subfolder model_dir = os.path.join(self.output_dir, mname, f"fold{fold_id}") @@ -410,7 +413,11 @@ def run_with_tuning( y_val_proba = final_model.predict_proba(X_val) y_test_proba = final_model.predict_proba(X_test) except Exception as e: - pass + # predict_proba may fail for some classifiers (e.g., SVC without probability=True) + # or when the model has issues with the data. This is not critical for evaluation, + # but we silently continue without probabilities. Metrics like ROC-AUC won't be available. + y_val_proba = None + y_test_proba = None # Save model model_dir = os.path.join(self.output_dir, mname) From 051e29a62b0dc24028e48bdc28522befd7d7d254 Mon Sep 17 00:00:00 2001 From: Ysobel Date: Mon, 24 Nov 2025 13:19:00 +1100 Subject: [PATCH 10/15] split up --- ecosci/data.py | 751 --------------------- ecosci/data/__init__.py | 5 + ecosci/data/loader.py | 396 +++++++++++ ecosci/data/preprocessing.py | 144 ++++ ecosci/data/splitting.py | 260 ++++++++ ecosci/evaluation/reporting.py | 765 ---------------------- ecosci/evaluation/reporting/__init__.py | 13 + ecosci/evaluation/reporting/display.py | 372 +++++++++++ ecosci/evaluation/reporting/evaluation.py | 385 +++++++++++ ecosci/evaluation/reporting/output.py | 152 +++++ ecosci/hyperopt.py | 13 +- 11 files changed, 1734 insertions(+), 1522 deletions(-) delete mode 100644 ecosci/data.py create mode 100644 ecosci/data/__init__.py create mode 100644 ecosci/data/loader.py create mode 100644 ecosci/data/preprocessing.py create mode 100644 ecosci/data/splitting.py delete mode 100644 ecosci/evaluation/reporting.py create mode 100644 ecosci/evaluation/reporting/__init__.py create mode 100644 ecosci/evaluation/reporting/display.py create mode 100644 ecosci/evaluation/reporting/evaluation.py create mode 100644 ecosci/evaluation/reporting/output.py diff --git a/ecosci/data.py b/ecosci/data.py deleted file mode 100644 index 99f06d6..0000000 --- a/ecosci/data.py +++ /dev/null @@ -1,751 +0,0 @@ -"""Data loading and preprocessing utilities. - -What this does for you (no coding required): -- Reads your CSV file -- Picks which columns are inputs (features) and which is the label (what you want to predict) -- Handles missing values (imputation) -- Scales numeric columns so models train better -- Encodes text/categorical columns into numbers automatically -- Splits your data into train/validation/test sets -""" - -from typing import List, Optional, Tuple -import pandas as pd - - -class CSVDataLoader: - """Load a CSV and build a simple preprocessing pipeline. 
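The `predict_proba` guard added in the trainer above is easy to reproduce in isolation: scikit-learn's `SVC` only exposes probabilities when constructed with `probability=True`. A minimal sketch on toy data (not the toolkit's loaders) showing the failure mode and the fallback the trainer uses:

```python
# Sketch: why the trainer wraps predict_proba in try/except.
# SVC raises AttributeError from predict_proba unless probability=True;
# the trainer treats that as "no probabilities" (y_proba = None).
import numpy as np
from sklearn.svm import SVC

X = np.linspace(0.0, 1.0, 20).reshape(-1, 1)
y = (X.ravel() > 0.5).astype(int)

model = SVC()  # probability=False by default
model.fit(X, y)

try:
    y_proba = model.predict_proba(X)
except Exception as exc:  # AttributeError in this case
    y_proba = None
    print(f"predict_proba unavailable: {exc}")

# Enabling probability estimates works, at the cost of extra internal CV.
model_p = SVC(probability=True).fit(X, y)
print(model_p.predict_proba(X).shape)  # (20, 2)
```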
- - How to use (via YAML): specify the CSV path, list the feature columns and the - label column, and choose options for imputation and scaling. - - Notes - - Scaling: "standard" makes numbers with mean=0 and std=1 (good default). - "minmax" squashes numbers to 0..1. - - Categorical text columns are one-hot encoded automatically. - - Missing numbers use the mean (by default). Categorical uses the most - frequent value. - """ - - def __init__( - self, - path: str, - features: Optional[List[str]] = None, - label: Optional[str] = None, - labels: Optional[List[str]] = None, - test_size: float = 0.2, - val_size: float = 0.2, - random_state: int = 0, - scaling: str = "standard", - impute_strategy: str = "mean", - problem_type: str = "classification", - cv_group_column: Optional[str] = None, - ): - self.path = path - self.features = features - # Support both single label and multiple labels - if labels is not None: - self.labels = labels if isinstance(labels, list) else [labels] - self.label = None - elif label is not None: - self.labels = [label] - self.label = label - else: - self.labels = None - self.label = None - self.test_size = test_size - self.val_size = val_size - self.random_state = random_state - self.scaling = scaling - self.impute_strategy = impute_strategy - self.problem_type = problem_type - self.cv_group_column = cv_group_column - - def load(self) -> pd.DataFrame: - """Read the CSV into a DataFrame.""" - df = pd.read_csv(self.path) - return df - - def _get_feature_names_after_transform(self, preprocessor, numeric_cols, cat_cols): - """Get feature names after preprocessing (including one-hot encoding).""" - feature_names = [] - - # Get feature names from each transformer - for name, transformer, columns in preprocessor.transformers_: - if name == "num": - # Numeric columns keep their names - feature_names.extend(columns) - elif name == "cat": - # For categorical columns, get one-hot encoded names - try: - # Get the OneHotEncoder from the pipeline - onehot = transformer.named_steps.get("onehot") - if onehot is not None and hasattr(onehot, "get_feature_names_out"): - # Get one-hot feature names - cat_features = onehot.get_feature_names_out(columns) - feature_names.extend(cat_features) - else: - # Fallback: just use column names - feature_names.extend(columns) - except Exception: - # Fallback if something goes wrong - feature_names.extend(columns) - - return feature_names - - def prepare(self) -> Tuple: - """Prepare arrays for modeling. - - Returns - ------- - tuple - (X_train, X_val, X_test, y_train, y_val, y_test) - - Under the hood we use scikit-learn pipelines for imputation, encoding - and scaling to keep things robust and simple. 
- """ - from sklearn.model_selection import train_test_split - from sklearn.impute import SimpleImputer - from sklearn.preprocessing import ( - StandardScaler, - MinMaxScaler, - OneHotEncoder, - LabelEncoder, - ) - from sklearn.compose import ColumnTransformer - from sklearn.pipeline import Pipeline - import numpy as np - - df = self.load() - - if self.labels is None: - raise ValueError("Label column(s) must be provided in config") - - if self.features is None: - # default: all columns except labels - self.features = [c for c in df.columns if c not in self.labels] - - X = df[self.features].copy() - - # When single label, DataFrame.iloc returns Series; when multiple, returns DataFrame - if len(self.labels) == 1: - y = df[self.labels[0]].copy() # Get as Series - mask_target = ~y.isna() - else: - y = df[self.labels].copy() # Get as DataFrame - mask_target = ~y.isna().any(axis=1) - - if mask_target.any() and not mask_target.all(): - n_dropped_target = (~mask_target).sum() - print( - f"Warning: Dropping {n_dropped_target} rows with NaN in target {self.labels}" - ) - X = X[mask_target] - y = y[mask_target] - - # Drop rows with NaN in any features - mask_features = ~X.isna().any(axis=1) - if not mask_features.all(): - n_dropped_features = (~mask_features).sum() - cols_with_nan = X.columns[X.isna().any()].tolist() - print( - f"Warning: Dropping {n_dropped_features} additional rows with NaN in features: {cols_with_nan}" - ) - X = X[mask_features] - y = y[mask_features] - - # Reset indices after dropping rows - X = X.reset_index(drop=True) - y = y.reset_index(drop=True) - - # Encode labels if they are strings/categorical (needed for XGBoost and others) - # For multi-output, encode each output separately - self.label_encoders = {} - self.label_classes_dict = {} - - if len(self.labels) == 1: - # Single output: maintain backward compatibility - if y.dtype == "object" or y.dtype.name == "category": - label_encoder = LabelEncoder() - y = label_encoder.fit_transform(y) - self.label_encoders[self.labels[0]] = label_encoder - self.label_classes_dict[self.labels[0]] = label_encoder.classes_ - # Keep backward compatibility - self.label_encoder = label_encoder - self.label_classes = label_encoder.classes_ - else: - self.label_encoder = None - self.label_classes = None - else: - # Multi-output: encode each column separately - y_encoded = [] - for col in self.labels: - if y[col].dtype == "object" or y[col].dtype.name == "category": - label_encoder = LabelEncoder() - y_col_encoded = label_encoder.fit_transform(y[col]) - self.label_encoders[col] = label_encoder - self.label_classes_dict[col] = label_encoder.classes_ - else: - y_col_encoded = y[col].values - self.label_encoders[col] = None - self.label_classes_dict[col] = None - y_encoded.append(y_col_encoded) - y = np.column_stack(y_encoded) - self.label_encoder = None - self.label_classes = None - - # Identify numeric vs categorical (strings, categories, etc.) 
- numeric_cols = X.select_dtypes(include=["number"]).columns.tolist() - cat_cols = [c for c in X.columns if c not in numeric_cols] - - transformers = [] - - if numeric_cols: - num_pipeline = Pipeline( - [ - ("imputer", SimpleImputer(strategy=self.impute_strategy)), - ( - "scaler", - ( - StandardScaler() - if self.scaling == "standard" - else MinMaxScaler() - ), - ), - ] - ) - transformers.append(("num", num_pipeline, numeric_cols)) - - if cat_cols: - cat_pipeline = Pipeline( - [ - ("imputer", SimpleImputer(strategy="most_frequent")), - ( - "onehot", - OneHotEncoder(handle_unknown="ignore", sparse_output=False), - ), - ] - ) - transformers.append(("cat", cat_pipeline, cat_cols)) - - preprocessor = ColumnTransformer(transformers, remainder="drop") - - X_proc = preprocessor.fit_transform(X) - - # Store preprocessor and generate feature names after transformation - self.preprocessor = preprocessor - self.processed_feature_names = self._get_feature_names_after_transform( - preprocessor, numeric_cols, cat_cols - ) - - # Check number of unique classes for stratification - # Only stratify for classification problems with reasonable number of classes - # For multi-output, we cannot use stratification directly - if self.problem_type == "classification" and len(self.labels) == 1: - n_unique = len(np.unique(y)) - stratify_y = y if n_unique <= 20 else None - else: - # Cannot stratify on multi-output or regression - stratify_y = None - - # Split: first test, then validation from remaining - X_train_val, X_test, y_train_val, y_test = train_test_split( - X_proc, - y, - test_size=self.test_size, - random_state=self.random_state, - stratify=stratify_y, - ) - - # compute val fraction relative to train_val - if self.val_size <= 0: - X_train, X_val, y_train, y_val = X_train_val, None, y_train_val, None - else: - val_fraction = self.val_size / (1.0 - self.test_size) - # Only stratify for classification problems with reasonable number of classes - if self.problem_type == "classification" and len(self.labels) == 1: - n_unique_train = len(np.unique(y_train_val)) - stratify_train = y_train_val if n_unique_train <= 20 else None - else: - stratify_train = None - X_train, X_val, y_train, y_val = train_test_split( - X_train_val, - y_train_val, - test_size=val_fraction, - random_state=self.random_state, - stratify=stratify_train, - ) - - return ( - X_train, - X_val, - X_test, - np.array(y_train), - (None if y_val is None else np.array(y_val)), - np.array(y_test), - ) - - def prepare_cv_folds(self): - """Prepare k-fold cross-validation splits using group-based splitting. - - Returns - ------- - list of tuples - Each tuple contains (X_train, X_val, X_test, y_train, y_val, y_test, fold_id) - where fold_id is the test fold number. 
- - Notes - ----- - - Uses GroupKFold to keep all samples from the same group together in the same fold - - The group is specified by cv_group_column (e.g., 'patch_id') - - Each unique group value becomes a unit for splitting - - Number of folds = number of unique groups - """ - from sklearn.model_selection import GroupKFold - from sklearn.impute import SimpleImputer - from sklearn.preprocessing import ( - StandardScaler, - MinMaxScaler, - OneHotEncoder, - LabelEncoder, - ) - from sklearn.compose import ColumnTransformer - from sklearn.pipeline import Pipeline - import numpy as np - - if self.cv_group_column is None: - raise ValueError("cv_group_column must be specified for k-fold cross-validation") - - df = self.load() - - if self.labels is None: - raise ValueError("Label column(s) must be provided in config") - - if self.features is None: - # default: all columns except labels and group column - self.features = [c for c in df.columns if c not in self.labels and c != self.cv_group_column] - - # Get group IDs - if self.cv_group_column not in df.columns: - raise ValueError(f"Group column '{self.cv_group_column}' not found in data") - - groups = df[self.cv_group_column].copy() - - X = df[self.features].copy() - - # When single label, DataFrame.iloc returns Series; when multiple, returns DataFrame - if len(self.labels) == 1: - y = df[self.labels[0]].copy() # Get as Series - mask_target = ~y.isna() - else: - y = df[self.labels].copy() # Get as DataFrame - mask_target = ~y.isna().any(axis=1) - - if mask_target.any() and not mask_target.all(): - n_dropped_target = (~mask_target).sum() - print( - f"Warning: Dropping {n_dropped_target} rows with NaN in target {self.labels}" - ) - X = X[mask_target] - y = y[mask_target] - groups = groups[mask_target] - - # Drop rows with NaN in any features - mask_features = ~X.isna().any(axis=1) - if not mask_features.all(): - n_dropped_features = (~mask_features).sum() - cols_with_nan = X.columns[X.isna().any()].tolist() - print( - f"Warning: Dropping {n_dropped_features} additional rows with NaN in features: {cols_with_nan}" - ) - X = X[mask_features] - y = y[mask_features] - groups = groups[mask_features] - - # Reset indices after dropping rows - X = X.reset_index(drop=True) - y = y.reset_index(drop=True) - groups = groups.reset_index(drop=True) - - # Encode labels if they are strings/categorical - self.label_encoders = {} - self.label_classes_dict = {} - - if len(self.labels) == 1: - # Single output - if y.dtype == "object" or y.dtype.name == "category": - label_encoder = LabelEncoder() - y = label_encoder.fit_transform(y) - self.label_encoders[self.labels[0]] = label_encoder - self.label_classes_dict[self.labels[0]] = label_encoder.classes_ - self.label_encoder = label_encoder - self.label_classes = label_encoder.classes_ - else: - self.label_encoder = None - self.label_classes = None - else: - # Multi-output - y_encoded = [] - for col in self.labels: - if y[col].dtype == "object" or y[col].dtype.name == "category": - label_encoder = LabelEncoder() - y_col_encoded = label_encoder.fit_transform(y[col]) - self.label_encoders[col] = label_encoder - self.label_classes_dict[col] = label_encoder.classes_ - else: - y_col_encoded = y[col].values - self.label_encoders[col] = None - self.label_classes_dict[col] = None - y_encoded.append(y_col_encoded) - y = np.column_stack(y_encoded) - self.label_encoder = None - self.label_classes = None - - # Identify numeric vs categorical - numeric_cols = X.select_dtypes(include=["number"]).columns.tolist() - cat_cols = [c for c in 
X.columns if c not in numeric_cols] - - transformers = [] - - if numeric_cols: - scaler = StandardScaler() if self.scaling == "standard" else MinMaxScaler() - num_pipeline = Pipeline([("imputer", SimpleImputer(strategy=self.impute_strategy)), ("scaler", scaler)]) - transformers.append(("num", num_pipeline, numeric_cols)) - - if cat_cols: - cat_pipeline = Pipeline([ - ("imputer", SimpleImputer(strategy="most_frequent")), - ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)), - ]) - transformers.append(("cat", cat_pipeline, cat_cols)) - - # Store the preprocessor template and feature names (but don't fit on all data) - self.preprocessor_template = ColumnTransformer(transformers=transformers, remainder="drop") - - # We need to fit preprocessor on a sample to get feature names - # Use just the first group to avoid leakage - sample_preprocessor = ColumnTransformer(transformers=transformers, remainder="drop") - sample_preprocessor.fit(X) - self.processed_feature_names = self._get_feature_names_after_transform(sample_preprocessor, numeric_cols, cat_cols) - - # Determine number of folds based on unique groups - unique_groups = groups.unique() - n_folds = len(unique_groups) - - print(f"\nK-Fold Cross-Validation Setup:") - print(f" Group column: {self.cv_group_column}") - print(f" Number of unique groups: {n_folds}") - print(f" Number of folds: {n_folds}") - print(f" Samples per group: {groups.value_counts().to_dict()}\n") - - # Create k-fold splits - group_kfold = GroupKFold(n_splits=n_folds) - - fold_data = [] - # NOTE: Keep X as DataFrame for ColumnTransformer which uses column names - for fold_idx, (train_val_idx, test_idx) in enumerate(group_kfold.split(X, y, groups)): - # Get raw (unscaled) train_val and test data - keep as DataFrames - X_train_val_raw = X.iloc[train_val_idx] - y_train_val = y[train_val_idx] if isinstance(y, np.ndarray) else y.iloc[train_val_idx] - X_test_raw = X.iloc[test_idx] - y_test = y[test_idx] if isinstance(y, np.ndarray) else y.iloc[test_idx] - - # Create a NEW preprocessor for this fold (to avoid data leakage) - fold_preprocessor = ColumnTransformer(transformers=transformers, remainder="drop") - - # Fit preprocessor ONLY on train_val data (not on test!) - X_train_val_proc = fold_preprocessor.fit_transform(X_train_val_raw) - - # Transform test data using the fitted preprocessor - X_test_proc = fold_preprocessor.transform(X_test_raw) - - # Further split train_val into train and val - if self.val_size <= 0: - X_train, X_val, y_train, y_val = X_train_val_proc, None, y_train_val, None - else: - # Use a fraction of train_val for validation - from sklearn.model_selection import train_test_split - val_fraction = self.val_size / (1.0 - (1.0 / n_folds)) - - # For stratification - if self.problem_type == "classification" and len(self.labels) == 1: - n_unique_train = len(np.unique(y_train_val)) - stratify_train = y_train_val if n_unique_train <= 20 else None - else: - stratify_train = None - - X_train, X_val, y_train, y_val = train_test_split( - X_train_val_proc, - y_train_val, - test_size=val_fraction, - random_state=self.random_state, - stratify=stratify_train, - ) - - fold_data.append(( - X_train, - X_val, - X_test_proc, - np.array(y_train), - (None if y_val is None else np.array(y_val)), - np.array(y_test), - fold_idx - )) - - return fold_data - - def prepare_grouped_splits( - self, - n_train_groups: int = 4, - n_val_groups: int = 2, - n_test_groups: int = 2, - ): - """Prepare train/val/test splits by assigning groups to each split. 
- - This method ensures no data leakage by keeping all samples from the same - group together in one split (train, val, or test). Useful for spatial or - temporal data where groups represent locations, patches, or time periods. - - Parameters - ---------- - n_train_groups : int - Number of groups to assign to training set - n_val_groups : int - Number of groups to assign to validation set (for hyperparameter tuning) - n_test_groups : int - Number of groups to assign to test set (for final evaluation) - - Returns - ------- - tuple - (X_train, X_val, X_test, y_train, y_val, y_test, group_assignments) - where group_assignments is a dict mapping split names to group IDs - - Raises - ------ - ValueError - If cv_group_column is not specified or if there aren't enough groups - - Notes - ----- - - Requires cv_group_column to be set in the data config - - Groups are shuffled before assignment to ensure randomness - - Total groups needed = n_train_groups + n_val_groups + n_test_groups - """ - from sklearn.impute import SimpleImputer - from sklearn.preprocessing import ( - StandardScaler, - MinMaxScaler, - OneHotEncoder, - LabelEncoder, - ) - from sklearn.compose import ColumnTransformer - from sklearn.pipeline import Pipeline - import numpy as np - - if self.cv_group_column is None: - raise ValueError( - "cv_group_column must be specified for grouped train/val/test splits" - ) - - df = self.load() - - if self.labels is None: - raise ValueError("Label column(s) must be provided in config") - - if self.features is None: - # default: all columns except labels and group column - self.features = [ - c for c in df.columns - if c not in self.labels and c != self.cv_group_column - ] - - # Get group IDs - if self.cv_group_column not in df.columns: - raise ValueError(f"Group column '{self.cv_group_column}' not found in data") - - groups = df[self.cv_group_column].copy() - unique_groups = groups.unique() - n_groups = len(unique_groups) - - # Validate we have enough groups - total_groups_needed = n_train_groups + n_val_groups + n_test_groups - if n_groups < total_groups_needed: - raise ValueError( - f"Not enough groups: need {total_groups_needed} " - f"({n_train_groups} train + {n_val_groups} val + {n_test_groups} test), " - f"but only have {n_groups} unique groups" - ) - - # Shuffle groups for random assignment - rng = np.random.RandomState(self.random_state) - shuffled_groups = rng.permutation(unique_groups) - - # Assign groups to splits - train_groups = shuffled_groups[:n_train_groups] - val_groups = shuffled_groups[n_train_groups:n_train_groups + n_val_groups] - test_groups = shuffled_groups[n_train_groups + n_val_groups:n_train_groups + n_val_groups + n_test_groups] - - group_assignments = { - 'train': train_groups.tolist(), - 'val': val_groups.tolist(), - 'test': test_groups.tolist(), - } - - print(f"\nGrouped Train/Val/Test Split Setup:") - print(f" Group column: {self.cv_group_column}") - print(f" Total unique groups: {n_groups}") - print(f" Train groups ({n_train_groups}): {train_groups.tolist()}") - print(f" Val groups ({n_val_groups}): {val_groups.tolist()}") - print(f" Test groups ({n_test_groups}): {test_groups.tolist()}") - - # Create masks for each split - train_mask = groups.isin(train_groups) - val_mask = groups.isin(val_groups) - test_mask = groups.isin(test_groups) - - print(f"\n Train samples: {train_mask.sum()}") - print(f" Val samples: {val_mask.sum()}") - print(f" Test samples: {test_mask.sum()}\n") - - # Extract features and labels - X = df[self.features].copy() - - if len(self.labels) 
== 1: - y = df[self.labels[0]].copy() - mask_target = ~y.isna() - else: - y = df[self.labels].copy() - mask_target = ~y.isna().any(axis=1) - - # Drop rows with NaN in target - if mask_target.any() and not mask_target.all(): - n_dropped_target = (~mask_target).sum() - print( - f"Warning: Dropping {n_dropped_target} rows with NaN in target {self.labels}" - ) - X = X[mask_target] - y = y[mask_target] - groups = groups[mask_target] - train_mask = train_mask[mask_target] - val_mask = val_mask[mask_target] - test_mask = test_mask[mask_target] - - # Drop rows with NaN in features - mask_features = ~X.isna().any(axis=1) - if not mask_features.all(): - n_dropped_features = (~mask_features).sum() - cols_with_nan = X.columns[X.isna().any()].tolist() - print( - f"Warning: Dropping {n_dropped_features} additional rows with NaN in features: {cols_with_nan}" - ) - X = X[mask_features] - y = y[mask_features] - groups = groups[mask_features] - train_mask = train_mask[mask_features] - val_mask = val_mask[mask_features] - test_mask = test_mask[mask_features] - - # Reset indices - X = X.reset_index(drop=True) - y = y.reset_index(drop=True) - groups = groups.reset_index(drop=True) - train_mask = train_mask.reset_index(drop=True) - val_mask = val_mask.reset_index(drop=True) - test_mask = test_mask.reset_index(drop=True) - - # Encode labels - self.label_encoders = {} - self.label_classes_dict = {} - - if len(self.labels) == 1: - if y.dtype == "object" or y.dtype.name == "category": - label_encoder = LabelEncoder() - y = label_encoder.fit_transform(y) - self.label_encoders[self.labels[0]] = label_encoder - self.label_classes_dict[self.labels[0]] = label_encoder.classes_ - self.label_encoder = label_encoder - self.label_classes = label_encoder.classes_ - else: - self.label_encoder = None - self.label_classes = None - else: - y_encoded = [] - for col in self.labels: - if y[col].dtype == "object" or y[col].dtype.name == "category": - label_encoder = LabelEncoder() - y_col_encoded = label_encoder.fit_transform(y[col]) - self.label_encoders[col] = label_encoder - self.label_classes_dict[col] = label_encoder.classes_ - else: - y_col_encoded = y[col].values - self.label_encoders[col] = None - self.label_classes_dict[col] = None - y_encoded.append(y_col_encoded) - y = np.column_stack(y_encoded) - self.label_encoder = None - self.label_classes = None - - # Build preprocessing pipeline - numeric_cols = X.select_dtypes(include=["number"]).columns.tolist() - cat_cols = [c for c in X.columns if c not in numeric_cols] - - transformers = [] - - if numeric_cols: - scaler = StandardScaler() if self.scaling == "standard" else MinMaxScaler() - num_pipeline = Pipeline([ - ("imputer", SimpleImputer(strategy=self.impute_strategy)), - ("scaler", scaler) - ]) - transformers.append(("num", num_pipeline, numeric_cols)) - - if cat_cols: - cat_pipeline = Pipeline([ - ("imputer", SimpleImputer(strategy="most_frequent")), - ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)), - ]) - transformers.append(("cat", cat_pipeline, cat_cols)) - - preprocessor = ColumnTransformer(transformers=transformers, remainder="drop") - - # Fit preprocessor on training data only (important!) 
- X_train_raw = X[train_mask] - preprocessor.fit(X_train_raw) - - # Transform all splits - X_proc = preprocessor.transform(X) - - # Store preprocessor and feature names - self.preprocessor = preprocessor - self.processed_feature_names = self._get_feature_names_after_transform( - preprocessor, numeric_cols, cat_cols - ) - - # Split the data - X_train = X_proc[train_mask] - X_val = X_proc[val_mask] - X_test = X_proc[test_mask] - - # y should be numpy array at this point - y_train = y[train_mask] - y_val = y[val_mask] - y_test = y[test_mask] - - # Extract group IDs for each split - groups_train = groups[train_mask].values - groups_val = groups[val_mask].values - groups_test = groups[test_mask].values - - return ( - X_train, - X_val, - X_test, - np.array(y_train), - np.array(y_val), - np.array(y_test), - group_assignments, - groups_train, - groups_val, - groups_test, - ) diff --git a/ecosci/data/__init__.py b/ecosci/data/__init__.py new file mode 100644 index 0000000..1b84bcf --- /dev/null +++ b/ecosci/data/__init__.py @@ -0,0 +1,5 @@ +"""Data loading, preprocessing, and splitting utilities.""" + +from .loader import CSVDataLoader + +__all__ = ["CSVDataLoader"] diff --git a/ecosci/data/loader.py b/ecosci/data/loader.py new file mode 100644 index 0000000..3a3f859 --- /dev/null +++ b/ecosci/data/loader.py @@ -0,0 +1,396 @@ +"""CSV data loader with preprocessing pipeline.""" + +from typing import List, Optional, Tuple +import pandas as pd +import numpy as np + +from .preprocessing import ( + get_feature_names_after_transform, + build_preprocessing_pipeline, + encode_labels, +) +from .splitting import prepare_cv_folds, prepare_grouped_splits + + +class CSVDataLoader: + """Load a CSV and build a simple preprocessing pipeline. + + How to use (via YAML): specify the CSV path, list the feature columns and the + label column, and choose options for imputation and scaling. + + Notes + ----- + - Scaling: "standard" makes numbers with mean=0 and std=1 (good default). + "minmax" squashes numbers to 0..1. + - Categorical text columns are one-hot encoded automatically. + - Missing numbers use the mean (by default). Categorical uses the most + frequent value. + + Parameters + ---------- + path : str + Path to CSV file + features : list of str, optional + Column names to use as features. If None, all columns except labels are used. 
+ label : str, optional + Single target column name (for backward compatibility) + labels : list of str, optional + Multiple target column names (for multi-output) + test_size : float + Fraction of data for test set + val_size : float + Fraction of data for validation set + random_state : int + Random seed for reproducibility + scaling : str + "standard" or "minmax" scaling + impute_strategy : str + Imputation strategy: "mean", "median", or "most_frequent" + problem_type : str + "classification" or "regression" + cv_group_column : str, optional + Column name for grouping in cross-validation + """ + + def __init__( + self, + path: str, + features: Optional[List[str]] = None, + label: Optional[str] = None, + labels: Optional[List[str]] = None, + test_size: float = 0.2, + val_size: float = 0.2, + random_state: int = 0, + scaling: str = "standard", + impute_strategy: str = "mean", + problem_type: str = "classification", + cv_group_column: Optional[str] = None, + ): + self.path = path + self.features = features + # Support both single label and multiple labels + if labels is not None: + self.labels = labels if isinstance(labels, list) else [labels] + self.label = None + elif label is not None: + self.labels = [label] + self.label = label + else: + self.labels = None + self.label = None + self.test_size = test_size + self.val_size = val_size + self.random_state = random_state + self.scaling = scaling + self.impute_strategy = impute_strategy + self.problem_type = problem_type + self.cv_group_column = cv_group_column + + def load(self) -> pd.DataFrame: + """Read the CSV into a DataFrame.""" + df = pd.read_csv(self.path) + return df + + def _get_feature_names_after_transform(self, preprocessor, numeric_cols, cat_cols): + """Get feature names after preprocessing (including one-hot encoding).""" + return get_feature_names_after_transform(preprocessor, numeric_cols, cat_cols) + + def prepare(self) -> Tuple: + """Prepare arrays for modeling. + + Returns + ------- + tuple + (X_train, X_val, X_test, y_train, y_val, y_test) + + Under the hood we use scikit-learn pipelines for imputation, encoding + and scaling to keep things robust and simple. 
+ """ + from sklearn.model_selection import train_test_split + + df = self.load() + + if self.labels is None: + raise ValueError("Label column(s) must be provided in config") + + if self.features is None: + # default: all columns except labels + self.features = [c for c in df.columns if c not in self.labels] + + X = df[self.features].copy() + + # When single label, DataFrame.iloc returns Series; when multiple, returns DataFrame + if len(self.labels) == 1: + y = df[self.labels[0]].copy() # Get as Series + mask_target = ~y.isna() + else: + y = df[self.labels].copy() # Get as DataFrame + mask_target = ~y.isna().any(axis=1) + + if mask_target.any() and not mask_target.all(): + n_dropped_target = (~mask_target).sum() + print(f"Warning: Dropping {n_dropped_target} rows with NaN in target {self.labels}") + X = X[mask_target] + y = y[mask_target] + + # Drop rows with NaN in any features + mask_features = ~X.isna().any(axis=1) + if not mask_features.all(): + n_dropped_features = (~mask_features).sum() + cols_with_nan = X.columns[X.isna().any()].tolist() + print(f"Warning: Dropping {n_dropped_features} additional rows with NaN in features: {cols_with_nan}") + X = X[mask_features] + y = y[mask_features] + + # Reset indices after dropping rows + X = X.reset_index(drop=True) + y = y.reset_index(drop=True) + + # Encode labels if they are strings/categorical + y, self.label_encoders, self.label_classes_dict, self.label_encoder, self.label_classes = encode_labels( + y, self.labels, self.problem_type + ) + + # Identify numeric vs categorical (strings, categories, etc.) + numeric_cols = X.select_dtypes(include=["number"]).columns.tolist() + cat_cols = [c for c in X.columns if c not in numeric_cols] + + # Build preprocessing pipeline + preprocessor = build_preprocessing_pipeline( + numeric_cols, cat_cols, self.scaling, self.impute_strategy + ) + + X_proc = preprocessor.fit_transform(X) + + # Store preprocessor and generate feature names after transformation + self.preprocessor = preprocessor + self.processed_feature_names = self._get_feature_names_after_transform( + preprocessor, numeric_cols, cat_cols + ) + + # Check number of unique classes for stratification + if self.problem_type == "classification" and len(self.labels) == 1: + n_unique = len(np.unique(y)) + stratify_y = y if n_unique <= 20 else None + else: + # Cannot stratify on multi-output or regression + stratify_y = None + + # Split: first test, then validation from remaining + X_train_val, X_test, y_train_val, y_test = train_test_split( + X_proc, + y, + test_size=self.test_size, + random_state=self.random_state, + stratify=stratify_y, + ) + + # compute val fraction relative to train_val + if self.val_size <= 0: + X_train, X_val, y_train, y_val = X_train_val, None, y_train_val, None + else: + val_fraction = self.val_size / (1.0 - self.test_size) + # Only stratify for classification problems with reasonable number of classes + if self.problem_type == "classification" and len(self.labels) == 1: + n_unique_train = len(np.unique(y_train_val)) + stratify_train = y_train_val if n_unique_train <= 20 else None + else: + stratify_train = None + X_train, X_val, y_train, y_val = train_test_split( + X_train_val, + y_train_val, + test_size=val_fraction, + random_state=self.random_state, + stratify=stratify_train, + ) + + return ( + X_train, + X_val, + X_test, + np.array(y_train), + (None if y_val is None else np.array(y_val)), + np.array(y_test), + ) + + def prepare_cv_folds(self): + """Prepare k-fold cross-validation splits using group-based splitting. 
+ + Returns + ------- + list of tuples + Each tuple contains (X_train, X_val, X_test, y_train, y_val, y_test, fold_id) + + Notes + ----- + - Uses GroupKFold to keep all samples from the same group together + - Number of folds = number of unique groups + """ + if self.cv_group_column is None: + raise ValueError("cv_group_column must be specified for k-fold cross-validation") + + df = self.load() + + if self.labels is None: + raise ValueError("Label column(s) must be provided in config") + + if self.features is None: + self.features = [c for c in df.columns if c not in self.labels and c != self.cv_group_column] + + # Get group IDs + if self.cv_group_column not in df.columns: + raise ValueError(f"Group column '{self.cv_group_column}' not found in data") + + groups = df[self.cv_group_column].copy() + X = df[self.features].copy() + + # Handle single vs multi-output + if len(self.labels) == 1: + y = df[self.labels[0]].copy() + mask_target = ~y.isna() + else: + y = df[self.labels].copy() + mask_target = ~y.isna().any(axis=1) + + # Drop NaN rows + if mask_target.any() and not mask_target.all(): + n_dropped_target = (~mask_target).sum() + print(f"Warning: Dropping {n_dropped_target} rows with NaN in target {self.labels}") + X, y, groups = X[mask_target], y[mask_target], groups[mask_target] + + mask_features = ~X.isna().any(axis=1) + if not mask_features.all(): + n_dropped_features = (~mask_features).sum() + cols_with_nan = X.columns[X.isna().any()].tolist() + print(f"Warning: Dropping {n_dropped_features} additional rows with NaN in features: {cols_with_nan}") + X, y, groups = X[mask_features], y[mask_features], groups[mask_features] + + # Reset indices + X = X.reset_index(drop=True) + y = y.reset_index(drop=True) + groups = groups.reset_index(drop=True) + + # Encode labels + y, self.label_encoders, self.label_classes_dict, self.label_encoder, self.label_classes = encode_labels( + y, self.labels, self.problem_type + ) + + # Identify numeric vs categorical + numeric_cols = X.select_dtypes(include=["number"]).columns.tolist() + cat_cols = [c for c in X.columns if c not in numeric_cols] + + # Store feature names from a sample fit + sample_preprocessor = build_preprocessing_pipeline( + numeric_cols, cat_cols, self.scaling, self.impute_strategy + ) + sample_preprocessor.fit(X) + self.processed_feature_names = self._get_feature_names_after_transform( + sample_preprocessor, numeric_cols, cat_cols + ) + + # Prepare folds + return prepare_cv_folds( + X, y, groups, self.labels, numeric_cols, cat_cols, + self.val_size, self.random_state, self.scaling, self.impute_strategy, + self.problem_type, self.cv_group_column + ) + + def prepare_grouped_splits( + self, + n_train_groups: int = 4, + n_val_groups: int = 2, + n_test_groups: int = 2, + ): + """Prepare train/val/test splits by assigning groups to each split. + + This method ensures no data leakage by keeping all samples from the same + group together in one split. 
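The fold tuples returned by `prepare_cv_folds` can be consumed directly in a training loop. A minimal sketch, where the CSV path and the feature/label/group column names are placeholders for your own data, not files shipped with the toolkit:

```python
# Sketch: grouped k-fold CV via CSVDataLoader.prepare_cv_folds.
# Path and column names are hypothetical placeholders.
from ecosci.data import CSVDataLoader
from sklearn.ensemble import RandomForestRegressor

loader = CSVDataLoader(
    path="data/sites.csv",                 # placeholder CSV
    features=["temperature", "rainfall"],  # placeholder features
    labels=["abundance"],                  # placeholder target
    problem_type="regression",
    cv_group_column="site_id",             # one fold per unique site
    val_size=0.15,
)

# Each tuple is (X_train, X_val, X_test, y_train, y_val, y_test, fold_id).
for X_train, X_val, X_test, y_train, y_val, y_test, fold_id in loader.prepare_cv_folds():
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    print(f"fold {fold_id}: test R² = {model.score(X_test, y_test):.3f}")
```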
+ + Parameters + ---------- + n_train_groups : int + Number of groups to assign to training set + n_val_groups : int + Number of groups to assign to validation set + n_test_groups : int + Number of groups to assign to test set + + Returns + ------- + tuple + (X_train, X_val, X_test, y_train, y_val, y_test, group_assignments, + groups_train, groups_val, groups_test) + """ + if self.cv_group_column is None: + raise ValueError("cv_group_column must be specified for grouped train/val/test splits") + + df = self.load() + + if self.labels is None: + raise ValueError("Label column(s) must be provided in config") + + if self.features is None: + self.features = [c for c in df.columns if c not in self.labels and c != self.cv_group_column] + + if self.cv_group_column not in df.columns: + raise ValueError(f"Group column '{self.cv_group_column}' not found in data") + + groups = df[self.cv_group_column].copy() + X = df[self.features].copy() + + # Handle single vs multi-output + if len(self.labels) == 1: + y = df[self.labels[0]].copy() + mask_target = ~y.isna() + else: + y = df[self.labels].copy() + mask_target = ~y.isna().any(axis=1) + + # Drop NaN rows + if mask_target.any() and not mask_target.all(): + n_dropped_target = (~mask_target).sum() + print(f"Warning: Dropping {n_dropped_target} rows with NaN in target {self.labels}") + X, y, groups = X[mask_target], y[mask_target], groups[mask_target] + + mask_features = ~X.isna().any(axis=1) + if not mask_features.all(): + n_dropped_features = (~mask_features).sum() + cols_with_nan = X.columns[X.isna().any()].tolist() + print(f"Warning: Dropping {n_dropped_features} additional rows with NaN in features: {cols_with_nan}") + X, y, groups = X[mask_features], y[mask_features], groups[mask_features] + + # Reset indices + X = X.reset_index(drop=True) + y = y.reset_index(drop=True) + groups = groups.reset_index(drop=True) + + # Encode labels + y, self.label_encoders, self.label_classes_dict, self.label_encoder, self.label_classes = encode_labels( + y, self.labels, self.problem_type + ) + + # Identify numeric vs categorical + numeric_cols = X.select_dtypes(include=["number"]).columns.tolist() + cat_cols = [c for c in X.columns if c not in numeric_cols] + + # Prepare grouped splits + result = prepare_grouped_splits( + X, y, groups, self.labels, numeric_cols, cat_cols, + self.random_state, self.scaling, self.impute_strategy, + n_train_groups, n_val_groups, n_test_groups, self.cv_group_column + ) + + # Unpack result + (X_train, X_val, X_test, y_train, y_val, y_test, group_assignments, + groups_train, groups_val, groups_test, preprocessor) = result + + # Store preprocessor and feature names + self.preprocessor = preprocessor + self.processed_feature_names = self._get_feature_names_after_transform( + preprocessor, numeric_cols, cat_cols + ) + + return ( + X_train, X_val, X_test, y_train, y_val, y_test, group_assignments, + groups_train, groups_val, groups_test + ) diff --git a/ecosci/data/preprocessing.py b/ecosci/data/preprocessing.py new file mode 100644 index 0000000..9748690 --- /dev/null +++ b/ecosci/data/preprocessing.py @@ -0,0 +1,144 @@ +"""Preprocessing utilities for feature extraction and transformation.""" + +from typing import List + + +def get_feature_names_after_transform(preprocessor, numeric_cols: List[str], cat_cols: List[str]) -> List[str]: + """Get feature names after preprocessing (including one-hot encoding). 
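At the loader level, `prepare_grouped_splits` now returns ten values, with the per-split group-ID arrays appended for the tuning workflow. A minimal usage sketch with placeholder file and column names; the data must contain at least `n_train_groups + n_val_groups + n_test_groups` unique groups:

```python
# Sketch: grouped train/val/test split via CSVDataLoader.prepare_grouped_splits.
# Path and column names are hypothetical placeholders.
from ecosci.data import CSVDataLoader

loader = CSVDataLoader(
    path="data/sites.csv",                 # placeholder CSV
    features=["temperature", "rainfall"],  # placeholder features
    labels=["abundance"],                  # placeholder target
    problem_type="regression",
    cv_group_column="site_id",
    random_state=42,
)

(X_train, X_val, X_test,
 y_train, y_val, y_test,
 group_assignments,
 groups_train, groups_val, groups_test) = loader.prepare_grouped_splits(
    n_train_groups=4, n_val_groups=2, n_test_groups=2,
)

print("groups per split:", group_assignments)
print("split sizes:", len(y_train), len(y_val), len(y_test))
```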
+ + Parameters + ---------- + preprocessor : ColumnTransformer + Fitted sklearn ColumnTransformer with numeric and categorical pipelines + numeric_cols : list of str + Names of numeric columns + cat_cols : list of str + Names of categorical columns + + Returns + ------- + list of str + Feature names after transformation + """ + feature_names = [] + + # Get feature names from each transformer + for name, transformer, columns in preprocessor.transformers_: + if name == "num": + # Numeric columns keep their names + feature_names.extend(columns) + elif name == "cat": + # For categorical columns, get one-hot encoded names + try: + # Get the OneHotEncoder from the pipeline + onehot = transformer.named_steps.get("onehot") + if onehot is not None and hasattr(onehot, "get_feature_names_out"): + # Get one-hot feature names + cat_features = onehot.get_feature_names_out(columns) + feature_names.extend(cat_features) + else: + # Fallback: just use column names + feature_names.extend(columns) + except Exception: + # Fallback if something goes wrong + feature_names.extend(columns) + + return feature_names + + +def build_preprocessing_pipeline(numeric_cols: List[str], cat_cols: List[str], + scaling: str = "standard", impute_strategy: str = "mean"): + """Build a scikit-learn preprocessing pipeline. + + Parameters + ---------- + numeric_cols : list of str + Names of numeric columns + cat_cols : list of str + Names of categorical columns + scaling : str + "standard" or "minmax" scaling for numeric features + impute_strategy : str + Imputation strategy for numeric features ("mean", "median", "most_frequent") + + Returns + ------- + ColumnTransformer + Preprocessing pipeline ready to fit + """ + from sklearn.impute import SimpleImputer + from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder + from sklearn.compose import ColumnTransformer + from sklearn.pipeline import Pipeline + + transformers = [] + + if numeric_cols: + scaler = StandardScaler() if scaling == "standard" else MinMaxScaler() + num_pipeline = Pipeline([ + ("imputer", SimpleImputer(strategy=impute_strategy)), + ("scaler", scaler) + ]) + transformers.append(("num", num_pipeline, numeric_cols)) + + if cat_cols: + cat_pipeline = Pipeline([ + ("imputer", SimpleImputer(strategy="most_frequent")), + ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)), + ]) + transformers.append(("cat", cat_pipeline, cat_cols)) + + return ColumnTransformer(transformers=transformers, remainder="drop") + + +def encode_labels(y, labels: List[str], problem_type: str = "classification"): + """Encode labels if they are categorical strings. 
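`build_preprocessing_pipeline` and `get_feature_names_after_transform` are meant to be used as a pair: fit the returned `ColumnTransformer`, then recover the post-encoding column names. A minimal sketch on a toy DataFrame; the column names and values are made up, not toolkit data:

```python
# Sketch: build_preprocessing_pipeline + get_feature_names_after_transform
# on a toy frame with one numeric and one categorical column, each with a gap.
import pandas as pd
from ecosci.data.preprocessing import (
    build_preprocessing_pipeline,
    get_feature_names_after_transform,
)

df = pd.DataFrame({
    "rainfall": [120.0, 85.5, None, 230.0],           # numeric, one missing value
    "habitat": ["forest", "wetland", "forest", None],  # categorical, one missing value
})
numeric_cols = ["rainfall"]
cat_cols = ["habitat"]

pre = build_preprocessing_pipeline(
    numeric_cols, cat_cols, scaling="standard", impute_strategy="mean"
)
X_proc = pre.fit_transform(df)

names = get_feature_names_after_transform(pre, numeric_cols, cat_cols)
print([str(n) for n in names])  # ['rainfall', 'habitat_forest', 'habitat_wetland']
print(X_proc.shape)             # (4, 3)
```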
+ + Parameters + ---------- + y : pandas Series or DataFrame + Target variable(s) + labels : list of str + Names of target variable(s) + problem_type : str + "classification" or "regression" + + Returns + ------- + tuple + (encoded_y, label_encoders_dict, label_classes_dict, label_encoder, label_classes) + Last two maintain backward compatibility for single output + """ + from sklearn.preprocessing import LabelEncoder + import numpy as np + import pandas as pd + + label_encoders = {} + label_classes_dict = {} + + if len(labels) == 1: + # Single output: maintain backward compatibility + if y.dtype == "object" or y.dtype.name == "category": + label_encoder = LabelEncoder() + y_encoded = label_encoder.fit_transform(y) + label_encoders[labels[0]] = label_encoder + label_classes_dict[labels[0]] = label_encoder.classes_ + return y_encoded, label_encoders, label_classes_dict, label_encoder, label_encoder.classes_ + else: + return y, label_encoders, label_classes_dict, None, None + else: + # Multi-output: encode each column separately + y_encoded = [] + for col in labels: + if y[col].dtype == "object" or y[col].dtype.name == "category": + label_encoder = LabelEncoder() + y_col_encoded = label_encoder.fit_transform(y[col]) + label_encoders[col] = label_encoder + label_classes_dict[col] = label_encoder.classes_ + else: + y_col_encoded = y[col].values + label_encoders[col] = None + label_classes_dict[col] = None + y_encoded.append(y_col_encoded) + y_encoded = np.column_stack(y_encoded) + return y_encoded, label_encoders, label_classes_dict, None, None diff --git a/ecosci/data/splitting.py b/ecosci/data/splitting.py new file mode 100644 index 0000000..c7767f2 --- /dev/null +++ b/ecosci/data/splitting.py @@ -0,0 +1,260 @@ +"""Data splitting utilities for cross-validation and grouped splits.""" + +from typing import List, Tuple, Dict, Optional +import numpy as np +import pandas as pd + + +def prepare_cv_folds( + X: pd.DataFrame, + y, + groups: pd.Series, + labels: List[str], + numeric_cols: List[str], + cat_cols: List[str], + val_size: float, + random_state: int, + scaling: str, + impute_strategy: str, + problem_type: str, + cv_group_column: str, +) -> List[Tuple]: + """Prepare k-fold cross-validation splits using group-based splitting. 
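A quick illustration of what `encode_labels` returns in the multi-output case. The target names and values below are toy placeholders:

```python
# Sketch: encode_labels on two categorical targets.
# Each column gets its own LabelEncoder; numeric targets pass through unchanged.
import pandas as pd
from ecosci.data.preprocessing import encode_labels

y = pd.DataFrame({
    "status": ["healthy", "stressed", "healthy", "stressed"],
    "zone":   ["north", "south", "south", "north"],
})

y_enc, encoders, classes, single_encoder, single_classes = encode_labels(
    y, ["status", "zone"], problem_type="classification"
)

print(y_enc.shape)             # (4, 2) integer-coded array
print(classes["status"])       # ['healthy' 'stressed']
print(single_encoder is None)  # True: only set for single-output targets
```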
+ + Parameters + ---------- + X : DataFrame + Feature data + y : array-like + Target data (already encoded if categorical) + groups : Series + Group IDs for GroupKFold + labels : list of str + Names of target variables + numeric_cols : list of str + Names of numeric columns + cat_cols : list of str + Names of categorical columns + val_size : float + Validation set size as fraction + random_state : int + Random seed + scaling : str + "standard" or "minmax" + impute_strategy : str + Imputation strategy + problem_type : str + "classification" or "regression" + cv_group_column : str + Name of the group column + + Returns + ------- + list of tuples + Each tuple contains (X_train, X_val, X_test, y_train, y_val, y_test, fold_id) + """ + from sklearn.model_selection import GroupKFold, train_test_split + from .preprocessing import build_preprocessing_pipeline + + unique_groups = groups.unique() + n_folds = len(unique_groups) + + print(f"\nK-Fold Cross-Validation Setup:") + print(f" Group column: {cv_group_column}") + print(f" Number of unique groups: {n_folds}") + print(f" Number of folds: {n_folds}") + print(f" Samples per group: {groups.value_counts().to_dict()}\n") + + # Create k-fold splits + group_kfold = GroupKFold(n_splits=n_folds) + + fold_data = [] + for fold_idx, (train_val_idx, test_idx) in enumerate(group_kfold.split(X, y, groups)): + # Get raw train_val and test data + X_train_val_raw = X.iloc[train_val_idx] + y_train_val = y[train_val_idx] if isinstance(y, np.ndarray) else y.iloc[train_val_idx] + X_test_raw = X.iloc[test_idx] + y_test = y[test_idx] if isinstance(y, np.ndarray) else y.iloc[test_idx] + + # Create a NEW preprocessor for this fold (to avoid data leakage) + fold_preprocessor = build_preprocessing_pipeline( + numeric_cols, cat_cols, scaling, impute_strategy + ) + + # Fit preprocessor ONLY on train_val data (not on test!) + X_train_val_proc = fold_preprocessor.fit_transform(X_train_val_raw) + X_test_proc = fold_preprocessor.transform(X_test_raw) + + # Further split train_val into train and val + if val_size <= 0: + X_train, X_val, y_train, y_val = X_train_val_proc, None, y_train_val, None + else: + val_fraction = val_size / (1.0 - (1.0 / n_folds)) + + # For stratification + if problem_type == "classification" and len(labels) == 1: + n_unique_train = len(np.unique(y_train_val)) + stratify_train = y_train_val if n_unique_train <= 20 else None + else: + stratify_train = None + + X_train, X_val, y_train, y_val = train_test_split( + X_train_val_proc, + y_train_val, + test_size=val_fraction, + random_state=random_state, + stratify=stratify_train, + ) + + fold_data.append(( + X_train, + X_val, + X_test_proc, + np.array(y_train), + (None if y_val is None else np.array(y_val)), + np.array(y_test), + fold_idx + )) + + return fold_data + + +def prepare_grouped_splits( + X: pd.DataFrame, + y, + groups: pd.Series, + labels: List[str], + numeric_cols: List[str], + cat_cols: List[str], + random_state: int, + scaling: str, + impute_strategy: str, + n_train_groups: int, + n_val_groups: int, + n_test_groups: int, + cv_group_column: str, +) -> Tuple: + """Prepare train/val/test splits by assigning groups to each split. + + This function ensures no data leakage by keeping all samples from the same + group together in one split (train, val, or test). 
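The leakage guarantee in `prepare_cv_folds` comes from two things visible above: `GroupKFold` keeps every sample of a group in the same test fold, and the preprocessor is refit from scratch on each fold's train/val portion. A minimal sketch of the `GroupKFold` behaviour on toy arrays, not toolkit data:

```python
# Sketch: GroupKFold places each group intact in exactly one test fold,
# which is what prepare_cv_folds relies on to avoid group-level leakage.
import numpy as np
from sklearn.model_selection import GroupKFold

X = np.arange(12, dtype=float).reshape(-1, 1)       # 12 samples
y = np.zeros(12)
groups = np.repeat(["g1", "g2", "g3", "g4"], 3)     # 4 groups of 3 samples

gkf = GroupKFold(n_splits=len(np.unique(groups)))   # one fold per group
for fold_id, (train_idx, test_idx) in enumerate(gkf.split(X, y, groups)):
    print(f"fold {fold_id}: test groups = {sorted(set(groups[test_idx].tolist()))}")
    # every test fold contains one whole group and nothing from the others
```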
+ + Parameters + ---------- + X : DataFrame + Feature data + y : array-like + Target data (already encoded if categorical) + groups : Series + Group IDs + labels : list of str + Names of target variables + numeric_cols : list of str + Names of numeric columns + cat_cols : list of str + Names of categorical columns + random_state : int + Random seed + scaling : str + "standard" or "minmax" + impute_strategy : str + Imputation strategy + n_train_groups : int + Number of groups for training + n_val_groups : int + Number of groups for validation + n_test_groups : int + Number of groups for testing + cv_group_column : str + Name of the group column + + Returns + ------- + tuple + (X_train, X_val, X_test, y_train, y_val, y_test, group_assignments, + groups_train, groups_val, groups_test) + """ + from .preprocessing import build_preprocessing_pipeline + + unique_groups = groups.unique() + n_groups = len(unique_groups) + + # Validate we have enough groups + total_groups_needed = n_train_groups + n_val_groups + n_test_groups + if n_groups < total_groups_needed: + raise ValueError( + f"Not enough groups: need {total_groups_needed} " + f"({n_train_groups} train + {n_val_groups} val + {n_test_groups} test), " + f"but only have {n_groups} unique groups" + ) + + # Shuffle groups for random assignment + rng = np.random.RandomState(random_state) + shuffled_groups = rng.permutation(unique_groups) + + # Assign groups to splits + train_groups = shuffled_groups[:n_train_groups] + val_groups = shuffled_groups[n_train_groups:n_train_groups + n_val_groups] + test_groups = shuffled_groups[n_train_groups + n_val_groups:n_train_groups + n_val_groups + n_test_groups] + + group_assignments = { + 'train': train_groups.tolist(), + 'val': val_groups.tolist(), + 'test': test_groups.tolist(), + } + + print(f"\nGrouped Train/Val/Test Split Setup:") + print(f" Group column: {cv_group_column}") + print(f" Total unique groups: {n_groups}") + print(f" Train groups ({n_train_groups}): {train_groups.tolist()}") + print(f" Val groups ({n_val_groups}): {val_groups.tolist()}") + print(f" Test groups ({n_test_groups}): {test_groups.tolist()}") + + # Create masks for each split + train_mask = groups.isin(train_groups) + val_mask = groups.isin(val_groups) + test_mask = groups.isin(test_groups) + + print(f"\n Train samples: {train_mask.sum()}") + print(f" Val samples: {val_mask.sum()}") + print(f" Test samples: {test_mask.sum()}\n") + + # Build preprocessing pipeline + preprocessor = build_preprocessing_pipeline( + numeric_cols, cat_cols, scaling, impute_strategy + ) + + # Fit preprocessor on training data only (important!) + X_train_raw = X[train_mask] + preprocessor.fit(X_train_raw) + + # Transform all splits + X_proc = preprocessor.transform(X) + + # Split the data + X_train = X_proc[train_mask] + X_val = X_proc[val_mask] + X_test = X_proc[test_mask] + + y_train = y[train_mask] + y_val = y[val_mask] + y_test = y[test_mask] + + # Extract group IDs for each split + groups_train = groups[train_mask].values + groups_val = groups[val_mask].values + groups_test = groups[test_mask].values + + return ( + X_train, + X_val, + X_test, + np.array(y_train), + np.array(y_val), + np.array(y_test), + group_assignments, + groups_train, + groups_val, + groups_test, + preprocessor, # Return for feature name extraction + ) diff --git a/ecosci/evaluation/reporting.py b/ecosci/evaluation/reporting.py deleted file mode 100644 index 84b3caa..0000000 --- a/ecosci/evaluation/reporting.py +++ /dev/null @@ -1,765 +0,0 @@ -"""Reporting and evaluation utilities. 
- -This module contains the main evaluation and reporting functions that orchestrate -metrics computation, feature importance analysis, and plot generation. - -Focus on metrics that make sense for unbalanced datasets: -- Balanced accuracy (accounts for class imbalance) -- Precision/Recall/F1 (macro for multi-class) -- ROC-AUC and Average Precision (PR AUC) when probabilities are available - -Outputs: -- `report.json`: list of per-seed metrics -- `metric_*.png`: quick boxplots across seeds -- `confusion_matrix.png`: confusion matrix heatmap (last run) -- `pr_curve.png`: precision-recall curve if binary and probabilities exist -""" - -from typing import Dict, Optional -import os -import json -import numpy as np -import pandas as pd - -from .metrics import ( - safe_std, - compute_classification_metrics, - compute_regression_metrics, - compute_multi_output_classification_metrics, - compute_multi_output_regression_metrics, -) -from .plotting import ( - plot_metric_comparisons, - plot_confusion_matrices, - plot_pr_curves, - plot_residuals, - plot_cv_comparison, - plot_validation_vs_test, -) -from .feature_importance import ( - extract_feature_importance, - extract_cv_feature_importance, -) - - -def evaluate_and_report( - results, y_test, output_dir: str = "outputs", problem_type: str = "classification", - label_names: list = None, feature_names: list = None, X_test=None -): - """Compute metrics across runs and save a compact report and plots. - - Parameters - ---------- - results : dict or list - If dict: Output from Trainer.run() with multiple models {model_name: [run_results]} - If list: Single model results (backward compatibility) - y_test : array-like - Ground-truth labels (classification) or values (regression) for the test set. - output_dir : str - Where to save the report and plots. - problem_type : str - "classification" or "regression". Determines which metrics to compute. - label_names : list, optional - Names of the target variables (for multi-output). If None, will use Output 1, Output 2, etc. - feature_names : list, optional - Names of the input features. If None, will use Feature 0, Feature 1, etc. - X_test : array-like, optional - Test features for permutation importance. Required for models without built-in importance (e.g., MLPs). 
- """ - os.makedirs(output_dir, exist_ok=True) - - # Handle backward compatibility: convert list to dict - if isinstance(results, list): - # Assume single model (old format) - model_name = results[0].get("model_name", "model") if results else "model" - results = {model_name: results} - - # Determine if we have multi-output - is_multi_output = len(y_test.shape) > 1 and y_test.shape[1] > 1 - - # Compute metrics for each model - all_summaries = {} - all_dfs = {} - - for model_name, model_results in results.items(): - summary = [] - for r in model_results: - seed = r.get("seed") - y_pred = r.get("y_pred") - y_proba = r.get("y_proba") - - if is_multi_output: - # Multi-output case - if problem_type == "regression": - metrics = compute_multi_output_regression_metrics(y_test, y_pred) - else: - metrics = compute_multi_output_classification_metrics(y_test, y_pred, y_proba) - else: - # Single output case - if problem_type == "regression": - metrics = compute_regression_metrics(y_test, y_pred) - else: - metrics = compute_classification_metrics(y_test, y_pred, y_proba) - - metrics["seed"] = seed - metrics["model"] = model_name - summary.append(metrics) - - all_summaries[model_name] = summary - all_dfs[model_name] = pd.DataFrame(summary) - - # Save individual model reports in model-specific subfolders - for model_name, summary in all_summaries.items(): - model_dir = os.path.join(output_dir, model_name) - os.makedirs(model_dir, exist_ok=True) - report_path = os.path.join(model_dir, f"report_{model_name}.json") - with open(report_path, "w") as f: - json.dump(summary, f, indent=2) - - # Combine all results for comparison - combined_df = pd.concat(all_dfs.values(), ignore_index=True) - combined_summary = [item for sublist in all_summaries.values() for item in sublist] - - # Save combined report - with open(os.path.join(output_dir, "report_all_models.json"), "w") as f: - json.dump(combined_summary, f, indent=2) - - # Print results for each model - display_cols, comparison_metrics, plot_metrics = _get_metric_columns( - problem_type, is_multi_output, combined_df - ) - - _print_model_results(all_dfs, display_cols, is_multi_output, label_names) - - if len(all_dfs) > 1: - _print_model_comparison(all_dfs, comparison_metrics) - _print_model_ranking(all_dfs, display_cols, problem_type, is_multi_output) - - # Create plots - plot_metric_comparisons(combined_df, plot_metrics, output_dir, all_dfs) - - if problem_type == "classification": - plot_confusion_matrices(results, y_test, output_dir) - plot_pr_curves(results, y_test, output_dir) - elif problem_type == "regression" and not is_multi_output: - plot_residuals(results, y_test, output_dir) - - # Feature importance analysis - extract_feature_importance(results, X_test, y_test, output_dir, problem_type, feature_names) - - return combined_summary - - -def evaluate_and_report_cv( - results, output_dir: str = "outputs", problem_type: str = "classification", - label_names: list = None, feature_names: list = None, X_test=None -): - """Compute metrics for k-fold cross-validation with per-fold breakdown. - - Parameters - ---------- - results : dict - Output from Trainer.run_cv() with format {model_name: [run_results_with_fold_info]} - Each run_result contains: seed, fold, y_pred, y_test, etc. - output_dir : str - Where to save the report and plots. - problem_type : str - "classification" or "regression". - label_names : list, optional - Names of the target variables (for multi-output). - feature_names : list, optional - Names of the input features. 
- X_test : array-like, optional - Test features (note: in CV, each fold has different test set). - """ - os.makedirs(output_dir, exist_ok=True) - - print(f"\n{'='*80}") - print("K-FOLD CROSS-VALIDATION RESULTS") - print(f"{'='*80}\n") - - # Determine if multi-output by checking first result - first_result = next(iter(results.values()))[0] - y_test_sample = first_result["y_test"] - is_multi_output = len(y_test_sample.shape) > 1 and y_test_sample.shape[1] > 1 - - all_summaries = {} - all_dfs = {} - - for model_name, model_results in results.items(): - # Group results by fold - folds = {} - for r in model_results: - fold_id = r.get("fold") - if fold_id not in folds: - folds[fold_id] = [] - folds[fold_id].append(r) - - n_folds = len(folds) - - print(f"\n{model_name.upper()} - {n_folds} Folds") - print(f"{'-'*80}") - - # Compute metrics for each fold - fold_summaries = [] - for fold_id in sorted(folds.keys()): - fold_results = folds[fold_id] - fold_metrics_list = [] - - for r in fold_results: - seed = r.get("seed") - y_pred = r.get("y_pred") - y_test = r.get("y_test") - y_proba = r.get("y_proba") - - if problem_type == "classification": - if is_multi_output: - metrics = compute_multi_output_classification_metrics(y_test, y_pred, y_proba) - else: - metrics = compute_classification_metrics(y_test, y_pred, y_proba) - else: # regression - if is_multi_output: - metrics = compute_multi_output_regression_metrics(y_test, y_pred) - else: - metrics = compute_regression_metrics(y_test, y_pred) - - metrics["seed"] = seed - metrics["fold"] = fold_id - fold_metrics_list.append(metrics) - - fold_summaries.append(fold_metrics_list) - - # Print per-fold metrics - metric_keys = _get_cv_metric_keys(problem_type, is_multi_output) - _print_cv_fold_results(folds, fold_summaries, metric_keys) - - # Compute overall metrics (averaged across all folds and seeds) - all_fold_metrics = [m for fold_list in fold_summaries for m in fold_list] - overall_df = pd.DataFrame(all_fold_metrics) - - # Print overall metrics - print(f"\n {'Overall':<6}", end="") - for mk in metric_keys: - if mk in overall_df.columns: - mean_val = overall_df[mk].mean() - std_val = overall_df[mk].std() - print(f"{mean_val:.4f} ± {std_val:.4f} ", end="") - else: - print(f"{'N/A':<20}", end="") - print("\n") - - # Save reports and metrics - _save_cv_reports(model_name, all_fold_metrics, fold_summaries, folds, overall_df, output_dir) - - all_summaries[model_name] = overall_df - all_dfs[model_name] = overall_df - - # Create comparison plots across folds and models - plot_cv_comparison(all_dfs, output_dir, problem_type, is_multi_output) - - print(f"\n{'='*80}\n") - - # Feature importance for CV - extract_cv_feature_importance(results, output_dir, problem_type, feature_names) - - return all_summaries - - -def evaluate_tuning_results( - results: Dict, - y_val: np.ndarray, - y_test: np.ndarray, - output_dir: str = "outputs", - problem_type: str = "classification", - label_names: Optional[list] = None, - feature_names: Optional[list] = None, - X_val: Optional[np.ndarray] = None, - X_test: Optional[np.ndarray] = None, -): - """Evaluate and report results from hyperparameter tuning. - - This function evaluates model performance on both validation and test sets - after hyperparameter tuning. It produces reports and visualisations comparing - performance across seeds and between val/test sets. 
- - Parameters - ---------- - results : Dict - Results dictionary from trainer.run_with_tuning() - y_val : np.ndarray - Validation set labels - y_test : np.ndarray - Test set labels - output_dir : str - Directory to save outputs - problem_type : str - "classification" or "regression" - label_names : list, optional - Names of label columns - feature_names : list, optional - Names of features (for feature importance) - X_val : np.ndarray, optional - Validation features (for feature importance) - X_test : np.ndarray, optional - Test features (for feature importance) - - Returns - ------- - dict - Summary statistics for each model - """ - os.makedirs(output_dir, exist_ok=True) - - print(f"\n{'='*80}") - print("HYPERPARAMETER TUNING EVALUATION") - print(f"{'='*80}\n") - - all_summaries = {} - - for model_name, model_results in results.items(): - print(f"\n{model_name.upper()}") - print("-" * 60) - - # Collect metrics for each seed - val_metrics_per_seed = [] - test_metrics_per_seed = [] - best_params_per_seed = [] - best_cv_scores = [] - - for run in model_results: - seed = run['seed'] - best_params = run['best_params'] - best_cv_score = run['best_cv_score'] - - best_params_per_seed.append(best_params) - best_cv_scores.append(best_cv_score) - - # Compute validation metrics - if problem_type == "regression": - val_metrics = compute_regression_metrics( - y_val, run['y_val_pred'] - ) - else: - val_metrics = compute_classification_metrics( - y_val, run['y_val_pred'], run.get('y_val_proba') - ) - val_metrics['seed'] = seed - val_metrics_per_seed.append(val_metrics) - - # Compute test metrics - if problem_type == "regression": - test_metrics = compute_regression_metrics( - y_test, run['y_test_pred'] - ) - else: - test_metrics = compute_classification_metrics( - y_test, run['y_test_pred'], run.get('y_test_proba') - ) - test_metrics['seed'] = seed - test_metrics_per_seed.append(test_metrics) - - # Create summary DataFrames - val_df = pd.DataFrame(val_metrics_per_seed) - test_df = pd.DataFrame(test_metrics_per_seed) - - # Print summary statistics - _print_tuning_summary(val_df, test_df, best_cv_scores, problem_type, len(model_results)) - - # Save detailed results - _save_tuning_results(model_name, val_df, test_df, best_params_per_seed, output_dir) - - # Create comparison plots - plot_validation_vs_test(val_df, test_df, model_name, output_dir, problem_type) - - # Store summary - all_summaries[model_name] = { - 'validation': val_df, - 'test': test_df, - 'best_params': best_params_per_seed, - 'best_cv_scores': best_cv_scores, - } - - # Create summary report comparing all models - _save_tuning_summary_report(all_summaries, output_dir, problem_type) - - return all_summaries - - -# Helper functions - -def _get_metric_columns(problem_type, is_multi_output, combined_df): - """Determine which metric columns to display based on problem type.""" - if is_multi_output: - if problem_type == "regression": - display_cols = ["seed", "mse_mean", "rmse_mean", "mae_mean", "r2_mean"] - comparison_metrics = ["mse_mean", "rmse_mean", "mae_mean", "r2_mean"] - plot_metrics = ["mse_mean", "rmse_mean", "mae_mean", "r2_mean"] - else: - display_cols = [ - "seed", "accuracy_mean", "balanced_accuracy_mean", - "precision_mean", "recall_mean", "f1_mean", "cohen_kappa_mean", - ] - comparison_metrics = [ - "accuracy_mean", "balanced_accuracy_mean", "f1_mean", "cohen_kappa_mean", - ] - plot_metrics = ["accuracy_mean", "balanced_accuracy_mean", "f1_mean", "cohen_kappa_mean"] - else: - if problem_type == "regression": - display_cols = 
["seed", "mse", "rmse", "mae", "r2"] - if "mape" in combined_df.columns: - display_cols.append("mape") - comparison_metrics = ["mse", "rmse", "mae", "r2"] - plot_metrics = ["mse", "rmse", "mae", "r2"] - else: - display_cols = [ - "seed", "accuracy", "balanced_accuracy", - "precision", "recall", "f1", "cohen_kappa", - ] - if "roc_auc" in combined_df.columns: - display_cols.append("roc_auc") - if "average_precision" in combined_df.columns: - display_cols.append("average_precision") - comparison_metrics = [ - "accuracy", "balanced_accuracy", "f1", "cohen_kappa", "roc_auc", - ] - plot_metrics = ["accuracy", "balanced_accuracy", "f1", "cohen_kappa"] - - return display_cols, comparison_metrics, plot_metrics - - -def _print_model_results(all_dfs, display_cols, is_multi_output, label_names): - """Print results for each model.""" - for model_name, df in all_dfs.items(): - print(f"\n{'='*80}") - print(f"RESULTS FOR MODEL: {model_name.upper()}") - print(f"{'='*80}") - print( - df[display_cols].to_string( - index=False, - float_format=lambda x: f"{x:.4f}" if not pd.isna(x) else "N/A", - ) - ) - - print(f"\n{'-'*80}") - print(f"SUMMARY STATISTICS FOR {model_name.upper()} (mean ± std)") - print(f"{'-'*80}") - metric_cols = [c for c in display_cols if c != "seed"] - for col in metric_cols: - if col in df.columns: - vals = df[col].dropna() - if len(vals) > 0: - print(f"{col:20s}: {vals.mean():.4f} ± {safe_std(vals):.4f}") - print(f"{'-'*80}\n") - - # For multi-output, print per-output breakdown - if is_multi_output and len(df) > 0 and "per_output" in df.iloc[0]: - _print_per_output_breakdown(df, label_names, model_name) - - -def _print_per_output_breakdown(df, label_names, model_name): - """Print per-output breakdown for multi-output problems.""" - print(f"\n{'-'*80}") - print(f"PER-OUTPUT BREAKDOWN FOR {model_name.upper()}") - print(f"{'-'*80}") - - first_row = df.iloc[0] - if "per_output" in first_row and first_row["per_output"] is not None: - per_output_list = first_row["per_output"] - n_outputs = len(per_output_list) - - # Collect metrics across all seeds for each output - for output_idx in range(n_outputs): - # Use label name if provided, otherwise use generic Output N - if label_names and output_idx < len(label_names): - output_label = f"{label_names[output_idx]}" - else: - output_label = f"Output {output_idx + 1}" - - print(f"\n{output_label}:") - - # Aggregate metrics across seeds for this output - output_metrics = {} - for _, row in df.iterrows(): - if "per_output" in row and row["per_output"] is not None: - per_out = row["per_output"] - if output_idx < len(per_out): - for key, val in per_out[output_idx].items(): - if key not in ["confusion_matrix", "seed", "model"] and val is not None: - if key not in output_metrics: - output_metrics[key] = [] - output_metrics[key].append(val) - - # Print summary for this output - for metric_name, values in sorted(output_metrics.items()): - if len(values) > 0: - mean_val = np.mean(values) - std_val = safe_std(np.array(values)) - print(f" {metric_name:20s}: {mean_val:.4f} ± {std_val:.4f}") - print(f"{'-'*80}\n") - - -def _print_model_comparison(all_dfs, comparison_metrics): - """Print model comparison table.""" - print(f"\n{'='*80}") - print("MODEL COMPARISON (mean ± std)") - print(f"{'='*80}") - comparison_data = [] - for model_name, df in all_dfs.items(): - row = {"Model": model_name} - for col in comparison_metrics: - if col in df.columns: - vals = df[col].dropna() - if len(vals) > 0: - row[col] = f"{vals.mean():.4f} ± {safe_std(vals):.4f}" - else: - row[col] 
= "N/A" - comparison_data.append(row) - - comparison_df = pd.DataFrame(comparison_data) - print(comparison_df.to_string(index=False)) - print(f"{'='*80}\n") - - -def _print_model_ranking(all_dfs, display_cols, problem_type, is_multi_output): - """Print model ranking based on primary metric.""" - # Determine primary metric - if is_multi_output: - if problem_type == "regression": - primary_metric = "mse_mean" - lower_is_better = True - else: - primary_metric = "cohen_kappa_mean" - lower_is_better = False - else: - if problem_type == "regression": - primary_metric = "mse" - lower_is_better = True - else: - primary_metric = "cohen_kappa" - lower_is_better = False - - # Calculate mean primary metric for each model - model_scores = [] - for model_name, df in all_dfs.items(): - if primary_metric in df.columns: - vals = df[primary_metric].dropna() - if len(vals) > 0: - model_scores.append({ - "model": model_name, - "mean": vals.mean(), - "std": safe_std(vals), - }) - - if len(model_scores) > 0: - # Sort by primary metric - model_scores.sort(key=lambda x: x["mean"], reverse=not lower_is_better) - - print(f"\n{'='*80}") - print("MODEL RANKING") - print(f"{'='*80}") - print( - f"Ranked by: {primary_metric.replace('_', ' ').upper()} " - f"({'Lower is better' if lower_is_better else 'Higher is better'})" - ) - print(f"{'-'*80}") - - # Create ranking table with all metrics - ranking_data = [] - for rank, score in enumerate(model_scores, 1): - model_name = score["model"] - model_df = all_dfs[model_name] - - row = { - "Rank": f"{rank}.", - "Model": model_name.upper(), - primary_metric: f"{score['mean']:.4f} ± {score['std']:.4f}", - } - - # Add other metrics - metric_cols = [c for c in display_cols if c != "seed" and c != primary_metric] - for col in metric_cols: - if col in model_df.columns: - vals = model_df[col].dropna() - if len(vals) > 0: - row[col] = f"{vals.mean():.4f} ± {safe_std(vals):.4f}" - - ranking_data.append(row) - - # Print ranking table - ranking_df = pd.DataFrame(ranking_data) - print(ranking_df.to_string(index=False)) - print(f"{'='*80}\n") - - -def _get_cv_metric_keys(problem_type, is_multi_output): - """Get metric keys for CV results.""" - if problem_type == "classification": - if is_multi_output: - return ["accuracy_mean", "balanced_accuracy_mean", "f1_mean"] - else: - return ["accuracy", "balanced_accuracy", "f1"] - else: - if is_multi_output: - return ["r2_mean", "rmse_mean", "mae_mean"] - else: - return ["r2", "rmse", "mae"] - - -def _print_cv_fold_results(folds, fold_summaries, metric_keys): - """Print per-fold CV results.""" - print(f"\nPer-Fold Metrics (averaged across seeds per fold):") - - # Print table header - print(f"\n {'Fold':<6}", end="") - for mk in metric_keys: - print(f"{mk:<20}", end="") - print() - print(f" {'-'*6}", end="") - for _ in metric_keys: - print(f"{'-'*20}", end="") - print() - - # Print per-fold metrics - for fold_id in sorted(folds.keys()): - fold_results_list = fold_summaries[fold_id] - fold_df = pd.DataFrame(fold_results_list) - - print(f" {fold_id:<6}", end="") - for mk in metric_keys: - if mk in fold_df.columns: - mean_val = fold_df[mk].mean() - std_val = fold_df[mk].std() - print(f"{mean_val:.4f} ± {std_val:.4f} ", end="") - else: - print(f"{'N/A':<20}", end="") - print() - - -def _save_cv_reports(model_name, all_fold_metrics, fold_summaries, folds, overall_df, output_dir): - """Save CV reports and metrics to files.""" - model_dir = os.path.join(output_dir, model_name) - os.makedirs(model_dir, exist_ok=True) - - # Save detailed report - report_path 
= os.path.join(model_dir, f"report_{model_name}_cv.json") - with open(report_path, "w") as f: - json.dump(all_fold_metrics, f, indent=2, default=str) - print(f" Detailed JSON report saved to: {report_path}") - - # Save detailed metrics as CSV (all seeds and folds) - csv_detailed_path = os.path.join(model_dir, f"metrics_{model_name}_cv_detailed.csv") - overall_df.to_csv(csv_detailed_path, index=False) - print(f" Detailed CSV metrics saved to: {csv_detailed_path}") - - # Save per-fold summary (averaged across seeds) - fold_summary_data = [] - for fold_id in sorted(folds.keys()): - fold_results_list = fold_summaries[fold_id] - fold_df = pd.DataFrame(fold_results_list) - - # Calculate mean and std for each metric - fold_summary = {"fold": fold_id} - for col in fold_df.columns: - if col not in ["seed", "fold"]: - fold_summary[f"{col}_mean"] = fold_df[col].mean() - fold_summary[f"{col}_std"] = fold_df[col].std() - fold_summary_data.append(fold_summary) - - # Add overall row - overall_summary = {"fold": "overall"} - for col in overall_df.columns: - if col not in ["seed", "fold"]: - overall_summary[f"{col}_mean"] = overall_df[col].mean() - overall_summary[f"{col}_std"] = overall_df[col].std() - fold_summary_data.append(overall_summary) - - fold_summary_df = pd.DataFrame(fold_summary_data) - csv_summary_path = os.path.join(model_dir, f"metrics_{model_name}_cv_per_fold.csv") - fold_summary_df.to_csv(csv_summary_path, index=False) - print(f" Per-fold summary CSV saved to: {csv_summary_path}") - - -def _print_tuning_summary(val_df, test_df, best_cv_scores, problem_type, n_seeds): - """Print summary of hyperparameter tuning results.""" - print(f"\nValidation Set Performance (across {n_seeds} seeds):") - if problem_type == "regression": - print(f" R² = {val_df['r2'].mean():.4f} ± {safe_std(val_df['r2']):.4f}") - print(f" RMSE = {val_df['rmse'].mean():.4f} ± {safe_std(val_df['rmse']):.4f}") - print(f" MAE = {val_df['mae'].mean():.4f} ± {safe_std(val_df['mae']):.4f}") - else: - print(f" Accuracy = {val_df['accuracy'].mean():.4f} ± {safe_std(val_df['accuracy']):.4f}") - print(f" Balanced Accuracy = {val_df['balanced_accuracy'].mean():.4f} ± {safe_std(val_df['balanced_accuracy']):.4f}") - print(f" F1 = {val_df['f1'].mean():.4f} ± {safe_std(val_df['f1']):.4f}") - - print(f"\nTest Set Performance (across {n_seeds} seeds):") - if problem_type == "regression": - print(f" R² = {test_df['r2'].mean():.4f} ± {safe_std(test_df['r2']):.4f}") - print(f" RMSE = {test_df['rmse'].mean():.4f} ± {safe_std(test_df['rmse']):.4f}") - print(f" MAE = {test_df['mae'].mean():.4f} ± {safe_std(test_df['mae']):.4f}") - else: - print(f" Accuracy = {test_df['accuracy'].mean():.4f} ± {safe_std(test_df['accuracy']):.4f}") - print(f" Balanced Accuracy = {test_df['balanced_accuracy'].mean():.4f} ± {safe_std(test_df['balanced_accuracy']):.4f}") - print(f" F1 = {test_df['f1'].mean():.4f} ± {safe_std(test_df['f1']):.4f}") - - print(f"\nBest CV Score (during tuning): {np.mean(best_cv_scores):.4f} ± {safe_std(np.array(best_cv_scores)):.4f}") - - -def _save_tuning_results(model_name, val_df, test_df, best_params_per_seed, output_dir): - """Save hyperparameter tuning results to files.""" - model_dir = os.path.join(output_dir, model_name) - os.makedirs(model_dir, exist_ok=True) - - # Save validation metrics - val_csv = os.path.join(model_dir, f"metrics_{model_name}_validation.csv") - val_df.to_csv(val_csv, index=False) - print(f"\n Validation metrics saved to: {val_csv}") - - # Save test metrics - test_csv = os.path.join(model_dir, 
f"metrics_{model_name}_test.csv") - test_df.to_csv(test_csv, index=False) - print(f" Test metrics saved to: {test_csv}") - - # Save hyperparameter summary - params_df = pd.DataFrame(best_params_per_seed) - params_csv = os.path.join(model_dir, f"best_params_{model_name}.csv") - params_df.to_csv(params_csv, index=False) - print(f" Best parameters saved to: {params_csv}") - - -def _save_tuning_summary_report(all_summaries, output_dir, problem_type): - """Save summary report comparing all models.""" - summary_report = {} - for model_name, summary in all_summaries.items(): - val_df = summary['validation'] - test_df = summary['test'] - - if problem_type == "regression": - summary_report[model_name] = { - 'val_r2_mean': float(val_df['r2'].mean()), - 'val_r2_std': float(safe_std(val_df['r2'])), - 'test_r2_mean': float(test_df['r2'].mean()), - 'test_r2_std': float(safe_std(test_df['r2'])), - 'val_rmse_mean': float(val_df['rmse'].mean()), - 'val_rmse_std': float(safe_std(val_df['rmse'])), - 'test_rmse_mean': float(test_df['rmse'].mean()), - 'test_rmse_std': float(safe_std(test_df['rmse'])), - } - else: - summary_report[model_name] = { - 'val_accuracy_mean': float(val_df['accuracy'].mean()), - 'val_accuracy_std': float(safe_std(val_df['accuracy'])), - 'test_accuracy_mean': float(test_df['accuracy'].mean()), - 'test_accuracy_std': float(safe_std(test_df['accuracy'])), - 'val_f1_mean': float(val_df['f1'].mean()), - 'val_f1_std': float(safe_std(val_df['f1'])), - 'test_f1_mean': float(test_df['f1'].mean()), - 'test_f1_std': float(safe_std(test_df['f1'])), - } - - # Save summary report - summary_path = os.path.join(output_dir, "tuning_summary_all_models.json") - with open(summary_path, 'w') as f: - json.dump(summary_report, f, indent=2) - print(f"\n{'='*60}") - print(f"Summary report saved to: {summary_path}") - print(f"{'='*60}\n") diff --git a/ecosci/evaluation/reporting/__init__.py b/ecosci/evaluation/reporting/__init__.py new file mode 100644 index 0000000..f118cd0 --- /dev/null +++ b/ecosci/evaluation/reporting/__init__.py @@ -0,0 +1,13 @@ +"""Reporting submodule for evaluation.""" + +from .evaluation import ( + evaluate_and_report, + evaluate_and_report_cv, + evaluate_tuning_results, +) + +__all__ = [ + "evaluate_and_report", + "evaluate_and_report_cv", + "evaluate_tuning_results", +] diff --git a/ecosci/evaluation/reporting/display.py b/ecosci/evaluation/reporting/display.py new file mode 100644 index 0000000..a54340a --- /dev/null +++ b/ecosci/evaluation/reporting/display.py @@ -0,0 +1,372 @@ +"""Display and formatting utilities for evaluation reports.""" + +import numpy as np +import pandas as pd +from ..metrics import safe_std + + +def get_metric_columns(problem_type, is_multi_output, combined_df): + """Determine which metric columns to display based on problem type. 
+ + Parameters + ---------- + problem_type : str + "classification" or "regression" + is_multi_output : bool + Whether the problem has multiple outputs + combined_df : DataFrame + Combined results DataFrame + + Returns + ------- + tuple + (display_cols, comparison_metrics, plot_metrics) + """ + if is_multi_output: + if problem_type == "regression": + display_cols = ["seed", "mse_mean", "rmse_mean", "mae_mean", "r2_mean"] + comparison_metrics = ["mse_mean", "rmse_mean", "mae_mean", "r2_mean"] + plot_metrics = ["mse_mean", "rmse_mean", "mae_mean", "r2_mean"] + else: + display_cols = [ + "seed", "accuracy_mean", "balanced_accuracy_mean", + "precision_mean", "recall_mean", "f1_mean", "cohen_kappa_mean", + ] + comparison_metrics = [ + "accuracy_mean", "balanced_accuracy_mean", "f1_mean", "cohen_kappa_mean", + ] + plot_metrics = ["accuracy_mean", "balanced_accuracy_mean", "f1_mean", "cohen_kappa_mean"] + else: + if problem_type == "regression": + display_cols = ["seed", "mse", "rmse", "mae", "r2"] + if "mape" in combined_df.columns: + display_cols.append("mape") + comparison_metrics = ["mse", "rmse", "mae", "r2"] + plot_metrics = ["mse", "rmse", "mae", "r2"] + else: + display_cols = [ + "seed", "accuracy", "balanced_accuracy", + "precision", "recall", "f1", "cohen_kappa", + ] + if "roc_auc" in combined_df.columns: + display_cols.append("roc_auc") + if "average_precision" in combined_df.columns: + display_cols.append("average_precision") + comparison_metrics = [ + "accuracy", "balanced_accuracy", "f1", "cohen_kappa", "roc_auc", + ] + plot_metrics = ["accuracy", "balanced_accuracy", "f1", "cohen_kappa"] + + return display_cols, comparison_metrics, plot_metrics + + +def print_model_results(all_dfs, display_cols, is_multi_output, label_names): + """Print results for each model. + + Parameters + ---------- + all_dfs : dict + Dictionary mapping model names to DataFrames with results + display_cols : list + Columns to display + is_multi_output : bool + Whether the problem has multiple outputs + label_names : list or None + Names of output labels + """ + for model_name, df in all_dfs.items(): + print(f"\n{'='*80}") + print(f"RESULTS FOR MODEL: {model_name.upper()}") + print(f"{'='*80}") + print( + df[display_cols].to_string( + index=False, + float_format=lambda x: f"{x:.4f}" if not pd.isna(x) else "N/A", + ) + ) + + print(f"\n{'-'*80}") + print(f"SUMMARY STATISTICS FOR {model_name.upper()} (mean ± std)") + print(f"{'-'*80}") + metric_cols = [c for c in display_cols if c != "seed"] + for col in metric_cols: + if col in df.columns: + vals = df[col].dropna() + if len(vals) > 0: + print(f"{col:20s}: {vals.mean():.4f} ± {safe_std(vals):.4f}") + print(f"{'-'*80}\n") + + # For multi-output, print per-output breakdown + if is_multi_output and len(df) > 0 and "per_output" in df.iloc[0]: + print_per_output_breakdown(df, label_names, model_name) + + +def print_per_output_breakdown(df, label_names, model_name): + """Print per-output breakdown for multi-output problems. 
+ + Parameters + ---------- + df : DataFrame + Results DataFrame + label_names : list or None + Names of output labels + model_name : str + Name of the model + """ + print(f"\n{'-'*80}") + print(f"PER-OUTPUT BREAKDOWN FOR {model_name.upper()}") + print(f"{'-'*80}") + + first_row = df.iloc[0] + if "per_output" in first_row and first_row["per_output"] is not None: + per_output_list = first_row["per_output"] + n_outputs = len(per_output_list) + + # Collect metrics across all seeds for each output + for output_idx in range(n_outputs): + # Use label name if provided, otherwise use generic Output N + if label_names and output_idx < len(label_names): + output_label = f"{label_names[output_idx]}" + else: + output_label = f"Output {output_idx + 1}" + + print(f"\n{output_label}:") + + # Aggregate metrics across seeds for this output + output_metrics = {} + for _, row in df.iterrows(): + if "per_output" in row and row["per_output"] is not None: + per_out = row["per_output"] + if output_idx < len(per_out): + for key, val in per_out[output_idx].items(): + if key not in ["confusion_matrix", "seed", "model"] and val is not None: + if key not in output_metrics: + output_metrics[key] = [] + output_metrics[key].append(val) + + # Print summary for this output + for metric_name, values in sorted(output_metrics.items()): + if len(values) > 0: + mean_val = np.mean(values) + std_val = safe_std(np.array(values)) + print(f" {metric_name:20s}: {mean_val:.4f} ± {std_val:.4f}") + print(f"{'-'*80}\n") + + +def print_model_comparison(all_dfs, comparison_metrics): + """Print model comparison table. + + Parameters + ---------- + all_dfs : dict + Dictionary mapping model names to DataFrames + comparison_metrics : list + Metrics to compare + """ + print(f"\n{'='*80}") + print("MODEL COMPARISON (mean ± std)") + print(f"{'='*80}") + comparison_data = [] + for model_name, df in all_dfs.items(): + row = {"Model": model_name} + for col in comparison_metrics: + if col in df.columns: + vals = df[col].dropna() + if len(vals) > 0: + row[col] = f"{vals.mean():.4f} ± {safe_std(vals):.4f}" + else: + row[col] = "N/A" + comparison_data.append(row) + + comparison_df = pd.DataFrame(comparison_data) + print(comparison_df.to_string(index=False)) + print(f"{'='*80}\n") + + +def print_model_ranking(all_dfs, display_cols, problem_type, is_multi_output): + """Print model ranking based on primary metric. 
+ + Parameters + ---------- + all_dfs : dict + Dictionary mapping model names to DataFrames + display_cols : list + Columns to display + problem_type : str + "classification" or "regression" + is_multi_output : bool + Whether the problem has multiple outputs + """ + # Determine primary metric + if is_multi_output: + if problem_type == "regression": + primary_metric = "mse_mean" + lower_is_better = True + else: + primary_metric = "cohen_kappa_mean" + lower_is_better = False + else: + if problem_type == "regression": + primary_metric = "mse" + lower_is_better = True + else: + primary_metric = "cohen_kappa" + lower_is_better = False + + # Calculate mean primary metric for each model + model_scores = [] + for model_name, df in all_dfs.items(): + if primary_metric in df.columns: + vals = df[primary_metric].dropna() + if len(vals) > 0: + model_scores.append({ + "model": model_name, + "mean": vals.mean(), + "std": safe_std(vals), + }) + + if len(model_scores) > 0: + # Sort by primary metric + model_scores.sort(key=lambda x: x["mean"], reverse=not lower_is_better) + + print(f"\n{'='*80}") + print("MODEL RANKING") + print(f"{'='*80}") + print( + f"Ranked by: {primary_metric.replace('_', ' ').upper()} " + f"({'Lower is better' if lower_is_better else 'Higher is better'})" + ) + print(f"{'-'*80}") + + # Create ranking table with all metrics + ranking_data = [] + for rank, score in enumerate(model_scores, 1): + model_name = score["model"] + model_df = all_dfs[model_name] + + row = { + "Rank": f"{rank}.", + "Model": model_name.upper(), + primary_metric: f"{score['mean']:.4f} ± {score['std']:.4f}", + } + + # Add other metrics + metric_cols = [c for c in display_cols if c != "seed" and c != primary_metric] + for col in metric_cols: + if col in model_df.columns: + vals = model_df[col].dropna() + if len(vals) > 0: + row[col] = f"{vals.mean():.4f} ± {safe_std(vals):.4f}" + + ranking_data.append(row) + + # Print ranking table + ranking_df = pd.DataFrame(ranking_data) + print(ranking_df.to_string(index=False)) + print(f"{'='*80}\n") + + +def get_cv_metric_keys(problem_type, is_multi_output): + """Get metric keys for CV results. + + Parameters + ---------- + problem_type : str + "classification" or "regression" + is_multi_output : bool + Whether the problem has multiple outputs + + Returns + ------- + list + List of metric keys + """ + if problem_type == "classification": + if is_multi_output: + return ["accuracy_mean", "balanced_accuracy_mean", "f1_mean"] + else: + return ["accuracy", "balanced_accuracy", "f1"] + else: + if is_multi_output: + return ["r2_mean", "rmse_mean", "mae_mean"] + else: + return ["r2", "rmse", "mae"] + + +def print_cv_fold_results(folds, fold_summaries, metric_keys): + """Print per-fold CV results. 
+ + Parameters + ---------- + folds : dict + Dictionary of fold results + fold_summaries : list + List of fold summaries + metric_keys : list + Metric keys to display + """ + print(f"\nPer-Fold Metrics (averaged across seeds per fold):") + + # Print table header + print(f"\n {'Fold':<6}", end="") + for mk in metric_keys: + print(f"{mk:<20}", end="") + print() + print(f" {'-'*6}", end="") + for _ in metric_keys: + print(f"{'-'*20}", end="") + print() + + # Print per-fold metrics + for fold_id in sorted(folds.keys()): + fold_results_list = fold_summaries[fold_id] + fold_df = pd.DataFrame(fold_results_list) + + print(f" {fold_id:<6}", end="") + for mk in metric_keys: + if mk in fold_df.columns: + mean_val = fold_df[mk].mean() + std_val = fold_df[mk].std() + print(f"{mean_val:.4f} ± {std_val:.4f} ", end="") + else: + print(f"{'N/A':<20}", end="") + print() + + +def print_tuning_summary(val_df, test_df, best_cv_scores, problem_type, n_seeds): + """Print summary of hyperparameter tuning results. + + Parameters + ---------- + val_df : DataFrame + Validation metrics + test_df : DataFrame + Test metrics + best_cv_scores : list + Best CV scores across seeds + problem_type : str + "classification" or "regression" + n_seeds : int + Number of seeds + """ + print(f"\nValidation Set Performance (across {n_seeds} seeds):") + if problem_type == "regression": + print(f" R² = {val_df['r2'].mean():.4f} ± {safe_std(val_df['r2']):.4f}") + print(f" RMSE = {val_df['rmse'].mean():.4f} ± {safe_std(val_df['rmse']):.4f}") + print(f" MAE = {val_df['mae'].mean():.4f} ± {safe_std(val_df['mae']):.4f}") + else: + print(f" Accuracy = {val_df['accuracy'].mean():.4f} ± {safe_std(val_df['accuracy']):.4f}") + print(f" Balanced Accuracy = {val_df['balanced_accuracy'].mean():.4f} ± {safe_std(val_df['balanced_accuracy']):.4f}") + print(f" F1 = {val_df['f1'].mean():.4f} ± {safe_std(val_df['f1']):.4f}") + + print(f"\nTest Set Performance (across {n_seeds} seeds):") + if problem_type == "regression": + print(f" R² = {test_df['r2'].mean():.4f} ± {safe_std(test_df['r2']):.4f}") + print(f" RMSE = {test_df['rmse'].mean():.4f} ± {safe_std(test_df['rmse']):.4f}") + print(f" MAE = {test_df['mae'].mean():.4f} ± {safe_std(test_df['mae']):.4f}") + else: + print(f" Accuracy = {test_df['accuracy'].mean():.4f} ± {safe_std(test_df['accuracy']):.4f}") + print(f" Balanced Accuracy = {test_df['balanced_accuracy'].mean():.4f} ± {safe_std(test_df['balanced_accuracy']):.4f}") + print(f" F1 = {test_df['f1'].mean():.4f} ± {safe_std(test_df['f1']):.4f}") + + print(f"\nBest CV Score (during tuning): {np.mean(best_cv_scores):.4f} ± {safe_std(np.array(best_cv_scores)):.4f}") diff --git a/ecosci/evaluation/reporting/evaluation.py b/ecosci/evaluation/reporting/evaluation.py new file mode 100644 index 0000000..5664c40 --- /dev/null +++ b/ecosci/evaluation/reporting/evaluation.py @@ -0,0 +1,385 @@ +"""Main evaluation orchestration functions. + +This module contains the high-level functions that orchestrate metrics computation, +feature importance analysis, and plot generation. 
+""" + +from typing import Dict, Optional +import os +import json +import numpy as np +import pandas as pd + +from ..metrics import ( + compute_classification_metrics, + compute_regression_metrics, + compute_multi_output_classification_metrics, + compute_multi_output_regression_metrics, +) +from ..plotting import ( + plot_metric_comparisons, + plot_confusion_matrices, + plot_pr_curves, + plot_residuals, + plot_cv_comparison, + plot_validation_vs_test, +) +from ..feature_importance import ( + extract_feature_importance, + extract_cv_feature_importance, +) +from .display import ( + get_metric_columns, + print_model_results, + print_model_comparison, + print_model_ranking, + get_cv_metric_keys, + print_cv_fold_results, + print_tuning_summary, +) +from .output import ( + save_cv_reports, + save_tuning_results, + save_tuning_summary_report, +) + + +def evaluate_and_report( + results, y_test, output_dir: str = "outputs", problem_type: str = "classification", + label_names: list = None, feature_names: list = None, X_test=None +): + """Compute metrics across runs and save a compact report and plots. + + Parameters + ---------- + results : dict or list + If dict: Output from Trainer.run() with multiple models {model_name: [run_results]} + If list: Single model results (backward compatibility) + y_test : array-like + Ground-truth labels (classification) or values (regression) for the test set. + output_dir : str + Where to save the report and plots. + problem_type : str + "classification" or "regression". Determines which metrics to compute. + label_names : list, optional + Names of the target variables (for multi-output). + feature_names : list, optional + Names of the input features. + X_test : array-like, optional + Test features for permutation importance. 
+ + Returns + ------- + list + Combined summary of results across all models + """ + os.makedirs(output_dir, exist_ok=True) + + # Handle backward compatibility: convert list to dict + if isinstance(results, list): + model_name = results[0].get("model_name", "model") if results else "model" + results = {model_name: results} + + # Determine if we have multi-output + is_multi_output = len(y_test.shape) > 1 and y_test.shape[1] > 1 + + # Compute metrics for each model + all_summaries = {} + all_dfs = {} + + for model_name, model_results in results.items(): + summary = [] + for r in model_results: + seed = r.get("seed") + y_pred = r.get("y_pred") + y_proba = r.get("y_proba") + + if is_multi_output: + if problem_type == "regression": + metrics = compute_multi_output_regression_metrics(y_test, y_pred) + else: + metrics = compute_multi_output_classification_metrics(y_test, y_pred, y_proba) + else: + if problem_type == "regression": + metrics = compute_regression_metrics(y_test, y_pred) + else: + metrics = compute_classification_metrics(y_test, y_pred, y_proba) + + metrics["seed"] = seed + metrics["model"] = model_name + summary.append(metrics) + + all_summaries[model_name] = summary + all_dfs[model_name] = pd.DataFrame(summary) + + # Save individual model reports + for model_name, summary in all_summaries.items(): + model_dir = os.path.join(output_dir, model_name) + os.makedirs(model_dir, exist_ok=True) + report_path = os.path.join(model_dir, f"report_{model_name}.json") + with open(report_path, "w") as f: + json.dump(summary, f, indent=2) + + # Combine all results + combined_df = pd.concat(all_dfs.values(), ignore_index=True) + combined_summary = [item for sublist in all_summaries.values() for item in sublist] + + # Save combined report + with open(os.path.join(output_dir, "report_all_models.json"), "w") as f: + json.dump(combined_summary, f, indent=2) + + # Print results + display_cols, comparison_metrics, plot_metrics = get_metric_columns( + problem_type, is_multi_output, combined_df + ) + + print_model_results(all_dfs, display_cols, is_multi_output, label_names) + + if len(all_dfs) > 1: + print_model_comparison(all_dfs, comparison_metrics) + print_model_ranking(all_dfs, display_cols, problem_type, is_multi_output) + + # Create plots + plot_metric_comparisons(combined_df, plot_metrics, output_dir, all_dfs) + + if problem_type == "classification": + plot_confusion_matrices(results, y_test, output_dir) + plot_pr_curves(results, y_test, output_dir) + elif problem_type == "regression" and not is_multi_output: + plot_residuals(results, y_test, output_dir) + + # Feature importance + extract_feature_importance(results, X_test, y_test, output_dir, problem_type, feature_names) + + return combined_summary + + +def evaluate_and_report_cv( + results, output_dir: str = "outputs", problem_type: str = "classification", + label_names: list = None, feature_names: list = None, X_test=None +): + """Compute metrics for k-fold cross-validation with per-fold breakdown. + + Parameters + ---------- + results : dict + Output from Trainer.run_cv() with format {model_name: [run_results_with_fold_info]} + output_dir : str + Where to save the report and plots. + problem_type : str + "classification" or "regression". + label_names : list, optional + Names of the target variables. + feature_names : list, optional + Names of the input features. + X_test : array-like, optional + Test features (note: in CV, each fold has different test set). 
+ + Returns + ------- + dict + Summaries for all models + """ + os.makedirs(output_dir, exist_ok=True) + + print(f"\n{'='*80}") + print("K-FOLD CROSS-VALIDATION RESULTS") + print(f"{'='*80}\n") + + # Determine if multi-output + first_result = next(iter(results.values()))[0] + y_test_sample = first_result["y_test"] + is_multi_output = len(y_test_sample.shape) > 1 and y_test_sample.shape[1] > 1 + + all_summaries = {} + all_dfs = {} + + for model_name, model_results in results.items(): + # Group results by fold + folds = {} + for r in model_results: + fold_id = r.get("fold") + if fold_id not in folds: + folds[fold_id] = [] + folds[fold_id].append(r) + + n_folds = len(folds) + + print(f"\n{model_name.upper()} - {n_folds} Folds") + print(f"{'-'*80}") + + # Compute metrics for each fold + fold_summaries = [] + for fold_id in sorted(folds.keys()): + fold_results = folds[fold_id] + fold_metrics_list = [] + + for r in fold_results: + seed = r.get("seed") + y_pred = r.get("y_pred") + y_test = r.get("y_test") + y_proba = r.get("y_proba") + + if problem_type == "classification": + if is_multi_output: + metrics = compute_multi_output_classification_metrics(y_test, y_pred, y_proba) + else: + metrics = compute_classification_metrics(y_test, y_pred, y_proba) + else: + if is_multi_output: + metrics = compute_multi_output_regression_metrics(y_test, y_pred) + else: + metrics = compute_regression_metrics(y_test, y_pred) + + metrics["seed"] = seed + metrics["fold"] = fold_id + fold_metrics_list.append(metrics) + + fold_summaries.append(fold_metrics_list) + + # Print per-fold metrics + metric_keys = get_cv_metric_keys(problem_type, is_multi_output) + print_cv_fold_results(folds, fold_summaries, metric_keys) + + # Compute overall metrics + all_fold_metrics = [m for fold_list in fold_summaries for m in fold_list] + overall_df = pd.DataFrame(all_fold_metrics) + + # Print overall metrics + print(f"\n {'Overall':<6}", end="") + for mk in metric_keys: + if mk in overall_df.columns: + mean_val = overall_df[mk].mean() + std_val = overall_df[mk].std() + print(f"{mean_val:.4f} ± {std_val:.4f} ", end="") + else: + print(f"{'N/A':<20}", end="") + print("\n") + + # Save reports + save_cv_reports(model_name, all_fold_metrics, fold_summaries, folds, overall_df, output_dir) + + all_summaries[model_name] = overall_df + all_dfs[model_name] = overall_df + + # Create plots + plot_cv_comparison(all_dfs, output_dir, problem_type, is_multi_output) + + print(f"\n{'='*80}\n") + + # Feature importance + extract_cv_feature_importance(results, output_dir, problem_type, feature_names) + + return all_summaries + + +def evaluate_tuning_results( + results: Dict, + y_val: np.ndarray, + y_test: np.ndarray, + output_dir: str = "outputs", + problem_type: str = "classification", + label_names: Optional[list] = None, + feature_names: Optional[list] = None, + X_val: Optional[np.ndarray] = None, + X_test: Optional[np.ndarray] = None, +): + """Evaluate and report results from hyperparameter tuning. 
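+
+    Typical usage, sketched (variable names are illustrative; ``results`` is
+    the dictionary produced by ``Trainer.run_with_tuning``)::
+
+        results = trainer.run_with_tuning(
+            config, X_train, X_val, X_test, y_train, y_val, y_test, group_assignments
+        )
+        summaries = evaluate_tuning_results(
+            results, y_val, y_test,
+            output_dir="outputs/tuning", problem_type="regression",
+        )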
+ + Parameters + ---------- + results : Dict + Results dictionary from trainer.run_with_tuning() + y_val : np.ndarray + Validation set labels + y_test : np.ndarray + Test set labels + output_dir : str + Directory to save outputs + problem_type : str + "classification" or "regression" + label_names : list, optional + Names of label columns + feature_names : list, optional + Names of features + X_val : np.ndarray, optional + Validation features + X_test : np.ndarray, optional + Test features + + Returns + ------- + dict + Summary statistics for each model + """ + os.makedirs(output_dir, exist_ok=True) + + print(f"\n{'='*80}") + print("HYPERPARAMETER TUNING EVALUATION") + print(f"{'='*80}\n") + + all_summaries = {} + + for model_name, model_results in results.items(): + print(f"\n{model_name.upper()}") + print("-" * 60) + + # Collect metrics for each seed + val_metrics_per_seed = [] + test_metrics_per_seed = [] + best_params_per_seed = [] + best_cv_scores = [] + + for run in model_results: + seed = run['seed'] + best_params = run['best_params'] + best_cv_score = run['best_cv_score'] + + best_params_per_seed.append(best_params) + best_cv_scores.append(best_cv_score) + + # Compute validation metrics + if problem_type == "regression": + val_metrics = compute_regression_metrics(y_val, run['y_val_pred']) + else: + val_metrics = compute_classification_metrics( + y_val, run['y_val_pred'], run.get('y_val_proba') + ) + val_metrics['seed'] = seed + val_metrics_per_seed.append(val_metrics) + + # Compute test metrics + if problem_type == "regression": + test_metrics = compute_regression_metrics(y_test, run['y_test_pred']) + else: + test_metrics = compute_classification_metrics( + y_test, run['y_test_pred'], run.get('y_test_proba') + ) + test_metrics['seed'] = seed + test_metrics_per_seed.append(test_metrics) + + # Create summary DataFrames + val_df = pd.DataFrame(val_metrics_per_seed) + test_df = pd.DataFrame(test_metrics_per_seed) + + # Print summary + print_tuning_summary(val_df, test_df, best_cv_scores, problem_type, len(model_results)) + + # Save results + save_tuning_results(model_name, val_df, test_df, best_params_per_seed, output_dir) + + # Create plots + plot_validation_vs_test(val_df, test_df, model_name, output_dir, problem_type) + + # Store summary + all_summaries[model_name] = { + 'validation': val_df, + 'test': test_df, + 'best_params': best_params_per_seed, + 'best_cv_scores': best_cv_scores, + } + + # Save summary report + save_tuning_summary_report(all_summaries, output_dir, problem_type) + + return all_summaries diff --git a/ecosci/evaluation/reporting/output.py b/ecosci/evaluation/reporting/output.py new file mode 100644 index 0000000..115df29 --- /dev/null +++ b/ecosci/evaluation/reporting/output.py @@ -0,0 +1,152 @@ +"""Output and file saving utilities for evaluation reports.""" + +import os +import json +import numpy as np +import pandas as pd +from ..metrics import safe_std + + +def save_cv_reports(model_name, all_fold_metrics, fold_summaries, folds, overall_df, output_dir): + """Save CV reports and metrics to files. 
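+
+    Three files are written under ``<output_dir>/<model_name>/``::
+
+        report_<model_name>_cv.json           # all per-seed, per-fold metrics
+        metrics_<model_name>_cv_detailed.csv  # the same metrics in tabular form
+        metrics_<model_name>_cv_per_fold.csv  # mean/std per fold plus an overall row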
+ + Parameters + ---------- + model_name : str + Name of the model + all_fold_metrics : list + All metrics across folds + fold_summaries : list + Per-fold summaries + folds : dict + Dictionary of fold results + overall_df : DataFrame + Overall results DataFrame + output_dir : str + Output directory + """ + model_dir = os.path.join(output_dir, model_name) + os.makedirs(model_dir, exist_ok=True) + + # Save detailed report + report_path = os.path.join(model_dir, f"report_{model_name}_cv.json") + with open(report_path, "w") as f: + json.dump(all_fold_metrics, f, indent=2, default=str) + print(f" Detailed JSON report saved to: {report_path}") + + # Save detailed metrics as CSV (all seeds and folds) + csv_detailed_path = os.path.join(model_dir, f"metrics_{model_name}_cv_detailed.csv") + overall_df.to_csv(csv_detailed_path, index=False) + print(f" Detailed CSV metrics saved to: {csv_detailed_path}") + + # Save per-fold summary (averaged across seeds) + fold_summary_data = [] + for fold_id in sorted(folds.keys()): + fold_results_list = fold_summaries[fold_id] + fold_df = pd.DataFrame(fold_results_list) + + # Calculate mean and std for each metric + fold_summary = {"fold": fold_id} + for col in fold_df.columns: + if col not in ["seed", "fold"]: + fold_summary[f"{col}_mean"] = fold_df[col].mean() + fold_summary[f"{col}_std"] = fold_df[col].std() + fold_summary_data.append(fold_summary) + + # Add overall row + overall_summary = {"fold": "overall"} + for col in overall_df.columns: + if col not in ["seed", "fold"]: + overall_summary[f"{col}_mean"] = overall_df[col].mean() + overall_summary[f"{col}_std"] = overall_df[col].std() + fold_summary_data.append(overall_summary) + + fold_summary_df = pd.DataFrame(fold_summary_data) + csv_summary_path = os.path.join(model_dir, f"metrics_{model_name}_cv_per_fold.csv") + fold_summary_df.to_csv(csv_summary_path, index=False) + print(f" Per-fold summary CSV saved to: {csv_summary_path}") + + +def save_tuning_results(model_name, val_df, test_df, best_params_per_seed, output_dir): + """Save hyperparameter tuning results to files. + + Parameters + ---------- + model_name : str + Name of the model + val_df : DataFrame + Validation metrics + test_df : DataFrame + Test metrics + best_params_per_seed : list + Best parameters for each seed + output_dir : str + Output directory + """ + model_dir = os.path.join(output_dir, model_name) + os.makedirs(model_dir, exist_ok=True) + + # Save validation metrics + val_csv = os.path.join(model_dir, f"metrics_{model_name}_validation.csv") + val_df.to_csv(val_csv, index=False) + print(f"\n Validation metrics saved to: {val_csv}") + + # Save test metrics + test_csv = os.path.join(model_dir, f"metrics_{model_name}_test.csv") + test_df.to_csv(test_csv, index=False) + print(f" Test metrics saved to: {test_csv}") + + # Save hyperparameter summary + params_df = pd.DataFrame(best_params_per_seed) + params_csv = os.path.join(model_dir, f"best_params_{model_name}.csv") + params_df.to_csv(params_csv, index=False) + print(f" Best parameters saved to: {params_csv}") + + +def save_tuning_summary_report(all_summaries, output_dir, problem_type): + """Save summary report comparing all models. 
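+
+    The report is written to ``tuning_summary_all_models.json`` in
+    ``output_dir`` and maps each model name to mean/std validation and test
+    metrics; for regression an entry looks roughly like (values illustrative)::
+
+        {"random_forest": {"val_r2_mean": 0.81, "val_r2_std": 0.02,
+                           "test_r2_mean": 0.78, "test_r2_std": 0.03,
+                           "val_rmse_mean": 0.11, "val_rmse_std": 0.01,
+                           "test_rmse_mean": 0.12, "test_rmse_std": 0.01}}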
+ + Parameters + ---------- + all_summaries : dict + Dictionary of summaries for all models + output_dir : str + Output directory + problem_type : str + "classification" or "regression" + """ + summary_report = {} + for model_name, summary in all_summaries.items(): + val_df = summary['validation'] + test_df = summary['test'] + + if problem_type == "regression": + summary_report[model_name] = { + 'val_r2_mean': float(val_df['r2'].mean()), + 'val_r2_std': float(safe_std(val_df['r2'])), + 'test_r2_mean': float(test_df['r2'].mean()), + 'test_r2_std': float(safe_std(test_df['r2'])), + 'val_rmse_mean': float(val_df['rmse'].mean()), + 'val_rmse_std': float(safe_std(val_df['rmse'])), + 'test_rmse_mean': float(test_df['rmse'].mean()), + 'test_rmse_std': float(safe_std(test_df['rmse'])), + } + else: + summary_report[model_name] = { + 'val_accuracy_mean': float(val_df['accuracy'].mean()), + 'val_accuracy_std': float(safe_std(val_df['accuracy'])), + 'test_accuracy_mean': float(test_df['accuracy'].mean()), + 'test_accuracy_std': float(safe_std(test_df['accuracy'])), + 'val_f1_mean': float(val_df['f1'].mean()), + 'val_f1_std': float(safe_std(val_df['f1'])), + 'test_f1_mean': float(test_df['f1'].mean()), + 'test_f1_std': float(safe_std(test_df['f1'])), + } + + # Save summary report + summary_path = os.path.join(output_dir, "tuning_summary_all_models.json") + with open(summary_path, 'w') as f: + json.dump(summary_report, f, indent=2) + print(f"\n{'='*60}") + print(f"Summary report saved to: {summary_path}") + print(f"{'='*60}\n") diff --git a/ecosci/hyperopt.py b/ecosci/hyperopt.py index 818c7b5..a401926 100644 --- a/ecosci/hyperopt.py +++ b/ecosci/hyperopt.py @@ -415,12 +415,13 @@ def tune_linear( base_model = Ridge(random_state=self.random_state) else: # Use LogisticRegression with C and penalty tuning - param_space = { - 'C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0], - 'penalty': ['l1', 'l2', 'elasticnet'], - 'solver': ['saga'], # saga supports all penalties - 'l1_ratio': [0.1, 0.5, 0.9], # for elasticnet - } + # Separate parameter spaces for different penalties to avoid invalid combinations + c_values = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0] + param_space = [ + {'C': c_values, 'penalty': ['l1'], 'solver': ['saga']}, + {'C': c_values, 'penalty': ['l2'], 'solver': ['saga']}, + {'C': c_values, 'penalty': ['elasticnet'], 'solver': ['saga'], 'l1_ratio': [0.1, 0.5, 0.9]}, + ] base_model = LogisticRegression( random_state=self.random_state, max_iter=1000 From a24994bbd41608bf59c9fbc25b8330a1214cac0b Mon Sep 17 00:00:00 2001 From: Ysobel Date: Mon, 24 Nov 2025 13:22:40 +1100 Subject: [PATCH 11/15] fix warnings --- tests/test_hyperopt.py | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/tests/test_hyperopt.py b/tests/test_hyperopt.py index 72374c5..c3fe2bb 100755 --- a/tests/test_hyperopt.py +++ b/tests/test_hyperopt.py @@ -74,12 +74,6 @@ def test_grouped_splits(): print("✓ All splits are non-empty") - return True - except Exception as e: - print(f"✗ Error: {e}") - import traceback - traceback.print_exc() - return False finally: os.unlink(temp_file) @@ -131,12 +125,13 @@ def test_hyperparameter_tuner(): print(f" Best params: {results['best_params']}") print(f" Best score: {results['best_score']:.4f}") - return True + assert best_model is not None, "Best model should not be None" + assert 'best_params' in results, "Results should contain best_params" except Exception as e: print(f"✗ Error: {e}") import traceback traceback.print_exc() - return False + raise def test_integration(): 
"""Test full integration with config.""" @@ -203,7 +198,8 @@ def test_integration(): n_test_groups = config['data'].pop('n_test_groups') loader = CSVDataLoader(**config['data'], problem_type=config['problem_type']) - (X_train, X_val, X_test, y_train, y_val, y_test, group_assignments) = \ + (X_train, X_val, X_test, y_train, y_val, y_test, group_assignments, + groups_train, groups_val, groups_test) = \ loader.prepare_grouped_splits( n_train_groups=n_train_groups, n_val_groups=n_val_groups, @@ -240,12 +236,6 @@ def test_integration(): print(f"✓ All expected output files created") - return True - except Exception as e: - print(f"✗ Error: {e}") - import traceback - traceback.print_exc() - return False finally: os.unlink(temp_file) # Cleanup temp output dir From ba4fcbbbb6ca262943ffae8772db5694f5a57702 Mon Sep 17 00:00:00 2001 From: Ysobel Date: Mon, 24 Nov 2025 13:57:52 +1100 Subject: [PATCH 12/15] convert hyperopt to pytests --- run_tests.py | 1 + tests/test_hyperopt.py | 178 +++++++++++++++-------------------------- 2 files changed, 67 insertions(+), 112 deletions(-) diff --git a/run_tests.py b/run_tests.py index 5b2f7e2..64aac0d 100644 --- a/run_tests.py +++ b/run_tests.py @@ -19,6 +19,7 @@ "trainer": "tests/test_trainer.py", "eval": "tests/test_eval.py", "config": "tests/test_config.py", + "hyperopt": "tests/test_hyperopt.py", "integration": "tests/test_integration.py", } diff --git a/tests/test_hyperopt.py b/tests/test_hyperopt.py index c3fe2bb..e2fc445 100755 --- a/tests/test_hyperopt.py +++ b/tests/test_hyperopt.py @@ -1,5 +1,11 @@ #!/usr/bin/env python3 -"""Quick test script to validate hyperparameter tuning functionality.""" +"""Tests for hyperparameter tuning functionality. + +This module tests the hyperparameter optimization features including: +- Grouped train/validation/test splits +- Hyperparameter tuning for different model types (MLP, Random Forest) +- Full integration of tuning with the training pipeline +""" import numpy as np import pandas as pd @@ -7,6 +13,8 @@ import os import sys +from ecosci.data import CSVDataLoader + def create_test_data(): """Create a small synthetic dataset with groups for testing.""" np.random.seed(42) @@ -33,12 +41,6 @@ def create_test_data(): def test_grouped_splits(): """Test grouped train/val/test splits.""" - print("\n" + "="*60) - print("Testing Grouped Splits") - print("="*60) - - from ecosci.data import CSVDataLoader - # Create test data df = create_test_data() @@ -58,31 +60,20 @@ def test_grouped_splits(): ) # Test grouped splits - (X_train, X_val, X_test, y_train, y_val, y_test, group_assignments, _, _, _) = \ + (X_train, X_val, X_test, _, _, _, group_assignments, _, _, _) = \ loader.prepare_grouped_splits(n_train_groups=4, n_val_groups=2, n_test_groups=2) - print(f"\n✓ Successfully created grouped splits") - print(f" Train shape: {X_train.shape}") - print(f" Val shape: {X_val.shape}") - print(f" Test shape: {X_test.shape}") - print(f" Group assignments: {group_assignments}") - # Verify no overlap assert X_train.shape[0] > 0, "Train set is empty" assert X_val.shape[0] > 0, "Val set is empty" assert X_test.shape[0] > 0, "Test set is empty" - - print("✓ All splits are non-empty") - + assert len(group_assignments) == 3, "Should have 3 group assignments (train, val, test)" + finally: os.unlink(temp_file) -def test_hyperparameter_tuner(): - """Test hyperparameter tuner.""" - print("\n" + "="*60) - print("Testing Hyperparameter Tuner") - print("="*60) - +def test_hyperparameter_tuner_mlp(): + """Test hyperparameter tuner with MLP.""" from 
ecosci.hyperopt import HyperparameterTuner # Create simple test data @@ -90,55 +81,55 @@ def test_hyperparameter_tuner(): X_train = np.random.randn(100, 5) y_train = X_train[:, 0] * 2 + np.random.randn(100) * 0.5 - try: - # Test MLP tuning - print("\nTesting MLP tuning...") - tuner = HyperparameterTuner( - problem_type='regression', - n_iter=3, # Small for testing - cv=2, # Small for testing - verbose=0, - ) - - param_space = { - 'hidden_layer_sizes': [(16,), (32,)], - 'alpha': [0.001, 0.01], - 'max_iter': [100], - } - - best_model, results = tuner.tune_mlp(X_train, y_train, param_space) - - print(f"✓ MLP tuning successful") - print(f" Best params: {results['best_params']}") - print(f" Best score: {results['best_score']:.4f}") - - # Test Random Forest tuning - print("\nTesting Random Forest tuning...") - param_space = { - 'n_estimators': [10, 20], - 'max_depth': [3, 5], - } - - best_model, results = tuner.tune_random_forest(X_train, y_train, param_space) - - print(f"✓ Random Forest tuning successful") - print(f" Best params: {results['best_params']}") - print(f" Best score: {results['best_score']:.4f}") - - assert best_model is not None, "Best model should not be None" - assert 'best_params' in results, "Results should contain best_params" - except Exception as e: - print(f"✗ Error: {e}") - import traceback - traceback.print_exc() - raise + # Test MLP tuning + tuner = HyperparameterTuner( + problem_type='regression', + n_iter=3, # Small for testing + cv=2, # Small for testing + verbose=0, + ) + + param_space = { + 'hidden_layer_sizes': [(16,), (32,)], + 'alpha': [0.001, 0.01], + 'max_iter': [100], + } + + best_model, results = tuner.tune_mlp(X_train, y_train, param_space) + + assert best_model is not None, "Best model should not be None" + assert 'best_params' in results, "Results should contain best_params" + assert 'best_score' in results, "Results should contain best_score" + -def test_integration(): - """Test full integration with config.""" - print("\n" + "="*60) - print("Testing Full Integration") - print("="*60) +def test_hyperparameter_tuner_random_forest(): + """Test hyperparameter tuner with Random Forest.""" + from ecosci.hyperopt import HyperparameterTuner + + # Create simple test data + np.random.seed(42) + X_train = np.random.randn(100, 5) + y_train = X_train[:, 0] * 2 + np.random.randn(100) * 0.5 + # Test Random Forest tuning + tuner = HyperparameterTuner( + problem_type='regression', + n_iter=3, # Small for testing + cv=2, # Small for testing + verbose=0, + ) + + param_space = { + 'n_estimators': [10, 20], + 'max_depth': [3, 5], + } + + best_model, results = tuner.tune_random_forest(X_train, y_train, param_space) + + assert best_model is not None, "Best model should not be None" + assert 'best_params' in results, "Results should contain best_params" +def test_tuning_integration(): + """Test full integration with config and hyperparameter tuning.""" # Create test data df = create_test_data() @@ -206,8 +197,6 @@ def test_integration(): n_test_groups=n_test_groups ) - print(f"✓ Data loaded successfully") - # Create trainer trainer = Trainer( ModelZoo.get_model, @@ -216,15 +205,13 @@ def test_integration(): ) # Run training with tuning - print("\nRunning training with tuning...") results = trainer.run_with_tuning( config, X_train, X_val, X_test, y_train, y_val, y_test, group_assignments ) - print(f"✓ Training completed successfully") - print(f" Models trained: {list(results.keys())}") - # Check outputs + assert len(results) > 0, "No models were trained" + for model_name in 
results.keys(): model_dir = os.path.join(temp_output_dir, model_name) assert os.path.exists(model_dir), f"Model directory not found: {model_dir}" @@ -233,8 +220,6 @@ def test_integration(): files = os.listdir(model_dir) assert any('model_' in f for f in files), "No model files found" assert any('tuning_results_' in f for f in files), "No tuning results found" - - print(f"✓ All expected output files created") finally: os.unlink(temp_file) @@ -242,39 +227,8 @@ def test_integration(): import shutil shutil.rmtree(temp_output_dir, ignore_errors=True) -def main(): - """Run all tests.""" - print("\n" + "="*60) - print("HYPERPARAMETER TUNING VALIDATION TESTS") - print("="*60) - - tests = [ - ("Grouped Splits", test_grouped_splits), - ("Hyperparameter Tuner", test_hyperparameter_tuner), - ("Full Integration", test_integration), - ] - - results = {} - for test_name, test_func in tests: - results[test_name] = test_func() - - # Summary - print("\n" + "="*60) - print("TEST SUMMARY") - print("="*60) - - for test_name, passed in results.items(): - status = "✓ PASSED" if passed else "✗ FAILED" - print(f"{test_name}: {status}") - - all_passed = all(results.values()) - - if all_passed: - print("\n✓ All tests passed!") - return 0 - else: - print("\n✗ Some tests failed") - return 1 if __name__ == "__main__": - sys.exit(main()) + # This file can now be run with pytest + import pytest + sys.exit(pytest.main([__file__, "-v"])) \ No newline at end of file From c003f767c74b2870f21c43a94445899083287b05 Mon Sep 17 00:00:00 2001 From: Ysobel Date: Mon, 24 Nov 2025 13:58:34 +1100 Subject: [PATCH 13/15] unpack --- ecosci/evaluation/plotting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ecosci/evaluation/plotting.py b/ecosci/evaluation/plotting.py index 337a23d..f6c8ef2 100644 --- a/ecosci/evaluation/plotting.py +++ b/ecosci/evaluation/plotting.py @@ -202,7 +202,7 @@ def plot_residuals( y_pred = last.get("y_pred") residuals = y_test - y_pred - (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4)) + _, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4)) # Predicted vs Actual ax1.scatter(y_test, y_pred, alpha=0.6) From 5fb6048ba63d722fff0840c4d10cc2d8706f2a38 Mon Sep 17 00:00:00 2001 From: Ysobel Date: Mon, 24 Nov 2025 14:00:46 +1100 Subject: [PATCH 14/15] unused --- ecosci/data/splitting.py | 2 +- ecosci/evaluation/reporting/output.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/ecosci/data/splitting.py b/ecosci/data/splitting.py index c7767f2..d4eb94a 100644 --- a/ecosci/data/splitting.py +++ b/ecosci/data/splitting.py @@ -1,6 +1,6 @@ """Data splitting utilities for cross-validation and grouped splits.""" -from typing import List, Tuple, Dict, Optional +from typing import List, Tuple import numpy as np import pandas as pd diff --git a/ecosci/evaluation/reporting/output.py b/ecosci/evaluation/reporting/output.py index 115df29..b80c936 100644 --- a/ecosci/evaluation/reporting/output.py +++ b/ecosci/evaluation/reporting/output.py @@ -2,7 +2,6 @@ import os import json -import numpy as np import pandas as pd from ..metrics import safe_std From ddd531a63710e61ca1482a6c99c1dbf0eb36f7e7 Mon Sep 17 00:00:00 2001 From: Ysobel Date: Mon, 24 Nov 2025 14:08:42 +1100 Subject: [PATCH 15/15] minor updates --- ecosci/data/splitting.py | 25 ++++++++++++++++++++++++- ecosci/hyperopt.py | 2 +- ecosci/models.py | 2 +- tests/test_hyperopt.py | 3 ++- 4 files changed, 28 insertions(+), 4 deletions(-) diff --git a/ecosci/data/splitting.py b/ecosci/data/splitting.py index d4eb94a..a832071 100644 --- 
a/ecosci/data/splitting.py +++ b/ecosci/data/splitting.py @@ -172,7 +172,30 @@ def prepare_grouped_splits( ------- tuple (X_train, X_val, X_test, y_train, y_val, y_test, group_assignments, - groups_train, groups_val, groups_test) + groups_train, groups_val, groups_test, preprocessor) + + X_train : ndarray + Preprocessed training features + X_val : ndarray + Preprocessed validation features + X_test : ndarray + Preprocessed testing features + y_train : ndarray + Training target values + y_val : ndarray + Validation target values + y_test : ndarray + Testing target values + group_assignments : dict + Dictionary mapping split names to group IDs + groups_train : ndarray + Group IDs for training samples + groups_val : ndarray + Group IDs for validation samples + groups_test : ndarray + Group IDs for testing samples + preprocessor : ColumnTransformer + Fitted preprocessing pipeline """ from .preprocessing import build_preprocessing_pipeline diff --git a/ecosci/hyperopt.py b/ecosci/hyperopt.py index a401926..851c8d5 100644 --- a/ecosci/hyperopt.py +++ b/ecosci/hyperopt.py @@ -383,7 +383,7 @@ def tune_linear( """Tune Linear model hyperparameters. For linear regression, there are few hyperparameters to tune, - but we can tune regularization if using Ridge/Lasso. + but we can tune regularisation if using Ridge/Lasso. Parameters ---------- diff --git a/ecosci/models.py b/ecosci/models.py index ab23751..c59632d 100644 --- a/ecosci/models.py +++ b/ecosci/models.py @@ -167,7 +167,7 @@ def get_model( "Linear model is only for regression. Use 'logistic' for classification." ) - # If alpha is specified, use Ridge regression (supports regularization) + # If alpha is specified, use Ridge regression (supports regularisation) # Otherwise use plain LinearRegression if "alpha" in params: base_model = Ridge( diff --git a/tests/test_hyperopt.py b/tests/test_hyperopt.py index e2fc445..52e9e51 100755 --- a/tests/test_hyperopt.py +++ b/tests/test_hyperopt.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 """Tests for hyperparameter tuning functionality. -This module tests the hyperparameter optimization features including: +This module tests the hyperparameter optimisation features including: - Grouped train/validation/test splits - Hyperparameter tuning for different model types (MLP, Random Forest) - Full integration of tuning with the training pipeline @@ -128,6 +128,7 @@ def test_hyperparameter_tuner_random_forest(): assert best_model is not None, "Best model should not be None" assert 'best_params' in results, "Results should contain best_params" + assert 'best_score' in results, "Results should contain best_score" def test_tuning_integration(): """Test full integration with config and hyperparameter tuning.""" # Create test data
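
For context on the `tune_linear` change earlier in this series: scikit-learn's search classes accept a *list* of parameter dicts, which keeps penalty-specific options such as `l1_ratio` tied only to compatible penalties. The sketch below is illustrative only — it uses plain `GridSearchCV` on a synthetic dataset rather than the toolkit's own `HyperparameterTuner` (which, per the README, wraps GridSearchCV/RandomizedSearchCV), and all values are placeholders.

```python
# Sketch only: list-of-dicts parameter grid, mirroring the tune_linear fix.
# Each dict is searched independently, so l1_ratio is never paired with
# penalty='l1' or penalty='l2'.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

X, y = make_classification(n_samples=200, n_features=5, random_state=42)

c_values = [0.1, 1.0, 10.0]
param_grid = [
    {"C": c_values, "penalty": ["l1"], "solver": ["saga"]},
    {"C": c_values, "penalty": ["l2"], "solver": ["saga"]},
    {"C": c_values, "penalty": ["elasticnet"], "solver": ["saga"],
     "l1_ratio": [0.1, 0.5, 0.9]},
]

search = GridSearchCV(
    LogisticRegression(max_iter=1000),
    param_grid,               # list of dicts: no invalid combinations searched
    cv=3,
    scoring="balanced_accuracy",
)
search.fit(X, y)
print(search.best_params_)
```

`RandomizedSearchCV` accepts the same list-of-dicts form (sampling one dict, then values within it), so the pattern carries over to randomised searches as well.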