Revamp training v3 script

mostafa · mostafa · commit d78f3b60dfc8 · 2024-10-28T11:28:54.000+01:00
Add cross validation with KFold
Add F1 and F2 score metrics
Fix training metrics
diff --git a/training/train_v3.py b/training/train_v3.py
@@ -1,157 +1,154 @@
 import sys
 import pandas as pd
+import tensorflow as tf
 from tensorflow.keras.preprocessing.text import Tokenizer
 from tensorflow.keras.preprocessing.sequence import pad_sequences
 from tensorflow.keras.models import Sequential
 from tensorflow.keras.layers import (
     Bidirectional,
     Conv1D,
     Dense,
+    Dropout,
     Embedding,
     Flatten,
     LSTM,
     MaxPooling1D,
 )
-from tensorflow.keras.metrics import Accuracy, Recall, Precision
 from tensorflow.keras.callbacks import EarlyStopping
-from sklearn.model_selection import train_test_split
-from sklearn.metrics import (
-    accuracy_score,
-    recall_score,
-    precision_score,
-    f1_score,
-    confusion_matrix,
-)
+from sklearn.model_selection import KFold
+from sklearn.metrics import accuracy_score, precision_score, recall_score
 import numpy as np
 import matplotlib.pyplot as plt
 
 
-# Check if the input file and output directory are provided
-if len(sys.argv) != 3:
-    print("Usage: python train.py <input_file> <output_dir>")
-    sys.exit(1)
-
-# Load dataset
-data = pd.read_csv(sys.argv[1])
-
-# Define parameters
-MAX_WORDS = 10000
-MAX_LEN = 100
-
-# Use Tokenizer to encode text
-tokenizer = Tokenizer(num_words=MAX_WORDS, filters="")
-tokenizer.fit_on_texts(data["Query"])
-sequences = tokenizer.texts_to_sequences(data["Query"])
-
-# Pad the text sequence
-X = pad_sequences(sequences, maxlen=MAX_LEN)
+def load_data(file_path):
+    """Load the training dataset from a CSV file.
+
+    Args:
+        file_path: Path of the CSV file to read.
+
+    Returns:
+        The parsed ``pandas.DataFrame``.
+
+    If the file cannot be read or parsed, the error is reported on stderr and
+    the process exits with status 1.
+    """
+    try:
+        return pd.read_csv(file_path)
+    except (OSError, ValueError) as e:
+        # OSError covers missing/unreadable files; ValueError covers pandas'
+        # ParserError and EmptyDataError as well as UnicodeDecodeError.
+        # Anything else is a programming error and should surface normally.
+        print(f"Error loading data: {e}", file=sys.stderr)
+        sys.exit(1)
+
+
+def preprocess_text(data, max_words=10000, max_len=100):
+    """Turn the ``Query`` column into padded integer sequences.
+
+    Args:
+        data: Table-like object whose ``"Query"`` entry holds the input texts.
+        max_words: Vocabulary cap handed to the tokenizer as ``num_words``.
+        max_len: Length every sequence is padded/truncated to.
+
+    Returns:
+        A ``(padded_sequences, tokenizer)`` tuple; the tokenizer is the one
+        fitted on the same texts.
+    """
+    # NOTE(review): the tokenizer keeps its default ``filters`` here, whereas
+    # the previous script passed filters="" -- confirm that is intended.
+    queries = data["Query"]
+    tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
+    tokenizer.fit_on_texts(queries)
+    encoded = tokenizer.texts_to_sequences(queries)
+    padded = pad_sequences(encoded, maxlen=max_len)
+    return padded, tokenizer
+
+
+def build_model(input_dim, output_dim=128):
+    """Assemble and compile the binary CNN-BiLSTM classifier.
+
+    Args:
+        input_dim: Vocabulary size given to the embedding layer.
+        output_dim: Size of each embedding vector.
+
+    Returns:
+        A compiled ``Sequential`` model with a single sigmoid output that
+        tracks accuracy, precision and recall.
+    """
+    model = Sequential()
+    # Embedding -> dropout -> convolution/pooling -> bidirectional LSTM -> sigmoid.
+    model.add(Embedding(input_dim=input_dim, output_dim=output_dim))
+    model.add(Dropout(0.2))
+    model.add(Conv1D(filters=64, kernel_size=3, padding="same", activation="relu"))
+    model.add(MaxPooling1D(pool_size=2))
+    model.add(Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)))
+    model.add(Flatten())
+    model.add(Dense(1, activation="sigmoid"))
+
+    # Metric names are fixed so the history keys stay "precision"/"recall".
+    tracked_metrics = [
+        "accuracy",
+        tf.keras.metrics.Precision(name="precision"),
+        tf.keras.metrics.Recall(name="recall"),
+    ]
+    model.compile(
+        optimizer="adam",
+        loss="binary_crossentropy",
+        metrics=tracked_metrics,
+    )
+    return model
+
+
+def calculate_f1_f2(precision, recall, beta=1):
+    """Return the F-beta score for the given precision and recall.
+
+    ``beta=1`` yields F1 and ``beta=2`` yields F2 (recall weighted higher).
+
+    Args:
+        precision: Precision as a scalar.
+        recall: Recall as a scalar.
+        beta: Weight of recall relative to precision.
+
+    Returns:
+        ``(1 + beta^2) * P * R / (beta^2 * P + R)``, or ``0.0`` when the
+        denominator is zero (precision and recall both zero).
+    """
+    beta_squared = beta**2
+    denominator = beta_squared * precision + recall
+    # Guard the 0/0 case explicitly instead of adding an epsilon, which would
+    # bias every score slightly (a perfect model would score just below 1.0).
+    if denominator == 0:
+        return 0.0
+    return (1 + beta_squared) * (precision * recall) / denominator
 
-# Split the training set and test set
-y = data["Label"]
-X_train, X_test, y_train, y_test = train_test_split(
-    X, y, test_size=0.2, random_state=42
-)
-
-# Create CNN-BiLSTM model
-model = Sequential()
-model.add(Embedding(MAX_WORDS, 128))
-model.add(Conv1D(filters=64, kernel_size=3, padding="same", activation="relu"))
-model.add(MaxPooling1D(pool_size=2))
-model.add(Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)))
-model.add(Flatten())
-model.add(Dense(1, activation="sigmoid"))
-
-model.compile(
-    loss="binary_crossentropy",
-    optimizer="adam",
-    metrics=[
-        Accuracy(),
-        Recall(),
-        Precision(),
-    ],
-)
-
-# Define early stopping callback with a rollback of 5
-early_stopping = EarlyStopping(
-    monitor="val_loss", patience=5, restore_best_weights=True
-)
 
-# Train model with early stopping
-history = model.fit(
-    X_train,
-    y_train,
-    epochs=50,  # Maximum number of epochs
-    batch_size=32,
-    validation_data=(X_test, y_test),
-    callbacks=[early_stopping],
-    verbose=1,
-)
-
-# Predict test set
-y_pred = model.predict(X_test, verbose=1)
-y_pred_classes = np.argmax(y_pred, axis=1)
-
-# Calculate model performance indicators
-accuracy = accuracy_score(y_test, y_pred_classes)
-recall = recall_score(y_test, y_pred_classes, zero_division=1)
-precision = precision_score(y_test, y_pred_classes, zero_division=1)
-f1 = f1_score(y_test, y_pred_classes, zero_division=1)
-tn, fp, fn, tp = confusion_matrix(y_test, y_pred_classes).ravel()
-
-# Output performance indicators
-print("Accuracy: {:.2f}%".format(accuracy * 100))
-print("Recall: {:.2f}%".format(recall * 100))
-print("Precision: {:.2f}%".format(precision * 100))
-print("F1-score: {:.2f}%".format(f1 * 100))
-print("Specificity: {:.2f}%".format(tn / (tn + fp) * 100))
-print("ROC: {:.2f}%".format(tp / (tp + fn) * 100))
-
-# Save model as SavedModel format
-model.export(sys.argv[2])
-
-
-# Plot the training history
 def plot_history(history):
+    """Plot the training and validation loss, accuracy, precision, and recall.
+
+    Draws one subplot per metric in a 2x2 grid and saves the figure as
+    ``training_history.png`` (a relative path, so it lands in the current
+    working directory).
+
+    Args:
+        history: Object whose ``history`` mapping holds, for each of "loss",
+            "accuracy", "precision" and "recall", a training series under
+            that key and a validation series under the ``val_``-prefixed key.
+    """
     plt.figure(figsize=(12, 8))
-
-    # Plot loss
-    plt.subplot(2, 2, 1)
-    plt.plot(history.history["loss"], label="Training Loss")
-    plt.plot(history.history["val_loss"], label="Validation Loss")
-    plt.title("Loss")
-    plt.xlabel("Epochs")
-    plt.ylabel("Loss")
-    plt.legend()
-
-    # Plot accuracy
-    plt.subplot(2, 2, 2)
-    plt.plot(history.history["accuracy"], label="Training Accuracy")
-    plt.plot(history.history["val_accuracy"], label="Validation Accuracy")
-    plt.title("Accuracy")
-    plt.xlabel("Epochs")
-    plt.ylabel("Accuracy")
-    plt.legend()
-
-    # Plot precision
-    plt.subplot(2, 2, 3)
-    plt.plot(history.history["precision"], label="Training Precision")
-    plt.plot(history.history["val_precision"], label="Validation Precision")
-    plt.title("Precision")
-    plt.xlabel("Epochs")
-    plt.ylabel("Precision")
-    plt.legend()
-
-    # Plot recall
-    plt.subplot(2, 2, 4)
-    plt.plot(history.history["recall"], label="Training Recall")
-    plt.plot(history.history["val_recall"], label="Validation Recall")
-    plt.title("Recall")
-    plt.xlabel("Epochs")
-    plt.ylabel("Recall")
-    plt.legend()
-
+    # One subplot per metric; enumerate starts at 1 because the value is used
+    # directly as the subplot position in the 2x2 grid.
     for i, metric in enumerate(["loss", "accuracy", "precision", "recall"], start=1):
         plt.subplot(2, 2, i)
         plt.plot(history.history[metric], label=f"Training {metric.capitalize()}")
         plt.plot(
             history.history[f"val_{metric}"], label=f"Validation {metric.capitalize()}"
         )
         plt.title(metric.capitalize())
         plt.xlabel("Epochs")
         plt.ylabel(metric.capitalize())
         plt.legend()
     plt.tight_layout()
     plt.savefig("training_history.png")
 
 
-plot_history(history)
+# Script entry point: run k-fold cross-validation, report the averaged
+# metrics, then export the model and plot the history of the last fold.
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        # Report the name the script was actually invoked with, on stderr.
+        print(
+            f"Usage: python {sys.argv[0]} <input_file> <output_dir>",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+
+    # Load and preprocess data
+    data = load_data(sys.argv[1])
+    X, tokenizer = preprocess_text(data)
+    y = data["Label"]
+
+    # Initialize cross-validation
+    k_folds = 5
+    kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42)
+    fold_metrics = {"accuracy": [], "precision": [], "recall": [], "f1": [], "f2": []}
+
+    for fold, (train_idx, val_idx) in enumerate(kfold.split(X, y), 1):
+        print(f"Training fold {fold}/{k_folds}")
+
+        # A fresh model is built every fold; drop the Keras global state left
+        # behind by the previous one so it does not pile up across folds.
+        tf.keras.backend.clear_session()
+
+        # Split the data
+        X_train, X_val = X[train_idx], X[val_idx]
+        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
+
+        # Build and train the model
+        model = build_model(input_dim=len(tokenizer.word_index) + 1)
+        early_stopping = EarlyStopping(
+            monitor="val_loss", patience=5, restore_best_weights=True
+        )
+        history = model.fit(
+            X_train,
+            y_train,
+            epochs=50,
+            batch_size=32,
+            validation_data=(X_val, y_val),
+            callbacks=[early_stopping],
+            verbose=1,
+        )
+
+        # Threshold the sigmoid output at 0.5. The model ends in Dense(1), so
+        # predictions come back as a single column; flatten it to 1-D labels.
+        y_val_pred = (model.predict(X_val) > 0.5).astype(int).ravel()
+        accuracy = accuracy_score(y_val, y_val_pred)
+        # zero_division=0 keeps the default value but without the warning
+        # when a fold has no positive predictions or no positive labels.
+        precision = precision_score(y_val, y_val_pred, zero_division=0)
+        recall = recall_score(y_val, y_val_pred, zero_division=0)
+        f1_score = calculate_f1_f2(precision, recall, beta=1)
+        f2_score = calculate_f1_f2(precision, recall, beta=2)
+
+        # Collect fold metrics
+        fold_metrics["accuracy"].append(accuracy)
+        fold_metrics["precision"].append(precision)
+        fold_metrics["recall"].append(recall)
+        fold_metrics["f1"].append(f1_score)
+        fold_metrics["f2"].append(f2_score)
+
+    # Calculate average metrics across folds
+    avg_metrics = {metric: np.mean(scores) for metric, scores in fold_metrics.items()}
+    print("\nCross-validation results:")
+    for metric, value in avg_metrics.items():
+        print(f"{metric.capitalize()}: {value:.2f}")
+
+    # Save the final model trained on the last fold. Both the exported model
+    # and the plotted history come from that fold only, not from all the data.
+    model.export(sys.argv[2])
+    plot_history(history)