Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
__pycache__/
*.pyc
*.pyo
.pytest_cache/
*.egg-info/
dist/
build/
.env
241 changes: 241 additions & 0 deletions neural_network.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,241 @@
"""
Neural Network implementation from scratch using NumPy.

Supports arbitrary hidden layer sizes, ReLU activations for hidden layers,
softmax output, and cross-entropy loss trained with mini-batch gradient descent.
"""

import numpy as np


def relu(z):
    """Element-wise rectified linear unit: returns z where positive, else 0."""
    return np.clip(z, 0, None)


def relu_derivative(z):
    """Derivative of ReLU: 1.0 for strictly positive inputs, 0.0 otherwise."""
    return np.where(z > 0, 1.0, 0.0)


def softmax(z):
    """Column-wise softmax, shifted by the per-column max for numerical stability."""
    # Subtracting the max leaves the result unchanged mathematically but
    # prevents overflow in exp for large logits.
    stabilized = np.exp(z - z.max(axis=0, keepdims=True))
    return stabilized / stabilized.sum(axis=0, keepdims=True)


def cross_entropy_loss(y_pred, y_true):
    """
    Cross-entropy loss between predicted probabilities and one-hot encoded labels.

    Args:
        y_pred: (num_classes, batch_size) predicted probabilities.
        y_true: (num_classes, batch_size) one-hot encoded true labels.

    Returns:
        Scalar average loss.
    """
    batch_size = y_true.shape[1]
    # Clip to avoid log(0); the one-hot mask selects the true-class log-prob.
    safe_probs = np.clip(y_pred, 1e-12, 1.0)
    return -np.sum(y_true * np.log(safe_probs)) / batch_size


class NeuralNetwork:
    """
    Fully-connected neural network trained with gradient descent.

    Architecture: Input -> [Dense + ReLU] * num_hidden_layers -> Dense -> Softmax

    Args:
        layer_sizes: List of integers specifying the number of units per layer
            including the input and output dimensions.
            E.g. [784, 128, 64, 10] creates two hidden layers of
            sizes 128 and 64 with a 10-class output.
        learning_rate: Step size for gradient descent (default 0.1).
        seed: Random seed for reproducibility (default 42).

    Raises:
        ValueError: If layer_sizes has fewer than two entries (an input and
            an output dimension are both required).
    """

    def __init__(self, layer_sizes, learning_rate=0.1, seed=42):
        if len(layer_sizes) < 2:
            raise ValueError(
                "layer_sizes must contain at least an input and an output size"
            )
        # A private Generator keeps reproducibility without mutating NumPy's
        # *global* random state (np.random.seed would affect the caller and
        # left train()'s shuffles only loosely tied to the seed).
        self._rng = np.random.default_rng(seed)
        self.layer_sizes = layer_sizes
        self.learning_rate = learning_rate
        self.num_layers = len(layer_sizes) - 1  # number of weight matrices
        self._init_params()

    # ------------------------------------------------------------------
    # Parameter initialisation
    # ------------------------------------------------------------------

    def _init_params(self):
        """He initialisation for weights; zeros for biases."""
        self.params = {}
        for l in range(1, self.num_layers + 1):
            fan_in = self.layer_sizes[l - 1]
            fan_out = self.layer_sizes[l]
            # He init: std sqrt(2/fan_in) keeps activation variance stable
            # through ReLU layers.
            self.params[f"W{l}"] = self._rng.standard_normal(
                (fan_out, fan_in)
            ) * np.sqrt(2.0 / fan_in)
            self.params[f"b{l}"] = np.zeros((fan_out, 1))

    # ------------------------------------------------------------------
    # Forward propagation
    # ------------------------------------------------------------------

    def forward(self, X):
        """
        Compute forward pass through the network.

        Args:
            X: (input_size, batch_size) input matrix.

        Returns:
            Tuple (output probabilities, cache dict for backprop).
        """
        cache = {"A0": X}
        A = X
        for l in range(1, self.num_layers + 1):
            W = self.params[f"W{l}"]
            b = self.params[f"b{l}"]
            Z = W @ A + b
            # Hidden layers use ReLU; the final layer emits softmax probabilities.
            if l < self.num_layers:
                A = relu(Z)
            else:
                A = softmax(Z)
            cache[f"Z{l}"] = Z
            cache[f"A{l}"] = A
        return A, cache

    # ------------------------------------------------------------------
    # Backpropagation
    # ------------------------------------------------------------------

    def backward(self, y_true, cache):
        """
        Compute gradients via backpropagation.

        Args:
            y_true: (num_classes, batch_size) one-hot encoded labels.
            cache: Dict produced by forward().

        Returns:
            Dict of gradients keyed by 'W1', 'b1', …, 'WL', 'bL'.
        """
        grads = {}
        m = y_true.shape[1]
        L = self.num_layers

        # Gradient of loss w.r.t. pre-softmax logits: the softmax + CE
        # derivative collapses to (probs - labels) / batch_size.
        dA = (cache[f"A{L}"] - y_true) / m

        for l in reversed(range(1, L + 1)):
            A_prev = cache[f"A{l - 1}"]
            W = self.params[f"W{l}"]
            Z = cache[f"Z{l}"]

            if l < L:
                dZ = dA * relu_derivative(Z)
            else:
                dZ = dA  # Already the combined gradient for softmax + CE

            grads[f"W{l}"] = dZ @ A_prev.T
            grads[f"b{l}"] = np.sum(dZ, axis=1, keepdims=True)
            if l > 1:
                # No gradient w.r.t. the raw input is needed, so skip the
                # (potentially large) matmul for the first layer.
                dA = W.T @ dZ

        return grads

    # ------------------------------------------------------------------
    # Parameter update
    # ------------------------------------------------------------------

    def update_params(self, grads):
        """Gradient descent parameter update (in place)."""
        for l in range(1, self.num_layers + 1):
            self.params[f"W{l}"] -= self.learning_rate * grads[f"W{l}"]
            self.params[f"b{l}"] -= self.learning_rate * grads[f"b{l}"]

    # ------------------------------------------------------------------
    # Training
    # ------------------------------------------------------------------

    def train(self, X_train, y_train, epochs=20, batch_size=64, verbose=True):
        """
        Train the network using mini-batch gradient descent.

        Args:
            X_train: (input_size, num_samples) training data.
            y_train: (num_classes, num_samples) one-hot labels.
            epochs: Number of passes over the training set.
            batch_size: Mini-batch size.
            verbose: Print loss/accuracy each epoch when True.

        Returns:
            Dict with 'loss' and 'accuracy' lists (one value per epoch).
        """
        history = {"loss": [], "accuracy": []}
        m = X_train.shape[1]

        for epoch in range(1, epochs + 1):
            # Shuffle columns (samples) with the network's own RNG so the
            # whole run is reproducible from the constructor seed.
            permutation = self._rng.permutation(m)
            X_shuffled = X_train[:, permutation]
            y_shuffled = y_train[:, permutation]

            epoch_loss = 0.0

            for start in range(0, m, batch_size):
                X_batch = X_shuffled[:, start: start + batch_size]
                y_batch = y_shuffled[:, start: start + batch_size]

                probs, cache = self.forward(X_batch)
                loss = cross_entropy_loss(probs, y_batch)
                grads = self.backward(y_batch, cache)
                self.update_params(grads)

                # Weight each batch's mean loss by its size so a smaller
                # final batch does not skew the per-epoch average.
                epoch_loss += loss * X_batch.shape[1]

            epoch_loss /= m
            epoch_acc = self.evaluate(X_train, y_train)
            history["loss"].append(epoch_loss)
            history["accuracy"].append(epoch_acc)

            if verbose:
                print(
                    f"Epoch {epoch:>3}/{epochs} "
                    f"loss: {epoch_loss:.4f} "
                    f"train_acc: {epoch_acc:.4f}"
                )

        return history

    # ------------------------------------------------------------------
    # Inference
    # ------------------------------------------------------------------

    def predict(self, X):
        """
        Return predicted class indices for each sample.

        Args:
            X: (input_size, num_samples) input matrix.

        Returns:
            1-D array of predicted class labels.
        """
        probs, _ = self.forward(X)
        return np.argmax(probs, axis=0)

    def evaluate(self, X, y_true):
        """
        Compute classification accuracy.

        Args:
            X: (input_size, num_samples) input matrix.
            y_true: (num_classes, num_samples) one-hot labels.

        Returns:
            Accuracy in [0, 1].
        """
        predictions = self.predict(X)
        labels = np.argmax(y_true, axis=0)
        return np.mean(predictions == labels)
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
numpy>=1.21
Loading