Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
__pycache__/
*.pyc
*.pyo
.pytest_cache/
*.egg-info/
dist/
build/
.env
241 changes: 241 additions & 0 deletions neural_network.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,241 @@
"""
Neural Network implementation from scratch using NumPy.

Supports arbitrary hidden layer sizes, ReLU activations for hidden layers,
softmax output, and cross-entropy loss trained with mini-batch gradient descent.
"""

import numpy as np


def relu(z):
    """Element-wise rectified linear unit: returns z where positive, else 0."""
    return np.clip(z, 0, None)


def relu_derivative(z):
    """Derivative of ReLU: 1.0 for strictly positive inputs, 0.0 otherwise."""
    return np.where(z > 0, 1.0, 0.0)


def softmax(z):
    """Column-wise softmax, shifted by the per-column max for numerical stability."""
    # Subtracting the max leaves the result unchanged mathematically but
    # prevents overflow in exp for large logits.
    stabilized = np.exp(z - z.max(axis=0, keepdims=True))
    return stabilized / stabilized.sum(axis=0, keepdims=True)


def cross_entropy_loss(y_pred, y_true):
    """
    Cross-entropy loss between predicted probabilities and one-hot encoded labels.

    Args:
        y_pred: (num_classes, batch_size) predicted probabilities.
        y_true: (num_classes, batch_size) one-hot encoded true labels.

    Returns:
        Scalar average loss.
    """
    batch_size = y_true.shape[1]
    # Clip to avoid log(0); the one-hot mask selects the true-class log-prob.
    safe_probs = np.clip(y_pred, 1e-12, 1.0)
    return -np.sum(y_true * np.log(safe_probs)) / batch_size


class NeuralNetwork:
    """
    Fully-connected neural network trained with gradient descent.

    Architecture: Input -> [Dense + ReLU] * num_hidden_layers -> Dense -> Softmax

    Args:
        layer_sizes: List of integers specifying the number of units per layer
            including the input and output dimensions.
            E.g. [784, 128, 64, 10] creates two hidden layers of
            sizes 128 and 64 with a 10-class output.
        learning_rate: Step size for gradient descent (default 0.1).
        seed: Random seed for reproducibility (default 42).

    Raises:
        ValueError: If layer_sizes has fewer than two entries (an input and
            an output dimension are both required).
    """

    def __init__(self, layer_sizes, learning_rate=0.1, seed=42):
        if len(layer_sizes) < 2:
            raise ValueError(
                "layer_sizes must contain at least an input and an output size"
            )
        # A private Generator keeps reproducibility without mutating NumPy's
        # *global* random state (np.random.seed would affect the caller and
        # left train()'s shuffles only loosely tied to the seed).
        self._rng = np.random.default_rng(seed)
        self.layer_sizes = layer_sizes
        self.learning_rate = learning_rate
        self.num_layers = len(layer_sizes) - 1  # number of weight matrices
        self._init_params()

    # ------------------------------------------------------------------
    # Parameter initialisation
    # ------------------------------------------------------------------

    def _init_params(self):
        """He initialisation for weights; zeros for biases."""
        self.params = {}
        for l in range(1, self.num_layers + 1):
            fan_in = self.layer_sizes[l - 1]
            fan_out = self.layer_sizes[l]
            # He init: std sqrt(2/fan_in) keeps activation variance stable
            # through ReLU layers.
            self.params[f"W{l}"] = self._rng.standard_normal(
                (fan_out, fan_in)
            ) * np.sqrt(2.0 / fan_in)
            self.params[f"b{l}"] = np.zeros((fan_out, 1))

    # ------------------------------------------------------------------
    # Forward propagation
    # ------------------------------------------------------------------

    def forward(self, X):
        """
        Compute forward pass through the network.

        Args:
            X: (input_size, batch_size) input matrix.

        Returns:
            Tuple (output probabilities, cache dict for backprop).
        """
        cache = {"A0": X}
        A = X
        for l in range(1, self.num_layers + 1):
            W = self.params[f"W{l}"]
            b = self.params[f"b{l}"]
            Z = W @ A + b
            # Hidden layers use ReLU; the final layer emits softmax probabilities.
            if l < self.num_layers:
                A = relu(Z)
            else:
                A = softmax(Z)
            cache[f"Z{l}"] = Z
            cache[f"A{l}"] = A
        return A, cache

    # ------------------------------------------------------------------
    # Backpropagation
    # ------------------------------------------------------------------

    def backward(self, y_true, cache):
        """
        Compute gradients via backpropagation.

        Args:
            y_true: (num_classes, batch_size) one-hot encoded labels.
            cache: Dict produced by forward().

        Returns:
            Dict of gradients keyed by 'W1', 'b1', …, 'WL', 'bL'.
        """
        grads = {}
        m = y_true.shape[1]
        L = self.num_layers

        # Gradient of loss w.r.t. pre-softmax logits: the softmax + CE
        # derivative collapses to (probs - labels) / batch_size.
        dA = (cache[f"A{L}"] - y_true) / m

        for l in reversed(range(1, L + 1)):
            A_prev = cache[f"A{l - 1}"]
            W = self.params[f"W{l}"]
            Z = cache[f"Z{l}"]

            if l < L:
                dZ = dA * relu_derivative(Z)
            else:
                dZ = dA  # Already the combined gradient for softmax + CE

            grads[f"W{l}"] = dZ @ A_prev.T
            grads[f"b{l}"] = np.sum(dZ, axis=1, keepdims=True)
            if l > 1:
                # No gradient w.r.t. the raw input is needed, so skip the
                # (potentially large) matmul for the first layer.
                dA = W.T @ dZ

        return grads

    # ------------------------------------------------------------------
    # Parameter update
    # ------------------------------------------------------------------

    def update_params(self, grads):
        """Gradient descent parameter update (in place)."""
        for l in range(1, self.num_layers + 1):
            self.params[f"W{l}"] -= self.learning_rate * grads[f"W{l}"]
            self.params[f"b{l}"] -= self.learning_rate * grads[f"b{l}"]

    # ------------------------------------------------------------------
    # Training
    # ------------------------------------------------------------------

    def train(self, X_train, y_train, epochs=20, batch_size=64, verbose=True):
        """
        Train the network using mini-batch gradient descent.

        Args:
            X_train: (input_size, num_samples) training data.
            y_train: (num_classes, num_samples) one-hot labels.
            epochs: Number of passes over the training set.
            batch_size: Mini-batch size.
            verbose: Print loss/accuracy each epoch when True.

        Returns:
            Dict with 'loss' and 'accuracy' lists (one value per epoch).
        """
        history = {"loss": [], "accuracy": []}
        m = X_train.shape[1]

        for epoch in range(1, epochs + 1):
            # Shuffle columns (samples) with the network's own RNG so the
            # whole run is reproducible from the constructor seed.
            permutation = self._rng.permutation(m)
            X_shuffled = X_train[:, permutation]
            y_shuffled = y_train[:, permutation]

            epoch_loss = 0.0

            for start in range(0, m, batch_size):
                X_batch = X_shuffled[:, start: start + batch_size]
                y_batch = y_shuffled[:, start: start + batch_size]

                probs, cache = self.forward(X_batch)
                loss = cross_entropy_loss(probs, y_batch)
                grads = self.backward(y_batch, cache)
                self.update_params(grads)

                # Weight each batch's mean loss by its size so a smaller
                # final batch does not skew the per-epoch average.
                epoch_loss += loss * X_batch.shape[1]

            epoch_loss /= m
            epoch_acc = self.evaluate(X_train, y_train)
            history["loss"].append(epoch_loss)
            history["accuracy"].append(epoch_acc)

            if verbose:
                print(
                    f"Epoch {epoch:>3}/{epochs} "
                    f"loss: {epoch_loss:.4f} "
                    f"train_acc: {epoch_acc:.4f}"
                )

        return history

    # ------------------------------------------------------------------
    # Inference
    # ------------------------------------------------------------------

    def predict(self, X):
        """
        Return predicted class indices for each sample.

        Args:
            X: (input_size, num_samples) input matrix.

        Returns:
            1-D array of predicted class labels.
        """
        probs, _ = self.forward(X)
        return np.argmax(probs, axis=0)

    def evaluate(self, X, y_true):
        """
        Compute classification accuracy.

        Args:
            X: (input_size, num_samples) input matrix.
            y_true: (num_classes, num_samples) one-hot labels.

        Returns:
            Accuracy in [0, 1].
        """
        predictions = self.predict(X)
        labels = np.argmax(y_true, axis=0)
        return np.mean(predictions == labels)
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
numpy>=1.21
Loading