diff --git a/Cargo.toml b/Cargo.toml
index ba016e9..f810966 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,16 +1,16 @@
 [package]
-name = "cognius"
+name = "athena"
 # versions
 version = "0.1.0"
-edition = "2021"
-rust-version = "1.73.0"
+edition = "2024"
+rust-version = "1.95.0"
 # info
 authors = ["commanderxa"]
 readme = "README.md"
-repository = "https://github.com/CommanderXA/cognius"
+repository = "https://github.com/CommanderXA/athena"
 license-file = "LICENSE.txt"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
-rand = "0.8.5"
+rand = "0.10.1"
diff --git a/README.md b/README.md
index 8277fe1..8285825 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,10 @@
-# COGNIUS
+# ATHENA
 
 A pure `Rust` implementation of `micrograd`.
 
 ## Requirements
 
-Rust >= 1.73
+Rust >= 1.95
 
 ## Usage
 
diff --git a/examples/linear.rs b/examples/linear.rs
index 680d011..64ff4fd 100644
--- a/examples/linear.rs
+++ b/examples/linear.rs
@@ -1,4 +1,4 @@
-use cognius::{
+use athena::{
     module::Forward,
     nn::{functional as F, Linear},
     Tensor,
diff --git a/examples/matmul.rs b/examples/matmul.rs
index 48604f3..524eea8 100644
--- a/examples/matmul.rs
+++ b/examples/matmul.rs
@@ -1,4 +1,4 @@
-use cognius::{linalg, Tensor};
+use athena::{linalg, Tensor};
 
 fn main() {
     let a = Tensor::randn(&[2, 3, 2]);
diff --git a/examples/mlp.rs b/examples/mlp.rs
index 9eef732..73fdc6b 100644
--- a/examples/mlp.rs
+++ b/examples/mlp.rs
@@ -1,4 +1,4 @@
-use cognius::{
+use athena::{
     module::{Forward, Module},
     nn::{functional as F, Linear, MSELoss},
     optim::{Optim, SGD},
@@ -7,9 +7,9 @@ use cognius::{
 
 fn main() {
     let epochs = 10;
-    let criterion = MSELoss::new();
-    let mlp = MLP::new([2, 1]);
-    let optim = SGD::new(mlp.parameters(), 3e-1);
+    let criterion = MSELoss::new(None);
+    let mlp = MLP::new([2, 16, 1]);
+    let optim = SGD::new(mlp.parameters(), 3e-3);
 
     let data = vec![
         Tensor::tensor(&[9., 3.], &[2]),
@@ -76,17 +76,18 @@ fn main() {
         );
         loss.backward();
     }
-    println!("MODEL: {:?}", mlp.parameters());
 }
 
 struct MLP {
     linear: Linear,
+    linear1: Linear,
 }
 
 impl MLP {
-    pub fn new(features: [usize; 2]) -> Self {
+    pub fn new(features: [usize; 3]) -> Self {
         Self {
             linear: Linear::new(features[0], features[1]),
+            linear1: Linear::new(features[1], features[2]),
         }
     }
 }
@@ -97,7 +98,8 @@ impl Module for MLP {
     }
 
     fn parameters(&self) -> Vec<Tensor> {
-        let parameters = self.linear.parameters();
+        let mut parameters = self.linear.parameters();
+        parameters.append(&mut self.linear1.parameters());
         parameters
     }
 }
@@ -105,6 +107,8 @@ impl Module for MLP {
 impl Forward for MLP {
     fn forward(&self, x: Tensor) -> Tensor {
         let x = self.linear.forward(x);
+        let x = F::relu(x);
+        let x = self.linear1.forward(x);
         F::sigmoid(x)
     }
 }
diff --git a/examples/mlp2.rs b/examples/mlp2.rs
index 24c6b66..4f9e455 100644
--- a/examples/mlp2.rs
+++ b/examples/mlp2.rs
@@ -1,4 +1,4 @@
-use cognius::{
+use athena::{
     module::{Forward, Module},
     nn::{Linear, MSELoss},
     optim::{Optim, SGD},
@@ -7,7 +7,7 @@ use cognius::{
 
 fn main() {
     let epochs = 10;
-    let criterion = MSELoss::new();
+    let criterion = MSELoss::new(None);
     let mlp = MLP::new([1, 1]);
     let optim = SGD::new(mlp.parameters(), 3e-3);
 
diff --git a/examples/softmax.rs b/examples/softmax.rs
new file mode 100644
index 0000000..5ba956d
--- /dev/null
+++ b/examples/softmax.rs
@@ -0,0 +1,8 @@
+use athena::{nn::functional as F, Tensor};
+
+/// Prints a `[1, 2, 3]` tensor, then its softmax over the last dimension.
+fn main() {
+    let input = Tensor::tensor(&[0.24, 0.1, 0.5, 0.8, 1.2, 2.2], &[1, 2, 3]);
+    println!("IN:\n{input}\n\n");
+    println!("OUT:\n{}", F::softmax(input, 2));
+}
diff --git a/src/backward.rs b/src/backward.rs
index ff7b9da..a8a3613 100644
--- a/src/backward.rs
+++ b/src/backward.rs
@@ -118,7 +118,26 @@ impl Backward for Op {
                 t._prev[0].add_to_grad(dx);
             }
 
-            Op::MSE => {
+            Op::Softmax(x, _) => {
+                let t = tensor.inner.borrow();
+                let n = x.length();
+                // NOTE(review): the formula expects softmax *outputs* in `s` and an upstream
+                // gradient; confirm what `Op::Softmax` carries and what `add_to_grad` expects.
+                let s = x.item();
+                // Row-major Jacobian, entry (i, j) at `i * n + j`: s_i * (delta_ij - s_j).
+                let mut jacobian = vec![0.0; n * n];
+                for i in 0..n {
+                    for j in 0..n {
+                        let delta = if i == j { 1.0 } else { 0.0 };
+                        jacobian[i * n + j] = s[i] * (delta - s[j]);
+                    }
+                }
+                let a = Tensor::tensor(&jacobian, &[n, n]).t();
+                t._prev[0].add_to_grad(a.item());
+            }
+
+            Op::MSE(n) => {
+                let n = *n as f64;
                 let t = tensor.inner.borrow();
                 let t_prev = t._prev[0].inner.borrow();
                 let t_sub = t_prev._prev[0].inner.borrow();
@@ -127,7 +146,7 @@ impl Backward for Op {
                 let grad = out
                     .iter()
                     .zip(target)
-                    .map(|(x, y)| 2.0 * (x - y))
+                    .map(|(x, y)| 2.0 / n * (x - y))
                     .collect::<Vec<f64>>();
                 drop(t_sub);
                 drop(t_prev);
diff --git a/src/data/dataloader.rs b/src/data/dataloader.rs
index aec2c4b..20aa353 100644
--- a/src/data/dataloader.rs
+++ b/src/data/dataloader.rs
@@ -1,6 +1,6 @@
 use std::{cell::RefCell, rc::Rc};
 
-use rand::{seq::SliceRandom, thread_rng};
+use rand::{seq::SliceRandom, rng};
 
 use super::dataset::Dataset;
 
@@ -26,7 +26,7 @@ impl<T> DataloaderInner<T> {
 
     /// Mix the indices up to obtain random sequence
     fn shuffle_indices(&mut self) {
-        self.indices.shuffle(&mut thread_rng());
+        self.indices.shuffle(&mut rng());
     }
 }
 
@@ -48,7 +48,7 @@ impl<T> Dataloader<T> {
         }
         // shuffle all indices if it is specified so
         if shuffle {
-            indices.shuffle(&mut thread_rng());
+            indices.shuffle(&mut rng());
         }
         Self(Rc::new(RefCell::new(DataloaderInner {
             dataset,
@@ -68,6 +68,16 @@ impl<T> Dataloader<T> {
     pub fn is_shuffle(&self) -> bool {
         self.0.borrow().shuffle
     }
+
+    /// Returns the length reported by the underlying dataset.
+    pub fn len(&self) -> usize {
+        self.0.borrow().dataset.len()
+    }
+
+    /// Returns `true` when [`Dataloader::len`] is zero.
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
 }
 
 impl<T> Iterator for Dataloader<T> {
diff --git a/src/nn/criterions.rs b/src/nn/criterions.rs
index 1b7ebd9..1565fcf 100644
--- a/src/nn/criterions.rs
+++ b/src/nn/criterions.rs
@@ -1,22 +1,49 @@
 use crate::{op::Op, tensor_data::TensorData, Tensor};
 
-pub struct MSELoss {}
+/// Selects how [`MSELoss::measure`] collapses the squared errors into one value.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd)]
+pub enum Reduction {
+    /// Sum of the squared errors.
+    SUM,
+    /// Sum of the squared errors divided by the number of elements.
+    MEAN,
+}
+
+/// Squared-error criterion with a configurable reduction.
+#[derive(Debug, Clone)]
+pub struct MSELoss {
+    /// Requested reduction; `None` is treated like [`Reduction::MEAN`].
+    reduction: Option<Reduction>,
+}
 
 impl MSELoss {
-    pub fn new() -> Self {
-        Self {}
+    /// Creates a criterion that applies `reduction` in [`MSELoss::measure`].
+    pub fn new(reduction: Option<Reduction>) -> Self {
+        Self { reduction }
     }
 
+    /// Returns the reduced squared error between `a` and `b`.
+    ///
+    /// The element count is divided out at most once. `None` behaves like the mean,
+    /// which matches the `2 / n` factor applied by the `Op::MSE` backward pass.
+    /// NOTE(review): confirm that `None` is not meant to skip the reduction, and that
+    /// the backward pass should keep its `2 / n` factor for `Reduction::SUM`.
     pub fn measure(&self, a: Tensor, b: Tensor) -> Tensor {
         let t = (a - b).pow(2);
         let shape = t.shape();
-        let inner = TensorData::from_op(t.item(), vec![t], Op::MSE);
+        let n = t.length();
+        let total = t.item().iter().sum::<f64>();
+        let loss = match self.reduction {
+            Some(Reduction::SUM) => total,
+            Some(Reduction::MEAN) | None => total / n as f64,
+        };
+        let inner = TensorData::from_op(vec![loss], vec![t], Op::MSE(n));
         Tensor::new(inner, &shape)
     }
 }
 
 impl Default for MSELoss {
     fn default() -> Self {
-        Self::new()
+        Self::new(None)
     }
 }
diff --git a/src/nn/functional.rs b/src/nn/functional.rs
index 206eb1a..216cdf2 100644
--- a/src/nn/functional.rs
+++ b/src/nn/functional.rs
@@ -15,3 +15,41 @@ pub fn sigmoid(x: Tensor) -> Tensor {
     let inner = TensorData::from_op(data.item(), vec![x.clone()], Op::Sigmoid(x));
     Tensor::new(inner, &data.shape)
 }
+
+/// Applies softmax along the last dimension of `x`.
+///
+/// Each contiguous run of `n` values (`n` being the size of the last dimension) is
+/// normalised on its own, so any number of leading dimensions is accepted. The run's
+/// maximum is subtracted before exponentiating to avoid overflow.
+/// Assumes the data returned by `item()` is contiguous and row-major.
+///
+/// # Panics
+///
+/// Panics if `x` has no dimensions or if `dim` is not the index of the last dimension.
+pub fn softmax(x: Tensor, dim: usize) -> Tensor {
+    let shape = x.shape();
+    assert!(!shape.is_empty(), "Softmax requires at least one dimension.");
+    assert_eq!(
+        dim,
+        shape.len() - 1,
+        "Softmax for dimensions other than the last one is not supported."
+    );
+    let n = shape[dim];
+    let data = x.item();
+    let mut result = vec![0.0; x.length()];
+    // an empty last dimension leaves nothing to normalise (and `chunks` rejects size 0)
+    if n > 0 {
+        // one run per index combination of the leading dimensions
+        for (src, dst) in data.chunks(n).zip(result.chunks_mut(n)) {
+            let max_x = src.iter().copied().fold(f64::NEG_INFINITY, f64::max);
+            for (out, &xi) in dst.iter_mut().zip(src) {
+                *out = (xi - max_x).exp();
+            }
+            let sum_exp_x: f64 = dst.iter().sum();
+            dst.iter_mut().for_each(|out| *out /= sum_exp_x);
+        }
+    }
+    // create new tensor
+    let inner = TensorData::from_op(result, vec![x.clone()], Op::Softmax(x, dim));
+    Tensor::new(inner, &shape)
+}
diff --git a/src/op.rs b/src/op.rs
index 5f11312..29ef74c 100644
--- a/src/op.rs
+++ b/src/op.rs
@@ -13,7 +13,8 @@ pub enum Op {
     Cross,
     ReLU,
     Sigmoid(Tensor),
-    MSE,
+    Softmax(Tensor, usize),
+    MSE(usize),
 }
 
 impl std::fmt::Display for Op {
@@ -28,7 +29,8 @@ impl std::fmt::Display for Op {
             Op::Cross => write!(f, "Cross"),
             Op::ReLU => write!(f, "ReLU"),
             Op::Sigmoid(n) => write!(f, "Sigmoid({n})"),
-            Op::MSE => write!(f, "MSE"),
+            Op::Softmax(n, dim) => write!(f, "Softmax({n},{dim})"),
+            Op::MSE(n) => write!(f, "MSE({n})"),
         }
     }
 }
diff --git a/src/optim.rs b/src/optim.rs
index 6919bad..948b034 100644
--- a/src/optim.rs
+++ b/src/optim.rs
@@ -1,3 +1,4 @@
+pub mod lr_scheduler;
 pub mod sgd;
 
 // Short paths for algorithms
@@ -10,4 +11,7 @@ pub trait Optim {
 
     /// Sets gradients to zero.
     fn zero_grad(&self);
+
+    /// Multiplies the current learning rate by `gamma`.
+    fn change_lr(&mut self, gamma: f64);
 }
diff --git a/src/optim/lr_scheduler.rs b/src/optim/lr_scheduler.rs
new file mode 100644
index 0000000..f05d647
--- /dev/null
+++ b/src/optim/lr_scheduler.rs
@@ -0,0 +1,7 @@
+pub mod multistep_lr;
+
+/// A learning-rate schedule that is advanced by calling [`Scheduler::step`].
+pub trait Scheduler {
+    /// Advances the schedule by one step.
+    fn step(&mut self);
+}
diff --git a/src/optim/lr_scheduler/multistep_lr.rs b/src/optim/lr_scheduler/multistep_lr.rs
new file mode 100644
index 0000000..847c663
--- /dev/null
+++ b/src/optim/lr_scheduler/multistep_lr.rs
@@ -0,0 +1,38 @@
+use crate::optim::Optim;
+
+use super::Scheduler;
+
+/// Scales an optimizer's learning rate by `gamma` whenever a milestone step is reached.
+pub struct MultiStepLR {
+    optimizer: Box<dyn Optim>,
+    pub milestones: Vec<usize>,
+    pub gamma: f64,
+    /// Number of completed `step` calls.
+    count: usize,
+}
+
+impl MultiStepLR {
+    /// Wraps `optimizer`; `milestones` are compared against the zero-based step count.
+    pub fn new(optimizer: Box<dyn Optim>, milestones: &[usize], gamma: f64) -> Self {
+        Self {
+            optimizer,
+            milestones: milestones.to_vec(),
+            gamma,
+            count: 0,
+        }
+    }
+}
+
+impl Scheduler for MultiStepLR {
+    /// Applies every decay due at the current count, then steps the wrapped optimizer.
+    fn step(&mut self) {
+        // a milestone listed several times scales the rate once per occurrence
+        for &m in &self.milestones {
+            if m == self.count {
+                self.optimizer.change_lr(self.gamma);
+            }
+        }
+        self.optimizer.step();
+        self.count += 1;
+    }
+}
diff --git a/src/optim/sgd.rs b/src/optim/sgd.rs
index 462eb27..049a84c 100644
--- a/src/optim/sgd.rs
+++ b/src/optim/sgd.rs
@@ -9,6 +9,7 @@ use super::Optim;
 /// It has:
 /// - parameters of the model
 /// - learning rate
+#[derive(Clone)]
 pub struct SGD {
     parameters: Vec<Tensor>,
     lr: f64,
@@ -57,4 +58,8 @@ impl Optim for SGD {
             self.parameters[i].inner.borrow_mut().zero_grad();
         }
     }
+
+    fn change_lr(&mut self, gamma: f64) {
+        self.lr *= gamma // multiplicative update: the stored rate becomes `lr * gamma`
+    }
 }
diff --git a/src/tensor.rs b/src/tensor.rs
index 910663e..229f20c 100644
--- a/src/tensor.rs
+++ b/src/tensor.rs
@@ -6,8 +6,6 @@ use std::{
     rc::Rc,
 };
 
-use rand::Rng;
-
 use crate::{backward::Backward, op::Op, tensor_data::TensorData};
 
 #[derive(Clone, Debug, PartialEq)]
@@ -140,7 +138,7 @@ impl Tensor {
     /// between -1 and 1.
     fn fill_tensor(tensor: &mut TensorData, range: Range<f64>) {
         for i in 0..tensor.data.len() {
-            let data = rand::thread_rng().gen_range(range.clone());
+            let data = rand::random_range(range.clone());
             tensor.data[i] = data;
             tensor.grad.as_mut().unwrap().push(0.0);
         }
@@ -232,7 +230,9 @@ impl Tensor {
         );
         let mut t = self.clone();
         if self.stride.iter().product::<usize>() == 0 {
-            panic!("view size is not compatible with size and stride of input tensor. Use .reshape(...) instead");
+            panic!(
+                "view size is not compatible with size and stride of input tensor. Use .reshape(...) instead"
+            );
         }
         let mut stride = vec![1; shape.len()];
         // compute stride
@@ -358,7 +358,12 @@ impl Tensor {
     ///
     /// Takes new shape.
     pub fn expand(&self, new_shape: &[usize]) -> Self {
-        assert!(self.shape().len() <= new_shape.len(), "The number of sizes provided ({:?}) must be equal or greater than the number of sizes in the tensor ({:?})", self.shape().len(), new_shape.len());
+        assert!(
+            self.shape().len() <= new_shape.len(),
+            "The number of sizes provided ({:?}) must be equal or greater than the number of sizes in the tensor ({:?})",
+            new_shape.len(),
+            self.shape().len()
+        );
         let mut t = self.clone();
         let mut _old_shape = self.shape();
         // check if batch dims have to be added in th front
@@ -374,7 +379,7 @@ impl Tensor {
         for i in (0..new_shape.len()).rev() {
             assert!(
                 old_shape[i] == new_shape[i] || (old_shape[i] == 1),
-                "The expanded size of the tensor ({}) must match the existing size ({}) at dimension ({})", 
+                "The expanded size of the tensor ({}) must match the existing size ({}) at dimension ({})",
                 new_shape[i],
                 old_shape[i],
                 i,
@@ -431,7 +436,7 @@ impl Tensor {
     /// Computes the gradients of all the tensors that have been interacting and
     /// have `requires_grad` set to `true`.
     pub fn backward(&self) {
-        let end_grad = self.inner.borrow()._prev[0]
+        let end_grad: Vec<f64> = self.inner.borrow()._prev[0]
             .item()
             .iter()
             .map(|a| a * 2.0)
diff --git a/tests/activations_test.rs b/tests/activations_test.rs
index 0f9f4d4..de5bd37 100644
--- a/tests/activations_test.rs
+++ b/tests/activations_test.rs
@@ -1,6 +1,6 @@
 #[cfg(test)]
 mod tests {
-    use cognius::{nn::functional as F, Tensor};
+    use athena::{nn::functional as F, Tensor};
 
     #[test]
     fn sigmoid() {
diff --git a/tests/data_test.rs b/tests/data_test.rs
index 5f539ee..98bdea7 100644
--- a/tests/data_test.rs
+++ b/tests/data_test.rs
@@ -1,6 +1,6 @@
 #[cfg(test)]
 mod tests {
-    use cognius::{
+    use athena::{
         data::{dataloader::Dataloader, dataset::Dataset, sample::Sample},
         Tensor,
     };
diff --git a/tests/linalg_test.rs b/tests/linalg_test.rs
index 7c507e7..9ee8a46 100644
--- a/tests/linalg_test.rs
+++ b/tests/linalg_test.rs
@@ -1,6 +1,6 @@
 #[cfg(test)]
 mod tests {
-    use cognius::{linalg, Tensor};
+    use athena::{linalg, Tensor};
 
     #[test]
     /// Matrix multiplication
@@ -68,7 +68,7 @@ mod tests {
             ],
             &[3, 1, 3, 1, 2, 3],
         );
-        let c = cognius::linalg::cross(a.clone(), b.clone());
+        let c = athena::linalg::cross(a.clone(), b.clone());
         let correct = vec![
             -0.5866, 1.0262, -0.4396, -1.9130, -0.8710, 2.7839, -0.5866, 1.0262, -0.4396, -1.9130,
             -0.8710, 2.7839, 0.8525, 0.7091, -1.5616, -1.0111, 3.0523, -2.0412, 0.8525, 0.7091,
diff --git a/tests/optim_test.rs b/tests/optim_test.rs
index 25169bf..b923d51 100644
--- a/tests/optim_test.rs
+++ b/tests/optim_test.rs
@@ -1,6 +1,6 @@
 #[cfg(test)]
 mod tests {
-    use cognius::{
+    use athena::{
         module::{Forward, Module},
         nn::{self, functional as F, Linear},
         optim::{Optim, SGD},
@@ -27,7 +27,7 @@ mod tests {
         let mlp = MLP::new([4, 2, 4, 2, 1]);
         let optim = SGD::new(mlp.parameters(), 1e-1);
         let x = Tensor::randn(&[10, 4]);
-        let criterion = nn::MSELoss::new();
+        let criterion = nn::MSELoss::new(None);
 
         let mut out = mlp.forward(x.clone());
         out = out.squeeze(&[]);
diff --git a/tests/tensor_test.rs b/tests/tensor_test.rs
index 55cfa8d..5d7b9be 100644
--- a/tests/tensor_test.rs
+++ b/tests/tensor_test.rs
@@ -1,6 +1,6 @@
 #[cfg(test)]
 mod tests {
-    use cognius::{randn, Tensor};
+    use athena::{randn, Tensor};
 
     #[test]
     /// Valid shape of the tensor