commanderxa · commanderxa · Apr 26, 2026 · Jul 21, 2025 · Apr 26, 2026
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,16 +1,16 @@
 [package]
-name = "cognius"
+name = "athena"
 # versions
 version = "0.1.0"
-edition = "2021"
-rust-version = "1.73.0"
+edition = "2024"
+rust-version = "1.95.0"
 # info
 authors = ["commanderxa"]
 readme = "README.md"
-repository = "https://github.com/CommanderXA/cognius"
+repository = "https://github.com/CommanderXA/athena"
 license-file = "LICENSE.txt"
 
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
-rand = "0.8.5"
+rand = "0.10.1"
diff --git a/README.md b/README.md
@@ -1,10 +1,10 @@
-# COGNIUS
+# ATHENA
 
 A pure `Rust` implementation of `micrograd`.
 
 ## Requirements
 
-Rust >= 1.73
+Rust >= 1.95
 
 ## Usage
 

diff --git a/examples/linear.rs b/examples/linear.rs
@@ -1,4 +1,4 @@
-use cognius::{
+use athena::{
     module::Forward,
     nn::{functional as F, Linear},
     Tensor,

diff --git a/examples/matmul.rs b/examples/matmul.rs
@@ -1,4 +1,4 @@
-use cognius::{linalg, Tensor};
+use athena::{linalg, Tensor};
 
 fn main() {
     let a = Tensor::randn(&[2, 3, 2]);

diff --git a/examples/mlp.rs b/examples/mlp.rs
@@ -1,4 +1,4 @@
-use cognius::{
+use athena::{
     module::{Forward, Module},
     nn::{functional as F, Linear, MSELoss},
     optim::{Optim, SGD},
@@ -7,9 +7,9 @@ use cognius::{
 
 fn main() {
     let epochs = 10;
-    let criterion = MSELoss::new();
-    let mlp = MLP::new([2, 1]);
-    let optim = SGD::new(mlp.parameters(), 3e-1);
+    let criterion = MSELoss::new(None);
+    let mlp = MLP::new([2, 16, 1]);
+    let optim = SGD::new(mlp.parameters(), 3e-3);
 
     let data = vec![
         Tensor::tensor(&[9., 3.], &[2]),
@@ -76,17 +76,18 @@ fn main() {
         );
         loss.backward();
     }
-    println!("MODEL: {:?}", mlp.parameters());
 }
 
 struct MLP {
     linear: Linear,
+    linear1: Linear,
 }
 
 impl MLP {
-    pub fn new(features: [usize; 2]) -> Self {
+    pub fn new(features: [usize; 3]) -> Self {
         Self {
             linear: Linear::new(features[0], features[1]),
+            linear1: Linear::new(features[1], features[2]),
         }
     }
 }
@@ -97,14 +98,17 @@ impl Module for MLP {
     }
 
     fn parameters(&self) -> Vec<Tensor> {
-        let parameters = self.linear.parameters();
+        let mut parameters = self.linear.parameters();
+        parameters.append(&mut self.linear1.parameters());
         parameters
     }
 }
 
 impl Forward for MLP {
     fn forward(&self, x: Tensor) -> Tensor {
         let x = self.linear.forward(x);
+        let x = F::relu(x);
+        let x = self.linear1.forward(x);
         F::sigmoid(x)
     }
 }
diff --git a/examples/mlp2.rs b/examples/mlp2.rs
@@ -1,4 +1,4 @@
-use cognius::{
+use athena::{
     module::{Forward, Module},
     nn::{Linear, MSELoss},
     optim::{Optim, SGD},
@@ -7,7 +7,7 @@ use cognius::{
 
 fn main() {
     let epochs = 10;
-    let criterion = MSELoss::new();
+    let criterion = MSELoss::new(None);
     let mlp = MLP::new([1, 1]);
     let optim = SGD::new(mlp.parameters(), 3e-3);
 

diff --git a/examples/softmax.rs b/examples/softmax.rs
@@ -0,0 +1,8 @@
+use athena::{nn::functional as F, Tensor};
+
+fn main() { // prints a small tensor before and after softmax
+    let x = Tensor::tensor(&[0.24, 0.1, 0.5, 0.8, 1.2, 2.2], &[1, 2, 3]); // six values with shape [1, 2, 3]
+    println!("IN:\n{x}\n\n");
+    let x = F::softmax(x, 2); // index 2 is the last dimension of the rank-3 shape
+    println!("OUT:\n{x}");
+}
diff --git a/src/backward.rs b/src/backward.rs
@@ -118,7 +118,26 @@ impl Backward for Op {
                 t._prev[0].add_to_grad(dx);
             }
 
-            Op::MSE => {
+            Op::Softmax(x, _) => {
+                // Row-major n x n buffer: cell (i, j) lives at `i * n + j` (the old `i * j`
+                // aliased cells). Diagonal: s[i] * (1 - s[i]); elsewhere: -s[i] * s[j].
+                let t = tensor.inner.borrow();
+                let n = x.length();
+                let s = x.item();
+                let mut jacobian = vec![0.0; n * n];
+                for i in 0..n {
+                    for j in 0..n {
+                        let delta = if i == j { 1.0 } else { 0.0 };
+                        jacobian[i * n + j] = s[i] * (delta - s[j]);
+                    }
+                }
+                // NOTE(review): `s` is read from the op's stored tensor and all n * n entries are passed on; confirm.
+                let a = Tensor::tensor(&jacobian, &[n, n]).t();
+                t._prev[0].add_to_grad(a.item());
+            }
+
+            Op::MSE(n) => {
+                let n = *n as f64;
                 let t = tensor.inner.borrow();
                 let t_prev = t._prev[0].inner.borrow();
                 let t_sub = t_prev._prev[0].inner.borrow();
@@ -127,7 +146,7 @@ impl Backward for Op {
                 let grad = out
                     .iter()
                     .zip(target)
-                    .map(|(x, y)| 2.0 * (x - y))
+                    .map(|(x, y)| 2.0 / n * (x - y))
                     .collect::<Vec<f64>>();
                 drop(t_sub);
                 drop(t_prev);

diff --git a/src/data/dataloader.rs b/src/data/dataloader.rs
@@ -1,6 +1,6 @@
 use std::{cell::RefCell, rc::Rc};
 
-use rand::{seq::SliceRandom, thread_rng};
+use rand::{seq::SliceRandom, rng};
 
 use super::dataset::Dataset;
 
@@ -26,7 +26,7 @@ impl<T> DataloaderInner<T> {
 
     /// Mix the indices up to obtain random sequence
     fn shuffle_indices(&mut self) {
-        self.indices.shuffle(&mut thread_rng());
+        self.indices.shuffle(&mut rng());
     }
 }
 
@@ -48,7 +48,7 @@ impl<T> Dataloader<T> {
         }
         // shuffle all indices if it is specified so
         if shuffle {
-            indices.shuffle(&mut thread_rng());
+            indices.shuffle(&mut rng());
         }
         Self(Rc::new(RefCell::new(DataloaderInner {
             dataset,
@@ -68,6 +68,10 @@ impl<T> Dataloader<T> {
     pub fn is_shuffle(&self) -> bool {
         self.0.borrow().shuffle
     }
+
+    pub fn len(&self) -> usize {
+        self.0.borrow().dataset.len()
+    }
 }
 
 impl<T> Iterator for Dataloader<T> {

diff --git a/src/nn/criterions.rs b/src/nn/criterions.rs
@@ -1,22 +1,43 @@
 use crate::{op::Op, tensor_data::TensorData, Tensor};
 
-pub struct MSELoss {}
+#[derive(Clone, Copy, PartialEq, PartialOrd)]
+pub enum Reduction {
+    SUM,
+    MEAN,
+}
+
+#[derive(Clone)]
+pub struct MSELoss {
+    reduction: Option<Reduction>,
+}
 
 impl MSELoss {
-    pub fn new() -> Self {
-        Self {}
+    pub fn new(reduction: Option<Reduction>) -> Self {
+        Self {
+            reduction: reduction,
+        }
     }
 
     pub fn measure(&self, a: Tensor, b: Tensor) -> Tensor {
         let t = (a - b).pow(2);
         let shape = t.shape();
-        let inner = TensorData::from_op(t.item(), vec![t], Op::MSE);
+        let a = t.item();
+        let t_len = t.length() as f64;
+        let mut s = 0.0;
+        if let Some(reduction) = self.reduction {
+            s = a.iter().sum::<f64>();
+            if reduction == Reduction::MEAN {
+                s = s / t_len;
+            }
+        }
+        s /= t_len;
+        let inner = TensorData::from_op(vec![s], vec![t], Op::MSE(t_len as usize));
         Tensor::new(inner, &shape)
     }
 }
 
 impl Default for MSELoss {
     fn default() -> Self {
-        Self::new()
+        Self::new(None)
     }
 }
diff --git a/src/nn/functional.rs b/src/nn/functional.rs
@@ -15,3 +15,41 @@ pub fn sigmoid(x: Tensor) -> Tensor {
     let inner = TensorData::from_op(data.item(), vec![x.clone()], Op::Sigmoid(x));
     Tensor::new(inner, &data.shape)
 }
+
+/// Applies softmax along the last dimension of `x`.
+///
+/// The flattened data is processed in consecutive runs of `shape[dim]` values;
+/// each run is shifted by its maximum before exponentiation and then divided
+/// by the sum of its exponentials, so every run of the result sums to one.
+///
+/// # Panics
+///
+/// Panics if `x` has no dimensions or if `dim` is not the last dimension.
+pub fn softmax(x: Tensor, dim: usize) -> Tensor {
+    let shape = x.shape();
+    assert!(!shape.is_empty(), "Softmax requires at least one dimension.");
+    assert_eq!(
+        dim,
+        shape.len() - 1,
+        "Softmax for dimensions other than the last one is not supported."
+    );
+    let n = shape[dim];
+    let data = x.item();
+    let mut result = Vec::with_capacity(data.len());
+    // Rows are contiguous runs of `n` values. Walking the buffer run by run
+    // replaces the old `k * m + i * n` offset, which omitted the row length
+    // from the batch stride and so re-read earlier rows once there was more
+    // than one batch.
+    if n > 0 {
+        for row in data.chunks(n) {
+            // Subtracting the row maximum leaves the result unchanged
+            // mathematically and keeps `exp` from overflowing.
+            let max_x = row.iter().copied().fold(f64::NEG_INFINITY, f64::max);
+            let exp_x: Vec<f64> = row.iter().map(|&xi| (xi - max_x).exp()).collect();
+            let sum_exp_x: f64 = exp_x.iter().sum();
+            result.extend(exp_x.iter().map(|&ei| ei / sum_exp_x));
+        }
+    }
+    let inner = TensorData::from_op(result, vec![x.clone()], Op::Softmax(x, dim));
+    Tensor::new(inner, &shape)
+}
diff --git a/src/op.rs b/src/op.rs
@@ -13,7 +13,8 @@ pub enum Op {
     Cross,
     ReLU,
     Sigmoid(Tensor),
-    MSE,
+    Softmax(Tensor, usize),
+    MSE(usize),
 }
 
 impl std::fmt::Display for Op {
@@ -28,7 +29,8 @@ impl std::fmt::Display for Op {
             Op::Cross => write!(f, "Cross"),
             Op::ReLU => write!(f, "ReLU"),
             Op::Sigmoid(n) => write!(f, "Sigmoid({n})"),
-            Op::MSE => write!(f, "MSE"),
+            Op::Softmax(n, dim) => write!(f, "Softmax({n},{dim})"),
+            Op::MSE(n) => write!(f, "MSE({n})"),
         }
     }
 }
diff --git a/src/optim.rs b/src/optim.rs
@@ -1,3 +1,4 @@
+pub mod lr_scheduler;
 pub mod sgd;
 
 // Short paths for algorithms
@@ -10,4 +11,7 @@ pub trait Optim {
 
     /// Sets gradients to zero.
     fn zero_grad(&self);
+
+    /// changes the learning rate by gamma
+    fn change_lr(&mut self, gamma: f64);
 }
diff --git a/src/optim/lr_scheduler.rs b/src/optim/lr_scheduler.rs
@@ -0,0 +1,7 @@
+pub mod multistep_lr;
+
+/// Interface shared by the learning-rate schedulers in this module.
+pub trait Scheduler {
+    /// Performs one scheduling step; the effect is defined by the implementor.
+    fn step(&mut self);
+}
diff --git a/src/optim/lr_scheduler/multistep_lr.rs b/src/optim/lr_scheduler/multistep_lr.rs
@@ -0,0 +1,33 @@
+use crate::optim::Optim;
+
+use super::Scheduler;
+
+/// Scheduler that asks the wrapped optimizer to change its learning rate at set step counts.
+pub struct MultiStepLR {
+    optimizer: Box<dyn Optim>,
+    pub milestones: Vec<usize>,
+    pub gamma: f64,
+    count: usize,
+}
+
+impl MultiStepLR {
+    /// Wraps `optimizer`, copying `milestones`; the step counter starts at zero.
+    pub fn new(optimizer: Box<dyn Optim>, milestones: &[usize], gamma: f64) -> Self {
+        let milestones = milestones.to_vec();
+        let count = 0;
+        Self { optimizer, milestones, gamma, count }
+    }
+}
+
+impl Scheduler for MultiStepLR {
+    /// Applies `gamma` once per milestone equal to the current count, then
+    /// steps the wrapped optimizer and advances the count.
+    fn step(&mut self) {
+        let hits = self.milestones.iter().filter(|&&m| m == self.count).count();
+        for _ in 0..hits {
+            self.optimizer.change_lr(self.gamma);
+        }
+        self.optimizer.step();
+        self.count += 1;
+    }
+}
diff --git a/src/optim/sgd.rs b/src/optim/sgd.rs
@@ -9,6 +9,7 @@ use super::Optim;
 /// It has:
 /// - parameters of the model
 /// - learning rate
+#[derive(Clone)]
 pub struct SGD {
     parameters: Vec<Tensor>,
     lr: f64,
@@ -57,4 +58,8 @@ impl Optim for SGD {
             self.parameters[i].inner.borrow_mut().zero_grad();
         }
     }
+
+    fn change_lr(&mut self, gamma: f64) {
+        self.lr *= gamma
+    }
 }