@@ -1,5 +1,7 @@
 from __future__ import annotations
-from typing import Tuple, Callable
+from typing import Callable
+
+from functools import partial
 
 import torch
 from torch.optim.optimizer import Optimizer
@@ -9,22 +11,50 @@
 def exists(val):
     return val is not None
 
+# tensor helpers
+
+def log(t, eps = 1e-20):
+    return t.clamp(min = eps).log()
+
+def entropy(prob):
+    return (-prob * log(prob)).sum(dim = -1)
+
+def spectral_entropy_reg_loss_hook(optimizer, weight, *args, **kwargs):
+    loss = torch.tensor(0.).requires_grad_()
+
+    for param_group in optimizer.param_groups:
+        for param in param_group['params']:
+            if param.ndim < 2:
+                continue
+
+            *_, row, col = param.shape
+            reshaped_param = param.reshape(-1, row, col)
+
+            singular_values = torch.linalg.svdvals(reshaped_param)
+            spectral_prob = singular_values.softmax(dim = -1)
+            spectral_entropy = entropy(spectral_prob).sum()
+            loss = loss + spectral_entropy
+
+    (loss * weight).backward()
+
 # class
 
 class GrokFastAdamW(Optimizer):
     def __init__(
         self,
         params,
         lr = 1e-4,
-        betas: Tuple[float, float] = (0.9, 0.99),
+        betas: tuple[float, float] = (0.9, 0.99),
         weight_decay = 0.,
         eps = 1e-8,
         regen_reg_rate = 0.,
         grokfast = True,
         grokfast_alpha = 0.98,
         grokfast_lamb = 2.,
         grokfast_after_step = 0,
-        normalize_lr = True
+        normalize_lr = True,
+        add_spectral_entropy_reg = False,
+        spectral_entropy_reg_weight = 0.1
     ):
         assert lr > 0.
         assert all([0. <= beta <= 1. for beta in betas])
@@ -55,6 +85,14 @@ def __init__(
 
         super().__init__(params, defaults)
 
+        # maybe spectral entropy reg
+        # https://openreview.net/forum?id=07N9jCfIE4
+
+        if not add_spectral_entropy_reg:
+            return
+
+        self.register_step_pre_hook(partial(spectral_entropy_reg_loss_hook, self, spectral_entropy_reg_weight))
+
     def turn_on_grokfast(self):
         for group in self.param_groups:
             group['grokfast'] = True
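
For reference, this is what the new step pre-hook computes, written out as a standalone sketch. Every parameter with at least two dimensions has its singular values turned into a probability distribution by a softmax, the Shannon entropy of that distribution is summed across parameters, and the result, scaled by spectral_entropy_reg_weight, is backpropagated just before optimizer.step() so its gradients are added on top of whatever task-loss gradients are already in .grad. The tensor shape, the weight tensor, and the reg_weight name below are illustrative only, not part of the commit.

    import torch

    # illustrative weight; the hook treats any parameter with ndim >= 2 this way
    weight = torch.randn(512, 256, requires_grad = True)

    # singular values of the (batched) matrix, shape (1, 256)
    singular_values = torch.linalg.svdvals(weight.reshape(-1, *weight.shape[-2:]))

    # softmax over singular values -> distribution, then Shannon entropy
    spectral_prob = singular_values.softmax(dim = -1)
    spectral_entropy = (-spectral_prob * spectral_prob.clamp(min = 1e-20).log()).sum(dim = -1).sum()

    reg_weight = 0.1  # default spectral_entropy_reg_weight in this commit
    (reg_weight * spectral_entropy).backward()

    print(weight.grad.shape)  # torch.Size([512, 256]) - gradients flow back through svdvals

Because it is registered with register_step_pre_hook, the regularizer needs no change to the training loop: its gradient contribution is accumulated right before each parameter update.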
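A minimal usage sketch of the interface added by this commit. The model, the dummy loss, and the package-level import of GrokFastAdamW are placeholders assumed for illustration; only the two new constructor arguments come from the diff above.

    import torch
    from torch import nn
    from grokfast_pytorch import GrokFastAdamW  # assumed package-level export

    model = nn.Linear(256, 128)

    opt = GrokFastAdamW(
        model.parameters(),
        lr = 1e-4,
        add_spectral_entropy_reg = True,    # opt into the new regularizer
        spectral_entropy_reg_weight = 0.1   # scale on the backpropagated entropy term
    )

    loss = model(torch.randn(2, 256)).sum()  # placeholder task loss
    loss.backward()

    opt.step()       # pre-hook adds the spectral entropy gradients, then the update runs
    opt.zero_grad()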