@@ -19,6 +19,7 @@ def __init__(
         betas: Tuple[float, float] = (0.9, 0.99),
         weight_decay = 0.,
         eps = 1e-8,
+        regen_reg_rate = 0.,
         grokfast = True,
         grokfast_alpha = 0.98,
         grokfast_lamb = 2.,
@@ -28,7 +29,9 @@ def __init__(
         assert lr > 0.
         assert all([0. <= beta <= 1. for beta in betas])
         assert weight_decay >= 0.
+        assert regen_reg_rate >= 0.
         assert eps > 0.
+        assert not (weight_decay > 0. and regen_reg_rate > 0.), 'weight decay and regenerative regularization cannot be used together'

         # in order for fair comparison
         # reduce the learning rate by a factor of (1 + grokfast_lamb)
@@ -43,6 +46,7 @@ def __init__(
             betas = betas,
             eps = eps,
             weight_decay = weight_decay,
+            regen_reg_rate = regen_reg_rate,
             grokfast = grokfast,
             grokfast_alpha = grokfast_alpha,
             grokfast_lamb = grokfast_lamb,
@@ -79,20 +83,31 @@ def step(
         for group in self.param_groups:
             for p in filter(lambda p: exists(p.grad), group['params']):

-                grad, lr, wd, beta1, beta2, eps, grokfast, grokfast_after_step, alpha, lamb, state, init_lr = p.grad, group['lr'], group['weight_decay'], *group['betas'], group['eps'], group['grokfast'], group['grokfast_after_step'], group['grokfast_alpha'], group['grokfast_lamb'], self.state[p], self._init_lr
+                grad, lr, wd, regen_rate, beta1, beta2, eps, grokfast, grokfast_after_step, alpha, lamb, state, init_lr = p.grad, group['lr'], group['weight_decay'], group['regen_reg_rate'], *group['betas'], group['eps'], group['grokfast'], group['grokfast_after_step'], group['grokfast_alpha'], group['grokfast_lamb'], self.state[p], self._init_lr

                 # decoupled weight decay

                 if wd > 0.:
                     p.mul_(1. - lr / init_lr * wd)

+                # regenerative regularization - ICLR 2024
+                # https://openreview.net/forum?id=lyoOWX0e0O
+
+                if regen_rate > 0. and 'param_init' in state:
+                    param_init = state['param_init']
+
+                    p.lerp_(param_init, lr / init_lr * regen_rate)
+
                 # init state if needed

                 if len(state) == 0:
                     state['steps'] = 0
                     state['exp_avg'] = torch.zeros_like(grad)
                     state['exp_avg_sq'] = torch.zeros_like(grad)

+                    if regen_rate > 0.:
+                        state['param_init'] = p.data.clone()
+
                 # get some of the states

                 exp_avg, exp_avg_sq, steps = state['exp_avg'], state['exp_avg_sq'], state['steps']
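For context on the change above: regenerative regularization (Kumar et al., ICLR 2024, linked in the diff) is a decoupled pull of the weights toward their initial values rather than toward zero. Below is a minimal sketch of that single update in isolation; the tensor and hyperparameter names mirror the diff, but the concrete values are illustrative assumptions, not part of the commit.

import torch

# standalone sketch of the regenerative regularization step (values are made up)
p = torch.randn(10)              # current parameter
param_init = p.clone()           # snapshot stored as state['param_init'] on the first step

lr = 1e-4                        # current learning rate of the param group
init_lr = 1e-4                   # learning rate at construction time (self._init_lr)
regen_rate = 0.1                 # the new regen_reg_rate hyperparameter

t = lr / init_lr * regen_rate    # interpolation weight, scaled like the decoupled weight decay

# in-place linear interpolation: p <- (1 - t) * p + t * param_init,
# i.e. the weights are nudged back toward their initialization instead of toward zero
p.lerp_(param_init, t)

Seen this way, the mutual-exclusion assertion in __init__ reads naturally: decoupled weight decay (p.mul_(1. - lr / init_lr * wd)) is the same kind of pull with the target fixed at zero, so the two are treated as alternative regularizers rather than being combined.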