
Commit b930ea6

Switched from reverse mode to forward mode where possible.
This commit switches some functions that unnecessarily used reverse-mode autodiff over to forward-mode autodiff. In particular, this is to fix #51 (comment). Whilst I'm here, I noticed what looks like some incorrect handling of complex numbers. I've tried fixing those up, but at least as of this commit the test I've added fails. I've poked at this a bit but not yet been able to resolve it; it seems something is still awry!
1 parent d9b7ba6 commit b930ea6
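
For context (this snippet is not part of the commit): the underlying trick is that a gradient-direction product `grad^T v = residual^T (J v)` needs only a single Jacobian-vector product, i.e. forward-mode autodiff, whereas materialising `grad = J^T residual` first requires a vector-Jacobian product, i.e. reverse mode. A minimal sketch in plain JAX, with an illustrative toy residual:

import jax
import jax.numpy as jnp


def residual(y):
    return jnp.stack([y[0] ** 2 - y[1], jnp.sin(y[1])])


y = jnp.array([1.0, 2.0])
v = jnp.array([0.1, -0.3])

# Forward mode: a single JVP gives `J @ v`, from which `grad^T v = r^T (J v)`.
r, jv = jax.jvp(residual, (y,), (v,))
grad_dot_fwd = jnp.vdot(r, jv)

# Reverse mode: build the VJP, form `grad = J^T r` explicitly, then dot with `v`.
_, vjp_fn = jax.vjp(residual, y)
(grad,) = vjp_fn(r)
grad_dot_rev = jnp.vdot(grad, v)

assert jnp.allclose(grad_dot_fwd, grad_dot_rev)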

9 files changed

Lines changed: 153 additions & 31 deletions

optimistix/_search.py

Lines changed: 51 additions & 9 deletions
@@ -26,7 +26,10 @@
 from typing import ClassVar, Generic, Type, TypeVar

 import equinox as eqx
+import jax.numpy as jnp
+import jax.tree_util as jtu
 import lineax as lx
+from equinox.internal import ω
 from jaxtyping import Array, Bool, Scalar

 from ._custom_types import (
@@ -35,7 +38,7 @@
     SearchState,
     Y,
 )
-from ._misc import sum_squares
+from ._misc import sum_squares, tree_dot
 from ._solution import RESULTS


@@ -89,6 +92,9 @@ class EvalGrad(FunctionInfo, Generic[Y], strict=True):
     def as_min(self):
         return self.f

+    def compute_grad_dot(self, y: Y):
+        return tree_dot(self.grad, y)
+

 # NOT PUBLIC, despite lacking an underscore. This is so pyright gets the name right.
 class EvalGradHessian(FunctionInfo, Generic[Y], strict=True):
@@ -104,6 +110,9 @@ class EvalGradHessian(FunctionInfo, Generic[Y], strict=True):
     def as_min(self):
         return self.f

+    def compute_grad_dot(self, y: Y):
+        return tree_dot(self.grad, y)
+

 # NOT PUBLIC, despite lacking an underscore. This is so pyright gets the name right.
 class EvalGradHessianInv(FunctionInfo, Generic[Y], strict=True):
@@ -118,6 +127,9 @@ class EvalGradHessianInv(FunctionInfo, Generic[Y], strict=True):
     def as_min(self):
         return self.f

+    def compute_grad_dot(self, y: Y):
+        return tree_dot(self.grad, y)
+

 # NOT PUBLIC, despite lacking an underscore. This is so pyright gets the name right.
 class Residual(FunctionInfo, Generic[Out], strict=True):
@@ -144,18 +156,48 @@ class ResidualJac(FunctionInfo, Generic[Y, Out], strict=True):

     residual: Out
     jac: lx.AbstractLinearOperator
-    grad: Y
-
-    def __init__(self, residual: Out, jac: lx.AbstractLinearOperator):
-        self.residual = residual
-        self.jac = jac
-        # The gradient is used ubiquitously, so compute it once here, so that it can be
-        # used without recomputation in both the descent and search.
-        self.grad = jac.transpose().mv(residual)

     def as_min(self):
         return 0.5 * sum_squares(self.residual)

+    def compute_grad(self):
+        # Not precomputed during `__init__`, as this may hit reverse-mode autodiff,
+        # which may not be valid.
+        if any(jnp.iscomplexobj(x) for x in jtu.tree_leaves(self.residual)):
+            conj_residual = jtu.tree_map(jnp.conj, self.residual)
+            conj_jac = lx.conj(self.jac)
+            return (
+                0.5
+                * (
+                    self.jac.transpose().mv(conj_residual) ** ω
+                    + conj_jac.transpose().mv(self.residual) ** ω
+                )
+            ).ω
+        else:
+            return self.jac.transpose().mv(self.residual)
+
+    def compute_grad_dot(self, y: Y):
+        # If `self.jac` is a `lx.JacobianLinearOperator` (or a
+        # `lx.FunctionLinearOperator` wrapping the result of `jax.linearize`), then
+        # `grad = jac^T residual`, so that what we want to compute is
+        # `residual^T jac y`. Doing the reduction in this order means we hit
+        # forward-mode rather than reverse-mode autodiff.
+        if any(jnp.iscomplexobj(x) for x in jtu.tree_leaves(self.residual)):
+            # In this case then actually
+            # `grad = 0.5 * (jac^T conj(residual) + conj(jac)^T residual)`, and the
+            # same reordering applies to all of this.
+            conj_residual = jtu.tree_map(jnp.conj, self.residual)
+            conj_jac = lx.conj(self.jac)
+            return (
+                0.5
+                * (
+                    tree_dot(conj_residual, self.jac.mv(y)) ** ω
+                    + tree_dot(self.residual, conj_jac.mv(y)) ** ω
+                )
+            ).ω
+        else:
+            return tree_dot(self.residual, self.jac.mv(y))
+

 Eval.__qualname__ = "FunctionInfo.Eval"
 EvalGrad.__qualname__ = "FunctionInfo.EvalGrad"
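
Not part of the commit, but a usage sketch of the new methods in the real-valued case, mirroring how the test added below constructs a `FunctionInfo.ResidualJac` directly:

import jax
import jax.numpy as jnp
import lineax as lx
import optimistix as optx


def residual(y):
    return jnp.array([y[0] ** 2 - 1.0, y[0] * y[1], y[1] - 2.0])


y = jnp.array([1.5, 0.5])
jac = lx.MatrixLinearOperator(jax.jacfwd(residual)(y))
f_info = optx.FunctionInfo.ResidualJac(residual(y), jac)

# For real residuals, `compute_grad` is just `J^T r`, i.e. the gradient of
# `as_min() = 0.5 * ||r||^2`.
grad = f_info.compute_grad()
assert jnp.allclose(grad, jac.transpose().mv(residual(y)))
assert jnp.allclose(grad, jax.grad(lambda y: 0.5 * jnp.sum(residual(y) ** 2))(y))

# `compute_grad_dot(v)` gives `grad^T v` without materialising `grad` at all.
v = jnp.array([1.0, -1.0])
assert jnp.allclose(f_info.compute_grad_dot(v), jnp.dot(grad, v))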

optimistix/_solver/backtracking.py

Lines changed: 3 additions & 8 deletions
@@ -7,9 +7,6 @@
 from jaxtyping import Array, Bool, Scalar, ScalarLike

 from .._custom_types import Y
-from .._misc import (
-    tree_dot,
-)
 from .._search import AbstractSearch, FunctionInfo
 from .._solution import RESULTS

@@ -55,7 +52,7 @@ def __post_init__(self):
         )

     def init(self, y: Y, f_info_struct: _FnInfo) -> _BacktrackingState:
-        del f_info_struct
+        del y, f_info_struct
         return _BacktrackingState(step_size=jnp.array(self.step_init))

     def step(
@@ -67,7 +64,7 @@ def step(
         f_eval_info: _FnEvalInfo,
         state: _BacktrackingState,
     ) -> tuple[Scalar, Bool[Array, ""], RESULTS, _BacktrackingState]:
-        if isinstance(
+        if not isinstance(
             f_info,
             (
                 FunctionInfo.EvalGrad,
@@ -76,16 +73,14 @@
                 FunctionInfo.ResidualJac,
             ),
         ):
-            grad = f_info.grad
-        else:
             raise ValueError(
                 "Cannot use `BacktrackingArmijo` with this solver. This is because "
                 "`BacktrackingArmijo` requires gradients of the target function, but "
                 "this solver does not evaluate such gradients."
             )

         y_diff = (y_eval**ω - y**ω).ω
-        predicted_reduction = tree_dot(grad, y_diff)
+        predicted_reduction = f_info.compute_grad_dot(y_diff)
         # Terminate when the Armijo condition is satisfied. That is, `fn(y_eval)`
         # must do better than its linear approximation:
         # `fn(y_eval) < fn(y) + grad•y_diff`
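
The predicted reduction above is the first-order Taylor model `grad^T y_diff` of the change in the objective. A rough sketch of the Armijo acceptance test this feeds into (simplified to flat arrays; `slope` is a hypothetical parameter name, not necessarily the one used by `BacktrackingArmijo`):

import jax.numpy as jnp


def armijo_accept(f_old, f_new, grad, y_diff, slope=0.1):
    # First-order prediction of the decrease along the step `y_diff`.
    predicted_reduction = jnp.vdot(grad, y_diff)
    # Accept if the actual value beats the scaled linear prediction.
    return f_new <= f_old + slope * predicted_reduction


print(armijo_accept(1.0, 0.9, jnp.array([0.5, -0.2]), jnp.array([-0.4, 0.1])))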

optimistix/_solver/dogleg.py

Lines changed: 7 additions & 5 deletions
@@ -72,13 +72,15 @@ def query(
         f_info: Union[FunctionInfo.EvalGradHessian, FunctionInfo.ResidualJac],
         state: _DoglegDescentState,
     ) -> _DoglegDescentState:
-        del state
+        del y, state
         # Compute `denom = grad^T Hess grad.`
         if isinstance(f_info, FunctionInfo.EvalGradHessian):
-            denom = tree_dot(f_info.grad, f_info.hessian.mv(f_info.grad))
+            grad = f_info.grad
+            denom = tree_dot(f_info.grad, f_info.hessian.mv(grad))
         elif isinstance(f_info, FunctionInfo.ResidualJac):
             # Use Gauss--Newton approximation `Hess ~ J^T J`
-            denom = sum_squares(f_info.jac.mv(f_info.grad))
+            grad = f_info.compute_grad()
+            denom = sum_squares(f_info.jac.mv(grad))
         else:
             raise ValueError(
                 "`DoglegDescent` can only be used with least-squares solvers, or "
@@ -88,7 +90,7 @@ def query(
         denom_nonzero = denom > jnp.finfo(denom.dtype).eps
         safe_denom = jnp.where(denom_nonzero, denom, 1)
         # Compute `grad^T grad / (grad^T Hess grad)`
-        scaling = jnp.where(denom_nonzero, sum_squares(f_info.grad) / safe_denom, 0.0)
+        scaling = jnp.where(denom_nonzero, sum_squares(grad) / safe_denom, 0.0)
         scaling = cast(Array, scaling)

         # Downhill towards the bottom of the quadratic basin.
@@ -97,7 +99,7 @@ def query(
         newton_norm = self.trust_region_norm(newton_sol)

         # Downhill steepest descent.
-        cauchy = (-scaling * f_info.grad**ω).ω
+        cauchy = (-scaling * grad**ω).ω
         cauchy_norm = self.trust_region_norm(cauchy)

         return _DoglegDescentState(
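
Illustration only (dense arrays instead of pytrees and linear operators): the Cauchy point referenced above scales the negative gradient by `grad^T grad / (grad^T Hess grad)`, which is the exact minimiser of the quadratic model along the steepest-descent direction.

import jax.numpy as jnp

hess = jnp.array([[3.0, 0.5], [0.5, 2.0]])
grad = jnp.array([1.0, -2.0])

denom = grad @ hess @ grad  # grad^T Hess grad
scaling = jnp.where(denom > 1e-12, jnp.sum(grad**2) / denom, 0.0)
cauchy = -scaling * grad  # the steepest-descent leg of the dogleg
print(cauchy)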

optimistix/_solver/gradient_methods.py

Lines changed: 2 additions & 1 deletion
@@ -58,10 +58,11 @@ def query(
                 FunctionInfo.EvalGrad,
                 FunctionInfo.EvalGradHessian,
                 FunctionInfo.EvalGradHessianInv,
-                FunctionInfo.ResidualJac,
             ),
         ):
             grad = f_info.grad
+        elif isinstance(f_info, FunctionInfo.ResidualJac):
+            grad = f_info.compute_grad()
         else:
             raise ValueError(
                 "Cannot use `SteepestDescent` with this solver. This is because "

optimistix/_solver/nonlinear_cg.py

Lines changed: 10 additions & 6 deletions
@@ -119,15 +119,19 @@ def query(
         ],
         state: _NonlinearCGDescentState,
     ) -> _NonlinearCGDescentState:
-        if not isinstance(
+        del y
+        if isinstance(
             f_info,
             (
                 FunctionInfo.EvalGrad,
                 FunctionInfo.EvalGradHessian,
                 FunctionInfo.EvalGradHessianInv,
-                FunctionInfo.ResidualJac,
             ),
         ):
+            grad = f_info.grad
+        elif isinstance(f_info, FunctionInfo.ResidualJac):
+            grad = f_info.compute_grad()
+        else:
             raise ValueError(
                 "Cannot use `NonlinearCGDescent` with this solver. This is because "
                 "`NonlinearCGDescent` requires gradients of the target function, but "
@@ -140,16 +144,16 @@
         # Furthermore, the same mechanism handles convergence: once
         # `state.{grad, y_diff} = 0`, i.e. our previous step hit a local minima, then
         # on this next step we'll again just use gradient descent, and stop.
-        beta = self.method(f_info.grad, state.grad, state.y_diff)
-        neg_grad = (-(f_info.grad**ω)).ω
+        beta = self.method(grad, state.grad, state.y_diff)
+        neg_grad = (-(grad**ω)).ω
         nonlinear_cg_direction = (neg_grad**ω + beta * state.y_diff**ω).ω
         # Check if this is a descent direction. Use gradient descent if it isn't.
         y_diff = tree_where(
-            tree_dot(f_info.grad, nonlinear_cg_direction) < 0,
+            tree_dot(grad, nonlinear_cg_direction) < 0,
             nonlinear_cg_direction,
             neg_grad,
         )
-        return _NonlinearCGDescentState(y_diff=y_diff, grad=f_info.grad)
+        return _NonlinearCGDescentState(y_diff=y_diff, grad=grad)

     def step(
         self, step_size: Scalar, state: _NonlinearCGDescentState
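
A simplified sketch of how the direction above is assembled, using flat arrays and a Polak-Ribiere-style formula for `beta` (the actual rule is whatever `self.method` is configured to be):

import jax.numpy as jnp

grad = jnp.array([0.8, -0.1])  # current gradient
prev_grad = jnp.array([1.0, 0.2])  # state.grad
prev_step = jnp.array([-0.5, 0.3])  # state.y_diff, the previous step

beta = jnp.dot(grad, grad - prev_grad) / jnp.dot(prev_grad, prev_grad)
direction = -grad + beta * prev_step
# Fall back to plain steepest descent if this is not a descent direction.
direction = jnp.where(jnp.dot(grad, direction) < 0, direction, -grad)
print(direction)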

optimistix/_solver/trust_region.py

Lines changed: 1 addition & 1 deletion
@@ -273,7 +273,7 @@ def predict_reduction(
                 FunctionInfo.ResidualJac,
             ),
         ):
-            return tree_dot(f_info.grad, y_diff)
+            return f_info.compute_grad_dot(y_diff)
         else:
             raise ValueError(
                 "Cannot use `LinearTrustRegion` with this solver. This is because "

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -33,7 +33,7 @@ classifiers = [
     "Topic :: Scientific/Engineering :: Mathematics",
 ]
 urls = {repository = "https://github.com/patrick-kidger/optimistix" }
-dependencies = ["jax>=0.4.18", "jaxtyping>=0.2.23", "lineax>=0.0.4", "equinox>=0.11.1", "typing_extensions>=4.5.0"]
+dependencies = ["jax>=0.4.18", "jaxtyping>=0.2.23", "lineax>=0.0.5", "equinox>=0.11.1", "typing_extensions>=4.5.0"]

 [build-system]
 requires = ["hatchling"]

tests/test_least_squares.py

Lines changed: 63 additions & 0 deletions
@@ -149,3 +149,66 @@ def f_bwd(sign, g):

     with pytest.raises(TypeError, match="forward-mode autodiff"):
         optx.least_squares(f, solver, y0, options=dict(jac="fwd"), max_steps=512)
+
+
+def test_residual_jac():
+    # First grab values as computed using the complex implementation. We compute the
+    # gradient both using the `.compute_grad` method (which uses a custom more-efficient
+    # approach using forward-mode autodiff) and the simple way using `jax.grad`.
+
+    def residual1(y1):
+        return y1**2
+
+    def compute1(y1):
+        r = residual1(y1)
+        jac = lx.MatrixLinearOperator(jax.jacfwd(residual1, holomorphic=True)(y1))
+        f_info = optx.FunctionInfo.ResidualJac(r, jac)
+        return f_info.as_min(), (f_info.compute_grad(), f_info.compute_grad_dot(z))
+
+    y1 = jnp.array([2 + 3j, 4 + 1j])
+    z = jnp.array([-1 + 0j, 2 - 5j])
+    true_min = 0.5 * jnp.sum(y1**2 * jnp.conj(y1**2))
+    (min1, (grad1, grad_dot1)), true_grad1 = jax.value_and_grad(compute1, has_aux=True)(
+        y1
+    )
+    true_grad_dot1 = jnp.sum(true_grad1 * jnp.conj(z))
+
+    # Next compute the same quantities using just the real implementation.
+
+    def residual2(y2):
+        real, imag = y2
+        return real**2 - imag**2, 2 * real * imag
+
+    def compute2(y2):
+        r = residual2(y2)
+        jac = lx.PyTreeLinearOperator(
+            jax.jacfwd(residual2)(y2), jax.eval_shape(lambda: y2)
+        )
+        f_info = optx.FunctionInfo.ResidualJac(r, jac)
+        return f_info.as_min(), (
+            f_info.compute_grad(),
+            f_info.compute_grad_dot((z.real, z.imag)),
+        )
+
+    y2 = (y1.real, y1.imag)
+    (min2, (grad2, grad_dot2)), true_grad2 = jax.value_and_grad(compute2, has_aux=True)(
+        y2
+    )
+    true_grad2_real, true_grad2_imag = true_grad2
+    true_grad_dot2 = (
+        true_grad2_real * z.real + true_grad2_imag * z.imag,
+        true_grad2_imag * z.real - true_grad2_real * z.imag,
+    )
+
+    # Now check consistency.
+
+    assert tree_allclose(min1, min2)
+    assert tree_allclose(min1.astype(jnp.complex128), true_min)
+
+    assert tree_allclose(grad2, true_grad2)
+    assert tree_allclose((grad1.real, grad1.imag), grad2)
+    assert tree_allclose(grad1, true_grad1)
+
+    assert tree_allclose(grad_dot2, true_grad_dot2)
+    assert tree_allclose((grad_dot1.real, grad_dot1.imag), grad_dot2)
+    assert tree_allclose(grad_dot1, true_grad_dot1)
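
As a sanity check on the test's construction (not part of the commit): `residual2` is exactly the real/imaginary-pair form of `residual1`, i.e. of complex squaring, since `(a + b*i)**2 = (a**2 - b**2) + (2*a*b)*i`.

import jax.numpy as jnp

y = jnp.array([2 + 3j, 4 + 1j])
real_part = y.real**2 - y.imag**2
imag_part = 2 * y.real * y.imag
assert jnp.allclose(real_part, (y**2).real)
assert jnp.allclose(imag_part, (y**2).imag)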

tests/test_solve.py

Lines changed: 15 additions & 0 deletions
@@ -1,4 +1,6 @@
+import equinox.internal as eqxi
 import jax
+import jax.numpy as jnp
 import optimistix as optx


@@ -48,3 +50,16 @@ def fn(x, _):
         return optx.fixed_point(fn, solver, 0.0).value

     f(0.0)
+
+
+def test_forward_mode():
+    def f(y, _):
+        return eqxi.nondifferentiable_backward(y)
+
+    sol = optx.least_squares(
+        f,
+        optx.LevenbergMarquardt(rtol=1e-4, atol=1e-4),
+        jnp.arange(3.0),
+        options=dict(jac="fwd"),
+    )
+    return sol.value
