From 45765e416970a50d724a4ef3578ca572492d1813 Mon Sep 17 00:00:00 2001 From: Penelope Yong Date: Wed, 10 Dec 2025 18:06:13 +0000 Subject: [PATCH 1/3] Enable closure/non-closure case for LogDensityFunction --- HISTORY.md | 4 + Project.toml | 2 +- src/logdensityfunction.jl | 153 +++++++++++++++++++++++++++----- test/integration/enzyme/main.jl | 10 +-- 4 files changed, 138 insertions(+), 31 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 3b0dec5e2..9efd5f6eb 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,9 @@ # DynamicPPL Changelog +## 0.39.2 + +The internals of `LogDensityFunction` have been changed slightly so that you do not need to specify `function_annotation` when performing AD with Enzyme.jl. + ## 0.39.1 `LogDensityFunction` now allows you to call `logdensity_and_gradient(ldf, x)` with `AbstractVector`s `x` that are not plain Vectors (they will be converted internally before calculating the gradient). diff --git a/Project.toml b/Project.toml index d869026e9..7b995f530 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "DynamicPPL" uuid = "366bfd00-2699-11ea-058f-f148b4cae6d8" -version = "0.39.1" +version = "0.39.2" [deps] ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b" diff --git a/src/logdensityfunction.jl b/src/logdensityfunction.jl index 3008a329b..436321f89 100644 --- a/src/logdensityfunction.jl +++ b/src/logdensityfunction.jl @@ -191,11 +191,18 @@ struct LogDensityFunction{ else # Make backend-specific tweaks to the adtype adtype = DynamicPPL.tweak_adtype(adtype, model, varinfo) - DI.prepare_gradient( - LogDensityAt{Tlink}(model, getlogdensity, all_iden_ranges, all_ranges), - adtype, - x, - ) + args = (model, getlogdensity, all_iden_ranges, all_ranges) + if _use_closure(adtype) + DI.prepare_gradient(LogDensityAt{Tlink}(args...), adtype, x) + else + DI.prepare_gradient( + logdensity_at, + adtype, + x, + DI.Constant(Val{Tlink}()), + map(DI.Constant, args)..., + ) + end end return new{ Tlink, @@ -235,6 +242,47 @@ end ldf_accs(::typeof(getlogprior)) = AccumulatorTuple((LogPriorAccumulator(),)) ldf_accs(::typeof(getloglikelihood)) = AccumulatorTuple((LogLikelihoodAccumulator(),)) +""" + logdensity_at( + params::AbstractVector{<:Real}, + ::Val{Tlink}, + model::Model, + getlogdensity::Function, + iden_varname_ranges::NamedTuple, + varname_ranges::Dict{VarName,RangeAndLinked}, + ) where {Tlink} + +Calculate the log density at the given `params`, using the provided +information extracted from a `LogDensityFunction`. +""" +function logdensity_at( + params::AbstractVector{<:Real}, + ::Val{Tlink}, + model::Model, + getlogdensity::Function, + iden_varname_ranges::NamedTuple, + varname_ranges::Dict{VarName,RangeAndLinked}, +) where {Tlink} + strategy = InitFromParams( + VectorWithRanges{Tlink}(iden_varname_ranges, varname_ranges, params), nothing + ) + accs = ldf_accs(getlogdensity) + _, vi = DynamicPPL.init!!(model, OnlyAccsVarInfo(accs), strategy) + return getlogdensity(vi) +end + +""" + LogDensityAt{Tlink}( + model::Model, + getlogdensity::Function, + iden_varname_ranges::NamedTuple, + varname_ranges::Dict{VarName,RangeAndLinked}, + ) where {Tlink} + +A callable struct that behaves in the same way as `logdensity_at`, but stores the model and +other information internally. Having two separate functions/structs allows for better +performance with AD backends. 
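+
+As a rough usage sketch (the argument names below simply mirror the fields listed in the
+signature above, and `params` stands for any real-valued parameter vector):
+
+    f = LogDensityAt{Tlink}(model, getlogdensity, iden_varname_ranges, varname_ranges)
+    # Calling the struct computes the same value as the corresponding `logdensity_at` call:
+    f(params)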
+""" struct LogDensityAt{Tlink,M<:Model,F<:Function,N<:NamedTuple} model::M getlogdensity::F @@ -251,36 +299,57 @@ struct LogDensityAt{Tlink,M<:Model,F<:Function,N<:NamedTuple} end end function (f::LogDensityAt{Tlink})(params::AbstractVector{<:Real}) where {Tlink} - strategy = InitFromParams( - VectorWithRanges{Tlink}(f.iden_varname_ranges, f.varname_ranges, params), nothing + return logdensity_at( + params, + Val{Tlink}(), + f.model, + f.getlogdensity, + f.iden_varname_ranges, + f.varname_ranges, ) - accs = ldf_accs(f.getlogdensity) - _, vi = DynamicPPL.init!!(f.model, OnlyAccsVarInfo(accs), strategy) - return f.getlogdensity(vi) end function LogDensityProblems.logdensity( ldf::LogDensityFunction{Tlink}, params::AbstractVector{<:Real} ) where {Tlink} - return LogDensityAt{Tlink}( - ldf.model, ldf._getlogdensity, ldf._iden_varname_ranges, ldf._varname_ranges - )( - params + return logdensity_at( + params, + Val{Tlink}(), + ldf.model, + ldf._getlogdensity, + ldf._iden_varname_ranges, + ldf._varname_ranges, ) end function LogDensityProblems.logdensity_and_gradient( ldf::LogDensityFunction{Tlink}, params::AbstractVector{<:Real} ) where {Tlink} + # `params` has to be converted to the same vector type that was used for AD preparation, + # otherwise the preparation will not be valid. params = convert(_get_input_vector_type(ldf), params) - return DI.value_and_gradient( - LogDensityAt{Tlink}( - ldf.model, ldf._getlogdensity, ldf._iden_varname_ranges, ldf._varname_ranges - ), - ldf._adprep, - ldf.adtype, - params, - ) + return if _use_closure(ldf.adtype) + DI.value_and_gradient( + LogDensityAt{Tlink}( + ldf.model, ldf._getlogdensity, ldf._iden_varname_ranges, ldf._varname_ranges + ), + ldf._adprep, + ldf.adtype, + params, + ) + else + DI.value_and_gradient( + logdensity_at, + ldf._adprep, + ldf.adtype, + params, + DI.Constant(Val{Tlink}()), + DI.Constant(ldf.model), + DI.Constant(ldf._getlogdensity), + DI.Constant(ldf._iden_varname_ranges), + DI.Constant(ldf._varname_ranges), + ) + end end function LogDensityProblems.capabilities( @@ -314,6 +383,46 @@ By default, this just returns the input unchanged. """ tweak_adtype(adtype::ADTypes.AbstractADType, ::Model, ::AbstractVarInfo) = adtype +""" + _use_closure(adtype::ADTypes.AbstractADType) + +In LogDensityProblems, we want to calculate the derivative of logdensity(f, x) +with respect to x, where f is the model (in our case LogDensityFunction) and is +a constant. However, DifferentiationInterface generally expects a +single-argument function g(x) to differentiate. + +There are two ways of dealing with this: + +1. Construct a closure over the model, i.e. let g = Base.Fix1(logdensity, f) + +2. Use a constant DI.Context. This lets us pass a two-argument function to DI, + as long as we also give it the 'inactive argument' (i.e. the model) wrapped + in `DI.Constant`. + +The relative performance of the two approaches, however, depends on the AD +backend used. Some benchmarks are provided here: +https://github.com/TuringLang/DynamicPPL.jl/issues/946#issuecomment-2931604829 + +This function is used to determine whether a given AD backend should use a +closure or a constant. If `use_closure(adtype)` returns `true`, then the +closure approach will be used. By default, this function returns `false`, i.e. +the constant approach will be used. 
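+
+Schematically, the two approaches correspond to the following DifferentiationInterface calls
+(a sketch only: `ldf_args` is a placeholder for the model, log-density getter, and range
+information stored in the `LogDensityFunction`, while `prep`, `adtype`, and `x` are the
+prepared gradient, the AD backend, and the parameter vector):
+
+    # closure approach
+    DI.value_and_gradient(LogDensityAt{Tlink}(ldf_args...), prep, adtype, x)
+    # constant-context approach
+    DI.value_and_gradient(
+        logdensity_at, prep, adtype, x,
+        DI.Constant(Val{Tlink}()), map(DI.Constant, ldf_args)...,
+    )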
+"""
+# For these AD backends both closure and no closure work, but it is just faster to not use a
+# closure
+_use_closure(::ADTypes.AutoForwardDiff) = false
+_use_closure(::ADTypes.AutoMooncake) = false
+_use_closure(::ADTypes.AutoMooncakeForward) = false
+# For ReverseDiff, with the compiled tape, you _must_ use a closure because otherwise with
+# DI.Constant arguments the tape will always be recompiled upon each call to
+# value_and_gradient. For non-compiled ReverseDiff, it is faster to not use a closure.
+_use_closure(::ADTypes.AutoReverseDiff{compile}) where {compile} = compile
+# For AutoEnzyme, not using a closure allows us to avoid setting function_annotation.
+_use_closure(::ADTypes.AutoEnzyme) = false
+# Since for most backends it's faster to not use a closure, we set that as the default
+# for unknown AD backends
+_use_closure(::ADTypes.AbstractADType) = false
+
 ######################################################
 # Helper functions to extract ranges and link status #
 ######################################################
diff --git a/test/integration/enzyme/main.jl b/test/integration/enzyme/main.jl
index edfd67d18..0051e55b5 100644
--- a/test/integration/enzyme/main.jl
+++ b/test/integration/enzyme/main.jl
@@ -6,14 +6,8 @@ import Enzyme: set_runtime_activity, Forward, Reverse, Const
 using ForwardDiff: ForwardDiff # run_ad uses FD for correctness test
 
 ADTYPES = (
-    (
-        "EnzymeForward",
-        AutoEnzyme(; mode=set_runtime_activity(Forward), function_annotation=Const),
-    ),
-    (
-        "EnzymeReverse",
-        AutoEnzyme(; mode=set_runtime_activity(Reverse), function_annotation=Const),
-    ),
+    ("EnzymeForward", AutoEnzyme(; mode=set_runtime_activity(Forward))),
+    ("EnzymeReverse", AutoEnzyme(; mode=set_runtime_activity(Reverse))),
 )
 
 @testset "$ad_key" for (ad_key, ad_type) in ADTYPES

From 396d0f9d9e4573d1e0ec74b4a233de8a16c62eb1 Mon Sep 17 00:00:00 2001
From: Penelope Yong
Date: Wed, 10 Dec 2025 18:07:00 +0000
Subject: [PATCH 2/3] Add changelog

---
 HISTORY.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/HISTORY.md b/HISTORY.md
index 9efd5f6eb..8bffe0f08 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -4,6 +4,8 @@
 
 The internals of `LogDensityFunction` have been changed slightly so that you do not need to specify `function_annotation` when performing AD with Enzyme.jl.
 
+There should also be some minor performance improvements (on the order of 10%) on AD with ForwardDiff / Mooncake.
+
 ## 0.39.1
 
 `LogDensityFunction` now allows you to call `logdensity_and_gradient(ldf, x)` with `AbstractVector`s `x` that are not plain Vectors (they will be converted internally before calculating the gradient).

From 94e62b87abf3f3ebb20033cc2b3a77315fc4b3d3 Mon Sep 17 00:00:00 2001
From: Penelope Yong
Date: Wed, 10 Dec 2025 18:08:31 +0000
Subject: [PATCH 3/3] Update link to benchmarks

---
 src/logdensityfunction.jl | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/src/logdensityfunction.jl b/src/logdensityfunction.jl
index 436321f89..ba61da27d 100644
--- a/src/logdensityfunction.jl
+++ b/src/logdensityfunction.jl
@@ -386,30 +386,27 @@ tweak_adtype(adtype::ADTypes.AbstractADType, ::Model, ::AbstractVarInfo) = adtyp
 """
     _use_closure(adtype::ADTypes.AbstractADType)
 
-In LogDensityProblems, we want to calculate the derivative of logdensity(f, x)
-with respect to x, where f is the model (in our case LogDensityFunction) and is
-a constant. However, DifferentiationInterface generally expects a
-single-argument function g(x) to differentiate.
+In LogDensityProblems, we want to calculate the derivative of `logdensity(f, x)` with
+respect to x, where f is the model (in our case LogDensityFunction or its arguments) and is
+a constant. However, DifferentiationInterface generally expects a single-argument function
+g(x) to differentiate.
 
 There are two ways of dealing with this:
 
 1. Construct a closure over the model, i.e. let g = Base.Fix1(logdensity, f)
 
-2. Use a constant DI.Context. This lets us pass a two-argument function to DI,
-   as long as we also give it the 'inactive argument' (i.e. the model) wrapped
-   in `DI.Constant`.
+2. Use a constant DI.Context. This lets us pass a two-argument function to DI, as long as we
+   also give it the 'inactive argument' (i.e. the model) wrapped in `DI.Constant`.
 
-The relative performance of the two approaches, however, depends on the AD
-backend used. Some benchmarks are provided here:
-https://github.com/TuringLang/DynamicPPL.jl/issues/946#issuecomment-2931604829
+The relative performance of the two approaches, however, depends on the AD backend used.
+Some benchmarks are provided here: https://github.com/TuringLang/DynamicPPL.jl/pull/1172
 
-This function is used to determine whether a given AD backend should use a
-closure or a constant. If `use_closure(adtype)` returns `true`, then the
-closure approach will be used. By default, this function returns `false`, i.e.
-the constant approach will be used.
+This function is used to determine whether a given AD backend should use a closure or a
+constant. If `_use_closure(adtype)` returns `true`, then the closure approach will be used.
+By default, this function returns `false`, i.e. the constant approach will be used.
 """
 # For these AD backends both closure and no closure work, but it is just faster to not use a
-# closure
+# closure (see link in the docstring).
 _use_closure(::ADTypes.AutoForwardDiff) = false
 _use_closure(::ADTypes.AutoMooncake) = false
 _use_closure(::ADTypes.AutoMooncakeForward) = false