From 45765e416970a50d724a4ef3578ca572492d1813 Mon Sep 17 00:00:00 2001 From: Penelope Yong Date: Wed, 10 Dec 2025 18:06:13 +0000 Subject: [PATCH 1/3] Enable closure/non-closure case for LogDensityFunction --- HISTORY.md | 4 + Project.toml | 2 +- src/logdensityfunction.jl | 153 +++++++++++++++++++++++++++----- test/integration/enzyme/main.jl | 10 +-- 4 files changed, 138 insertions(+), 31 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 3b0dec5e2..9efd5f6eb 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,9 @@ # DynamicPPL Changelog +## 0.39.2 + +The internals of `LogDensityFunction` have been changed slightly so that you do not need to specify `function_annotation` when performing AD with Enzyme.jl. + ## 0.39.1 `LogDensityFunction` now allows you to call `logdensity_and_gradient(ldf, x)` with `AbstractVector`s `x` that are not plain Vectors (they will be converted internally before calculating the gradient). diff --git a/Project.toml b/Project.toml index d869026e9..7b995f530 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "DynamicPPL" uuid = "366bfd00-2699-11ea-058f-f148b4cae6d8" -version = "0.39.1" +version = "0.39.2" [deps] ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b" diff --git a/src/logdensityfunction.jl b/src/logdensityfunction.jl index 3008a329b..436321f89 100644 --- a/src/logdensityfunction.jl +++ b/src/logdensityfunction.jl @@ -191,11 +191,18 @@ struct LogDensityFunction{ else # Make backend-specific tweaks to the adtype adtype = DynamicPPL.tweak_adtype(adtype, model, varinfo) - DI.prepare_gradient( - LogDensityAt{Tlink}(model, getlogdensity, all_iden_ranges, all_ranges), - adtype, - x, - ) + args = (model, getlogdensity, all_iden_ranges, all_ranges) + if _use_closure(adtype) + DI.prepare_gradient(LogDensityAt{Tlink}(args...), adtype, x) + else + DI.prepare_gradient( + logdensity_at, + adtype, + x, + DI.Constant(Val{Tlink}()), + map(DI.Constant, args)..., + ) + end end return new{ Tlink, @@ -235,6 +242,47 @@ end ldf_accs(::typeof(getlogprior)) = AccumulatorTuple((LogPriorAccumulator(),)) ldf_accs(::typeof(getloglikelihood)) = AccumulatorTuple((LogLikelihoodAccumulator(),)) +""" + logdensity_at( + params::AbstractVector{<:Real}, + ::Val{Tlink}, + model::Model, + getlogdensity::Function, + iden_varname_ranges::NamedTuple, + varname_ranges::Dict{VarName,RangeAndLinked}, + ) where {Tlink} + +Calculate the log density at the given `params`, using the provided +information extracted from a `LogDensityFunction`. +""" +function logdensity_at( + params::AbstractVector{<:Real}, + ::Val{Tlink}, + model::Model, + getlogdensity::Function, + iden_varname_ranges::NamedTuple, + varname_ranges::Dict{VarName,RangeAndLinked}, +) where {Tlink} + strategy = InitFromParams( + VectorWithRanges{Tlink}(iden_varname_ranges, varname_ranges, params), nothing + ) + accs = ldf_accs(getlogdensity) + _, vi = DynamicPPL.init!!(model, OnlyAccsVarInfo(accs), strategy) + return getlogdensity(vi) +end + +""" + LogDensityAt{Tlink}( + model::Model, + getlogdensity::Function, + iden_varname_ranges::NamedTuple, + varname_ranges::Dict{VarName,RangeAndLinked}, + ) where {Tlink} + +A callable struct that behaves in the same way as `logdensity_at`, but stores the model and +other information internally. Having two separate functions/structs allows for better +performance with AD backends. 
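+
+As a rough usage sketch (the argument names below simply mirror the fields listed in the
+signature above, and `params` stands for any real-valued parameter vector):
+
+    f = LogDensityAt{Tlink}(model, getlogdensity, iden_varname_ranges, varname_ranges)
+    # Calling the struct computes the same value as the corresponding `logdensity_at` call:
+    f(params)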
+""" struct LogDensityAt{Tlink,M<:Model,F<:Function,N<:NamedTuple} model::M getlogdensity::F @@ -251,36 +299,57 @@ struct LogDensityAt{Tlink,M<:Model,F<:Function,N<:NamedTuple} end end function (f::LogDensityAt{Tlink})(params::AbstractVector{<:Real}) where {Tlink} - strategy = InitFromParams( - VectorWithRanges{Tlink}(f.iden_varname_ranges, f.varname_ranges, params), nothing + return logdensity_at( + params, + Val{Tlink}(), + f.model, + f.getlogdensity, + f.iden_varname_ranges, + f.varname_ranges, ) - accs = ldf_accs(f.getlogdensity) - _, vi = DynamicPPL.init!!(f.model, OnlyAccsVarInfo(accs), strategy) - return f.getlogdensity(vi) end function LogDensityProblems.logdensity( ldf::LogDensityFunction{Tlink}, params::AbstractVector{<:Real} ) where {Tlink} - return LogDensityAt{Tlink}( - ldf.model, ldf._getlogdensity, ldf._iden_varname_ranges, ldf._varname_ranges - )( - params + return logdensity_at( + params, + Val{Tlink}(), + ldf.model, + ldf._getlogdensity, + ldf._iden_varname_ranges, + ldf._varname_ranges, ) end function LogDensityProblems.logdensity_and_gradient( ldf::LogDensityFunction{Tlink}, params::AbstractVector{<:Real} ) where {Tlink} + # `params` has to be converted to the same vector type that was used for AD preparation, + # otherwise the preparation will not be valid. params = convert(_get_input_vector_type(ldf), params) - return DI.value_and_gradient( - LogDensityAt{Tlink}( - ldf.model, ldf._getlogdensity, ldf._iden_varname_ranges, ldf._varname_ranges - ), - ldf._adprep, - ldf.adtype, - params, - ) + return if _use_closure(ldf.adtype) + DI.value_and_gradient( + LogDensityAt{Tlink}( + ldf.model, ldf._getlogdensity, ldf._iden_varname_ranges, ldf._varname_ranges + ), + ldf._adprep, + ldf.adtype, + params, + ) + else + DI.value_and_gradient( + logdensity_at, + ldf._adprep, + ldf.adtype, + params, + DI.Constant(Val{Tlink}()), + DI.Constant(ldf.model), + DI.Constant(ldf._getlogdensity), + DI.Constant(ldf._iden_varname_ranges), + DI.Constant(ldf._varname_ranges), + ) + end end function LogDensityProblems.capabilities( @@ -314,6 +383,46 @@ By default, this just returns the input unchanged. """ tweak_adtype(adtype::ADTypes.AbstractADType, ::Model, ::AbstractVarInfo) = adtype +""" + _use_closure(adtype::ADTypes.AbstractADType) + +In LogDensityProblems, we want to calculate the derivative of logdensity(f, x) +with respect to x, where f is the model (in our case LogDensityFunction) and is +a constant. However, DifferentiationInterface generally expects a +single-argument function g(x) to differentiate. + +There are two ways of dealing with this: + +1. Construct a closure over the model, i.e. let g = Base.Fix1(logdensity, f) + +2. Use a constant DI.Context. This lets us pass a two-argument function to DI, + as long as we also give it the 'inactive argument' (i.e. the model) wrapped + in `DI.Constant`. + +The relative performance of the two approaches, however, depends on the AD +backend used. Some benchmarks are provided here: +https://github.com/TuringLang/DynamicPPL.jl/issues/946#issuecomment-2931604829 + +This function is used to determine whether a given AD backend should use a +closure or a constant. If `use_closure(adtype)` returns `true`, then the +closure approach will be used. By default, this function returns `false`, i.e. +the constant approach will be used. 
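+
+Schematically, the two approaches correspond to the following DifferentiationInterface calls
+(a sketch only: `ldf_args` is a placeholder for the model, log-density getter, and range
+information stored in the `LogDensityFunction`, while `prep`, `adtype`, and `x` are the
+prepared gradient, the AD backend, and the parameter vector):
+
+    # closure approach
+    DI.value_and_gradient(LogDensityAt{Tlink}(ldf_args...), prep, adtype, x)
+    # constant-context approach
+    DI.value_and_gradient(
+        logdensity_at, prep, adtype, x,
+        DI.Constant(Val{Tlink}()), map(DI.Constant, ldf_args)...,
+    )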
+"""
+# For these AD backends both closure and no closure work, but it is just faster to not use a
+# closure
+_use_closure(::ADTypes.AutoForwardDiff) = false
+_use_closure(::ADTypes.AutoMooncake) = false
+_use_closure(::ADTypes.AutoMooncakeForward) = false
+# For ReverseDiff, with the compiled tape, you _must_ use a closure because otherwise with
+# DI.Constant arguments the tape will always be recompiled upon each call to
+# value_and_gradient. For non-compiled ReverseDiff, it is faster to not use a closure.
+_use_closure(::ADTypes.AutoReverseDiff{compile}) where {compile} = compile
+# For AutoEnzyme, not using a closure allows us to avoid setting function_annotation.
+_use_closure(::ADTypes.AutoEnzyme) = false
+# Since for most backends it's faster to not use a closure, we set that as the default
+# for unknown AD backends
+_use_closure(::ADTypes.AbstractADType) = false
+
 ######################################################
 # Helper functions to extract ranges and link status #
 ######################################################
diff --git a/test/integration/enzyme/main.jl b/test/integration/enzyme/main.jl
index edfd67d18..0051e55b5 100644
--- a/test/integration/enzyme/main.jl
+++ b/test/integration/enzyme/main.jl
@@ -6,14 +6,8 @@ import Enzyme: set_runtime_activity, Forward, Reverse, Const
 using ForwardDiff: ForwardDiff # run_ad uses FD for correctness test
 
 ADTYPES = (
-    (
-        "EnzymeForward",
-        AutoEnzyme(; mode=set_runtime_activity(Forward), function_annotation=Const),
-    ),
-    (
-        "EnzymeReverse",
-        AutoEnzyme(; mode=set_runtime_activity(Reverse), function_annotation=Const),
-    ),
+    ("EnzymeForward", AutoEnzyme(; mode=set_runtime_activity(Forward))),
+    ("EnzymeReverse", AutoEnzyme(; mode=set_runtime_activity(Reverse))),
 )
 
 @testset "$ad_key" for (ad_key, ad_type) in ADTYPES

From 396d0f9d9e4573d1e0ec74b4a233de8a16c62eb1 Mon Sep 17 00:00:00 2001
From: Penelope Yong
Date: Wed, 10 Dec 2025 18:07:00 +0000
Subject: [PATCH 2/3] Add changelog

---
 HISTORY.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/HISTORY.md b/HISTORY.md
index 9efd5f6eb..8bffe0f08 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -4,6 +4,8 @@
 
 The internals of `LogDensityFunction` have been changed slightly so that you do not need to specify `function_annotation` when performing AD with Enzyme.jl.
 
+There should also be some minor performance improvements (on the order of 10%) on AD with ForwardDiff / Mooncake.
+
 ## 0.39.1
 
 `LogDensityFunction` now allows you to call `logdensity_and_gradient(ldf, x)` with `AbstractVector`s `x` that are not plain Vectors (they will be converted internally before calculating the gradient).

From 94e62b87abf3f3ebb20033cc2b3a77315fc4b3d3 Mon Sep 17 00:00:00 2001
From: Penelope Yong
Date: Wed, 10 Dec 2025 18:08:31 +0000
Subject: [PATCH 3/3] Update link to benchmarks

---
 src/logdensityfunction.jl | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/src/logdensityfunction.jl b/src/logdensityfunction.jl
index 436321f89..ba61da27d 100644
--- a/src/logdensityfunction.jl
+++ b/src/logdensityfunction.jl
@@ -386,30 +386,27 @@ tweak_adtype(adtype::ADTypes.AbstractADType, ::Model, ::AbstractVarInfo) = adtyp
 """
     _use_closure(adtype::ADTypes.AbstractADType)
 
-In LogDensityProblems, we want to calculate the derivative of logdensity(f, x)
-with respect to x, where f is the model (in our case LogDensityFunction) and is
-a constant. However, DifferentiationInterface generally expects a
-single-argument function g(x) to differentiate.
+In LogDensityProblems, we want to calculate the derivative of `logdensity(f, x)` with
+respect to x, where f is the model (in our case LogDensityFunction or its arguments) and is
+a constant. However, DifferentiationInterface generally expects a single-argument function
+g(x) to differentiate.
 
 There are two ways of dealing with this:
 
 1. Construct a closure over the model, i.e. let g = Base.Fix1(logdensity, f)
 
-2. Use a constant DI.Context. This lets us pass a two-argument function to DI,
-   as long as we also give it the 'inactive argument' (i.e. the model) wrapped
-   in `DI.Constant`.
+2. Use a constant DI.Context. This lets us pass a two-argument function to DI, as long as we
+   also give it the 'inactive argument' (i.e. the model) wrapped in `DI.Constant`.
 
-The relative performance of the two approaches, however, depends on the AD
-backend used. Some benchmarks are provided here:
-https://github.com/TuringLang/DynamicPPL.jl/issues/946#issuecomment-2931604829
+The relative performance of the two approaches, however, depends on the AD backend used.
+Some benchmarks are provided here: https://github.com/TuringLang/DynamicPPL.jl/pull/1172
 
-This function is used to determine whether a given AD backend should use a
-closure or a constant. If `use_closure(adtype)` returns `true`, then the
-closure approach will be used. By default, this function returns `false`, i.e.
-the constant approach will be used.
+This function is used to determine whether a given AD backend should use a closure or a
+constant. If `_use_closure(adtype)` returns `true`, then the closure approach will be used.
+By default, this function returns `false`, i.e. the constant approach will be used.
 """
 # For these AD backends both closure and no closure work, but it is just faster to not use a
-# closure
+# closure (see link in the docstring).
 _use_closure(::ADTypes.AutoForwardDiff) = false
 _use_closure(::ADTypes.AutoMooncake) = false
 _use_closure(::ADTypes.AutoMooncakeForward) = false