Commit 9a245f5
try to add support for AbstractRegularized through an extended interface
1 parent a271756 commit 9a245f5

File tree

7 files changed (+30, -5 lines)


Project.toml

Lines changed: 2 additions & 0 deletions
@@ -9,6 +9,7 @@ DensityInterface = "b429d917-457f-4dbc-8f4c-0cc954292b1d"
 DifferentiableFrankWolfe = "b383313e-5450-4164-a800-befbd27b574d"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+RequiredInterfaces = "97f35ef4-7bc5-4ec1-a41a-dcc69c7308c6"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 ThreadsX = "ac1d9e8a-700a-412c-b207-f0111f4b6c0d"
@@ -23,6 +24,7 @@ InferOptFrankWolfeExt = "DifferentiableFrankWolfe"
 ChainRulesCore = "1"
 DensityInterface = "0.4.0"
 DifferentiableFrankWolfe = "0.1.2"
+RequiredInterfaces = "0.1.3"
 StatsBase = "0.33, 0.34"
 TestItemRunner = "0.2.2"
 ThreadsX = "0.1.11"
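
For reference, the same dependency addition can be made from the Julia REPL; a minimal sketch, with the version taken from the compat entry above:

using Pkg
# Add RequiredInterfaces at the version matching the declared compat bound
Pkg.add(name="RequiredInterfaces", version="0.1.3")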

src/InferOpt.jl

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@ using Random: AbstractRNG, GLOBAL_RNG, MersenneTwister, rand, seed!
 using Statistics: mean
 using StatsBase: StatsBase, sample
 using ThreadsX: ThreadsX
+using RequiredInterfaces

 include("interface.jl")

src/imitation/fenchel_young_loss.jl

Lines changed: 16 additions & 1 deletion
@@ -43,6 +43,21 @@ function fenchel_young_loss_and_grad(
     return l, g
 end

+function fenchel_young_loss_and_grad(
+    fyl::FenchelYoungLoss{O}, θ::AbstractArray, y_true::AbstractArray; kwargs...
+) where {O<:AbstractRegularized{<:GeneralizedMaximizer}}
+    (; optimization_layer) = fyl
+    ŷ = optimization_layer(θ; kwargs...)
+    Ωy_true = compute_regularization(optimization_layer, y_true)
+    Ωŷ = compute_regularization(optimization_layer, ŷ)
+    maximizer = get_maximizer(optimization_layer)
+    l =
+        (Ωy_true - objective_value(maximizer, θ, y_true; kwargs...)) -
+        (Ωŷ - objective_value(maximizer, θ, ŷ; kwargs...))
+    g = maximizer.g(ŷ; kwargs...) - maximizer.g(y_true; kwargs...)
+    return l, g
+end
+
 function fenchel_young_loss_and_grad(
     fyl::FenchelYoungLoss{O}, θ::AbstractArray, y_true::AbstractArray; kwargs...
 ) where {O<:AbstractPerturbed}
@@ -61,7 +76,7 @@ function fenchel_young_loss_and_grad(
         optimization_layer, θ; kwargs...
     )
     l = F - objective_value(optimization_layer.oracle, θ, y_true; kwargs...)
-    g = almost_g_of_ŷ - optimization_layer.oracle.g(y_true)
+    g = almost_g_of_ŷ - optimization_layer.oracle.g(y_true; kwargs...)
     return l, g
 end
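
For context: assuming InferOpt's convention that objective_value(maximizer, θ, y) returns θᵀg(y) + h(y) for a GeneralizedMaximizer, the new method evaluates the Fenchel-Young loss of the regularized layer ŷ(θ) = argmaxᵧ θᵀg(y) + h(y) − Ω(y), which in LaTeX notation reads:

% Fenchel-Young loss and gradient, assuming objective_value(m, θ, y) = θᵀg(y) + h(y)
\ell(\theta, y) =
    \left[ \Omega(y) - \theta^\top g(y) - h(y) \right]
  - \left[ \Omega(\hat{y}) - \theta^\top g(\hat{y}) - h(\hat{y}) \right],
\qquad
\nabla_\theta \ell(\theta, y) = g(\hat{y}) - g(y).

The gradient formula is exactly the line g = maximizer.g(ŷ; kwargs...) - maximizer.g(y_true; kwargs...) above.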

src/regularized/abstract_regularized.jl

Lines changed: 8 additions & 1 deletion
@@ -17,11 +17,18 @@ Convex regularization perturbation of a black box optimizer
 - [`SparseArgmax`](@ref)
 - [`RegularizedFrankWolfe`](@ref)
 """
-abstract type AbstractRegularized <: AbstractOptimizationLayer end
+abstract type AbstractRegularized{O} <: AbstractOptimizationLayer end

 """
     compute_regularization(regularized, y)

 Return the convex penalty `Ω(y)` associated with an `AbstractRegularized` layer.
 """
 function compute_regularization end
+
+function get_maximizer end
+
+@required AbstractRegularized begin
+    compute_regularization(::AbstractRegularized, ::Any)
+    get_maximizer(::AbstractRegularized)
+end
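
The @required macro comes from RequiredInterfaces.jl and turns these two functions into a formal contract for concrete subtypes. A hypothetical subtype satisfying it could look like this (the name and the quadratic penalty are illustrative, not part of the package):

# Hypothetical concrete layer, shown only to illustrate the required interface
struct QuadRegularized{M} <: AbstractRegularized{M}
    maximizer::M
end

compute_regularization(::QuadRegularized, y) = 0.5 * sum(abs2, y)  # Ω(y) = ½‖y‖²
get_maximizer(layer::QuadRegularized) = layer.maximizer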

src/regularized/regularized_frank_wolfe.jl

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ Some values you can tune:

 See the documentation of FrankWolfe.jl for details.
 """
-struct RegularizedFrankWolfe{M,RF,RG,FWK} <: AbstractRegularized
+struct RegularizedFrankWolfe{M,RF,RG,FWK} <: AbstractRegularized{M}
     linear_maximizer::M
     Ω::RF
     Ω_grad::RG
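
Subtyping AbstractRegularized{M} exposes the linear maximizer at the type level. The matching get_maximizer method is not part of this diff, but a plausible implementation would be:

# Plausible accessor satisfying the new @required interface (not in this commit)
get_maximizer(rfw::RegularizedFrankWolfe) = rfw.linear_maximizer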

src/regularized/soft_argmax.jl

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@ Soft argmax activation function `s(z) = (e^zᵢ / ∑ e^zⱼ)ᵢ`.

 Corresponds to regularized prediction on the probability simplex with entropic penalty.
 """
-struct SoftArgmax <: AbstractRegularized end
+struct SoftArgmax <: AbstractRegularized{nothing} end

 (::SoftArgmax)(z; kwargs...) = soft_argmax(z)
 compute_regularization(::SoftArgmax, y) = soft_argmax_regularization(y)
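
For intuition, the map and the entropic penalty named in the docstring can be sketched standalone (re-implementations for illustration, not InferOpt's internals):

# s(z)ᵢ = exp(zᵢ) / ∑ⱼ exp(zⱼ), shifted by maximum(z) for numerical stability
softmax_sketch(z) = (e = exp.(z .- maximum(z)); e ./ sum(e))

# Entropic penalty Ω(y) = ∑ᵢ yᵢ log(yᵢ), with the convention 0 log 0 = 0
entropy_penalty_sketch(y) = sum(yi -> yi > 0 ? yi * log(yi) : zero(yi), y)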

src/regularized/sparse_argmax.jl

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@ Compute the Euclidean projection of the vector `z` onto the probability simplex.

 Corresponds to regularized prediction on the probability simplex with square norm penalty.
 """
-struct SparseArgmax <: AbstractRegularized end
+struct SparseArgmax <: AbstractRegularized{nothing} end

 (::SparseArgmax)(z; kwargs...) = sparse_argmax(z)
 compute_regularization(::SparseArgmax, y) = sparse_argmax_regularization(y)
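
The Euclidean projection onto the probability simplex has a classic sort-based closed form; a standalone sketch follows (InferOpt's actual sparse_argmax implementation may differ):

# Project z onto {y : yᵢ ≥ 0, ∑ yᵢ = 1} via the sort-and-threshold algorithm
function simplex_projection_sketch(z::AbstractVector)
    u = sort(z; rev=true)                 # entries in decreasing order
    cssv = cumsum(u)                      # running sums of the sorted entries
    ρ = findlast(k -> u[k] + (1 - cssv[k]) / k > 0, 1:length(u))
    λ = (1 - cssv[ρ]) / ρ                 # shift that makes the support sum to 1
    return max.(z .+ λ, 0)
end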
