Improve documentation

gdalle · gdalle · commit 16b1e5d2e8ea · 2022-07-05T18:50:43.000+02:00
diff --git a/docs/src/algorithms.md b/docs/src/algorithms.md
@@ -1,11 +1,5 @@
 # API Reference
 
-## Index
-
-```@index
-Modules = [InferOpt]
-```
-
 ## Probability distributions
 
 ```@autodocs
@@ -48,6 +42,9 @@ Pages = ["ssvm/isbaseloss.jl", "ssvm/ssvm_loss.jl", "ssvm/zeroone_baseloss.jl"]
 !!! note "Reference"
     [Efficient and Modular Implicit Differentiation](http://arxiv.org/abs/2105.15183)
 
+!!! note "Reference"
+    [FrankWolfe.jl: a high-performance and flexible toolbox for Frank-Wolfe algorithms and Conditional Gradients](https://arxiv.org/abs/2104.06675)
+
 ```@autodocs
 Modules = [InferOpt]
 Pages = ["frank_wolfe/frank_wolfe_utils.jl", "frank_wolfe/differentiable_frank_wolfe.jl"]
@@ -60,7 +57,7 @@ Pages = ["frank_wolfe/frank_wolfe_utils.jl", "frank_wolfe/differentiable_frank_w
 
 ```@autodocs
 Modules = [InferOpt]
-Pages = ["regularized/frank_wolfe.jl", "regularized/isregularized.jl", "regularized/soft_argmax.jl", "regularized/sparse_argmax.jl", "regularized/regularized_generic.jl", "regularized/regularized_utils.jl"]
+Pages = ["regularized/isregularized.jl", "regularized/regularized_generic.jl", "regularized/regularized_utils.jl", "regularized/soft_argmax.jl", "regularized/sparse_argmax.jl"]
 ```
 
 ## Perturbed optimizers
@@ -70,7 +67,7 @@ Pages = ["regularized/frank_wolfe.jl", "regularized/isregularized.jl", "regulari
 
 ```@autodocs
 Modules = [InferOpt]
-Pages = ["perturbed/abstract_perturbed.jl", "perturbed/additive.jl", "perturbed/composition.jl", "perturbed/multiplicative.jl"]
+Pages = ["perturbed/abstract_perturbed.jl", "perturbed/additive.jl", "perturbed/multiplicative.jl"]
 ```
 
 ## Fenchel-Young losses
@@ -80,5 +77,11 @@ Pages = ["perturbed/abstract_perturbed.jl", "perturbed/additive.jl", "perturbed/
 
 ```@autodocs
 Modules = [InferOpt]
-Pages = ["fenchel_young/fenchel_young.jl"]
+Pages = ["fenchel_young/fenchel_young.jl", "fenchel_young/perturbed.jl"]
 ```
+
+## Index
+
+```@index
+Modules = [InferOpt]
+```
diff --git a/src/InferOpt.jl b/src/InferOpt.jl
@@ -49,6 +49,7 @@ export compute_probability_distribution
 export Interpolation
 
 export DifferentiableFrankWolfe
+export LMOWrapper
 
 export half_square_norm
 export shannon_entropy, negative_shannon_entropy
diff --git a/src/frank_wolfe/differentiable_frank_wolfe.jl b/src/frank_wolfe/differentiable_frank_wolfe.jl
@@ -3,15 +3,19 @@
 """
     DifferentiableFrankWolfe{F,G,M,S}
 
-Parameterized version of the Frank-Wolfe algorithm `θ -> argmin_{x ∈ C} f(x, θ)`.
-
-Compatible with implicit differentiation.
+Parameterized version of the Frank-Wolfe algorithm `θ -> argmin_{x ∈ C} f(x, θ)`, which can be differentiated implicitly wrt `θ`.
 
 # Fields
 - `f::F`: function `f(x, θ)` to minimize wrt `x`
 - `f_grad1::G`: gradient `∇ₓf(x, θ)` of `f` wrt `x`
-- `lmo::M`: linear minimization oracle `θ -> argmin_{x ∈ C} θᵀx`
-- `linear_solver::S`: solver for linear systems of equations
+- `lmo::M`: linear minimization oracle `θ -> argmin_{x ∈ C} θᵀx`, which implicitly defines the polytope `C`
+- `linear_solver::S`: solver for linear systems of equations, used during implicit differentiation
+
+# Applicable methods
+
+- [`compute_probability_distribution(dfw::DifferentiableFrankWolfe, θ, x0)`](@ref)
+- `(dfw::DifferentiableFrankWolfe)(θ, x0)`
+
 """
 struct DifferentiableFrankWolfe{F,G,M<:LinearMinimizationOracle,S}
     f::F
@@ -26,11 +30,19 @@ end
 
 ## Forward pass
 
+"""
+    compute_probability_distribution(dfw::DifferentiableFrankWolfe, θ, x0[; fw_kwargs=(;)])
+
+Compute the optimal active set by applying the away-step Frank-Wolfe algorithm with initial point `x0`, then turn it into a probability distribution.
+
+The named tuple `fw_kwargs` is passed as keyword arguments to `FrankWolfe.away_frank_wolfe`.
+"""
 function compute_probability_distribution(
     dfw::DifferentiableFrankWolfe,
     θ::AbstractArray{<:Real},
     x0::AbstractArray{<:Real};
     fw_kwargs=(;),
+    kwargs...,
 )
     (; f, f_grad1, lmo) = dfw
     obj(x) = f(x, θ)
@@ -44,8 +56,13 @@ function compute_probability_distribution(
     return probadist
 end
 
+"""
+    (dfw::DifferentiableFrankWolfe)(θ, x0[; fw_kwargs=(;)])
+
+Apply `compute_probability_distribution(dfw, θ, x0)` and return the expectation.
+"""
 function (dfw::DifferentiableFrankWolfe)(
-    θ::AbstractArray{<:Real}, x0::AbstractArray{<:Real}; fw_kwargs=(;)
+    θ::AbstractArray{<:Real}, x0::AbstractArray{<:Real}; fw_kwargs=(;), kwargs...
 )
     probadist = compute_probability_distribution(dfw, θ, x0; fw_kwargs=fw_kwargs)
     return compute_expectation(probadist)
@@ -74,6 +91,7 @@ function ChainRulesCore.rrule(
     θ::AbstractArray{R1},
     x0::AbstractArray{R2};
     fw_kwargs=(;),
+    kwargs...,
 ) where {R1<:Real,R2<:Real}
     R = promote_type(R1, R2)
     (; linear_solver) = dfw
@@ -88,8 +106,8 @@ function ChainRulesCore.rrule(
     pullback_Aᵀ = last ∘ rrule_via_ad(rc, conditions_p, p)[2]
     pullback_Bᵀ = last ∘ rrule_via_ad(rc, conditions_θ, θ)[2]
 
-    mul_Aᵀ!(res, u::AbstractVector) = res .= vec(pullback_Aᵀ(reshape(u, size(p))))
-    mul_Bᵀ!(res, v::AbstractVector) = res .= vec(pullback_Bᵀ(reshape(v, size(p))))
+    mul_Aᵀ!(res, u::AbstractVector) = res .= vec(pullback_Aᵀ(u))
+    mul_Bᵀ!(res, v::AbstractVector) = res .= vec(pullback_Bᵀ(v))
 
     n, m = length(θ), length(p)
     Aᵀ = LinearOperator(R, m, m, false, false, mul_Aᵀ!)
diff --git a/src/frank_wolfe/frank_wolfe_utils.jl b/src/frank_wolfe/frank_wolfe_utils.jl
@@ -4,13 +4,13 @@
 Default configuration for the Frank-Wolfe wrapper.
 
 # Parameters
-- `away_steps`
-- `epsilon`
-- `lazy`
-- `line_search`
-- `max_iteration`
-- `timeout`
-- `verbose`
+- `away_steps=true`: activate away steps to avoid zig-zagging
+- `epsilon=1e-2`: precision
+- `lazy=true`: caching strategy
+- `line_search=FrankWolfe.Adaptive()`: step size selection
+- `max_iteration=10`: number of iterations
+- `timeout=1.0`: maximum time in seconds
+- `verbose=false`: console output
 """
 const DEFAULT_FRANK_WOLFE_KWARGS = (
     away_steps=true,
@@ -24,13 +24,25 @@ const DEFAULT_FRANK_WOLFE_KWARGS = (
 
 ## Wrapper for linear maximizers to use them within Frank-Wolfe
 
+"""
+    LMOWrapper{F,K}
+
+Wraps a linear maximizer as a `FrankWolfe.LinearMinimizationOracle`.
+
+# Fields
+- `maximizer::F`: black box linear maximizer
+- `maximizer_kwargs::K`: keyword arguments passed to the maximizer whenever it is called
+"""
 struct LMOWrapper{F,K} <: LinearMinimizationOracle
     maximizer::F
     maximizer_kwargs::K
 end
 
 LMOWrapper(maximizer) = LMOWrapper(maximizer, (;))
 
+"""
+    FrankWolfe.compute_extreme_point(lmo_wrapper::LMOWrapper, direction)
+"""
 function FrankWolfe.compute_extreme_point(lmo_wrapper::LMOWrapper, direction; kwargs...)
     (; maximizer, maximizer_kwargs) = lmo_wrapper
     v = maximizer(-direction; maximizer_kwargs...)
diff --git a/src/perturbed/abstract_perturbed.jl b/src/perturbed/abstract_perturbed.jl
@@ -1,38 +1,40 @@
 """
     AbstractPerturbed
 
-Differentiable perturbation of a black-box optimizer.
+Differentiable perturbation of a black box optimizer.
+
+# Applicable methods
+
+- [`compute_probability_distribution(perturbed::AbstractPerturbed, θ)`](@ref)
+- `(perturbed::AbstractPerturbed)(θ)`
 
 # Available subtypes
-- [`PerturbedAdditive{F}`](@ref)
-- [`PerturbedMultiplicative{F}`](@ref)
 
-# Required fields
+- [`PerturbedAdditive`](@ref)
+- [`PerturbedMultiplicative`](@ref)
+
+These subtypes share the following fields:
+
+- `maximizer`: black box optimizer
+- `ε`: magnitude of the perturbation
 - `rng::AbstractRNG`: random number generator
 - `seed::Union{Nothing,Int}`: random seed
 - `nb_samples::Int`: number of random samples for Monte-Carlo computations
 """
-abstract type AbstractPerturbed{F} end
+abstract type AbstractPerturbed end
 
+"""
+    sample_perturbations(perturbed::AbstractPerturbed, θ)
+
+Draw random perturbations `Z` which will be applied to the objective direction `θ`.
+"""
 function sample_perturbations(perturbed::AbstractPerturbed, θ::AbstractArray{<:Real})
     (; rng, seed, nb_samples) = perturbed
     seed!(rng, seed)
     Z_samples = [randn(rng, size(θ)) for _ in 1:nb_samples]
     return Z_samples
 end
 
-"""
-    perturb_and_optimize(perturbed, θ, Z; kwargs...)
-"""
-function perturb_and_optimize(
-    perturbed::AbstractPerturbed,
-    θ::AbstractArray{<:Real},
-    Z::AbstractArray{<:Real};
-    kwargs...,
-)
-    return error("Not implemented")
-end
-
 function compute_probability_distribution(
     perturbed::AbstractPerturbed,
     θ::AbstractArray{<:Real},
@@ -45,23 +47,24 @@ function compute_probability_distribution(
     return probadist
 end
 
+"""
+    compute_probability_distribution(perturbed::AbstractPerturbed, θ)
+
+Turn random perturbations of `θ` into a distribution on polytope vertices.
+"""
 function compute_probability_distribution(
     perturbed::AbstractPerturbed, θ::AbstractArray{<:Real}; kwargs...
 )
     Z_samples = sample_perturbations(perturbed, θ)
     return compute_probability_distribution(perturbed, θ, Z_samples; kwargs...)
 end
 
+"""
+    (perturbed::AbstractPerturbed)(θ)
+
+Apply `compute_probability_distribution(perturbed, θ)` and return the expectation.
+"""
 function (perturbed::AbstractPerturbed)(θ::AbstractArray{<:Real}; kwargs...)
     probadist = compute_probability_distribution(perturbed, θ; kwargs...)
     return compute_expectation(probadist)
 end
-
-function ChainRulesCore.rrule(
-    ::typeof(compute_probability_distribution),
-    perturbed::AbstractPerturbed,
-    θ::AbstractArray{<:Real};
-    kwargs...,
-)
-    return error("Not implemented")
-end
diff --git a/src/perturbed/additive.jl b/src/perturbed/additive.jl
@@ -3,11 +3,11 @@
 
 Differentiable normal perturbation of a black-box optimizer: the input undergoes `θ -> θ + εZ` where `Z ∼ N(0, I)`.
 
-See also: [`AbstractPerturbed{F}`](@ref).
+See also: [`AbstractPerturbed`](@ref).
 
 Reference: <https://arxiv.org/abs/2002.08676>
 """
-struct PerturbedAdditive{F,R<:AbstractRNG,S<:Union{Nothing,Int}} <: AbstractPerturbed{F}
+struct PerturbedAdditive{F,R<:AbstractRNG,S<:Union{Nothing,Int}} <: AbstractPerturbed
     maximizer::F
     ε::Float64
     rng::R
diff --git a/src/perturbed/multiplicative.jl b/src/perturbed/multiplicative.jl
@@ -3,10 +3,11 @@
 
 Differentiable log-normal perturbation of a black-box optimizer: the input undergoes `θ -> θ ⊙ exp[εZ - ε²/2]` where `Z ∼ N(0, I)`.
 
-See also: [`AbstractPerturbed{F}`](@ref).
+See also: [`AbstractPerturbed`](@ref).
+
+Reference: preprint coming soon.
 """
-struct PerturbedMultiplicative{F,R<:AbstractRNG,S<:Union{Nothing,Int}} <:
-       AbstractPerturbed{F}
+struct PerturbedMultiplicative{F,R<:AbstractRNG,S<:Union{Nothing,Int}} <: AbstractPerturbed
     maximizer::F
     ε::Float64
     rng::R
diff --git a/src/regularized/isregularized.jl b/src/regularized/isregularized.jl
@@ -11,7 +11,13 @@ For `predictor::P` to comply with this interface, the following methods must exi
 - [`one_hot_argmax`](@ref)
 - [`soft_argmax`](@ref)
 - [`sparse_argmax`](@ref)
+- [`RegularizedGeneric`](@ref)
 """
 @traitdef IsRegularized{P}
 
-@traitfn function compute_regularization(predictor::P, y) where {P; IsRegularized{P}} end
+"""
+    compute_regularization(predictor::P, y)
+
+Compute the value `Ω(y)` of the convex regularization function `Ω` at `y`.
+"""
+function compute_regularization end
diff --git a/src/regularized/regularized_generic.jl b/src/regularized/regularized_generic.jl
@@ -1,17 +1,22 @@
 """
     RegularizedGeneric{M,RF,RG,F,G,S}
 
-Generic and differentiable regularized prediction function `ŷ(θ) = argmax {θᵀy - Ω(y)}`.
+Differentiable regularized prediction function `ŷ(θ) = argmax_{y ∈ C} {θᵀy - Ω(y)}`.
 
 Relies on the Frank-Wolfe algorithm to minimize a convex objective on a polytope.
 
 # Fields
-- `maximizer::M`
-- `Ω::RF`
-- `Ω_grad::RG`
-- `f::F`
-- `f_grad1::G`
-- `linear_solver::S`
+- `maximizer::M`: linear maximization oracle `θ -> argmax_{y ∈ C} θᵀy`, which implicitly defines the polytope `C`
+- `Ω::RF`: regularization function `Ω(y)`
+- `Ω_grad::RG`: gradient of the regularization function `∇Ω(y)`
+- `f::F`: objective function `f(y, θ) = Ω(y) - θᵀy` minimized by Frank-Wolfe (computed automatically)
+- `f_grad1::G`: gradient of the objective function `∇_y f(y, θ) = ∇Ω(y) - θ` with respect to `y` (computed automatically)
+- `linear_solver::S`: solver for linear systems of equations, used during implicit differentiation
+
+# Applicable methods
+
+- [`compute_probability_distribution(regularized::RegularizedGeneric, θ)`](@ref)
+- `(regularized::RegularizedGeneric)(θ)`
 
 See also: [`DifferentiableFrankWolfe`](@ref).
 """
@@ -29,6 +34,11 @@ function Base.show(io::IO, regularized::RegularizedGeneric)
     return print(io, "RegularizedGeneric($maximizer, $Ω, $Ω_grad, $linear_solver)")
 end
 
+"""
+    RegularizedGeneric(maximizer, Ω, Ω_grad[; linear_solver=gmres])
+
+Short form constructor with a default linear solver.
+"""
 function RegularizedGeneric(maximizer, Ω, Ω_grad; linear_solver=gmres)
     f(y, θ) = Ω(y) - dot(θ, y)
     f_grad1(y, θ) = Ω_grad(y) - θ
@@ -43,6 +53,14 @@ end
 
 ## Forward pass
 
+"""
+    compute_probability_distribution(regularized::RegularizedGeneric, θ[; maximizer_kwargs=(;), fw_kwargs=(;)])
+
+Construct a [`DifferentiableFrankWolfe`](@ref) struct and call `compute_probability_distribution` on it.
+
+The named tuple `maximizer_kwargs` is passed as keyword arguments to the underlying maximizer, which is wrapped inside an [`LMOWrapper`](@ref).
+The named tuple `fw_kwargs` is passed as keyword arguments to `FrankWolfe.away_frank_wolfe`.
+"""
 function compute_probability_distribution(
     regularized::RegularizedGeneric,
     θ::AbstractArray{<:Real};
@@ -58,8 +76,13 @@ function compute_probability_distribution(
     return probadist
 end
 
+"""
+    (regularized::RegularizedGeneric)(θ[; maximizer_kwargs=(;), fw_kwargs=(;)])
+
+Apply `compute_probability_distribution(regularized, θ)` and return the expectation.
+"""
 function (regularized::RegularizedGeneric)(
-    θ::AbstractArray{<:Real}; maximizer_kwargs=(;), fw_kwargs=(;)
+    θ::AbstractArray{<:Real}; maximizer_kwargs=(;), fw_kwargs=(;), kwargs...
 )
     probadist = compute_probability_distribution(
         regularized, θ; maximizer_kwargs=maximizer_kwargs, fw_kwargs=fw_kwargs
diff --git a/src/utils/composition.jl b/src/utils/composition.jl
diff --git a/src/utils/probability_distribution.jl b/src/utils/probability_distribution.jl
diff --git a/test/paths.jl b/test/paths.jl