
Commit ba669f5

sensitivity reg (#200)
* sensitivity reg
* local model, Plots explicit
* direct assign
* block separation
* fix block
* fix computations
* comments
* dont reg bias
* phrasing relu
* fix dot
1 parent 3d7e385 commit ba669f5

File tree: 2 files changed, +93 -57 lines


docs/src/examples/custom-relu.jl

Lines changed: 22 additions & 16 deletions
@@ -3,7 +3,7 @@
 #md # [![](https://img.shields.io/badge/show-github-579ACA.svg)](@__REPO_ROOT_URL__/docs/src/examples/custom-relu.jl)

 # We demonstrate how DiffOpt can be used to generate a simple neural network
-# unit - the ReLU layer. A neural network is created using Flux.jl which is
+# unit - the ReLU layer. A neural network is created using Flux.jl and
 # trained on the MNIST dataset.

 # This tutorial uses the following packages
@@ -15,6 +15,7 @@ import ChainRulesCore
 import Flux
 import Statistics
 import Base.Iterators: repeated
+using LinearAlgebra

 # ## The ReLU and its derivative

@@ -33,7 +34,7 @@ function matrix_relu(
         @objective(
             model,
             Min,
-            x'x -2y[:, i]'x # x' Q x + q'x with Q = I, q = -2y
+            dot(x, x) -2dot(y[:, i], x)
         )
         optimize!(model)
         _x[:, i] = value.(x)
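The objective above is the quadratic program behind the ReLU: with Q = I and q = -2y, minimizing x' Q x + q' x over nonnegative x recovers max.(y, 0) componentwise. A minimal standalone sketch of that equivalence follows (not part of the commit; it assumes the layer constrains x to be nonnegative, whose declaration is outside this hunk, and uses Ipopt purely for illustration):

    using JuMP
    import Ipopt

    y = [-1.5, 0.0, 2.3, -0.2, 4.0]         # a sample input column
    model = Model(Ipopt.Optimizer)
    set_silent(model)
    @variable(model, x[1:length(y)] >= 0)   # nonnegativity produces the ReLU kink
    # same objective as the layer: x'x - 2y'x, written term by term
    @objective(model, Min, sum(x[i]^2 - 2y[i] * x[i] for i in eachindex(y)))
    optimize!(model)
    isapprox(value.(x), max.(y, 0); atol = 1e-6)  # true up to solver tolerance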
@@ -52,23 +53,26 @@ function ChainRulesCore.rrule(
     function pullback_matrix_relu(dl_dx)
         ## some value from the backpropagation (e.g., loss) is denoted by `l`
         ## so `dl_dy` is the derivative of `l` wrt `y`
-        x = model[:x] # load decision variable `x` into scope
+        x = model[:x] ## load decision variable `x` into scope
         dl_dy = zeros(T, size(dl_dx))
-        dl_dq = zeros(T, size(dl_dx)) # for step-by-step explanation
+        dl_dq = zeros(T, size(dl_dx)) ## for step-by-step explanation
         for i in 1:size(y, 2)
+            ## set sensitivities
             MOI.set.(
                 model,
                 DiffOpt.BackwardInVariablePrimal(),
                 x,
                 dl_dx[:, i]
-            ) # set sensitivities
-            DiffOpt.backward(model) # compute grad
+            )
+            ## compute grad
+            DiffOpt.backward(model)
+            ## return gradient wrt objective function parameters
             obj_exp = MOI.get(
                 model,
                 DiffOpt.BackwardOutObjective()
-            ) # return gradient wrt objective function parameters
-            dl_dq[:, i] = JuMP.coefficient.(obj_exp, x) # coeff of `x` in q'x = -2y'x
-            dq_dy = -2 # dq/dy = -2
+            )
+            dl_dq[:, i] = JuMP.coefficient.(obj_exp, x) ## coeff of `x` in q'x = -2y'x
+            dq_dy = -2 ## dq/dy = -2
             dl_dy[:, i] = dl_dq[:, i] * dq_dy
         end
         return (ChainRulesCore.NoTangent(), dl_dy,)
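For readers tracing the pullback comments above: since the objective is x' Q x + q' x with q = -2y, the coefficients recovered by `JuMP.coefficient.(obj_exp, x)` are dl/dq, and the chain rule the loop applies is simply

    dl/dy[:, i] = dl/dq[:, i] * dq/dy = -2 * dl/dq[:, i],

which is exactly the line `dl_dy[:, i] = dl_dq[:, i] * dq_dy` with `dq_dy = -2`.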
@@ -84,8 +88,8 @@ imgs = MLDatasets.MNIST.traintensor(1:N)
 labels = MLDatasets.MNIST.trainlabels(1:N);

 # Preprocessing
-train_X = float.(reshape(imgs, size(imgs, 1) * size(imgs, 2), N)) #stack all the images
-train_Y = Flux.onehotbatch(labels, 0:9); # just a common way to encode categorical variables
+train_X = float.(reshape(imgs, size(imgs, 1) * size(imgs, 2), N)) ## stack all the images
+train_Y = Flux.onehotbatch(labels, 0:9);

 test_imgs = MLDatasets.MNIST.testtensor(1:N)
 test_X = float.(reshape(test_imgs, size(test_imgs, 1) * size(test_imgs, 2), N))
@@ -114,9 +118,10 @@ dataset = repeated((train_X, train_Y), epochs);

 # Parameters for the network training

-custom_loss(x, y) = Flux.crossentropy(m(x), y) # training loss function
-opt = Flux.ADAM(); # stochastic gradient descent variant to optimize weights of the neural network
-evalcb = () -> @show(custom_loss(train_X, train_Y)); # callback to show loss
+# training loss function, Flux optimizer
+custom_loss(x, y) = Flux.crossentropy(m(x), y)
+opt = Flux.ADAM()
+evalcb = () -> @show(custom_loss(train_X, train_Y))

 # Train to optimize network parameters

@@ -125,9 +130,10 @@ evalcb = () -> @show(custom_loss(train_X, train_Y)); # callback to show loss
 # Although our custom implementation takes time, it is able to reach similar
 # accuracy as the usual ReLU function implementation.

-accuracy(x, y) = Statistics.mean(Flux.onecold(m(x)) .== Flux.onecold(y)); # average of correct guesses
+# Average of correct guesses
+accuracy(x, y) = Statistics.mean(Flux.onecold(m(x)) .== Flux.onecold(y));

-# Train accuracy
+# Training accuracy

 accuracy(train_X, train_Y)

docs/src/examples/sensitivity-analysis-ridge.jl

Lines changed: 71 additions & 41 deletions
@@ -22,7 +22,7 @@
 # ```math
 # \begin{split}
 # \begin{array} {ll}
-# \mbox{minimize} & e^{\top}e + \alpha (w^2 + b^2) \\
+# \mbox{minimize} & e^{\top}e + \alpha (w^2) \\
 # \mbox{s.t.} & e_{i} = y_{i} - w x_{i} - b \quad \quad i=1..N \\
 # \end{array}
 # \end{split}
@@ -36,15 +36,15 @@ import DiffOpt
 import Random
 import Ipopt
 import Plots
-import LinearAlgebra: normalize!, dot
+using LinearAlgebra: dot

 # ## Define and solve the problem

 # Construct a set of noisy (Gaussian) data points around a line.

 Random.seed!(42)

-N = 100
+N = 150

 w = 2 * abs(randn())
 b = rand()
@@ -64,76 +64,106 @@ function fit_ridge(X, Y, alpha = 0.1)
     set_silent(model)
     @variable(model, w) # angular coefficient
     @variable(model, b) # linear coefficient
-    @variable(model, e[1:N]) # approximation error
-    ## constraint defining approximation error
-    @constraint(model, cons[i=1:N], e[i] == Y[i] - w * X[i] - b)
+    ## expression defining approximation error
+    @expression(model, e[i=1:N], Y[i] - w * X[i] - b)
     ## objective minimizing squared error and ridge penalty
     @objective(
         model,
         Min,
-        dot(e, e) + alpha * (sum(w * w) + sum(b * b)),
+        1 / N * dot(e, e) + alpha * (w^2),
     )
     optimize!(model)
-    return model, w, b, cons # return model, variables and constraints references
+    return model, w, b # return model & variables
 end


-# Train on the data generated.
+# Plot the data points and the fitted line for different alpha values

-model, w, b, cons = fit_ridge(X, Y)
-ŵ, b̂ = value(w), value(b)
-
-# We can visualize the approximating line.
-
-p = Plots.scatter(X, Y, label="")
+p = Plots.scatter(X, Y, label=nothing, legend=:topleft)
 mi, ma = minimum(X), maximum(X)
-Plots.plot!(p, [mi, ma], [mi * ŵ + b̂, ma * ŵ + b̂], color=:red, label="")
+Plots.title!("Fitted lines and points")

+for alpha in 0.5:0.5:1.5
+    local model, w, b = fit_ridge(X, Y, alpha)
+    ŵ = value(w)
+    b̂ = value(b)
+    Plots.plot!(p, [mi, ma], [mi * ŵ + b̂, ma * ŵ + b̂], label="alpha=$alpha", width=2)
+end
+p

 # ## Differentiate

 # Now that we've solved the problem, we can compute the sensitivity of optimal
-# values of the angular coefficient `w` with
+# values of the slope `w` with
 # respect to perturbations in the data points (`x`,`y`).

-# Begin differentiating the model.
-# analogous to varying θ in the expression:
-# ```math
-# e_i = (y_{i} + \theta_{y_i}) - w (x_{i} + \theta_{x_{i}}) - b
-# ```
+alpha = 0.4
+model, w, b = fit_ridge(X, Y, alpha)
+ŵ = value(w)
+b̂ = value(b)
+
+# We first compute sensitivity of the slope with respect to a perturbation of the independent
+# variable `x`.

-∇ = zero(X)
+# Recalling that the points $(x_i, y_i)$ appear in the objective function as:
+# `(yi - b - w*xi)^2`, the `DiffOpt.ForwardInObjective` attribute must be set accordingly,
+# with the terms multiplying the parameter in the objective.
+# When considering the perturbation of a parameter θ, `DiffOpt.ForwardInObjective()` takes in the expression in the
+# objective that multiplies θ.
+# If θ appears with a quadratic and a linear form: `θ^2 a x + θ b y`, then the expression to pass to
+# `ForwardInObjective` is `2θ a x + b y`.
+
+# Sensitivity with respect to x and y
+
+∇y = zero(X)
+∇x = zero(X)
 for i in 1:N
-    for j in 1:N
-        MOI.set(
-            model,
-            DiffOpt.ForwardInConstraint(),
-            cons[j],
-            i == j ? index(w) + 1.0 : 0.0 * index(w)
-        )
-    end
+    MOI.set(
+        model,
+        DiffOpt.ForwardInObjective(),
+        2w^2 * X[i] + 2b * w - 2 * w * Y[i]
+    )
+    DiffOpt.forward(model)
+    ∇x[i] = MOI.get(
+        model,
+        DiffOpt.ForwardOutVariablePrimal(),
+        w
+    )
+    MOI.set(
+        model,
+        DiffOpt.ForwardInObjective(),
+        (2Y[i] - 2b - 2w * X[i]),
+    )
     DiffOpt.forward(model)
-    dw = MOI.get(
+    ∇y[i] = MOI.get(
         model,
         DiffOpt.ForwardOutVariablePrimal(),
         w
     )
-    ∇[i] = abs(dw)
 end

-normalize!(∇);
+# Visualize point sensitivities with respect to regression points.
+
+p = Plots.scatter(
+    X, Y,
+    color = [dw < 0 ? :blue : :red for dw in ∇x],
+    markersize = [5 * abs(dw) + 1.2 for dw in ∇x],
+    label = ""
+)
+mi, ma = minimum(X), maximum(X)
+Plots.plot!(p, [mi, ma], [mi * ŵ + b̂, ma * ŵ + b̂], color = :blue, label = "")
+Plots.title!("Regression slope sensitivity with respect to x")

-# Visualize point sensitivities with respect to regressing line.
-# Note that the gradients are normalized.
+#

 p = Plots.scatter(
     X, Y,
-    color = [x > 0 ? :red : :blue for x in ∇],
-    markersize = [25 * abs(x) for x in ∇],
+    color = [dw < 0 ? :blue : :red for dw in ∇y],
+    markersize = [5 * abs(dw) + 1.2 for dw in ∇y],
     label = ""
 )
 mi, ma = minimum(X), maximum(X)
-Plots.plot!(p, [mi, ma], [mi * ŵ + b̂, ma * ŵ + b̂], color = :red, label = "")
+Plots.plot!(p, [mi, ma], [mi * ŵ + b̂, ma * ŵ + b̂], color = :blue, label = "")
+Plots.title!("Regression slope sensitivity with respect to y")

-# Note the points in the extremes of the line segment are larger because
-# moving those points has a stronger effect on the angular coefficient of the line.
+# Note that points whose `x` values are further from the center induce a greater sensitivity of the slope with respect to `y`.
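As a worked check of the `ForwardInObjective` rule quoted in the added comments (a sketch, not part of the commit): perturbing X[i] by θ turns the i-th residual term into

    (Y[i] - w * (X[i] + θ) - b)^2
        = (Y[i] - w * X[i] - b)^2 - 2w * θ * (Y[i] - w * X[i] - b) + w^2 * θ^2,

so θ multiplies -2w * (Y[i] - w * X[i] - b) = 2w^2 * X[i] + 2b * w - 2 * w * Y[i] and θ^2 multiplies w^2; applying the stated `θ^2 a x + θ b y -> 2θ a x + b y` rule and evaluating at θ = 0 gives exactly the expression set in the loop above. Perturbing Y[i] instead gives the linear coefficient 2 * (Y[i] - w * X[i] - b), i.e. `(2Y[i] - 2b - 2w * X[i])`. The constant 1 / N factor from the objective is not carried into these expressions; since the forward sensitivities are linear in the passed expression, it would only rescale every ∇x[i] and ∇y[i] by the same positive constant and leave the plots' relative marker sizes unchanged.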
