@@ -43,7 +43,7 @@ function ChainRulesCore.rrule(::typeof(matrix_relu), y::Matrix{T}) where T
     function pullback_matrix_relu(dl_dx)
         ## some value from the backpropagation (e.g., loss) is denoted by `l`
         ## so `dl_dy` is the derivative of `l` wrt `y`
-        x = model[:x] ## load decision variable `x` into scope
+        x = model[:x] # load decision variable `x` into scope
         dl_dy = zeros(T, size(dl_dx))
         dl_dq = zeros(T, size(dl_dx))
         ## set sensitivities
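A note on the `dq_dy = -2` step in the next hunk: assuming `matrix_relu` solves the projection problem `min_x x'x - 2y'x` subject to `x >= 0` defined earlier in the file, the linear objective coefficient is `q = -2y`, so `dq/dy = -2` and the pullback sets `dl/dy = -2 * dl/dq`, which is exactly what the code below computes.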
@@ -54,50 +54,48 @@ function ChainRulesCore.rrule(::typeof(matrix_relu), y::Matrix{T}) where T
         obj_exp = MOI.get(model, DiffOpt.ReverseObjectiveFunction())
         ## coeff of `x` in q'x = -2y'x
         dl_dq[:] .= JuMP.coefficient.(obj_exp, x[:])
-        dq_dy = -2 ## dq/dy = -2
+        dq_dy = -2 # dq/dy = -2
         dl_dy[:] .= dl_dq[:] * dq_dy
         return (ChainRulesCore.NoTangent(), dl_dy,)
     end
     return pv, pullback_matrix_relu
 end
 
 # For more details about backpropagation, visit [Introduction, ChainRulesCore.jl](https://juliadiff.org/ChainRulesCore.jl/dev/).
-# ## prepare data
-N = 1000 ## batch size
-imgs = MLDatasets.MNIST.traintensor(1:N)
-labels = MLDatasets.MNIST.trainlabels(1:N);
-
-# Preprocessing
-train_X = float.(reshape(imgs, size(imgs, 1) * size(imgs, 2), N)) ## stack all the images
-train_Y = Flux.onehotbatch(labels, 0:9);
 
-test_imgs = MLDatasets.MNIST.testtensor(1:N)
-test_X = float.(reshape(test_imgs, size(test_imgs, 1) * size(test_imgs, 2), N))
-test_Y = Flux.onehotbatch(MLDatasets.MNIST.testlabels(1:N), 0:9);
-
-# ## Define the Network
-
-# Network structure
+# ## Define the network
 
 layer_size = 10
-
 m = Flux.Chain(
-    Flux.Dense(784, layer_size), ## 784 being image linear dimension (28 x 28)
+    Flux.Dense(784, layer_size), # 784 being image linear dimension (28 x 28)
     matrix_relu,
-    Flux.Dense(layer_size, 10), ## 10 being the number of outcomes (0 to 9)
+    Flux.Dense(layer_size, 10), # 10 being the number of outcomes (0 to 9)
     Flux.softmax,
 )
 
+# ## Prepare data
+
+N = 1000 # batch size
+## Preprocessing train data
+imgs = MLDatasets.MNIST.traintensor(1:N)
+labels = MLDatasets.MNIST.trainlabels(1:N)
+train_X = float.(reshape(imgs, size(imgs, 1) * size(imgs, 2), N)) # stack images
+train_Y = Flux.onehotbatch(labels, 0:9);
+## Preprocessing test data
+test_imgs = MLDatasets.MNIST.testtensor(1:N)
+test_labels = MLDatasets.MNIST.testlabels(1:N)
+test_X = float.(reshape(test_imgs, size(test_imgs, 1) * size(test_imgs, 2), N))
+test_Y = Flux.onehotbatch(test_labels, 0:9);
+
 # Define input data
 # The original data is repeated `epochs` times because `Flux.train!` only
 # loops through the data set once
 
 epochs = 50 # ~1 minute (i7 8th gen with 16gb RAM)
 ## epochs = 100 # leads to 77.8% in about 2 minutes
-
 dataset = repeated((train_X, train_Y), epochs);
 
-# Parameters for the network training
+# ## Network training
 
 # training loss function, Flux optimizer
 custom_loss(x, y) = Flux.crossentropy(m(x), y)
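The hunk stops just before the optimizer and training call (the next hunk's header shows the `evalcb` callback defined in that gap). A minimal sketch of how that step typically looks with the classic `Flux.train!` API, assuming `m`, `custom_loss`, `dataset`, `train_X`, and `train_Y` from the file above; the optimizer choice and throttle interval are assumptions, not the file's exact code:

```julia
opt = Flux.ADAM()                                     # assumed optimizer; any Flux optimiser works
evalcb = () -> @show(custom_loss(train_X, train_Y))   # progress callback, as in the next hunk header

# Flux.train! makes a single pass over `dataset`, which is why the data was
# repeated `epochs` times above
Flux.train!(custom_loss, Flux.params(m), dataset, opt; cb = Flux.throttle(evalcb, 5))
```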
@@ -111,7 +109,10 @@ evalcb = () -> @show(custom_loss(train_X, train_Y))
 # Although our custom implementation takes time, it is able to reach similar
 # accuracy as the usual ReLU function implementation.
 
+# ## Accuracy results
+
 # Average of correct guesses
+
 accuracy(x, y) = Statistics.mean(Flux.onecold(m(x)) .== Flux.onecold(y));
 
 # Training accuracy
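For reference on the "usual ReLU function implementation" mentioned above, a baseline chain would simply swap the DiffOpt-based layer for Flux's built-in `relu` (a sketch; `m_baseline` is a hypothetical name, not part of the file):

```julia
m_baseline = Flux.Chain(
    Flux.Dense(784, layer_size),   # same input/hidden sizes as `m` above
    x -> Flux.relu.(x),            # elementwise built-in relu instead of matrix_relu
    Flux.Dense(layer_size, 10),
    Flux.softmax,
)

# after training m_baseline the same way, its accuracy can be compared with:
# Statistics.mean(Flux.onecold(m_baseline(test_X)) .== Flux.onecold(test_Y))
```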