
Commit 813e88d

pkan2 authored and WindQAQ committed
Modify ConditionalGradient optimizer with handling case of gradient be 0 (#558)
* Add files via upload
* Add files via upload
* Add files via upload
* Add files via upload
* Add files via upload
* Add files via upload
* Add files via upload
* Add files via upload
* Add files via upload
* Add files via upload
* Add files via upload
* Add files via upload
* Add files via upload
* Add files via upload
* add CG optimizer
* Revert "add CG optimizer". This reverts commit 953fa39.
* Add files via upload
* Add files via upload
* Add files via upload
* Add files via upload
* Add files via upload
* Revert "Add files via upload". This reverts commit de3cf0b.
* Handling the case of gradient be 0
* Handle the case of gradient to be 0
* Modify the format of epsilon in the argument
* Adding missing part in init function for epsilon
* fixing line overlong issue
1 parent d20e803 commit 813e88d
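Based on the constructor signature shown in the diff below, a minimal usage sketch of the new `epsilon` argument might look like this (the model and hyperparameter values are illustrative, not taken from this commit, and it assumes `tensorflow_addons` is installed under TF 2.x eager execution):

```
import tensorflow as tf
import tensorflow_addons as tfa

# `epsilon` defaults to 1e-7 after this change, so passing it is optional.
optimizer = tfa.optimizers.ConditionalGradient(
    learning_rate=0.99, lambda_=0.1, epsilon=1e-7)

# Illustrative Keras model; the optimizer plugs in like any other.
model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
model.compile(optimizer=optimizer, loss='mse')
```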

File tree

1 file changed: +23 -7 lines changed

tensorflow_addons/optimizers/conditional_gradient.py

Lines changed: 23 additions & 7 deletions
@@ -31,16 +31,23 @@ class ConditionalGradient(tf.keras.optimizers.Optimizer):
     See https://arxiv.org/pdf/1803.06453.pdf
 
     ```
-    variable -= (1-learning_rate)
-        * (variable + lambda_ * gradient / frobenius_norm(gradient))
+    variable -= (1-learning_rate) * (variable + lambda_ * gradient
+        / (frobenius_norm(gradient) + epsilon))
     ```
 
-    Note that we choose "lambda_" here to refer to the constraint "lambda" in the paper.
+    Note that we choose "lambda_" here to refer to the constraint "lambda" in
+    the paper.
+    And 'epsilon' is constant with tiny value as compared to the value of
+    frobenius_norm of gradient. The purpose of 'epsilon' here is to avoid the
+    case that the value of frobenius_norm of gradient is 0.
+
+    In this implementation, we choose 'epsilon' with value of 10^-7.
     """
 
     def __init__(self,
                  learning_rate,
                  lambda_,
+                 epsilon=1e-7,
                  use_locking=False,
                  name='ConditionalGradient',
                  **kwargs):
@@ -50,19 +57,24 @@ def __init__(self,
             learning_rate: A `Tensor` or a floating point value.
                 The learning rate.
             lambda_: A `Tensor` or a floating point value. The constraint.
+            epsilon: A `Tensor` or a floating point value. A small constant
+                for numerical stability when handling the case of norm of
+                gradient to be zero.
             use_locking: If `True` use locks for update operations.
             name: Optional name prefix for the operations created when
                 applying gradients. Defaults to 'ConditionalGradient'
         """
         super(ConditionalGradient, self).__init__(name=name, **kwargs)
         self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
         self._set_hyper('lambda_', lambda_)
+        self.epsilon = epsilon or tf.keras.backend.epsilon()
         self._set_hyper('use_locking', use_locking)
 
     def get_config(self):
         config = {
             'learning_rate': self._serialize_hyperparameter('learning_rate'),
             'lambda_': self._serialize_hyperparameter('lambda_'),
+            'epsilon': self.epsilon,
             'use_locking': self._serialize_hyperparameter('use_locking')
         }
         base_config = super(ConditionalGradient, self).get_config()
@@ -79,6 +91,8 @@ def _prepare_local(self, var_device, var_dtype, apply_state):
             self._get_hyper('learning_rate', var_dtype))
         apply_state[(var_device, var_dtype)]['lambda_'] = tf.identity(
             self._get_hyper('lambda_', var_dtype))
+        apply_state[(var_device, var_dtype)]['epsilon'] = tf.convert_to_tensor(
+            self.epsilon, var_dtype)
 
     def _resource_apply_dense(self, grad, var, apply_state=None):
         def frobenius_norm(m):
@@ -91,8 +105,9 @@ def frobenius_norm(m):
             frobenius_norm(grad), name='norm', dtype=var.dtype.base_dtype)
         lr = coefficients['learning_rate']
         lambda_ = coefficients['lambda_']
-        var_update_tensor = (
-            tf.math.multiply(var, lr) - (1 - lr) * lambda_ * grad / norm)
+        epsilon = coefficients['epsilon']
+        var_update_tensor = (tf.math.multiply(var, lr) -
+                             (1 - lr) * lambda_ * grad / (norm + epsilon))
         var_update_kwargs = {
             'resource': var.handle,
             'value': var_update_tensor,
@@ -111,9 +126,10 @@ def frobenius_norm(m):
             frobenius_norm(grad), name='norm', dtype=var.dtype.base_dtype)
         lr = coefficients['learning_rate']
         lambda_ = coefficients['lambda_']
+        epsilon = coefficients['epsilon']
         var_slice = tf.gather(var, indices)
-        var_update_value = (
-            tf.math.multiply(var_slice, lr) - (1 - lr) * lambda_ * grad / norm)
+        var_update_value = (tf.math.multiply(var_slice, lr) -
+                            (1 - lr) * lambda_ * grad / (norm + epsilon))
         var_update_kwargs = {
             'resource': var.handle,
             'indices': indices,
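To illustrate the effect of the new `epsilon` term, the following standalone sketch mirrors the dense update rule from the diff (it is not the library code itself; `cg_update` is a made-up helper, and TF 2.x eager execution is assumed):

```
import tensorflow as tf

def cg_update(var, grad, lr=0.99, lambda_=0.1, epsilon=1e-7):
    # Mirrors the dense update from the diff:
    # var * lr - (1 - lr) * lambda_ * grad / (norm + epsilon)
    norm = tf.norm(grad)  # Frobenius norm for a 2-D tensor
    return var * lr - (1 - lr) * lambda_ * grad / (norm + epsilon)

var = tf.constant([[1.0, -2.0], [3.0, 4.0]])
zero_grad = tf.zeros_like(var)

# With epsilon, an all-zero gradient yields a finite update (just var * lr).
print(cg_update(var, zero_grad))

# Without epsilon, 0 / 0 produces NaNs in every element of the update.
print(var * 0.99 - (1 - 0.99) * 0.1 * zero_grad / tf.norm(zero_grad))
```

The sparse path applies the same `(norm + epsilon)` denominator to the gathered slice, so an all-zero gradient is handled identically there.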
