
Commit 813e88d

pkan2 authored and WindQAQ committed
Modify ConditionalGradient optimizer with handling case of gradient be 0 (#558)
* Add files via upload
* Add files via upload
* Add files via upload
* Add files via upload
* Add files via upload
* Add files via upload
* Add files via upload
* Add files via upload
* Add files via upload
* Add files via upload
* Add files via upload
* Add files via upload
* Add files via upload
* Add files via upload
* add CG optimizer
* Revert "add CG optimizer". This reverts commit 953fa39.
* Add files via upload
* Add files via upload
* Add files via upload
* Add files via upload
* Add files via upload
* Revert "Add files via upload". This reverts commit de3cf0b.
* Handling the case of gradient be 0
* Handle the case of gradient to be 0
* Modify the format of epsilon in the argument
* Adding missing part in init function for epsilon
* fixing line overlong issue
1 parent d20e803 commit 813e88d
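Based on the constructor signature shown in the diff below, a minimal usage sketch of the new `epsilon` argument might look like this (the model and hyperparameter values are illustrative, not taken from this commit, and it assumes `tensorflow_addons` is installed under TF 2.x eager execution):

```
import tensorflow as tf
import tensorflow_addons as tfa

# `epsilon` defaults to 1e-7 after this change, so passing it is optional.
optimizer = tfa.optimizers.ConditionalGradient(
    learning_rate=0.99, lambda_=0.1, epsilon=1e-7)

# Illustrative Keras model; the optimizer plugs in like any other.
model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
model.compile(optimizer=optimizer, loss='mse')
```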

File tree

1 file changed: +23 -7 lines changed

tensorflow_addons/optimizers/conditional_gradient.py

Lines changed: 23 additions & 7 deletions
@@ -31,16 +31,23 @@ class ConditionalGradient(tf.keras.optimizers.Optimizer):
     See https://arxiv.org/pdf/1803.06453.pdf
 
     ```
-    variable -= (1-learning_rate)
-        * (variable + lambda_ * gradient / frobenius_norm(gradient))
+    variable -= (1-learning_rate) * (variable + lambda_ * gradient
+        / (frobenius_norm(gradient) + epsilon))
     ```
 
-    Note that we choose "lambda_" here to refer to the constraint "lambda" in the paper.
+    Note that we choose "lambda_" here to refer to the constraint "lambda" in
+    the paper.
+    And 'epsilon' is constant with tiny value as compared to the value of
+    frobenius_norm of gradient. The purpose of 'epsilon' here is to avoid the
+    case that the value of frobenius_norm of gradient is 0.
+
+    In this implementation, we choose 'epsilon' with value of 10^-7.
     """
 
     def __init__(self,
                  learning_rate,
                  lambda_,
+                 epsilon=1e-7,
                  use_locking=False,
                  name='ConditionalGradient',
                  **kwargs):
@@ -50,19 +57,24 @@ def __init__(self,
             learning_rate: A `Tensor` or a floating point value.
                 The learning rate.
             lambda_: A `Tensor` or a floating point value. The constraint.
+            epsilon: A `Tensor` or a floating point value. A small constant
+                for numerical stability when handling the case of norm of
+                gradient to be zero.
             use_locking: If `True` use locks for update operations.
             name: Optional name prefix for the operations created when
                 applying gradients. Defaults to 'ConditionalGradient'
         """
         super(ConditionalGradient, self).__init__(name=name, **kwargs)
         self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
         self._set_hyper('lambda_', lambda_)
+        self.epsilon = epsilon or tf.keras.backend.epsilon()
         self._set_hyper('use_locking', use_locking)
 
     def get_config(self):
         config = {
             'learning_rate': self._serialize_hyperparameter('learning_rate'),
             'lambda_': self._serialize_hyperparameter('lambda_'),
+            'epsilon': self.epsilon,
             'use_locking': self._serialize_hyperparameter('use_locking')
         }
         base_config = super(ConditionalGradient, self).get_config()
@@ -79,6 +91,8 @@ def _prepare_local(self, var_device, var_dtype, apply_state):
             self._get_hyper('learning_rate', var_dtype))
         apply_state[(var_device, var_dtype)]['lambda_'] = tf.identity(
             self._get_hyper('lambda_', var_dtype))
+        apply_state[(var_device, var_dtype)]['epsilon'] = tf.convert_to_tensor(
+            self.epsilon, var_dtype)
 
     def _resource_apply_dense(self, grad, var, apply_state=None):
         def frobenius_norm(m):
@@ -91,8 +105,9 @@ def frobenius_norm(m):
             frobenius_norm(grad), name='norm', dtype=var.dtype.base_dtype)
         lr = coefficients['learning_rate']
         lambda_ = coefficients['lambda_']
-        var_update_tensor = (
-            tf.math.multiply(var, lr) - (1 - lr) * lambda_ * grad / norm)
+        epsilon = coefficients['epsilon']
+        var_update_tensor = (tf.math.multiply(var, lr) -
+                             (1 - lr) * lambda_ * grad / (norm + epsilon))
         var_update_kwargs = {
             'resource': var.handle,
             'value': var_update_tensor,
@@ -111,9 +126,10 @@ def frobenius_norm(m):
             frobenius_norm(grad), name='norm', dtype=var.dtype.base_dtype)
         lr = coefficients['learning_rate']
         lambda_ = coefficients['lambda_']
+        epsilon = coefficients['epsilon']
         var_slice = tf.gather(var, indices)
-        var_update_value = (
-            tf.math.multiply(var_slice, lr) - (1 - lr) * lambda_ * grad / norm)
+        var_update_value = (tf.math.multiply(var_slice, lr) -
+                            (1 - lr) * lambda_ * grad / (norm + epsilon))
         var_update_kwargs = {
             'resource': var.handle,
             'indices': indices,
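To illustrate the effect of the new `epsilon` term, the following standalone sketch mirrors the dense update rule from the diff (it is not the library code itself; `cg_update` is a made-up helper, and TF 2.x eager execution is assumed):

```
import tensorflow as tf

def cg_update(var, grad, lr=0.99, lambda_=0.1, epsilon=1e-7):
    # Mirrors the dense update from the diff:
    # var * lr - (1 - lr) * lambda_ * grad / (norm + epsilon)
    norm = tf.norm(grad)  # Frobenius norm for a 2-D tensor
    return var * lr - (1 - lr) * lambda_ * grad / (norm + epsilon)

var = tf.constant([[1.0, -2.0], [3.0, 4.0]])
zero_grad = tf.zeros_like(var)

# With epsilon, an all-zero gradient yields a finite update (just var * lr).
print(cg_update(var, zero_grad))

# Without epsilon, 0 / 0 produces NaNs in every element of the update.
print(var * 0.99 - (1 - 0.99) * 0.1 * zero_grad / tf.norm(zero_grad))
```

The sparse path applies the same `(norm + epsilon)` denominator to the gathered slice, so an all-zero gradient is handled identically there.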
