Implement Conditional Gradient Optimizer (#469)

pkan2 · Squadrick · commit 91c08460ffeb · 2019-09-20T21:14:03.000+05:30
diff --git a/tensorflow_addons/optimizers/BUILD b/tensorflow_addons/optimizers/BUILD
@@ -6,6 +6,7 @@ py_library(
     name = "optimizers",
     srcs = [
         "__init__.py",
+        "conditional_gradient.py",
         "lazy_adam.py",
         "lookahead.py",
         "moving_average.py",
@@ -18,6 +19,19 @@ py_library(
     ],
 )
 
+py_test(
+    name = "conditional_gradient_test",
+    size = "small",
+    srcs = [
+        "conditional_gradient_test.py",
+    ],
+    main = "conditional_gradient_test.py",
+    srcs_version = "PY2AND3",
+    deps = [
+        ":optimizers",
+    ],
+)
+
 py_test(
     name = "lazy_adam_test",
     size = "small",
diff --git a/tensorflow_addons/optimizers/README.md b/tensorflow_addons/optimizers/README.md
@@ -3,6 +3,7 @@
 ## Maintainers
 | Submodule  | Maintainers  | Contact Info   |
 |:---------- |:------------- |:--------------|
+| conditional_gradient | Pengyu Kan, Vishnu Lokhande | pkan2@wisc.edu, lokhande@cs.wisc.edu |
 | lazy_adam | Saishruthi Swaminathan  | saishruthi.tn@gmail.com  |
 | lookahead | Zhao Hanguang | cyberzhg@gmail.com |
 | moving_average | Dheeraj R. Reddy | dheeraj98reddy@gmail.com |
@@ -13,6 +14,7 @@
 ## Components
 | Submodule | Optimizer  | Reference                                   |
 |:--------- |:---------- |:---------|
+| conditional_gradient | ConditionalGradient | https://arxiv.org/pdf/1803.06453.pdf |
 | lazy_adam | LazyAdam | https://arxiv.org/abs/1412.6980      |
 | lookahead | Lookahead | https://arxiv.org/abs/1907.08610v1 |
 | moving_average | MovingAverage | |
diff --git a/tensorflow_addons/optimizers/__init__.py b/tensorflow_addons/optimizers/__init__.py
@@ -18,6 +18,7 @@
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow_addons.optimizers.conditional_gradient import ConditionalGradient
 from tensorflow_addons.optimizers.lazy_adam import LazyAdam
 from tensorflow_addons.optimizers.lookahead import Lookahead
 from tensorflow_addons.optimizers.moving_average import MovingAverage
diff --git a/tensorflow_addons/optimizers/conditional_gradient.py b/tensorflow_addons/optimizers/conditional_gradient.py
@@ -0,0 +1,154 @@
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Conditional Gradient method for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+from tensorflow_addons.utils import keras_utils
+
+
+@keras_utils.register_keras_custom_object
+class ConditionalGradient(tf.keras.optimizers.Optimizer):
+    """Optimizer that implements the Conditional Gradient optimization.
+
+    This optimizer helps handle constraints well.
+
+    Currently only supports the Frobenius norm constraint.
+    See https://arxiv.org/pdf/1803.06453.pdf
+
+    ```
+    variable -= (1-learning_rate)
+        * (variable + lambda_ * gradient / frobenius_norm(gradient))
+    ```
+
+    Note that we choose "lambda_" here to refer to the constraint "lambda"
+    in the paper.
+
+    If the Frobenius norm of the gradient evaluates to exactly zero, the
+    division by the norm is skipped (the norm is replaced by one) so that
+    the update stays finite instead of producing NaN/Inf values.
+    """
+
+    def __init__(self,
+                 learning_rate,
+                 lambda_,
+                 use_locking=False,
+                 name='ConditionalGradient',
+                 **kwargs):
+        """Construct a conditional gradient optimizer.
+
+        Args:
+            learning_rate: A `Tensor` or a floating point value.
+                The learning rate.
+            lambda_: A `Tensor` or a floating point value. The constraint.
+            use_locking: Stored as a hyperparameter and returned by
+                `get_config()`. The update ops built by this class do not
+                currently read it.
+            name: Optional name prefix for the operations created when
+                applying gradients.  Defaults to 'ConditionalGradient'.
+            **kwargs: Keyword arguments forwarded to the base optimizer
+                constructor. If `lr` is present, it is used as the
+                learning rate instead of `learning_rate`.
+        """
+        super(ConditionalGradient, self).__init__(name=name, **kwargs)
+        self._set_hyper('learning_rate', kwargs.get('lr', learning_rate))
+        self._set_hyper('lambda_', lambda_)
+        self._set_hyper('use_locking', use_locking)
+
+    def get_config(self):
+        """Return the optimizer configuration as a dict.
+
+        The serialized hyperparameters of this class are merged on top of
+        the base optimizer's configuration.
+        """
+        config = {
+            'learning_rate': self._serialize_hyperparameter('learning_rate'),
+            'lambda_': self._serialize_hyperparameter('lambda_'),
+            'use_locking': self._serialize_hyperparameter('use_locking')
+        }
+        base_config = super(ConditionalGradient, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))
+
+    def _create_slots(self, var_list):
+        # NOTE: this slot is created for every variable but is not read by
+        # any update in this class; it is kept so the set of optimizer
+        # variables stays unchanged.
+        for v in var_list:
+            self.add_slot(v, 'conditional_gradient')
+
+    def _prepare_local(self, var_device, var_dtype, apply_state):
+        """Cache `learning_rate` and `lambda_` for a (device, dtype) pair."""
+        super(ConditionalGradient, self)._prepare_local(
+            var_device, var_dtype, apply_state)
+        apply_state[(var_device, var_dtype)]['learning_rate'] = tf.identity(
+            self._get_hyper('learning_rate', var_dtype))
+        apply_state[(var_device, var_dtype)]['lambda_'] = tf.identity(
+            self._get_hyper('lambda_', var_dtype))
+
+    @staticmethod
+    def _frobenius_norm(m):
+        """Return the square root of the sum of squared entries of `m`."""
+        return tf.math.reduce_sum(m**2)**0.5
+
+    def _compute_update(self, grad, var, current, apply_state):
+        """Compute the conditional gradient update for `current`.
+
+        Args:
+            grad: Gradient values matching `current`.
+            var: The variable being updated; supplies device and dtype.
+            current: The values to update (the variable itself, or the
+                gathered rows for a sparse update).
+            apply_state: Optional dict of cached per-(device, dtype)
+                coefficients.
+
+        Returns:
+            `learning_rate * current
+            - (1 - learning_rate) * lambda_ * grad / norm`.
+        """
+        var_device, var_dtype = var.device, var.dtype.base_dtype
+        coefficients = ((apply_state or {}).get((var_device, var_dtype))
+                        or self._fallback_apply_state(var_device, var_dtype))
+        lr = coefficients['learning_rate']
+        lambda_ = coefficients['lambda_']
+        norm = tf.convert_to_tensor(
+            self._frobenius_norm(grad), name='norm', dtype=var_dtype)
+        # A zero norm would turn the division below into NaN/Inf and
+        # corrupt the variable; dividing by one instead leaves the
+        # (zero) gradient term unscaled. Non-zero norms are untouched.
+        safe_norm = tf.where(
+            tf.equal(norm, tf.zeros_like(norm)), tf.ones_like(norm), norm)
+        return (tf.math.multiply(current, lr) -
+                (1 - lr) * lambda_ * grad / safe_norm)
+
+    def _resource_apply_dense(self, grad, var, apply_state=None):
+        """Assign the conditional gradient update to the whole `var`."""
+        var_update_tensor = self._compute_update(grad, var, var, apply_state)
+        var_update_op = tf.raw_ops.AssignVariableOp(
+            resource=var.handle, value=var_update_tensor)
+        return tf.group(var_update_op)
+
+    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
+        """Scatter the conditional gradient update into `var[indices]`.
+
+        The norm is computed over the provided `grad` slices only.
+        """
+        var_slice = tf.gather(var, indices)
+        var_update_value = self._compute_update(
+            grad, var, var_slice, apply_state)
+        var_update_op = tf.raw_ops.ResourceScatterUpdate(
+            resource=var.handle, indices=indices, updates=var_update_value)
+        return tf.group(var_update_op)
diff --git a/tensorflow_addons/optimizers/conditional_gradient_test.py b/tensorflow_addons/optimizers/conditional_gradient_test.py