From fdc6e891f7cd6123c45a1e69fed0c27f4654e044 Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Sun, 5 Jan 2020 03:57:25 +0000 Subject: [PATCH 01/62] Add initial novograd --- tensorflow_addons/optimizers/novograd.py | 138 +++++++++++++++++++++++ 1 file changed, 138 insertions(+) create mode 100644 tensorflow_addons/optimizers/novograd.py diff --git a/tensorflow_addons/optimizers/novograd.py b/tensorflow_addons/optimizers/novograd.py new file mode 100644 index 0000000000..154e32c0f2 --- /dev/null +++ b/tensorflow_addons/optimizers/novograd.py @@ -0,0 +1,138 @@ +"""Novograd for TensorFlow.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + + +@tf.keras.utils.register_keras_serializable(package='Addons') +class Novograd(tf.keras.optimizer.Optimizer): + + def __init__(self, + learning_rate=1, + beta_1=0.95, + beta_2=0.98, + epsilon=1e-8, + weight_decay=0.0, + grad_averaging=False, + name='Novograd', + **kwargs): + super(Novograd, self).__init__(name, **kwargs) + self._set_hyper('learning_rate', kwargs.get('lr', learning_rate)) + self._set_hyper('decay', self._initial_decay) + self._set_hyper('beta_1', beta_1) + self._set_hyper('beta_2', beta_2) + self._set_hyper('weight_decay', weight_decay) + self._set_hyper('grad_averaging', grad_averaging) + self.epsilon = epsilon or tf.keras.backend.epsilon() + + def _create_slots(self, var_list): + # Create slots for the first and second moments. + # Separate for-loops to respect the ordering of slot variables from v1. + for var in var_list: + self.add_slot(var=var, slot_name='m', initializer='zeros') + for var in var_list: + self.add_slot(var=var, slot_name='v', initializer=tf.zeros(shape=[], dtype=var.dtype)) + + def _prepare_local(self, var_device, var_dtype, apply_state): + super(Novograd, self)._prepare_local(var_device, var_dtype, apply_state) + local_step = tf.cast(self.iterations + 1, var_device) + beta_1_t = tf.identity(self._get_hyper('beta_1', var_dtype)) + beta_2_t = tf.identity(self._get_hyper('beta_2', var_dtype)) + beta_1_power = tf.pow(beta_1_t, local_step) + beta_2_power = tf.pow(beta_2_t, local_step) + lr = (apply_state[(var_device, var_dtype)]['lr_t'] * + (tf.sqrt(1 - beta_2_power) / (1 - beta_1_power))) + apply_state[(var_device, var_dtype)].update(dict( + lr=lr, + epsilon=tf.convert_to_tensor(self.epsilon, var_dtype), + beta_1_t=beta_1_t, + beta_2_t=beta_2_t, + one_minus_beta_2_t=1 - beta_2_t, + )) + + def set_weights(self, weights): + params = self.weights + # If the weights are generated by Keras V1 optimizer, it includes vhats + # even without amsgrad, i.e, V1 optimizer has 3x + 1 variables, while V2 + # optimizer has 2x + 1 variables. Filter vhats out for compatibility. 
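# Editorial note, not part of the patch: as a concrete example of the count
# described above, with two model variables a V2 optimizer checkpoint holds
# 1 iteration counter + 2 'm' slots + 2 'v' slots = 5 weights, while a V1
# Adam-style checkpoint also carries 2 'vhat' slots, i.e. 3x + 1 = 7.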
+ num_vars = int((len(params) - 1) / 2) + if len(weights) == 3 * num_vars + 1: + weights = weights[:len(params)] + super(Novograd, self).set_weights(weights) + + def _resource_apply_dense(self, grad, var, apply_state=None): + var_device, var_dtype = var.device, var.dtype.base_dtype + coefficients = ((apply_state or {}).get((var_device, var_dtype)) + or self._fallback_apply_state(var_device, var_dtype)) + weight_decay = self._get_hyper('weight_decay') + grad_averaging = self._get_hyper('weight_averaging') + + v = self.get_slot(var, 'v') + g_2 = tf.reduce_sum(tf.square(tf.cast(grad, tf.float32))) + v_t = tf.cond(tf.equal(self.iterations, 0), + g_2, + v * coefficients['beta_2_t'] + g_2 * coefficients['one_minus_beta_2_t']) + v_t = v.assign(v_t, use_locking=self._use_locking) + + grad = grad / (tf.sqrt(v_t) + self.epsilon) + + m = self.get_slot(var, 'm') + m_t = tf.cond(tf.equal(self.iterations, 0), + grad, + tf.cond(grad_averaging, + m * coefficients['beta_1_t'] + grad, + m * coefficients['beta_1_t'] + grad * coefficients['one_minus_beta_1_t'])) + m_t = tf.cond(tf.greater(weight_decay, 0), + m_t + weight_decay * var, + m_t) + m_t = m.assign(m_t, use_locking=self._use_locking) + + var_update = var - coefficients['lr'] * m_t + + return var.assign(var_update, use_locking=self._use_locking).op + + def _resource_apply_sparse(self, grad, var, indices, apply_state=None): + var_device, var_dtype = var.device, var.dtype.base_dtype + coefficients = ((apply_state or {}).get((var_device, var_dtype)) + or self._fallback_apply_state(var_device, var_dtype)) + weight_decay = self._get_hyper('weight_decay') + grad_averaging = self._get_hyper('grad_averaging') + + v = self.get_slot(var, 'v') + g_2 = tf.sparse.reduce_sum(tf.square(tf.cast(grad, tf.float32))) + # v is just a scalar and does not need to involve sparse tensors. 
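# ---------------------------------------------------------------------------
# Editorial sketch, not part of the patch: the NovoGrad update that the
# _resource_apply_dense/_resource_apply_sparse methods compute (in the form
# the later patches in this series settle on), written as plain NumPy for a
# single variable ("layer"). It ignores the bias-correction rescaling of the
# learning rate done in _prepare_local; the function name and defaults here
# are illustrative only.
import numpy as np

def novograd_step(w, g, m, v, step, lr=0.1, beta_1=0.95, beta_2=0.98,
                  epsilon=1e-8, weight_decay=0.0, grad_averaging=False):
    """One NovoGrad update for a single tensor w with gradient g."""
    g_2 = float(np.sum(g ** 2))           # layer-wise squared norm: one scalar per variable
    v = g_2 if step == 0 else beta_2 * v + (1.0 - beta_2) * g_2
    g_hat = g / (np.sqrt(v) + epsilon)     # gradient rescaled by the layer-wise norm
    if weight_decay > 0.0:
        g_hat = g_hat + weight_decay * w   # decoupled weight decay
    if grad_averaging:
        g_hat = (1.0 - beta_1) * g_hat     # Adam-style moving average of grads
    m = beta_1 * m + g_hat                 # momentum accumulator
    w = w - lr * m
    return w, m, v
# ---------------------------------------------------------------------------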
+ v_t = tf.cond(tf.equal(self.iterations, 0), + g_2, + v * coefficients['beta_2_t'] + g_2 * coefficients['one_minus_beta_2_t']) + v_t = v.assign(v_t, use_locking=self._use_locking) + + grad = grad / (tf.sqrt(v_t) + self.epsilon) + m = self.get_slot(var, 'm') + m_t = tf.cond(tf.equal(self.iterations, 0), + grad, + tf.cond(grad_averaging, + self._resource_scatter_add(m * coefficients['beta_1_t'], indices, grad), + self._resource_scatter_add(m * coefficients['beta_1_t'], indices, + grad * coefficients['one_minus_beta_1_t']))) + m_t = tf.cond(tf.greater(weight_decay, 0), + self._resource_scatter_add(m_t, indices, weight_decay * var), + m_t) + m_t = m.assign(m_t, use_locking=self._use_locking) + + var_update = self._resource_scatter_add(var, coefficients['lr_t'] * (-m_t)) + + return var.assign(var_update, use_locking=self._use_locking).op + + def get_config(self): + config = super(Novograd, self).get_config() + config.update({ + 'learning_rate': self._serialize_hyperparameter('learning_rate'), + 'beta_1': self._serialize_hyperparameter('beta_1'), + 'beta_2': self._serialize_hyperparameter('beta_2'), + 'epsilon': self.epsilon, + 'weight_decay': self._serialize_hyperparameter('weight_decay'), + 'grad_averaging': self._serialize_hyperparameter('grad_averaging'), + }) + return config From 60bee2585fbef8f59ec5c406a9b4cf37e0bb0f9f Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Sun, 5 Jan 2020 04:48:01 +0000 Subject: [PATCH 02/62] Add tests from rectified adam --- tensorflow_addons/optimizers/README.md | 2 + tensorflow_addons/optimizers/__init__.py | 1 + tensorflow_addons/optimizers/novograd.py | 62 +++---- tensorflow_addons/optimizers/novograd_test.py | 156 ++++++++++++++++++ 4 files changed, 193 insertions(+), 28 deletions(-) create mode 100644 tensorflow_addons/optimizers/novograd_test.py diff --git a/tensorflow_addons/optimizers/README.md b/tensorflow_addons/optimizers/README.md index c73e49b2ed..50513f9ec0 100644 --- a/tensorflow_addons/optimizers/README.md +++ b/tensorflow_addons/optimizers/README.md @@ -9,6 +9,7 @@ | lazy_adam | Saishruthi Swaminathan | saishruthi.tn@gmail.com | | lookahead | Zhao Hanguang | cyberzhg@gmail.com | | moving_average | Dheeraj R. 
Reddy | dheeraj98reddy@gmail.com | +| novograd | Shreyash Patodia | patodiashreyash32@gmail.com | | rectified_adam | Zhao Hanguang | cyberzhg@gmail.com | | stochastic_weight_averaging | Shreyash Patodia | patodiashreyash32@gmail.com | | weight_decay_optimizers | Phil Jund | ijund.phil@googlemail.com | @@ -25,6 +26,7 @@ | lazy_adam | LazyAdam | https://arxiv.org/abs/1412.6980 | | lookahead | Lookahead | https://arxiv.org/abs/1907.08610v1 | | moving_average | MovingAverage | | +| novograd | Novograd | https://nvidia.github.io/OpenSeq2Seq/html/optimizers.html | | rectified_adam | RectifiedAdam | https://arxiv.org/pdf/1908.03265v1.pdf | | stochastic_weight_averaging | SWA | https://arxiv.org/abs/1803.05407.pdf | | weight_decay_optimizers | SGDW, AdamW, extend_with_decoupled_weight_decay | https://arxiv.org/pdf/1711.05101.pdf | diff --git a/tensorflow_addons/optimizers/__init__.py b/tensorflow_addons/optimizers/__init__.py index 2deaf5ee66..42ba48e15a 100644 --- a/tensorflow_addons/optimizers/__init__.py +++ b/tensorflow_addons/optimizers/__init__.py @@ -32,6 +32,7 @@ from tensorflow_addons.optimizers.lookahead import Lookahead from tensorflow_addons.optimizers.moving_average import MovingAverage from tensorflow_addons.optimizers.rectified_adam import RectifiedAdam +from tensorflow_addons.optimizers.novograd import Novograd from tensorflow_addons.optimizers.stochastic_weight_averaging import SWA from tensorflow_addons.optimizers.weight_decay_optimizers import AdamW from tensorflow_addons.optimizers.weight_decay_optimizers import SGDW diff --git a/tensorflow_addons/optimizers/novograd.py b/tensorflow_addons/optimizers/novograd.py index 154e32c0f2..813d98156e 100644 --- a/tensorflow_addons/optimizers/novograd.py +++ b/tensorflow_addons/optimizers/novograd.py @@ -1,9 +1,24 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== """Novograd for TensorFlow.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function import tensorflow as tf +from tensorflow.python.training import training_ops @tf.keras.utils.register_keras_serializable(package='Addons') @@ -77,21 +92,16 @@ def _resource_apply_dense(self, grad, var, apply_state=None): v_t = v.assign(v_t, use_locking=self._use_locking) grad = grad / (tf.sqrt(v_t) + self.epsilon) - + grad = tf.cond(grad_averaging, grad * coefficients['one_minus_beta_1_t'], grad) + grad = tf.cond(tf.greater(weight_decay, 0), grad + weight_decay * var, grad) m = self.get_slot(var, 'm') - m_t = tf.cond(tf.equal(self.iterations, 0), - grad, - tf.cond(grad_averaging, - m * coefficients['beta_1_t'] + grad, - m * coefficients['beta_1_t'] + grad * coefficients['one_minus_beta_1_t'])) - m_t = tf.cond(tf.greater(weight_decay, 0), - m_t + weight_decay * var, - m_t) - m_t = m.assign(m_t, use_locking=self._use_locking) - - var_update = var - coefficients['lr'] * m_t - - return var.assign(var_update, use_locking=self._use_locking).op + return training_ops.resource_apply_momentum(var.handle, + m.handle, + coefficients['lr'], + grad, + coefficients['beta_1_t'], + use_locking=self._use_locking, + use_nesterov=False) def _resource_apply_sparse(self, grad, var, indices, apply_state=None): var_device, var_dtype = var.device, var.dtype.base_dtype @@ -109,21 +119,17 @@ def _resource_apply_sparse(self, grad, var, indices, apply_state=None): v_t = v.assign(v_t, use_locking=self._use_locking) grad = grad / (tf.sqrt(v_t) + self.epsilon) + grad = tf.cond(grad_averaging, grad * coefficients['one_minus_beta_1_t'], grad) + grad = tf.cond(tf.greater(weight_decay, 0), self._resource_scatter_add(grad, indices, weight_decay * var)) m = self.get_slot(var, 'm') - m_t = tf.cond(tf.equal(self.iterations, 0), - grad, - tf.cond(grad_averaging, - self._resource_scatter_add(m * coefficients['beta_1_t'], indices, grad), - self._resource_scatter_add(m * coefficients['beta_1_t'], indices, - grad * coefficients['one_minus_beta_1_t']))) - m_t = tf.cond(tf.greater(weight_decay, 0), - self._resource_scatter_add(m_t, indices, weight_decay * var), - m_t) - m_t = m.assign(m_t, use_locking=self._use_locking) - - var_update = self._resource_scatter_add(var, coefficients['lr_t'] * (-m_t)) - - return var.assign(var_update, use_locking=self._use_locking).op + return training_ops.resource_apply_sparse_momentum(var.handle, + m.handle, + coefficients['lr'], + grad, + indices, + coefficients['beta_1_t'], + use_locking=self._use_locking, + use_nesterov=False) def get_config(self): config = super(Novograd, self).get_config() diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py new file mode 100644 index 0000000000..e988d9a270 --- /dev/null +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -0,0 +1,156 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for Novograd Optimizer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +from tensorflow_addons.utils import test_utils +from tensorflow_addons.optimizers import Novograd + + +@test_utils.run_all_in_graph_and_eager_modes +class NovogradTest(tf.test.TestCase): + def run_dense_sample(self, iterations, expected, optimizer): + var_0 = tf.Variable([1.0, 2.0], dtype=tf.dtypes.float32) + var_1 = tf.Variable([3.0, 4.0], dtype=tf.dtypes.float32) + + grad_0 = tf.constant([0.1, 0.2], dtype=tf.dtypes.float32) + grad_1 = tf.constant([0.03, 0.04], dtype=tf.dtypes.float32) + + grads_and_vars = list(zip([grad_0, grad_1], [var_0, var_1])) + + if tf.executing_eagerly(): + for _ in range(iterations): + optimizer.apply_gradients(grads_and_vars) + else: + update = optimizer.apply_gradients(grads_and_vars) + self.evaluate(tf.compat.v1.global_variables_initializer()) + for _ in range(iterations): + self.evaluate(update) + + self.assertAllClose(var_0.read_value(), expected[0], atol=2e-4) + self.assertAllClose(var_1.read_value(), expected[1], atol=2e-4) + + def run_sparse_sample(self, iterations, expected, optimizer): + var_0 = tf.Variable([1.0, 2.0]) + var_1 = tf.Variable([3.0, 4.0]) + + grad_0 = tf.IndexedSlices( + tf.constant([0.1]), tf.constant([0]), tf.constant([2])) + grad_1 = tf.IndexedSlices( + tf.constant([0.04]), tf.constant([1]), tf.constant([2])) + + grads_and_vars = list(zip([grad_0, grad_1], [var_0, var_1])) + + if tf.executing_eagerly(): + for _ in range(iterations): + optimizer.apply_gradients(grads_and_vars) + else: + update = optimizer.apply_gradients(grads_and_vars) + self.evaluate(tf.compat.v1.global_variables_initializer()) + for _ in range(iterations): + self.evaluate(update) + + self.assertAllClose(var_0.read_value(), expected[0], atol=2e-4) + self.assertAllClose(var_1.read_value(), expected[1], atol=2e-4) + + def test_dense_sample(self): + # Expected values are obtained from the official implementation + self.run_dense_sample( + iterations=1, + expected=[[0.5554, 1.5549], [2.5557, 3.5557]], + optimizer=Novograd(lr=1e-3), + ) + self.run_dense_sample( + iterations=1, + expected=[[0.5554, 1.5549], [2.5557, 3.5557]], + optimizer=Novograd(lr=1e-3), + ) + + def test_sparse_sample(self): + # Expected values are obtained from the official implementation + # Dense results should be: [-0.1929, 0.8066], [1.8075, 2.8074] + self.run_sparse_sample( + iterations=1, + expected=[[-0.1929, 2.0], [3.0, 2.8074]], + optimizer=Novograd(lr=1e-3), + ) + self.run_sparse_sample( + iterations=2, + expected=[[-0.1929, 2.0], [3.0, 2.8074]], + optimizer=Novograd(lr=1e-3), + ) + + def test_dense_sample_with_weight_decay(self): + # Expected values are obtained from the official implementation + self.run_dense_sample( + iterations=1, + expected=[[0.5472, 1.5368], [2.5276, 3.5176]], + optimizer=Novograd(lr=1e-3, weight_decay=0.01), + ) + self.run_dense_sample( + iterations=1, + expected=[[0.5472, 1.5368], [2.5276, 3.5176]], + optimizer=Novograd(lr=1e-3, weight_decay=0.01), + ) + + def test_sparse_sample_with_weight_decay(self): + # Expected values are obtained from the official implementation + # Dense results should be: [-0.2029, 0.7768], [1.7578, 2.7380] + self.run_sparse_sample( + iterations=1, + expected=[[-0.2029, 2.0], [3.0, 
2.7380]], + optimizer=Novograd(lr=1e-3, weight_decay=0.01), + ) + self.run_sparse_sample( + iterations=2, + expected=[[-0.2029, 2.0], [3.0, 2.7380]], + optimizer=Novograd(lr=1e-3, weight_decay=0.01), + ) + + def test_dense_sample_with_grad_averaging(self): + self.run_dense_sample( + iterations=1, + expected=[[0.8041, 1.8041], [2.8041, 3.8041]], + optimizer=Novograd( + lr=1e-3, + grad_averaging=True + ) + ) + + def test_sparse_sample_with_grad_averaging(self): + self.run_sparse_sample( + iterations=2, + expected=[[0.4653, 2.0], [3.0, 3.4653]], + optimizer=Novograd( + lr=1e-3, + grad_averaging=True + ) + ) + + def test_get_config(self): + opt = Novograd(lr=1e-4) + config = opt.get_config() + self.assertEqual(config['learning_rate'], 1e-4) + self.assertEqual(config['weight_decay'], 0.0) + self.assertEqual(config['grad_averaging'], False) + + +if __name__ == '__main__': + tf.test.main() From b692a7046691911fa8eeafada89456360f880175 Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Sun, 5 Jan 2020 04:50:43 +0000 Subject: [PATCH 03/62] Add build and __init__ --- tensorflow_addons/optimizers/BUILD | 13 +++++++++++++ tensorflow_addons/optimizers/__init__.py | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/tensorflow_addons/optimizers/BUILD b/tensorflow_addons/optimizers/BUILD index 00a31f327e..f796ecf556 100644 --- a/tensorflow_addons/optimizers/BUILD +++ b/tensorflow_addons/optimizers/BUILD @@ -12,6 +12,7 @@ py_library( "lazy_adam.py", "lookahead.py", "moving_average.py", + "novograd.py" "rectified_adam.py", "stochastic_weight_averaging.py", "weight_decay_optimizers.py", @@ -106,6 +107,18 @@ py_test( ], ) +py_test( + name = "novograd_test", + size = "small", + srcs = [ + "novograd_test.py", + ], + main = "novograd_test.py", + deps = [ + ":optimizers", + ], +) + py_test( name = "rectified_adam_test", size = "small", diff --git a/tensorflow_addons/optimizers/__init__.py b/tensorflow_addons/optimizers/__init__.py index 42ba48e15a..919eb76502 100644 --- a/tensorflow_addons/optimizers/__init__.py +++ b/tensorflow_addons/optimizers/__init__.py @@ -31,8 +31,8 @@ from tensorflow_addons.optimizers.lazy_adam import LazyAdam from tensorflow_addons.optimizers.lookahead import Lookahead from tensorflow_addons.optimizers.moving_average import MovingAverage -from tensorflow_addons.optimizers.rectified_adam import RectifiedAdam from tensorflow_addons.optimizers.novograd import Novograd +from tensorflow_addons.optimizers.rectified_adam import RectifiedAdam from tensorflow_addons.optimizers.stochastic_weight_averaging import SWA from tensorflow_addons.optimizers.weight_decay_optimizers import AdamW from tensorflow_addons.optimizers.weight_decay_optimizers import SGDW From b622e2a7fddf09e9ccd03f55075f625f935414fd Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Sun, 5 Jan 2020 04:53:40 +0000 Subject: [PATCH 04/62] Code format --- tensorflow_addons/optimizers/BUILD | 2 +- tensorflow_addons/optimizers/novograd.py | 101 ++++++++++-------- tensorflow_addons/optimizers/novograd_test.py | 12 +-- 3 files changed, 62 insertions(+), 53 deletions(-) diff --git a/tensorflow_addons/optimizers/BUILD b/tensorflow_addons/optimizers/BUILD index f796ecf556..87599d8654 100644 --- a/tensorflow_addons/optimizers/BUILD +++ b/tensorflow_addons/optimizers/BUILD @@ -12,7 +12,7 @@ py_library( "lazy_adam.py", "lookahead.py", "moving_average.py", - "novograd.py" + "novograd.py", "rectified_adam.py", "stochastic_weight_averaging.py", "weight_decay_optimizers.py", diff --git a/tensorflow_addons/optimizers/novograd.py 
b/tensorflow_addons/optimizers/novograd.py index 813d98156e..ed5a033aa8 100644 --- a/tensorflow_addons/optimizers/novograd.py +++ b/tensorflow_addons/optimizers/novograd.py @@ -23,7 +23,6 @@ @tf.keras.utils.register_keras_serializable(package='Addons') class Novograd(tf.keras.optimizer.Optimizer): - def __init__(self, learning_rate=1, beta_1=0.95, @@ -48,24 +47,29 @@ def _create_slots(self, var_list): for var in var_list: self.add_slot(var=var, slot_name='m', initializer='zeros') for var in var_list: - self.add_slot(var=var, slot_name='v', initializer=tf.zeros(shape=[], dtype=var.dtype)) + self.add_slot( + var=var, + slot_name='v', + initializer=tf.zeros(shape=[], dtype=var.dtype)) def _prepare_local(self, var_device, var_dtype, apply_state): - super(Novograd, self)._prepare_local(var_device, var_dtype, apply_state) + super(Novograd, self)._prepare_local(var_device, var_dtype, + apply_state) local_step = tf.cast(self.iterations + 1, var_device) beta_1_t = tf.identity(self._get_hyper('beta_1', var_dtype)) beta_2_t = tf.identity(self._get_hyper('beta_2', var_dtype)) beta_1_power = tf.pow(beta_1_t, local_step) beta_2_power = tf.pow(beta_2_t, local_step) lr = (apply_state[(var_device, var_dtype)]['lr_t'] * - (tf.sqrt(1 - beta_2_power) / (1 - beta_1_power))) - apply_state[(var_device, var_dtype)].update(dict( - lr=lr, - epsilon=tf.convert_to_tensor(self.epsilon, var_dtype), - beta_1_t=beta_1_t, - beta_2_t=beta_2_t, - one_minus_beta_2_t=1 - beta_2_t, - )) + (tf.sqrt(1 - beta_2_power) / (1 - beta_1_power))) + apply_state[(var_device, var_dtype)].update( + dict( + lr=lr, + epsilon=tf.convert_to_tensor(self.epsilon, var_dtype), + beta_1_t=beta_1_t, + beta_2_t=beta_2_t, + one_minus_beta_2_t=1 - beta_2_t, + )) def set_weights(self, weights): params = self.weights @@ -86,22 +90,25 @@ def _resource_apply_dense(self, grad, var, apply_state=None): v = self.get_slot(var, 'v') g_2 = tf.reduce_sum(tf.square(tf.cast(grad, tf.float32))) - v_t = tf.cond(tf.equal(self.iterations, 0), - g_2, - v * coefficients['beta_2_t'] + g_2 * coefficients['one_minus_beta_2_t']) + v_t = tf.cond( + tf.equal(self.iterations, 0), g_2, v * coefficients['beta_2_t'] + + g_2 * coefficients['one_minus_beta_2_t']) v_t = v.assign(v_t, use_locking=self._use_locking) grad = grad / (tf.sqrt(v_t) + self.epsilon) - grad = tf.cond(grad_averaging, grad * coefficients['one_minus_beta_1_t'], grad) - grad = tf.cond(tf.greater(weight_decay, 0), grad + weight_decay * var, grad) + grad = tf.cond(grad_averaging, + grad * coefficients['one_minus_beta_1_t'], grad) + grad = tf.cond( + tf.greater(weight_decay, 0), grad + weight_decay * var, grad) m = self.get_slot(var, 'm') - return training_ops.resource_apply_momentum(var.handle, - m.handle, - coefficients['lr'], - grad, - coefficients['beta_1_t'], - use_locking=self._use_locking, - use_nesterov=False) + return training_ops.resource_apply_momentum( + var.handle, + m.handle, + coefficients['lr'], + grad, + coefficients['beta_1_t'], + use_locking=self._use_locking, + use_nesterov=False) def _resource_apply_sparse(self, grad, var, indices, apply_state=None): var_device, var_dtype = var.device, var.dtype.base_dtype @@ -113,32 +120,42 @@ def _resource_apply_sparse(self, grad, var, indices, apply_state=None): v = self.get_slot(var, 'v') g_2 = tf.sparse.reduce_sum(tf.square(tf.cast(grad, tf.float32))) # v is just a scalar and does not need to involve sparse tensors. 
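# Editorial note, not part of the patch: the switch to the fused momentum
# kernel works because, once the gradient has been rescaled by the layer-wise
# norm (and weight decay / grad averaging applied), the remaining NovoGrad
# update is exactly SGD with momentum:
#     m_t <- beta_1 * m_{t-1} + g_hat_t
#     w_t <- w_{t-1} - lr_t * m_t
# which is what training_ops.resource_apply_momentum computes with
# momentum = beta_1 and use_nesterov=False.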
- v_t = tf.cond(tf.equal(self.iterations, 0), - g_2, - v * coefficients['beta_2_t'] + g_2 * coefficients['one_minus_beta_2_t']) + v_t = tf.cond( + tf.equal(self.iterations, 0), g_2, v * coefficients['beta_2_t'] + + g_2 * coefficients['one_minus_beta_2_t']) v_t = v.assign(v_t, use_locking=self._use_locking) grad = grad / (tf.sqrt(v_t) + self.epsilon) - grad = tf.cond(grad_averaging, grad * coefficients['one_minus_beta_1_t'], grad) - grad = tf.cond(tf.greater(weight_decay, 0), self._resource_scatter_add(grad, indices, weight_decay * var)) + grad = tf.cond(grad_averaging, + grad * coefficients['one_minus_beta_1_t'], grad) + grad = tf.cond( + tf.greater(weight_decay, 0), + self._resource_scatter_add(grad, indices, weight_decay * var)) m = self.get_slot(var, 'm') - return training_ops.resource_apply_sparse_momentum(var.handle, - m.handle, - coefficients['lr'], - grad, - indices, - coefficients['beta_1_t'], - use_locking=self._use_locking, - use_nesterov=False) + return training_ops.resource_apply_sparse_momentum( + var.handle, + m.handle, + coefficients['lr'], + grad, + indices, + coefficients['beta_1_t'], + use_locking=self._use_locking, + use_nesterov=False) def get_config(self): config = super(Novograd, self).get_config() config.update({ - 'learning_rate': self._serialize_hyperparameter('learning_rate'), - 'beta_1': self._serialize_hyperparameter('beta_1'), - 'beta_2': self._serialize_hyperparameter('beta_2'), - 'epsilon': self.epsilon, - 'weight_decay': self._serialize_hyperparameter('weight_decay'), - 'grad_averaging': self._serialize_hyperparameter('grad_averaging'), + 'learning_rate': + self._serialize_hyperparameter('learning_rate'), + 'beta_1': + self._serialize_hyperparameter('beta_1'), + 'beta_2': + self._serialize_hyperparameter('beta_2'), + 'epsilon': + self.epsilon, + 'weight_decay': + self._serialize_hyperparameter('weight_decay'), + 'grad_averaging': + self._serialize_hyperparameter('grad_averaging'), }) return config diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index e988d9a270..990272226c 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -128,21 +128,13 @@ def test_dense_sample_with_grad_averaging(self): self.run_dense_sample( iterations=1, expected=[[0.8041, 1.8041], [2.8041, 3.8041]], - optimizer=Novograd( - lr=1e-3, - grad_averaging=True - ) - ) + optimizer=Novograd(lr=1e-3, grad_averaging=True)) def test_sparse_sample_with_grad_averaging(self): self.run_sparse_sample( iterations=2, expected=[[0.4653, 2.0], [3.0, 3.4653]], - optimizer=Novograd( - lr=1e-3, - grad_averaging=True - ) - ) + optimizer=Novograd(lr=1e-3, grad_averaging=True)) def test_get_config(self): opt = Novograd(lr=1e-4) From b3ea7ef23eba352dfed4d305435dd430a6f4dcdf Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Sun, 5 Jan 2020 05:31:30 +0000 Subject: [PATCH 05/62] Fix errors --- tensorflow_addons/optimizers/novograd.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow_addons/optimizers/novograd.py b/tensorflow_addons/optimizers/novograd.py index ed5a033aa8..c9f59a9f09 100644 --- a/tensorflow_addons/optimizers/novograd.py +++ b/tensorflow_addons/optimizers/novograd.py @@ -22,7 +22,7 @@ @tf.keras.utils.register_keras_serializable(package='Addons') -class Novograd(tf.keras.optimizer.Optimizer): +class Novograd(tf.keras.optimizers.Optimizer): def __init__(self, learning_rate=1, beta_1=0.95, @@ -55,7 +55,7 @@ def _create_slots(self, var_list): def 
_prepare_local(self, var_device, var_dtype, apply_state): super(Novograd, self)._prepare_local(var_device, var_dtype, apply_state) - local_step = tf.cast(self.iterations + 1, var_device) + local_step = tf.cast(self.iterations + 1, var_dtype) beta_1_t = tf.identity(self._get_hyper('beta_1', var_dtype)) beta_2_t = tf.identity(self._get_hyper('beta_2', var_dtype)) beta_1_power = tf.pow(beta_1_t, local_step) @@ -86,7 +86,7 @@ def _resource_apply_dense(self, grad, var, apply_state=None): coefficients = ((apply_state or {}).get((var_device, var_dtype)) or self._fallback_apply_state(var_device, var_dtype)) weight_decay = self._get_hyper('weight_decay') - grad_averaging = self._get_hyper('weight_averaging') + grad_averaging = self._get_hyper('grad_averaging') v = self.get_slot(var, 'v') g_2 = tf.reduce_sum(tf.square(tf.cast(grad, tf.float32))) From d1f25825e798161bfa521b3a23e37ddfc3fa2743 Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Sun, 5 Jan 2020 05:55:46 +0000 Subject: [PATCH 06/62] More fixes --- tensorflow_addons/optimizers/novograd.py | 29 +++++++++++++----------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/tensorflow_addons/optimizers/novograd.py b/tensorflow_addons/optimizers/novograd.py index c9f59a9f09..b2289a84dc 100644 --- a/tensorflow_addons/optimizers/novograd.py +++ b/tensorflow_addons/optimizers/novograd.py @@ -90,16 +90,18 @@ def _resource_apply_dense(self, grad, var, apply_state=None): v = self.get_slot(var, 'v') g_2 = tf.reduce_sum(tf.square(tf.cast(grad, tf.float32))) - v_t = tf.cond( - tf.equal(self.iterations, 0), g_2, v * coefficients['beta_2_t'] + - g_2 * coefficients['one_minus_beta_2_t']) + v_t = tf.cond(tf.equal(self.iterations, 0), + lambda: g_2, + lambda: v * coefficients['beta_2_t'] + g_2 * coefficients['one_minus_beta_2_t']) v_t = v.assign(v_t, use_locking=self._use_locking) grad = grad / (tf.sqrt(v_t) + self.epsilon) grad = tf.cond(grad_averaging, - grad * coefficients['one_minus_beta_1_t'], grad) - grad = tf.cond( - tf.greater(weight_decay, 0), grad + weight_decay * var, grad) + lambda: grad * coefficients['one_minus_beta_1_t'], + lambda: grad) + grad = tf.cond(tf.greater(weight_decay, 0), + lambda: grad + weight_decay * var, + lambda: grad) m = self.get_slot(var, 'm') return training_ops.resource_apply_momentum( var.handle, @@ -120,17 +122,18 @@ def _resource_apply_sparse(self, grad, var, indices, apply_state=None): v = self.get_slot(var, 'v') g_2 = tf.sparse.reduce_sum(tf.square(tf.cast(grad, tf.float32))) # v is just a scalar and does not need to involve sparse tensors. 
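# Editorial note, not part of the patch: tf.cond expects callables for its
# branch arguments, which is why this patch wraps each branch in a lambda;
# passing already-evaluated tensors, as the previous revision did, raises a
# TypeError. A minimal illustration of the pattern being introduced here:
#     v_t = tf.cond(tf.equal(self.iterations, 0),
#                   lambda: g_2,
#                   lambda: v * beta_2_t + g_2 * one_minus_beta_2_t)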
- v_t = tf.cond( - tf.equal(self.iterations, 0), g_2, v * coefficients['beta_2_t'] + - g_2 * coefficients['one_minus_beta_2_t']) + v_t = tf.cond(tf.equal(self.iterations, 0), + lambda: g_2, + lambda: v * coefficients['beta_2_t'] + g_2 * coefficients['one_minus_beta_2_t']) v_t = v.assign(v_t, use_locking=self._use_locking) grad = grad / (tf.sqrt(v_t) + self.epsilon) grad = tf.cond(grad_averaging, - grad * coefficients['one_minus_beta_1_t'], grad) - grad = tf.cond( - tf.greater(weight_decay, 0), - self._resource_scatter_add(grad, indices, weight_decay * var)) + lambda: grad * coefficients['one_minus_beta_1_t'], + lambda: grad) + grad = tf.cond(tf.greater(weight_decay, 0), + self._resource_scatter_add(grad, indices, weight_decay * var), + grad) m = self.get_slot(var, 'm') return training_ops.resource_apply_sparse_momentum( var.handle, From fa184d7f6eea77641eb71d31726c0a8cb1fa4866 Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Sun, 5 Jan 2020 06:50:45 +0000 Subject: [PATCH 07/62] Add back one - beta_1_t --- tensorflow_addons/optimizers/novograd.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow_addons/optimizers/novograd.py b/tensorflow_addons/optimizers/novograd.py index b2289a84dc..d6240075ec 100644 --- a/tensorflow_addons/optimizers/novograd.py +++ b/tensorflow_addons/optimizers/novograd.py @@ -69,6 +69,7 @@ def _prepare_local(self, var_device, var_dtype, apply_state): beta_1_t=beta_1_t, beta_2_t=beta_2_t, one_minus_beta_2_t=1 - beta_2_t, + one_minus_beta_1_t=1 - beta_1_t, )) def set_weights(self, weights): From 863b4dcc005766dfdebdb0f4c3ab202be8deedcc Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Sun, 5 Jan 2020 07:21:57 +0000 Subject: [PATCH 08/62] Fix some sparse errors --- tensorflow_addons/optimizers/novograd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow_addons/optimizers/novograd.py b/tensorflow_addons/optimizers/novograd.py index d6240075ec..4420efbea6 100644 --- a/tensorflow_addons/optimizers/novograd.py +++ b/tensorflow_addons/optimizers/novograd.py @@ -121,7 +121,7 @@ def _resource_apply_sparse(self, grad, var, indices, apply_state=None): grad_averaging = self._get_hyper('grad_averaging') v = self.get_slot(var, 'v') - g_2 = tf.sparse.reduce_sum(tf.square(tf.cast(grad, tf.float32))) + g_2 = tf.reduce_sum(tf.square(tf.cast(grad, tf.float32))) # v is just a scalar and does not need to involve sparse tensors. 
v_t = tf.cond(tf.equal(self.iterations, 0), lambda: g_2, @@ -133,7 +133,7 @@ def _resource_apply_sparse(self, grad, var, indices, apply_state=None): lambda: grad * coefficients['one_minus_beta_1_t'], lambda: grad) grad = tf.cond(tf.greater(weight_decay, 0), - self._resource_scatter_add(grad, indices, weight_decay * var), + grad + weight_decay * var, grad) m = self.get_slot(var, 'm') return training_ops.resource_apply_sparse_momentum( From 53fca2c5dcbf58e0723db4ac118dad1009b4f769 Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Sun, 5 Jan 2020 07:28:18 +0000 Subject: [PATCH 09/62] Fix some sparse errors --- tensorflow_addons/optimizers/novograd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow_addons/optimizers/novograd.py b/tensorflow_addons/optimizers/novograd.py index 4420efbea6..fae2faafc8 100644 --- a/tensorflow_addons/optimizers/novograd.py +++ b/tensorflow_addons/optimizers/novograd.py @@ -133,8 +133,8 @@ def _resource_apply_sparse(self, grad, var, indices, apply_state=None): lambda: grad * coefficients['one_minus_beta_1_t'], lambda: grad) grad = tf.cond(tf.greater(weight_decay, 0), - grad + weight_decay * var, - grad) + lambda: grad + weight_decay * var, + lambda: grad) m = self.get_slot(var, 'm') return training_ops.resource_apply_sparse_momentum( var.handle, From 54e38ecd4571b5114d23cd2cce55d4994ade83e3 Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Sun, 5 Jan 2020 07:33:15 +0000 Subject: [PATCH 10/62] More fixes --- tensorflow_addons/optimizers/novograd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_addons/optimizers/novograd.py b/tensorflow_addons/optimizers/novograd.py index fae2faafc8..6818055648 100644 --- a/tensorflow_addons/optimizers/novograd.py +++ b/tensorflow_addons/optimizers/novograd.py @@ -136,7 +136,7 @@ def _resource_apply_sparse(self, grad, var, indices, apply_state=None): lambda: grad + weight_decay * var, lambda: grad) m = self.get_slot(var, 'm') - return training_ops.resource_apply_sparse_momentum( + return training_ops.resource_sparse_apply_momentum( var.handle, m.handle, coefficients['lr'], From 156af572a3c891ece9d4c8ef263b6e0bff79e502 Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Sun, 5 Jan 2020 07:52:14 +0000 Subject: [PATCH 11/62] More sparse fixes --- tensorflow_addons/optimizers/novograd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_addons/optimizers/novograd.py b/tensorflow_addons/optimizers/novograd.py index 6818055648..97b5ed77e7 100644 --- a/tensorflow_addons/optimizers/novograd.py +++ b/tensorflow_addons/optimizers/novograd.py @@ -140,7 +140,7 @@ def _resource_apply_sparse(self, grad, var, indices, apply_state=None): var.handle, m.handle, coefficients['lr'], - grad, + tf.gather(grad, indices), indices, coefficients['beta_1_t'], use_locking=self._use_locking, From b2ed149b89c3959fe820faf6e4711adbeb66cd55 Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Sun, 5 Jan 2020 08:18:27 +0000 Subject: [PATCH 12/62] Change tests --- tensorflow_addons/optimizers/novograd.py | 2 +- tensorflow_addons/optimizers/novograd_test.py | 25 ++++++------------- 2 files changed, 9 insertions(+), 18 deletions(-) diff --git a/tensorflow_addons/optimizers/novograd.py b/tensorflow_addons/optimizers/novograd.py index 97b5ed77e7..40c33e8c5a 100644 --- a/tensorflow_addons/optimizers/novograd.py +++ b/tensorflow_addons/optimizers/novograd.py @@ -24,7 +24,7 @@ @tf.keras.utils.register_keras_serializable(package='Addons') class 
Novograd(tf.keras.optimizers.Optimizer): def __init__(self, - learning_rate=1, + learning_rate=0.1, beta_1=0.95, beta_2=0.98, epsilon=1e-8, diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index 990272226c..861bee816f 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -31,7 +31,7 @@ def run_dense_sample(self, iterations, expected, optimizer): var_1 = tf.Variable([3.0, 4.0], dtype=tf.dtypes.float32) grad_0 = tf.constant([0.1, 0.2], dtype=tf.dtypes.float32) - grad_1 = tf.constant([0.03, 0.04], dtype=tf.dtypes.float32) + grad_1 = tf.constant([0.3, 0.4], dtype=tf.dtypes.float32) grads_and_vars = list(zip([grad_0, grad_1], [var_0, var_1])) @@ -72,15 +72,12 @@ def run_sparse_sample(self, iterations, expected, optimizer): def test_dense_sample(self): # Expected values are obtained from the official implementation + # m_1: 0.4472135755, 0.894427151 + self.run_dense_sample( iterations=1, - expected=[[0.5554, 1.5549], [2.5557, 3.5557]], - optimizer=Novograd(lr=1e-3), - ) - self.run_dense_sample( - iterations=1, - expected=[[0.5554, 1.5549], [2.5557, 3.5557]], - optimizer=Novograd(lr=1e-3), + expected=[[1.9105572849, 0.9552786425], [2.9400000012, 3.9200000016]], + optimizer=Novograd(lr=0.1), ) def test_sparse_sample(self): @@ -100,15 +97,9 @@ def test_sparse_sample(self): def test_dense_sample_with_weight_decay(self): # Expected values are obtained from the official implementation self.run_dense_sample( - iterations=1, - expected=[[0.5472, 1.5368], [2.5276, 3.5176]], - optimizer=Novograd(lr=1e-3, weight_decay=0.01), - ) - self.run_dense_sample( - iterations=1, - expected=[[0.5472, 1.5368], [2.5276, 3.5176]], - optimizer=Novograd(lr=1e-3, weight_decay=0.01), - ) + iterations=2, + expected=[[1.9105572849, 0.9552786425], [2.9400000012, 3.9200000016]], + optimizer=Novograd(lr=0.1, weight_decay=0.01)) def test_sparse_sample_with_weight_decay(self): # Expected values are obtained from the official implementation From ceb97ba885ea0729ac6095b6efca89ee9b388cd3 Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Sun, 5 Jan 2020 08:22:29 +0000 Subject: [PATCH 13/62] Fix ordering --- tensorflow_addons/optimizers/novograd.py | 1 - tensorflow_addons/optimizers/novograd_test.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/tensorflow_addons/optimizers/novograd.py b/tensorflow_addons/optimizers/novograd.py index 40c33e8c5a..c63838a46d 100644 --- a/tensorflow_addons/optimizers/novograd.py +++ b/tensorflow_addons/optimizers/novograd.py @@ -94,7 +94,6 @@ def _resource_apply_dense(self, grad, var, apply_state=None): v_t = tf.cond(tf.equal(self.iterations, 0), lambda: g_2, lambda: v * coefficients['beta_2_t'] + g_2 * coefficients['one_minus_beta_2_t']) - v_t = v.assign(v_t, use_locking=self._use_locking) grad = grad / (tf.sqrt(v_t) + self.epsilon) grad = tf.cond(grad_averaging, diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index 861bee816f..0938305f7a 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -98,7 +98,7 @@ def test_dense_sample_with_weight_decay(self): # Expected values are obtained from the official implementation self.run_dense_sample( iterations=2, - expected=[[1.9105572849, 0.9552786425], [2.9400000012, 3.9200000016]], + expected=[[0.9552786425, 1.9105572849], [2.9400000012, 3.9200000016]], optimizer=Novograd(lr=0.1, weight_decay=0.01)) def 
test_sparse_sample_with_weight_decay(self): From 0fe393dc2a2069d806463990bca623394ab2a11d Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Sun, 5 Jan 2020 08:27:19 +0000 Subject: [PATCH 14/62] More test fixes --- tensorflow_addons/optimizers/novograd_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index 0938305f7a..f8c0f22aff 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -76,7 +76,7 @@ def test_dense_sample(self): self.run_dense_sample( iterations=1, - expected=[[1.9105572849, 0.9552786425], [2.9400000012, 3.9200000016]], + expected=[[0.9552786425, 1.9105572849], [2.9400000012, 3.9200000016]], optimizer=Novograd(lr=0.1), ) @@ -98,7 +98,7 @@ def test_dense_sample_with_weight_decay(self): # Expected values are obtained from the official implementation self.run_dense_sample( iterations=2, - expected=[[0.9552786425, 1.9105572849], [2.9400000012, 3.9200000016]], + expected=[[0.9010044985, 1.802008997], [2.8506000024, ]], optimizer=Novograd(lr=0.1, weight_decay=0.01)) def test_sparse_sample_with_weight_decay(self): From 731f13250dcdd44776082dccec19917198b3d4db Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Sun, 5 Jan 2020 08:35:50 +0000 Subject: [PATCH 15/62] Account for learning rate --- tensorflow_addons/optimizers/novograd_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index f8c0f22aff..762ca3e699 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -76,7 +76,7 @@ def test_dense_sample(self): self.run_dense_sample( iterations=1, - expected=[[0.9552786425, 1.9105572849], [2.9400000012, 3.9200000016]], + expected=[[0.8735088993, 1.7470177985], [2.8302943759, 3.7737258345]], optimizer=Novograd(lr=0.1), ) From fa17484a1a6f5aa7b2ccc0ca7d670118751fb456 Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Sun, 5 Jan 2020 08:59:29 +0000 Subject: [PATCH 16/62] Fix error --- tensorflow_addons/optimizers/novograd.py | 2 +- tensorflow_addons/optimizers/novograd_test.py | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/tensorflow_addons/optimizers/novograd.py b/tensorflow_addons/optimizers/novograd.py index c63838a46d..67ef54caa0 100644 --- a/tensorflow_addons/optimizers/novograd.py +++ b/tensorflow_addons/optimizers/novograd.py @@ -94,7 +94,7 @@ def _resource_apply_dense(self, grad, var, apply_state=None): v_t = tf.cond(tf.equal(self.iterations, 0), lambda: g_2, lambda: v * coefficients['beta_2_t'] + g_2 * coefficients['one_minus_beta_2_t']) - + v_t = v.assign(v_t, use_locking=self._use_locking) grad = grad / (tf.sqrt(v_t) + self.epsilon) grad = tf.cond(grad_averaging, lambda: grad * coefficients['one_minus_beta_1_t'], diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index 762ca3e699..d0b5a239a7 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -71,8 +71,6 @@ def run_sparse_sample(self, iterations, expected, optimizer): self.assertAllClose(var_1.read_value(), expected[1], atol=2e-4) def test_dense_sample(self): - # Expected values are obtained from the official implementation - # m_1: 0.4472135755, 0.894427151 self.run_dense_sample( iterations=1, @@ -97,9 +95,10 @@ def 
test_sparse_sample(self): def test_dense_sample_with_weight_decay(self): # Expected values are obtained from the official implementation self.run_dense_sample( - iterations=2, - expected=[[0.9010044985, 1.802008997], [2.8506000024, ]], - optimizer=Novograd(lr=0.1, weight_decay=0.01)) + iterations=1, + expected=[[0.7382827095, 1.7470177985], [2.8302943759, 3.7737258345]], + optimizer=Novograd(lr=0.1, weight_decay=0.01), + ) def test_sparse_sample_with_weight_decay(self): # Expected values are obtained from the official implementation From 2071669d88de5ac171289d71feb4f2c3c1f3a9bc Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Sun, 5 Jan 2020 18:29:39 +0000 Subject: [PATCH 17/62] Sparse fix --- tensorflow_addons/optimizers/novograd_test.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index d0b5a239a7..b936de044a 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -52,9 +52,9 @@ def run_sparse_sample(self, iterations, expected, optimizer): var_1 = tf.Variable([3.0, 4.0]) grad_0 = tf.IndexedSlices( - tf.constant([0.1]), tf.constant([0]), tf.constant([2])) + tf.constant([0.1, 0.2]), tf.constant([0, 1]), tf.constant([2])) grad_1 = tf.IndexedSlices( - tf.constant([0.04]), tf.constant([1]), tf.constant([2])) + tf.constant([0.3, 0.4]), tf.constant([0, 1]), tf.constant([2])) grads_and_vars = list(zip([grad_0, grad_1], [var_0, var_1])) @@ -79,23 +79,16 @@ def test_dense_sample(self): ) def test_sparse_sample(self): - # Expected values are obtained from the official implementation - # Dense results should be: [-0.1929, 0.8066], [1.8075, 2.8074] self.run_sparse_sample( iterations=1, - expected=[[-0.1929, 2.0], [3.0, 2.8074]], - optimizer=Novograd(lr=1e-3), - ) - self.run_sparse_sample( - iterations=2, - expected=[[-0.1929, 2.0], [3.0, 2.8074]], - optimizer=Novograd(lr=1e-3), + expected=[[0.8735088993, 1.7470177985], [2.8302943759, 3.7737258345]], + optimizer=Novograd(lr=0.1), ) def test_dense_sample_with_weight_decay(self): # Expected values are obtained from the official implementation self.run_dense_sample( - iterations=1, + iterations=2, expected=[[0.7382827095, 1.7470177985], [2.8302943759, 3.7737258345]], optimizer=Novograd(lr=0.1, weight_decay=0.01), ) From f725cdddffaedfe4661839767e37c158554761ba Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Sun, 5 Jan 2020 19:12:37 +0000 Subject: [PATCH 18/62] Fix weight decay dense --- tensorflow_addons/optimizers/novograd.py | 1 + tensorflow_addons/optimizers/novograd_test.py | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/tensorflow_addons/optimizers/novograd.py b/tensorflow_addons/optimizers/novograd.py index 67ef54caa0..40c33e8c5a 100644 --- a/tensorflow_addons/optimizers/novograd.py +++ b/tensorflow_addons/optimizers/novograd.py @@ -95,6 +95,7 @@ def _resource_apply_dense(self, grad, var, apply_state=None): lambda: g_2, lambda: v * coefficients['beta_2_t'] + g_2 * coefficients['one_minus_beta_2_t']) v_t = v.assign(v_t, use_locking=self._use_locking) + grad = grad / (tf.sqrt(v_t) + self.epsilon) grad = tf.cond(grad_averaging, lambda: grad * coefficients['one_minus_beta_1_t'], diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index b936de044a..5e644e1021 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ 
-81,15 +81,15 @@ def test_dense_sample(self): def test_sparse_sample(self): self.run_sparse_sample( iterations=1, - expected=[[0.8735088993, 1.7470177985], [2.8302943759, 3.7737258345]], + expected=[[0.8706804722, 1.7470177985], [2.8302943759, 3.7737258345]], optimizer=Novograd(lr=0.1), ) def test_dense_sample_with_weight_decay(self): # Expected values are obtained from the official implementation self.run_dense_sample( - iterations=2, - expected=[[0.7382827095, 1.7470177985], [2.8302943759, 3.7737258345]], + iterations=1, + expected=[[0.8706804722, 1.7413609443], [2.8218090945, 3.762412126]], optimizer=Novograd(lr=0.1, weight_decay=0.01), ) @@ -120,7 +120,7 @@ def test_sparse_sample_with_grad_averaging(self): optimizer=Novograd(lr=1e-3, grad_averaging=True)) def test_get_config(self): - opt = Novograd(lr=1e-4) + opt = Novograd(lr=1e-4, weight_decay=0.0, grad_averaging=False) config = opt.get_config() self.assertEqual(config['learning_rate'], 1e-4) self.assertEqual(config['weight_decay'], 0.0) From a9894606188a4124353064f4e9da7d1e8bcff566 Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Sun, 5 Jan 2020 19:25:45 +0000 Subject: [PATCH 19/62] More complete testing for desne resource apply --- tensorflow_addons/optimizers/novograd_test.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index 5e644e1021..5cf0c209ce 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -75,7 +75,7 @@ def test_dense_sample(self): self.run_dense_sample( iterations=1, expected=[[0.8735088993, 1.7470177985], [2.8302943759, 3.7737258345]], - optimizer=Novograd(lr=0.1), + optimizer=Novograd(lr=0.1, beta_2=0.98, beta_1=0.95, epsilon=1e-8), ) def test_sparse_sample(self): @@ -90,7 +90,7 @@ def test_dense_sample_with_weight_decay(self): self.run_dense_sample( iterations=1, expected=[[0.8706804722, 1.7413609443], [2.8218090945, 3.762412126]], - optimizer=Novograd(lr=0.1, weight_decay=0.01), + optimizer=Novograd(lr=0.1, beta_1=0.95, beta_2=0.98, weight_decay=0.01, epsilon=1e-8), ) def test_sparse_sample_with_weight_decay(self): @@ -101,17 +101,12 @@ def test_sparse_sample_with_weight_decay(self): expected=[[-0.2029, 2.0], [3.0, 2.7380]], optimizer=Novograd(lr=1e-3, weight_decay=0.01), ) - self.run_sparse_sample( - iterations=2, - expected=[[-0.2029, 2.0], [3.0, 2.7380]], - optimizer=Novograd(lr=1e-3, weight_decay=0.01), - ) def test_dense_sample_with_grad_averaging(self): self.run_dense_sample( iterations=1, - expected=[[0.8041, 1.8041], [2.8041, 3.8041]], - optimizer=Novograd(lr=1e-3, grad_averaging=True)) + expected=[[0.993675445, 1.8041], [2.8041, 3.8041]], + optimizer=Novograd(lr=0.1, beta_1=0.95, beta_2=0.98, epsilon=1e-8, grad_averaging=True)) def test_sparse_sample_with_grad_averaging(self): self.run_sparse_sample( From 1b60c3caf153c5c499be33e72e6a15bd96049f39 Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Sun, 5 Jan 2020 19:29:38 +0000 Subject: [PATCH 20/62] Add linear model test --- tensorflow_addons/optimizers/novograd_test.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index 5cf0c209ce..408602fe82 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -19,6 +19,7 @@ from __future__ import print_function import tensorflow as tf +import numpy as 
np from tensorflow_addons.utils import test_utils from tensorflow_addons.optimizers import Novograd @@ -114,6 +115,27 @@ def test_sparse_sample_with_grad_averaging(self): expected=[[0.4653, 2.0], [3.0, 3.4653]], optimizer=Novograd(lr=1e-3, grad_averaging=True)) + def test_fit_simple_linear_model(self): + np.random.seed(0x2019) + tf.random.set_seed(0x2019) + + x = np.random.standard_normal((100000, 3)) + w = np.random.standard_normal((3, 1)) + y = np.dot(x, w) + np.random.standard_normal((100000, 1)) * 1e-4 + + model = tf.keras.models.Sequential() + model.add(tf.keras.layers.Dense(input_shape=(3,), units=1)) + model.compile(Novograd(lr=0.01, beta_1=0.9, beta_2=0.999), loss='mse') + + model.fit(x, y, epochs=3) + + x = np.random.standard_normal((100, 3)) + y = np.dot(x, w) + predicted = model.predict(x) + + max_abs_diff = np.max(np.abs(predicted - y)) + self.assertLess(max_abs_diff, 1e-4) + def test_get_config(self): opt = Novograd(lr=1e-4, weight_decay=0.0, grad_averaging=False) config = opt.get_config() From 5f7a0055081a1f3f62aea7e0d254169eb63ff16d Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Sun, 5 Jan 2020 19:38:46 +0000 Subject: [PATCH 21/62] Increase number of epochs for novograd --- tensorflow_addons/optimizers/novograd_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index 408602fe82..b1f31af61e 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -127,7 +127,7 @@ def test_fit_simple_linear_model(self): model.add(tf.keras.layers.Dense(input_shape=(3,), units=1)) model.compile(Novograd(lr=0.01, beta_1=0.9, beta_2=0.999), loss='mse') - model.fit(x, y, epochs=3) + model.fit(x, y, epochs=10) x = np.random.standard_normal((100, 3)) y = np.dot(x, w) From 993f3359ac022a77bb6b7400379cda00d9bb3cf5 Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Sun, 5 Jan 2020 19:40:02 +0000 Subject: [PATCH 22/62] Increae error threshold --- tensorflow_addons/optimizers/novograd_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index b1f31af61e..85762c007d 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -134,7 +134,7 @@ def test_fit_simple_linear_model(self): predicted = model.predict(x) max_abs_diff = np.max(np.abs(predicted - y)) - self.assertLess(max_abs_diff, 1e-4) + self.assertLess(max_abs_diff, 1e-3) def test_get_config(self): opt = Novograd(lr=1e-4, weight_decay=0.0, grad_averaging=False) From 2a386c052ed5a5653d1fdd3d5f21aa640e9f6735 Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Sun, 5 Jan 2020 19:51:29 +0000 Subject: [PATCH 23/62] More epochs --- .../optimizers/stochastic_weight_averaging_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_addons/optimizers/stochastic_weight_averaging_test.py b/tensorflow_addons/optimizers/stochastic_weight_averaging_test.py index adc58fd128..c1e92eb6ff 100644 --- a/tensorflow_addons/optimizers/stochastic_weight_averaging_test.py +++ b/tensorflow_addons/optimizers/stochastic_weight_averaging_test.py @@ -87,7 +87,7 @@ def test_fit_simple_linear_model(self): optimizer = SWA( 'adam', start_averaging=num_examples // 32 - 1, average_period=100) model.compile(optimizer, loss='mse') - model.fit(x, y, epochs=10) + model.fit(x, y, epochs=20) 
optimizer.assign_average_vars(model.variables) x = np.random.standard_normal((100, 3)) From a06074bf06b07264786f646df552361c63ee3687 Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Sun, 5 Jan 2020 20:01:56 +0000 Subject: [PATCH 24/62] More linear updates --- tensorflow_addons/optimizers/novograd_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index 85762c007d..47929e1be5 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -125,7 +125,7 @@ def test_fit_simple_linear_model(self): model = tf.keras.models.Sequential() model.add(tf.keras.layers.Dense(input_shape=(3,), units=1)) - model.compile(Novograd(lr=0.01, beta_1=0.9, beta_2=0.999), loss='mse') + model.compile(Novograd(lr=0.001, beta_1=0.9, beta_2=0.999), loss='mse') model.fit(x, y, epochs=10) From 78c1c47792485580881dc5d81f7284b8fca1143b Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Sun, 5 Jan 2020 20:06:34 +0000 Subject: [PATCH 25/62] More changes to linear test --- tensorflow_addons/optimizers/novograd_test.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index 47929e1be5..cdfc4854da 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -106,7 +106,7 @@ def test_sparse_sample_with_weight_decay(self): def test_dense_sample_with_grad_averaging(self): self.run_dense_sample( iterations=1, - expected=[[0.993675445, 1.8041], [2.8041, 3.8041]], + expected=[[0.993675445, 1.9873508899], [2.8041, 3.8041]], optimizer=Novograd(lr=0.1, beta_1=0.95, beta_2=0.98, epsilon=1e-8, grad_averaging=True)) def test_sparse_sample_with_grad_averaging(self): @@ -116,8 +116,8 @@ def test_sparse_sample_with_grad_averaging(self): optimizer=Novograd(lr=1e-3, grad_averaging=True)) def test_fit_simple_linear_model(self): - np.random.seed(0x2019) - tf.random.set_seed(0x2019) + np.random.seed(0x2020) + tf.random.set_seed(0x2020) x = np.random.standard_normal((100000, 3)) w = np.random.standard_normal((3, 1)) @@ -127,7 +127,7 @@ def test_fit_simple_linear_model(self): model.add(tf.keras.layers.Dense(input_shape=(3,), units=1)) model.compile(Novograd(lr=0.001, beta_1=0.9, beta_2=0.999), loss='mse') - model.fit(x, y, epochs=10) + model.fit(x, y, epochs=20) x = np.random.standard_normal((100, 3)) y = np.dot(x, w) From 2749a96faf38e37963a8faee6c01f8de8ee77b0c Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Sun, 5 Jan 2020 20:08:09 +0000 Subject: [PATCH 26/62] Update another dense test --- tensorflow_addons/optimizers/novograd_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index cdfc4854da..442766b3fc 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -106,7 +106,7 @@ def test_sparse_sample_with_weight_decay(self): def test_dense_sample_with_grad_averaging(self): self.run_dense_sample( iterations=1, - expected=[[0.993675445, 1.9873508899], [2.8041, 3.8041]], + expected=[[0.993675445, 1.9873508899], [2.9915147188, 3.9886862917]], optimizer=Novograd(lr=0.1, beta_1=0.95, beta_2=0.98, epsilon=1e-8, grad_averaging=True)) def test_sparse_sample_with_grad_averaging(self): From da62d39239422749ae8e6945d58fe87fefdd432a Mon Sep 17 
00:00:00 2001 From: shreyashpatodia Date: Sun, 5 Jan 2020 20:22:49 +0000 Subject: [PATCH 27/62] Tests --- tensorflow_addons/optimizers/novograd_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index 442766b3fc..cd6109e0af 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -127,14 +127,14 @@ def test_fit_simple_linear_model(self): model.add(tf.keras.layers.Dense(input_shape=(3,), units=1)) model.compile(Novograd(lr=0.001, beta_1=0.9, beta_2=0.999), loss='mse') - model.fit(x, y, epochs=20) + model.fit(x, y, epochs=3) x = np.random.standard_normal((100, 3)) y = np.dot(x, w) predicted = model.predict(x) max_abs_diff = np.max(np.abs(predicted - y)) - self.assertLess(max_abs_diff, 1e-3) + self.assertLess(max_abs_diff, 1e-2) def test_get_config(self): opt = Novograd(lr=1e-4, weight_decay=0.0, grad_averaging=False) From 6020c2cbc3e0734e6691ff2d7b751fe512348966 Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Sun, 5 Jan 2020 20:29:03 +0000 Subject: [PATCH 28/62] Revert change to swa_test --- .../optimizers/stochastic_weight_averaging_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_addons/optimizers/stochastic_weight_averaging_test.py b/tensorflow_addons/optimizers/stochastic_weight_averaging_test.py index c1e92eb6ff..adc58fd128 100644 --- a/tensorflow_addons/optimizers/stochastic_weight_averaging_test.py +++ b/tensorflow_addons/optimizers/stochastic_weight_averaging_test.py @@ -87,7 +87,7 @@ def test_fit_simple_linear_model(self): optimizer = SWA( 'adam', start_averaging=num_examples // 32 - 1, average_period=100) model.compile(optimizer, loss='mse') - model.fit(x, y, epochs=20) + model.fit(x, y, epochs=10) optimizer.assign_average_vars(model.variables) x = np.random.standard_normal((100, 3)) From 6cbb60552cb71e06139aa488931609d8efa3815f Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Sun, 5 Jan 2020 20:52:56 +0000 Subject: [PATCH 29/62] Possibly fix all tests --- tensorflow_addons/optimizers/novograd_test.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index cd6109e0af..eddba64f1c 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -82,8 +82,8 @@ def test_dense_sample(self): def test_sparse_sample(self): self.run_sparse_sample( iterations=1, - expected=[[0.8706804722, 1.7470177985], [2.8302943759, 3.7737258345]], - optimizer=Novograd(lr=0.1), + expected=[[0.8735088993, 1.7470177985], [2.8302943759, 3.7737258345]], + optimizer=Novograd(lr=0.1, beta_2=0.98, beta_1=0.95, epsilon=1e-8), ) def test_dense_sample_with_weight_decay(self): @@ -99,8 +99,8 @@ def test_sparse_sample_with_weight_decay(self): # Dense results should be: [-0.2029, 0.7768], [1.7578, 2.7380] self.run_sparse_sample( iterations=1, - expected=[[-0.2029, 2.0], [3.0, 2.7380]], - optimizer=Novograd(lr=1e-3, weight_decay=0.01), + expected=[[0.8706804722, 1.7413609443], [2.8218090945, 3.762412126]], + optimizer=Novograd(lr=0.1, beta_1=0.95, beta_2=0.98, weight_decay=0.01, epsilon=1e-8), ) def test_dense_sample_with_grad_averaging(self): @@ -111,9 +111,9 @@ def test_dense_sample_with_grad_averaging(self): def test_sparse_sample_with_grad_averaging(self): self.run_sparse_sample( - iterations=2, - expected=[[0.4653, 2.0], 
[3.0, 3.4653]], - optimizer=Novograd(lr=1e-3, grad_averaging=True)) + iterations=1, + expected=[[0.993675445, 1.9873508899], [2.9915147188, 3.9886862917]], + optimizer=Novograd(lr=0.1, beta_1=0.95, beta_2=0.98, epsilon=1e-8, grad_averaging=True)) def test_fit_simple_linear_model(self): np.random.seed(0x2020) From 1652150be49ed3457b1bab7055413ad64d3cf497 Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Sun, 5 Jan 2020 21:27:58 +0000 Subject: [PATCH 30/62] Documentation and cleanup --- tensorflow_addons/optimizers/novograd.py | 73 ++++++++++++++++++++++-- 1 file changed, 67 insertions(+), 6 deletions(-) diff --git a/tensorflow_addons/optimizers/novograd.py b/tensorflow_addons/optimizers/novograd.py index 40c33e8c5a..3b466060cd 100644 --- a/tensorflow_addons/optimizers/novograd.py +++ b/tensorflow_addons/optimizers/novograd.py @@ -23,6 +23,44 @@ @tf.keras.utils.register_keras_serializable(package='Addons') class Novograd(tf.keras.optimizers.Optimizer): + """ + The Novograd Optimizer was first proposed in [Stochastic Gradient Methods with + Layerwise Adaptvie Moments for training of Deep Networks](https://arxiv.org/pdf/1905.11286.pdf) + + NovoGrad is a first-order SGD-based algorithm, which computes second moments per layer + instead of per weight as in Adam. Compared to Adam, NovoGrad takes less memory, + and has been found to be more numerically stable. More specifically we compute (for more information + on the computation please refer to this [link](https://nvidia.github.io/OpenSeq2Seq/html/optimizers.html): + + ``` + Second order moment = exponential moving average of Layer-wise square of grads: + v_t <-- beta_2 * v_{t-1} + (1-beta_2) * (g_t)^2 + First order moment in one of four modes: + 1. moment of grads normalized by v_t: + m_t <- beta_1 * m_{t-1} + [ g_t / (sqrt(v_t)+epsilon)] + 2. moment similar to Adam: exponential moving average of grads normalized by v_t + (set grad_averaging = True to use this): + m_t <- beta_1 * m_{t-1} + [(1 - beta_1) * (g_t / (sqrt(v_t) + epsilon))] + 3. weight decay adds a w_d term after grads are rescaled by 1/sqrt(v_t) + (set weight_decay > 0 to use this0: + m_t <- beta1*m_{t-1} + [(g_t / (sqrt(v_t) + epsilon)) + (w_d * w_{t-1})] + 4. weight decay + exponential moving average from Adam: + m_t <- beta_1 * m_{t-1} + [(1 - beta_1) * ((g_t / (sqrt(v_t + epsilon)) + (w_d * w_{t-1}))] + Weight update: + w_t <- w_{t-1} - lr_t * m_t + ``` + + Example of usage: + ```python + opt = tfa.optimizers.Novograd( + lr=1e-3, + beta_1=0.9, + beta_2=0.999, + weight_decay=0.001, + grad_averaging=False + ) + ``` + """ def __init__(self, learning_rate=0.1, beta_1=0.95, @@ -32,7 +70,30 @@ def __init__(self, grad_averaging=False, name='Novograd', **kwargs): + r"""Construct a new RAdam optimizer. + + Args: + learning_rate: A `Tensor` or a floating point value. or a schedule + that is a `tf.keras.optimizers.schedules.LearningRateSchedule` + The learning rate. + beta_1: A float value or a constant float tensor. + The exponential decay rate for the 1st moment estimates. + beta_2: A float value or a constant float tensor. + The exponential decay rate for the 2nd moment estimates. + epsilon: A small constant for numerical stability. + weight_decay: A floating point value. Weight decay for each param. + grad_averaging: determines whether to use Adam style exponential moving + averaging for the first order moments. + **kwargs: keyword arguments. Allowed to be {`clipnorm`, + `clipvalue`, `lr`, `decay`}. 
`clipnorm` is clip gradients + by norm; `clipvalue` is clip gradients by value, `decay` is + included for backward compatibility to allow time inverse + decay of learning rate. `lr` is included for backward + compatibility, recommended to use `learning_rate` instead. + """ super(Novograd, self).__init__(name, **kwargs) + if weight_decay < 0.0: + raise ValueError('Weight decay rate cannot be negative') self._set_hyper('learning_rate', kwargs.get('lr', learning_rate)) self._set_hyper('decay', self._initial_decay) self._set_hyper('beta_1', beta_1) @@ -97,12 +158,12 @@ def _resource_apply_dense(self, grad, var, apply_state=None): v_t = v.assign(v_t, use_locking=self._use_locking) grad = grad / (tf.sqrt(v_t) + self.epsilon) - grad = tf.cond(grad_averaging, - lambda: grad * coefficients['one_minus_beta_1_t'], - lambda: grad) grad = tf.cond(tf.greater(weight_decay, 0), lambda: grad + weight_decay * var, lambda: grad) + grad = tf.cond(grad_averaging, + lambda: grad * coefficients['one_minus_beta_1_t'], + lambda: grad) m = self.get_slot(var, 'm') return training_ops.resource_apply_momentum( var.handle, @@ -129,12 +190,12 @@ def _resource_apply_sparse(self, grad, var, indices, apply_state=None): v_t = v.assign(v_t, use_locking=self._use_locking) grad = grad / (tf.sqrt(v_t) + self.epsilon) - grad = tf.cond(grad_averaging, - lambda: grad * coefficients['one_minus_beta_1_t'], - lambda: grad) grad = tf.cond(tf.greater(weight_decay, 0), lambda: grad + weight_decay * var, lambda: grad) + grad = tf.cond(grad_averaging, + lambda: grad * coefficients['one_minus_beta_1_t'], + lambda: grad) m = self.get_slot(var, 'm') return training_ops.resource_sparse_apply_momentum( var.handle, From e7b15e1e4bd99ba42bc4165526ab34ce7886da93 Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Sun, 5 Jan 2020 22:11:16 +0000 Subject: [PATCH 31/62] Attempt to reduce tolerance for linear test --- tensorflow_addons/optimizers/novograd_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index eddba64f1c..3e443db6d6 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -125,16 +125,16 @@ def test_fit_simple_linear_model(self): model = tf.keras.models.Sequential() model.add(tf.keras.layers.Dense(input_shape=(3,), units=1)) - model.compile(Novograd(lr=0.001, beta_1=0.9, beta_2=0.999), loss='mse') + model.compile(Novograd(lr=0.01, beta_1=0.9, beta_2=0.999), loss='mse') - model.fit(x, y, epochs=3) + model.fit(x, y, epochs=5) x = np.random.standard_normal((100, 3)) y = np.dot(x, w) predicted = model.predict(x) max_abs_diff = np.max(np.abs(predicted - y)) - self.assertLess(max_abs_diff, 1e-2) + self.assertLess(max_abs_diff, 2e-3) def test_get_config(self): opt = Novograd(lr=1e-4, weight_decay=0.0, grad_averaging=False) From 1af1b0b077fec30af8f68931c590fd82dece9acd Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Sun, 5 Jan 2020 22:19:17 +0000 Subject: [PATCH 32/62] Reduce even further --- tensorflow_addons/optimizers/novograd_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index 3e443db6d6..d76e50f2e8 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -134,7 +134,7 @@ def test_fit_simple_linear_model(self): predicted = model.predict(x) max_abs_diff = np.max(np.abs(predicted - 
y)) - self.assertLess(max_abs_diff, 2e-3) + self.assertLess(max_abs_diff, 1e-3) def test_get_config(self): opt = Novograd(lr=1e-4, weight_decay=0.0, grad_averaging=False) From 5503694d42841fcd087d750657f6a5d331b470e0 Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Sun, 5 Jan 2020 22:29:14 +0000 Subject: [PATCH 33/62] Even further --- tensorflow_addons/optimizers/novograd_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index d76e50f2e8..c9d7a04e16 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -134,7 +134,7 @@ def test_fit_simple_linear_model(self): predicted = model.predict(x) max_abs_diff = np.max(np.abs(predicted - y)) - self.assertLess(max_abs_diff, 1e-3) + self.assertLess(max_abs_diff, 2e-4) def test_get_config(self): opt = Novograd(lr=1e-4, weight_decay=0.0, grad_averaging=False) From c3478548f7d0dd3ae8286efe6dcfa40fe8770ac5 Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Sun, 5 Jan 2020 22:36:49 +0000 Subject: [PATCH 34/62] Pushed as far as possible --- tensorflow_addons/optimizers/novograd_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index c9d7a04e16..b8ae04d66f 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -134,7 +134,7 @@ def test_fit_simple_linear_model(self): predicted = model.predict(x) max_abs_diff = np.max(np.abs(predicted - y)) - self.assertLess(max_abs_diff, 2e-4) + self.assertLess(max_abs_diff, 2.5e-4) def test_get_config(self): opt = Novograd(lr=1e-4, weight_decay=0.0, grad_averaging=False) From b31b44008b8575917209695ae8e4442c337b84d0 Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Mon, 6 Jan 2020 00:03:41 +0000 Subject: [PATCH 35/62] Pylint and sanity check --- tensorflow_addons/optimizers/novograd.py | 78 ++++++++++--------- tensorflow_addons/optimizers/novograd_test.py | 46 ++++++++--- 2 files changed, 79 insertions(+), 45 deletions(-) diff --git a/tensorflow_addons/optimizers/novograd.py b/tensorflow_addons/optimizers/novograd.py index 3b466060cd..0c4e0933fb 100644 --- a/tensorflow_addons/optimizers/novograd.py +++ b/tensorflow_addons/optimizers/novograd.py @@ -23,29 +23,36 @@ @tf.keras.utils.register_keras_serializable(package='Addons') class Novograd(tf.keras.optimizers.Optimizer): - """ - The Novograd Optimizer was first proposed in [Stochastic Gradient Methods with - Layerwise Adaptvie Moments for training of Deep Networks](https://arxiv.org/pdf/1905.11286.pdf) + """The Novograd Optimizer was first proposed in [Stochastic Gradient + Methods with Layerwise Adaptvie Moments for training of Deep + Networks](https://arxiv.org/pdf/1905.11286.pdf) - NovoGrad is a first-order SGD-based algorithm, which computes second moments per layer - instead of per weight as in Adam. Compared to Adam, NovoGrad takes less memory, - and has been found to be more numerically stable. More specifically we compute (for more information - on the computation please refer to this [link](https://nvidia.github.io/OpenSeq2Seq/html/optimizers.html): + NovoGrad is a first-order SGD-based algorithm, which computes second + moments per layer instead of per weight as in Adam. Compared to Adam, + NovoGrad takes less memory, and has been found to be more numerically + stable. 
More specifically we compute (for more information on the + computation please refer to this + [link](https://nvidia.github.io/OpenSeq2Seq/html/optimizers.html): ``` - Second order moment = exponential moving average of Layer-wise square of grads: + Second order moment = exponential moving average of Layer-wise square + of grads: v_t <-- beta_2 * v_{t-1} + (1-beta_2) * (g_t)^2 First order moment in one of four modes: 1. moment of grads normalized by v_t: m_t <- beta_1 * m_{t-1} + [ g_t / (sqrt(v_t)+epsilon)] - 2. moment similar to Adam: exponential moving average of grads normalized by v_t - (set grad_averaging = True to use this): - m_t <- beta_1 * m_{t-1} + [(1 - beta_1) * (g_t / (sqrt(v_t) + epsilon))] - 3. weight decay adds a w_d term after grads are rescaled by 1/sqrt(v_t) - (set weight_decay > 0 to use this0: - m_t <- beta1*m_{t-1} + [(g_t / (sqrt(v_t) + epsilon)) + (w_d * w_{t-1})] + 2. moment similar to Adam: exponential moving average of grads + normalized by v_t (set grad_averaging = True to use this): + m_t <- beta_1 * m_{t-1} + + [(1 - beta_1) * (g_t / (sqrt(v_t) + epsilon))] + 3. weight decay adds a w_d term after grads are rescaled by + 1/sqrt(v_t) (set weight_decay > 0 to use this0: + m_t <- beta_1 * m_{t-1} + + [(g_t / (sqrt(v_t) + epsilon)) + (w_d * w_{t-1})] 4. weight decay + exponential moving average from Adam: - m_t <- beta_1 * m_{t-1} + [(1 - beta_1) * ((g_t / (sqrt(v_t + epsilon)) + (w_d * w_{t-1}))] + m_t <- beta_1 * m_{t-1} + + [(1 - beta_1) * ((g_t / (sqrt(v_t + epsilon)) + + (w_d * w_{t-1}))] Weight update: w_t <- w_{t-1} - lr_t * m_t ``` @@ -61,6 +68,7 @@ class Novograd(tf.keras.optimizers.Optimizer): ) ``` """ + def __init__(self, learning_rate=0.1, beta_1=0.95, @@ -82,8 +90,8 @@ def __init__(self, The exponential decay rate for the 2nd moment estimates. epsilon: A small constant for numerical stability. weight_decay: A floating point value. Weight decay for each param. - grad_averaging: determines whether to use Adam style exponential moving - averaging for the first order moments. + grad_averaging: determines whether to use Adam style exponential + moving averaging for the first order moments. **kwargs: keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`, `decay`}. 
`clipnorm` is clip gradients by norm; `clipvalue` is clip gradients by value, `decay` is @@ -152,18 +160,18 @@ def _resource_apply_dense(self, grad, var, apply_state=None): v = self.get_slot(var, 'v') g_2 = tf.reduce_sum(tf.square(tf.cast(grad, tf.float32))) - v_t = tf.cond(tf.equal(self.iterations, 0), - lambda: g_2, - lambda: v * coefficients['beta_2_t'] + g_2 * coefficients['one_minus_beta_2_t']) + v_t = tf.cond( + tf.equal(self.iterations, + 0), lambda: g_2, lambda: v * coefficients['beta_2_t'] + + g_2 * coefficients['one_minus_beta_2_t']) v_t = v.assign(v_t, use_locking=self._use_locking) grad = grad / (tf.sqrt(v_t) + self.epsilon) - grad = tf.cond(tf.greater(weight_decay, 0), - lambda: grad + weight_decay * var, - lambda: grad) - grad = tf.cond(grad_averaging, - lambda: grad * coefficients['one_minus_beta_1_t'], - lambda: grad) + grad = tf.cond( + tf.greater(weight_decay, + 0), lambda: grad + weight_decay * var, lambda: grad) + grad = tf.cond(grad_averaging, lambda: grad * coefficients[ + 'one_minus_beta_1_t'], lambda: grad) m = self.get_slot(var, 'm') return training_ops.resource_apply_momentum( var.handle, @@ -184,18 +192,18 @@ def _resource_apply_sparse(self, grad, var, indices, apply_state=None): v = self.get_slot(var, 'v') g_2 = tf.reduce_sum(tf.square(tf.cast(grad, tf.float32))) # v is just a scalar and does not need to involve sparse tensors. - v_t = tf.cond(tf.equal(self.iterations, 0), - lambda: g_2, - lambda: v * coefficients['beta_2_t'] + g_2 * coefficients['one_minus_beta_2_t']) + v_t = tf.cond( + tf.equal(self.iterations, + 0), lambda: g_2, lambda: v * coefficients['beta_2_t'] + + g_2 * coefficients['one_minus_beta_2_t']) v_t = v.assign(v_t, use_locking=self._use_locking) grad = grad / (tf.sqrt(v_t) + self.epsilon) - grad = tf.cond(tf.greater(weight_decay, 0), - lambda: grad + weight_decay * var, - lambda: grad) - grad = tf.cond(grad_averaging, - lambda: grad * coefficients['one_minus_beta_1_t'], - lambda: grad) + grad = tf.cond( + tf.greater(weight_decay, + 0), lambda: grad + weight_decay * var, lambda: grad) + grad = tf.cond(grad_averaging, lambda: grad * coefficients[ + 'one_minus_beta_1_t'], lambda: grad) m = self.get_slot(var, 'm') return training_ops.resource_sparse_apply_momentum( var.handle, diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index b8ae04d66f..323ce2cb47 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -75,14 +75,16 @@ def test_dense_sample(self): self.run_dense_sample( iterations=1, - expected=[[0.8735088993, 1.7470177985], [2.8302943759, 3.7737258345]], + expected=[[0.8735088993, 1.7470177985], + [2.8302943759, 3.7737258345]], optimizer=Novograd(lr=0.1, beta_2=0.98, beta_1=0.95, epsilon=1e-8), ) def test_sparse_sample(self): self.run_sparse_sample( iterations=1, - expected=[[0.8735088993, 1.7470177985], [2.8302943759, 3.7737258345]], + expected=[[0.8735088993, 1.7470177985], + [2.8302943759, 3.7737258345]], optimizer=Novograd(lr=0.1, beta_2=0.98, beta_1=0.95, epsilon=1e-8), ) @@ -90,8 +92,14 @@ def test_dense_sample_with_weight_decay(self): # Expected values are obtained from the official implementation self.run_dense_sample( iterations=1, - expected=[[0.8706804722, 1.7413609443], [2.8218090945, 3.762412126]], - optimizer=Novograd(lr=0.1, beta_1=0.95, beta_2=0.98, weight_decay=0.01, epsilon=1e-8), + expected=[[0.8706804722, 1.7413609443], + [2.8218090945, 3.762412126]], + optimizer=Novograd( + lr=0.1, + beta_1=0.95, + 
beta_2=0.98, + weight_decay=0.01, + epsilon=1e-8), ) def test_sparse_sample_with_weight_decay(self): @@ -99,21 +107,39 @@ def test_sparse_sample_with_weight_decay(self): # Dense results should be: [-0.2029, 0.7768], [1.7578, 2.7380] self.run_sparse_sample( iterations=1, - expected=[[0.8706804722, 1.7413609443], [2.8218090945, 3.762412126]], - optimizer=Novograd(lr=0.1, beta_1=0.95, beta_2=0.98, weight_decay=0.01, epsilon=1e-8), + expected=[[0.8706804722, 1.7413609443], + [2.8218090945, 3.762412126]], + optimizer=Novograd( + lr=0.1, + beta_1=0.95, + beta_2=0.98, + weight_decay=0.01, + epsilon=1e-8), ) def test_dense_sample_with_grad_averaging(self): self.run_dense_sample( iterations=1, - expected=[[0.993675445, 1.9873508899], [2.9915147188, 3.9886862917]], - optimizer=Novograd(lr=0.1, beta_1=0.95, beta_2=0.98, epsilon=1e-8, grad_averaging=True)) + expected=[[0.993675445, 1.9873508899], + [2.9915147188, 3.9886862917]], + optimizer=Novograd( + lr=0.1, + beta_1=0.95, + beta_2=0.98, + epsilon=1e-8, + grad_averaging=True)) def test_sparse_sample_with_grad_averaging(self): self.run_sparse_sample( iterations=1, - expected=[[0.993675445, 1.9873508899], [2.9915147188, 3.9886862917]], - optimizer=Novograd(lr=0.1, beta_1=0.95, beta_2=0.98, epsilon=1e-8, grad_averaging=True)) + expected=[[0.993675445, 1.9873508899], + [2.9915147188, 3.9886862917]], + optimizer=Novograd( + lr=0.1, + beta_1=0.95, + beta_2=0.98, + epsilon=1e-8, + grad_averaging=True)) def test_fit_simple_linear_model(self): np.random.seed(0x2020) From 4cc524aef08209c2a8f4191a48d74063076a5e1b Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Mon, 6 Jan 2020 00:42:31 +0000 Subject: [PATCH 36/62] More epochs and change beta_1 and beta_2 --- tensorflow_addons/optimizers/novograd_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index 323ce2cb47..7d5e6a6399 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -151,9 +151,9 @@ def test_fit_simple_linear_model(self): model = tf.keras.models.Sequential() model.add(tf.keras.layers.Dense(input_shape=(3,), units=1)) - model.compile(Novograd(lr=0.01, beta_1=0.9, beta_2=0.999), loss='mse') + model.compile(Novograd(lr=0.01, beta_1=0.95, beta_2=0.98), loss='mse') - model.fit(x, y, epochs=5) + model.fit(x, y, epochs=10) x = np.random.standard_normal((100, 3)) y = np.dot(x, w) From 23c85948ec7fd5a8cb333e397c90c0904bb94dad Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Mon, 6 Jan 2020 00:52:32 +0000 Subject: [PATCH 37/62] More epochs and change beta_1 and beta_2 --- tensorflow_addons/optimizers/novograd_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index 7d5e6a6399..175adc6832 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -151,7 +151,7 @@ def test_fit_simple_linear_model(self): model = tf.keras.models.Sequential() model.add(tf.keras.layers.Dense(input_shape=(3,), units=1)) - model.compile(Novograd(lr=0.01, beta_1=0.95, beta_2=0.98), loss='mse') + model.compile(Novograd(lr=0.01, beta_1=0.9, beta_2=0.98), loss='mse', grad_averaging=True) model.fit(x, y, epochs=10) From 743e829f0a566bcc1f2bcb2a9f3ff4760f96be98 Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Mon, 6 Jan 2020 00:52:47 +0000 Subject: [PATCH 38/62] More epochs and 
change beta_1 and beta_2 --- tensorflow_addons/optimizers/novograd_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index 175adc6832..c4ef9dfe6a 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -151,7 +151,7 @@ def test_fit_simple_linear_model(self): model = tf.keras.models.Sequential() model.add(tf.keras.layers.Dense(input_shape=(3,), units=1)) - model.compile(Novograd(lr=0.01, beta_1=0.9, beta_2=0.98), loss='mse', grad_averaging=True) + model.compile(Novograd(lr=0.1, beta_1=0.9, beta_2=0.98), loss='mse', grad_averaging=True) model.fit(x, y, epochs=10) From 372e814cfc17d21045c17eb5576720aabf4b6734 Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Mon, 6 Jan 2020 00:59:31 +0000 Subject: [PATCH 39/62] Fix typo --- tensorflow_addons/optimizers/novograd_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index c4ef9dfe6a..50caf9810d 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -151,7 +151,7 @@ def test_fit_simple_linear_model(self): model = tf.keras.models.Sequential() model.add(tf.keras.layers.Dense(input_shape=(3,), units=1)) - model.compile(Novograd(lr=0.1, beta_1=0.9, beta_2=0.98), loss='mse', grad_averaging=True) + model.compile(Novograd(lr=0.1, beta_1=0.9, beta_2=0.98, grad_averaging=True), loss='mse') model.fit(x, y, epochs=10) From a03c56ec5ada47da920a1c022294055bb2d4a0bb Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Mon, 6 Jan 2020 01:09:10 +0000 Subject: [PATCH 40/62] Make current values more important --- tensorflow_addons/optimizers/novograd_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index 50caf9810d..9aceed3050 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -151,9 +151,9 @@ def test_fit_simple_linear_model(self): model = tf.keras.models.Sequential() model.add(tf.keras.layers.Dense(input_shape=(3,), units=1)) - model.compile(Novograd(lr=0.1, beta_1=0.9, beta_2=0.98, grad_averaging=True), loss='mse') + model.compile(Novograd(lr=0.01, beta_1=0.50, beta_2=0.50), loss='mse') - model.fit(x, y, epochs=10) + model.fit(x, y, epochs=5) x = np.random.standard_normal((100, 3)) y = np.dot(x, w) From e7e5f01bcb1e34e1e0306b867c94615fac4a5016 Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Mon, 6 Jan 2020 01:14:46 +0000 Subject: [PATCH 41/62] Make current values more important --- tensorflow_addons/optimizers/novograd_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index 9aceed3050..165ac1e047 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -151,7 +151,7 @@ def test_fit_simple_linear_model(self): model = tf.keras.models.Sequential() model.add(tf.keras.layers.Dense(input_shape=(3,), units=1)) - model.compile(Novograd(lr=0.01, beta_1=0.50, beta_2=0.50), loss='mse') + model.compile(Novograd(lr=0.1, beta_1=0.10, beta_2=0.50), loss='mse') model.fit(x, y, epochs=5) From 86b6edf48d90a7fea1d0e11ce42f63bceb3bbfb0 Mon Sep 17 00:00:00 2001 From: 
shreyashpatodia Date: Mon, 6 Jan 2020 01:25:32 +0000 Subject: [PATCH 42/62] Increase threshold --- tensorflow_addons/optimizers/novograd.py | 6 +++--- tensorflow_addons/optimizers/novograd_test.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tensorflow_addons/optimizers/novograd.py b/tensorflow_addons/optimizers/novograd.py index 0c4e0933fb..2c33ecb047 100644 --- a/tensorflow_addons/optimizers/novograd.py +++ b/tensorflow_addons/optimizers/novograd.py @@ -70,9 +70,9 @@ class Novograd(tf.keras.optimizers.Optimizer): """ def __init__(self, - learning_rate=0.1, - beta_1=0.95, - beta_2=0.98, + learning_rate=0.001, + beta_1=0.9, + beta_2=0.999, epsilon=1e-8, weight_decay=0.0, grad_averaging=False, diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index 165ac1e047..9bd9390156 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -147,20 +147,20 @@ def test_fit_simple_linear_model(self): x = np.random.standard_normal((100000, 3)) w = np.random.standard_normal((3, 1)) - y = np.dot(x, w) + np.random.standard_normal((100000, 1)) * 1e-4 + y = np.dot(x, w) + np.random.standard_normal((100000, 1)) * 1e-6 model = tf.keras.models.Sequential() model.add(tf.keras.layers.Dense(input_shape=(3,), units=1)) - model.compile(Novograd(lr=0.1, beta_1=0.10, beta_2=0.50), loss='mse') + model.compile(Novograd(), loss='mse') - model.fit(x, y, epochs=5) + model.fit(x, y, epochs=10) x = np.random.standard_normal((100, 3)) y = np.dot(x, w) predicted = model.predict(x) max_abs_diff = np.max(np.abs(predicted - y)) - self.assertLess(max_abs_diff, 2.5e-4) + self.assertLess(max_abs_diff, 1e-2) def test_get_config(self): opt = Novograd(lr=1e-4, weight_decay=0.0, grad_averaging=False) From 97aa225983611632362c2301bd5ae5f805b8c8bf Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Mon, 6 Jan 2020 01:39:07 +0000 Subject: [PATCH 43/62] Remove learning rate --- tensorflow_addons/optimizers/novograd.py | 10 ++-------- tensorflow_addons/optimizers/novograd_test.py | 10 +++++----- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/tensorflow_addons/optimizers/novograd.py b/tensorflow_addons/optimizers/novograd.py index 2c33ecb047..5dcd96c2f9 100644 --- a/tensorflow_addons/optimizers/novograd.py +++ b/tensorflow_addons/optimizers/novograd.py @@ -124,16 +124,10 @@ def _create_slots(self, var_list): def _prepare_local(self, var_device, var_dtype, apply_state): super(Novograd, self)._prepare_local(var_device, var_dtype, apply_state) - local_step = tf.cast(self.iterations + 1, var_dtype) beta_1_t = tf.identity(self._get_hyper('beta_1', var_dtype)) beta_2_t = tf.identity(self._get_hyper('beta_2', var_dtype)) - beta_1_power = tf.pow(beta_1_t, local_step) - beta_2_power = tf.pow(beta_2_t, local_step) - lr = (apply_state[(var_device, var_dtype)]['lr_t'] * - (tf.sqrt(1 - beta_2_power) / (1 - beta_1_power))) apply_state[(var_device, var_dtype)].update( dict( - lr=lr, epsilon=tf.convert_to_tensor(self.epsilon, var_dtype), beta_1_t=beta_1_t, beta_2_t=beta_2_t, @@ -176,7 +170,7 @@ def _resource_apply_dense(self, grad, var, apply_state=None): return training_ops.resource_apply_momentum( var.handle, m.handle, - coefficients['lr'], + coefficients['lr_t'], grad, coefficients['beta_1_t'], use_locking=self._use_locking, @@ -208,7 +202,7 @@ def _resource_apply_sparse(self, grad, var, indices, apply_state=None): return training_ops.resource_sparse_apply_momentum( var.handle, m.handle, - 
coefficients['lr'], + coefficients['lr_t'], tf.gather(grad, indices), indices, coefficients['beta_1_t'], diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index 9bd9390156..015e859946 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -75,16 +75,16 @@ def test_dense_sample(self): self.run_dense_sample( iterations=1, - expected=[[0.8735088993, 1.7470177985], - [2.8302943759, 3.7737258345]], - optimizer=Novograd(lr=0.1, beta_2=0.98, beta_1=0.95, epsilon=1e-8), + expected=[[0.9552786425, 1.9105572849], + [2.9400000012, 3.9200000016]], + optimizer=Novograd(lr=0.1, epsilon=1e-8), ) def test_sparse_sample(self): self.run_sparse_sample( iterations=1, - expected=[[0.8735088993, 1.7470177985], - [2.8302943759, 3.7737258345]], + expected=[[0.9552786425, 1.9105572849], + [2.9400000012, 3.9200000016]], optimizer=Novograd(lr=0.1, beta_2=0.98, beta_1=0.95, epsilon=1e-8), ) From 8634520943612445023b47961b01c63a88be54cb Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Mon, 6 Jan 2020 01:45:04 +0000 Subject: [PATCH 44/62] Update tests --- tensorflow_addons/optimizers/novograd_test.py | 20 ++++++------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index 015e859946..518c31de3a 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -85,20 +85,18 @@ def test_sparse_sample(self): iterations=1, expected=[[0.9552786425, 1.9105572849], [2.9400000012, 3.9200000016]], - optimizer=Novograd(lr=0.1, beta_2=0.98, beta_1=0.95, epsilon=1e-8), + optimizer=Novograd(lr=0.1, epsilon=1e-8), ) def test_dense_sample_with_weight_decay(self): # Expected values are obtained from the official implementation self.run_dense_sample( iterations=1, - expected=[[0.8706804722, 1.7413609443], - [2.8218090945, 3.762412126]], + expected=[[0.945278642, 1.8905572849], + [2.9100000012, 3.8800000016]], optimizer=Novograd( lr=0.1, - beta_1=0.95, - beta_2=0.98, - weight_decay=0.01, + weight_decay=0.1, epsilon=1e-8), ) @@ -107,12 +105,10 @@ def test_sparse_sample_with_weight_decay(self): # Dense results should be: [-0.2029, 0.7768], [1.7578, 2.7380] self.run_sparse_sample( iterations=1, - expected=[[0.8706804722, 1.7413609443], - [2.8218090945, 3.762412126]], + expected=[[0.945278642, 1.8905572849], + [2.9100000012, 3.8800000016]], optimizer=Novograd( lr=0.1, - beta_1=0.95, - beta_2=0.98, weight_decay=0.01, epsilon=1e-8), ) @@ -124,8 +120,6 @@ def test_dense_sample_with_grad_averaging(self): [2.9915147188, 3.9886862917]], optimizer=Novograd( lr=0.1, - beta_1=0.95, - beta_2=0.98, epsilon=1e-8, grad_averaging=True)) @@ -136,8 +130,6 @@ def test_sparse_sample_with_grad_averaging(self): [2.9915147188, 3.9886862917]], optimizer=Novograd( lr=0.1, - beta_1=0.95, - beta_2=0.98, epsilon=1e-8, grad_averaging=True)) From 0c2430134984c769c9364214bd717f6feeb8c8dd Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Mon, 6 Jan 2020 01:48:38 +0000 Subject: [PATCH 45/62] Update other tests --- tensorflow_addons/optimizers/novograd_test.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index 518c31de3a..e60dec2e3b 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -116,8 +116,8 @@ def 
test_sparse_sample_with_weight_decay(self): def test_dense_sample_with_grad_averaging(self): self.run_dense_sample( iterations=1, - expected=[[0.993675445, 1.9873508899], - [2.9915147188, 3.9886862917]], + expected=[[0.9955278642, 1.9910557285], + [2.9940000001, 3.9920000002]], optimizer=Novograd( lr=0.1, epsilon=1e-8, @@ -126,8 +126,8 @@ def test_dense_sample_with_grad_averaging(self): def test_sparse_sample_with_grad_averaging(self): self.run_sparse_sample( iterations=1, - expected=[[0.993675445, 1.9873508899], - [2.9915147188, 3.9886862917]], + expected=[[0.9955278642, 1.9910557285], + [2.9940000001, 3.9920000002]], optimizer=Novograd( lr=0.1, epsilon=1e-8, From 7dd3fb6fc1fe0dda797410d2ecfa5f450c65fa0b Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Mon, 6 Jan 2020 01:55:36 +0000 Subject: [PATCH 46/62] Update grad_averaging logic --- tensorflow_addons/optimizers/novograd.py | 5 +++-- tensorflow_addons/optimizers/novograd_test.py | 20 ------------------- 2 files changed, 3 insertions(+), 22 deletions(-) diff --git a/tensorflow_addons/optimizers/novograd.py b/tensorflow_addons/optimizers/novograd.py index 5dcd96c2f9..6f93a857ca 100644 --- a/tensorflow_addons/optimizers/novograd.py +++ b/tensorflow_addons/optimizers/novograd.py @@ -196,8 +196,9 @@ def _resource_apply_sparse(self, grad, var, indices, apply_state=None): grad = tf.cond( tf.greater(weight_decay, 0), lambda: grad + weight_decay * var, lambda: grad) - grad = tf.cond(grad_averaging, lambda: grad * coefficients[ - 'one_minus_beta_1_t'], lambda: grad) + grad = tf.cond(tf.logical_and(grad_averaging, tf.not_equal(self.iterations, 0)), + lambda: grad * coefficients['one_minus_beta_1_t'], + lambda: grad) m = self.get_slot(var, 'm') return training_ops.resource_sparse_apply_momentum( var.handle, diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index e60dec2e3b..00e28badc3 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -113,26 +113,6 @@ def test_sparse_sample_with_weight_decay(self): epsilon=1e-8), ) - def test_dense_sample_with_grad_averaging(self): - self.run_dense_sample( - iterations=1, - expected=[[0.9955278642, 1.9910557285], - [2.9940000001, 3.9920000002]], - optimizer=Novograd( - lr=0.1, - epsilon=1e-8, - grad_averaging=True)) - - def test_sparse_sample_with_grad_averaging(self): - self.run_sparse_sample( - iterations=1, - expected=[[0.9955278642, 1.9910557285], - [2.9940000001, 3.9920000002]], - optimizer=Novograd( - lr=0.1, - epsilon=1e-8, - grad_averaging=True)) - def test_fit_simple_linear_model(self): np.random.seed(0x2020) tf.random.set_seed(0x2020) From 10afb3ff7f9331f274bd3a80bb4dc670666c26ea Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Mon, 6 Jan 2020 01:57:44 +0000 Subject: [PATCH 47/62] Update grad_averaging logic --- tensorflow_addons/optimizers/novograd_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index 00e28badc3..24a86ebef8 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -109,7 +109,7 @@ def test_sparse_sample_with_weight_decay(self): [2.9100000012, 3.8800000016]], optimizer=Novograd( lr=0.1, - weight_decay=0.01, + weight_decay=0.1, epsilon=1e-8), ) From 6c6012f4afddb7226c84a55b274bd01220ab63a7 Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Mon, 6 Jan 2020 02:07:18 +0000 Subject: [PATCH 
48/62] Add amsgrad --- tensorflow_addons/optimizers/novograd.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/tensorflow_addons/optimizers/novograd.py b/tensorflow_addons/optimizers/novograd.py index 6f93a857ca..8e4fd73e03 100644 --- a/tensorflow_addons/optimizers/novograd.py +++ b/tensorflow_addons/optimizers/novograd.py @@ -76,6 +76,7 @@ def __init__(self, epsilon=1e-8, weight_decay=0.0, grad_averaging=False, + amsgrad=False, name='Novograd', **kwargs): r"""Construct a new RAdam optimizer. @@ -108,6 +109,7 @@ def __init__(self, self._set_hyper('beta_2', beta_2) self._set_hyper('weight_decay', weight_decay) self._set_hyper('grad_averaging', grad_averaging) + self.amsgrad = amsgrad self.epsilon = epsilon or tf.keras.backend.epsilon() def _create_slots(self, var_list): @@ -120,6 +122,9 @@ def _create_slots(self, var_list): var=var, slot_name='v', initializer=tf.zeros(shape=[], dtype=var.dtype)) + if self.amsgrad: + for var in var_list: + self.add_slot(var, 'vhat') def _prepare_local(self, var_device, var_dtype, apply_state): super(Novograd, self)._prepare_local(var_device, var_dtype, @@ -160,7 +165,12 @@ def _resource_apply_dense(self, grad, var, apply_state=None): g_2 * coefficients['one_minus_beta_2_t']) v_t = v.assign(v_t, use_locking=self._use_locking) - grad = grad / (tf.sqrt(v_t) + self.epsilon) + if self.amsgrad: + vhat = self.get_slot(var, 'vhat') + vhat_t = vhat.assign(tf.maximum(vhat, v_t), use_locking=self._use_locking) + grad = grad / (tf.sqrt(vhat_t) + self.epsilon) + else: + grad = grad / (tf.sqrt(v_t) + self.epsilon) grad = tf.cond( tf.greater(weight_decay, 0), lambda: grad + weight_decay * var, lambda: grad) @@ -192,7 +202,12 @@ def _resource_apply_sparse(self, grad, var, indices, apply_state=None): g_2 * coefficients['one_minus_beta_2_t']) v_t = v.assign(v_t, use_locking=self._use_locking) - grad = grad / (tf.sqrt(v_t) + self.epsilon) + if self.amsgrad: + vhat = self.get_slot(var, 'vhat') + vhat_t = vhat.assign(tf.maximum(vhat, v_t), use_locking=self._use_locking) + grad = grad / (tf.sqrt(vhat_t) + self.epsilon) + else: + grad = grad / (tf.sqrt(v_t) + self.epsilon) grad = tf.cond( tf.greater(weight_decay, 0), lambda: grad + weight_decay * var, lambda: grad) From d66a2e8b88ef81fa8c3f634b67a3206c16c69143 Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Mon, 6 Jan 2020 02:08:08 +0000 Subject: [PATCH 49/62] Tests update --- tensorflow_addons/optimizers/novograd_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index 24a86ebef8..0a01db35a2 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -119,7 +119,7 @@ def test_fit_simple_linear_model(self): x = np.random.standard_normal((100000, 3)) w = np.random.standard_normal((3, 1)) - y = np.dot(x, w) + np.random.standard_normal((100000, 1)) * 1e-6 + y = np.dot(x, w) + np.random.standard_normal((100000, 1)) * 1e-4 model = tf.keras.models.Sequential() model.add(tf.keras.layers.Dense(input_shape=(3,), units=1)) @@ -132,7 +132,7 @@ def test_fit_simple_linear_model(self): predicted = model.predict(x) max_abs_diff = np.max(np.abs(predicted - y)) - self.assertLess(max_abs_diff, 1e-2) + self.assertLess(max_abs_diff, 1e-3) def test_get_config(self): opt = Novograd(lr=1e-4, weight_decay=0.0, grad_averaging=False) From dfc5fc57d39190145355224373edce41afe6daa2 Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Mon, 6 
Jan 2020 02:18:05 +0000 Subject: [PATCH 50/62] Tests update --- tensorflow_addons/optimizers/novograd_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index 0a01db35a2..d084d1ecdf 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -119,7 +119,7 @@ def test_fit_simple_linear_model(self): x = np.random.standard_normal((100000, 3)) w = np.random.standard_normal((3, 1)) - y = np.dot(x, w) + np.random.standard_normal((100000, 1)) * 1e-4 + y = np.dot(x, w) + np.random.standard_normal((100000, 1)) * 1e-5 model = tf.keras.models.Sequential() model.add(tf.keras.layers.Dense(input_shape=(3,), units=1)) From f3c6b5e4df97fb4f20e37d15b5793198ffcce8a2 Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Mon, 6 Jan 2020 02:22:29 +0000 Subject: [PATCH 51/62] Tests update --- tensorflow_addons/optimizers/novograd_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index d084d1ecdf..6cc92814da 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -132,7 +132,7 @@ def test_fit_simple_linear_model(self): predicted = model.predict(x) max_abs_diff = np.max(np.abs(predicted - y)) - self.assertLess(max_abs_diff, 1e-3) + self.assertLess(max_abs_diff, 1e-2) def test_get_config(self): opt = Novograd(lr=1e-4, weight_decay=0.0, grad_averaging=False) From 43925fe587c853f310c4de72b8692136f10985ee Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Tue, 7 Jan 2020 19:43:44 +0000 Subject: [PATCH 52/62] Code format --- tensorflow_addons/optimizers/novograd.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tensorflow_addons/optimizers/novograd.py b/tensorflow_addons/optimizers/novograd.py index 8e4fd73e03..4e7f6aa6a1 100644 --- a/tensorflow_addons/optimizers/novograd.py +++ b/tensorflow_addons/optimizers/novograd.py @@ -167,7 +167,8 @@ def _resource_apply_dense(self, grad, var, apply_state=None): if self.amsgrad: vhat = self.get_slot(var, 'vhat') - vhat_t = vhat.assign(tf.maximum(vhat, v_t), use_locking=self._use_locking) + vhat_t = vhat.assign(tf.maximum(vhat, v_t), + use_locking=self._use_locking) grad = grad / (tf.sqrt(vhat_t) + self.epsilon) else: grad = grad / (tf.sqrt(v_t) + self.epsilon) @@ -204,14 +205,16 @@ def _resource_apply_sparse(self, grad, var, indices, apply_state=None): if self.amsgrad: vhat = self.get_slot(var, 'vhat') - vhat_t = vhat.assign(tf.maximum(vhat, v_t), use_locking=self._use_locking) + vhat_t = vhat.assign(tf.maximum(vhat, v_t), + use_locking=self._use_locking) grad = grad / (tf.sqrt(vhat_t) + self.epsilon) else: grad = grad / (tf.sqrt(v_t) + self.epsilon) grad = tf.cond( tf.greater(weight_decay, 0), lambda: grad + weight_decay * var, lambda: grad) - grad = tf.cond(tf.logical_and(grad_averaging, tf.not_equal(self.iterations, 0)), + grad = tf.cond(tf.logical_and(grad_averaging, + tf.not_equal(self.iterations, 0)), lambda: grad * coefficients['one_minus_beta_1_t'], lambda: grad) m = self.get_slot(var, 'm') From 5c85a8fa02199ee5d91a93b1bead5384fd5124e2 Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Tue, 7 Jan 2020 19:47:44 +0000 Subject: [PATCH 53/62] Code format --- tensorflow_addons/optimizers/novograd.py | 15 +++++++-------- tensorflow_addons/optimizers/novograd_test.py | 10 ++-------- 2 files changed, 9 
insertions(+), 16 deletions(-) diff --git a/tensorflow_addons/optimizers/novograd.py b/tensorflow_addons/optimizers/novograd.py index 4e7f6aa6a1..27f4b22bfc 100644 --- a/tensorflow_addons/optimizers/novograd.py +++ b/tensorflow_addons/optimizers/novograd.py @@ -167,8 +167,8 @@ def _resource_apply_dense(self, grad, var, apply_state=None): if self.amsgrad: vhat = self.get_slot(var, 'vhat') - vhat_t = vhat.assign(tf.maximum(vhat, v_t), - use_locking=self._use_locking) + vhat_t = vhat.assign( + tf.maximum(vhat, v_t), use_locking=self._use_locking) grad = grad / (tf.sqrt(vhat_t) + self.epsilon) else: grad = grad / (tf.sqrt(v_t) + self.epsilon) @@ -205,18 +205,17 @@ def _resource_apply_sparse(self, grad, var, indices, apply_state=None): if self.amsgrad: vhat = self.get_slot(var, 'vhat') - vhat_t = vhat.assign(tf.maximum(vhat, v_t), - use_locking=self._use_locking) + vhat_t = vhat.assign( + tf.maximum(vhat, v_t), use_locking=self._use_locking) grad = grad / (tf.sqrt(vhat_t) + self.epsilon) else: grad = grad / (tf.sqrt(v_t) + self.epsilon) grad = tf.cond( tf.greater(weight_decay, 0), lambda: grad + weight_decay * var, lambda: grad) - grad = tf.cond(tf.logical_and(grad_averaging, - tf.not_equal(self.iterations, 0)), - lambda: grad * coefficients['one_minus_beta_1_t'], - lambda: grad) + grad = tf.cond( + tf.logical_and(grad_averaging, tf.not_equal(self.iterations, 0)), + lambda: grad * coefficients['one_minus_beta_1_t'], lambda: grad) m = self.get_slot(var, 'm') return training_ops.resource_sparse_apply_momentum( var.handle, diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index 6cc92814da..0d86ab5409 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -94,10 +94,7 @@ def test_dense_sample_with_weight_decay(self): iterations=1, expected=[[0.945278642, 1.8905572849], [2.9100000012, 3.8800000016]], - optimizer=Novograd( - lr=0.1, - weight_decay=0.1, - epsilon=1e-8), + optimizer=Novograd(lr=0.1, weight_decay=0.1, epsilon=1e-8), ) def test_sparse_sample_with_weight_decay(self): @@ -107,10 +104,7 @@ def test_sparse_sample_with_weight_decay(self): iterations=1, expected=[[0.945278642, 1.8905572849], [2.9100000012, 3.8800000016]], - optimizer=Novograd( - lr=0.1, - weight_decay=0.1, - epsilon=1e-8), + optimizer=Novograd(lr=0.1, weight_decay=0.1, epsilon=1e-8), ) def test_fit_simple_linear_model(self): From 515b34b9c7ca2dc6f2b04215b80e9030007c0a06 Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Tue, 7 Jan 2020 20:19:23 +0000 Subject: [PATCH 54/62] Use keras training ops --- tensorflow_addons/optimizers/novograd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow_addons/optimizers/novograd.py b/tensorflow_addons/optimizers/novograd.py index 27f4b22bfc..efa9173223 100644 --- a/tensorflow_addons/optimizers/novograd.py +++ b/tensorflow_addons/optimizers/novograd.py @@ -178,7 +178,7 @@ def _resource_apply_dense(self, grad, var, apply_state=None): grad = tf.cond(grad_averaging, lambda: grad * coefficients[ 'one_minus_beta_1_t'], lambda: grad) m = self.get_slot(var, 'm') - return training_ops.resource_apply_momentum( + return training_ops.resource_apply_keras_momentum( var.handle, m.handle, coefficients['lr_t'], @@ -217,7 +217,7 @@ def _resource_apply_sparse(self, grad, var, indices, apply_state=None): tf.logical_and(grad_averaging, tf.not_equal(self.iterations, 0)), lambda: grad * coefficients['one_minus_beta_1_t'], lambda: grad) m = self.get_slot(var, 'm') - 
return training_ops.resource_sparse_apply_momentum( + return training_ops.resource_sparse_apply_keras_momentum( var.handle, m.handle, coefficients['lr_t'], From ab08e5fd17ebc9aa9ec7d0f8a3401fbdfb6f35a6 Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Wed, 8 Jan 2020 23:39:49 +0000 Subject: [PATCH 55/62] Address comments --- tensorflow_addons/optimizers/__init__.py | 2 +- tensorflow_addons/optimizers/novograd.py | 45 ++++++++++--------- tensorflow_addons/optimizers/novograd_test.py | 24 +++++----- 3 files changed, 35 insertions(+), 36 deletions(-) diff --git a/tensorflow_addons/optimizers/__init__.py b/tensorflow_addons/optimizers/__init__.py index 919eb76502..a4f49a0ea1 100644 --- a/tensorflow_addons/optimizers/__init__.py +++ b/tensorflow_addons/optimizers/__init__.py @@ -31,7 +31,7 @@ from tensorflow_addons.optimizers.lazy_adam import LazyAdam from tensorflow_addons.optimizers.lookahead import Lookahead from tensorflow_addons.optimizers.moving_average import MovingAverage -from tensorflow_addons.optimizers.novograd import Novograd +from tensorflow_addons.optimizers.novograd import NovoGrad from tensorflow_addons.optimizers.rectified_adam import RectifiedAdam from tensorflow_addons.optimizers.stochastic_weight_averaging import SWA from tensorflow_addons.optimizers.weight_decay_optimizers import AdamW diff --git a/tensorflow_addons/optimizers/novograd.py b/tensorflow_addons/optimizers/novograd.py index efa9173223..6669247dbd 100644 --- a/tensorflow_addons/optimizers/novograd.py +++ b/tensorflow_addons/optimizers/novograd.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Novograd for TensorFlow.""" +"""NovoGrad for TensorFlow.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -22,8 +22,8 @@ @tf.keras.utils.register_keras_serializable(package='Addons') -class Novograd(tf.keras.optimizers.Optimizer): - """The Novograd Optimizer was first proposed in [Stochastic Gradient +class NovoGrad(tf.keras.optimizers.Optimizer): + """The NovoGrad Optimizer was first proposed in [Stochastic Gradient Methods with Layerwise Adaptvie Moments for training of Deep Networks](https://arxiv.org/pdf/1905.11286.pdf) @@ -34,7 +34,6 @@ class Novograd(tf.keras.optimizers.Optimizer): computation please refer to this [link](https://nvidia.github.io/OpenSeq2Seq/html/optimizers.html): - ``` Second order moment = exponential moving average of Layer-wise square of grads: v_t <-- beta_2 * v_{t-1} + (1-beta_2) * (g_t)^2 @@ -55,11 +54,10 @@ class Novograd(tf.keras.optimizers.Optimizer): (w_d * w_{t-1}))] Weight update: w_t <- w_{t-1} - lr_t * m_t - ``` Example of usage: ```python - opt = tfa.optimizers.Novograd( + opt = tfa.optimizers.NovoGrad( lr=1e-3, beta_1=0.9, beta_2=0.999, @@ -77,9 +75,9 @@ def __init__(self, weight_decay=0.0, grad_averaging=False, amsgrad=False, - name='Novograd', + name='NovoGrad', **kwargs): - r"""Construct a new RAdam optimizer. + r"""Construct a new NovoGrad optimizer. Args: learning_rate: A `Tensor` or a floating point value. or a schedule @@ -100,7 +98,7 @@ def __init__(self, decay of learning rate. `lr` is included for backward compatibility, recommended to use `learning_rate` instead. 
""" - super(Novograd, self).__init__(name, **kwargs) + super(NovoGrad, self).__init__(name, **kwargs) if weight_decay < 0.0: raise ValueError('Weight decay rate cannot be negative') self._set_hyper('learning_rate', kwargs.get('lr', learning_rate)) @@ -127,7 +125,7 @@ def _create_slots(self, var_list): self.add_slot(var, 'vhat') def _prepare_local(self, var_device, var_dtype, apply_state): - super(Novograd, self)._prepare_local(var_device, var_dtype, + super(NovoGrad, self)._prepare_local(var_device, var_dtype, apply_state) beta_1_t = tf.identity(self._get_hyper('beta_1', var_dtype)) beta_2_t = tf.identity(self._get_hyper('beta_2', var_dtype)) @@ -148,7 +146,7 @@ def set_weights(self, weights): num_vars = int((len(params) - 1) / 2) if len(weights) == 3 * num_vars + 1: weights = weights[:len(params)] - super(Novograd, self).set_weights(weights) + super(NovoGrad, self).set_weights(weights) def _resource_apply_dense(self, grad, var, apply_state=None): var_device, var_dtype = var.device, var.dtype.base_dtype @@ -160,8 +158,9 @@ def _resource_apply_dense(self, grad, var, apply_state=None): v = self.get_slot(var, 'v') g_2 = tf.reduce_sum(tf.square(tf.cast(grad, tf.float32))) v_t = tf.cond( - tf.equal(self.iterations, - 0), lambda: g_2, lambda: v * coefficients['beta_2_t'] + + tf.equal(self.iterations, 0), + lambda: g_2, + lambda: v * coefficients['beta_2_t'] + g_2 * coefficients['one_minus_beta_2_t']) v_t = v.assign(v_t, use_locking=self._use_locking) @@ -173,8 +172,9 @@ def _resource_apply_dense(self, grad, var, apply_state=None): else: grad = grad / (tf.sqrt(v_t) + self.epsilon) grad = tf.cond( - tf.greater(weight_decay, - 0), lambda: grad + weight_decay * var, lambda: grad) + tf.greater(weight_decay, 0), + lambda: grad + weight_decay * var, + lambda: grad) grad = tf.cond(grad_averaging, lambda: grad * coefficients[ 'one_minus_beta_1_t'], lambda: grad) m = self.get_slot(var, 'm') @@ -198,8 +198,9 @@ def _resource_apply_sparse(self, grad, var, indices, apply_state=None): g_2 = tf.reduce_sum(tf.square(tf.cast(grad, tf.float32))) # v is just a scalar and does not need to involve sparse tensors. 
v_t = tf.cond( - tf.equal(self.iterations, - 0), lambda: g_2, lambda: v * coefficients['beta_2_t'] + + tf.equal(self.iterations, 0), + lambda: g_2, + lambda: v * coefficients['beta_2_t'] + g_2 * coefficients['one_minus_beta_2_t']) v_t = v.assign(v_t, use_locking=self._use_locking) @@ -211,11 +212,13 @@ def _resource_apply_sparse(self, grad, var, indices, apply_state=None): else: grad = grad / (tf.sqrt(v_t) + self.epsilon) grad = tf.cond( - tf.greater(weight_decay, - 0), lambda: grad + weight_decay * var, lambda: grad) + tf.greater(weight_decay, 0), + lambda: grad + weight_decay * var, + lambda: grad) grad = tf.cond( tf.logical_and(grad_averaging, tf.not_equal(self.iterations, 0)), - lambda: grad * coefficients['one_minus_beta_1_t'], lambda: grad) + lambda: grad * coefficients['one_minus_beta_1_t'], + lambda: grad) m = self.get_slot(var, 'm') return training_ops.resource_sparse_apply_keras_momentum( var.handle, @@ -228,7 +231,7 @@ def _resource_apply_sparse(self, grad, var, indices, apply_state=None): use_nesterov=False) def get_config(self): - config = super(Novograd, self).get_config() + config = super(NovoGrad, self).get_config() config.update({ 'learning_rate': self._serialize_hyperparameter('learning_rate'), diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index 0d86ab5409..edec0a6f9c 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -12,21 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Tests for Novograd Optimizer.""" +"""Tests for NovoGrad Optimizer.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function -import tensorflow as tf import numpy as np +import tensorflow as tf +from tensorflow_addons.optimizers import NovoGrad from tensorflow_addons.utils import test_utils -from tensorflow_addons.optimizers import Novograd @test_utils.run_all_in_graph_and_eager_modes -class NovogradTest(tf.test.TestCase): +class NovoGradTest(tf.test.TestCase): def run_dense_sample(self, iterations, expected, optimizer): var_0 = tf.Variable([1.0, 2.0], dtype=tf.dtypes.float32) var_1 = tf.Variable([3.0, 4.0], dtype=tf.dtypes.float32) @@ -72,12 +72,11 @@ def run_sparse_sample(self, iterations, expected, optimizer): self.assertAllClose(var_1.read_value(), expected[1], atol=2e-4) def test_dense_sample(self): - self.run_dense_sample( iterations=1, expected=[[0.9552786425, 1.9105572849], [2.9400000012, 3.9200000016]], - optimizer=Novograd(lr=0.1, epsilon=1e-8), + optimizer=NovoGrad(lr=0.1, epsilon=1e-8), ) def test_sparse_sample(self): @@ -85,26 +84,23 @@ def test_sparse_sample(self): iterations=1, expected=[[0.9552786425, 1.9105572849], [2.9400000012, 3.9200000016]], - optimizer=Novograd(lr=0.1, epsilon=1e-8), + optimizer=NovoGrad(lr=0.1, epsilon=1e-8), ) def test_dense_sample_with_weight_decay(self): - # Expected values are obtained from the official implementation self.run_dense_sample( iterations=1, expected=[[0.945278642, 1.8905572849], [2.9100000012, 3.8800000016]], - optimizer=Novograd(lr=0.1, weight_decay=0.1, epsilon=1e-8), + optimizer=NovoGrad(lr=0.1, weight_decay=0.1, epsilon=1e-8), ) def test_sparse_sample_with_weight_decay(self): - # Expected values are obtained from the official implementation - # Dense results should be: [-0.2029, 0.7768], [1.7578, 2.7380] 
self.run_sparse_sample( iterations=1, expected=[[0.945278642, 1.8905572849], [2.9100000012, 3.8800000016]], - optimizer=Novograd(lr=0.1, weight_decay=0.1, epsilon=1e-8), + optimizer=NovoGrad(lr=0.1, weight_decay=0.1, epsilon=1e-8), ) def test_fit_simple_linear_model(self): @@ -117,7 +113,7 @@ def test_fit_simple_linear_model(self): model = tf.keras.models.Sequential() model.add(tf.keras.layers.Dense(input_shape=(3,), units=1)) - model.compile(Novograd(), loss='mse') + model.compile(NovoGrad(), loss='mse') model.fit(x, y, epochs=10) @@ -129,7 +125,7 @@ def test_fit_simple_linear_model(self): self.assertLess(max_abs_diff, 1e-2) def test_get_config(self): - opt = Novograd(lr=1e-4, weight_decay=0.0, grad_averaging=False) + opt = NovoGrad(lr=1e-4, weight_decay=0.0, grad_averaging=False) config = opt.get_config() self.assertEqual(config['learning_rate'], 1e-4) self.assertEqual(config['weight_decay'], 0.0) From 15dbe2663950ad2fb7c735eedb4eb85be22a9e20 Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Wed, 8 Jan 2020 23:40:06 +0000 Subject: [PATCH 56/62] Address comments --- tensorflow_addons/optimizers/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_addons/optimizers/README.md b/tensorflow_addons/optimizers/README.md index 50513f9ec0..a540be4bd5 100644 --- a/tensorflow_addons/optimizers/README.md +++ b/tensorflow_addons/optimizers/README.md @@ -26,7 +26,7 @@ | lazy_adam | LazyAdam | https://arxiv.org/abs/1412.6980 | | lookahead | Lookahead | https://arxiv.org/abs/1907.08610v1 | | moving_average | MovingAverage | | -| novograd | Novograd | https://nvidia.github.io/OpenSeq2Seq/html/optimizers.html | +| novograd | NovoGrad | https://nvidia.github.io/OpenSeq2Seq/html/optimizers.html | | rectified_adam | RectifiedAdam | https://arxiv.org/pdf/1908.03265v1.pdf | | stochastic_weight_averaging | SWA | https://arxiv.org/abs/1803.05407.pdf | | weight_decay_optimizers | SGDW, AdamW, extend_with_decoupled_weight_decay | https://arxiv.org/pdf/1711.05101.pdf | From a4d4783400f9e3aa3d21d71aea458cee7a698f66 Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Thu, 9 Jan 2020 00:04:57 +0000 Subject: [PATCH 57/62] Tests for grad_averaging --- tensorflow_addons/optimizers/novograd_test.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py index edec0a6f9c..632317e28b 100644 --- a/tensorflow_addons/optimizers/novograd_test.py +++ b/tensorflow_addons/optimizers/novograd_test.py @@ -103,6 +103,22 @@ def test_sparse_sample_with_weight_decay(self): optimizer=NovoGrad(lr=0.1, weight_decay=0.1, epsilon=1e-8), ) + def test_dense_sample_with_grad_averaging(self): + self.run_dense_sample( + iterations=1, + expected=[[0.9955278642, 1.9910557285], + [2.9940000001, 3.9920000002]], + optimizer=NovoGrad(lr=0.1, grad_averaging=True, epsilon=1e-8), + ) + + def test_sparse_sample_with_grad_averaging(self): + self.run_sparse_sample( + iterations=1, + expected=[[0.9955278642, 1.9910557285], + [2.9940000001, 3.9920000002]], + optimizer=NovoGrad(lr=0.1, grad_averaging=True, epsilon=1e-8), + ) + def test_fit_simple_linear_model(self): np.random.seed(0x2020) tf.random.set_seed(0x2020) From e43408485f912e529145cd405ef42981239e0c37 Mon Sep 17 00:00:00 2001 From: shreyashpatodia Date: Thu, 9 Jan 2020 00:54:07 +0000 Subject: [PATCH 58/62] Fix grad_averaging test --- tensorflow_addons/optimizers/novograd.py | 6 ++++-- tensorflow_addons/optimizers/novograd_test.py | 12 ++++++------ 2 files 
From e43408485f912e529145cd405ef42981239e0c37 Mon Sep 17 00:00:00 2001
From: shreyashpatodia
Date: Thu, 9 Jan 2020 00:54:07 +0000
Subject: [PATCH 58/62] Fix grad_averaging test

---
 tensorflow_addons/optimizers/novograd.py      |  6 ++++--
 tensorflow_addons/optimizers/novograd_test.py | 12 ++++++------
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/tensorflow_addons/optimizers/novograd.py b/tensorflow_addons/optimizers/novograd.py
index 6669247dbd..24966483d6 100644
--- a/tensorflow_addons/optimizers/novograd.py
+++ b/tensorflow_addons/optimizers/novograd.py
@@ -175,8 +175,10 @@ def _resource_apply_dense(self, grad, var, apply_state=None):
             tf.greater(weight_decay, 0),
             lambda: grad + weight_decay * var,
             lambda: grad)
-        grad = tf.cond(grad_averaging, lambda: grad * coefficients[
-            'one_minus_beta_1_t'], lambda: grad)
+        grad = tf.cond(
+            tf.logical_and(grad_averaging, tf.not_equal(self.iterations, 0)),
+            lambda: grad * coefficients['one_minus_beta_1_t'],
+            lambda: grad)
         m = self.get_slot(var, 'm')
         return training_ops.resource_apply_keras_momentum(
             var.handle,
diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py
index 632317e28b..a71fe8906f 100644
--- a/tensorflow_addons/optimizers/novograd_test.py
+++ b/tensorflow_addons/optimizers/novograd_test.py
@@ -105,17 +105,17 @@ def test_sparse_sample_with_weight_decay(self):

     def test_dense_sample_with_grad_averaging(self):
         self.run_dense_sample(
-            iterations=1,
-            expected=[[0.9955278642, 1.9910557285],
-                      [2.9940000001, 3.9920000002]],
+            iterations=2,
+            expected=[[0.9508087044, 1.9016174088],
+                      [2.9340000013, 3.9120000018]],
             optimizer=NovoGrad(lr=0.1, grad_averaging=True, epsilon=1e-8),
         )

     def test_sparse_sample_with_grad_averaging(self):
         self.run_sparse_sample(
-            iterations=1,
-            expected=[[0.9955278642, 1.9910557285],
-                      [2.9940000001, 3.9920000002]],
+            iterations=2,
+            expected=[[0.9508087044, 1.9016174088],
+                      [2.9340000013, 3.9120000018]],
             optimizer=NovoGrad(lr=0.1, grad_averaging=True, epsilon=1e-8),
         )

From 82071be7aabd132ae85b66d22d43c270d07a1845 Mon Sep 17 00:00:00 2001
From: shreyashpatodia
Date: Thu, 9 Jan 2020 01:05:19 +0000
Subject: [PATCH 59/62] Test fix

---
 tensorflow_addons/optimizers/novograd_test.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow_addons/optimizers/novograd_test.py b/tensorflow_addons/optimizers/novograd_test.py
index a71fe8906f..7f2009158d 100644
--- a/tensorflow_addons/optimizers/novograd_test.py
+++ b/tensorflow_addons/optimizers/novograd_test.py
@@ -106,16 +106,16 @@ def test_sparse_sample_with_weight_decay(self):
     def test_dense_sample_with_grad_averaging(self):
         self.run_dense_sample(
             iterations=2,
-            expected=[[0.9508087044, 1.9016174088],
-                      [2.9340000013, 3.9120000018]],
+            expected=[[0.9105572849, 1.8211145698],
+                      [2.8800000024, 3.8400000032]],
             optimizer=NovoGrad(lr=0.1, grad_averaging=True, epsilon=1e-8),
         )

     def test_sparse_sample_with_grad_averaging(self):
         self.run_sparse_sample(
             iterations=2,
-            expected=[[0.9508087044, 1.9016174088],
-                      [2.9340000013, 3.9120000018]],
+            expected=[[0.9105572849, 1.8211145698],
+                      [2.8800000024, 3.8400000032]],
             optimizer=NovoGrad(lr=0.1, grad_averaging=True, epsilon=1e-8),
         )
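The corrected two-iteration expectations can likewise be reproduced by hand. A minimal NumPy sketch, assuming constant per-step gradients of [0.1, 0.2] for var_0 and [0.03, 0.04] for var_1 (my guess at what the sample helpers feed, consistent with all of the expected values in these tests) and the defaults beta_1=0.9, beta_2=0.999:

    import numpy as np

    def novograd_grad_averaging(var, grad, steps=2, lr=0.1, beta_1=0.9, beta_2=0.999, eps=1e-8):
        v, m = 0.0, np.zeros_like(var)
        for step in range(steps):
            g_2 = np.sum(np.square(grad))
            v = g_2 if step == 0 else beta_2 * v + (1.0 - beta_2) * g_2
            g_hat = grad / (np.sqrt(v) + eps)
            if step > 0:  # the (1 - beta_1) factor is skipped on the very first step
                g_hat = (1.0 - beta_1) * g_hat
            m = beta_1 * m + g_hat if step > 0 else g_hat
            var = var - lr * m
        return var

    print(novograd_grad_averaging(np.array([1.0, 2.0]), np.array([0.1, 0.2])))
    # [0.91055728 1.82111457]  -> matches the updated dense/sparse expectations
    print(novograd_grad_averaging(np.array([3.0, 4.0]), np.array([0.03, 0.04])))
    # [2.88 3.84]
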
From 7a30de03dadc73aa6ec02256c988b32d23ad5a1c Mon Sep 17 00:00:00 2001
From: shreyashpatodia
Date: Thu, 9 Jan 2020 06:37:36 +0000
Subject: [PATCH 60/62] Change default epsilon value

---
 tensorflow_addons/optimizers/novograd.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow_addons/optimizers/novograd.py b/tensorflow_addons/optimizers/novograd.py
index 24966483d6..128f7e255a 100644
--- a/tensorflow_addons/optimizers/novograd.py
+++ b/tensorflow_addons/optimizers/novograd.py
@@ -71,7 +71,7 @@ def __init__(self,
                  learning_rate=0.001,
                  beta_1=0.9,
                  beta_2=0.999,
-                 epsilon=1e-8,
+                 epsilon=1e-7,
                  weight_decay=0.0,
                  grad_averaging=False,
                  amsgrad=False,

From 185fe926770914e22e5b13cfde07014da5e1105b Mon Sep 17 00:00:00 2001
From: shreyashpatodia
Date: Thu, 9 Jan 2020 13:16:55 +0000
Subject: [PATCH 61/62] Fix code format

---
 tensorflow_addons/optimizers/novograd.py | 26 +++++++++---------------
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/tensorflow_addons/optimizers/novograd.py b/tensorflow_addons/optimizers/novograd.py
index 128f7e255a..3bdb8093d5 100644
--- a/tensorflow_addons/optimizers/novograd.py
+++ b/tensorflow_addons/optimizers/novograd.py
@@ -158,9 +158,8 @@ def _resource_apply_dense(self, grad, var, apply_state=None):
         v = self.get_slot(var, 'v')
         g_2 = tf.reduce_sum(tf.square(tf.cast(grad, tf.float32)))
         v_t = tf.cond(
-            tf.equal(self.iterations, 0),
-            lambda: g_2,
-            lambda: v * coefficients['beta_2_t'] +
+            tf.equal(self.iterations,
+                     0), lambda: g_2, lambda: v * coefficients['beta_2_t'] +
             g_2 * coefficients['one_minus_beta_2_t'])
         v_t = v.assign(v_t, use_locking=self._use_locking)

@@ -172,13 +171,11 @@ def _resource_apply_dense(self, grad, var, apply_state=None):
         else:
             grad = grad / (tf.sqrt(v_t) + self.epsilon)
         grad = tf.cond(
-            tf.greater(weight_decay, 0),
-            lambda: grad + weight_decay * var,
-            lambda: grad)
+            tf.greater(weight_decay,
+                       0), lambda: grad + weight_decay * var, lambda: grad)
         grad = tf.cond(
             tf.logical_and(grad_averaging, tf.not_equal(self.iterations, 0)),
-            lambda: grad * coefficients['one_minus_beta_1_t'],
-            lambda: grad)
+            lambda: grad * coefficients['one_minus_beta_1_t'], lambda: grad)
         m = self.get_slot(var, 'm')
         return training_ops.resource_apply_keras_momentum(
             var.handle,
@@ -200,9 +197,8 @@ def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
         g_2 = tf.reduce_sum(tf.square(tf.cast(grad, tf.float32)))
         # v is just a scalar and does not need to involve sparse tensors.
         v_t = tf.cond(
-            tf.equal(self.iterations, 0),
-            lambda: g_2,
-            lambda: v * coefficients['beta_2_t'] +
+            tf.equal(self.iterations,
+                     0), lambda: g_2, lambda: v * coefficients['beta_2_t'] +
             g_2 * coefficients['one_minus_beta_2_t'])
         v_t = v.assign(v_t, use_locking=self._use_locking)

@@ -214,13 +210,11 @@ def _resource_apply_sparse(self, grad, var, indices, apply_state=None):
         else:
             grad = grad / (tf.sqrt(v_t) + self.epsilon)
         grad = tf.cond(
-            tf.greater(weight_decay, 0),
-            lambda: grad + weight_decay * var,
-            lambda: grad)
+            tf.greater(weight_decay,
+                       0), lambda: grad + weight_decay * var, lambda: grad)
         grad = tf.cond(
             tf.logical_and(grad_averaging, tf.not_equal(self.iterations, 0)),
-            lambda: grad * coefficients['one_minus_beta_1_t'],
-            lambda: grad)
+            lambda: grad * coefficients['one_minus_beta_1_t'], lambda: grad)
         m = self.get_slot(var, 'm')
         return training_ops.resource_sparse_apply_keras_momentum(
             var.handle,

From 977cb92fa34fb0031b7e47c97f87aef6975035d3 Mon Sep 17 00:00:00 2001
From: shreyashpatodia
Date: Sat, 11 Jan 2020 21:39:13 +0000
Subject: [PATCH 62/62] docs: add TODO

---
 tensorflow_addons/optimizers/novograd.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow_addons/optimizers/novograd.py b/tensorflow_addons/optimizers/novograd.py
index 3bdb8093d5..238faf678f 100644
--- a/tensorflow_addons/optimizers/novograd.py
+++ b/tensorflow_addons/optimizers/novograd.py
@@ -18,6 +18,7 @@
 from __future__ import print_function

 import tensorflow as tf
+# TODO: Find public API alternatives to these
 from tensorflow.python.training import training_ops
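One possible direction for the TODO above is to replace the fused kernel with plain variable ops. The sketch below is only a guess at the semantics of the dense op with use_nesterov=False (Keras-style momentum: m <- momentum * m - lr * g_hat, then var <- var + m) and would need to be verified against training_ops.resource_apply_keras_momentum before it could stand in for it:

    import tensorflow as tf

    def apply_keras_momentum(var, m, g_hat, lr, momentum, use_locking=False):
        # Hypothetical pure-TF stand-in for the fused keras-momentum kernel
        # (use_nesterov=False); semantics assumed, not taken from this PR.
        m_t = m.assign(momentum * m - lr * g_hat, use_locking=use_locking)
        return var.assign_add(m_t, use_locking=use_locking)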