diff --git a/tensorflow_addons/optimizers/novograd.py b/tensorflow_addons/optimizers/novograd.py
index bdea0a9079..a089651488 100644
--- a/tensorflow_addons/optimizers/novograd.py
+++ b/tensorflow_addons/optimizers/novograd.py
@@ -23,16 +23,16 @@
 
 @tf.keras.utils.register_keras_serializable(package="Addons")
 class NovoGrad(tf.keras.optimizers.Optimizer):
-    """The NovoGrad Optimizer was first proposed in [Stochastic Gradient
-    Methods with Layerwise Adaptvie Moments for training of Deep
-    Networks](https://arxiv.org/pdf/1905.11286.pdf)
-
-    NovoGrad is a first-order SGD-based algorithm, which computes second
-    moments per layer instead of per weight as in Adam. Compared to Adam,
-    NovoGrad takes less memory, and has been found to be more numerically
-    stable. More specifically we compute (for more information on the
-    computation please refer to this
-    [link](https://nvidia.github.io/OpenSeq2Seq/html/optimizers.html):
+    """Optimizer that implements NovoGrad.
+
+    The NovoGrad Optimizer was first proposed in [Stochastic Gradient
+    Methods with Layerwise Adaptive Moments for training of Deep
+    Networks](https://arxiv.org/pdf/1905.11286.pdf). NovoGrad is a
+    first-order SGD-based algorithm, which computes second moments per
+    layer instead of per weight as in Adam. Compared to Adam, NovoGrad
+    takes less memory, and has been found to be more numerically stable.
+    (For more information on the computation please refer to this
+    [link](https://nvidia.github.io/OpenSeq2Seq/html/optimizers.html))
 
     Second order moment = exponential moving average of Layer-wise square
     of grads:
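
As context for the "second moments per layer" wording in the revised docstring, here is a minimal illustrative sketch of the layer-wise second-moment update. It is not the optimizer's actual implementation; the names `v`, `g`, and `beta_2` are placeholders, and the `beta_2` value is arbitrary rather than the addon's default.

```python
import tensorflow as tf

# Illustrative values only; not the addon's defaults or internal slot names.
beta_2 = 0.98
g = tf.random.normal([128, 64])   # gradient of a single layer's weights

# NovoGrad keeps one second moment per layer (the squared gradient norm),
# rather than a per-weight second moment as in Adam.
g_norm_sq = tf.reduce_sum(tf.square(g))

v = g_norm_sq                     # first step: v_1 = ||g_1||^2
# Subsequent steps: exponential moving average of the layer-wise squared norm,
# v_t = beta_2 * v_{t-1} + (1 - beta_2) * ||g_t||^2
v = beta_2 * v + (1.0 - beta_2) * g_norm_sq
```

Because `v` is a scalar per layer instead of a tensor per weight, the optimizer's state is much smaller than Adam's, which is the memory saving the docstring refers to.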