diff --git a/tensorflow_addons/optimizers/novograd.py b/tensorflow_addons/optimizers/novograd.py
index bdea0a9079..a089651488 100644
--- a/tensorflow_addons/optimizers/novograd.py
+++ b/tensorflow_addons/optimizers/novograd.py
@@ -23,16 +23,16 @@
 
 @tf.keras.utils.register_keras_serializable(package="Addons")
 class NovoGrad(tf.keras.optimizers.Optimizer):
-    """The NovoGrad Optimizer was first proposed in [Stochastic Gradient
-    Methods with Layerwise Adaptvie Moments for training of Deep
-    Networks](https://arxiv.org/pdf/1905.11286.pdf)
-
-    NovoGrad is a first-order SGD-based algorithm, which computes second
-    moments per layer instead of per weight as in Adam. Compared to Adam,
-    NovoGrad takes less memory, and has been found to be more numerically
-    stable. More specifically we compute (for more information on the
-    computation please refer to this
-    [link](https://nvidia.github.io/OpenSeq2Seq/html/optimizers.html):
+    """Optimizer that implements NovoGrad.
+
+    The NovoGrad Optimizer was first proposed in [Stochastic Gradient
+    Methods with Layerwise Adaptive Moments for training of Deep
+    Networks](https://arxiv.org/pdf/1905.11286.pdf). NovoGrad is a
+    first-order SGD-based algorithm, which computes second moments per
+    layer instead of per weight as in Adam. Compared to Adam, NovoGrad
+    takes less memory, and has been found to be more numerically stable.
+    (For more information on the computation please refer to this
+    [link](https://nvidia.github.io/OpenSeq2Seq/html/optimizers.html))
 
     Second order moment = exponential moving average of Layer-wise square
     of grads:
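
As context for the "second moments per layer" wording in the revised docstring, here is a minimal illustrative sketch of the layer-wise second-moment update. It is not the optimizer's actual implementation; the names `v`, `g`, and `beta_2` are placeholders, and the `beta_2` value is arbitrary rather than the addon's default.

```python
import tensorflow as tf

# Illustrative values only; not the addon's defaults or internal slot names.
beta_2 = 0.98
g = tf.random.normal([128, 64])   # gradient of a single layer's weights

# NovoGrad keeps one second moment per layer (the squared gradient norm),
# rather than a per-weight second moment as in Adam.
g_norm_sq = tf.reduce_sum(tf.square(g))

v = g_norm_sq                     # first step: v_1 = ||g_1||^2
# Subsequent steps: exponential moving average of the layer-wise squared norm,
# v_t = beta_2 * v_{t-1} + (1 - beta_2) * ||g_t||^2
v = beta_2 * v + (1.0 - beta_2) * g_norm_sq
```

Because `v` is a scalar per layer instead of a tensor per weight, the optimizer's state is much smaller than Adam's, which is the memory saving the docstring refers to.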