From 4111955c228c8c2b7540afdc31421d7016f5cf15 Mon Sep 17 00:00:00 2001 From: "dong.hyun" Date: Tue, 12 Jan 2021 14:04:11 +0900 Subject: [PATCH 01/16] add clip_grad_by_value feature --- pytorch_lightning/accelerators/accelerator.py | 25 +++++++-- .../accelerators/tpu_accelerator.py | 44 +++++++++------- pytorch_lightning/plugins/apex.py | 52 +++++++++++-------- pytorch_lightning/plugins/native_amp.py | 12 ++++- pytorch_lightning/plugins/precision_plugin.py | 6 ++- .../plugins/sharded_native_amp_plugin.py | 17 ++++-- .../connectors/training_trick_connector.py | 13 ++++- pytorch_lightning/trainer/trainer.py | 10 +++- 8 files changed, 126 insertions(+), 53 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 9b56119a04c3e..e97bf30e263d0 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -119,14 +119,29 @@ def clip_gradients(self, optimizer, clip_val=None): if grad_clip_val <= 0: return - self._clip_gradients(optimizer, grad_clip_val) - - def _clip_gradients(self, optimizer: Optimizer, grad_clip_val: Union[float, int], norm_type: float = 2.0): + self._clip_gradients(optimizer=optimizer, + grad_clip_val=grad_clip_val, + gradient_clip_algorithm=self.trainer.gradient_clip_algorithm, + norm_type=self.trainer.gradient_clip_norm_type) + + def _clip_gradients(self, + optimizer: Optimizer, + grad_clip_val: Union[float, int], + gradient_clip_algorithm: str, + norm_type: Union[float, int]): if self.trainer.amp_backend: - self.trainer.precision_connector.backend.clip_gradients(grad_clip_val, optimizer, norm_type) + self.trainer.precision_connector.backend.clip_gradients(optimizer=optimizer, + grad_clip_val=grad_clip_val, + gradient_clip_algorithm=gradient_clip_algorithm, + norm_type=norm_type) else: model = self.trainer.get_model() - torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_val, norm_type=norm_type) + if gradient_clip_algorithm == 'value': + torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=grad_clip_val) + elif gradient_clip_algorithm.startswith('norm'): + torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_val, norm_type=norm_type) + else: + raise ValueError(f'gradient_clip_algorithm [{gradient_clip_algorithm}] is not valid.') def on_train_epoch_end(self, outputs): pass diff --git a/pytorch_lightning/accelerators/tpu_accelerator.py b/pytorch_lightning/accelerators/tpu_accelerator.py index 66fc236a2a775..2cc58fdd2890f 100644 --- a/pytorch_lightning/accelerators/tpu_accelerator.py +++ b/pytorch_lightning/accelerators/tpu_accelerator.py @@ -244,27 +244,35 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): return closure_loss - def _clip_gradients(self, optimizer: Optimizer, grad_clip_val: Union[float, int], norm_type: float = 2.0): - # this code is a modification of torch.nn.utils.clip_grad_norm_ + def _clip_gradients(self, + optimizer: Optimizer, + grad_clip_val: Union[float, int], + gradient_clip_algorithm: str, + norm_type: Union[float, int]): + # this code contains a modification of torch.nn.utils.clip_grad_norm_ # with TPU support based on https://github.com/pytorch/xla/blob/master/TROUBLESHOOTING.md model = self.trainer.get_model() parameters = model.parameters() - max_norm = grad_clip_val - - if isinstance(parameters, torch.Tensor): - parameters = [parameters] - parameters = list(filter(lambda p: p.grad is not None, parameters)) - - device = parameters[0].device - out = 
torch.empty(len(parameters), device=device) - for i, p in enumerate(parameters): - torch.norm(p.grad.data.to(device), norm_type, out=out[i]) - total_norm = torch.norm(out, norm_type) - - clip_coef = torch.tensor(max_norm, device=device) / (total_norm + self.norm_clipping_epsilon) - clip_coef = torch.min(clip_coef, torch.ones_like(clip_coef)) - for p in parameters: - p.grad.data.mul_(clip_coef.to(p.grad.data.device)) + if gradient_clip_algorithm == 'value': + torch.nn.utils.clip_grad_value_(parameters, clip_value=grad_clip_val) + elif gradient_clip_algorithm.startswith('norm'): + max_norm = grad_clip_val + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = list(filter(lambda p: p.grad is not None, parameters)) + + device = parameters[0].device + out = torch.empty(len(parameters), device=device) + for i, p in enumerate(parameters): + torch.norm(p.grad.data.to(device), norm_type, out=out[i]) + total_norm = torch.norm(out, norm_type) + + clip_coef = torch.tensor(max_norm, device=device) / (total_norm + self.norm_clipping_epsilon) + clip_coef = torch.min(clip_coef, torch.ones_like(clip_coef)) + for p in parameters: + p.grad.data.mul_(clip_coef.to(p.grad.data.device)) + else: + raise ValueError(f'gradient_clip_algorithm [{gradient_clip_algorithm}] is not valid.') def barrier(self, name: Optional[str] = None): torch_xla.core.xla_model.rendezvous(f"pl.Trainer.{name}") diff --git a/pytorch_lightning/plugins/apex.py b/pytorch_lightning/plugins/apex.py index f80461e5d4fe5..efd43a62421e6 100644 --- a/pytorch_lightning/plugins/apex.py +++ b/pytorch_lightning/plugins/apex.py @@ -99,33 +99,43 @@ def configure_apex(self, amp, model, optimizers, amp_level): model, optimizers = amp.initialize(model, optimizers, opt_level=amp_level) return model, optimizers - def clip_gradients(self, grad_clip_val: Union[int, float], optimizer: Optimizer, norm_type: float): + def clip_gradients(self, + optimizer: Optimizer, + grad_clip_val: Union[int, float], + gradient_clip_algorithm: str, + norm_type: Union[float, int]): """ - This code is a modification of :meth:`torch.nn.utils.clip_grad_norm_` using a higher epsilon for fp16 weights. - This is important when setting amp_level to O2, and the master weights are in fp16. + This code contains a modification of :meth:`torch.nn.utils.clip_grad_norm_` using a higher epsilon + for fp16 weights. This is important when setting amp_level to O2, and the master weights are in fp16. Args: - grad_clip_val: Maximum norm of gradients. optimizer: Optimizer with gradients that will be clipped. - norm_type: (float or int): type of the used p-norm. Can be ``'inf'`` for - infinity norm. + grad_clip_val: Maximum norm of gradients. + gradient_clip_algorithm: 'value' means clip_by_value, regex '^norm[1-9]([0-9]{1,45}$)' means clip_by_norm. + norm_type: (float or int): type of the used p-norm. Can be ``'inf'`` for infinity norm. """ model = self.trainer.get_model() parameters = model.parameters() - max_norm = float(grad_clip_val) - - if isinstance(parameters, torch.Tensor): - parameters = [parameters] - parameters = [p for p in parameters if p.grad is not None] - - if len(parameters) == 0: - return torch.tensor(0.) 
- device = parameters[0].grad.device - total_norm = torch.norm( - torch.stack([torch.norm(p.grad.detach(), norm_type).to(device) for p in parameters]), norm_type) - clip_coef = max_norm / (total_norm + self.norm_clipping_epsilon) - if clip_coef < 1: - for p in parameters: - p.grad.detach().mul_(clip_coef.to(p.grad.device)) + + if gradient_clip_algorithm == 'value': + torch.nn.utils.clip_grad_value_(parameters, clip_value=grad_clip_val) + elif gradient_clip_algorithm.startswith('norm'): + max_norm = float(grad_clip_val) + + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = [p for p in parameters if p.grad is not None] + + if len(parameters) == 0: + return torch.tensor(0.) + device = parameters[0].grad.device + total_norm = torch.norm( + torch.stack([torch.norm(p.grad.detach(), norm_type).to(device) for p in parameters]), norm_type) + clip_coef = max_norm / (total_norm + self.norm_clipping_epsilon) + if clip_coef < 1: + for p in parameters: + p.grad.detach().mul_(clip_coef.to(p.grad.device)) + else: + raise ValueError(f'gradient_clip_algorithm [{gradient_clip_algorithm}] is not valid.') @property def norm_clipping_epsilon(self): diff --git a/pytorch_lightning/plugins/native_amp.py b/pytorch_lightning/plugins/native_amp.py index 4df5d128476a4..4cc85e1add4ed 100644 --- a/pytorch_lightning/plugins/native_amp.py +++ b/pytorch_lightning/plugins/native_amp.py @@ -56,9 +56,19 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): return closure_loss - def clip_gradients(self, grad_clip_val: Union[int, float], optimizer: Optimizer, norm_type: float): + def clip_gradients(self, + optimizer: Optimizer, + grad_clip_val: Union[int, float], + gradient_clip_algorithm: str, + norm_type: Union[float, int]): model = self.trainer.get_model() torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_val, norm_type=norm_type) + if gradient_clip_algorithm == 'value': + torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=grad_clip_val) + elif gradient_clip_algorithm.startswith('norm'): + torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_val, norm_type=norm_type) + else: + raise ValueError(f'gradient_clip_algorithm [{gradient_clip_algorithm}] is not valid.') @property def scaler(self): diff --git a/pytorch_lightning/plugins/precision_plugin.py b/pytorch_lightning/plugins/precision_plugin.py index aaac3ede3c623..2d64f48855150 100644 --- a/pytorch_lightning/plugins/precision_plugin.py +++ b/pytorch_lightning/plugins/precision_plugin.py @@ -35,5 +35,9 @@ def training_step(self, fx, args): def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): raise NotImplementedError - def clip_gradients(self, grad_clip_val: Union[int, float], optimizer: Optimizer, norm_type: float): + def clip_gradients(self, + optimizer: Optimizer, + grad_clip_val: Union[int, float], + gradient_clip_algorithm: str, + norm_type: Union[float, int]): raise NotImplementedError diff --git a/pytorch_lightning/plugins/sharded_native_amp_plugin.py b/pytorch_lightning/plugins/sharded_native_amp_plugin.py index 5ddd29521203d..e14f49c94f927 100644 --- a/pytorch_lightning/plugins/sharded_native_amp_plugin.py +++ b/pytorch_lightning/plugins/sharded_native_amp_plugin.py @@ -28,8 +28,15 @@ class ShardedNativeAMPPlugin(NativeAMPPlugin): def scaler(self): return ShardedGradScaler() - def clip_gradients(self, grad_clip_val: Union[int, float], optimizer: Optimizer, norm_type: float): - max_norm = grad_clip_val - norm_type = float(2.0) - optimizer = 
cast(OSS, optimizer) - optimizer.clip_grad_norm(max_norm, norm_type=norm_type) + def clip_gradients(self, + optimizer: Optimizer, + grad_clip_val: Union[int, float], + gradient_clip_algorithm: str, + norm_type: Union[float, int]): + if gradient_clip_algorithm == 'value': + raise NotImplementedError("Value grad clipping with sharded ddp is not implemented yet") + elif gradient_clip_algorithm.startswith('norm'): + optimizer = cast(OSS, optimizer) + optimizer.clip_grad_norm(grad_clip_val, norm_type=norm_type) + else: + raise ValueError(f'gradient_clip_algorithm [{gradient_clip_algorithm}] is not valid.') \ No newline at end of file diff --git a/pytorch_lightning/trainer/connectors/training_trick_connector.py b/pytorch_lightning/trainer/connectors/training_trick_connector.py index 273efad403d10..b839429f16f77 100644 --- a/pytorch_lightning/trainer/connectors/training_trick_connector.py +++ b/pytorch_lightning/trainer/connectors/training_trick_connector.py @@ -11,8 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from pytorch_lightning.utilities.exceptions import MisconfigurationException +import re + from pytorch_lightning.callbacks import GradientAccumulationScheduler +from pytorch_lightning.utilities.exceptions import MisconfigurationException class TrainingTricksConnector: @@ -23,6 +25,7 @@ def __init__(self, trainer): def on_trainer_init( self, gradient_clip_val, + gradient_clip_algorithm, track_grad_norm, accumulate_grad_batches, truncated_bptt_steps, @@ -32,7 +35,15 @@ def on_trainer_init( self.trainer.terminate_on_nan = terminate_on_nan # gradient clipping + regex = '^norm[1-9]([0-9]{1,45}$)' + if gradient_clip_algorithm != 'value' and re.match(regex, gradient_clip_algorithm) is None: + raise MisconfigurationException(f"gradient_clip_algorithm should be value or match with regex {regex}") self.trainer.gradient_clip_val = gradient_clip_val + self.trainer.gradient_clip_algorithm = gradient_clip_algorithm + if gradient_clip_algorithm == 'value': + self.trainer.gradient_clip_norm_type = None + else: + self.trainer.gradient_clip_norm_type = int(gradient_clip_algorithm[4:]) # gradient norm tracking if not isinstance(track_grad_norm, (int, float)) and track_grad_norm != 'inf': diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index b923ae9adce0c..9f5c00b6876b5 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -88,6 +88,7 @@ def __init__( callbacks: Optional[Union[List[Callback], Callback]] = None, default_root_dir: Optional[str] = None, gradient_clip_val: float = 0, + gradient_clip_algorithm: str = 'norm2', process_position: int = 0, num_nodes: int = 1, num_processes: int = 1, @@ -197,6 +198,8 @@ def __init__( gradient_clip_val: 0 means don't clip. + gradient_clip_algorithm: 'value' means clip_by_value, regex '^norm[1-9]([0-9]{1,45}$)' means clip_by_norm. 
+ limit_train_batches: How much of training dataset to check (floats = percent, int = num_batches) limit_val_batches: How much of validation dataset to check (floats = percent, int = num_batches) @@ -345,7 +348,12 @@ def __init__( # init training tricks self.training_tricks_connector.on_trainer_init( - gradient_clip_val, track_grad_norm, accumulate_grad_batches, truncated_bptt_steps, terminate_on_nan + gradient_clip_val, + gradient_clip_algorithm, + track_grad_norm, + accumulate_grad_batches, + truncated_bptt_steps, + terminate_on_nan, ) # init accelerator related flags From c09e990f9f4731cdc45e64e9c17b56fded3f7700 Mon Sep 17 00:00:00 2001 From: "dong.hyun" Date: Tue, 12 Jan 2021 14:59:16 +0900 Subject: [PATCH 02/16] write changelog, training_tricks.rst --- CHANGELOG.md | 3 +++ docs/source/training_tricks.rst | 16 +++++++++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index eb1963d64d954..25c73bad9505c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -47,6 +47,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added `IoU` class interface ([#4704](https://github.com/PyTorchLightning/pytorch-lightning/pull/4704)) +- Added `gradient_clip_algorithm` argument to Trainer for gradient clipping by value ([#5477](https://github.com/PyTorchLightning/pytorch-lightning/pull/5477)). + + ### Changed - Changed `stat_scores` metric now calculates stat scores over all classes and gains new parameters, in line with the new `StatScores` metric ([#4839](https://github.com/PyTorchLightning/pytorch-lightning/pull/4839)) diff --git a/docs/source/training_tricks.rst b/docs/source/training_tricks.rst index 10ee668a97fa8..11aee5832a7c2 100644 --- a/docs/source/training_tricks.rst +++ b/docs/source/training_tricks.rst @@ -26,8 +26,12 @@ The effect is a large effective batch size of size KxN. Gradient Clipping ----------------- -Gradient clipping may be enabled to avoid exploding gradients. Specifically, this will `clip the gradient -norm `_ computed over all model parameters together. +Gradient clipping may be enabled to avoid exploding gradients. Also, you can choose various criterion by +`gradient_clip_algorithm` option. For example, if `gradient_clip_algorithm == 'value'`, this will `clip the gradient +by value `_ computed over all model parameters. +If `gradient_clip_algorithm == 'norm1'` `clip the gradient +norm `_ with l1 norm computed over +all model parameters together. .. 
seealso:: :class:`~pytorch_lightning.trainer.trainer.Trainer` @@ -36,9 +40,15 @@ norm `_ # DEFAULT (ie: don't clip) trainer = Trainer(gradient_clip_val=0) - # clip gradients with norm above 0.5 + # clip gradients with norm-2 above 0.5 trainer = Trainer(gradient_clip_val=0.5) + # clip gradients with norm-1 above 0.5 + trainer = Trainer(gradient_clip_val=0.5, gradient_clip_algorithm='norm1') + + # clip gradients with value above 0.5 + trainer = Trainer(gradient_clip_val=0.5, gradient_clip_algorithm='value' ) + ---------- Auto scaling of batch size From baa9a497756288016c7af3a7206a742156784b51 Mon Sep 17 00:00:00 2001 From: "dong.hyun" Date: Tue, 12 Jan 2021 15:01:50 +0900 Subject: [PATCH 03/16] add end line to sharded_natvie_amp_pluigin.py --- pytorch_lightning/plugins/sharded_native_amp_plugin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/plugins/sharded_native_amp_plugin.py b/pytorch_lightning/plugins/sharded_native_amp_plugin.py index e14f49c94f927..ef20d07086290 100644 --- a/pytorch_lightning/plugins/sharded_native_amp_plugin.py +++ b/pytorch_lightning/plugins/sharded_native_amp_plugin.py @@ -39,4 +39,4 @@ def clip_gradients(self, optimizer = cast(OSS, optimizer) optimizer.clip_grad_norm(grad_clip_val, norm_type=norm_type) else: - raise ValueError(f'gradient_clip_algorithm [{gradient_clip_algorithm}] is not valid.') \ No newline at end of file + raise ValueError(f'gradient_clip_algorithm [{gradient_clip_algorithm}] is not valid.') From a2366113f2d1713360e36a3582db15686f4de427 Mon Sep 17 00:00:00 2001 From: "dong.hyun" Date: Tue, 12 Jan 2021 15:31:28 +0900 Subject: [PATCH 04/16] bugfix update regex --- .../trainer/connectors/training_trick_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/connectors/training_trick_connector.py b/pytorch_lightning/trainer/connectors/training_trick_connector.py index b839429f16f77..d4a5b9f3c50ac 100644 --- a/pytorch_lightning/trainer/connectors/training_trick_connector.py +++ b/pytorch_lightning/trainer/connectors/training_trick_connector.py @@ -35,7 +35,7 @@ def on_trainer_init( self.trainer.terminate_on_nan = terminate_on_nan # gradient clipping - regex = '^norm[1-9]([0-9]{1,45}$)' + regex = '^norm[1-9]([0-9]{0,45}$)' if gradient_clip_algorithm != 'value' and re.match(regex, gradient_clip_algorithm) is None: raise MisconfigurationException(f"gradient_clip_algorithm should be value or match with regex {regex}") self.trainer.gradient_clip_val = gradient_clip_val From 749e286744b2003409b99da7d531217f3a54b9d0 Mon Sep 17 00:00:00 2001 From: "dong.hyun" Date: Tue, 12 Jan 2021 15:38:20 +0900 Subject: [PATCH 05/16] update regex documentation --- pytorch_lightning/plugins/apex.py | 2 +- pytorch_lightning/trainer/trainer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/plugins/apex.py b/pytorch_lightning/plugins/apex.py index efd43a62421e6..6f7fa19096b14 100644 --- a/pytorch_lightning/plugins/apex.py +++ b/pytorch_lightning/plugins/apex.py @@ -110,7 +110,7 @@ def clip_gradients(self, Args: optimizer: Optimizer with gradients that will be clipped. grad_clip_val: Maximum norm of gradients. - gradient_clip_algorithm: 'value' means clip_by_value, regex '^norm[1-9]([0-9]{1,45}$)' means clip_by_norm. + gradient_clip_algorithm: 'value' means clip_by_value, regex '^norm[1-9]([0-9]{0,45}$)' means clip_by_norm. norm_type: (float or int): type of the used p-norm. Can be ``'inf'`` for infinity norm. 
""" model = self.trainer.get_model() diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 9f5c00b6876b5..0d9b81d55974b 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -198,7 +198,7 @@ def __init__( gradient_clip_val: 0 means don't clip. - gradient_clip_algorithm: 'value' means clip_by_value, regex '^norm[1-9]([0-9]{1,45}$)' means clip_by_norm. + gradient_clip_algorithm: 'value' means clip_by_value, regex '^norm[1-9]([0-9]{0,45}$)' means clip_by_norm. limit_train_batches: How much of training dataset to check (floats = percent, int = num_batches) From b83ea7f8600e9c08772b73353b1aba3687e1e8df Mon Sep 17 00:00:00 2001 From: "dong.hyun" Date: Mon, 18 Jan 2021 13:19:43 +0900 Subject: [PATCH 06/16] revert changelog --- CHANGELOG.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 25c73bad9505c..eb1963d64d954 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -47,9 +47,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added `IoU` class interface ([#4704](https://github.com/PyTorchLightning/pytorch-lightning/pull/4704)) -- Added `gradient_clip_algorithm` argument to Trainer for gradient clipping by value ([#5477](https://github.com/PyTorchLightning/pytorch-lightning/pull/5477)). - - ### Changed - Changed `stat_scores` metric now calculates stat scores over all classes and gains new parameters, in line with the new `StatScores` metric ([#4839](https://github.com/PyTorchLightning/pytorch-lightning/pull/4839)) From 9913351497efeef0fc5903f99d99b738c8ec9edd Mon Sep 17 00:00:00 2001 From: "dong.hyun" Date: Mon, 18 Jan 2021 13:58:20 +0900 Subject: [PATCH 07/16] commit based on review --- CHANGELOG.md | 3 +++ docs/source/training_tricks.rst | 15 ++++------- pytorch_lightning/accelerators/accelerator.py | 25 ++++++------------- .../accelerators/tpu_accelerator.py | 2 -- pytorch_lightning/plugins/apex.py | 6 ++--- pytorch_lightning/plugins/native_amp.py | 2 -- .../plugins/sharded_native_amp_plugin.py | 2 -- .../connectors/training_trick_connector.py | 10 ++------ pytorch_lightning/trainer/trainer.py | 4 +-- 9 files changed, 21 insertions(+), 48 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 35ff8e275ed91..cdf5861580e8c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -54,6 +54,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added missing val/test hooks in `LightningModule` ([#5467](https://github.com/PyTorchLightning/pytorch-lightning/pull/5467)) +- Added `gradient_clip_algorithm` argument to Trainer for gradient clipping by value ([#5477](https://github.com/PyTorchLightning/pytorch-lightning/pull/5477)). + + ### Changed - Changed `stat_scores` metric now calculates stat scores over all classes and gains new parameters, in line with the new `StatScores` metric ([#4839](https://github.com/PyTorchLightning/pytorch-lightning/pull/4839)) diff --git a/docs/source/training_tricks.rst b/docs/source/training_tricks.rst index 11aee5832a7c2..7211dfefc6c88 100644 --- a/docs/source/training_tricks.rst +++ b/docs/source/training_tricks.rst @@ -26,12 +26,10 @@ The effect is a large effective batch size of size KxN. Gradient Clipping ----------------- -Gradient clipping may be enabled to avoid exploding gradients. Also, you can choose various criterion by -`gradient_clip_algorithm` option. 
For example, if `gradient_clip_algorithm == 'value'`, this will `clip the gradient -by value `_ computed over all model parameters. -If `gradient_clip_algorithm == 'norm1'` `clip the gradient -norm `_ with l1 norm computed over -all model parameters together. +Gradient clipping may be enabled to avoid exploding gradients. By default, this will `clip the gradient norm +`_ computed over all model parameters together. +If gradient_clip_algorithm option is set to 'value', which is 'norm' by default, this will +`clip the gradient value `_ for each parameter instead. .. seealso:: :class:`~pytorch_lightning.trainer.trainer.Trainer` @@ -40,12 +38,9 @@ all model parameters together. # DEFAULT (ie: don't clip) trainer = Trainer(gradient_clip_val=0) - # clip gradients with norm-2 above 0.5 + # clip gradients with norm above 0.5 trainer = Trainer(gradient_clip_val=0.5) - # clip gradients with norm-1 above 0.5 - trainer = Trainer(gradient_clip_val=0.5, gradient_clip_algorithm='norm1') - # clip gradients with value above 0.5 trainer = Trainer(gradient_clip_val=0.5, gradient_clip_algorithm='value' ) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 84823b93f094d..666122f5115a3 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -115,29 +115,18 @@ def clip_gradients(self, optimizer, clip_val=None): if grad_clip_val <= 0: return - self._clip_gradients(optimizer=optimizer, - grad_clip_val=grad_clip_val, - gradient_clip_algorithm=self.trainer.gradient_clip_algorithm, - norm_type=self.trainer.gradient_clip_norm_type) - - def _clip_gradients(self, - optimizer: Optimizer, - grad_clip_val: Union[float, int], - gradient_clip_algorithm: str, - norm_type: Union[float, int]): + self._clip_gradients(optimizer, grad_clip_val) + + def _clip_gradients(self, optimizer: Optimizer, grad_clip_val: Union[float, int], norm_type: float = 2.0): + clip_algorithm = self.trainer.gradient_clip_algorithm if self.trainer.amp_backend: - self.trainer.precision_connector.backend.clip_gradients(optimizer=optimizer, - grad_clip_val=grad_clip_val, - gradient_clip_algorithm=gradient_clip_algorithm, - norm_type=norm_type) + self.trainer.precision_connector.backend.clip_gradients(grad_clip_val, clip_algorithm, optimizer, norm_type) else: model = self.trainer.get_model() - if gradient_clip_algorithm == 'value': + if clip_algorithm == 'value': torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=grad_clip_val) - elif gradient_clip_algorithm.startswith('norm'): + elif clip_algorithm == 'norm': torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_val, norm_type=norm_type) - else: - raise ValueError(f'gradient_clip_algorithm [{gradient_clip_algorithm}] is not valid.') def on_train_epoch_end(self, outputs): pass diff --git a/pytorch_lightning/accelerators/tpu_accelerator.py b/pytorch_lightning/accelerators/tpu_accelerator.py index 2cc58fdd2890f..0ef6845f62040 100644 --- a/pytorch_lightning/accelerators/tpu_accelerator.py +++ b/pytorch_lightning/accelerators/tpu_accelerator.py @@ -271,8 +271,6 @@ def _clip_gradients(self, clip_coef = torch.min(clip_coef, torch.ones_like(clip_coef)) for p in parameters: p.grad.data.mul_(clip_coef.to(p.grad.data.device)) - else: - raise ValueError(f'gradient_clip_algorithm [{gradient_clip_algorithm}] is not valid.') def barrier(self, name: Optional[str] = None): torch_xla.core.xla_model.rendezvous(f"pl.Trainer.{name}") diff --git a/pytorch_lightning/plugins/apex.py 
b/pytorch_lightning/plugins/apex.py index 6f7fa19096b14..ec3682733f166 100644 --- a/pytorch_lightning/plugins/apex.py +++ b/pytorch_lightning/plugins/apex.py @@ -110,8 +110,8 @@ def clip_gradients(self, Args: optimizer: Optimizer with gradients that will be clipped. grad_clip_val: Maximum norm of gradients. - gradient_clip_algorithm: 'value' means clip_by_value, regex '^norm[1-9]([0-9]{0,45}$)' means clip_by_norm. - norm_type: (float or int): type of the used p-norm. Can be ``'inf'`` for infinity norm. + gradient_clip_algorithm: 'value' means clip_by_value, 'norm' means clip_by_norm. + norm_type: type of the used p-norm. Can be ``'inf'`` for infinity norm. """ model = self.trainer.get_model() parameters = model.parameters() @@ -134,8 +134,6 @@ def clip_gradients(self, if clip_coef < 1: for p in parameters: p.grad.detach().mul_(clip_coef.to(p.grad.device)) - else: - raise ValueError(f'gradient_clip_algorithm [{gradient_clip_algorithm}] is not valid.') @property def norm_clipping_epsilon(self): diff --git a/pytorch_lightning/plugins/native_amp.py b/pytorch_lightning/plugins/native_amp.py index 4cc85e1add4ed..6cbac417fd400 100644 --- a/pytorch_lightning/plugins/native_amp.py +++ b/pytorch_lightning/plugins/native_amp.py @@ -67,8 +67,6 @@ def clip_gradients(self, torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=grad_clip_val) elif gradient_clip_algorithm.startswith('norm'): torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_val, norm_type=norm_type) - else: - raise ValueError(f'gradient_clip_algorithm [{gradient_clip_algorithm}] is not valid.') @property def scaler(self): diff --git a/pytorch_lightning/plugins/sharded_native_amp_plugin.py b/pytorch_lightning/plugins/sharded_native_amp_plugin.py index ef20d07086290..b2a47e9c91223 100644 --- a/pytorch_lightning/plugins/sharded_native_amp_plugin.py +++ b/pytorch_lightning/plugins/sharded_native_amp_plugin.py @@ -38,5 +38,3 @@ def clip_gradients(self, elif gradient_clip_algorithm.startswith('norm'): optimizer = cast(OSS, optimizer) optimizer.clip_grad_norm(grad_clip_val, norm_type=norm_type) - else: - raise ValueError(f'gradient_clip_algorithm [{gradient_clip_algorithm}] is not valid.') diff --git a/pytorch_lightning/trainer/connectors/training_trick_connector.py b/pytorch_lightning/trainer/connectors/training_trick_connector.py index d4a5b9f3c50ac..326c73df610ac 100644 --- a/pytorch_lightning/trainer/connectors/training_trick_connector.py +++ b/pytorch_lightning/trainer/connectors/training_trick_connector.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import re from pytorch_lightning.callbacks import GradientAccumulationScheduler from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -35,15 +34,10 @@ def on_trainer_init( self.trainer.terminate_on_nan = terminate_on_nan # gradient clipping - regex = '^norm[1-9]([0-9]{0,45}$)' - if gradient_clip_algorithm != 'value' and re.match(regex, gradient_clip_algorithm) is None: - raise MisconfigurationException(f"gradient_clip_algorithm should be value or match with regex {regex}") + if gradient_clip_algorithm not in ['value', 'norm']: + raise MisconfigurationException("gradient_clip_algorithm should be 'value' or 'norm'") self.trainer.gradient_clip_val = gradient_clip_val self.trainer.gradient_clip_algorithm = gradient_clip_algorithm - if gradient_clip_algorithm == 'value': - self.trainer.gradient_clip_norm_type = None - else: - self.trainer.gradient_clip_norm_type = int(gradient_clip_algorithm[4:]) # gradient norm tracking if not isinstance(track_grad_norm, (int, float)) and track_grad_norm != 'inf': diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 3a6fa44ce1213..856ca30efb850 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -88,7 +88,7 @@ def __init__( callbacks: Optional[Union[List[Callback], Callback]] = None, default_root_dir: Optional[str] = None, gradient_clip_val: float = 0, - gradient_clip_algorithm: str = 'norm2', + gradient_clip_algorithm: str = 'norm', process_position: int = 0, num_nodes: int = 1, num_processes: int = 1, @@ -198,7 +198,7 @@ def __init__( gradient_clip_val: 0 means don't clip. - gradient_clip_algorithm: 'value' means clip_by_value, regex '^norm[1-9]([0-9]{0,45}$)' means clip_by_norm. + gradient_clip_algorithm: 'value' means clip_by_value, 'norm' means clip_by_norm. 
Default: 'norm'.

         limit_train_batches: How much of training dataset to check (floats = percent, int = num_batches)

From 4c8e46b1933d648c172d250212d8fbda6702dc17 Mon Sep 17 00:00:00 2001
From: "dong.hyun" 
Date: Wed, 20 Jan 2021 10:52:38 +0900
Subject: [PATCH 08/16] Add Enum Type

---
 pytorch_lightning/accelerators/accelerator.py          |  5 +++--
 pytorch_lightning/accelerators/tpu_accelerator.py      |  5 +++--
 pytorch_lightning/plugins/apex.py                      |  5 +++--
 pytorch_lightning/plugins/native_amp.py                |  5 +++--
 pytorch_lightning/plugins/sharded_native_amp_plugin.py | 10 +++++++---
 .../trainer/connectors/training_trick_connector.py     |  6 ++++--
 pytorch_lightning/utilities/__init__.py                |  8 +++++++-
 pytorch_lightning/utilities/enums.py                   | 10 ++++++++++
 8 files changed, 40 insertions(+), 14 deletions(-)

diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py
index 666122f5115a3..3c656792a4a61 100644
--- a/pytorch_lightning/accelerators/accelerator.py
+++ b/pytorch_lightning/accelerators/accelerator.py
@@ -21,6 +21,7 @@
 from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.plugins.ddp_plugin import DDPPlugin
 from pytorch_lightning.plugins.rpc_plugin import RPCPlugin
+from pytorch_lightning.utilities import GradClipAlgorithmType
 from pytorch_lightning.utilities.apply_func import move_data_to_device
 from pytorch_lightning.utilities.parsing import AttributeDict
 
@@ -123,9 +124,9 @@ def _clip_gradients(self, optimizer: Optimizer, grad_clip_val: Union[float, int]
             self.trainer.precision_connector.backend.clip_gradients(grad_clip_val, clip_algorithm, optimizer, norm_type)
         else:
             model = self.trainer.get_model()
-            if clip_algorithm == 'value':
+            if clip_algorithm == GradClipAlgorithmType.VALUE:
                 torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=grad_clip_val)
-            elif clip_algorithm == 'norm':
+            elif clip_algorithm == GradClipAlgorithmType.NORM:
                 torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_val, norm_type=norm_type)
 
     def on_train_epoch_end(self, outputs):
         pass
diff --git a/pytorch_lightning/accelerators/tpu_accelerator.py b/pytorch_lightning/accelerators/tpu_accelerator.py
index 0ef6845f62040..88068f40058b9 100644
--- a/pytorch_lightning/accelerators/tpu_accelerator.py
+++ b/pytorch_lightning/accelerators/tpu_accelerator.py
@@ -25,6 +25,7 @@
 from pytorch_lightning.cluster_environments import ClusterEnvironment
 from pytorch_lightning.core import LightningModule
 from pytorch_lightning.utilities import (
+    GradClipAlgorithmType,
     _TPU_AVAILABLE,
     move_data_to_device,
     rank_zero_info,
@@ -253,9 +254,9 @@ def _clip_gradients(self,
         # with TPU support based on https://github.com/pytorch/xla/blob/master/TROUBLESHOOTING.md
         model = self.trainer.get_model()
         parameters = model.parameters()
-        if gradient_clip_algorithm == 'value':
+        if gradient_clip_algorithm == GradClipAlgorithmType.VALUE:
             torch.nn.utils.clip_grad_value_(parameters, clip_value=grad_clip_val)
-        elif gradient_clip_algorithm.startswith('norm'):
+        elif gradient_clip_algorithm == GradClipAlgorithmType.NORM:
             max_norm = grad_clip_val
             if isinstance(parameters, torch.Tensor):
                 parameters = [parameters]
diff --git a/pytorch_lightning/plugins/apex.py b/pytorch_lightning/plugins/apex.py
index ec3682733f166..09c235fde204c 100644
--- a/pytorch_lightning/plugins/apex.py
+++ b/pytorch_lightning/plugins/apex.py
@@ -20,6 +20,7 @@
 from pytorch_lightning.plugins.precision_plugin import PrecisionPlugin
 from pytorch_lightning.utilities import _APEX_AVAILABLE, AMPType
 from 
pytorch_lightning.utilities.distributed import rank_zero_warn +from pytorch_lightning.utilities.enums import GradClipAlgorithmType if _APEX_AVAILABLE: from apex import amp @@ -116,9 +117,9 @@ def clip_gradients(self, model = self.trainer.get_model() parameters = model.parameters() - if gradient_clip_algorithm == 'value': + if gradient_clip_algorithm == GradClipAlgorithmType.VALUE: torch.nn.utils.clip_grad_value_(parameters, clip_value=grad_clip_val) - elif gradient_clip_algorithm.startswith('norm'): + if gradient_clip_algorithm == GradClipAlgorithmType.NORM: max_norm = float(grad_clip_val) if isinstance(parameters, torch.Tensor): diff --git a/pytorch_lightning/plugins/native_amp.py b/pytorch_lightning/plugins/native_amp.py index 6cbac417fd400..df363d8329611 100644 --- a/pytorch_lightning/plugins/native_amp.py +++ b/pytorch_lightning/plugins/native_amp.py @@ -17,6 +17,7 @@ from torch.optim import Optimizer from pytorch_lightning.plugins.precision_plugin import PrecisionPlugin +from pytorch_lightning.utilities import GradClipAlgorithmType class NativeAMPPlugin(PrecisionPlugin): @@ -63,9 +64,9 @@ def clip_gradients(self, norm_type: Union[float, int]): model = self.trainer.get_model() torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_val, norm_type=norm_type) - if gradient_clip_algorithm == 'value': + if gradient_clip_algorithm == GradClipAlgorithmType.VALUE: torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=grad_clip_val) - elif gradient_clip_algorithm.startswith('norm'): + elif gradient_clip_algorithm == GradClipAlgorithmType.NORM: torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_val, norm_type=norm_type) @property diff --git a/pytorch_lightning/plugins/sharded_native_amp_plugin.py b/pytorch_lightning/plugins/sharded_native_amp_plugin.py index b2a47e9c91223..25b99ab9363c1 100644 --- a/pytorch_lightning/plugins/sharded_native_amp_plugin.py +++ b/pytorch_lightning/plugins/sharded_native_amp_plugin.py @@ -16,7 +16,11 @@ from torch.optim import Optimizer from pytorch_lightning.plugins.native_amp import NativeAMPPlugin -from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE, _NATIVE_AMP_AVAILABLE +from pytorch_lightning.utilities import ( + _FAIRSCALE_AVAILABLE, + _NATIVE_AMP_AVAILABLE, + GradClipAlgorithmType, +) if _NATIVE_AMP_AVAILABLE and _FAIRSCALE_AVAILABLE: from fairscale.optim import OSS @@ -33,8 +37,8 @@ def clip_gradients(self, grad_clip_val: Union[int, float], gradient_clip_algorithm: str, norm_type: Union[float, int]): - if gradient_clip_algorithm == 'value': + if gradient_clip_algorithm == GradClipAlgorithmType.VALUE: raise NotImplementedError("Value grad clipping with sharded ddp is not implemented yet") - elif gradient_clip_algorithm.startswith('norm'): + elif gradient_clip_algorithm == GradClipAlgorithmType.NORM: optimizer = cast(OSS, optimizer) optimizer.clip_grad_norm(grad_clip_val, norm_type=norm_type) diff --git a/pytorch_lightning/trainer/connectors/training_trick_connector.py b/pytorch_lightning/trainer/connectors/training_trick_connector.py index 9bab15226e2e1..a33c929d8fe49 100644 --- a/pytorch_lightning/trainer/connectors/training_trick_connector.py +++ b/pytorch_lightning/trainer/connectors/training_trick_connector.py @@ -14,6 +14,7 @@ from pytorch_lightning.callbacks import GradientAccumulationScheduler +from pytorch_lightning.utilities import GradClipAlgorithmType from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -35,8 +36,9 @@ def on_trainer_init( self.trainer.terminate_on_nan = 
terminate_on_nan # gradient clipping - if gradient_clip_algorithm not in ['value', 'norm']: - raise MisconfigurationException("gradient_clip_algorithm should be 'value' or 'norm'") + if gradient_clip_algorithm not in [GradClipAlgorithmType.VALUE, GradClipAlgorithmType.NORM]: + raise MisconfigurationException(f"gradient_clip_algorithm should be " + f"'{GradClipAlgorithmType.VALUE}' or '{GradClipAlgorithmType.NORM}'") self.trainer.gradient_clip_val = gradient_clip_val self.trainer.gradient_clip_algorithm = gradient_clip_algorithm diff --git a/pytorch_lightning/utilities/__init__.py b/pytorch_lightning/utilities/__init__.py index 0a5ed04eb72a3..d197dc0ddb593 100644 --- a/pytorch_lightning/utilities/__init__.py +++ b/pytorch_lightning/utilities/__init__.py @@ -22,7 +22,13 @@ rank_zero_only, rank_zero_warn, ) -from pytorch_lightning.utilities.enums import AMPType, DeviceType, DistributedType, LightningEnum # noqa: F401 +from pytorch_lightning.utilities.enums import ( # noqa: F401 + AMPType, + DeviceType, + DistributedType, + GradClipAlgorithmType, + LightningEnum, +) from pytorch_lightning.utilities.imports import ( # noqa: F401 _APEX_AVAILABLE, _BOLTS_AVAILABLE, diff --git a/pytorch_lightning/utilities/enums.py b/pytorch_lightning/utilities/enums.py index f6c0bf1d6cc54..75ce4ab069650 100644 --- a/pytorch_lightning/utilities/enums.py +++ b/pytorch_lightning/utilities/enums.py @@ -80,3 +80,13 @@ class DeviceType(LightningEnum): CPU = 'CPU' GPU = 'GPU' TPU = 'TPU' + + +class GradClipAlgorithmType(LightningEnum): + """ Define gradient_clip_algorithm types - training-tricks. + + >>> GradClipAlgorithmType.VALUE in ('value', 'norm') + True + """ + VALUE = 'value' + NORM = 'norm' From e2484aeb41567d5aba38b25ce1153ef4d41a8df5 Mon Sep 17 00:00:00 2001 From: "dong.hyun" Date: Wed, 20 Jan 2021 11:05:06 +0900 Subject: [PATCH 09/16] edit CHANGELOG.md to prevent conflicts --- CHANGELOG.md | 4 ++-- pytorch_lightning/accelerators/tpu_accelerator.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2201fdca61c26..6b0d69c20fecf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -54,10 +54,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added missing val/test hooks in `LightningModule` ([#5467](https://github.com/PyTorchLightning/pytorch-lightning/pull/5467)) -- `Recall` and `Precision` metrics (and their functional counterparts `recall` and `precision`) can now be generalized to Recall@K and Precision@K with the use of `top_k` parameter ([#4842](https://github.com/PyTorchLightning/pytorch-lightning/pull/4842)) +- Added `gradient_clip_algorithm` argument to Trainer for gradient clipping by value ([#5477](https://github.com/PyTorchLightning/pytorch-lightning/pull/5477)). -- Added `gradient_clip_algorithm` argument to Trainer for gradient clipping by value ([#5477](https://github.com/PyTorchLightning/pytorch-lightning/pull/5477)). 
+- `Recall` and `Precision` metrics (and their functional counterparts `recall` and `precision`) can now be generalized to Recall@K and Precision@K with the use of `top_k` parameter ([#4842](https://github.com/PyTorchLightning/pytorch-lightning/pull/4842)) ### Changed diff --git a/pytorch_lightning/accelerators/tpu_accelerator.py b/pytorch_lightning/accelerators/tpu_accelerator.py index 88068f40058b9..cb4dd381aaf2a 100644 --- a/pytorch_lightning/accelerators/tpu_accelerator.py +++ b/pytorch_lightning/accelerators/tpu_accelerator.py @@ -25,12 +25,12 @@ from pytorch_lightning.cluster_environments import ClusterEnvironment from pytorch_lightning.core import LightningModule from pytorch_lightning.utilities import ( - GradClipAlgorithmType, _TPU_AVAILABLE, move_data_to_device, rank_zero_info, rank_zero_only, rank_zero_warn, + GradClipAlgorithmType, ) from pytorch_lightning.utilities.cloud_io import atomic_save from pytorch_lightning.utilities.exceptions import MisconfigurationException From 393b77b15114749f2f2143a25bcdf3ff13ad91c8 Mon Sep 17 00:00:00 2001 From: "dong.hyun" Date: Wed, 20 Jan 2021 12:40:22 +0900 Subject: [PATCH 10/16] add test codes --- docs/source/training_tricks.rst | 2 +- pytorch_lightning/accelerators/accelerator.py | 2 +- .../accelerators/tpu_accelerator.py | 6 +- pytorch_lightning/plugins/apex.py | 7 +- pytorch_lightning/plugins/native_amp.py | 4 +- pytorch_lightning/plugins/precision_plugin.py | 4 +- .../plugins/sharded_native_amp_plugin.py | 8 +- pytorch_lightning/trainer/trainer.py | 2 +- tests/models/test_horovod.py | 16 ++++ tests/models/test_tpu.py | 19 +++++ tests/trainer/test_trainer.py | 78 +++++++++++++++++++ 11 files changed, 131 insertions(+), 17 deletions(-) diff --git a/docs/source/training_tricks.rst b/docs/source/training_tricks.rst index 7211dfefc6c88..faffe86d9ba2a 100644 --- a/docs/source/training_tricks.rst +++ b/docs/source/training_tricks.rst @@ -42,7 +42,7 @@ If gradient_clip_algorithm option is set to 'value', which is 'norm' by default, trainer = Trainer(gradient_clip_val=0.5) # clip gradients with value above 0.5 - trainer = Trainer(gradient_clip_val=0.5, gradient_clip_algorithm='value' ) + trainer = Trainer(gradient_clip_val=0.5, gradient_clip_algorithm='value') ---------- diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 3c656792a4a61..5c3839326182c 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -118,7 +118,7 @@ def clip_gradients(self, optimizer, clip_val=None): return self._clip_gradients(optimizer, grad_clip_val) - def _clip_gradients(self, optimizer: Optimizer, grad_clip_val: Union[float, int], norm_type: float = 2.0): + def _clip_gradients(self, optimizer: Optimizer, grad_clip_val: float, norm_type: float = 2.0): clip_algorithm = self.trainer.gradient_clip_algorithm if self.trainer.amp_backend: self.trainer.precision_connector.backend.clip_gradients(grad_clip_val, clip_algorithm, optimizer, norm_type) diff --git a/pytorch_lightning/accelerators/tpu_accelerator.py b/pytorch_lightning/accelerators/tpu_accelerator.py index cb4dd381aaf2a..fecf2209d985d 100644 --- a/pytorch_lightning/accelerators/tpu_accelerator.py +++ b/pytorch_lightning/accelerators/tpu_accelerator.py @@ -26,11 +26,11 @@ from pytorch_lightning.core import LightningModule from pytorch_lightning.utilities import ( _TPU_AVAILABLE, + GradClipAlgorithmType, move_data_to_device, rank_zero_info, rank_zero_only, rank_zero_warn, - GradClipAlgorithmType, ) 
from pytorch_lightning.utilities.cloud_io import atomic_save from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -247,9 +247,9 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): def _clip_gradients(self, optimizer: Optimizer, - grad_clip_val: Union[float, int], + grad_clip_val: float, gradient_clip_algorithm: str, - norm_type: Union[float, int]): + norm_type: float): # this code contains a modification of torch.nn.utils.clip_grad_norm_ # with TPU support based on https://github.com/pytorch/xla/blob/master/TROUBLESHOOTING.md model = self.trainer.get_model() diff --git a/pytorch_lightning/plugins/apex.py b/pytorch_lightning/plugins/apex.py index 09c235fde204c..9f1c65afd4bc5 100644 --- a/pytorch_lightning/plugins/apex.py +++ b/pytorch_lightning/plugins/apex.py @@ -18,9 +18,8 @@ from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.plugins.precision_plugin import PrecisionPlugin -from pytorch_lightning.utilities import _APEX_AVAILABLE, AMPType +from pytorch_lightning.utilities import _APEX_AVAILABLE, AMPType, GradClipAlgorithmType from pytorch_lightning.utilities.distributed import rank_zero_warn -from pytorch_lightning.utilities.enums import GradClipAlgorithmType if _APEX_AVAILABLE: from apex import amp @@ -102,9 +101,9 @@ def configure_apex(self, amp, model, optimizers, amp_level): def clip_gradients(self, optimizer: Optimizer, - grad_clip_val: Union[int, float], + grad_clip_val: float, gradient_clip_algorithm: str, - norm_type: Union[float, int]): + norm_type: float): """ This code contains a modification of :meth:`torch.nn.utils.clip_grad_norm_` using a higher epsilon for fp16 weights. This is important when setting amp_level to O2, and the master weights are in fp16. diff --git a/pytorch_lightning/plugins/native_amp.py b/pytorch_lightning/plugins/native_amp.py index df363d8329611..1fa8860bfcbc1 100644 --- a/pytorch_lightning/plugins/native_amp.py +++ b/pytorch_lightning/plugins/native_amp.py @@ -59,9 +59,9 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): def clip_gradients(self, optimizer: Optimizer, - grad_clip_val: Union[int, float], + grad_clip_val: float, gradient_clip_algorithm: str, - norm_type: Union[float, int]): + norm_type: float): model = self.trainer.get_model() torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_val, norm_type=norm_type) if gradient_clip_algorithm == GradClipAlgorithmType.VALUE: diff --git a/pytorch_lightning/plugins/precision_plugin.py b/pytorch_lightning/plugins/precision_plugin.py index 2d64f48855150..cd04ba199b815 100644 --- a/pytorch_lightning/plugins/precision_plugin.py +++ b/pytorch_lightning/plugins/precision_plugin.py @@ -37,7 +37,7 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): def clip_gradients(self, optimizer: Optimizer, - grad_clip_val: Union[int, float], + grad_clip_val: float, gradient_clip_algorithm: str, - norm_type: Union[float, int]): + norm_type: float): raise NotImplementedError diff --git a/pytorch_lightning/plugins/sharded_native_amp_plugin.py b/pytorch_lightning/plugins/sharded_native_amp_plugin.py index 25b99ab9363c1..dbeb0a1f8efd7 100644 --- a/pytorch_lightning/plugins/sharded_native_amp_plugin.py +++ b/pytorch_lightning/plugins/sharded_native_amp_plugin.py @@ -34,11 +34,13 @@ def scaler(self): def clip_gradients(self, optimizer: Optimizer, - grad_clip_val: Union[int, float], + grad_clip_val: float, gradient_clip_algorithm: str, - norm_type: Union[float, int]): + norm_type: float): + 
+ optimizer = cast(OSS, optimizer) if gradient_clip_algorithm == GradClipAlgorithmType.VALUE: raise NotImplementedError("Value grad clipping with sharded ddp is not implemented yet") + # optimizer.clip_grad_value(grad_clip_val) elif gradient_clip_algorithm == GradClipAlgorithmType.NORM: - optimizer = cast(OSS, optimizer) optimizer.clip_grad_norm(grad_clip_val, norm_type=norm_type) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 498145c4c31c3..57e38865ec8b8 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -87,7 +87,7 @@ def __init__( checkpoint_callback: bool = True, callbacks: Optional[Union[List[Callback], Callback]] = None, default_root_dir: Optional[str] = None, - gradient_clip_val: float = 0, + gradient_clip_val: Union[int, float] = 0, gradient_clip_algorithm: str = 'norm', process_position: int = 0, num_nodes: int = 1, diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index 7ac7cd235f392..9edfa6ed96484 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -17,6 +17,7 @@ import shlex import subprocess import sys +from copy import deepcopy import numpy as np import pytest @@ -85,6 +86,9 @@ def test_horovod_cpu(enable_pl_optimizer, tmpdir): enable_pl_optimizer=enable_pl_optimizer, ) _run_horovod(trainer_options) + trainer_options_clip_grad_val = deepcopy(trainer_options) + trainer_options_clip_grad_val.update({'gradient_clip_algorithm': 'value'}) + _run_horovod(trainer_options_clip_grad_val, on_gpu=True) @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") @@ -103,6 +107,9 @@ def test_horovod_cpu_implicit(enable_pl_optimizer, tmpdir): enable_pl_optimizer=enable_pl_optimizer, ) _run_horovod(trainer_options) + trainer_options_clip_grad_val = deepcopy(trainer_options) + trainer_options_clip_grad_val.update({'gradient_clip_algorithm': 'value'}) + _run_horovod(trainer_options_clip_grad_val, on_gpu=True) @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") @@ -123,6 +130,9 @@ def test_horovod_multi_gpu(tmpdir): accelerator='horovod', ) _run_horovod(trainer_options, on_gpu=True) + trainer_options_clip_grad_val = deepcopy(trainer_options) + trainer_options_clip_grad_val.update({'gradient_clip_algorithm': 'value'}) + _run_horovod(trainer_options_clip_grad_val, on_gpu=True) @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") @@ -146,6 +156,9 @@ def test_horovod_apex(tmpdir): precision=16, ) _run_horovod(trainer_options, on_gpu=True) + trainer_options_clip_grad_val = deepcopy(trainer_options) + trainer_options_clip_grad_val.update({'gradient_clip_algorithm': 'value'}) + _run_horovod(trainer_options_clip_grad_val, on_gpu=True) @pytest.mark.skip(reason="Skip till Horovod fixes integration with Native torch.cuda.amp") @@ -170,6 +183,9 @@ def test_horovod_amp(tmpdir): precision=16, ) _run_horovod(trainer_options, on_gpu=True) + trainer_options_clip_grad_val = deepcopy(trainer_options) + trainer_options_clip_grad_val.update({'gradient_clip_algorithm': 'value'}) + _run_horovod(trainer_options_clip_grad_val, on_gpu=True) @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 5e977eed765d0..61fcadf7b28da 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -197,6 +197,25 @@ def 
test_tpu_grad_norm(tmpdir): tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) +@pytest.mark.skipif(not _TPU_AVAILABLE, reason="test requires TPU machine") +@pl_multi_process_test +def test_tpu_clip_grad_by_value(tmpdir): + """Test if clip_gradients by value works on TPU.""" + trainer_options = dict( + default_root_dir=tmpdir, + progress_bar_refresh_rate=0, + max_epochs=1, + tpu_cores=1, + limit_train_batches=0.4, + limit_val_batches=0.4, + gradient_clip_val=0.1, + gradient_clip_algorithm='value' + ) + + model = EvalModelTemplate() + tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) + + @pytest.mark.skipif(not _TPU_AVAILABLE, reason="test requires TPU machine") @pl_multi_process_test def test_dataloaders_passed_to_fit(tmpdir): diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 97785d9e61a86..f66a3c119e256 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -962,6 +962,44 @@ def training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hidde trainer.fit(model) +def test_gradient_clipping_by_value(tmpdir): + """ + Test gradient clipping by value + """ + tutils.reset_seed() + + model = EvalModelTemplate() + + grad_clip_val = 0.001 + trainer = Trainer( + max_steps=10, + max_epochs=1, + gradient_clip_val=grad_clip_val, + gradient_clip_algorithm='value', + default_root_dir=tmpdir, + ) + + trainer.train_loop.old_training_step_and_backward = trainer.train_loop.training_step_and_backward + + def training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens): + """ + wrap the forward step in a closure so second order methods work + """ + # test that gradient is clipped correctly + ret_val = trainer.train_loop.old_training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens) + parameters = model.parameters() + grad_max = torch.max(torch.stack([p.grad.detach() for p in parameters]).abs()) + assert grad_max.item() <= grad_clip_val + + return ret_val + + trainer.train_loop.training_step_and_backward = training_step_and_backward + # for the test + model.prev_called_batch_idx = 0 + + trainer.fit(model) + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") @pytest.mark.skipif(not _NATIVE_AMP_AVAILABLE, reason="test requires native AMP.") def test_gradient_clipping_fp16(tmpdir): @@ -1001,6 +1039,46 @@ def training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hidde trainer.fit(model) +@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") +@pytest.mark.skipif(not _NATIVE_AMP_AVAILABLE, reason="test requires native AMP.") +def test_gradient_clipping_by_value_fp16(tmpdir): + """ + Test gradient clipping by value with fp16 + """ + tutils.reset_seed() + + model = EvalModelTemplate() + grad_clip_val = 0.001 + trainer = Trainer( + max_steps=10, + max_epochs=1, + precision=16, + gpus=1, + gradient_clip_val=grad_clip_val, + gradient_clip_algorithm='value', + default_root_dir=tmpdir, + ) + + trainer.train_loop.old_training_step_and_backward = trainer.train_loop.training_step_and_backward + + def training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens): + """ + wrap the forward step in a closure so second order methods work + """ + # test that gradient is clipped correctly + ret_val = trainer.train_loop.old_training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens) + parameters = model.parameters() + grad_max = 
torch.max(torch.stack([p.grad.detach() for p in parameters])) + assert grad_max.item() <= grad_clip_val + + return ret_val + + trainer.train_loop.training_step_and_backward = training_step_and_backward + model.prev_called_batch_idx = 0 + + trainer.fit(model) + + def test_gpu_choice(tmpdir): trainer_options = dict( default_root_dir=tmpdir, From 2403f96b773a05c050e8275e5bb2f360a9812194 Mon Sep 17 00:00:00 2001 From: "dong.hyun" Date: Wed, 20 Jan 2021 14:08:44 +0900 Subject: [PATCH 11/16] pep8 formatting --- pytorch_lightning/plugins/apex.py | 2 +- pytorch_lightning/plugins/native_amp.py | 1 - pytorch_lightning/plugins/precision_plugin.py | 1 - pytorch_lightning/plugins/sharded_native_amp_plugin.py | 2 +- 4 files changed, 2 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/plugins/apex.py b/pytorch_lightning/plugins/apex.py index 9f1c65afd4bc5..8d3d065dac0dd 100644 --- a/pytorch_lightning/plugins/apex.py +++ b/pytorch_lightning/plugins/apex.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Tuple, Union +from typing import List, Tuple import torch from torch.optim.optimizer import Optimizer diff --git a/pytorch_lightning/plugins/native_amp.py b/pytorch_lightning/plugins/native_amp.py index 1fa8860bfcbc1..41fe80d6199d0 100644 --- a/pytorch_lightning/plugins/native_amp.py +++ b/pytorch_lightning/plugins/native_amp.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Union import torch from torch.optim import Optimizer diff --git a/pytorch_lightning/plugins/precision_plugin.py b/pytorch_lightning/plugins/precision_plugin.py index cd04ba199b815..93cdaa889b1d1 100644 --- a/pytorch_lightning/plugins/precision_plugin.py +++ b/pytorch_lightning/plugins/precision_plugin.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Union from torch.optim import Optimizer diff --git a/pytorch_lightning/plugins/sharded_native_amp_plugin.py b/pytorch_lightning/plugins/sharded_native_amp_plugin.py index dbeb0a1f8efd7..78d6c88cebea7 100644 --- a/pytorch_lightning/plugins/sharded_native_amp_plugin.py +++ b/pytorch_lightning/plugins/sharded_native_amp_plugin.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Union, cast +from typing import cast from torch.optim import Optimizer From 79d41491edbc6ef62d1142bcc52525b91ad18f23 Mon Sep 17 00:00:00 2001 From: "dong.hyun" Date: Wed, 20 Jan 2021 14:16:39 +0900 Subject: [PATCH 12/16] update test codes --- tests/models/test_horovod.py | 14 ++++++++++++-- tests/trainer/test_trainer.py | 3 ++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index 9edfa6ed96484..945bf94489e6e 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -86,9 +86,11 @@ def test_horovod_cpu(enable_pl_optimizer, tmpdir): enable_pl_optimizer=enable_pl_optimizer, ) _run_horovod(trainer_options) + + # clip_grad_by_value test trainer_options_clip_grad_val = deepcopy(trainer_options) trainer_options_clip_grad_val.update({'gradient_clip_algorithm': 'value'}) - _run_horovod(trainer_options_clip_grad_val, on_gpu=True) + _run_horovod(trainer_options_clip_grad_val) @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") @@ -107,9 +109,11 @@ def test_horovod_cpu_implicit(enable_pl_optimizer, tmpdir): enable_pl_optimizer=enable_pl_optimizer, ) _run_horovod(trainer_options) + + # clip_grad_by_value test trainer_options_clip_grad_val = deepcopy(trainer_options) trainer_options_clip_grad_val.update({'gradient_clip_algorithm': 'value'}) - _run_horovod(trainer_options_clip_grad_val, on_gpu=True) + _run_horovod(trainer_options_clip_grad_val) @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") @@ -130,6 +134,8 @@ def test_horovod_multi_gpu(tmpdir): accelerator='horovod', ) _run_horovod(trainer_options, on_gpu=True) + + # clip_grad_by_value test trainer_options_clip_grad_val = deepcopy(trainer_options) trainer_options_clip_grad_val.update({'gradient_clip_algorithm': 'value'}) _run_horovod(trainer_options_clip_grad_val, on_gpu=True) @@ -156,6 +162,8 @@ def test_horovod_apex(tmpdir): precision=16, ) _run_horovod(trainer_options, on_gpu=True) + + # clip_grad_by_value test trainer_options_clip_grad_val = deepcopy(trainer_options) trainer_options_clip_grad_val.update({'gradient_clip_algorithm': 'value'}) _run_horovod(trainer_options_clip_grad_val, on_gpu=True) @@ -183,6 +191,8 @@ def test_horovod_amp(tmpdir): precision=16, ) _run_horovod(trainer_options, on_gpu=True) + + # clip_grad_by_value test trainer_options_clip_grad_val = deepcopy(trainer_options) trainer_options_clip_grad_val.update({'gradient_clip_algorithm': 'value'}) _run_horovod(trainer_options_clip_grad_val, on_gpu=True) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index f66a3c119e256..f48e55f72520f 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -988,7 +988,8 @@ def training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hidde # test that gradient is clipped correctly ret_val = trainer.train_loop.old_training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens) parameters = model.parameters() - grad_max = torch.max(torch.stack([p.grad.detach() for p in parameters]).abs()) + grad_max_list = [torch.max(p.grad.detach().abs()) for p in parameters] + grad_max = torch.max(torch.stack(grad_max_list)) assert grad_max.item() <= grad_clip_val return ret_val From 9fc6f62893a6b3d468abadb1c9c6f28041413854 Mon Sep 17 00:00:00 2001 From: "dong.hyun" Date: Wed, 20 Jan 2021 15:33:35 +0900 Subject: [PATCH 13/16] update test codes --- 
tests/trainer/test_trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index f48e55f72520f..b1bfeada38919 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -990,7 +990,7 @@ def training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hidde parameters = model.parameters() grad_max_list = [torch.max(p.grad.detach().abs()) for p in parameters] grad_max = torch.max(torch.stack(grad_max_list)) - assert grad_max.item() <= grad_clip_val + assert grad_max.item() <= grad_clip_val, f"Gradient max value {grad_max} > grad_clip_val {grad_clip_val} ." return ret_val @@ -1070,7 +1070,7 @@ def training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hidde ret_val = trainer.train_loop.old_training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens) parameters = model.parameters() grad_max = torch.max(torch.stack([p.grad.detach() for p in parameters])) - assert grad_max.item() <= grad_clip_val + assert grad_max.item() <= grad_clip_val, f"Gradient max value {grad_max} > grad_clip_val {grad_clip_val} ." return ret_val From 853fb095a0fd69d0c5b392e138b18902205c8320 Mon Sep 17 00:00:00 2001 From: "dong.hyun" Date: Wed, 20 Jan 2021 15:57:04 +0900 Subject: [PATCH 14/16] update test codes --- tests/trainer/test_trainer.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index b1bfeada38919..44f5467ffb08e 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -970,7 +970,7 @@ def test_gradient_clipping_by_value(tmpdir): model = EvalModelTemplate() - grad_clip_val = 0.001 + grad_clip_val = 0.0001 trainer = Trainer( max_steps=10, max_epochs=1, @@ -990,7 +990,8 @@ def training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hidde parameters = model.parameters() grad_max_list = [torch.max(p.grad.detach().abs()) for p in parameters] grad_max = torch.max(torch.stack(grad_max_list)) - assert grad_max.item() <= grad_clip_val, f"Gradient max value {grad_max} > grad_clip_val {grad_clip_val} ." + assert round(grad_max.item(), 6) <= grad_clip_val, \ + f"Gradient max value {grad_max} > grad_clip_val {grad_clip_val} ." return ret_val @@ -1049,7 +1050,7 @@ def test_gradient_clipping_by_value_fp16(tmpdir): tutils.reset_seed() model = EvalModelTemplate() - grad_clip_val = 0.001 + grad_clip_val = 0.0001 trainer = Trainer( max_steps=10, max_epochs=1, @@ -1070,7 +1071,8 @@ def training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hidde ret_val = trainer.train_loop.old_training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens) parameters = model.parameters() grad_max = torch.max(torch.stack([p.grad.detach() for p in parameters])) - assert grad_max.item() <= grad_clip_val, f"Gradient max value {grad_max} > grad_clip_val {grad_clip_val} ." + assert round(grad_max.item(), 6) <= grad_clip_val, \ + f"Gradient max value {grad_max} > grad_clip_val {grad_clip_val} ." 
return ret_val From 257bcb6974f09d0e3d50c24898e6c50fa32b6d5e Mon Sep 17 00:00:00 2001 From: "dong.hyun" Date: Fri, 22 Jan 2021 12:10:40 +0900 Subject: [PATCH 15/16] add value clipping for sharded ddp --- benchmarks/test_sharded_parity.py | 31 +++++++++++++++++++ .../plugins/sharded_native_amp_plugin.py | 7 +++-- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/benchmarks/test_sharded_parity.py b/benchmarks/test_sharded_parity.py index 05fde8e11523a..67d00a1637044 100644 --- a/benchmarks/test_sharded_parity.py +++ b/benchmarks/test_sharded_parity.py @@ -162,6 +162,31 @@ def test_ddp_sharded_plugin_correctness_multi_gpu_multi_optim_manual(tmpdir): ) +@pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +@pytest.mark.skipif(not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', + reason="test should be run outside of pytest") +@DDPLauncher.run("--accelerator ddp --gpus 2 --precision 16") +def test_ddp_sharded_plugin_clip_gradients(tmpdir, args=None): + plugin_parity_test( + gpus=args.gpus, + precision=args.precision, + accelerator=args.accelerator, + plugin=DDPShardedPlugin(), + model_cls=SeedTrainLoaderModel, + gradient_clip_val=0.001, + ) + plugin_parity_test( + gpus=args.gpus, + precision=args.precision, + accelerator=args.accelerator, + plugin=DDPShardedPlugin(), + model_cls=SeedTrainLoaderModel, + gradient_clip_val=0.001, + gradient_clip_algorithm='value', + ) + + class SeedTrainLoaderModel(BoringModel): """ Overrides training loader to ensure we enforce the same seed for all DDP processes. @@ -261,6 +286,8 @@ def plugin_parity_test( gpus: int = 0, precision: int = 32, max_percent_speed_diff: float = 0.1, + gradient_clip_val: Union[int, float] = 0, + gradient_clip_algorithm: str = 'norm', ): """ Ensures that the trained model is identical to the standard DDP implementation. @@ -274,6 +301,8 @@ def plugin_parity_test( gpus: Number of GPUS to enable. precision: Whether to use AMP or normal FP32 training. max_percent_speed_diff: The maximum speed difference compared to normal DDP training. + gradient_clip_val: 0 means don't clip. + gradient_clip_algorithm: 'value' means clip_by_value, 'norm' means clip_by_norm. defualt 'norm' This is more a safety net for variability in CI which can vary in speed, not for benchmarking. """ @@ -308,6 +337,8 @@ def plugin_parity_test( precision=precision, accelerator=accelerator, plugins=[plugin], + gradient_clip_val=gradient_clip_val, + gradient_clip_algorithm=gradient_clip_algorithm, ) max_memory_custom, custom_model_time = record_ddp_fit_model_stats( diff --git a/pytorch_lightning/plugins/sharded_native_amp_plugin.py b/pytorch_lightning/plugins/sharded_native_amp_plugin.py index 78d6c88cebea7..8396421b442e1 100644 --- a/pytorch_lightning/plugins/sharded_native_amp_plugin.py +++ b/pytorch_lightning/plugins/sharded_native_amp_plugin.py @@ -13,6 +13,7 @@ # limitations under the License. 
from typing import cast +import torch from torch.optim import Optimizer from pytorch_lightning.plugins.native_amp import NativeAMPPlugin @@ -38,9 +39,9 @@ def clip_gradients(self, gradient_clip_algorithm: str, norm_type: float): - optimizer = cast(OSS, optimizer) if gradient_clip_algorithm == GradClipAlgorithmType.VALUE: - raise NotImplementedError("Value grad clipping with sharded ddp is not implemented yet") - # optimizer.clip_grad_value(grad_clip_val) + model = self.trainer.get_model() + torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=grad_clip_val) elif gradient_clip_algorithm == GradClipAlgorithmType.NORM: + optimizer = cast(OSS, optimizer) optimizer.clip_grad_norm(grad_clip_val, norm_type=norm_type) From dcf9ff0b2bd30d13b8221d22430b5768b59b6b2d Mon Sep 17 00:00:00 2001 From: Anthony Kim Date: Sat, 30 Jan 2021 01:27:18 +0900 Subject: [PATCH 16/16] remove bad line in native_amp.py bugfix --- pytorch_lightning/plugins/legacy/native_amp.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pytorch_lightning/plugins/legacy/native_amp.py b/pytorch_lightning/plugins/legacy/native_amp.py index b08135fa264c1..941042d9bc4ad 100644 --- a/pytorch_lightning/plugins/legacy/native_amp.py +++ b/pytorch_lightning/plugins/legacy/native_amp.py @@ -66,7 +66,6 @@ def clip_gradients(self, gradient_clip_algorithm: str, norm_type: float): model = self.trainer.get_model() - torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_val, norm_type=norm_type) if gradient_clip_algorithm == GradClipAlgorithmType.VALUE: torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=grad_clip_val) elif gradient_clip_algorithm == GradClipAlgorithmType.NORM:
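
Usage sketch (not taken from the patches above, added for reference): with this series applied,
the clipping behaviour is selected through the new gradient_clip_algorithm Trainer argument,
alongside the existing gradient_clip_val, for example:

    # assumes a Lightning build that includes this patch series
    from pytorch_lightning import Trainer

    trainer = Trainer(gradient_clip_val=0.5, gradient_clip_algorithm='value')

Under the hood, the accelerators and precision plugins dispatch between the two stock torch
primitives shown in the diffs above; a minimal, self-contained sketch (plain torch, not
Lightning code):

    import torch

    # a toy parameter list with populated gradients
    params = [torch.nn.Parameter(torch.randn(3, 3))]
    (params[0] ** 2).sum().backward()

    # gradient_clip_algorithm='value': clamp each gradient element into [-0.5, 0.5] in place
    torch.nn.utils.clip_grad_value_(params, clip_value=0.5)

    # gradient_clip_algorithm='norm' (the default): rescale gradients so their total p-norm
    # does not exceed the clip value
    torch.nn.utils.clip_grad_norm_(params, max_norm=0.5, norm_type=2.0)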