From 4111955c228c8c2b7540afdc31421d7016f5cf15 Mon Sep 17 00:00:00 2001 From: "dong.hyun" Date: Tue, 12 Jan 2021 14:04:11 +0900 Subject: [PATCH 01/16] add clip_grad_by_value feature --- pytorch_lightning/accelerators/accelerator.py | 25 +++++++-- .../accelerators/tpu_accelerator.py | 44 +++++++++------- pytorch_lightning/plugins/apex.py | 52 +++++++++++-------- pytorch_lightning/plugins/native_amp.py | 12 ++++- pytorch_lightning/plugins/precision_plugin.py | 6 ++- .../plugins/sharded_native_amp_plugin.py | 17 ++++-- .../connectors/training_trick_connector.py | 13 ++++- pytorch_lightning/trainer/trainer.py | 10 +++- 8 files changed, 126 insertions(+), 53 deletions(-) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 9b56119a04c3e..e97bf30e263d0 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -119,14 +119,29 @@ def clip_gradients(self, optimizer, clip_val=None): if grad_clip_val <= 0: return - self._clip_gradients(optimizer, grad_clip_val) - - def _clip_gradients(self, optimizer: Optimizer, grad_clip_val: Union[float, int], norm_type: float = 2.0): + self._clip_gradients(optimizer=optimizer, + grad_clip_val=grad_clip_val, + gradient_clip_algorithm=self.trainer.gradient_clip_algorithm, + norm_type=self.trainer.gradient_clip_norm_type) + + def _clip_gradients(self, + optimizer: Optimizer, + grad_clip_val: Union[float, int], + gradient_clip_algorithm: str, + norm_type: Union[float, int]): if self.trainer.amp_backend: - self.trainer.precision_connector.backend.clip_gradients(grad_clip_val, optimizer, norm_type) + self.trainer.precision_connector.backend.clip_gradients(optimizer=optimizer, + grad_clip_val=grad_clip_val, + gradient_clip_algorithm=gradient_clip_algorithm, + norm_type=norm_type) else: model = self.trainer.get_model() - torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_val, norm_type=norm_type) + if gradient_clip_algorithm == 'value': + torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=grad_clip_val) + elif gradient_clip_algorithm.startswith('norm'): + torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_val, norm_type=norm_type) + else: + raise ValueError(f'gradient_clip_algorithm [{gradient_clip_algorithm}] is not valid.') def on_train_epoch_end(self, outputs): pass diff --git a/pytorch_lightning/accelerators/tpu_accelerator.py b/pytorch_lightning/accelerators/tpu_accelerator.py index 66fc236a2a775..2cc58fdd2890f 100644 --- a/pytorch_lightning/accelerators/tpu_accelerator.py +++ b/pytorch_lightning/accelerators/tpu_accelerator.py @@ -244,27 +244,35 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): return closure_loss - def _clip_gradients(self, optimizer: Optimizer, grad_clip_val: Union[float, int], norm_type: float = 2.0): - # this code is a modification of torch.nn.utils.clip_grad_norm_ + def _clip_gradients(self, + optimizer: Optimizer, + grad_clip_val: Union[float, int], + gradient_clip_algorithm: str, + norm_type: Union[float, int]): + # this code contains a modification of torch.nn.utils.clip_grad_norm_ # with TPU support based on https://github.com/pytorch/xla/blob/master/TROUBLESHOOTING.md model = self.trainer.get_model() parameters = model.parameters() - max_norm = grad_clip_val - - if isinstance(parameters, torch.Tensor): - parameters = [parameters] - parameters = list(filter(lambda p: p.grad is not None, parameters)) - - device = parameters[0].device - out = 
torch.empty(len(parameters), device=device) - for i, p in enumerate(parameters): - torch.norm(p.grad.data.to(device), norm_type, out=out[i]) - total_norm = torch.norm(out, norm_type) - - clip_coef = torch.tensor(max_norm, device=device) / (total_norm + self.norm_clipping_epsilon) - clip_coef = torch.min(clip_coef, torch.ones_like(clip_coef)) - for p in parameters: - p.grad.data.mul_(clip_coef.to(p.grad.data.device)) + if gradient_clip_algorithm == 'value': + torch.nn.utils.clip_grad_value_(parameters, clip_value=grad_clip_val) + elif gradient_clip_algorithm.startswith('norm'): + max_norm = grad_clip_val + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = list(filter(lambda p: p.grad is not None, parameters)) + + device = parameters[0].device + out = torch.empty(len(parameters), device=device) + for i, p in enumerate(parameters): + torch.norm(p.grad.data.to(device), norm_type, out=out[i]) + total_norm = torch.norm(out, norm_type) + + clip_coef = torch.tensor(max_norm, device=device) / (total_norm + self.norm_clipping_epsilon) + clip_coef = torch.min(clip_coef, torch.ones_like(clip_coef)) + for p in parameters: + p.grad.data.mul_(clip_coef.to(p.grad.data.device)) + else: + raise ValueError(f'gradient_clip_algorithm [{gradient_clip_algorithm}] is not valid.') def barrier(self, name: Optional[str] = None): torch_xla.core.xla_model.rendezvous(f"pl.Trainer.{name}") diff --git a/pytorch_lightning/plugins/apex.py b/pytorch_lightning/plugins/apex.py index f80461e5d4fe5..efd43a62421e6 100644 --- a/pytorch_lightning/plugins/apex.py +++ b/pytorch_lightning/plugins/apex.py @@ -99,33 +99,43 @@ def configure_apex(self, amp, model, optimizers, amp_level): model, optimizers = amp.initialize(model, optimizers, opt_level=amp_level) return model, optimizers - def clip_gradients(self, grad_clip_val: Union[int, float], optimizer: Optimizer, norm_type: float): + def clip_gradients(self, + optimizer: Optimizer, + grad_clip_val: Union[int, float], + gradient_clip_algorithm: str, + norm_type: Union[float, int]): """ - This code is a modification of :meth:`torch.nn.utils.clip_grad_norm_` using a higher epsilon for fp16 weights. - This is important when setting amp_level to O2, and the master weights are in fp16. + This code contains a modification of :meth:`torch.nn.utils.clip_grad_norm_` using a higher epsilon + for fp16 weights. This is important when setting amp_level to O2, and the master weights are in fp16. Args: - grad_clip_val: Maximum norm of gradients. optimizer: Optimizer with gradients that will be clipped. - norm_type: (float or int): type of the used p-norm. Can be ``'inf'`` for - infinity norm. + grad_clip_val: Maximum norm of gradients. + gradient_clip_algorithm: 'value' means clip_by_value, regex '^norm[1-9]([0-9]{1,45}$)' means clip_by_norm. + norm_type: (float or int): type of the used p-norm. Can be ``'inf'`` for infinity norm. """ model = self.trainer.get_model() parameters = model.parameters() - max_norm = float(grad_clip_val) - - if isinstance(parameters, torch.Tensor): - parameters = [parameters] - parameters = [p for p in parameters if p.grad is not None] - - if len(parameters) == 0: - return torch.tensor(0.) 
- device = parameters[0].grad.device - total_norm = torch.norm( - torch.stack([torch.norm(p.grad.detach(), norm_type).to(device) for p in parameters]), norm_type) - clip_coef = max_norm / (total_norm + self.norm_clipping_epsilon) - if clip_coef < 1: - for p in parameters: - p.grad.detach().mul_(clip_coef.to(p.grad.device)) + + if gradient_clip_algorithm == 'value': + torch.nn.utils.clip_grad_value_(parameters, clip_value=grad_clip_val) + elif gradient_clip_algorithm.startswith('norm'): + max_norm = float(grad_clip_val) + + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + parameters = [p for p in parameters if p.grad is not None] + + if len(parameters) == 0: + return torch.tensor(0.) + device = parameters[0].grad.device + total_norm = torch.norm( + torch.stack([torch.norm(p.grad.detach(), norm_type).to(device) for p in parameters]), norm_type) + clip_coef = max_norm / (total_norm + self.norm_clipping_epsilon) + if clip_coef < 1: + for p in parameters: + p.grad.detach().mul_(clip_coef.to(p.grad.device)) + else: + raise ValueError(f'gradient_clip_algorithm [{gradient_clip_algorithm}] is not valid.') @property def norm_clipping_epsilon(self): diff --git a/pytorch_lightning/plugins/native_amp.py b/pytorch_lightning/plugins/native_amp.py index 4df5d128476a4..4cc85e1add4ed 100644 --- a/pytorch_lightning/plugins/native_amp.py +++ b/pytorch_lightning/plugins/native_amp.py @@ -56,9 +56,19 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): return closure_loss - def clip_gradients(self, grad_clip_val: Union[int, float], optimizer: Optimizer, norm_type: float): + def clip_gradients(self, + optimizer: Optimizer, + grad_clip_val: Union[int, float], + gradient_clip_algorithm: str, + norm_type: Union[float, int]): model = self.trainer.get_model() torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_val, norm_type=norm_type) + if gradient_clip_algorithm == 'value': + torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=grad_clip_val) + elif gradient_clip_algorithm.startswith('norm'): + torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_val, norm_type=norm_type) + else: + raise ValueError(f'gradient_clip_algorithm [{gradient_clip_algorithm}] is not valid.') @property def scaler(self): diff --git a/pytorch_lightning/plugins/precision_plugin.py b/pytorch_lightning/plugins/precision_plugin.py index aaac3ede3c623..2d64f48855150 100644 --- a/pytorch_lightning/plugins/precision_plugin.py +++ b/pytorch_lightning/plugins/precision_plugin.py @@ -35,5 +35,9 @@ def training_step(self, fx, args): def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): raise NotImplementedError - def clip_gradients(self, grad_clip_val: Union[int, float], optimizer: Optimizer, norm_type: float): + def clip_gradients(self, + optimizer: Optimizer, + grad_clip_val: Union[int, float], + gradient_clip_algorithm: str, + norm_type: Union[float, int]): raise NotImplementedError diff --git a/pytorch_lightning/plugins/sharded_native_amp_plugin.py b/pytorch_lightning/plugins/sharded_native_amp_plugin.py index 5ddd29521203d..e14f49c94f927 100644 --- a/pytorch_lightning/plugins/sharded_native_amp_plugin.py +++ b/pytorch_lightning/plugins/sharded_native_amp_plugin.py @@ -28,8 +28,15 @@ class ShardedNativeAMPPlugin(NativeAMPPlugin): def scaler(self): return ShardedGradScaler() - def clip_gradients(self, grad_clip_val: Union[int, float], optimizer: Optimizer, norm_type: float): - max_norm = grad_clip_val - norm_type = float(2.0) - optimizer = 
cast(OSS, optimizer) - optimizer.clip_grad_norm(max_norm, norm_type=norm_type) + def clip_gradients(self, + optimizer: Optimizer, + grad_clip_val: Union[int, float], + gradient_clip_algorithm: str, + norm_type: Union[float, int]): + if gradient_clip_algorithm == 'value': + raise NotImplementedError("Value grad clipping with sharded ddp is not implemented yet") + elif gradient_clip_algorithm.startswith('norm'): + optimizer = cast(OSS, optimizer) + optimizer.clip_grad_norm(grad_clip_val, norm_type=norm_type) + else: + raise ValueError(f'gradient_clip_algorithm [{gradient_clip_algorithm}] is not valid.') \ No newline at end of file diff --git a/pytorch_lightning/trainer/connectors/training_trick_connector.py b/pytorch_lightning/trainer/connectors/training_trick_connector.py index 273efad403d10..b839429f16f77 100644 --- a/pytorch_lightning/trainer/connectors/training_trick_connector.py +++ b/pytorch_lightning/trainer/connectors/training_trick_connector.py @@ -11,8 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from pytorch_lightning.utilities.exceptions import MisconfigurationException +import re + from pytorch_lightning.callbacks import GradientAccumulationScheduler +from pytorch_lightning.utilities.exceptions import MisconfigurationException class TrainingTricksConnector: @@ -23,6 +25,7 @@ def __init__(self, trainer): def on_trainer_init( self, gradient_clip_val, + gradient_clip_algorithm, track_grad_norm, accumulate_grad_batches, truncated_bptt_steps, @@ -32,7 +35,15 @@ def on_trainer_init( self.trainer.terminate_on_nan = terminate_on_nan # gradient clipping + regex = '^norm[1-9]([0-9]{1,45}$)' + if gradient_clip_algorithm != 'value' and re.match(regex, gradient_clip_algorithm) is None: + raise MisconfigurationException(f"gradient_clip_algorithm should be value or match with regex {regex}") self.trainer.gradient_clip_val = gradient_clip_val + self.trainer.gradient_clip_algorithm = gradient_clip_algorithm + if gradient_clip_algorithm == 'value': + self.trainer.gradient_clip_norm_type = None + else: + self.trainer.gradient_clip_norm_type = int(gradient_clip_algorithm[4:]) # gradient norm tracking if not isinstance(track_grad_norm, (int, float)) and track_grad_norm != 'inf': diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index b923ae9adce0c..9f5c00b6876b5 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -88,6 +88,7 @@ def __init__( callbacks: Optional[Union[List[Callback], Callback]] = None, default_root_dir: Optional[str] = None, gradient_clip_val: float = 0, + gradient_clip_algorithm: str = 'norm2', process_position: int = 0, num_nodes: int = 1, num_processes: int = 1, @@ -197,6 +198,8 @@ def __init__( gradient_clip_val: 0 means don't clip. + gradient_clip_algorithm: 'value' means clip_by_value, regex '^norm[1-9]([0-9]{1,45}$)' means clip_by_norm. 
+ limit_train_batches: How much of training dataset to check (floats = percent, int = num_batches) limit_val_batches: How much of validation dataset to check (floats = percent, int = num_batches) @@ -345,7 +348,12 @@ def __init__( # init training tricks self.training_tricks_connector.on_trainer_init( - gradient_clip_val, track_grad_norm, accumulate_grad_batches, truncated_bptt_steps, terminate_on_nan + gradient_clip_val, + gradient_clip_algorithm, + track_grad_norm, + accumulate_grad_batches, + truncated_bptt_steps, + terminate_on_nan, ) # init accelerator related flags From c09e990f9f4731cdc45e64e9c17b56fded3f7700 Mon Sep 17 00:00:00 2001 From: "dong.hyun" Date: Tue, 12 Jan 2021 14:59:16 +0900 Subject: [PATCH 02/16] write changelog, training_tricks.rst --- CHANGELOG.md | 3 +++ docs/source/training_tricks.rst | 16 +++++++++++++--- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index eb1963d64d954..25c73bad9505c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -47,6 +47,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added `IoU` class interface ([#4704](https://github.com/PyTorchLightning/pytorch-lightning/pull/4704)) +- Added `gradient_clip_algorithm` argument to Trainer for gradient clipping by value ([#5477](https://github.com/PyTorchLightning/pytorch-lightning/pull/5477)). + + ### Changed - Changed `stat_scores` metric now calculates stat scores over all classes and gains new parameters, in line with the new `StatScores` metric ([#4839](https://github.com/PyTorchLightning/pytorch-lightning/pull/4839)) diff --git a/docs/source/training_tricks.rst b/docs/source/training_tricks.rst index 10ee668a97fa8..11aee5832a7c2 100644 --- a/docs/source/training_tricks.rst +++ b/docs/source/training_tricks.rst @@ -26,8 +26,12 @@ The effect is a large effective batch size of size KxN. Gradient Clipping ----------------- -Gradient clipping may be enabled to avoid exploding gradients. Specifically, this will `clip the gradient -norm `_ computed over all model parameters together. +Gradient clipping may be enabled to avoid exploding gradients. Also, you can choose various criterion by +`gradient_clip_algorithm` option. For example, if `gradient_clip_algorithm == 'value'`, this will `clip the gradient +by value `_ computed over all model parameters. +If `gradient_clip_algorithm == 'norm1'` `clip the gradient +norm `_ with l1 norm computed over +all model parameters together. .. 
seealso:: :class:`~pytorch_lightning.trainer.trainer.Trainer` @@ -36,9 +40,15 @@ norm `_ # DEFAULT (ie: don't clip) trainer = Trainer(gradient_clip_val=0) - # clip gradients with norm above 0.5 + # clip gradients with norm-2 above 0.5 trainer = Trainer(gradient_clip_val=0.5) + # clip gradients with norm-1 above 0.5 + trainer = Trainer(gradient_clip_val=0.5, gradient_clip_algorithm='norm1') + + # clip gradients with value above 0.5 + trainer = Trainer(gradient_clip_val=0.5, gradient_clip_algorithm='value' ) + ---------- Auto scaling of batch size From baa9a497756288016c7af3a7206a742156784b51 Mon Sep 17 00:00:00 2001 From: "dong.hyun" Date: Tue, 12 Jan 2021 15:01:50 +0900 Subject: [PATCH 03/16] add end line to sharded_natvie_amp_pluigin.py --- pytorch_lightning/plugins/sharded_native_amp_plugin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/plugins/sharded_native_amp_plugin.py b/pytorch_lightning/plugins/sharded_native_amp_plugin.py index e14f49c94f927..ef20d07086290 100644 --- a/pytorch_lightning/plugins/sharded_native_amp_plugin.py +++ b/pytorch_lightning/plugins/sharded_native_amp_plugin.py @@ -39,4 +39,4 @@ def clip_gradients(self, optimizer = cast(OSS, optimizer) optimizer.clip_grad_norm(grad_clip_val, norm_type=norm_type) else: - raise ValueError(f'gradient_clip_algorithm [{gradient_clip_algorithm}] is not valid.') \ No newline at end of file + raise ValueError(f'gradient_clip_algorithm [{gradient_clip_algorithm}] is not valid.') From a2366113f2d1713360e36a3582db15686f4de427 Mon Sep 17 00:00:00 2001 From: "dong.hyun" Date: Tue, 12 Jan 2021 15:31:28 +0900 Subject: [PATCH 04/16] bugfix update regex --- .../trainer/connectors/training_trick_connector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/trainer/connectors/training_trick_connector.py b/pytorch_lightning/trainer/connectors/training_trick_connector.py index b839429f16f77..d4a5b9f3c50ac 100644 --- a/pytorch_lightning/trainer/connectors/training_trick_connector.py +++ b/pytorch_lightning/trainer/connectors/training_trick_connector.py @@ -35,7 +35,7 @@ def on_trainer_init( self.trainer.terminate_on_nan = terminate_on_nan # gradient clipping - regex = '^norm[1-9]([0-9]{1,45}$)' + regex = '^norm[1-9]([0-9]{0,45}$)' if gradient_clip_algorithm != 'value' and re.match(regex, gradient_clip_algorithm) is None: raise MisconfigurationException(f"gradient_clip_algorithm should be value or match with regex {regex}") self.trainer.gradient_clip_val = gradient_clip_val From 749e286744b2003409b99da7d531217f3a54b9d0 Mon Sep 17 00:00:00 2001 From: "dong.hyun" Date: Tue, 12 Jan 2021 15:38:20 +0900 Subject: [PATCH 05/16] update regex documentation --- pytorch_lightning/plugins/apex.py | 2 +- pytorch_lightning/trainer/trainer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/plugins/apex.py b/pytorch_lightning/plugins/apex.py index efd43a62421e6..6f7fa19096b14 100644 --- a/pytorch_lightning/plugins/apex.py +++ b/pytorch_lightning/plugins/apex.py @@ -110,7 +110,7 @@ def clip_gradients(self, Args: optimizer: Optimizer with gradients that will be clipped. grad_clip_val: Maximum norm of gradients. - gradient_clip_algorithm: 'value' means clip_by_value, regex '^norm[1-9]([0-9]{1,45}$)' means clip_by_norm. + gradient_clip_algorithm: 'value' means clip_by_value, regex '^norm[1-9]([0-9]{0,45}$)' means clip_by_norm. norm_type: (float or int): type of the used p-norm. Can be ``'inf'`` for infinity norm. 
""" model = self.trainer.get_model() diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 9f5c00b6876b5..0d9b81d55974b 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -198,7 +198,7 @@ def __init__( gradient_clip_val: 0 means don't clip. - gradient_clip_algorithm: 'value' means clip_by_value, regex '^norm[1-9]([0-9]{1,45}$)' means clip_by_norm. + gradient_clip_algorithm: 'value' means clip_by_value, regex '^norm[1-9]([0-9]{0,45}$)' means clip_by_norm. limit_train_batches: How much of training dataset to check (floats = percent, int = num_batches) From b83ea7f8600e9c08772b73353b1aba3687e1e8df Mon Sep 17 00:00:00 2001 From: "dong.hyun" Date: Mon, 18 Jan 2021 13:19:43 +0900 Subject: [PATCH 06/16] revert changelog --- CHANGELOG.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 25c73bad9505c..eb1963d64d954 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -47,9 +47,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added `IoU` class interface ([#4704](https://github.com/PyTorchLightning/pytorch-lightning/pull/4704)) -- Added `gradient_clip_algorithm` argument to Trainer for gradient clipping by value ([#5477](https://github.com/PyTorchLightning/pytorch-lightning/pull/5477)). - - ### Changed - Changed `stat_scores` metric now calculates stat scores over all classes and gains new parameters, in line with the new `StatScores` metric ([#4839](https://github.com/PyTorchLightning/pytorch-lightning/pull/4839)) From 9913351497efeef0fc5903f99d99b738c8ec9edd Mon Sep 17 00:00:00 2001 From: "dong.hyun" Date: Mon, 18 Jan 2021 13:58:20 +0900 Subject: [PATCH 07/16] commit based on review --- CHANGELOG.md | 3 +++ docs/source/training_tricks.rst | 15 ++++------- pytorch_lightning/accelerators/accelerator.py | 25 ++++++------------- .../accelerators/tpu_accelerator.py | 2 -- pytorch_lightning/plugins/apex.py | 6 ++--- pytorch_lightning/plugins/native_amp.py | 2 -- .../plugins/sharded_native_amp_plugin.py | 2 -- .../connectors/training_trick_connector.py | 10 ++------ pytorch_lightning/trainer/trainer.py | 4 +-- 9 files changed, 21 insertions(+), 48 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 35ff8e275ed91..cdf5861580e8c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -54,6 +54,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added missing val/test hooks in `LightningModule` ([#5467](https://github.com/PyTorchLightning/pytorch-lightning/pull/5467)) +- Added `gradient_clip_algorithm` argument to Trainer for gradient clipping by value ([#5477](https://github.com/PyTorchLightning/pytorch-lightning/pull/5477)). + + ### Changed - Changed `stat_scores` metric now calculates stat scores over all classes and gains new parameters, in line with the new `StatScores` metric ([#4839](https://github.com/PyTorchLightning/pytorch-lightning/pull/4839)) diff --git a/docs/source/training_tricks.rst b/docs/source/training_tricks.rst index 11aee5832a7c2..7211dfefc6c88 100644 --- a/docs/source/training_tricks.rst +++ b/docs/source/training_tricks.rst @@ -26,12 +26,10 @@ The effect is a large effective batch size of size KxN. Gradient Clipping ----------------- -Gradient clipping may be enabled to avoid exploding gradients. Also, you can choose various criterion by -`gradient_clip_algorithm` option. 
For example, if `gradient_clip_algorithm == 'value'`, this will `clip the gradient -by value `_ computed over all model parameters. -If `gradient_clip_algorithm == 'norm1'` `clip the gradient -norm `_ with l1 norm computed over -all model parameters together. +Gradient clipping may be enabled to avoid exploding gradients. By default, this will `clip the gradient norm +`_ computed over all model parameters together. +If gradient_clip_algorithm option is set to 'value', which is 'norm' by default, this will +`clip the gradient value `_ for each parameter instead. .. seealso:: :class:`~pytorch_lightning.trainer.trainer.Trainer` @@ -40,12 +38,9 @@ all model parameters together. # DEFAULT (ie: don't clip) trainer = Trainer(gradient_clip_val=0) - # clip gradients with norm-2 above 0.5 + # clip gradients with norm above 0.5 trainer = Trainer(gradient_clip_val=0.5) - # clip gradients with norm-1 above 0.5 - trainer = Trainer(gradient_clip_val=0.5, gradient_clip_algorithm='norm1') - # clip gradients with value above 0.5 trainer = Trainer(gradient_clip_val=0.5, gradient_clip_algorithm='value' ) diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 84823b93f094d..666122f5115a3 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -115,29 +115,18 @@ def clip_gradients(self, optimizer, clip_val=None): if grad_clip_val <= 0: return - self._clip_gradients(optimizer=optimizer, - grad_clip_val=grad_clip_val, - gradient_clip_algorithm=self.trainer.gradient_clip_algorithm, - norm_type=self.trainer.gradient_clip_norm_type) - - def _clip_gradients(self, - optimizer: Optimizer, - grad_clip_val: Union[float, int], - gradient_clip_algorithm: str, - norm_type: Union[float, int]): + self._clip_gradients(optimizer, grad_clip_val) + + def _clip_gradients(self, optimizer: Optimizer, grad_clip_val: Union[float, int], norm_type: float = 2.0): + clip_algorithm = self.trainer.gradient_clip_algorithm if self.trainer.amp_backend: - self.trainer.precision_connector.backend.clip_gradients(optimizer=optimizer, - grad_clip_val=grad_clip_val, - gradient_clip_algorithm=gradient_clip_algorithm, - norm_type=norm_type) + self.trainer.precision_connector.backend.clip_gradients(grad_clip_val, clip_algorithm, optimizer, norm_type) else: model = self.trainer.get_model() - if gradient_clip_algorithm == 'value': + if clip_algorithm == 'value': torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=grad_clip_val) - elif gradient_clip_algorithm.startswith('norm'): + elif clip_algorithm == 'norm': torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_val, norm_type=norm_type) - else: - raise ValueError(f'gradient_clip_algorithm [{gradient_clip_algorithm}] is not valid.') def on_train_epoch_end(self, outputs): pass diff --git a/pytorch_lightning/accelerators/tpu_accelerator.py b/pytorch_lightning/accelerators/tpu_accelerator.py index 2cc58fdd2890f..0ef6845f62040 100644 --- a/pytorch_lightning/accelerators/tpu_accelerator.py +++ b/pytorch_lightning/accelerators/tpu_accelerator.py @@ -271,8 +271,6 @@ def _clip_gradients(self, clip_coef = torch.min(clip_coef, torch.ones_like(clip_coef)) for p in parameters: p.grad.data.mul_(clip_coef.to(p.grad.data.device)) - else: - raise ValueError(f'gradient_clip_algorithm [{gradient_clip_algorithm}] is not valid.') def barrier(self, name: Optional[str] = None): torch_xla.core.xla_model.rendezvous(f"pl.Trainer.{name}") diff --git a/pytorch_lightning/plugins/apex.py 
b/pytorch_lightning/plugins/apex.py index 6f7fa19096b14..ec3682733f166 100644 --- a/pytorch_lightning/plugins/apex.py +++ b/pytorch_lightning/plugins/apex.py @@ -110,8 +110,8 @@ def clip_gradients(self, Args: optimizer: Optimizer with gradients that will be clipped. grad_clip_val: Maximum norm of gradients. - gradient_clip_algorithm: 'value' means clip_by_value, regex '^norm[1-9]([0-9]{0,45}$)' means clip_by_norm. - norm_type: (float or int): type of the used p-norm. Can be ``'inf'`` for infinity norm. + gradient_clip_algorithm: 'value' means clip_by_value, 'norm' means clip_by_norm. + norm_type: type of the used p-norm. Can be ``'inf'`` for infinity norm. """ model = self.trainer.get_model() parameters = model.parameters() @@ -134,8 +134,6 @@ def clip_gradients(self, if clip_coef < 1: for p in parameters: p.grad.detach().mul_(clip_coef.to(p.grad.device)) - else: - raise ValueError(f'gradient_clip_algorithm [{gradient_clip_algorithm}] is not valid.') @property def norm_clipping_epsilon(self): diff --git a/pytorch_lightning/plugins/native_amp.py b/pytorch_lightning/plugins/native_amp.py index 4cc85e1add4ed..6cbac417fd400 100644 --- a/pytorch_lightning/plugins/native_amp.py +++ b/pytorch_lightning/plugins/native_amp.py @@ -67,8 +67,6 @@ def clip_gradients(self, torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=grad_clip_val) elif gradient_clip_algorithm.startswith('norm'): torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_val, norm_type=norm_type) - else: - raise ValueError(f'gradient_clip_algorithm [{gradient_clip_algorithm}] is not valid.') @property def scaler(self): diff --git a/pytorch_lightning/plugins/sharded_native_amp_plugin.py b/pytorch_lightning/plugins/sharded_native_amp_plugin.py index ef20d07086290..b2a47e9c91223 100644 --- a/pytorch_lightning/plugins/sharded_native_amp_plugin.py +++ b/pytorch_lightning/plugins/sharded_native_amp_plugin.py @@ -38,5 +38,3 @@ def clip_gradients(self, elif gradient_clip_algorithm.startswith('norm'): optimizer = cast(OSS, optimizer) optimizer.clip_grad_norm(grad_clip_val, norm_type=norm_type) - else: - raise ValueError(f'gradient_clip_algorithm [{gradient_clip_algorithm}] is not valid.') diff --git a/pytorch_lightning/trainer/connectors/training_trick_connector.py b/pytorch_lightning/trainer/connectors/training_trick_connector.py index d4a5b9f3c50ac..326c73df610ac 100644 --- a/pytorch_lightning/trainer/connectors/training_trick_connector.py +++ b/pytorch_lightning/trainer/connectors/training_trick_connector.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import re from pytorch_lightning.callbacks import GradientAccumulationScheduler from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -35,15 +34,10 @@ def on_trainer_init( self.trainer.terminate_on_nan = terminate_on_nan # gradient clipping - regex = '^norm[1-9]([0-9]{0,45}$)' - if gradient_clip_algorithm != 'value' and re.match(regex, gradient_clip_algorithm) is None: - raise MisconfigurationException(f"gradient_clip_algorithm should be value or match with regex {regex}") + if gradient_clip_algorithm not in ['value', 'norm']: + raise MisconfigurationException("gradient_clip_algorithm should be 'value' or 'norm'") self.trainer.gradient_clip_val = gradient_clip_val self.trainer.gradient_clip_algorithm = gradient_clip_algorithm - if gradient_clip_algorithm == 'value': - self.trainer.gradient_clip_norm_type = None - else: - self.trainer.gradient_clip_norm_type = int(gradient_clip_algorithm[4:]) # gradient norm tracking if not isinstance(track_grad_norm, (int, float)) and track_grad_norm != 'inf': diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 3a6fa44ce1213..856ca30efb850 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -88,7 +88,7 @@ def __init__( callbacks: Optional[Union[List[Callback], Callback]] = None, default_root_dir: Optional[str] = None, gradient_clip_val: float = 0, - gradient_clip_algorithm: str = 'norm2', + gradient_clip_algorithm: str = 'norm', process_position: int = 0, num_nodes: int = 1, num_processes: int = 1, @@ -198,7 +198,7 @@ def __init__( gradient_clip_val: 0 means don't clip. - gradient_clip_algorithm: 'value' means clip_by_value, regex '^norm[1-9]([0-9]{0,45}$)' means clip_by_norm. + gradient_clip_algorithm: 'value' means clip_by_value, 'norm' means clip_by_norm. 
Default: 'norm'.

         limit_train_batches: How much of training dataset to check (floats = percent, int = num_batches)

From 4c8e46b1933d648c172d250212d8fbda6702dc17 Mon Sep 17 00:00:00 2001
From: "dong.hyun" 
Date: Wed, 20 Jan 2021 10:52:38 +0900
Subject: [PATCH 08/16] Add Enum Type

---
 pytorch_lightning/accelerators/accelerator.py          |  5 +++--
 pytorch_lightning/accelerators/tpu_accelerator.py      |  5 +++--
 pytorch_lightning/plugins/apex.py                      |  5 +++--
 pytorch_lightning/plugins/native_amp.py                |  5 +++--
 pytorch_lightning/plugins/sharded_native_amp_plugin.py | 10 +++++++---
 .../trainer/connectors/training_trick_connector.py     |  6 ++++--
 pytorch_lightning/utilities/__init__.py                |  8 +++++++-
 pytorch_lightning/utilities/enums.py                   | 10 ++++++++++
 8 files changed, 40 insertions(+), 14 deletions(-)

diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py
index 666122f5115a3..3c656792a4a61 100644
--- a/pytorch_lightning/accelerators/accelerator.py
+++ b/pytorch_lightning/accelerators/accelerator.py
@@ -21,6 +21,7 @@
 from pytorch_lightning.core.lightning import LightningModule
 from pytorch_lightning.plugins.ddp_plugin import DDPPlugin
 from pytorch_lightning.plugins.rpc_plugin import RPCPlugin
+from pytorch_lightning.utilities import GradClipAlgorithmType
 from pytorch_lightning.utilities.apply_func import move_data_to_device
 from pytorch_lightning.utilities.parsing import AttributeDict
 
@@ -123,9 +124,9 @@ def _clip_gradients(self, optimizer: Optimizer, grad_clip_val: Union[float, int]
             self.trainer.precision_connector.backend.clip_gradients(grad_clip_val, clip_algorithm, optimizer, norm_type)
         else:
             model = self.trainer.get_model()
-            if clip_algorithm == 'value':
+            if clip_algorithm == GradClipAlgorithmType.VALUE:
                 torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=grad_clip_val)
-            elif clip_algorithm == 'norm':
+            elif clip_algorithm == GradClipAlgorithmType.NORM:
                 torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_val, norm_type=norm_type)
 
     def on_train_epoch_end(self, outputs):
         pass
diff --git a/pytorch_lightning/accelerators/tpu_accelerator.py b/pytorch_lightning/accelerators/tpu_accelerator.py
index 0ef6845f62040..88068f40058b9 100644
--- a/pytorch_lightning/accelerators/tpu_accelerator.py
+++ b/pytorch_lightning/accelerators/tpu_accelerator.py
@@ -25,6 +25,7 @@
 from pytorch_lightning.cluster_environments import ClusterEnvironment
 from pytorch_lightning.core import LightningModule
 from pytorch_lightning.utilities import (
+    GradClipAlgorithmType,
     _TPU_AVAILABLE,
     move_data_to_device,
     rank_zero_info,
@@ -253,9 +254,9 @@ def _clip_gradients(self,
         # with TPU support based on https://github.com/pytorch/xla/blob/master/TROUBLESHOOTING.md
         model = self.trainer.get_model()
         parameters = model.parameters()
-        if gradient_clip_algorithm == 'value':
+        if gradient_clip_algorithm == GradClipAlgorithmType.VALUE:
             torch.nn.utils.clip_grad_value_(parameters, clip_value=grad_clip_val)
-        elif gradient_clip_algorithm.startswith('norm'):
+        elif gradient_clip_algorithm == GradClipAlgorithmType.NORM:
             max_norm = grad_clip_val
             if isinstance(parameters, torch.Tensor):
                 parameters = [parameters]
diff --git a/pytorch_lightning/plugins/apex.py b/pytorch_lightning/plugins/apex.py
index ec3682733f166..09c235fde204c 100644
--- a/pytorch_lightning/plugins/apex.py
+++ b/pytorch_lightning/plugins/apex.py
@@ -20,6 +20,7 @@
 from pytorch_lightning.plugins.precision_plugin import PrecisionPlugin
 from pytorch_lightning.utilities import _APEX_AVAILABLE, AMPType
 from 
pytorch_lightning.utilities.distributed import rank_zero_warn +from pytorch_lightning.utilities.enums import GradClipAlgorithmType if _APEX_AVAILABLE: from apex import amp @@ -116,9 +117,9 @@ def clip_gradients(self, model = self.trainer.get_model() parameters = model.parameters() - if gradient_clip_algorithm == 'value': + if gradient_clip_algorithm == GradClipAlgorithmType.VALUE: torch.nn.utils.clip_grad_value_(parameters, clip_value=grad_clip_val) - elif gradient_clip_algorithm.startswith('norm'): + if gradient_clip_algorithm == GradClipAlgorithmType.NORM: max_norm = float(grad_clip_val) if isinstance(parameters, torch.Tensor): diff --git a/pytorch_lightning/plugins/native_amp.py b/pytorch_lightning/plugins/native_amp.py index 6cbac417fd400..df363d8329611 100644 --- a/pytorch_lightning/plugins/native_amp.py +++ b/pytorch_lightning/plugins/native_amp.py @@ -17,6 +17,7 @@ from torch.optim import Optimizer from pytorch_lightning.plugins.precision_plugin import PrecisionPlugin +from pytorch_lightning.utilities import GradClipAlgorithmType class NativeAMPPlugin(PrecisionPlugin): @@ -63,9 +64,9 @@ def clip_gradients(self, norm_type: Union[float, int]): model = self.trainer.get_model() torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_val, norm_type=norm_type) - if gradient_clip_algorithm == 'value': + if gradient_clip_algorithm == GradClipAlgorithmType.VALUE: torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=grad_clip_val) - elif gradient_clip_algorithm.startswith('norm'): + elif gradient_clip_algorithm == GradClipAlgorithmType.NORM: torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_val, norm_type=norm_type) @property diff --git a/pytorch_lightning/plugins/sharded_native_amp_plugin.py b/pytorch_lightning/plugins/sharded_native_amp_plugin.py index b2a47e9c91223..25b99ab9363c1 100644 --- a/pytorch_lightning/plugins/sharded_native_amp_plugin.py +++ b/pytorch_lightning/plugins/sharded_native_amp_plugin.py @@ -16,7 +16,11 @@ from torch.optim import Optimizer from pytorch_lightning.plugins.native_amp import NativeAMPPlugin -from pytorch_lightning.utilities import _FAIRSCALE_AVAILABLE, _NATIVE_AMP_AVAILABLE +from pytorch_lightning.utilities import ( + _FAIRSCALE_AVAILABLE, + _NATIVE_AMP_AVAILABLE, + GradClipAlgorithmType, +) if _NATIVE_AMP_AVAILABLE and _FAIRSCALE_AVAILABLE: from fairscale.optim import OSS @@ -33,8 +37,8 @@ def clip_gradients(self, grad_clip_val: Union[int, float], gradient_clip_algorithm: str, norm_type: Union[float, int]): - if gradient_clip_algorithm == 'value': + if gradient_clip_algorithm == GradClipAlgorithmType.VALUE: raise NotImplementedError("Value grad clipping with sharded ddp is not implemented yet") - elif gradient_clip_algorithm.startswith('norm'): + elif gradient_clip_algorithm == GradClipAlgorithmType.NORM: optimizer = cast(OSS, optimizer) optimizer.clip_grad_norm(grad_clip_val, norm_type=norm_type) diff --git a/pytorch_lightning/trainer/connectors/training_trick_connector.py b/pytorch_lightning/trainer/connectors/training_trick_connector.py index 9bab15226e2e1..a33c929d8fe49 100644 --- a/pytorch_lightning/trainer/connectors/training_trick_connector.py +++ b/pytorch_lightning/trainer/connectors/training_trick_connector.py @@ -14,6 +14,7 @@ from pytorch_lightning.callbacks import GradientAccumulationScheduler +from pytorch_lightning.utilities import GradClipAlgorithmType from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -35,8 +36,9 @@ def on_trainer_init( self.trainer.terminate_on_nan = 
terminate_on_nan # gradient clipping - if gradient_clip_algorithm not in ['value', 'norm']: - raise MisconfigurationException("gradient_clip_algorithm should be 'value' or 'norm'") + if gradient_clip_algorithm not in [GradClipAlgorithmType.VALUE, GradClipAlgorithmType.NORM]: + raise MisconfigurationException(f"gradient_clip_algorithm should be " + f"'{GradClipAlgorithmType.VALUE}' or '{GradClipAlgorithmType.NORM}'") self.trainer.gradient_clip_val = gradient_clip_val self.trainer.gradient_clip_algorithm = gradient_clip_algorithm diff --git a/pytorch_lightning/utilities/__init__.py b/pytorch_lightning/utilities/__init__.py index 0a5ed04eb72a3..d197dc0ddb593 100644 --- a/pytorch_lightning/utilities/__init__.py +++ b/pytorch_lightning/utilities/__init__.py @@ -22,7 +22,13 @@ rank_zero_only, rank_zero_warn, ) -from pytorch_lightning.utilities.enums import AMPType, DeviceType, DistributedType, LightningEnum # noqa: F401 +from pytorch_lightning.utilities.enums import ( # noqa: F401 + AMPType, + DeviceType, + DistributedType, + GradClipAlgorithmType, + LightningEnum, +) from pytorch_lightning.utilities.imports import ( # noqa: F401 _APEX_AVAILABLE, _BOLTS_AVAILABLE, diff --git a/pytorch_lightning/utilities/enums.py b/pytorch_lightning/utilities/enums.py index f6c0bf1d6cc54..75ce4ab069650 100644 --- a/pytorch_lightning/utilities/enums.py +++ b/pytorch_lightning/utilities/enums.py @@ -80,3 +80,13 @@ class DeviceType(LightningEnum): CPU = 'CPU' GPU = 'GPU' TPU = 'TPU' + + +class GradClipAlgorithmType(LightningEnum): + """ Define gradient_clip_algorithm types - training-tricks. + + >>> GradClipAlgorithmType.VALUE in ('value', 'norm') + True + """ + VALUE = 'value' + NORM = 'norm' From e2484aeb41567d5aba38b25ce1153ef4d41a8df5 Mon Sep 17 00:00:00 2001 From: "dong.hyun" Date: Wed, 20 Jan 2021 11:05:06 +0900 Subject: [PATCH 09/16] edit CHANGELOG.md to prevent conflicts --- CHANGELOG.md | 4 ++-- pytorch_lightning/accelerators/tpu_accelerator.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2201fdca61c26..6b0d69c20fecf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -54,10 +54,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added missing val/test hooks in `LightningModule` ([#5467](https://github.com/PyTorchLightning/pytorch-lightning/pull/5467)) -- `Recall` and `Precision` metrics (and their functional counterparts `recall` and `precision`) can now be generalized to Recall@K and Precision@K with the use of `top_k` parameter ([#4842](https://github.com/PyTorchLightning/pytorch-lightning/pull/4842)) +- Added `gradient_clip_algorithm` argument to Trainer for gradient clipping by value ([#5477](https://github.com/PyTorchLightning/pytorch-lightning/pull/5477)). -- Added `gradient_clip_algorithm` argument to Trainer for gradient clipping by value ([#5477](https://github.com/PyTorchLightning/pytorch-lightning/pull/5477)). 
+- `Recall` and `Precision` metrics (and their functional counterparts `recall` and `precision`) can now be generalized to Recall@K and Precision@K with the use of `top_k` parameter ([#4842](https://github.com/PyTorchLightning/pytorch-lightning/pull/4842)) ### Changed diff --git a/pytorch_lightning/accelerators/tpu_accelerator.py b/pytorch_lightning/accelerators/tpu_accelerator.py index 88068f40058b9..cb4dd381aaf2a 100644 --- a/pytorch_lightning/accelerators/tpu_accelerator.py +++ b/pytorch_lightning/accelerators/tpu_accelerator.py @@ -25,12 +25,12 @@ from pytorch_lightning.cluster_environments import ClusterEnvironment from pytorch_lightning.core import LightningModule from pytorch_lightning.utilities import ( - GradClipAlgorithmType, _TPU_AVAILABLE, move_data_to_device, rank_zero_info, rank_zero_only, rank_zero_warn, + GradClipAlgorithmType, ) from pytorch_lightning.utilities.cloud_io import atomic_save from pytorch_lightning.utilities.exceptions import MisconfigurationException From 393b77b15114749f2f2143a25bcdf3ff13ad91c8 Mon Sep 17 00:00:00 2001 From: "dong.hyun" Date: Wed, 20 Jan 2021 12:40:22 +0900 Subject: [PATCH 10/16] add test codes --- docs/source/training_tricks.rst | 2 +- pytorch_lightning/accelerators/accelerator.py | 2 +- .../accelerators/tpu_accelerator.py | 6 +- pytorch_lightning/plugins/apex.py | 7 +- pytorch_lightning/plugins/native_amp.py | 4 +- pytorch_lightning/plugins/precision_plugin.py | 4 +- .../plugins/sharded_native_amp_plugin.py | 8 +- pytorch_lightning/trainer/trainer.py | 2 +- tests/models/test_horovod.py | 16 ++++ tests/models/test_tpu.py | 19 +++++ tests/trainer/test_trainer.py | 78 +++++++++++++++++++ 11 files changed, 131 insertions(+), 17 deletions(-) diff --git a/docs/source/training_tricks.rst b/docs/source/training_tricks.rst index 7211dfefc6c88..faffe86d9ba2a 100644 --- a/docs/source/training_tricks.rst +++ b/docs/source/training_tricks.rst @@ -42,7 +42,7 @@ If gradient_clip_algorithm option is set to 'value', which is 'norm' by default, trainer = Trainer(gradient_clip_val=0.5) # clip gradients with value above 0.5 - trainer = Trainer(gradient_clip_val=0.5, gradient_clip_algorithm='value' ) + trainer = Trainer(gradient_clip_val=0.5, gradient_clip_algorithm='value') ---------- diff --git a/pytorch_lightning/accelerators/accelerator.py b/pytorch_lightning/accelerators/accelerator.py index 3c656792a4a61..5c3839326182c 100644 --- a/pytorch_lightning/accelerators/accelerator.py +++ b/pytorch_lightning/accelerators/accelerator.py @@ -118,7 +118,7 @@ def clip_gradients(self, optimizer, clip_val=None): return self._clip_gradients(optimizer, grad_clip_val) - def _clip_gradients(self, optimizer: Optimizer, grad_clip_val: Union[float, int], norm_type: float = 2.0): + def _clip_gradients(self, optimizer: Optimizer, grad_clip_val: float, norm_type: float = 2.0): clip_algorithm = self.trainer.gradient_clip_algorithm if self.trainer.amp_backend: self.trainer.precision_connector.backend.clip_gradients(grad_clip_val, clip_algorithm, optimizer, norm_type) diff --git a/pytorch_lightning/accelerators/tpu_accelerator.py b/pytorch_lightning/accelerators/tpu_accelerator.py index cb4dd381aaf2a..fecf2209d985d 100644 --- a/pytorch_lightning/accelerators/tpu_accelerator.py +++ b/pytorch_lightning/accelerators/tpu_accelerator.py @@ -26,11 +26,11 @@ from pytorch_lightning.core import LightningModule from pytorch_lightning.utilities import ( _TPU_AVAILABLE, + GradClipAlgorithmType, move_data_to_device, rank_zero_info, rank_zero_only, rank_zero_warn, - GradClipAlgorithmType, ) 
from pytorch_lightning.utilities.cloud_io import atomic_save from pytorch_lightning.utilities.exceptions import MisconfigurationException @@ -247,9 +247,9 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): def _clip_gradients(self, optimizer: Optimizer, - grad_clip_val: Union[float, int], + grad_clip_val: float, gradient_clip_algorithm: str, - norm_type: Union[float, int]): + norm_type: float): # this code contains a modification of torch.nn.utils.clip_grad_norm_ # with TPU support based on https://github.com/pytorch/xla/blob/master/TROUBLESHOOTING.md model = self.trainer.get_model() diff --git a/pytorch_lightning/plugins/apex.py b/pytorch_lightning/plugins/apex.py index 09c235fde204c..9f1c65afd4bc5 100644 --- a/pytorch_lightning/plugins/apex.py +++ b/pytorch_lightning/plugins/apex.py @@ -18,9 +18,8 @@ from pytorch_lightning.core.lightning import LightningModule from pytorch_lightning.plugins.precision_plugin import PrecisionPlugin -from pytorch_lightning.utilities import _APEX_AVAILABLE, AMPType +from pytorch_lightning.utilities import _APEX_AVAILABLE, AMPType, GradClipAlgorithmType from pytorch_lightning.utilities.distributed import rank_zero_warn -from pytorch_lightning.utilities.enums import GradClipAlgorithmType if _APEX_AVAILABLE: from apex import amp @@ -102,9 +101,9 @@ def configure_apex(self, amp, model, optimizers, amp_level): def clip_gradients(self, optimizer: Optimizer, - grad_clip_val: Union[int, float], + grad_clip_val: float, gradient_clip_algorithm: str, - norm_type: Union[float, int]): + norm_type: float): """ This code contains a modification of :meth:`torch.nn.utils.clip_grad_norm_` using a higher epsilon for fp16 weights. This is important when setting amp_level to O2, and the master weights are in fp16. diff --git a/pytorch_lightning/plugins/native_amp.py b/pytorch_lightning/plugins/native_amp.py index df363d8329611..1fa8860bfcbc1 100644 --- a/pytorch_lightning/plugins/native_amp.py +++ b/pytorch_lightning/plugins/native_amp.py @@ -59,9 +59,9 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): def clip_gradients(self, optimizer: Optimizer, - grad_clip_val: Union[int, float], + grad_clip_val: float, gradient_clip_algorithm: str, - norm_type: Union[float, int]): + norm_type: float): model = self.trainer.get_model() torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_val, norm_type=norm_type) if gradient_clip_algorithm == GradClipAlgorithmType.VALUE: diff --git a/pytorch_lightning/plugins/precision_plugin.py b/pytorch_lightning/plugins/precision_plugin.py index 2d64f48855150..cd04ba199b815 100644 --- a/pytorch_lightning/plugins/precision_plugin.py +++ b/pytorch_lightning/plugins/precision_plugin.py @@ -37,7 +37,7 @@ def backward(self, closure_loss, optimizer, opt_idx, *args, **kwargs): def clip_gradients(self, optimizer: Optimizer, - grad_clip_val: Union[int, float], + grad_clip_val: float, gradient_clip_algorithm: str, - norm_type: Union[float, int]): + norm_type: float): raise NotImplementedError diff --git a/pytorch_lightning/plugins/sharded_native_amp_plugin.py b/pytorch_lightning/plugins/sharded_native_amp_plugin.py index 25b99ab9363c1..dbeb0a1f8efd7 100644 --- a/pytorch_lightning/plugins/sharded_native_amp_plugin.py +++ b/pytorch_lightning/plugins/sharded_native_amp_plugin.py @@ -34,11 +34,13 @@ def scaler(self): def clip_gradients(self, optimizer: Optimizer, - grad_clip_val: Union[int, float], + grad_clip_val: float, gradient_clip_algorithm: str, - norm_type: Union[float, int]): + norm_type: float): + 
+ optimizer = cast(OSS, optimizer) if gradient_clip_algorithm == GradClipAlgorithmType.VALUE: raise NotImplementedError("Value grad clipping with sharded ddp is not implemented yet") + # optimizer.clip_grad_value(grad_clip_val) elif gradient_clip_algorithm == GradClipAlgorithmType.NORM: - optimizer = cast(OSS, optimizer) optimizer.clip_grad_norm(grad_clip_val, norm_type=norm_type) diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py index 498145c4c31c3..57e38865ec8b8 100644 --- a/pytorch_lightning/trainer/trainer.py +++ b/pytorch_lightning/trainer/trainer.py @@ -87,7 +87,7 @@ def __init__( checkpoint_callback: bool = True, callbacks: Optional[Union[List[Callback], Callback]] = None, default_root_dir: Optional[str] = None, - gradient_clip_val: float = 0, + gradient_clip_val: Union[int, float] = 0, gradient_clip_algorithm: str = 'norm', process_position: int = 0, num_nodes: int = 1, diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index 7ac7cd235f392..9edfa6ed96484 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -17,6 +17,7 @@ import shlex import subprocess import sys +from copy import deepcopy import numpy as np import pytest @@ -85,6 +86,9 @@ def test_horovod_cpu(enable_pl_optimizer, tmpdir): enable_pl_optimizer=enable_pl_optimizer, ) _run_horovod(trainer_options) + trainer_options_clip_grad_val = deepcopy(trainer_options) + trainer_options_clip_grad_val.update({'gradient_clip_algorithm': 'value'}) + _run_horovod(trainer_options_clip_grad_val, on_gpu=True) @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") @@ -103,6 +107,9 @@ def test_horovod_cpu_implicit(enable_pl_optimizer, tmpdir): enable_pl_optimizer=enable_pl_optimizer, ) _run_horovod(trainer_options) + trainer_options_clip_grad_val = deepcopy(trainer_options) + trainer_options_clip_grad_val.update({'gradient_clip_algorithm': 'value'}) + _run_horovod(trainer_options_clip_grad_val, on_gpu=True) @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") @@ -123,6 +130,9 @@ def test_horovod_multi_gpu(tmpdir): accelerator='horovod', ) _run_horovod(trainer_options, on_gpu=True) + trainer_options_clip_grad_val = deepcopy(trainer_options) + trainer_options_clip_grad_val.update({'gradient_clip_algorithm': 'value'}) + _run_horovod(trainer_options_clip_grad_val, on_gpu=True) @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") @@ -146,6 +156,9 @@ def test_horovod_apex(tmpdir): precision=16, ) _run_horovod(trainer_options, on_gpu=True) + trainer_options_clip_grad_val = deepcopy(trainer_options) + trainer_options_clip_grad_val.update({'gradient_clip_algorithm': 'value'}) + _run_horovod(trainer_options_clip_grad_val, on_gpu=True) @pytest.mark.skip(reason="Skip till Horovod fixes integration with Native torch.cuda.amp") @@ -170,6 +183,9 @@ def test_horovod_amp(tmpdir): precision=16, ) _run_horovod(trainer_options, on_gpu=True) + trainer_options_clip_grad_val = deepcopy(trainer_options) + trainer_options_clip_grad_val.update({'gradient_clip_algorithm': 'value'}) + _run_horovod(trainer_options_clip_grad_val, on_gpu=True) @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") diff --git a/tests/models/test_tpu.py b/tests/models/test_tpu.py index 5e977eed765d0..61fcadf7b28da 100644 --- a/tests/models/test_tpu.py +++ b/tests/models/test_tpu.py @@ -197,6 +197,25 @@ def 
test_tpu_grad_norm(tmpdir): tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) +@pytest.mark.skipif(not _TPU_AVAILABLE, reason="test requires TPU machine") +@pl_multi_process_test +def test_tpu_clip_grad_by_value(tmpdir): + """Test if clip_gradients by value works on TPU.""" + trainer_options = dict( + default_root_dir=tmpdir, + progress_bar_refresh_rate=0, + max_epochs=1, + tpu_cores=1, + limit_train_batches=0.4, + limit_val_batches=0.4, + gradient_clip_val=0.1, + gradient_clip_algorithm='value' + ) + + model = EvalModelTemplate() + tpipes.run_model_test(trainer_options, model, on_gpu=False, with_hpc=False) + + @pytest.mark.skipif(not _TPU_AVAILABLE, reason="test requires TPU machine") @pl_multi_process_test def test_dataloaders_passed_to_fit(tmpdir): diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 97785d9e61a86..f66a3c119e256 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -962,6 +962,44 @@ def training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hidde trainer.fit(model) +def test_gradient_clipping_by_value(tmpdir): + """ + Test gradient clipping by value + """ + tutils.reset_seed() + + model = EvalModelTemplate() + + grad_clip_val = 0.001 + trainer = Trainer( + max_steps=10, + max_epochs=1, + gradient_clip_val=grad_clip_val, + gradient_clip_algorithm='value', + default_root_dir=tmpdir, + ) + + trainer.train_loop.old_training_step_and_backward = trainer.train_loop.training_step_and_backward + + def training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens): + """ + wrap the forward step in a closure so second order methods work + """ + # test that gradient is clipped correctly + ret_val = trainer.train_loop.old_training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens) + parameters = model.parameters() + grad_max = torch.max(torch.stack([p.grad.detach() for p in parameters]).abs()) + assert grad_max.item() <= grad_clip_val + + return ret_val + + trainer.train_loop.training_step_and_backward = training_step_and_backward + # for the test + model.prev_called_batch_idx = 0 + + trainer.fit(model) + + @pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") @pytest.mark.skipif(not _NATIVE_AMP_AVAILABLE, reason="test requires native AMP.") def test_gradient_clipping_fp16(tmpdir): @@ -1001,6 +1039,46 @@ def training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hidde trainer.fit(model) +@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine") +@pytest.mark.skipif(not _NATIVE_AMP_AVAILABLE, reason="test requires native AMP.") +def test_gradient_clipping_by_value_fp16(tmpdir): + """ + Test gradient clipping by value with fp16 + """ + tutils.reset_seed() + + model = EvalModelTemplate() + grad_clip_val = 0.001 + trainer = Trainer( + max_steps=10, + max_epochs=1, + precision=16, + gpus=1, + gradient_clip_val=grad_clip_val, + gradient_clip_algorithm='value', + default_root_dir=tmpdir, + ) + + trainer.train_loop.old_training_step_and_backward = trainer.train_loop.training_step_and_backward + + def training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens): + """ + wrap the forward step in a closure so second order methods work + """ + # test that gradient is clipped correctly + ret_val = trainer.train_loop.old_training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens) + parameters = model.parameters() + grad_max = 
torch.max(torch.stack([p.grad.detach() for p in parameters])) + assert grad_max.item() <= grad_clip_val + + return ret_val + + trainer.train_loop.training_step_and_backward = training_step_and_backward + model.prev_called_batch_idx = 0 + + trainer.fit(model) + + def test_gpu_choice(tmpdir): trainer_options = dict( default_root_dir=tmpdir, From 2403f96b773a05c050e8275e5bb2f360a9812194 Mon Sep 17 00:00:00 2001 From: "dong.hyun" Date: Wed, 20 Jan 2021 14:08:44 +0900 Subject: [PATCH 11/16] pep8 formatting --- pytorch_lightning/plugins/apex.py | 2 +- pytorch_lightning/plugins/native_amp.py | 1 - pytorch_lightning/plugins/precision_plugin.py | 1 - pytorch_lightning/plugins/sharded_native_amp_plugin.py | 2 +- 4 files changed, 2 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/plugins/apex.py b/pytorch_lightning/plugins/apex.py index 9f1c65afd4bc5..8d3d065dac0dd 100644 --- a/pytorch_lightning/plugins/apex.py +++ b/pytorch_lightning/plugins/apex.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import List, Tuple, Union +from typing import List, Tuple import torch from torch.optim.optimizer import Optimizer diff --git a/pytorch_lightning/plugins/native_amp.py b/pytorch_lightning/plugins/native_amp.py index 1fa8860bfcbc1..41fe80d6199d0 100644 --- a/pytorch_lightning/plugins/native_amp.py +++ b/pytorch_lightning/plugins/native_amp.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Union import torch from torch.optim import Optimizer diff --git a/pytorch_lightning/plugins/precision_plugin.py b/pytorch_lightning/plugins/precision_plugin.py index cd04ba199b815..93cdaa889b1d1 100644 --- a/pytorch_lightning/plugins/precision_plugin.py +++ b/pytorch_lightning/plugins/precision_plugin.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Union from torch.optim import Optimizer diff --git a/pytorch_lightning/plugins/sharded_native_amp_plugin.py b/pytorch_lightning/plugins/sharded_native_amp_plugin.py index dbeb0a1f8efd7..78d6c88cebea7 100644 --- a/pytorch_lightning/plugins/sharded_native_amp_plugin.py +++ b/pytorch_lightning/plugins/sharded_native_amp_plugin.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Union, cast +from typing import cast from torch.optim import Optimizer From 79d41491edbc6ef62d1142bcc52525b91ad18f23 Mon Sep 17 00:00:00 2001 From: "dong.hyun" Date: Wed, 20 Jan 2021 14:16:39 +0900 Subject: [PATCH 12/16] update test codes --- tests/models/test_horovod.py | 14 ++++++++++++-- tests/trainer/test_trainer.py | 3 ++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py index 9edfa6ed96484..945bf94489e6e 100644 --- a/tests/models/test_horovod.py +++ b/tests/models/test_horovod.py @@ -86,9 +86,11 @@ def test_horovod_cpu(enable_pl_optimizer, tmpdir): enable_pl_optimizer=enable_pl_optimizer, ) _run_horovod(trainer_options) + + # clip_grad_by_value test trainer_options_clip_grad_val = deepcopy(trainer_options) trainer_options_clip_grad_val.update({'gradient_clip_algorithm': 'value'}) - _run_horovod(trainer_options_clip_grad_val, on_gpu=True) + _run_horovod(trainer_options_clip_grad_val) @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") @@ -107,9 +109,11 @@ def test_horovod_cpu_implicit(enable_pl_optimizer, tmpdir): enable_pl_optimizer=enable_pl_optimizer, ) _run_horovod(trainer_options) + + # clip_grad_by_value test trainer_options_clip_grad_val = deepcopy(trainer_options) trainer_options_clip_grad_val.update({'gradient_clip_algorithm': 'value'}) - _run_horovod(trainer_options_clip_grad_val, on_gpu=True) + _run_horovod(trainer_options_clip_grad_val) @pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows") @@ -130,6 +134,8 @@ def test_horovod_multi_gpu(tmpdir): accelerator='horovod', ) _run_horovod(trainer_options, on_gpu=True) + + # clip_grad_by_value test trainer_options_clip_grad_val = deepcopy(trainer_options) trainer_options_clip_grad_val.update({'gradient_clip_algorithm': 'value'}) _run_horovod(trainer_options_clip_grad_val, on_gpu=True) @@ -156,6 +162,8 @@ def test_horovod_apex(tmpdir): precision=16, ) _run_horovod(trainer_options, on_gpu=True) + + # clip_grad_by_value test trainer_options_clip_grad_val = deepcopy(trainer_options) trainer_options_clip_grad_val.update({'gradient_clip_algorithm': 'value'}) _run_horovod(trainer_options_clip_grad_val, on_gpu=True) @@ -183,6 +191,8 @@ def test_horovod_amp(tmpdir): precision=16, ) _run_horovod(trainer_options, on_gpu=True) + + # clip_grad_by_value test trainer_options_clip_grad_val = deepcopy(trainer_options) trainer_options_clip_grad_val.update({'gradient_clip_algorithm': 'value'}) _run_horovod(trainer_options_clip_grad_val, on_gpu=True) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index f66a3c119e256..f48e55f72520f 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -988,7 +988,8 @@ def training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hidde # test that gradient is clipped correctly ret_val = trainer.train_loop.old_training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens) parameters = model.parameters() - grad_max = torch.max(torch.stack([p.grad.detach() for p in parameters]).abs()) + grad_max_list = [torch.max(p.grad.detach().abs()) for p in parameters] + grad_max = torch.max(torch.stack(grad_max_list)) assert grad_max.item() <= grad_clip_val return ret_val From 9fc6f62893a6b3d468abadb1c9c6f28041413854 Mon Sep 17 00:00:00 2001 From: "dong.hyun" Date: Wed, 20 Jan 2021 15:33:35 +0900 Subject: [PATCH 13/16] update test codes --- 
tests/trainer/test_trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index f48e55f72520f..b1bfeada38919 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -990,7 +990,7 @@ def training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hidde parameters = model.parameters() grad_max_list = [torch.max(p.grad.detach().abs()) for p in parameters] grad_max = torch.max(torch.stack(grad_max_list)) - assert grad_max.item() <= grad_clip_val + assert grad_max.item() <= grad_clip_val, f"Gradient max value {grad_max} > grad_clip_val {grad_clip_val} ." return ret_val @@ -1070,7 +1070,7 @@ def training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hidde ret_val = trainer.train_loop.old_training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens) parameters = model.parameters() grad_max = torch.max(torch.stack([p.grad.detach() for p in parameters])) - assert grad_max.item() <= grad_clip_val + assert grad_max.item() <= grad_clip_val, f"Gradient max value {grad_max} > grad_clip_val {grad_clip_val} ." return ret_val From 853fb095a0fd69d0c5b392e138b18902205c8320 Mon Sep 17 00:00:00 2001 From: "dong.hyun" Date: Wed, 20 Jan 2021 15:57:04 +0900 Subject: [PATCH 14/16] update test codes --- tests/trainer/test_trainer.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index b1bfeada38919..44f5467ffb08e 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -970,7 +970,7 @@ def test_gradient_clipping_by_value(tmpdir): model = EvalModelTemplate() - grad_clip_val = 0.001 + grad_clip_val = 0.0001 trainer = Trainer( max_steps=10, max_epochs=1, @@ -990,7 +990,8 @@ def training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hidde parameters = model.parameters() grad_max_list = [torch.max(p.grad.detach().abs()) for p in parameters] grad_max = torch.max(torch.stack(grad_max_list)) - assert grad_max.item() <= grad_clip_val, f"Gradient max value {grad_max} > grad_clip_val {grad_clip_val} ." + assert round(grad_max.item(), 6) <= grad_clip_val, \ + f"Gradient max value {grad_max} > grad_clip_val {grad_clip_val} ." return ret_val @@ -1049,7 +1050,7 @@ def test_gradient_clipping_by_value_fp16(tmpdir): tutils.reset_seed() model = EvalModelTemplate() - grad_clip_val = 0.001 + grad_clip_val = 0.0001 trainer = Trainer( max_steps=10, max_epochs=1, @@ -1070,7 +1071,8 @@ def training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hidde ret_val = trainer.train_loop.old_training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens) parameters = model.parameters() grad_max = torch.max(torch.stack([p.grad.detach() for p in parameters])) - assert grad_max.item() <= grad_clip_val, f"Gradient max value {grad_max} > grad_clip_val {grad_clip_val} ." + assert round(grad_max.item(), 6) <= grad_clip_val, \ + f"Gradient max value {grad_max} > grad_clip_val {grad_clip_val} ." 
return ret_val From 257bcb6974f09d0e3d50c24898e6c50fa32b6d5e Mon Sep 17 00:00:00 2001 From: "dong.hyun" Date: Fri, 22 Jan 2021 12:10:40 +0900 Subject: [PATCH 15/16] add value clipping for sharded ddp --- benchmarks/test_sharded_parity.py | 31 +++++++++++++++++++ .../plugins/sharded_native_amp_plugin.py | 7 +++-- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/benchmarks/test_sharded_parity.py b/benchmarks/test_sharded_parity.py index 05fde8e11523a..67d00a1637044 100644 --- a/benchmarks/test_sharded_parity.py +++ b/benchmarks/test_sharded_parity.py @@ -162,6 +162,31 @@ def test_ddp_sharded_plugin_correctness_multi_gpu_multi_optim_manual(tmpdir): ) +@pytest.mark.skipif(not _FAIRSCALE_AVAILABLE, reason="Fairscale is not available") +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine") +@pytest.mark.skipif(not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1', + reason="test should be run outside of pytest") +@DDPLauncher.run("--accelerator ddp --gpus 2 --precision 16") +def test_ddp_sharded_plugin_clip_gradients(tmpdir, args=None): + plugin_parity_test( + gpus=args.gpus, + precision=args.precision, + accelerator=args.accelerator, + plugin=DDPShardedPlugin(), + model_cls=SeedTrainLoaderModel, + gradient_clip_val=0.001, + ) + plugin_parity_test( + gpus=args.gpus, + precision=args.precision, + accelerator=args.accelerator, + plugin=DDPShardedPlugin(), + model_cls=SeedTrainLoaderModel, + gradient_clip_val=0.001, + gradient_clip_algorithm='value', + ) + + class SeedTrainLoaderModel(BoringModel): """ Overrides training loader to ensure we enforce the same seed for all DDP processes. @@ -261,6 +286,8 @@ def plugin_parity_test( gpus: int = 0, precision: int = 32, max_percent_speed_diff: float = 0.1, + gradient_clip_val: Union[int, float] = 0, + gradient_clip_algorithm: str = 'norm', ): """ Ensures that the trained model is identical to the standard DDP implementation. @@ -274,6 +301,8 @@ def plugin_parity_test( gpus: Number of GPUS to enable. precision: Whether to use AMP or normal FP32 training. max_percent_speed_diff: The maximum speed difference compared to normal DDP training. + gradient_clip_val: 0 means don't clip. + gradient_clip_algorithm: 'value' means clip_by_value, 'norm' means clip_by_norm. defualt 'norm' This is more a safety net for variability in CI which can vary in speed, not for benchmarking. """ @@ -308,6 +337,8 @@ def plugin_parity_test( precision=precision, accelerator=accelerator, plugins=[plugin], + gradient_clip_val=gradient_clip_val, + gradient_clip_algorithm=gradient_clip_algorithm, ) max_memory_custom, custom_model_time = record_ddp_fit_model_stats( diff --git a/pytorch_lightning/plugins/sharded_native_amp_plugin.py b/pytorch_lightning/plugins/sharded_native_amp_plugin.py index 78d6c88cebea7..8396421b442e1 100644 --- a/pytorch_lightning/plugins/sharded_native_amp_plugin.py +++ b/pytorch_lightning/plugins/sharded_native_amp_plugin.py @@ -13,6 +13,7 @@ # limitations under the License. 
from typing import cast +import torch from torch.optim import Optimizer from pytorch_lightning.plugins.native_amp import NativeAMPPlugin @@ -38,9 +39,9 @@ def clip_gradients(self, gradient_clip_algorithm: str, norm_type: float): - optimizer = cast(OSS, optimizer) if gradient_clip_algorithm == GradClipAlgorithmType.VALUE: - raise NotImplementedError("Value grad clipping with sharded ddp is not implemented yet") - # optimizer.clip_grad_value(grad_clip_val) + model = self.trainer.get_model() + torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=grad_clip_val) elif gradient_clip_algorithm == GradClipAlgorithmType.NORM: + optimizer = cast(OSS, optimizer) optimizer.clip_grad_norm(grad_clip_val, norm_type=norm_type) From dcf9ff0b2bd30d13b8221d22430b5768b59b6b2d Mon Sep 17 00:00:00 2001 From: Anthony Kim Date: Sat, 30 Jan 2021 01:27:18 +0900 Subject: [PATCH 16/16] remove bad line in native_amp.py bugfix --- pytorch_lightning/plugins/legacy/native_amp.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pytorch_lightning/plugins/legacy/native_amp.py b/pytorch_lightning/plugins/legacy/native_amp.py index b08135fa264c1..941042d9bc4ad 100644 --- a/pytorch_lightning/plugins/legacy/native_amp.py +++ b/pytorch_lightning/plugins/legacy/native_amp.py @@ -66,7 +66,6 @@ def clip_gradients(self, gradient_clip_algorithm: str, norm_type: float): model = self.trainer.get_model() - torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip_val, norm_type=norm_type) if gradient_clip_algorithm == GradClipAlgorithmType.VALUE: torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=grad_clip_val) elif gradient_clip_algorithm == GradClipAlgorithmType.NORM:
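
Usage sketch (not taken from the patches above, added for reference): with this series applied,
the clipping behaviour is selected through the new gradient_clip_algorithm Trainer argument,
alongside the existing gradient_clip_val, for example:

    # assumes a Lightning build that includes this patch series
    from pytorch_lightning import Trainer

    trainer = Trainer(gradient_clip_val=0.5, gradient_clip_algorithm='value')

Under the hood, the accelerators and precision plugins dispatch between the two stock torch
primitives shown in the diffs above; a minimal, self-contained sketch (plain torch, not
Lightning code):

    import torch

    # a toy parameter list with populated gradients
    params = [torch.nn.Parameter(torch.randn(3, 3))]
    (params[0] ** 2).sum().backward()

    # gradient_clip_algorithm='value': clamp each gradient element into [-0.5, 0.5] in place
    torch.nn.utils.clip_grad_value_(params, clip_value=0.5)

    # gradient_clip_algorithm='norm' (the default): rescale gradients so their total p-norm
    # does not exceed the clip value
    torch.nn.utils.clip_grad_norm_(params, max_norm=0.5, norm_type=2.0)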