Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Removed legacy code to include `step` dictionary returns in `callback_metrics`. Use `self.log_dict` instead. ([#6682](https://github.com/PyTorchLightning/pytorch-lightning/pull/6682))


- Removed legacy code to log or include metrics in the progress bar by returning them in a dict with the `"log"/"progress_bar"` magic keys. Use `self.log` instead. ([#6734](https://github.com/PyTorchLightning/pytorch-lightning/pull/6734))


- Removed `optimizer_idx` argument from `training_step` in manual optimization ([#6093](https://github.com/PyTorchLightning/pytorch-lightning/pull/6093))


Expand Down
9 changes: 2 additions & 7 deletions docs/source/ecosystem/asr_nlp_tts.rst
Original file line number Diff line number Diff line change
Expand Up @@ -751,13 +751,8 @@ be customized with PyTorch Lightning since every NeMo model is a LightningModule

l_mle, l_length, logdet, loss, _ = self.step(y, y_lengths, x, x_lengths)

output = {
"loss": loss, # required
"progress_bar": {"l_mle": l_mle, "l_length": l_length, "logdet": logdet},
"log": {"loss": loss, "l_mle": l_mle, "l_length": l_length, "logdet": logdet},
}

return output
self.log_dict({"l_mle": l_mle, "l_length": l_length, "logdet": logdet}, prog_bar=True)
return loss
...

Neural Types in NeMo TTS
Expand Down
4 changes: 0 additions & 4 deletions pytorch_lightning/core/step_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -526,10 +526,6 @@ def reduce_across_time(cls, time_outputs):
# auto-reduce across time for tbptt
meta = time_outputs[0]['meta']

# in 1.0 the results have 'extra'. Once we deprecate 0.10.0 we may not need this
if 'extra' in time_outputs[0]:
[x.pop('extra', None) for x in time_outputs]

result = cls()
result = recursive_gather(time_outputs, result)
recursive_stack(result)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -394,12 +394,10 @@ def run_batch_from_func_name(self, func_name) -> Dict:

def get_latest_batch_log_metrics(self) -> Dict:
    """Return the latest batch log metrics merged with the legacy ones.

    The dict produced by ``run_batch_from_func_name("get_batch_log_metrics")``
    is updated in place with ``self.legacy_batch_log_metrics`` and returned.
    """
    latest = self.run_batch_from_func_name("get_batch_log_metrics")
    # legacy entries overwrite cached entries that share the same key
    latest.update(self.legacy_batch_log_metrics)
    return latest

def get_latest_batch_pbar_metrics(self) -> Dict:
    """Return the latest batch progress-bar metrics merged with the legacy ones.

    The dict produced by ``run_batch_from_func_name("get_batch_pbar_metrics")``
    is updated in place with ``self.legacy_batch_pbar_metrics`` and returned.
    """
    latest = self.run_batch_from_func_name("get_batch_pbar_metrics")
    # legacy entries overwrite cached entries that share the same key
    latest.update(self.legacy_batch_pbar_metrics)
    return latest

@property
Expand Down Expand Up @@ -451,8 +449,6 @@ def reset(self):
self._opt_idx: Optional[int] = None
self._batch_size: Optional[int] = None
self._has_batch_loop_finished = False
self.legacy_batch_log_metrics = {}
self.legacy_batch_pbar_metrics = {}

def __call__(
self,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -191,10 +191,7 @@ def cache_training_step_metrics(self, opt_closure_result):
self.add_progress_bar_metrics(pbar_metrics_tmp)

self._callback_metrics.update(callback_metrics_tmp)

# save legacy log metrics
self._logged_metrics.update(logged_metrics_tmp)
self.cached_results.legacy_batch_log_metrics.update(logged_metrics_tmp)

def log_metrics(self, metrics, grad_norm_dic, step=None):
"""Logs the metric dict passed in.
Expand Down
135 changes: 0 additions & 135 deletions pytorch_lightning/trainer/logging.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,25 +12,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import inspect
from abc import ABC
from collections import Mapping

import torch

from pytorch_lightning.utilities import DistributedType
from pytorch_lightning.utilities.distributed import rank_zero_warn
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities.memory import recursive_detach


class TrainerLoggingMixin(ABC):

# this is just a summary on variables used in this abstract class,
# the proper values/initialisation should be done in child class
_distrib_type: DistributedType
num_gpus: int

def metrics_to_scalars(self, metrics):
new_metrics = {}
# TODO: this is duplicated in MetricsHolder. should be unified
Expand All @@ -49,128 +39,3 @@ def metrics_to_scalars(self, metrics):
new_metrics[k] = v

return new_metrics

def process_dict_result(self, output, train=False):
    """Reduces output according to the training mode.

    Separates loss from logging and progress bar metrics

    Args:
        output: value returned by a ``xx_step`` hook: either a ``torch.Tensor``
            or a dict-like object that may carry the ``'loss'``, ``'log'``,
            ``'progress_bar'`` and ``'hiddens'`` keys.
        train: when ``True``, a ``'loss'`` entry is required and the loss and
            metrics are passed through ``reduce_distributed_output`` if the
            distributed type is DP or DDP2.

    Returns:
        The tuple ``(loss, progress_bar_metrics, log_metrics, hiddens)``.
        A tensor ``output`` is returned as ``(output, {}, {}, None)``.
        ``loss`` is ``None`` when ``train`` is ``False``.

    Raises:
        RuntimeError: if ``train`` is ``True`` and ``output['loss']`` cannot
            be read.
    """
    # `collections.Mapping` was removed in Python 3.10; the ABC lives in `collections.abc`
    from collections.abc import Mapping

    # --------------------
    # WARN DEPRECATED KEYS
    # --------------------
    # TODO: 1.0.0 remove
    if isinstance(output, dict):
        for k in output:
            if k in ['log', 'progress_bar']:
                m = inspect.cleandoc(
                    f"The {{{k}:dict keyword}} was deprecated in 0.9.1 and will be removed in 1.0.0\n"
                    " Please use self.log(...) inside the lightningModule instead.\n"
                    " # log on a step or aggregate epoch metric to the logger and/or progress bar"
                    " (inside LightningModule)\n"
                    " self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True)"
                )
                rank_zero_warn(m)

    # --------------------------
    # handle single scalar only
    # --------------------------
    # single scalar returned from a xx_step
    if isinstance(output, torch.Tensor):
        return output, {}, {}, None

    # ---------------
    # EXTRACT PROGRESS BAR AND LOGGING KEYS
    # ---------------
    progress_bar_metrics = self._extract_reduced_metrics(output, 'progress_bar', train)
    log_metrics = self._extract_reduced_metrics(output, 'log', train)

    # ---------------
    # EXTRACT LOSS
    # ---------------
    loss = None
    if train:
        try:
            loss = output['loss']
        # todo: specify the possible exception
        except Exception as exp:
            # tensor outputs already returned above, so a failed lookup is always an error
            raise RuntimeError(
                'No `loss` value in the dictionary returned from `model.training_step()`.'
            ) from exp

        # when using dp need to reduce the loss
        if self._distrib_type in (DistributedType.DP, DistributedType.DDP2):
            loss = self.reduce_distributed_output(loss, self.num_gpus)

    # ---------------
    # EXTRACT HIDDEN
    # ---------------
    hiddens = output.get('hiddens', None) if isinstance(output, Mapping) else None
    if hiddens is not None:
        hiddens = hiddens.detach()

    # detach all metrics for callbacks to prevent memory leaks
    # no .item() because it will slow things down
    progress_bar_metrics = recursive_detach(progress_bar_metrics)
    log_metrics = recursive_detach(log_metrics)

    return loss, progress_bar_metrics, log_metrics, hiddens

def _extract_reduced_metrics(self, output, key, train):
    """Return ``output[key]``, reduced under DP/DDP2 when training, or ``{}`` on any failure."""
    try:
        metrics = output[key]

        # reduce the metrics when using dp
        if train and self._distrib_type in (DistributedType.DP, DistributedType.DDP2):
            metrics = self.reduce_distributed_output(metrics, self.num_gpus)
    # todo: specify the possible exception
    except Exception:
        # deliberate best-effort: a missing key or a failed reduction yields no metrics
        return {}
    return metrics

def reduce_distributed_output(self, output, num_gpus):
    """Average the per-GPU outputs produced when running on several GPUs.

    Args:
        output: a ``torch.Tensor`` or a (possibly nested) dict of values.
            A dict is modified in place.
        num_gpus: number of GPUs in use; with ``num_gpus <= 1`` the output is
            returned untouched.

    Returns:
        The mean of ``output`` if it is a tensor, otherwise the same dict with
        its values reduced: nested dicts are reduced recursively, non-empty
        lists are averaged, and tensors whose first dimension is at most
        ``num_gpus`` are replaced by their mean. 0-dim tensors, non-tensor
        leaves and empty lists are left unchanged.
    """
    if num_gpus <= 1:
        return output

    # when using DP, we get one output per gpu
    # average outputs and return
    if isinstance(output, torch.Tensor):
        return output.mean()

    # only values are reassigned, the key set is untouched, so iterating is safe
    for key, value in output.items():
        # recurse on nested dicts
        if isinstance(value, dict):
            output[key] = self.reduce_distributed_output(value, num_gpus)

        # compute the average of scalars
        elif isinstance(value, list):
            # an empty list has no average; keep it instead of dividing by zero
            if value:
                output[key] = sum(value) / len(value)

        # do nothing when there's a scalar (0-dim tensor or plain Python value);
        # non-tensor leaves have no `.size()` and used to raise AttributeError here
        elif not isinstance(value, torch.Tensor) or value.dim() == 0:
            continue

        # do not reduce metrics that have batch size > num gpus
        elif value.size(0) <= num_gpus:
            output[key] = torch.mean(value)

    return output
58 changes: 7 additions & 51 deletions pytorch_lightning/trainer/training_loop.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
from pytorch_lightning.utilities import _TPU_AVAILABLE, AMPType, DeviceType, parsing
from pytorch_lightning.utilities.distributed import rank_zero_info
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities.memory import recursive_detach
from pytorch_lightning.utilities.model_helpers import is_overridden
from pytorch_lightning.utilities.parsing import AttributeDict
from pytorch_lightning.utilities.warnings import WarningCache
Expand Down Expand Up @@ -242,12 +241,7 @@ def get_optimizers_iterable(self):
return [[opt_idx, self.trainer.optimizers[opt_idx]]]

def on_after_backward(self, training_step_output, batch_idx, untouched_loss):
is_result_obj = isinstance(training_step_output, Result)

if is_result_obj:
training_step_output = training_step_output.detach()
else:
training_step_output.batch_loss = training_step_output.batch_loss.detach()
training_step_output.detach()

# insert after step hook
self.trainer.call_hook("on_after_backward")
Expand Down Expand Up @@ -284,24 +278,16 @@ def training_step(self, split_batch, batch_idx, opt_idx, hiddens):
training_step_output_for_epoch_end, training_step_output = self._process_training_step_output(
training_step_output, split_batch
)
is_result_obj = isinstance(training_step_output, Result)

if training_step_output_for_epoch_end is None:
return None
return

# enable empty loss when using manual opt
closure_loss = None
untouched_loss = None

if self.automatic_optimization:
# accumulate loss
# (if accumulate_grad_batches = 1 no effect)
if is_result_obj:
closure_loss = training_step_output.minimize
else:
closure_loss = training_step_output.batch_loss

closure_loss = closure_loss / self.trainer.accumulate_grad_batches
# accumulate loss. if accumulate_grad_batches==1, no effect
closure_loss = training_step_output.minimize / self.trainer.accumulate_grad_batches

# the loss will get scaled for amp. avoid any modifications to it
untouched_loss = closure_loss.detach().clone()
Expand All @@ -322,35 +308,6 @@ def _process_training_step_output(self, training_step_output, split_batch):
if training_step_output_for_epoch_end is None:
return None, None

# -----------------------------------------
# process hybrid (1.0)
# -----------------------------------------
# no need for these checks in 1.0.0
# TODO: remove checks in 1.0.0
is_tensor = isinstance(training_step_output_for_epoch_end, torch.Tensor)
is_1_0_output = is_tensor or ("log" not in training_step_output and "progress_bar" not in training_step_output)
if is_1_0_output:
return self._process_training_step_output_1_0(training_step_output, split_batch)

# -----------------------------------------
# process old dict (deprecate 1.0)
# -----------------------------------------
training_step_output = self.trainer.process_dict_result(training_step_output, train=True)

training_step_output = AttributeDict(
batch_loss=training_step_output[0],
pbar_on_batch_end=training_step_output[1],
log_metrics=training_step_output[2],
)
# if the user decides to finally reduce things in epoch_end, save raw output without graphs
if isinstance(training_step_output_for_epoch_end, torch.Tensor):
training_step_output_for_epoch_end = training_step_output_for_epoch_end.detach()
else:
training_step_output_for_epoch_end = recursive_detach(training_step_output_for_epoch_end)

return training_step_output_for_epoch_end, training_step_output

def _process_training_step_output_1_0(self, training_step_output, split_batch):
result = self.trainer.lightning_module._results

loss = None
Expand All @@ -361,6 +318,8 @@ def _process_training_step_output_1_0(self, training_step_output, split_batch):
if isinstance(training_step_output, dict):
loss = training_step_output.pop("loss", None)
hiddens = training_step_output.pop("hiddens", None)
if hiddens is not None:
hiddens = hiddens.detach()
result["extra"] = training_step_output

# handle scalar return
Expand All @@ -380,10 +339,7 @@ def _process_training_step_output_1_0(self, training_step_output, split_batch):
if self.trainer.move_metrics_to_cpu:
training_step_output_for_epoch_end = training_step_output_for_epoch_end.cpu()

# what flows back into the system
training_step_output = result

return training_step_output_for_epoch_end, training_step_output
return training_step_output_for_epoch_end, result

def optimizer_step(self, optimizer, opt_idx, batch_idx, train_step_and_backward_closure):
model_ref = self.trainer.lightning_module
Expand Down
2 changes: 0 additions & 2 deletions tests/checkpointing/test_model_checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -876,7 +876,6 @@ def validation_epoch_end(self, outputs):
assert trainer.dev_debugger.checkpoint_callback_history[-1]['epoch'] == len(monitor) - 1


@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"})
def test_checkpoint_repeated_strategy(tmpdir):
"""
This test validates that the checkpoint can be called when provided to callbacks list
Expand Down Expand Up @@ -923,7 +922,6 @@ def validation_step(self, batch, batch_idx):
assert set(os.listdir(tmpdir.join("lightning_logs"))) == {f'version_{i}' for i in range(4)}


@mock.patch.dict(os.environ, {"PL_DEV_DEBUG": "1"})
def test_checkpoint_repeated_strategy_extended(tmpdir):
"""
This test validates checkpoint can be called several times without
Expand Down
Loading