diff --git a/pytorch_lightning/trainer/trainer.py b/pytorch_lightning/trainer/trainer.py
index 6ed5d6c31f719..072daf9dffcb6 100644
--- a/pytorch_lightning/trainer/trainer.py
+++ b/pytorch_lightning/trainer/trainer.py
@@ -203,8 +203,10 @@ def __init__(
                     and will be removed in v1.7.0. Please use the ``strategy`` argument instead.
 
             accumulate_grad_batches: Accumulates grads every k batches or as set up in the dict.
+                Default: ``None``.
 
             amp_backend: The mixed precision backend to use ("native" or "apex").
+                Default: ``'native'``.
 
             amp_level: The optimization level to use (O1, O2, etc...). By default it will be set to "O2"
                 if ``amp_backend`` is set to "apex".
@@ -213,23 +215,29 @@ def __init__(
                 trying to optimize initial learning for faster convergence. trainer.tune() method will
                 set the suggested learning rate in self.lr or self.learning_rate in the LightningModule.
                 To use a different key set a string instead of True with the key name.
+                Default: ``False``.
 
             auto_scale_batch_size: If set to True, will `initially` run a batch size finder
                 trying to find the largest batch size that fits into memory.
                 The result will be stored in self.batch_size in the LightningModule.
                 Additionally, can be set to either `power` that estimates the batch size through
                 a power search or `binsearch` that estimates the batch size through a binary search.
+                Default: ``False``.
 
             auto_select_gpus: If enabled and ``gpus`` is an integer, pick available
                 gpus automatically. This is especially useful when
                 GPUs are configured to be in "exclusive mode", such that
                 only one process at a time can access them.
+                Default: ``False``.
 
-            benchmark: If true enables cudnn.benchmark.
+            benchmark: If ``True``, enables cudnn.benchmark.
+                Default: ``False``.
 
             callbacks: Add a callback or list of callbacks.
+                Default: ``None``.
 
             checkpoint_callback: If ``True``, enable checkpointing.
+                Default: ``None``.
 
                 .. deprecated:: v1.5
                     ``checkpoint_callback`` has been deprecated in v1.5 and will be removed in v1.7.
@@ -238,14 +246,18 @@ def __init__(
             enable_checkpointing: If ``True``, enable checkpointing.
                 It will configure a default ModelCheckpoint callback if there is no user-defined ModelCheckpoint in
                 :paramref:`~pytorch_lightning.trainer.trainer.Trainer.callbacks`.
+                Default: ``True``.
 
             check_val_every_n_epoch: Check val every n train epochs.
+                Default: ``1``.
+
             default_root_dir: Default path for logs and weights when no logger/ckpt_callback passed.
                 Default: ``os.getcwd()``.
                 Can be remote file paths such as `s3://mybucket/path` or 'hdfs://path/'
 
             detect_anomaly: Enable anomaly detection for the autograd engine.
+                Default: ``False``.
 
             deterministic: If ``True``, sets whether PyTorch operations must use deterministic algorithms.
                 Default: ``False``.
@@ -255,6 +267,7 @@ def __init__(
             fast_dev_run: Runs n if set to ``n`` (int) else 1 if set to ``True`` batch(es)
                 of train, val and test to find any bugs (ie: a sort of unit test).
+                Default: ``False``.
 
             flush_logs_every_n_steps: How often to flush logs to disk (defaults to every 100 steps).
@@ -263,27 +276,34 @@ def __init__(
                     Please configure flushing directly in the logger instead.
 
             gpus: Number of GPUs to train on (int) or which GPUs to train on (list or str) applied per node
+                Default: ``None``.
 
             gradient_clip_val: The value at which to clip gradients. Passing ``gradient_clip_val=None`` disables
                 gradient clipping. If using Automatic Mixed Precision (AMP), the gradients will be unscaled before.
+                Default: ``None``.
 
             gradient_clip_algorithm: The gradient clipping algorithm to use.
                 Pass ``gradient_clip_algorithm="value"`` to clip by value, and ``gradient_clip_algorithm="norm"``
                 to clip by norm. By default it will be set to ``"norm"``.
 
             limit_train_batches: How much of training dataset to check (float = fraction, int = num_batches).
+                Default: ``1.0``.
 
             limit_val_batches: How much of validation dataset to check (float = fraction, int = num_batches).
+                Default: ``1.0``.
 
             limit_test_batches: How much of test dataset to check (float = fraction, int = num_batches).
+                Default: ``1.0``.
 
             limit_predict_batches: How much of prediction dataset to check (float = fraction, int = num_batches).
+                Default: ``1.0``.
 
             logger: Logger (or iterable collection of loggers) for experiment tracking. A ``True`` value uses
                 the default ``TensorBoardLogger``. ``False`` will disable logging. If multiple loggers are
                 provided and the `save_dir` property of that logger is not set, local files (checkpoints,
                 profiler traces, etc.) are saved in ``default_root_dir`` rather than in the ``log_dir`` of any
                 of the individual loggers.
+                Default: ``True``.
 
             log_gpu_memory: None, 'min_max', 'all'. Might slow performance.
@@ -291,7 +311,8 @@ def __init__(
                     Deprecated in v1.5.0 and will be removed in v1.7.0
                     Please use the ``DeviceStatsMonitor`` callback directly instead.
 
-            log_every_n_steps: How often to log within steps (defaults to every 50 steps).
+            log_every_n_steps: How often to log within steps.
+                Default: ``50``.
 
             prepare_data_per_node: If True, each LOCAL_RANK=0 will call prepare data.
                 Otherwise only NODE_RANK=0, LOCAL_RANK=0 will prepare data
@@ -319,15 +340,20 @@ def __init__(
                     pass ``enable_progress_bar = False`` to the Trainer.
 
             enable_progress_bar: Whether to enable to progress bar by default.
+                Default: ``True``.
 
             profiler: To profile individual steps during training and assist in identifying bottlenecks.
+                Default: ``None``.
 
             overfit_batches: Overfit a fraction of training data (float) or a set number of batches (int).
+                Default: ``0.0``.
 
             plugins: Plugins allow modification of core behavior like ddp and amp, and enable custom lightning plugins.
+                Default: ``None``.
 
             precision: Double precision (64), full precision (32), half precision (16) or bfloat16 precision (bf16).
                 Can be used on CPU, GPU, TPUs or IPUs.
+                Default: ``32``.
 
             max_epochs: Stop training once this number of epochs is reached. Disabled by default (None).
                 If both max_epochs and max_steps are not specified, defaults to ``max_epochs = 1000``.
@@ -339,21 +365,25 @@ def __init__(
                 and ``max_epochs = None``, will default to ``max_epochs = 1000``.
                 To enable infinite training, set ``max_epochs`` to ``-1``.
 
-            min_steps: Force training for at least these number of steps. Disabled by default (None).
+            min_steps: Force training for at least this number of steps. Disabled by default (``None``).
 
-            max_time: Stop training after this amount of time has passed. Disabled by default (None).
+            max_time: Stop training after this amount of time has passed. Disabled by default (``None``).
                 The time duration can be specified in the format DD:HH:MM:SS
                 (days, hours, minutes seconds), as a :class:`datetime.timedelta`,
                 or a dictionary with keys that will be passed to :class:`datetime.timedelta`.
 
             num_nodes: Number of GPU nodes for distributed training.
+                Default: ``1``.
 
             num_processes: Number of processes for distributed training with ``accelerator="cpu"``.
+                Default: ``1``.
 
             num_sanity_val_steps: Sanity check runs n validation batches before starting the training routine.
                 Set it to `-1` to run all batches in all validation dataloaders.
+                Default: ``2``.
 
             reload_dataloaders_every_n_epochs: Set to a non-negative integer to reload dataloaders every n epochs.
+                Default: ``0``.
 
             replace_sampler_ddp: Explicitly enables or disables sampler replacement. If not specified this
                 will toggled automatically when DDP is used. By default it will add ``shuffle=True`` for
@@ -370,8 +400,10 @@ def __init__(
             strategy: Supports different training strategies with aliases
                 as well custom training type plugins.
+                Default: ``None``.
 
             sync_batchnorm: Synchronize batch norm layers between process groups/whole world.
+                Default: ``False``.
 
             terminate_on_nan: If set to True, will terminate training (by raising a `ValueError`) at the
                 end of each training batch, if any of the parameters or the loss are NaN or +/-inf.
@@ -381,18 +413,24 @@ def __init__(
                     Please use ``detect_anomaly`` instead.
 
             detect_anomaly: Enable anomaly detection for the autograd engine.
+                Default: ``False``.
 
-            tpu_cores: How many TPU cores to train on (1 or 8) / Single TPU to train on [1]
+            tpu_cores: How many TPU cores to train on (1 or 8) / Single TPU to train on (1)
+                Default: ``None``.
 
             ipus: How many IPUs to train on.
+                Default: ``None``.
 
             track_grad_norm: -1 no tracking. Otherwise tracks that p-norm. May be set to 'inf' infinity-norm. If using
                 Automatic Mixed Precision (AMP), the gradients will be unscaled before logging them.
+                Default: ``-1``.
 
             val_check_interval: How often to check the validation set. Use float to check within a training epoch,
                 use int to check every n steps (batches).
+                Default: ``1.0``.
 
             enable_model_summary: Whether to enable model summarization by default.
+                Default: ``True``.
 
             weights_summary: Prints a summary of the weights when training begins.
@@ -410,14 +448,17 @@ def __init__(
             move_metrics_to_cpu: Whether to force internal logged metrics to be moved to cpu.
                 This can save some gpu memory, but can make training slower. Use with attention.
+                Default: ``False``.
 
             multiple_trainloader_mode: How to loop over the datasets when there are multiple train loaders.
                 In 'max_size_cycle' mode, the trainer ends one epoch when the largest dataset is traversed,
                 and smaller datasets reload when running out of their data.
                 In 'min_size' mode, all the datasets reload when reaching the minimum length of datasets.
+                Default: ``"max_size_cycle"``.
 
             stochastic_weight_avg: Whether to use `Stochastic Weight Averaging (SWA) `_.
+                Default: ``False``.
 
                 .. deprecated:: v1.5
                     ``stochastic_weight_avg`` has been deprecated in v1.5 and will be removed in v1.7.
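
For reference, here is a minimal sketch of what a few of the defaults documented above look like when spelled out at the call site. It assumes a PyTorch Lightning release of roughly this vintage (~1.5); exact default values can differ between releases, so treat the inline comments as mirroring the docstring rather than as authoritative:

```python
# Passing the documented defaults explicitly; for these arguments this is
# equivalent to constructing `pl.Trainer()` with no arguments at all.
import pytorch_lightning as pl

trainer = pl.Trainer(
    accumulate_grad_batches=None,    # Default: None (no gradient accumulation)
    gradient_clip_val=None,          # Default: None (gradient clipping disabled)
    gradient_clip_algorithm="norm",  # clips by norm once a clip value is set
    limit_train_batches=1.0,         # Default: 1.0 (use the full training set)
    log_every_n_steps=50,            # Default: 50
    num_sanity_val_steps=2,          # Default: 2
    precision=32,                    # Default: 32 (full precision)
    enable_model_summary=True,       # Default: True
)
print(trainer.num_sanity_val_steps)  # -> 2
```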