diff --git a/.codecov.yml b/.codecov.yml new file mode 100644 index 0000000000000..f10c1873295db --- /dev/null +++ b/.codecov.yml @@ -0,0 +1,42 @@ +#see https://github.com/codecov/support/wiki/Codecov-Yaml +codecov: + notify: + require_ci_to_pass: yes + +coverage: + precision: 0 # 2 = xx.xx%, 0 = xx% + round: nearest # how coverage is rounded: down/up/nearest + range: 40...100 # custom range of coverage colors from red -> yellow -> green + status: + # https://codecov.readme.io/v1.0/docs/commit-status + project: + default: + against: auto + target: 99% # specify the target coverage for each commit status + threshold: 20% # allow this little decrease on project + # https://github.com/codecov/support/wiki/Filtering-Branches + # branches: master + if_ci_failed: error + # https://github.com/codecov/support/wiki/Patch-Status + patch: + default: + against: auto + target: 40% # specify the target "X%" coverage to hit + # threshold: 50% # allow this much decrease on patch + changes: false + +parsers: + gcov: + branch_detection: + conditional: true + loop: true + macro: false + method: false + javascript: + enable_partials: false + +comment: + layout: header, diff + require_changes: false + behavior: default # update if exists else create new + # branches: * diff --git a/.travis.yml b/.travis.yml index d80b291f14299..fb1f8c95410b7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,17 +1,46 @@ +# vim ft=yaml + +# After changing this file, check it on: +# http://yaml-online-parser.appspot.com/ + +# See doc/travis_notes.txt for some guidelines + +# this file is *not* meant to cover or endorse the use of travis, but rather to +# help confirm pull requests to this project. + +dist: xenial # Ubuntu 16.04 + +env: + global: + - DISPLAY="" + language: python -python: - - "3.7" -# command to install dependencies + +matrix: + include: + - python: 3.6 + env: TOXENV=py36 + - python: 3.7 + env: TOXENV=py37 + +# See http://docs.travis-ci.com/user/caching/#pip-cache cache: pip + install: - - pip install -e . - pip install -r requirements.txt - - pip install -r tests/requirements.txt - - pip install -U numpy + - pip install -r ./tests/requirements.txt + - pip --version ; pip list -# keep build from timing out -dist: xenial - -# command to run tests script: - - py.test -v # or py.test for Python versions 3.5 and below \ No newline at end of file + # integration + - tox --sitepackages + - python setup.py install --dry-run + +after_success: + - coverage report + # disable auto coverage bc it isn't accurate since it misses gpu code. 
+ # to get coverage, run local and push results + # - codecov + +notifications: + email: false diff --git a/MANIFEST.in b/MANIFEST.in index 53c1b22055d2e..e39ffbad5520d 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,9 +1,36 @@ -graft docs +# Manifest syntax https://docs.python.org/2/distutils/sourcedist.html +graft wheelhouse -include COPYING -include AUTHORS +recursive-include birl *.py +recursive-exclude __pycache__ *.py[cod] *.orig -recursive-include src/einsteinpy/tests *.py *.html +# Include the README +include *.md -prune docs/source/examples/.ipynb_checkpoints -global-exclude *.py[cod] __pycache__ *.so *.dylib +# Include the license file +include LICENSE + +exclude *.sh +exclude *.toml +recursive-include examples *.py +recursive-include pytorch_lightning *.py + +# exclude tests from package +recursive-exclude tests * +exclude tests + +# Exclude the documentation files +recursive-exclude docs * +exclude docs + +# Include the Requirements +include requirements.txt + +# Exclude build configs +exclude *.yml + +prune .git +prune .github +prune notebook* +prune temp* +prune test* \ No newline at end of file diff --git a/README.md b/README.md index 7542eeef3a6f2..b2d286ece826c 100644 --- a/README.md +++ b/README.md @@ -1,25 +1,26 @@ -

- [removed HTML header block: centered logo, "PyTorch Lightning" title,
-  tagline "The PyTorch Keras for ML researchers. More control. Less boilerplate.",
-  and PyPI-version / supported-Python-version badges]
+
+![Logo](./docs/source/_static/lightning_logo_small.png) + +# PyTorch Lightning + +**The PyTorch Keras for ML researchers. More control. Less boilerplate.** + + +[![PyPI Status](https://badge.fury.io/py/pytorch-lightning.svg)](https://badge.fury.io/py/pytorch-lightning) +[![PyPI Status](https://pepy.tech/badge/pytorch-lightning)](https://pepy.tech/project/pytorch-lightning) +[![Build Status](https://travis-ci.org/williamFalcon/pytorch-lightning.svg?branch=master)](https://travis-ci.org/williamFalcon/pytorch-lightning) + +[![codecov](https://codecov.io/gh/Borda/pytorch-lightning/branch/master/graph/badge.svg)](https://codecov.io/gh/Borda/pytorch-lightning) +[![CodeFactor](https://www.codefactor.io/repository/github/borda/pytorch-lightning/badge)](https://www.codefactor.io/repository/github/borda/pytorch-lightning) +[![ReadTheDocs](https://readthedocs.org/projects/pytorch-lightning/badge/?version=latest)](https://pytorch-lightning.readthedocs.io/en/latest) +[![license](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/williamFalcon/pytorch-lightning/blob/master/LICENSE) + +
+ +Simple installation from PyPI ```bash pip install pytorch-lightning ``` @@ -127,7 +128,7 @@ trainer = Trainer(experiment=exp, max_nb_epochs=1, train_percent_check=0.1) trainer.fit(model) # view tensorflow logs -print(f'View tensorboard logs by running\ntensorboard --logdir {os.getcwd()}') +print('View tensorboard logs by running\ntensorboard --logdir %s' % os.getcwd()) print('and going to http://localhost:6006 on your browser') ``` @@ -137,11 +138,7 @@ print('and going to http://localhost:6006 on your browser') Everything in gray! You define the blue parts using the LightningModule interface: -

- [removed HTML image block]
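Since a diff only carries the changed hunks, the README's own interface example is not visible here. As a point of reference, below is a minimal sketch of the interface being described, using the hook names that appear in `examples/new_project_templates/lightning_module_template.py` elsewhere in this diff (`training_step`, `validation_step`, `validation_end`, `configure_optimizers`, `tng_dataloader`, `val_dataloader`, `test_dataloader`). The model body, optimizer settings, and MNIST wiring are illustrative assumptions rather than the template's exact code, and the dataloaders are exposed as plain properties here; the actual template may instead use the `data_loader` decorator that this diff exports from `pytorch_lightning/__init__.py`.

```python
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import MNIST

from pytorch_lightning import LightningModule


class CoolModel(LightningModule):
    """Minimal sketch of the hooks a LightningModule defines."""

    def __init__(self):
        super(CoolModel, self).__init__()
        self.l1 = torch.nn.Linear(28 * 28, 10)

    def forward(self, x):
        return torch.relu(self.l1(x.view(x.size(0), -1)))

    def training_step(self, data_batch, batch_nb):
        x, y = data_batch
        # the trainer reads output['loss'] (a bare loss tensor also works)
        return {'loss': F.cross_entropy(self.forward(x), y)}

    def validation_step(self, data_batch, batch_nb):
        x, y = data_batch
        return {'val_loss': F.cross_entropy(self.forward(x), y)}

    def validation_end(self, outputs):
        # aggregate the per-batch outputs of validation_step
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        return {'avg_val_loss': avg_loss}

    def configure_optimizers(self):
        return [torch.optim.Adam(self.parameters(), lr=0.02)]

    def __dataloader(self, train):
        dataset = MNIST('mnist', train=train, download=True,
                        transform=transforms.ToTensor())
        return DataLoader(dataset, batch_size=32, shuffle=train)

    @property
    def tng_dataloader(self):
        return self.__dataloader(train=True)

    @property
    def val_dataloader(self):
        return self.__dataloader(train=False)

    @property
    def test_dataloader(self):
        return self.__dataloader(train=False)
```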

+![Overview](./docs/source/_static/overview_flat.jpg) ```{.python} # what to do in the training loop @@ -223,19 +220,11 @@ def validation_end(self, outputs): ## Tensorboard Lightning is fully integrated with tensorboard. -

- [removed HTML image block]
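For orientation (the screenshot below shows the resulting loss curves), here is a minimal sketch of the wiring, reusing the `Trainer(experiment=exp, ...)` and `trainer.fit(model)` calls from the README hunk above; the save path and `CoolModel` are assumptions carried over from the earlier sketch:

```python
from test_tube import Experiment

from pytorch_lightning import Trainer

# build the experiment that backs the tensorboard logs (save_dir is an assumed path)
exp = Experiment(save_dir='/some/save/path')

model = CoolModel()  # any LightningModule, e.g. the sketch above
trainer = Trainer(experiment=exp, max_nb_epochs=1, train_percent_check=0.1)
trainer.fit(model)

# then inspect the run with: tensorboard --logdir /some/save/path
```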

+![tensorboard-support](./docs/source/_static/tf_loss.png) Lightning also adds a text column with all the hyperparameters for this experiment. -

- [removed HTML image block]
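How those hyperparameters reach the experiment: `exp.argparse()` and `exp.tag()` below are the same test-tube calls used in `tests/test_models.py` later in this diff; the hparams values themselves are made up for illustration.

```python
from argparse import Namespace

from test_tube import Experiment

hparams = Namespace(batch_size=32, learning_rate=0.02)  # made-up values

exp = Experiment(save_dir='/some/save/path')  # assumed path
exp.argparse(hparams)               # parsed hyperparameters -> text column in tensorboard
exp.tag({'notes': 'baseline run'})  # arbitrary extra key/value pairs
exp.save()
```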

+![tensorboard-support](./docs/source/_static/tf_tags.png) Simply note the path you set for the Experiment ``` {.python} diff --git a/appveyor.yml b/appveyor.yml new file mode 100644 index 0000000000000..ad03a4786b057 --- /dev/null +++ b/appveyor.yml @@ -0,0 +1,66 @@ +# https://www.appveyor.com/docs/appveyor-yml/ +environment: + + # SDK v7.0 MSVC Express 2008's SetEnv.cmd script will fail if the + # /E:ON and /V:ON options are not enabled in the batch script interpreter + # See: http://stackoverflow.com/a/13751649/163740 + CMD_IN_ENV: "cmd /E:ON /V:ON /C obvci_appveyor_python_build_env.cmd" + + matrix: + # Pre-installed Python versions, which Appveyor may upgrade to + # a later point release. + # See: http://www.appveyor.com/docs/installed-software#python + + + # - PYTHON: "C:\\Python35-x64" + # PYTHON_VERSION: "3.5.x" + # PYTHON_ARCH: "64" + # TOXENV: "py35" + + - PYTHON: "C:\\Python36-x64" + PYTHON_VERSION: "3.6.x" + PYTHON_ARCH: "64" + TOXENV: "py36" + PIP_PYVER: "36" + + - PYTHON: "C:\\Python37-x64" + PYTHON_VERSION: "3.7.x" + PYTHON_ARCH: "64" + TOXENV: "py37" + PIP_PYVER: "37" + +build: off + +# https://www.appveyor.com/docs/build-cache/ +cache: + - C:\ProgramData\chocolatey\bin -> appveyor.yml + - C:\ProgramData\chocolatey\lib -> appveyor.yml + - '%LOCALAPPDATA%\pip\Cache -> appveyor.yml' + +# scripts that run after cloning repository +install: + # If there is a newer build queued for the same PR, cancel this one. + # The AppVeyor 'rollout builds' option is supposed to serve the same + # purpose but it is problematic because it tends to cancel builds pushed + # directly to master instead of just PR builds (or the converse). + - SET PATH=%PYTHON%;%PYTHON%\\Scripts;%path% + - pip install -U --user pip + - pip install "https://download.pytorch.org/whl/cu90/torch-1.1.0-cp%PIP_PYVER%-cp%PIP_PYVER%m-win_amd%PYTHON_ARCH%.whl" + pip install "https://download.pytorch.org/whl/cu90/torchvision-0.3.0-cp%PIP_PYVER%-cp%PIP_PYVER%m-win_amd%PYTHON_ARCH%.whl" + - pip install -r requirements.txt + - pip install -r ./tests/requirements.txt + +# scripts to run before tests (working directory and environment changes are persisted from the previous steps such as "before_build") +before_test: + - python --version + - pip --version + - pip list + - dir + +# to run your custom scripts instead of automatic tests +test_script: + - tox --sitepackages --parallel auto + +on_success: + - coverage report + # - codecov diff --git a/coverage.svg b/coverage.svg deleted file mode 100644 index 6bfc8faf24d3c..0000000000000 --- a/coverage.svg +++ /dev/null @@ -1,21 +0,0 @@ - - - - - - - - - - - - - - - - coverage - coverage - 99% - 99% - - diff --git a/docs/Trainer/Distributed training.md b/docs/Trainer/Distributed training.md index aedbd20ed1f42..cafc719df0a95 100644 --- a/docs/Trainer/Distributed training.md +++ b/docs/Trainer/Distributed training.md @@ -94,7 +94,7 @@ cluster.add_command('export NCCL_SOCKET_IFNAME=^docker0,lo') cluster.add_command('export NCCL_DEBUG=INFO') # setting a master port here is a good idea. 
-cluster.add_command(f'export MASTER_PORT={PORT}') +cluster.add_command('export MASTER_PORT=%r' % PORT) # good to load the latest NCCL version cluster.load_modules(['NCCL/2.4.7-1-cuda.10.0']) diff --git a/docs/source/_static/lightning_logo_medium.png b/docs/source/_static/lightning_logo_medium.png new file mode 100644 index 0000000000000..a28606b541632 Binary files /dev/null and b/docs/source/_static/lightning_logo_medium.png differ diff --git a/docs/source/_static/lightning_logo_small.png b/docs/source/_static/lightning_logo_small.png new file mode 100644 index 0000000000000..17d0aa92bce2b Binary files /dev/null and b/docs/source/_static/lightning_logo_small.png differ diff --git a/examples/__init__.py b/examples/__init__.py index 6743d7f97919c..0d456dacb6443 100644 --- a/examples/__init__.py +++ b/examples/__init__.py @@ -1 +1,5 @@ -from .new_project_templates.lightning_module_template import LightningTemplateModel \ No newline at end of file +from .new_project_templates.lightning_module_template import LightningTemplateModel + +__all__ = [ + 'LightningTemplateModel' +] diff --git a/examples/new_project_templates/lightning_module_template.py b/examples/new_project_templates/lightning_module_template.py index 483a4e3a048df..d6bdd13068e40 100644 --- a/examples/new_project_templates/lightning_module_template.py +++ b/examples/new_project_templates/lightning_module_template.py @@ -47,11 +47,13 @@ def __build_model(self): Layout model :return: """ - self.c_d1 = nn.Linear(in_features=self.hparams.in_features, out_features=self.hparams.hidden_dim) + self.c_d1 = nn.Linear(in_features=self.hparams.in_features, + out_features=self.hparams.hidden_dim) self.c_d1_bn = nn.BatchNorm1d(self.hparams.hidden_dim) self.c_d1_drop = nn.Dropout(self.hparams.drop_prob) - self.c_d2 = nn.Linear(in_features=self.hparams.hidden_dim, out_features=self.hparams.out_features) + self.c_d2 = nn.Linear(in_features=self.hparams.hidden_dim, + out_features=self.hparams.out_features) # --------------------- # TRAINING @@ -171,8 +173,10 @@ def configure_optimizers(self): def __dataloader(self, train): # init data generators - transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (1.0,))]) - dataset = MNIST(root=self.hparams.data_root, train=train, transform=transform, download=True) + transform = transforms.Compose([transforms.ToTensor(), + transforms.Normalize((0.5,), (1.0,))]) + dataset = MNIST(root=self.hparams.data_root, train=train, + transform=transform, download=True) # when using multi-node we need to add the datasampler train_sampler = None @@ -182,7 +186,7 @@ def __dataloader(self, train): if self.on_gpu: train_sampler = DistributedSampler(dataset, rank=self.trainer.proc_rank) batch_size = batch_size // self.trainer.world_size # scale batch size - except Exception as e: + except Exception: pass should_shuffle = train_sampler is None @@ -211,7 +215,7 @@ def test_dataloader(self): return self.__dataloader(train=False) @staticmethod - def add_model_specific_args(parent_parser, root_dir): # pragma: no cover + def add_model_specific_args(parent_parser, root_dir): # pragma: no cover """ Parameters you define here will be available to your model through self.hparams :param parent_parser: @@ -224,20 +228,25 @@ def add_model_specific_args(parent_parser, root_dir): # pragma: no cover # parser.set_defaults(gradient_clip=5.0) # network params - parser.add_argument('--in_features', default=28*28, type=int) + parser.add_argument('--in_features', default=28 * 28, type=int) 
parser.add_argument('--out_features', default=10, type=int) - parser.add_argument('--hidden_dim', default=50000, type=int) # use 500 for CPU, 50000 for GPU to see speed difference + # use 500 for CPU, 50000 for GPU to see speed difference + parser.add_argument('--hidden_dim', default=50000, type=int) parser.opt_list('--drop_prob', default=0.2, options=[0.2, 0.5], type=float, tunable=False) # data parser.add_argument('--data_root', default=os.path.join(root_dir, 'mnist'), type=str) # training params (opt) - parser.opt_list('--learning_rate', default=0.001*8, type=float, options=[0.0001, 0.0005, 0.001, 0.005], + parser.opt_list('--learning_rate', default=0.001 * 8, type=float, + options=[0.0001, 0.0005, 0.001, 0.005], tunable=False) - parser.opt_list('--optimizer_name', default='adam', type=str, options=['adam'], tunable=False) - - # if using 2 nodes with 4 gpus each the batch size here (256) will be 256 / (2*8) = 16 per gpu - parser.opt_list('--batch_size', default=256*8, type=int, options=[32, 64, 128, 256], tunable=False, - help='batch size will be divided over all the gpus being used across all nodes') + parser.opt_list('--optimizer_name', default='adam', type=str, + options=['adam'], tunable=False) + + # if using 2 nodes with 4 gpus each the batch size here + # (256) will be 256 / (2*8) = 16 per gpu + parser.opt_list('--batch_size', default=256 * 8, type=int, + options=[32, 64, 128, 256], tunable=False, + help='batch size will be divided over all gpus being used across all nodes') return parser diff --git a/examples/new_project_templates/multi_node_cluster_template.py b/examples/new_project_templates/multi_node_cluster_template.py index 5f6914d107ae0..c4af6d416b20a 100644 --- a/examples/new_project_templates/multi_node_cluster_template.py +++ b/examples/new_project_templates/multi_node_cluster_template.py @@ -10,12 +10,12 @@ from pytorch_lightning.models.trainer import Trainer from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint +from examples.new_project_templates.lightning_module_template import LightningTemplateModel + SEED = 2334 torch.manual_seed(SEED) np.random.seed(SEED) -from .lightning_module_template import LightningTemplateModel - def main_local(hparams): main(hparams, None, None) @@ -112,8 +112,10 @@ def optimize_on_cluster(hyperparams): cluster.add_command('source activate lightning') # run only on 32GB voltas - cluster.add_slurm_cmd(cmd='constraint', value='volta32gb', comment='use 32gb gpus') - cluster.add_slurm_cmd(cmd='partition', value=hyperparams.gpu_partition, comment='use 32gb gpus') + cluster.add_slurm_cmd(cmd='constraint', value='volta32gb', + comment='use 32gb gpus') + cluster.add_slurm_cmd(cmd='partition', value=hyperparams.gpu_partition, + comment='use 32gb gpus') # run hopt # creates and submits jobs to slurm @@ -140,15 +142,23 @@ def optimize_on_cluster(hyperparams): parent_parser.add_argument('--gpu_partition', type=str, help='consult your cluster manual') # TODO: make 1 param - parent_parser.add_argument('--per_experiment_nb_gpus', type=int, help='how many gpus to use in a node') - parent_parser.add_argument('--gpus', type=str, default='-1', help='how many gpus to use in the node') - - parent_parser.add_argument('--nb_gpu_nodes', type=int, default=1, help='how many nodes to use in a cluster') - parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, help='where to save logs') - parent_parser.add_argument('--slurm_log_path', type=str, default=slurm_out_dir, help='where to save slurm meta') - 
parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, help='where to save model') - parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', help='test tube exp name') - parent_parser.add_argument('--nb_hopt_trials', type=int, default=1, help='how many grid search trials to run') + parent_parser.add_argument('--per_experiment_nb_gpus', type=int, + help='how many gpus to use in a node') + parent_parser.add_argument('--gpus', type=str, default='-1', + help='how many gpus to use in the node') + + parent_parser.add_argument('--nb_gpu_nodes', type=int, default=1, + help='how many nodes to use in a cluster') + parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, + help='where to save logs') + parent_parser.add_argument('--slurm_log_path', type=str, default=slurm_out_dir, + help='where to save slurm meta') + parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, + help='where to save model') + parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', + help='test tube exp name') + parent_parser.add_argument('--nb_hopt_trials', type=int, default=1, + help='how many grid search trials to run') # allow model to overwrite or extend args parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir) diff --git a/examples/new_project_templates/single_cpu_template.py b/examples/new_project_templates/single_cpu_template.py index 29c255981627a..c0f4826f99634 100644 --- a/examples/new_project_templates/single_cpu_template.py +++ b/examples/new_project_templates/single_cpu_template.py @@ -9,12 +9,12 @@ from pytorch_lightning.models.trainer import Trainer from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint +from examples.new_project_templates.lightning_module_template import LightningTemplateModel + SEED = 2334 torch.manual_seed(SEED) np.random.seed(SEED) -from .lightning_module_template import LightningTemplateModel - def main(hparams): """ @@ -90,9 +90,12 @@ def main(hparams): parent_parser = HyperOptArgumentParser(strategy='grid_search', add_help=False) # gpu args - parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, help='where to save logs') - parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, help='where to save model') - parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', help='test tube exp name') + parent_parser.add_argument('--test_tube_save_path', type=str, + default=test_tube_dir, help='where to save logs') + parent_parser.add_argument('--model_save_path', type=str, + default=checkpoint_dir, help='where to save model') + parent_parser.add_argument('--experiment_name', type=str, + default='pt_lightning_exp_a', help='test tube exp name') # allow model to overwrite or extend args parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir) @@ -102,5 +105,5 @@ def main(hparams): # RUN TRAINING # --------------------- # run on HPC cluster - print(f'RUNNING ON CPU') + print('RUNNING ON CPU') main(hyperparams) diff --git a/examples/new_project_templates/single_gpu_node_16bit_template.py b/examples/new_project_templates/single_gpu_node_16bit_template.py index 14db484e56409..babf18e73a2a4 100644 --- a/examples/new_project_templates/single_gpu_node_16bit_template.py +++ b/examples/new_project_templates/single_gpu_node_16bit_template.py @@ -9,12 +9,12 @@ from pytorch_lightning.models.trainer import Trainer 
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint +from examples.new_project_templates.lightning_module_template import LightningTemplateModel + SEED = 2334 torch.manual_seed(SEED) np.random.seed(SEED) -from .lightning_module_template import LightningTemplateModel - def main(hparams): """ @@ -92,10 +92,15 @@ def main(hparams): parent_parser = HyperOptArgumentParser(strategy='grid_search', add_help=False) # gpu args - parent_parser.add_argument('--gpus', type=str, default='-1', help='how many gpus to use in the node. -1 uses all the gpus on the node') - parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, help='where to save logs') - parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, help='where to save model') - parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', help='test tube exp name') + parent_parser.add_argument('--gpus', type=str, default='-1', + help='how many gpus to use in the node.' + 'value -1 uses all the gpus on the node') + parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, + help='where to save logs') + parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, + help='where to save model') + parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', + help='test tube exp name') # allow model to overwrite or extend args parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir) @@ -105,5 +110,5 @@ def main(hparams): # RUN TRAINING # --------------------- # run on HPC cluster - print(f'RUNNING INTERACTIVE MODE ON GPUS. gpu ids: {hyperparams.gpus}') + print('RUNNING INTERACTIVE MODE ON GPUS. gpu ids: %i' % hyperparams.gpus) main(hyperparams) diff --git a/examples/new_project_templates/single_gpu_node_ddp_template.py b/examples/new_project_templates/single_gpu_node_ddp_template.py index 56e301b20478a..68a332aeb8b5a 100644 --- a/examples/new_project_templates/single_gpu_node_ddp_template.py +++ b/examples/new_project_templates/single_gpu_node_ddp_template.py @@ -9,12 +9,12 @@ from pytorch_lightning.models.trainer import Trainer from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint +from examples.new_project_templates.lightning_module_template import LightningTemplateModel + SEED = 2334 torch.manual_seed(SEED) np.random.seed(SEED) -from .lightning_module_template import LightningTemplateModel - def main(hparams): """ @@ -92,10 +92,15 @@ def main(hparams): parent_parser = HyperOptArgumentParser(strategy='grid_search', add_help=False) # gpu args - parent_parser.add_argument('--gpus', type=str, default='-1', help='how many gpus to use in the node. -1 uses all the gpus on the node') - parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, help='where to save logs') - parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, help='where to save model') - parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', help='test tube exp name') + parent_parser.add_argument('--gpus', type=str, default='-1', + help='how many gpus to use in the node.' 
+ ' value -1 uses all the gpus on the node') + parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, + help='where to save logs') + parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, + help='where to save model') + parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', + help='test tube exp name') # allow model to overwrite or extend args parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir) @@ -105,5 +110,5 @@ def main(hparams): # RUN TRAINING # --------------------- # run on HPC cluster - print(f'RUNNING INTERACTIVE MODE ON GPUS. gpu ids: {hyperparams.gpus}') + print('RUNNING INTERACTIVE MODE ON GPUS. gpu ids: %i' % hyperparams.gpus) main(hyperparams) diff --git a/examples/new_project_templates/single_gpu_node_dp_template.py b/examples/new_project_templates/single_gpu_node_dp_template.py index 9d6992533b680..d752713ee1df4 100644 --- a/examples/new_project_templates/single_gpu_node_dp_template.py +++ b/examples/new_project_templates/single_gpu_node_dp_template.py @@ -9,12 +9,12 @@ from pytorch_lightning.models.trainer import Trainer from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint +from examples.new_project_templates.lightning_module_template import LightningTemplateModel + SEED = 2334 torch.manual_seed(SEED) np.random.seed(SEED) -from .lightning_module_template import LightningTemplateModel - def main(hparams): """ @@ -91,10 +91,15 @@ def main(hparams): parent_parser = HyperOptArgumentParser(strategy='grid_search', add_help=False) # gpu args - parent_parser.add_argument('--gpus', type=str, default='-1', help='how many gpus to use in the node. -1 uses all the gpus on the node') - parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, help='where to save logs') - parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, help='where to save model') - parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', help='test tube exp name') + parent_parser.add_argument('--gpus', type=str, default='-1', + help='how many gpus to use in the node.' + ' value -1 uses all the gpus on the node') + parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, + help='where to save logs') + parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, + help='where to save model') + parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', + help='test tube exp name') # allow model to overwrite or extend args parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir) @@ -104,5 +109,5 @@ def main(hparams): # RUN TRAINING # --------------------- # run on HPC cluster - print(f'RUNNING INTERACTIVE MODE ON GPUS. gpu ids: {hyperparams.gpus}') + print('RUNNING INTERACTIVE MODE ON GPUS. 
gpu ids: %i' % hyperparams.gpus) main(hyperparams) diff --git a/examples/new_project_templates/trainer_cpu_template.py b/examples/new_project_templates/trainer_cpu_template.py index de6ba7c424571..84a29a9bf3627 100644 --- a/examples/new_project_templates/trainer_cpu_template.py +++ b/examples/new_project_templates/trainer_cpu_template.py @@ -6,7 +6,7 @@ from pytorch_lightning.utilities.arg_parse import add_default_args from pytorch_lightning.callbacks.pt_callbacks import EarlyStopping, ModelCheckpoint -from .lightning_module_template import LightningTemplateModel +from examples.new_project_templates.lightning_module_template import LightningTemplateModel def main(hparams): @@ -67,7 +67,7 @@ def main(hparams): add_default_args(parent_parser, root_dir) # allow model to overwrite or extend args - parser = ExampleModel.add_model_specific_args(parent_parser) + parser = LightningTemplateModel.add_model_specific_args(parent_parser) hyperparams = parser.parse_args() # train model diff --git a/pytorch_lightning/__init__.py b/pytorch_lightning/__init__.py index 4b5961ac2daf4..73067d63858f9 100644 --- a/pytorch_lightning/__init__.py +++ b/pytorch_lightning/__init__.py @@ -1,3 +1,19 @@ from .models.trainer import Trainer from .root_module.root_module import LightningModule -from .root_module.decorators import data_loader \ No newline at end of file +from .root_module.decorators import data_loader + +__version__ = '0.3.6.9' +__author__ = "William Falcon", +__author_email__ = "waf2107@columbia.edu" +__license__ = 'Apache-2' +__homepage__ = 'https://github.com/williamFalcon/pytorch-lightning', +__copyright__ = 'Copyright (c) 2018-2019, %s.' % __author__ +__doc__ = """ +The Keras for ML researchers using PyTorch +""" + +__all__ = [ + 'Trainer', + 'LightningModule', + 'data_loader', +] diff --git a/pytorch_lightning/callbacks/__init__.py b/pytorch_lightning/callbacks/__init__.py index f180c2542490f..035deb0681d7d 100644 --- a/pytorch_lightning/callbacks/__init__.py +++ b/pytorch_lightning/callbacks/__init__.py @@ -1 +1,6 @@ -from .pt_callbacks import EarlyStopping, ModelCheckpoint \ No newline at end of file +from .pt_callbacks import EarlyStopping, ModelCheckpoint + +__all__ = [ + 'EarlyStopping', + 'ModelCheckpoint', +] diff --git a/pytorch_lightning/callbacks/pt_callbacks.py b/pytorch_lightning/callbacks/pt_callbacks.py index 89c8f6b2e5f8f..f07c6a28e33c3 100644 --- a/pytorch_lightning/callbacks/pt_callbacks.py +++ b/pytorch_lightning/callbacks/pt_callbacks.py @@ -122,9 +122,9 @@ def on_epoch_end(self, epoch, logs=None): current = logs.get(self.monitor) stop_training = False if current is None: - print('Early stopping conditioned on metric `%s` ''which is not available. Available metrics are: %s' % - (self.monitor, ','.join(list(logs.keys()))), RuntimeWarning - ) + print('Early stopping conditioned on metric `%s` ' + 'which is not available. Available metrics are: %s' % + (self.monitor, ','.join(list(logs.keys()))), RuntimeWarning) exit(-1) if self.monitor_op(current - self.min_delta, self.best): @@ -188,8 +188,7 @@ def __init__(self, filepath, monitor='val_loss', verbose=0, if mode not in ['auto', 'min', 'max']: print('ModelCheckpoint mode %s is unknown, ' - 'fallback to auto mode.' % (mode), - RuntimeWarning) + 'fallback to auto mode.' % (mode), RuntimeWarning) mode = 'auto' if mode == 'min': @@ -233,8 +232,8 @@ def on_epoch_end(self, epoch, logs=None): if self.save_best_only: current = logs.get(self.monitor) if current is None: - print('Can save best model only with %s available, ' - 'skipping.' 
% (self.monitor), RuntimeWarning) + print('Can save best model only with %s available,' + ' skipping.' % (self.monitor), RuntimeWarning) else: if self.monitor_op(current, self.best): if self.verbose > 0: diff --git a/pytorch_lightning/models/__init__.py b/pytorch_lightning/models/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py index 2655809c4a7de..574c8061c55d9 100644 --- a/pytorch_lightning/models/trainer.py +++ b/pytorch_lightning/models/trainer.py @@ -1,12 +1,10 @@ """ The trainer handles all the logic for running a val loop, training loop, distributing, etc... """ -import subprocess -import traceback -import warnings + import os -import pdb import re +import warnings import numpy as np import tqdm @@ -17,13 +15,14 @@ from ..root_module.memory import get_gpu_memory_map from ..root_module.model_saving import TrainerIO -from ..pt_overrides.override_data_parallel import LightningDistributedDataParallel, LightningDataParallel +from ..pt_overrides.override_data_parallel import ( + LightningDistributedDataParallel, LightningDataParallel) from ..utilities.debugging import MisconfigurationException try: from apex import amp APEX_AVAILABLE = True -except Exception: +except ImportError: APEX_AVAILABLE = False @@ -66,17 +65,20 @@ def __init__(self, check_val_every_n_epoch=1, fast_dev_run=False, accumulate_grad_batches=1, - max_nb_epochs=1000, min_nb_epochs=1, - train_percent_check=1.0, val_percent_check=1.0, test_percent_check=1.0, + max_nb_epochs=1000, + min_nb_epochs=1, + train_percent_check=1.0, + val_percent_check=1.0, + test_percent_check=1.0, val_check_interval=0.95, - log_save_interval=100, add_log_row_interval=10, + log_save_interval=100, + add_log_row_interval=10, distributed_backend='dp', use_amp=False, print_nan_grads=False, print_weights_summary=True, amp_level='O2', nb_sanity_val_steps=5): - """ :param experiment: Test-tube experiment @@ -102,16 +104,15 @@ def __init__(self, :param val_check_interval: :param log_save_interval: :param add_log_row_interval: - :param distributed_backend: 'np' to use DistributedParallel, 'ddp' to use DistributedDataParallel + :param distributed_backend: + 'np' to use DistributedParallel, 'dp' to use DistributedDataParallel :param use_amp: :param print_nan_grads: :param print_weights_summary: :param amp_level: :param nb_sanity_val_steps: """ - # Transfer params - self.nb_gpu_nodes = nb_gpu_nodes self.gradient_clip = gradient_clip self.check_val_every_n_epoch = check_val_every_n_epoch @@ -173,13 +174,14 @@ def __init__(self, # set the correct cuda visible devices (using pci order) os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - os.environ["CUDA_VISIBLE_DEVICES"] = ','.join([str(x) for x in self.data_parallel_device_ids]) - print(f'VISIBLE GPUS: {os.environ["CUDA_VISIBLE_DEVICES"]}') + os.environ["CUDA_VISIBLE_DEVICES"] = ','.join([str(x) for x in + self.data_parallel_device_ids]) + print('VISIBLE GPUS: %r' % os.environ["CUDA_VISIBLE_DEVICES"]) # make DP and DDP mutually exclusive # single GPU will also use DP with devices=[0] - have_gpus = self.data_parallel_device_ids is not None and len(self.data_parallel_device_ids) > 0 - if have_gpus: + requested_gpus = self.data_parallel_device_ids is not None + if requested_gpus and len(self.data_parallel_device_ids) > 0: self.use_dp = distributed_backend == 'dp' self.use_ddp = distributed_backend == 'ddp' @@ -201,7 +203,7 @@ def __init__(self, try: self.nb_slurm_tasks = int(os.environ['SLURM_NTASKS']) 
self.is_slurm_managing_tasks = self.nb_slurm_tasks == self.nb_requested_gpus - except Exception as e: + except Exception: # likely not on slurm, so set the slurm managed flag to false self.is_slurm_managing_tasks = False @@ -226,7 +228,8 @@ def __init__(self, self.val_dataloader = None # how much of the data to use - self.__determine_data_use_amount(train_percent_check, val_percent_check, test_percent_check, overfit_pct) + self.__determine_data_use_amount(train_percent_check, val_percent_check, + test_percent_check, overfit_pct) print('gpu available: {}, used: {}'.format(torch.cuda.is_available(), self.on_gpu)) # 16 bit mixed precision training using apex @@ -235,20 +238,21 @@ def __init__(self, print('using 16bit precision') if use_amp and not APEX_AVAILABLE: # pragma: no cover - msg = ''' + msg = """ You set use_amp=True but do not have apex installed. - Install apex first using this guide and rerun with use_amp=True: + Install apex first using this guide and rerun with use_amp=True: https://github.com/NVIDIA/apex#linux - + this run will NOT use 16 bit precision - ''' + """ raise ModuleNotFoundError(msg) @property def data_parallel(self): return self.use_dp or self.use_ddp - def __determine_data_use_amount(self, train_percent_check, val_percent_check, test_percent_check, overfit_pct): + def __determine_data_use_amount(self, train_percent_check, val_percent_check, + test_percent_check, overfit_pct): """ Use less data for debugging purposes """ @@ -275,7 +279,7 @@ def __tng_tqdm_dic(self): 'tng_loss': '{0:.3f}'.format(self.avg_loss), 'v_nb': '{}'.format(self.experiment.version), 'epoch': '{}'.format(self.current_epoch), - 'batch_nb':'{}'.format(self.batch_nb), + 'batch_nb': '{}'.format(self.batch_nb), } tqdm_dic.update(self.tqdm_metrics) @@ -389,18 +393,19 @@ def get_dataloaders(self, model): self.val_dataloader = model.val_dataloader if self.use_ddp and not isinstance(self.tng_dataloader.sampler, DistributedSampler): - msg = ''' - when using multiple gpus and multiple nodes you must pass a DistributedSampler to DataLoader(sampler). - - ie: this: - dataset = myDataset() - dataloader = Dataloader(dataset) - - becomes: - dataset = myDataset() - dist_sampler = torch.utils.data.distributed.DistributedSampler(dataset) - dataloader = Dataloader(dataset, sampler=dist_sampler) - ''' + msg = """ +when using multiple gpus and multiple nodes you must pass + a DistributedSampler to DataLoader(sampler). + +ie: this: +dataset = myDataset() +dataloader = Dataloader(dataset) + +becomes: +dataset = myDataset() +dist_sampler = torch.utils.data.distributed.DistributedSampler(dataset) +dataloader = Dataloader(dataset, sampler=dist_sampler) +""" raise MisconfigurationException(msg) # ----------------------------- @@ -410,19 +415,20 @@ def fit(self, model): # when using multi-node or DDP within a node start each module in a separate process if self.use_ddp: - # must copy only the meta of the exp so it survives pickle/unpickle when going to new process + # must copy only the meta of the exp so it survives pickle/unpickle + # when going to new process self.experiment = self.experiment.get_meta_copy() if self.is_slurm_managing_tasks: task = int(os.environ['SLURM_LOCALID']) self.ddp_train(task, model) else: - msg = f""" - You requested {self.nb_requested_gpus} GPUs but launched {self.nb_slurm_tasks} slurm tasks. - We will launch {self.nb_requested_gpus} processes for you. 
- We recommend you let slurm manage the processes by setting: --ntasks-per-node={self.nb_requested_gpus} - If you're not using SLURM, ignore this message! - """ + msg = """ +You requested %(nb_gpus)s GPUs but launched %(nb_tasks)s slurm tasks. +We will launch %(nb_gpus)s processes for you. +We recommend you let slurm manage the processes by setting: --ntasks-per-node=%(nb_gpus)s +If you're not using SLURM, ignore this message! +""" % {'nb_gpus': self.nb_requested_gpus, 'nb_tasks': self.nb_slurm_tasks} warnings.warn(msg) mp.spawn(self.ddp_train, nprocs=len(self.data_parallel_device_ids), args=(model, )) @@ -435,7 +441,8 @@ def fit(self, model): else: # run through amp wrapper if self.use_amp: - raise MisconfigurationException('amp + cpu is not supported. Please use a GPU option') + raise MisconfigurationException('amp + cpu is not supported.' + ' Please use a GPU option') # CHOOSE OPTIMIZER # allow for lr schedulers as well @@ -462,9 +469,11 @@ def __dp_train(self, model): # check for this bug (amp + dp + !01 doesn't work) # https://github.com/NVIDIA/apex/issues/227 if self.use_dp and self.use_amp: - m = f'amp level {self.amp_level} with DataParallel is not supported. ' \ - f'See this note from NVIDIA for more info: https://github.com/NVIDIA/apex/issues/227. ' \ - f'We recommend you switch to ddp if you want to use amp' + m = """ +Amp level %r with DataParallel is not supported. +See this note from NVIDIA for more info: https://github.com/NVIDIA/apex/issues/227. +We recommend you switch to ddp if you want to use amp +""" % self.amp_level raise MisconfigurationException(m) model = LightningDataParallel(model, device_ids=self.data_parallel_device_ids) @@ -484,7 +493,7 @@ def ddp_train(self, gpu_nb, model): try: node_id = os.environ['SLURM_NODEID'] self.node_rank = int(node_id) - except Exception as e: + except Exception: self.node_rank = 0 # recover original exp before went into process @@ -527,7 +536,8 @@ def ddp_train(self, gpu_nb, model): ) self.optimizers = optimizers - model = LightningDistributedDataParallel(model, device_ids=[gpu_nb], find_unused_parameters=True) + model = LightningDistributedDataParallel(model, device_ids=[gpu_nb], + find_unused_parameters=True) # continue training routine self.__run_pretrain_routine(model) @@ -543,14 +553,14 @@ def __init_tcp_connection(self): # sets the appropriate port try: port = os.environ['MASTER_PORT'] - except Exception as e: + except Exception: port = 12910 - os.environ['MASTER_PORT'] = f'{port}' + os.environ['MASTER_PORT'] = str(port) # figure out the root node addr try: root_node = os.environ['SLURM_NODELIST'].split(' ')[0] - except Exception as e: + except Exception: root_node = '127.0.0.2' root_node = self.resolve_root_node_address(root_node) @@ -642,7 +652,8 @@ def __train(self): # init progbar when requested if self.progress_bar: - self.prog_bar = tqdm.tqdm(range(self.total_batches), position=self.process_position) + self.prog_bar = tqdm.tqdm(range(self.total_batches), + position=self.process_position) for batch_nb, data_batch in enumerate(self.tng_dataloader): self.batch_nb = batch_nb @@ -651,7 +662,8 @@ def __train(self): model = self.__get_model() model.global_step = self.global_step - # stop when the flag is changed or we've gone past the amount requested in the batches + # stop when the flag is changed or we've gone past the amount + # requested in the batches self.total_batch_nb += 1 met_batch_limit = batch_nb > self.nb_tng_batches if met_batch_limit: @@ -698,7 +710,8 @@ def __train(self): model.on_tng_metrics(metrics) # log 
metrics - scalar_metrics = self.__metrics_to_scalars(metrics, blacklist=self.__log_vals_blacklist()) + scalar_metrics = self.__metrics_to_scalars( + metrics, blacklist=self.__log_vals_blacklist()) if self.proc_rank == 0: self.experiment.log(scalar_metrics, global_step=self.global_step) self.experiment.save() @@ -720,7 +733,8 @@ def __train(self): # early stopping met_min_epochs = epoch_nb > self.min_nb_epochs if self.enable_early_stop and met_min_epochs: - should_stop = self.early_stop_callback.on_epoch_end(epoch=epoch_nb, logs=self.__tng_tqdm_dic) + should_stop = self.early_stop_callback.on_epoch_end(epoch=epoch_nb, + logs=self.__tng_tqdm_dic) # stop training stop = should_stop and met_min_epochs @@ -773,14 +787,14 @@ def __run_tng_batch(self, data_batch, batch_nb): try: model_specific_tqdm_metrics_dic = output['prog'] - except Exception as e: + except Exception: model_specific_tqdm_metrics_dic = {} # if output dict doesn't have the keyword loss # then assume the output=loss if scalar try: loss = output['loss'] - except Exception as e: + except Exception: if type(output) is torch.Tensor: loss = output @@ -828,7 +842,8 @@ def __run_tng_batch(self, data_batch, batch_nb): # clear gradients optimizer.zero_grad() - # queuing loss across batches blows it up proportionally... divide out the number accumulated + # queuing loss across batches blows it up proportionally... + # divide out the number accumulated self.batch_loss_value = self.batch_loss_value / self.accumulate_grad_batches # track loss @@ -885,4 +900,5 @@ def __run_validation(self): # model checkpointing if self.proc_rank == 0 and self.checkpoint_callback is not None: print('save callback...') - self.checkpoint_callback.on_epoch_end(epoch=self.current_epoch, logs=self.__tng_tqdm_dic) + self.checkpoint_callback.on_epoch_end(epoch=self.current_epoch, + logs=self.__tng_tqdm_dic) diff --git a/pytorch_lightning/pt_overrides/override_data_parallel.py b/pytorch_lightning/pt_overrides/override_data_parallel.py index f06f8030ad500..ab88d286bb73b 100644 --- a/pytorch_lightning/pt_overrides/override_data_parallel.py +++ b/pytorch_lightning/pt_overrides/override_data_parallel.py @@ -63,7 +63,6 @@ def forward(self, *inputs, **kwargs): outputs = self.parallel_apply(replicas, inputs, kwargs) return self.gather(outputs, self.output_device) - def parallel_apply(self, replicas, inputs, kwargs): return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)]) diff --git a/pytorch_lightning/root_module/grads.py b/pytorch_lightning/root_module/grads.py index d50fa450961a2..7bdc8572b97fb 100644 --- a/pytorch_lightning/root_module/grads.py +++ b/pytorch_lightning/root_module/grads.py @@ -4,6 +4,7 @@ from torch import nn + class GradInformation(nn.Module): def grad_norm(self, norm_type): @@ -16,12 +17,13 @@ def grad_norm(self, norm_type): total_norm += param_norm ** norm_type norm = param_norm ** (1 / norm_type) - results['grad_{}_norm_{}'.format(norm_type, i)] = round(norm.data.cpu().numpy().flatten()[0], 3) - except Exception as e: + grad = round(norm.data.cpu().numpy().flatten()[0], 3) + results['grad_{}_norm_{}'.format(norm_type, i)] = grad + except Exception: # this param had no grad pass total_norm = total_norm ** (1. 
/ norm_type) - results['grad_{}_norm_total'.format(norm_type)] = round(total_norm.data.cpu().numpy().flatten()[0], 3) + grad = round(total_norm.data.cpu().numpy().flatten()[0], 3) + results['grad_{}_norm_total'.format(norm_type)] = grad return results - diff --git a/pytorch_lightning/root_module/hooks.py b/pytorch_lightning/root_module/hooks.py index 849826a83eabb..00ece234824e5 100644 --- a/pytorch_lightning/root_module/hooks.py +++ b/pytorch_lightning/root_module/hooks.py @@ -43,4 +43,3 @@ def on_after_backward(self): :return: """ pass - diff --git a/pytorch_lightning/root_module/memory.py b/pytorch_lightning/root_module/memory.py index 128fb16f18b43..3a636b040a270 100644 --- a/pytorch_lightning/root_module/memory.py +++ b/pytorch_lightning/root_module/memory.py @@ -94,7 +94,7 @@ def get_parameter_sizes(self): mods = list(self.model.modules()) sizes = [] - for i in range(1,len(mods)): + for i in range(1, len(mods)): m = mods[i] p = list(m.parameters()) modsz = [] @@ -127,7 +127,7 @@ def make_summary(self): if self.model.example_input_array is not None: cols.extend(['In_sizes', 'Out_sizes']) - df = pd.DataFrame(np.zeros( (len(self.layer_names), len(cols)))) + df = pd.DataFrame(np.zeros((len(self.layer_names), len(cols)))) df.columns = cols df['Name'] = self.layer_names @@ -152,16 +152,16 @@ def summarize(self): self.make_summary() -def print_mem_stack(): # pragma: no cover +def print_mem_stack(): # pragma: no cover for obj in gc.get_objects(): try: if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)): print(type(obj), obj.size()) - except Exception as e: + except Exception: pass -def count_mem_items(): # pragma: no cover +def count_mem_items(): # pragma: no cover nb_params = 0 nb_tensors = 0 for obj in gc.get_objects(): @@ -172,7 +172,7 @@ def count_mem_items(): # pragma: no cover nb_params += 1 else: nb_tensors += 1 - except Exception as e: + except Exception: pass return nb_params, nb_tensors @@ -196,6 +196,6 @@ def get_gpu_memory_map(): gpu_memory = [int(x) for x in result.strip().split('\n')] gpu_memory_map = {} for k, v in zip(range(len(gpu_memory)), gpu_memory): - k = f'gpu_{k}' + k = 'gpu_%i' % k gpu_memory_map[k] = v return gpu_memory_map diff --git a/pytorch_lightning/root_module/model_saving.py b/pytorch_lightning/root_module/model_saving.py index 0bde0943f45a8..0765142cd537f 100644 --- a/pytorch_lightning/root_module/model_saving.py +++ b/pytorch_lightning/root_module/model_saving.py @@ -3,7 +3,8 @@ import torch -from ..pt_overrides.override_data_parallel import LightningDistributedDataParallel, LightningDataParallel +from ..pt_overrides.override_data_parallel import ( + LightningDistributedDataParallel, LightningDataParallel) class ModelIO(object): @@ -45,7 +46,8 @@ def on_hpc_load(self, checkpoint): class TrainerIO(object): def __get_model(self): - is_dp_module = type(self.model) is LightningDistributedDataParallel or type(self.model) is LightningDataParallel + is_dp_module = isinstance(self.model, (LightningDistributedDataParallel, + LightningDataParallel)) model = self.model.module if is_dp_module else self.model return model diff --git a/pytorch_lightning/root_module/root_module.py b/pytorch_lightning/root_module/root_module.py index 1421a72d5b357..96dbfdcd442a0 100644 --- a/pytorch_lightning/root_module/root_module.py +++ b/pytorch_lightning/root_module/root_module.py @@ -129,6 +129,3 @@ def freeze(self): def unfreeze(self): for param in self.parameters(): param.requires_grad = True - - - diff --git a/pytorch_lightning/testing/lm_test_module.py 
b/pytorch_lightning/testing/lm_test_module.py index 9861810e8a7f8..61ecf874f337d 100644 --- a/pytorch_lightning/testing/lm_test_module.py +++ b/pytorch_lightning/testing/lm_test_module.py @@ -48,11 +48,13 @@ def __build_model(self): Layout model :return: """ - self.c_d1 = nn.Linear(in_features=self.hparams.in_features, out_features=self.hparams.hidden_dim) + self.c_d1 = nn.Linear(in_features=self.hparams.in_features, + out_features=self.hparams.hidden_dim) self.c_d1_bn = nn.BatchNorm1d(self.hparams.hidden_dim) self.c_d1_drop = nn.Dropout(self.hparams.drop_prob) - self.c_d2 = nn.Linear(in_features=self.hparams.hidden_dim, out_features=self.hparams.out_features) + self.c_d2 = nn.Linear(in_features=self.hparams.hidden_dim, + out_features=self.hparams.out_features) # --------------------- # TRAINING @@ -191,8 +193,10 @@ def configure_optimizers(self): def __dataloader(self, train): # init data generators - transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (1.0,))]) - dataset = MNIST(root=self.hparams.data_root, train=train, transform=transform, download=True) + transform = transforms.Compose([transforms.ToTensor(), + transforms.Normalize((0.5,), (1.0,))]) + dataset = MNIST(root=self.hparams.data_root, train=train, + transform=transform, download=True) # when using multi-node we need to add the datasampler train_sampler = None @@ -202,7 +206,7 @@ def __dataloader(self, train): if self.on_gpu and not self.force_remove_distributed_sampler: train_sampler = DistributedSampler(dataset, rank=self.trainer.proc_rank) batch_size = batch_size // self.trainer.world_size # scale batch size - except Exception as e: + except Exception: pass should_shuffle = train_sampler is None @@ -242,19 +246,24 @@ def add_model_specific_args(parent_parser, root_dir): # network params parser.opt_list('--drop_prob', default=0.2, options=[0.2, 0.5], type=float, tunable=False) - parser.add_argument('--in_features', default=28*28, type=int) + parser.add_argument('--in_features', default=28 * 28, type=int) parser.add_argument('--out_features', default=10, type=int) - parser.add_argument('--hidden_dim', default=50000, type=int) # use 500 for CPU, 50000 for GPU to see speed difference + # use 500 for CPU, 50000 for GPU to see speed difference + parser.add_argument('--hidden_dim', default=50000, type=int) # data parser.add_argument('--data_root', default=os.path.join(root_dir, 'mnist'), type=str) # training params (opt) - parser.opt_list('--learning_rate', default=0.001*8, type=float, options=[0.0001, 0.0005, 0.001, 0.005], + parser.opt_list('--learning_rate', default=0.001 * 8, type=float, + options=[0.0001, 0.0005, 0.001, 0.005], tunable=False) - parser.opt_list('--optimizer_name', default='adam', type=str, options=['adam'], tunable=False) - - # if using 2 nodes with 4 gpus each the batch size here (256) will be 256 / (2*8) = 16 per gpu - parser.opt_list('--batch_size', default=256*8, type=int, options=[32, 64, 128, 256], tunable=False, - help='batch size will be divided over all the gpus being used across all nodes') + parser.opt_list('--optimizer_name', default='adam', type=str, + options=['adam'], tunable=False) + + # if using 2 nodes with 4 gpus each the batch size here + # (256) will be 256 / (2*8) = 16 per gpu + parser.opt_list('--batch_size', default=256 * 8, type=int, + options=[32, 64, 128, 256], tunable=False, + help='batch size will be divided over all gpus being used across all nodes') return parser diff --git a/pytorch_lightning/utilities/arg_parse.py 
b/pytorch_lightning/utilities/arg_parse.py index f274e751c3b6d..39d4ec81f9486 100644 --- a/pytorch_lightning/utilities/arg_parse.py +++ b/pytorch_lightning/utilities/arg_parse.py @@ -3,32 +3,46 @@ Might need to update with the new flags """ +import os + + def add_default_args(parser, root_dir, rand_seed=None, possible_model_names=None): # tng, test, val check intervals - parser.add_argument('--eval_test_set', dest='eval_test_set', action='store_true', help='true = run test set also') - parser.add_argument('--check_val_every_n_epoch', default=1, type=int, help='check val every n epochs') + parser.add_argument('--eval_test_set', dest='eval_test_set', action='store_true', + help='true = run test set also') + parser.add_argument('--check_val_every_n_epoch', default=1, type=int, + help='check val every n epochs') parser.opt_list('--accumulate_grad_batches', default=1, type=int, tunable=False, - help='accumulates gradients k times before applying update. Simulates huge batch size') + help='accumulates gradients k times before applying update.' + ' Simulates huge batch size') parser.add_argument('--max_nb_epochs', default=200, type=int, help='cap epochs') parser.add_argument('--min_nb_epochs', default=2, type=int, help='min epochs') - parser.add_argument('--train_percent_check', default=1.0, type=float, help='how much of tng set to check') - parser.add_argument('--val_percent_check', default=1.0, type=float, help='how much of val set to check') - parser.add_argument('--test_percent_check', default=1.0, type=float, help='how much of test set to check') - - parser.add_argument('--val_check_interval', default=0.95, type=float, help='how much within 1 epoch to check val') - parser.add_argument('--log_save_interval', default=100, type=int, help='how many batches between log saves') - parser.add_argument('--add_log_row_interval', default=100, type=int, help='add log every k batches') + parser.add_argument('--train_percent_check', default=1.0, type=float, + help='how much of tng set to check') + parser.add_argument('--val_percent_check', default=1.0, type=float, + help='how much of val set to check') + parser.add_argument('--test_percent_check', default=1.0, type=float, + help='how much of test set to check') + + parser.add_argument('--val_check_interval', default=0.95, type=float, + help='how much within 1 epoch to check val') + parser.add_argument('--log_save_interval', default=100, type=int, + help='how many batches between log saves') + parser.add_argument('--add_log_row_interval', default=100, type=int, + help='add log every k batches') # early stopping parser.add_argument('--disable_early_stop', dest='enable_early_stop', action='store_false') parser.add_argument('--early_stop_metric', default='val_acc', type=str) parser.add_argument('--early_stop_mode', default='min', type=str) - parser.add_argument('--early_stop_patience', default=3, type=int, help='number of epochs until stop') + parser.add_argument('--early_stop_patience', default=3, type=int, + help='number of epochs until stop') # gradient handling parser.add_argument('--gradient_clip', default=-1, type=int) - parser.add_argument('--track_grad_norm', default=-1, type=int, help='if > 0, will track this grad norm') + parser.add_argument('--track_grad_norm', default=-1, type=int, + help='if > 0, will track this grad norm') # model saving parser.add_argument('--model_save_path', default=root_dir + '/model_weights') @@ -44,7 +58,8 @@ def add_default_args(parser, root_dir, rand_seed=None, possible_model_names=None # test_tube settings 
parser.add_argument('-en', '--tt_name', default='pt_test') parser.add_argument('-td', '--tt_description', default='pytorch lightning test') - parser.add_argument('--tt_save_path', default=root_dir + '/test_tube_logs', help='logging dir') + parser.add_argument('--tt_save_path', default=os.path.join(root_dir, 'test_tube_logs'), + help='logging dir') parser.add_argument('--enable_single_run', dest='single_run', action='store_true') parser.add_argument('--nb_hopt_trials', default=1, type=int) parser.add_argument('--log_stdout', dest='log_stdout', action='store_true') @@ -55,25 +70,30 @@ def add_default_args(parser, root_dir, rand_seed=None, possible_model_names=None parser.add_argument('--default_tensor_type', default='torch.cuda.FloatTensor', type=str) parser.add_argument('--use_amp', dest='use_amp', action='store_true') parser.add_argument('--check_grad_nans', dest='check_grad_nans', action='store_true') - parser.add_argument('--amp_level', default='O2',type=str) - + parser.add_argument('--amp_level', default='O2', type=str) # run on hpc parser.add_argument('--on_cluster', dest='on_cluster', action='store_true') # FAST training # use these settings to make sure network has no bugs without running a full dataset - parser.add_argument('--fast_dev_run', dest='fast_dev_run', default=False, action='store_true', help='runs validation after 1 tng step') - parser.add_argument('--enable_tqdm', dest='enable_tqdm', default=False, action='store_true', help='false removes the prog bar') - parser.add_argument('--overfit', default=-1, type=float, help='% of dataset to use with this option. float, or -1 for none') + parser.add_argument('--fast_dev_run', dest='fast_dev_run', default=False, action='store_true', + help='runs validation after 1 tng step') + parser.add_argument('--enable_tqdm', dest='enable_tqdm', default=False, action='store_true', + help='false removes the prog bar') + parser.add_argument('--overfit', default=-1, type=float, + help='% of dataset to use with this option. 
float, or -1 for none') # debug args if rand_seed is not None: parser.add_argument('--random_seed', default=rand_seed, type=int) - parser.add_argument('--interactive', dest='interactive', action='store_true', help='runs on gpu without cluster') - parser.add_argument('--debug', dest='debug', action='store_true', help='enables/disables test tube') - parser.add_argument('--local', dest='local', action='store_true', help='enables local tng') + parser.add_argument('--interactive', dest='interactive', action='store_true', + help='runs on gpu without cluster') + parser.add_argument('--debug', dest='debug', action='store_true', + help='enables/disables test tube') + parser.add_argument('--local', dest='local', action='store_true', + help='enables local tng') # optimizer parser.add_argument('--lr_scheduler_milestones', default=None, type=str) diff --git a/requirements.txt b/requirements.txt index aba573c0c6085..86c02573bf48e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,5 @@ tqdm==4.32.1 twine==1.13.0 numpy==1.16.4 torch>=1.1.0 -torchvision==0.3.0 +torchvision>=0.3.0 +pandas \ No newline at end of file diff --git a/setup.py b/setup.py index e680ef4301207..a8f0d370834cc 100755 --- a/setup.py +++ b/setup.py @@ -1,29 +1,55 @@ #!/usr/bin/env python -from setuptools import setup, find_packages +from setuptools import setup + +import pytorch_lightning # https://packaging.python.org/guides/single-sourcing-package-version/ # http://blog.ionelmc.ro/2014/05/25/python-packaging/ setup( name="pytorch-lightning", - version='0.3.6.9', - description="The Keras for ML researchers using PyTorch", - author="William Falcon", - author_email="waf2107@columbia.edu", - url="https://github.com/williamFalcon/pytorch-lightning", - download_url="https://github.com/williamFalcon/pytorch-lightning", - license="MIT", + version=pytorch_lightning.__version__, + description=pytorch_lightning.__doc__, + author=pytorch_lightning.__author__, + author_email=pytorch_lightning.__author_email__, + url=pytorch_lightning.__homepage__, + license=pytorch_lightning.__license__, + packages=['pytorch_lightning'], + + long_description=open("README.md", encoding="utf-8").read(), + long_description_content_type='text/markdown', + + include_package_data=True, + zip_safe=False, + keywords=["deep learning", "pytorch", "AI"], - python_requires=">=3.5", + python_requires=">=3.6", install_requires=[ "torch>=1.1.0", "tqdm", "test-tube>=0.6.7.6", ], - packages=find_packages(), - long_description=open("README.md", encoding="utf-8").read(), - long_description_content_type='text/markdown', - include_package_data=True, - zip_safe=False, + + classifiers=[ + 'Environment :: Console', + 'Natural Language :: English', + # How mature is this project? Common values are + # 3 - Alpha, 4 - Beta, 5 - Production/Stable + 'Development Status :: 4 - Beta', + # Indicate who your project is intended for + 'Intended Audience :: Developers', + 'Topic :: Scientific/Engineering :: Artificial Intelligence', + 'Topic :: Scientific/Engineering :: Image Recognition', + 'Topic :: Scientific/Engineering :: Information Analysis', + # Pick your license as you wish + 'License :: OSI Approved :: BSD License', + 'Operating System :: OS Independent', + # Specify the Python versions you support here. In particular, ensure + # that you indicate whether you support Python 2, Python 3 or both. 
+ 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', + ], ) diff --git a/tests/README.md b/tests/README.md index a121ff14145e1..032cf04f1d2de 100644 --- a/tests/README.md +++ b/tests/README.md @@ -17,7 +17,7 @@ pip install -e . pip install -r requirements.txt # run tests -py.test +py.test -v ``` To test models that require GPU make sure to run the above command on a GPU machine. @@ -50,10 +50,14 @@ cd pytorch-lightning # generate coverage pip install coverage -coverage run tests/test_models.py +coverage run --source pytorch_lightning -m py.test pytorch_lightning tests examples -v --doctest-modules # print coverage stats -coverage report -m +coverage report -m + +# exporting results +coverage xml +codecov -t 17327163-8cca-4a5d-86c8-ca5f2ef700bc -v ``` diff --git a/tests/debug.py b/tests/debug.py index 09e9186b9ccc1..6a5efbecfa2ca 100644 --- a/tests/debug.py +++ b/tests/debug.py @@ -61,8 +61,8 @@ def get_model(): root_dir = os.path.dirname(os.path.realpath(__file__)) hparams = Namespace(**{'drop_prob': 0.2, 'batch_size': 32, - 'in_features': 28*28, - 'learning_rate': 0.001*8, + 'in_features': 28 * 28, + 'learning_rate': 0.001 * 8, 'optimizer_name': 'adam', 'data_root': os.path.join(root_dir, 'mnist'), 'out_features': 10, @@ -107,7 +107,8 @@ def load_model(exp, save_dir): checkpoints = [x for x in os.listdir(save_dir) if '.ckpt' in x] weights_dir = os.path.join(save_dir, checkpoints[0]) - trained_model = LightningTemplateModel.load_from_metrics(weights_path=weights_dir, tags_csv=tags_path, on_gpu=True) + trained_model = LightningTemplateModel.load_from_metrics(weights_path=weights_dir, + tags_csv=tags_path, on_gpu=True) assert trained_model is not None, 'loading model failed' @@ -132,7 +133,7 @@ def run_prediction(dataloader, trained_model): print(val_acc) - assert val_acc > 0.70, f'this model is expected to get > 0.7 in test set (it got {val_acc})' + assert val_acc > 0.70, 'this model is expected to get > 0.7 in test set (it got %f)' % val_acc def main(): diff --git a/tests/requirements.txt b/tests/requirements.txt index c16efc2add252..076bfd6500639 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -1,2 +1,8 @@ -coverage==4.5.3 -pytest==5.0.1 +tox +coverage +codecov +pytest>=3.0.5 +pytest-cov +flake8 +check-manifest +test_tube \ No newline at end of file diff --git a/tests/test_models.py b/tests/test_models.py index 73ba2e43c1209..cd03d8b411399 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -1,20 +1,22 @@ import os import shutil import warnings +from argparse import Namespace import pytest import numpy as np import torch +from test_tube import Experiment, SlurmCluster + +# sys.path += [os.path.abspath('..'), os.path.abspath('../..')] from pytorch_lightning import Trainer -from examples import LightningTemplateModel from pytorch_lightning.testing.lm_test_module import LightningTestModel -from argparse import Namespace -from test_tube import Experiment, SlurmCluster from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping from pytorch_lightning.utilities.debugging import MisconfigurationException from pytorch_lightning.root_module import memory from pytorch_lightning.models.trainer import reduce_distributed_output from pytorch_lightning.root_module import model_saving +from examples import LightningTemplateModel SEED = 2334 torch.manual_seed(SEED) @@ -30,10 +32,12 @@ def test_amp_gpu_ddp(): """ :return: """
if not torch.cuda.is_available(): - warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a GPU node to run this test') + warnings.warn('test_amp_gpu_ddp cannot run.' + ' Rerun on a GPU node to run this test') return if not torch.cuda.device_count() > 1: - warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a node with 2+ GPUs to run this test') + warnings.warn('test_amp_gpu_ddp cannot run.' + ' Rerun on a node with 2+ GPUs to run this test') return os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0]) @@ -105,7 +109,8 @@ def test_cpu_slurm_save_load(): # wipe-out trainer and model # retrain with not much data... this simulates picking training back up after slurm # we want to see if the weights come back correctly - continue_tng_hparams = get_hparams(continue_training=True, hpc_exp_number=cluster_a.hpc_exp_number) + continue_tng_hparams = get_hparams(continue_training=True, + hpc_exp_number=cluster_a.hpc_exp_number) trainer_options = dict( max_nb_epochs=1, cluster=SlurmCluster(continue_tng_hparams), @@ -136,11 +141,9 @@ def assert_pred_same(): def test_loading_meta_tags(): hparams = get_hparams() - save_dir = init_save_dir() - # save tags exp = get_exp(False) - exp.tag({'some_str':'a_str', 'an_int': 1, 'a_float': 2.0}) + exp.tag({'some_str': 'a_str', 'an_int': 1, 'a_float': 2.0}) exp.argparse(hparams) exp.save() @@ -221,7 +224,8 @@ def test_model_saving_loading(): # load new model tags_path = exp.get_data_path(exp.name, exp.version) tags_path = os.path.join(tags_path, 'meta_tags.csv') - model_2 = LightningTestModel.load_from_metrics(weights_path=new_weights_path, tags_csv=tags_path, on_gpu=False) + model_2 = LightningTestModel.load_from_metrics(weights_path=new_weights_path, + tags_csv=tags_path, on_gpu=False) model_2.eval() # make prediction @@ -246,10 +250,12 @@ def test_amp_gpu_ddp_slurm_managed(): :return: """ if not torch.cuda.is_available(): - warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a GPU node to run this test') + warnings.warn('test_amp_gpu_ddp cannot run.' + ' Rerun on a GPU node to run this test') return if not torch.cuda.device_count() > 1: - warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a node with 2+ GPUs to run this test') + warnings.warn('test_amp_gpu_ddp cannot run.' + ' Rerun on a node with 2+ GPUs to run this test') return # simulate setting slurm flags @@ -413,7 +419,8 @@ def test_single_gpu_model(): :return: """ if not torch.cuda.is_available(): - warnings.warn('test_single_gpu_model cannot run. Rerun on a GPU node to run this test') + warnings.warn('test_single_gpu_model cannot run.' + ' Rerun on a GPU node to run this test') return model, hparams = get_model() @@ -434,10 +441,12 @@ def test_multi_gpu_model_dp(): :return: """ if not torch.cuda.is_available(): - warnings.warn('test_multi_gpu_model_dp cannot run. Rerun on a GPU node to run this test') + warnings.warn('test_multi_gpu_model_dp cannot run.' + ' Rerun on a GPU node to run this test') return if not torch.cuda.device_count() > 1: - warnings.warn('test_multi_gpu_model_dp cannot run. Rerun on a node with 2+ GPUs to run this test') + warnings.warn('test_multi_gpu_model_dp cannot run.' + ' Rerun on a node with 2+ GPUs to run this test') return model, hparams = get_model() trainer_options = dict( @@ -460,10 +469,12 @@ def test_amp_gpu_dp(): :return: """ if not torch.cuda.is_available(): - warnings.warn('test_amp_gpu_dp cannot run. Rerun on a GPU node to run this test') + warnings.warn('test_amp_gpu_dp cannot run.'
+ ' Rerun on a GPU node to run this test') return if not torch.cuda.device_count() > 1: - warnings.warn('test_amp_gpu_dp cannot run. Rerun on a node with 2+ GPUs to run this test') + warnings.warn('test_amp_gpu_dp cannot run.' + ' Rerun on a node with 2+ GPUs to run this test') return model, hparams = get_model() trainer_options = dict( @@ -482,10 +493,12 @@ def test_multi_gpu_model_ddp(): :return: """ if not torch.cuda.is_available(): - warnings.warn('test_multi_gpu_model_ddp cannot run. Rerun on a GPU node to run this test') + warnings.warn('test_multi_gpu_model_ddp cannot run.' + ' Rerun on a GPU node to run this test') return if not torch.cuda.device_count() > 1: - warnings.warn('test_multi_gpu_model_ddp cannot run. Rerun on a node with 2+ GPUs to run this test') + warnings.warn('test_multi_gpu_model_ddp cannot run.' + ' Rerun on a node with 2+ GPUs to run this test') return os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0]) @@ -502,7 +515,6 @@ def test_multi_gpu_model_ddp(): run_gpu_model_test(trainer_options, model, hparams) - def test_ddp_sampler_error(): """ Make sure DDP + AMP work @@ -587,8 +599,8 @@ def get_hparams(continue_training=False, hpc_exp_number=0): args = { 'drop_prob': 0.2, 'batch_size': 32, - 'in_features': 28*28, - 'learning_rate': 0.001*8, + 'in_features': 28 * 28, + 'learning_rate': 0.001 * 8, 'optimizer_name': 'adam', 'data_root': os.path.join(root_dir, 'mnist'), 'out_features': 10, @@ -673,13 +685,13 @@ def run_prediction(dataloader, trained_model): print(val_acc) - assert val_acc > 0.50, f'this model is expected to get > 0.50 in test set (it got {val_acc})' + assert val_acc > 0.50, 'this model is expected to get > 0.50 in test set (it got %f)' % val_acc def assert_ok_acc(trainer): # this model should get 0.80+ acc acc = trainer.tng_tqdm_dic['val_acc'] - assert acc > 0.50, f'model failed to get expected 0.50 validation accuracy. Got: {acc}' + assert acc > 0.50, 'model failed to get expected 0.50 validation accuracy. Got: %f' % acc if __name__ == '__main__': diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000000000..a91ac5efce9d5 --- /dev/null +++ b/tox.ini @@ -0,0 +1,47 @@ +# this file is *not* meant to cover or endorse the use of tox or pytest or testing in general, +# +# It's meant to show the use of: +# +# - check-manifest +# confirm items checked into vcs are in your sdist +# - python setup.py check +# confirm required package meta-data in setup.py +# - readme_renderer (when using a ReStructuredText README) +# confirms your long_description will render correctly on PyPI. +# +# and also to help confirm pull requests to this project. + +[tox] +envlist = py{35,36,37} + +[pytest] +log_cli = 0 +log_cli_level = CRITICAL +log_cli_format = %(message)s +log_file = pytest.log +log_file_level = DEBUG +log_file_format = %(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(lineno)s) +log_file_date_format=%Y-%m-%d %H:%M:%S + +[testenv] +basepython = + py35: python3.5 + py36: python3.6 + py37: python3.7 +deps = + -r requirements.txt + -r ./tests/requirements.txt +commands = + check-manifest --ignore tox.ini + python setup.py check -m -s + coverage run --source pytorch_lightning -m py.test pytorch_lightning tests examples -v --doctest-modules + flake8 .
+ +[flake8] +exclude = .tox,*.egg,build,temp +select = E,W,F +doctests = True +verbose = 2 +# https://pep8.readthedocs.io/en/latest/intro.html#error-codes +format = pylint +max-line-length = 100 diff --git a/update.sh b/update.sh index 4eaf1149d083f..40fcc22d6b79b 100644 --- a/update.sh +++ b/update.sh @@ -11,10 +11,7 @@ rm -rf ./dist/* python3 setup.py sdist twine upload dist/* - - # to update docs # cd to root dir # mkdocs gh-deploy -
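Editor's note on the setup.py change above: the new `version=pytorch_lightning.__version__`, `author=pytorch_lightning.__author__`, etc. lines assume the package's top-level `__init__.py` exposes those attributes, which is not shown in this diff. Below is only a minimal sketch of what such a module would need to define; the attribute values are placeholders (taken from the old hard-coded setup.py where available), not the project's actual metadata.

```python
# Hypothetical sketch of pytorch_lightning/__init__.py -- not part of this diff.
# setup.py imports the package and reads these module-level attributes, so they
# must be defined at import time. All values here are placeholders.

"""The Keras for ML researchers using PyTorch."""  # read by setup() as pytorch_lightning.__doc__

__version__ = '0.3.6.9'       # placeholder; single source of truth for the release version
__author__ = 'William Falcon'
__author_email__ = 'waf2107@columbia.edu'
__homepage__ = 'https://github.com/williamFalcon/pytorch-lightning'
__license__ = 'MIT'           # placeholder; keep consistent with the License classifier in setup.py
```

One trade-off of this single-sourcing pattern is that `setup.py` must be able to import `pytorch_lightning` in a clean build environment, so the top-level `__init__.py` should avoid unconditionally importing heavy dependencies such as `torch`, or installation from source can fail before the requirements are installed.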