diff --git a/.codecov.yml b/.codecov.yml new file mode 100644 index 0000000000000..f10c1873295db --- /dev/null +++ b/.codecov.yml @@ -0,0 +1,42 @@ +#see https://github.com/codecov/support/wiki/Codecov-Yaml +codecov: + notify: + require_ci_to_pass: yes + +coverage: + precision: 0 # 2 = xx.xx%, 0 = xx% + round: nearest # how coverage is rounded: down/up/nearest + range: 40...100 # custom range of coverage colors from red -> yellow -> green + status: + # https://codecov.readme.io/v1.0/docs/commit-status + project: + default: + against: auto + target: 99% # specify the target coverage for each commit status + threshold: 20% # allow this little decrease on project + # https://github.com/codecov/support/wiki/Filtering-Branches + # branches: master + if_ci_failed: error + # https://github.com/codecov/support/wiki/Patch-Status + patch: + default: + against: auto + target: 40% # specify the target "X%" coverage to hit + # threshold: 50% # allow this much decrease on patch + changes: false + +parsers: + gcov: + branch_detection: + conditional: true + loop: true + macro: false + method: false + javascript: + enable_partials: false + +comment: + layout: header, diff + require_changes: false + behavior: default # update if exists else create new + # branches: * diff --git a/.travis.yml b/.travis.yml index d80b291f14299..fb1f8c95410b7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,17 +1,46 @@ +# vim ft=yaml + +# After changing this file, check it on: +# http://yaml-online-parser.appspot.com/ + +# See doc/travis_notes.txt for some guidelines + +# this file is *not* meant to cover or endorse the use of travis, but rather to +# help confirm pull requests to this project. + +dist: xenial # Ubuntu 16.04 + +env: + global: + - DISPLAY="" + language: python -python: - - "3.7" -# command to install dependencies + +matrix: + include: + - python: 3.6 + env: TOXENV=py36 + - python: 3.7 + env: TOXENV=py37 + +# See http://docs.travis-ci.com/user/caching/#pip-cache cache: pip + install: - - pip install -e . - pip install -r requirements.txt - - pip install -r tests/requirements.txt - - pip install -U numpy + - pip install -r ./tests/requirements.txt + - pip --version ; pip list -# keep build from timing out -dist: xenial - -# command to run tests script: - - py.test -v # or py.test for Python versions 3.5 and below \ No newline at end of file + # integration + - tox --sitepackages + - python setup.py install --dry-run + +after_success: + - coverage report + # disable auto coverage bc it isn't accurate since it misses gpu code. 
+ # to get coverage, run local and push results + # - codecov + +notifications: + email: false diff --git a/MANIFEST.in b/MANIFEST.in index 53c1b22055d2e..e39ffbad5520d 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,9 +1,36 @@ -graft docs +# Manifest syntax https://docs.python.org/2/distutils/sourcedist.html +graft wheelhouse -include COPYING -include AUTHORS +recursive-include birl *.py +recursive-exclude __pycache__ *.py[cod] *.orig -recursive-include src/einsteinpy/tests *.py *.html +# Include the README +include *.md -prune docs/source/examples/.ipynb_checkpoints -global-exclude *.py[cod] __pycache__ *.so *.dylib +# Include the license file +include LICENSE + +exclude *.sh +exclude *.toml +recursive-include examples *.py +recursive-include pytorch_lightning *.py + +# exclude tests from package +recursive-exclude tests * +exclude tests + +# Exclude the documentation files +recursive-exclude docs * +exclude docs + +# Include the Requirements +include requirements.txt + +# Exclude build configs +exclude *.yml + +prune .git +prune .github +prune notebook* +prune temp* +prune test* \ No newline at end of file diff --git a/README.md b/README.md index 7542eeef3a6f2..b2d286ece826c 100644 --- a/README.md +++ b/README.md @@ -1,25 +1,26 @@ -

- [removed HTML header block: centered logo, "PyTorch Lightning" title,
-  tagline "The PyTorch Keras for ML researchers. More control. Less boilerplate.",
-  and PyPI-version / supported-Python-version badges]
+
+![Logo](./docs/source/_static/lightning_logo_small.png) + +# PyTorch Lightning + +**The PyTorch Keras for ML researchers. More control. Less boilerplate.** + + +[![PyPI Status](https://badge.fury.io/py/pytorch-lightning.svg)](https://badge.fury.io/py/pytorch-lightning) +[![PyPI Status](https://pepy.tech/badge/pytorch-lightning)](https://pepy.tech/project/pytorch-lightning) +[![Build Status](https://travis-ci.org/williamFalcon/pytorch-lightning.svg?branch=master)](https://travis-ci.org/williamFalcon/pytorch-lightning) + +[![codecov](https://codecov.io/gh/Borda/pytorch-lightning/branch/master/graph/badge.svg)](https://codecov.io/gh/Borda/pytorch-lightning) +[![CodeFactor](https://www.codefactor.io/repository/github/borda/pytorch-lightning/badge)](https://www.codefactor.io/repository/github/borda/pytorch-lightning) +[![ReadTheDocs](https://readthedocs.org/projects/pytorch-lightning/badge/?version=latest)](https://pytorch-lightning.readthedocs.io/en/latest) +[![license](https://img.shields.io/badge/License-MIT-yellow.svg)](https://github.com/williamFalcon/pytorch-lightning/blob/master/LICENSE) + +
+ +Simple installation from PyPI ```bash pip install pytorch-lightning ``` @@ -127,7 +128,7 @@ trainer = Trainer(experiment=exp, max_nb_epochs=1, train_percent_check=0.1) trainer.fit(model) # view tensorflow logs -print(f'View tensorboard logs by running\ntensorboard --logdir {os.getcwd()}') +print('View tensorboard logs by running\ntensorboard --logdir %s' % os.getcwd()) print('and going to http://localhost:6006 on your browser') ``` @@ -137,11 +138,7 @@ print('and going to http://localhost:6006 on your browser') Everything in gray! You define the blue parts using the LightningModule interface: -

- [removed HTML image block]
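Since a diff only carries the changed hunks, the README's own interface example is not visible here. As a point of reference, below is a minimal sketch of the interface being described, using the hook names that appear in `examples/new_project_templates/lightning_module_template.py` elsewhere in this diff (`training_step`, `validation_step`, `validation_end`, `configure_optimizers`, `tng_dataloader`, `val_dataloader`, `test_dataloader`). The model body, optimizer settings, and MNIST wiring are illustrative assumptions rather than the template's exact code, and the dataloaders are exposed as plain properties here; the actual template may instead use the `data_loader` decorator that this diff exports from `pytorch_lightning/__init__.py`.

```python
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import MNIST

from pytorch_lightning import LightningModule


class CoolModel(LightningModule):
    """Minimal sketch of the hooks a LightningModule defines."""

    def __init__(self):
        super(CoolModel, self).__init__()
        self.l1 = torch.nn.Linear(28 * 28, 10)

    def forward(self, x):
        return torch.relu(self.l1(x.view(x.size(0), -1)))

    def training_step(self, data_batch, batch_nb):
        x, y = data_batch
        # the trainer reads output['loss'] (a bare loss tensor also works)
        return {'loss': F.cross_entropy(self.forward(x), y)}

    def validation_step(self, data_batch, batch_nb):
        x, y = data_batch
        return {'val_loss': F.cross_entropy(self.forward(x), y)}

    def validation_end(self, outputs):
        # aggregate the per-batch outputs of validation_step
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        return {'avg_val_loss': avg_loss}

    def configure_optimizers(self):
        return [torch.optim.Adam(self.parameters(), lr=0.02)]

    def __dataloader(self, train):
        dataset = MNIST('mnist', train=train, download=True,
                        transform=transforms.ToTensor())
        return DataLoader(dataset, batch_size=32, shuffle=train)

    @property
    def tng_dataloader(self):
        return self.__dataloader(train=True)

    @property
    def val_dataloader(self):
        return self.__dataloader(train=False)

    @property
    def test_dataloader(self):
        return self.__dataloader(train=False)
```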

+![Overview](./docs/source/_static/overview_flat.jpg) ```{.python} # what to do in the training loop @@ -223,19 +220,11 @@ def validation_end(self, outputs): ## Tensorboard Lightning is fully integrated with tensorboard. -

- [removed HTML image block]
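For orientation (the screenshot below shows the resulting loss curves), here is a minimal sketch of the wiring, reusing the `Trainer(experiment=exp, ...)` and `trainer.fit(model)` calls from the README hunk above; the save path and `CoolModel` are assumptions carried over from the earlier sketch:

```python
from test_tube import Experiment

from pytorch_lightning import Trainer

# build the experiment that backs the tensorboard logs (save_dir is an assumed path)
exp = Experiment(save_dir='/some/save/path')

model = CoolModel()  # any LightningModule, e.g. the sketch above
trainer = Trainer(experiment=exp, max_nb_epochs=1, train_percent_check=0.1)
trainer.fit(model)

# then inspect the run with: tensorboard --logdir /some/save/path
```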

+![tensorboard-support](./docs/source/_static/tf_loss.png) Lightning also adds a text column with all the hyperparameters for this experiment. -

- [removed HTML image block]
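How those hyperparameters reach the experiment: `exp.argparse()` and `exp.tag()` below are the same test-tube calls used in `tests/test_models.py` later in this diff; the hparams values themselves are made up for illustration.

```python
from argparse import Namespace

from test_tube import Experiment

hparams = Namespace(batch_size=32, learning_rate=0.02)  # made-up values

exp = Experiment(save_dir='/some/save/path')  # assumed path
exp.argparse(hparams)               # parsed hyperparameters -> text column in tensorboard
exp.tag({'notes': 'baseline run'})  # arbitrary extra key/value pairs
exp.save()
```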

+![tensorboard-support](./docs/source/_static/tf_tags.png) Simply note the path you set for the Experiment ``` {.python} diff --git a/appveyor.yml b/appveyor.yml new file mode 100644 index 0000000000000..ad03a4786b057 --- /dev/null +++ b/appveyor.yml @@ -0,0 +1,66 @@ +# https://www.appveyor.com/docs/appveyor-yml/ +environment: + + # SDK v7.0 MSVC Express 2008's SetEnv.cmd script will fail if the + # /E:ON and /V:ON options are not enabled in the batch script interpreter + # See: http://stackoverflow.com/a/13751649/163740 + CMD_IN_ENV: "cmd /E:ON /V:ON /C obvci_appveyor_python_build_env.cmd" + + matrix: + # Pre-installed Python versions, which Appveyor may upgrade to + # a later point release. + # See: http://www.appveyor.com/docs/installed-software#python + + + # - PYTHON: "C:\\Python35-x64" + # PYTHON_VERSION: "3.5.x" + # PYTHON_ARCH: "64" + # TOXENV: "py35" + + - PYTHON: "C:\\Python36-x64" + PYTHON_VERSION: "3.6.x" + PYTHON_ARCH: "64" + TOXENV: "py36" + PIP_PYVER: "36" + + - PYTHON: "C:\\Python37-x64" + PYTHON_VERSION: "3.7.x" + PYTHON_ARCH: "64" + TOXENV: "py37" + PIP_PYVER: "37" + +build: off + +# https://www.appveyor.com/docs/build-cache/ +cache: + - C:\ProgramData\chocolatey\bin -> appveyor.yml + - C:\ProgramData\chocolatey\lib -> appveyor.yml + - '%LOCALAPPDATA%\pip\Cache -> appveyor.yml' + +# scripts that run after cloning repository +install: + # If there is a newer build queued for the same PR, cancel this one. + # The AppVeyor 'rollout builds' option is supposed to serve the same + # purpose but it is problematic because it tends to cancel builds pushed + # directly to master instead of just PR builds (or the converse). + - SET PATH=%PYTHON%;%PYTHON%\\Scripts;%path% + - pip install -U --user pip + - pip install "https://download.pytorch.org/whl/cu90/torch-1.1.0-cp%PIP_PYVER%-cp%PIP_PYVER%m-win_amd%PYTHON_ARCH%.whl" + pip install "https://download.pytorch.org/whl/cu90/torchvision-0.3.0-cp%PIP_PYVER%-cp%PIP_PYVER%m-win_amd%PYTHON_ARCH%.whl" + - pip install -r requirements.txt + - pip install -r ./tests/requirements.txt + +# scripts to run before tests (working directory and environment changes are persisted from the previous steps such as "before_build") +before_test: + - python --version + - pip --version + - pip list + - dir + +# to run your custom scripts instead of automatic tests +test_script: + - tox --sitepackages --parallel auto + +on_success: + - coverage report + # - codecov diff --git a/coverage.svg b/coverage.svg deleted file mode 100644 index 6bfc8faf24d3c..0000000000000 --- a/coverage.svg +++ /dev/null @@ -1,21 +0,0 @@ - - - - - - - - - - - - - - - - coverage - coverage - 99% - 99% - - diff --git a/docs/Trainer/Distributed training.md b/docs/Trainer/Distributed training.md index aedbd20ed1f42..cafc719df0a95 100644 --- a/docs/Trainer/Distributed training.md +++ b/docs/Trainer/Distributed training.md @@ -94,7 +94,7 @@ cluster.add_command('export NCCL_SOCKET_IFNAME=^docker0,lo') cluster.add_command('export NCCL_DEBUG=INFO') # setting a master port here is a good idea. 
-cluster.add_command(f'export MASTER_PORT={PORT}') +cluster.add_command('export MASTER_PORT=%r' % PORT) # good to load the latest NCCL version cluster.load_modules(['NCCL/2.4.7-1-cuda.10.0']) diff --git a/docs/source/_static/lightning_logo_medium.png b/docs/source/_static/lightning_logo_medium.png new file mode 100644 index 0000000000000..a28606b541632 Binary files /dev/null and b/docs/source/_static/lightning_logo_medium.png differ diff --git a/docs/source/_static/lightning_logo_small.png b/docs/source/_static/lightning_logo_small.png new file mode 100644 index 0000000000000..17d0aa92bce2b Binary files /dev/null and b/docs/source/_static/lightning_logo_small.png differ diff --git a/examples/__init__.py b/examples/__init__.py index 6743d7f97919c..0d456dacb6443 100644 --- a/examples/__init__.py +++ b/examples/__init__.py @@ -1 +1,5 @@ -from .new_project_templates.lightning_module_template import LightningTemplateModel \ No newline at end of file +from .new_project_templates.lightning_module_template import LightningTemplateModel + +__all__ = [ + 'LightningTemplateModel' +] diff --git a/examples/new_project_templates/lightning_module_template.py b/examples/new_project_templates/lightning_module_template.py index 483a4e3a048df..d6bdd13068e40 100644 --- a/examples/new_project_templates/lightning_module_template.py +++ b/examples/new_project_templates/lightning_module_template.py @@ -47,11 +47,13 @@ def __build_model(self): Layout model :return: """ - self.c_d1 = nn.Linear(in_features=self.hparams.in_features, out_features=self.hparams.hidden_dim) + self.c_d1 = nn.Linear(in_features=self.hparams.in_features, + out_features=self.hparams.hidden_dim) self.c_d1_bn = nn.BatchNorm1d(self.hparams.hidden_dim) self.c_d1_drop = nn.Dropout(self.hparams.drop_prob) - self.c_d2 = nn.Linear(in_features=self.hparams.hidden_dim, out_features=self.hparams.out_features) + self.c_d2 = nn.Linear(in_features=self.hparams.hidden_dim, + out_features=self.hparams.out_features) # --------------------- # TRAINING @@ -171,8 +173,10 @@ def configure_optimizers(self): def __dataloader(self, train): # init data generators - transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (1.0,))]) - dataset = MNIST(root=self.hparams.data_root, train=train, transform=transform, download=True) + transform = transforms.Compose([transforms.ToTensor(), + transforms.Normalize((0.5,), (1.0,))]) + dataset = MNIST(root=self.hparams.data_root, train=train, + transform=transform, download=True) # when using multi-node we need to add the datasampler train_sampler = None @@ -182,7 +186,7 @@ def __dataloader(self, train): if self.on_gpu: train_sampler = DistributedSampler(dataset, rank=self.trainer.proc_rank) batch_size = batch_size // self.trainer.world_size # scale batch size - except Exception as e: + except Exception: pass should_shuffle = train_sampler is None @@ -211,7 +215,7 @@ def test_dataloader(self): return self.__dataloader(train=False) @staticmethod - def add_model_specific_args(parent_parser, root_dir): # pragma: no cover + def add_model_specific_args(parent_parser, root_dir): # pragma: no cover """ Parameters you define here will be available to your model through self.hparams :param parent_parser: @@ -224,20 +228,25 @@ def add_model_specific_args(parent_parser, root_dir): # pragma: no cover # parser.set_defaults(gradient_clip=5.0) # network params - parser.add_argument('--in_features', default=28*28, type=int) + parser.add_argument('--in_features', default=28 * 28, type=int) 
parser.add_argument('--out_features', default=10, type=int) - parser.add_argument('--hidden_dim', default=50000, type=int) # use 500 for CPU, 50000 for GPU to see speed difference + # use 500 for CPU, 50000 for GPU to see speed difference + parser.add_argument('--hidden_dim', default=50000, type=int) parser.opt_list('--drop_prob', default=0.2, options=[0.2, 0.5], type=float, tunable=False) # data parser.add_argument('--data_root', default=os.path.join(root_dir, 'mnist'), type=str) # training params (opt) - parser.opt_list('--learning_rate', default=0.001*8, type=float, options=[0.0001, 0.0005, 0.001, 0.005], + parser.opt_list('--learning_rate', default=0.001 * 8, type=float, + options=[0.0001, 0.0005, 0.001, 0.005], tunable=False) - parser.opt_list('--optimizer_name', default='adam', type=str, options=['adam'], tunable=False) - - # if using 2 nodes with 4 gpus each the batch size here (256) will be 256 / (2*8) = 16 per gpu - parser.opt_list('--batch_size', default=256*8, type=int, options=[32, 64, 128, 256], tunable=False, - help='batch size will be divided over all the gpus being used across all nodes') + parser.opt_list('--optimizer_name', default='adam', type=str, + options=['adam'], tunable=False) + + # if using 2 nodes with 4 gpus each the batch size here + # (256) will be 256 / (2*8) = 16 per gpu + parser.opt_list('--batch_size', default=256 * 8, type=int, + options=[32, 64, 128, 256], tunable=False, + help='batch size will be divided over all gpus being used across all nodes') return parser diff --git a/examples/new_project_templates/multi_node_cluster_template.py b/examples/new_project_templates/multi_node_cluster_template.py index 5f6914d107ae0..c4af6d416b20a 100644 --- a/examples/new_project_templates/multi_node_cluster_template.py +++ b/examples/new_project_templates/multi_node_cluster_template.py @@ -10,12 +10,12 @@ from pytorch_lightning.models.trainer import Trainer from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint +from examples.new_project_templates.lightning_module_template import LightningTemplateModel + SEED = 2334 torch.manual_seed(SEED) np.random.seed(SEED) -from .lightning_module_template import LightningTemplateModel - def main_local(hparams): main(hparams, None, None) @@ -112,8 +112,10 @@ def optimize_on_cluster(hyperparams): cluster.add_command('source activate lightning') # run only on 32GB voltas - cluster.add_slurm_cmd(cmd='constraint', value='volta32gb', comment='use 32gb gpus') - cluster.add_slurm_cmd(cmd='partition', value=hyperparams.gpu_partition, comment='use 32gb gpus') + cluster.add_slurm_cmd(cmd='constraint', value='volta32gb', + comment='use 32gb gpus') + cluster.add_slurm_cmd(cmd='partition', value=hyperparams.gpu_partition, + comment='use 32gb gpus') # run hopt # creates and submits jobs to slurm @@ -140,15 +142,23 @@ def optimize_on_cluster(hyperparams): parent_parser.add_argument('--gpu_partition', type=str, help='consult your cluster manual') # TODO: make 1 param - parent_parser.add_argument('--per_experiment_nb_gpus', type=int, help='how many gpus to use in a node') - parent_parser.add_argument('--gpus', type=str, default='-1', help='how many gpus to use in the node') - - parent_parser.add_argument('--nb_gpu_nodes', type=int, default=1, help='how many nodes to use in a cluster') - parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, help='where to save logs') - parent_parser.add_argument('--slurm_log_path', type=str, default=slurm_out_dir, help='where to save slurm meta') - 
parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, help='where to save model') - parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', help='test tube exp name') - parent_parser.add_argument('--nb_hopt_trials', type=int, default=1, help='how many grid search trials to run') + parent_parser.add_argument('--per_experiment_nb_gpus', type=int, + help='how many gpus to use in a node') + parent_parser.add_argument('--gpus', type=str, default='-1', + help='how many gpus to use in the node') + + parent_parser.add_argument('--nb_gpu_nodes', type=int, default=1, + help='how many nodes to use in a cluster') + parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, + help='where to save logs') + parent_parser.add_argument('--slurm_log_path', type=str, default=slurm_out_dir, + help='where to save slurm meta') + parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, + help='where to save model') + parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', + help='test tube exp name') + parent_parser.add_argument('--nb_hopt_trials', type=int, default=1, + help='how many grid search trials to run') # allow model to overwrite or extend args parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir) diff --git a/examples/new_project_templates/single_cpu_template.py b/examples/new_project_templates/single_cpu_template.py index 29c255981627a..c0f4826f99634 100644 --- a/examples/new_project_templates/single_cpu_template.py +++ b/examples/new_project_templates/single_cpu_template.py @@ -9,12 +9,12 @@ from pytorch_lightning.models.trainer import Trainer from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint +from examples.new_project_templates.lightning_module_template import LightningTemplateModel + SEED = 2334 torch.manual_seed(SEED) np.random.seed(SEED) -from .lightning_module_template import LightningTemplateModel - def main(hparams): """ @@ -90,9 +90,12 @@ def main(hparams): parent_parser = HyperOptArgumentParser(strategy='grid_search', add_help=False) # gpu args - parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, help='where to save logs') - parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, help='where to save model') - parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', help='test tube exp name') + parent_parser.add_argument('--test_tube_save_path', type=str, + default=test_tube_dir, help='where to save logs') + parent_parser.add_argument('--model_save_path', type=str, + default=checkpoint_dir, help='where to save model') + parent_parser.add_argument('--experiment_name', type=str, + default='pt_lightning_exp_a', help='test tube exp name') # allow model to overwrite or extend args parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir) @@ -102,5 +105,5 @@ def main(hparams): # RUN TRAINING # --------------------- # run on HPC cluster - print(f'RUNNING ON CPU') + print('RUNNING ON CPU') main(hyperparams) diff --git a/examples/new_project_templates/single_gpu_node_16bit_template.py b/examples/new_project_templates/single_gpu_node_16bit_template.py index 14db484e56409..babf18e73a2a4 100644 --- a/examples/new_project_templates/single_gpu_node_16bit_template.py +++ b/examples/new_project_templates/single_gpu_node_16bit_template.py @@ -9,12 +9,12 @@ from pytorch_lightning.models.trainer import Trainer 
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint +from examples.new_project_templates.lightning_module_template import LightningTemplateModel + SEED = 2334 torch.manual_seed(SEED) np.random.seed(SEED) -from .lightning_module_template import LightningTemplateModel - def main(hparams): """ @@ -92,10 +92,15 @@ def main(hparams): parent_parser = HyperOptArgumentParser(strategy='grid_search', add_help=False) # gpu args - parent_parser.add_argument('--gpus', type=str, default='-1', help='how many gpus to use in the node. -1 uses all the gpus on the node') - parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, help='where to save logs') - parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, help='where to save model') - parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', help='test tube exp name') + parent_parser.add_argument('--gpus', type=str, default='-1', + help='how many gpus to use in the node.' + 'value -1 uses all the gpus on the node') + parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, + help='where to save logs') + parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, + help='where to save model') + parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', + help='test tube exp name') # allow model to overwrite or extend args parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir) @@ -105,5 +110,5 @@ def main(hparams): # RUN TRAINING # --------------------- # run on HPC cluster - print(f'RUNNING INTERACTIVE MODE ON GPUS. gpu ids: {hyperparams.gpus}') + print('RUNNING INTERACTIVE MODE ON GPUS. gpu ids: %i' % hyperparams.gpus) main(hyperparams) diff --git a/examples/new_project_templates/single_gpu_node_ddp_template.py b/examples/new_project_templates/single_gpu_node_ddp_template.py index 56e301b20478a..68a332aeb8b5a 100644 --- a/examples/new_project_templates/single_gpu_node_ddp_template.py +++ b/examples/new_project_templates/single_gpu_node_ddp_template.py @@ -9,12 +9,12 @@ from pytorch_lightning.models.trainer import Trainer from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint +from examples.new_project_templates.lightning_module_template import LightningTemplateModel + SEED = 2334 torch.manual_seed(SEED) np.random.seed(SEED) -from .lightning_module_template import LightningTemplateModel - def main(hparams): """ @@ -92,10 +92,15 @@ def main(hparams): parent_parser = HyperOptArgumentParser(strategy='grid_search', add_help=False) # gpu args - parent_parser.add_argument('--gpus', type=str, default='-1', help='how many gpus to use in the node. -1 uses all the gpus on the node') - parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, help='where to save logs') - parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, help='where to save model') - parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', help='test tube exp name') + parent_parser.add_argument('--gpus', type=str, default='-1', + help='how many gpus to use in the node.' 
+ ' value -1 uses all the gpus on the node') + parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, + help='where to save logs') + parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, + help='where to save model') + parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', + help='test tube exp name') # allow model to overwrite or extend args parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir) @@ -105,5 +110,5 @@ def main(hparams): # RUN TRAINING # --------------------- # run on HPC cluster - print(f'RUNNING INTERACTIVE MODE ON GPUS. gpu ids: {hyperparams.gpus}') + print('RUNNING INTERACTIVE MODE ON GPUS. gpu ids: %i' % hyperparams.gpus) main(hyperparams) diff --git a/examples/new_project_templates/single_gpu_node_dp_template.py b/examples/new_project_templates/single_gpu_node_dp_template.py index 9d6992533b680..d752713ee1df4 100644 --- a/examples/new_project_templates/single_gpu_node_dp_template.py +++ b/examples/new_project_templates/single_gpu_node_dp_template.py @@ -9,12 +9,12 @@ from pytorch_lightning.models.trainer import Trainer from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint +from examples.new_project_templates.lightning_module_template import LightningTemplateModel + SEED = 2334 torch.manual_seed(SEED) np.random.seed(SEED) -from .lightning_module_template import LightningTemplateModel - def main(hparams): """ @@ -91,10 +91,15 @@ def main(hparams): parent_parser = HyperOptArgumentParser(strategy='grid_search', add_help=False) # gpu args - parent_parser.add_argument('--gpus', type=str, default='-1', help='how many gpus to use in the node. -1 uses all the gpus on the node') - parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, help='where to save logs') - parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, help='where to save model') - parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', help='test tube exp name') + parent_parser.add_argument('--gpus', type=str, default='-1', + help='how many gpus to use in the node.' + ' value -1 uses all the gpus on the node') + parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir, + help='where to save logs') + parent_parser.add_argument('--model_save_path', type=str, default=checkpoint_dir, + help='where to save model') + parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a', + help='test tube exp name') # allow model to overwrite or extend args parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir) @@ -104,5 +109,5 @@ def main(hparams): # RUN TRAINING # --------------------- # run on HPC cluster - print(f'RUNNING INTERACTIVE MODE ON GPUS. gpu ids: {hyperparams.gpus}') + print('RUNNING INTERACTIVE MODE ON GPUS. 
gpu ids: %i' % hyperparams.gpus) main(hyperparams) diff --git a/examples/new_project_templates/trainer_cpu_template.py b/examples/new_project_templates/trainer_cpu_template.py index de6ba7c424571..84a29a9bf3627 100644 --- a/examples/new_project_templates/trainer_cpu_template.py +++ b/examples/new_project_templates/trainer_cpu_template.py @@ -6,7 +6,7 @@ from pytorch_lightning.utilities.arg_parse import add_default_args from pytorch_lightning.callbacks.pt_callbacks import EarlyStopping, ModelCheckpoint -from .lightning_module_template import LightningTemplateModel +from examples.new_project_templates.lightning_module_template import LightningTemplateModel def main(hparams): @@ -67,7 +67,7 @@ def main(hparams): add_default_args(parent_parser, root_dir) # allow model to overwrite or extend args - parser = ExampleModel.add_model_specific_args(parent_parser) + parser = LightningTemplateModel.add_model_specific_args(parent_parser) hyperparams = parser.parse_args() # train model diff --git a/pytorch_lightning/__init__.py b/pytorch_lightning/__init__.py index 4b5961ac2daf4..73067d63858f9 100644 --- a/pytorch_lightning/__init__.py +++ b/pytorch_lightning/__init__.py @@ -1,3 +1,19 @@ from .models.trainer import Trainer from .root_module.root_module import LightningModule -from .root_module.decorators import data_loader \ No newline at end of file +from .root_module.decorators import data_loader + +__version__ = '0.3.6.9' +__author__ = "William Falcon", +__author_email__ = "waf2107@columbia.edu" +__license__ = 'Apache-2' +__homepage__ = 'https://github.com/williamFalcon/pytorch-lightning', +__copyright__ = 'Copyright (c) 2018-2019, %s.' % __author__ +__doc__ = """ +The Keras for ML researchers using PyTorch +""" + +__all__ = [ + 'Trainer', + 'LightningModule', + 'data_loader', +] diff --git a/pytorch_lightning/callbacks/__init__.py b/pytorch_lightning/callbacks/__init__.py index f180c2542490f..035deb0681d7d 100644 --- a/pytorch_lightning/callbacks/__init__.py +++ b/pytorch_lightning/callbacks/__init__.py @@ -1 +1,6 @@ -from .pt_callbacks import EarlyStopping, ModelCheckpoint \ No newline at end of file +from .pt_callbacks import EarlyStopping, ModelCheckpoint + +__all__ = [ + 'EarlyStopping', + 'ModelCheckpoint', +] diff --git a/pytorch_lightning/callbacks/pt_callbacks.py b/pytorch_lightning/callbacks/pt_callbacks.py index 89c8f6b2e5f8f..f07c6a28e33c3 100644 --- a/pytorch_lightning/callbacks/pt_callbacks.py +++ b/pytorch_lightning/callbacks/pt_callbacks.py @@ -122,9 +122,9 @@ def on_epoch_end(self, epoch, logs=None): current = logs.get(self.monitor) stop_training = False if current is None: - print('Early stopping conditioned on metric `%s` ''which is not available. Available metrics are: %s' % - (self.monitor, ','.join(list(logs.keys()))), RuntimeWarning - ) + print('Early stopping conditioned on metric `%s` ' + 'which is not available. Available metrics are: %s' % + (self.monitor, ','.join(list(logs.keys()))), RuntimeWarning) exit(-1) if self.monitor_op(current - self.min_delta, self.best): @@ -188,8 +188,7 @@ def __init__(self, filepath, monitor='val_loss', verbose=0, if mode not in ['auto', 'min', 'max']: print('ModelCheckpoint mode %s is unknown, ' - 'fallback to auto mode.' % (mode), - RuntimeWarning) + 'fallback to auto mode.' % (mode), RuntimeWarning) mode = 'auto' if mode == 'min': @@ -233,8 +232,8 @@ def on_epoch_end(self, epoch, logs=None): if self.save_best_only: current = logs.get(self.monitor) if current is None: - print('Can save best model only with %s available, ' - 'skipping.' 
% (self.monitor), RuntimeWarning) + print('Can save best model only with %s available,' + ' skipping.' % (self.monitor), RuntimeWarning) else: if self.monitor_op(current, self.best): if self.verbose > 0: diff --git a/pytorch_lightning/models/__init__.py b/pytorch_lightning/models/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pytorch_lightning/models/trainer.py b/pytorch_lightning/models/trainer.py index 2655809c4a7de..574c8061c55d9 100644 --- a/pytorch_lightning/models/trainer.py +++ b/pytorch_lightning/models/trainer.py @@ -1,12 +1,10 @@ """ The trainer handles all the logic for running a val loop, training loop, distributing, etc... """ -import subprocess -import traceback -import warnings + import os -import pdb import re +import warnings import numpy as np import tqdm @@ -17,13 +15,14 @@ from ..root_module.memory import get_gpu_memory_map from ..root_module.model_saving import TrainerIO -from ..pt_overrides.override_data_parallel import LightningDistributedDataParallel, LightningDataParallel +from ..pt_overrides.override_data_parallel import ( + LightningDistributedDataParallel, LightningDataParallel) from ..utilities.debugging import MisconfigurationException try: from apex import amp APEX_AVAILABLE = True -except Exception: +except ImportError: APEX_AVAILABLE = False @@ -66,17 +65,20 @@ def __init__(self, check_val_every_n_epoch=1, fast_dev_run=False, accumulate_grad_batches=1, - max_nb_epochs=1000, min_nb_epochs=1, - train_percent_check=1.0, val_percent_check=1.0, test_percent_check=1.0, + max_nb_epochs=1000, + min_nb_epochs=1, + train_percent_check=1.0, + val_percent_check=1.0, + test_percent_check=1.0, val_check_interval=0.95, - log_save_interval=100, add_log_row_interval=10, + log_save_interval=100, + add_log_row_interval=10, distributed_backend='dp', use_amp=False, print_nan_grads=False, print_weights_summary=True, amp_level='O2', nb_sanity_val_steps=5): - """ :param experiment: Test-tube experiment @@ -102,16 +104,15 @@ def __init__(self, :param val_check_interval: :param log_save_interval: :param add_log_row_interval: - :param distributed_backend: 'np' to use DistributedParallel, 'ddp' to use DistributedDataParallel + :param distributed_backend: + 'np' to use DistributedParallel, 'dp' to use DistributedDataParallel :param use_amp: :param print_nan_grads: :param print_weights_summary: :param amp_level: :param nb_sanity_val_steps: """ - # Transfer params - self.nb_gpu_nodes = nb_gpu_nodes self.gradient_clip = gradient_clip self.check_val_every_n_epoch = check_val_every_n_epoch @@ -173,13 +174,14 @@ def __init__(self, # set the correct cuda visible devices (using pci order) os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - os.environ["CUDA_VISIBLE_DEVICES"] = ','.join([str(x) for x in self.data_parallel_device_ids]) - print(f'VISIBLE GPUS: {os.environ["CUDA_VISIBLE_DEVICES"]}') + os.environ["CUDA_VISIBLE_DEVICES"] = ','.join([str(x) for x in + self.data_parallel_device_ids]) + print('VISIBLE GPUS: %r' % os.environ["CUDA_VISIBLE_DEVICES"]) # make DP and DDP mutually exclusive # single GPU will also use DP with devices=[0] - have_gpus = self.data_parallel_device_ids is not None and len(self.data_parallel_device_ids) > 0 - if have_gpus: + requested_gpus = self.data_parallel_device_ids is not None + if requested_gpus and len(self.data_parallel_device_ids) > 0: self.use_dp = distributed_backend == 'dp' self.use_ddp = distributed_backend == 'ddp' @@ -201,7 +203,7 @@ def __init__(self, try: self.nb_slurm_tasks = int(os.environ['SLURM_NTASKS']) 
self.is_slurm_managing_tasks = self.nb_slurm_tasks == self.nb_requested_gpus - except Exception as e: + except Exception: # likely not on slurm, so set the slurm managed flag to false self.is_slurm_managing_tasks = False @@ -226,7 +228,8 @@ def __init__(self, self.val_dataloader = None # how much of the data to use - self.__determine_data_use_amount(train_percent_check, val_percent_check, test_percent_check, overfit_pct) + self.__determine_data_use_amount(train_percent_check, val_percent_check, + test_percent_check, overfit_pct) print('gpu available: {}, used: {}'.format(torch.cuda.is_available(), self.on_gpu)) # 16 bit mixed precision training using apex @@ -235,20 +238,21 @@ def __init__(self, print('using 16bit precision') if use_amp and not APEX_AVAILABLE: # pragma: no cover - msg = ''' + msg = """ You set use_amp=True but do not have apex installed. - Install apex first using this guide and rerun with use_amp=True: + Install apex first using this guide and rerun with use_amp=True: https://github.com/NVIDIA/apex#linux - + this run will NOT use 16 bit precision - ''' + """ raise ModuleNotFoundError(msg) @property def data_parallel(self): return self.use_dp or self.use_ddp - def __determine_data_use_amount(self, train_percent_check, val_percent_check, test_percent_check, overfit_pct): + def __determine_data_use_amount(self, train_percent_check, val_percent_check, + test_percent_check, overfit_pct): """ Use less data for debugging purposes """ @@ -275,7 +279,7 @@ def __tng_tqdm_dic(self): 'tng_loss': '{0:.3f}'.format(self.avg_loss), 'v_nb': '{}'.format(self.experiment.version), 'epoch': '{}'.format(self.current_epoch), - 'batch_nb':'{}'.format(self.batch_nb), + 'batch_nb': '{}'.format(self.batch_nb), } tqdm_dic.update(self.tqdm_metrics) @@ -389,18 +393,19 @@ def get_dataloaders(self, model): self.val_dataloader = model.val_dataloader if self.use_ddp and not isinstance(self.tng_dataloader.sampler, DistributedSampler): - msg = ''' - when using multiple gpus and multiple nodes you must pass a DistributedSampler to DataLoader(sampler). - - ie: this: - dataset = myDataset() - dataloader = Dataloader(dataset) - - becomes: - dataset = myDataset() - dist_sampler = torch.utils.data.distributed.DistributedSampler(dataset) - dataloader = Dataloader(dataset, sampler=dist_sampler) - ''' + msg = """ +when using multiple gpus and multiple nodes you must pass + a DistributedSampler to DataLoader(sampler). + +ie: this: +dataset = myDataset() +dataloader = Dataloader(dataset) + +becomes: +dataset = myDataset() +dist_sampler = torch.utils.data.distributed.DistributedSampler(dataset) +dataloader = Dataloader(dataset, sampler=dist_sampler) +""" raise MisconfigurationException(msg) # ----------------------------- @@ -410,19 +415,20 @@ def fit(self, model): # when using multi-node or DDP within a node start each module in a separate process if self.use_ddp: - # must copy only the meta of the exp so it survives pickle/unpickle when going to new process + # must copy only the meta of the exp so it survives pickle/unpickle + # when going to new process self.experiment = self.experiment.get_meta_copy() if self.is_slurm_managing_tasks: task = int(os.environ['SLURM_LOCALID']) self.ddp_train(task, model) else: - msg = f""" - You requested {self.nb_requested_gpus} GPUs but launched {self.nb_slurm_tasks} slurm tasks. - We will launch {self.nb_requested_gpus} processes for you. 
- We recommend you let slurm manage the processes by setting: --ntasks-per-node={self.nb_requested_gpus} - If you're not using SLURM, ignore this message! - """ + msg = """ +You requested %(nb_gpus)s GPUs but launched %(nb_tasks)s slurm tasks. +We will launch %(nb_gpus)s processes for you. +We recommend you let slurm manage the processes by setting: --ntasks-per-node=%(nb_gpus)s +If you're not using SLURM, ignore this message! +""" % {'nb_gpus': self.nb_requested_gpus, 'nb_tasks': self.nb_slurm_tasks} warnings.warn(msg) mp.spawn(self.ddp_train, nprocs=len(self.data_parallel_device_ids), args=(model, )) @@ -435,7 +441,8 @@ def fit(self, model): else: # run through amp wrapper if self.use_amp: - raise MisconfigurationException('amp + cpu is not supported. Please use a GPU option') + raise MisconfigurationException('amp + cpu is not supported.' + ' Please use a GPU option') # CHOOSE OPTIMIZER # allow for lr schedulers as well @@ -462,9 +469,11 @@ def __dp_train(self, model): # check for this bug (amp + dp + !01 doesn't work) # https://github.com/NVIDIA/apex/issues/227 if self.use_dp and self.use_amp: - m = f'amp level {self.amp_level} with DataParallel is not supported. ' \ - f'See this note from NVIDIA for more info: https://github.com/NVIDIA/apex/issues/227. ' \ - f'We recommend you switch to ddp if you want to use amp' + m = """ +Amp level %r with DataParallel is not supported. +See this note from NVIDIA for more info: https://github.com/NVIDIA/apex/issues/227. +We recommend you switch to ddp if you want to use amp +""" % self.amp_level raise MisconfigurationException(m) model = LightningDataParallel(model, device_ids=self.data_parallel_device_ids) @@ -484,7 +493,7 @@ def ddp_train(self, gpu_nb, model): try: node_id = os.environ['SLURM_NODEID'] self.node_rank = int(node_id) - except Exception as e: + except Exception: self.node_rank = 0 # recover original exp before went into process @@ -527,7 +536,8 @@ def ddp_train(self, gpu_nb, model): ) self.optimizers = optimizers - model = LightningDistributedDataParallel(model, device_ids=[gpu_nb], find_unused_parameters=True) + model = LightningDistributedDataParallel(model, device_ids=[gpu_nb], + find_unused_parameters=True) # continue training routine self.__run_pretrain_routine(model) @@ -543,14 +553,14 @@ def __init_tcp_connection(self): # sets the appropriate port try: port = os.environ['MASTER_PORT'] - except Exception as e: + except Exception: port = 12910 - os.environ['MASTER_PORT'] = f'{port}' + os.environ['MASTER_PORT'] = str(port) # figure out the root node addr try: root_node = os.environ['SLURM_NODELIST'].split(' ')[0] - except Exception as e: + except Exception: root_node = '127.0.0.2' root_node = self.resolve_root_node_address(root_node) @@ -642,7 +652,8 @@ def __train(self): # init progbar when requested if self.progress_bar: - self.prog_bar = tqdm.tqdm(range(self.total_batches), position=self.process_position) + self.prog_bar = tqdm.tqdm(range(self.total_batches), + position=self.process_position) for batch_nb, data_batch in enumerate(self.tng_dataloader): self.batch_nb = batch_nb @@ -651,7 +662,8 @@ def __train(self): model = self.__get_model() model.global_step = self.global_step - # stop when the flag is changed or we've gone past the amount requested in the batches + # stop when the flag is changed or we've gone past the amount + # requested in the batches self.total_batch_nb += 1 met_batch_limit = batch_nb > self.nb_tng_batches if met_batch_limit: @@ -698,7 +710,8 @@ def __train(self): model.on_tng_metrics(metrics) # log 
metrics - scalar_metrics = self.__metrics_to_scalars(metrics, blacklist=self.__log_vals_blacklist()) + scalar_metrics = self.__metrics_to_scalars( + metrics, blacklist=self.__log_vals_blacklist()) if self.proc_rank == 0: self.experiment.log(scalar_metrics, global_step=self.global_step) self.experiment.save() @@ -720,7 +733,8 @@ def __train(self): # early stopping met_min_epochs = epoch_nb > self.min_nb_epochs if self.enable_early_stop and met_min_epochs: - should_stop = self.early_stop_callback.on_epoch_end(epoch=epoch_nb, logs=self.__tng_tqdm_dic) + should_stop = self.early_stop_callback.on_epoch_end(epoch=epoch_nb, + logs=self.__tng_tqdm_dic) # stop training stop = should_stop and met_min_epochs @@ -773,14 +787,14 @@ def __run_tng_batch(self, data_batch, batch_nb): try: model_specific_tqdm_metrics_dic = output['prog'] - except Exception as e: + except Exception: model_specific_tqdm_metrics_dic = {} # if output dict doesn't have the keyword loss # then assume the output=loss if scalar try: loss = output['loss'] - except Exception as e: + except Exception: if type(output) is torch.Tensor: loss = output @@ -828,7 +842,8 @@ def __run_tng_batch(self, data_batch, batch_nb): # clear gradients optimizer.zero_grad() - # queuing loss across batches blows it up proportionally... divide out the number accumulated + # queuing loss across batches blows it up proportionally... + # divide out the number accumulated self.batch_loss_value = self.batch_loss_value / self.accumulate_grad_batches # track loss @@ -885,4 +900,5 @@ def __run_validation(self): # model checkpointing if self.proc_rank == 0 and self.checkpoint_callback is not None: print('save callback...') - self.checkpoint_callback.on_epoch_end(epoch=self.current_epoch, logs=self.__tng_tqdm_dic) + self.checkpoint_callback.on_epoch_end(epoch=self.current_epoch, + logs=self.__tng_tqdm_dic) diff --git a/pytorch_lightning/pt_overrides/override_data_parallel.py b/pytorch_lightning/pt_overrides/override_data_parallel.py index f06f8030ad500..ab88d286bb73b 100644 --- a/pytorch_lightning/pt_overrides/override_data_parallel.py +++ b/pytorch_lightning/pt_overrides/override_data_parallel.py @@ -63,7 +63,6 @@ def forward(self, *inputs, **kwargs): outputs = self.parallel_apply(replicas, inputs, kwargs) return self.gather(outputs, self.output_device) - def parallel_apply(self, replicas, inputs, kwargs): return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)]) diff --git a/pytorch_lightning/root_module/grads.py b/pytorch_lightning/root_module/grads.py index d50fa450961a2..7bdc8572b97fb 100644 --- a/pytorch_lightning/root_module/grads.py +++ b/pytorch_lightning/root_module/grads.py @@ -4,6 +4,7 @@ from torch import nn + class GradInformation(nn.Module): def grad_norm(self, norm_type): @@ -16,12 +17,13 @@ def grad_norm(self, norm_type): total_norm += param_norm ** norm_type norm = param_norm ** (1 / norm_type) - results['grad_{}_norm_{}'.format(norm_type, i)] = round(norm.data.cpu().numpy().flatten()[0], 3) - except Exception as e: + grad = round(norm.data.cpu().numpy().flatten()[0], 3) + results['grad_{}_norm_{}'.format(norm_type, i)] = grad + except Exception: # this param had no grad pass total_norm = total_norm ** (1. 
/ norm_type) - results['grad_{}_norm_total'.format(norm_type)] = round(total_norm.data.cpu().numpy().flatten()[0], 3) + grad = round(total_norm.data.cpu().numpy().flatten()[0], 3) + results['grad_{}_norm_total'.format(norm_type)] = grad return results - diff --git a/pytorch_lightning/root_module/hooks.py b/pytorch_lightning/root_module/hooks.py index 849826a83eabb..00ece234824e5 100644 --- a/pytorch_lightning/root_module/hooks.py +++ b/pytorch_lightning/root_module/hooks.py @@ -43,4 +43,3 @@ def on_after_backward(self): :return: """ pass - diff --git a/pytorch_lightning/root_module/memory.py b/pytorch_lightning/root_module/memory.py index 128fb16f18b43..3a636b040a270 100644 --- a/pytorch_lightning/root_module/memory.py +++ b/pytorch_lightning/root_module/memory.py @@ -94,7 +94,7 @@ def get_parameter_sizes(self): mods = list(self.model.modules()) sizes = [] - for i in range(1,len(mods)): + for i in range(1, len(mods)): m = mods[i] p = list(m.parameters()) modsz = [] @@ -127,7 +127,7 @@ def make_summary(self): if self.model.example_input_array is not None: cols.extend(['In_sizes', 'Out_sizes']) - df = pd.DataFrame(np.zeros( (len(self.layer_names), len(cols)))) + df = pd.DataFrame(np.zeros((len(self.layer_names), len(cols)))) df.columns = cols df['Name'] = self.layer_names @@ -152,16 +152,16 @@ def summarize(self): self.make_summary() -def print_mem_stack(): # pragma: no cover +def print_mem_stack(): # pragma: no cover for obj in gc.get_objects(): try: if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)): print(type(obj), obj.size()) - except Exception as e: + except Exception: pass -def count_mem_items(): # pragma: no cover +def count_mem_items(): # pragma: no cover nb_params = 0 nb_tensors = 0 for obj in gc.get_objects(): @@ -172,7 +172,7 @@ def count_mem_items(): # pragma: no cover nb_params += 1 else: nb_tensors += 1 - except Exception as e: + except Exception: pass return nb_params, nb_tensors @@ -196,6 +196,6 @@ def get_gpu_memory_map(): gpu_memory = [int(x) for x in result.strip().split('\n')] gpu_memory_map = {} for k, v in zip(range(len(gpu_memory)), gpu_memory): - k = f'gpu_{k}' + k = 'gpu_%i' % k gpu_memory_map[k] = v return gpu_memory_map diff --git a/pytorch_lightning/root_module/model_saving.py b/pytorch_lightning/root_module/model_saving.py index 0bde0943f45a8..0765142cd537f 100644 --- a/pytorch_lightning/root_module/model_saving.py +++ b/pytorch_lightning/root_module/model_saving.py @@ -3,7 +3,8 @@ import torch -from ..pt_overrides.override_data_parallel import LightningDistributedDataParallel, LightningDataParallel +from ..pt_overrides.override_data_parallel import ( + LightningDistributedDataParallel, LightningDataParallel) class ModelIO(object): @@ -45,7 +46,8 @@ def on_hpc_load(self, checkpoint): class TrainerIO(object): def __get_model(self): - is_dp_module = type(self.model) is LightningDistributedDataParallel or type(self.model) is LightningDataParallel + is_dp_module = isinstance(self.model, (LightningDistributedDataParallel, + LightningDataParallel)) model = self.model.module if is_dp_module else self.model return model diff --git a/pytorch_lightning/root_module/root_module.py b/pytorch_lightning/root_module/root_module.py index 1421a72d5b357..96dbfdcd442a0 100644 --- a/pytorch_lightning/root_module/root_module.py +++ b/pytorch_lightning/root_module/root_module.py @@ -129,6 +129,3 @@ def freeze(self): def unfreeze(self): for param in self.parameters(): param.requires_grad = True - - - diff --git a/pytorch_lightning/testing/lm_test_module.py 
b/pytorch_lightning/testing/lm_test_module.py index 9861810e8a7f8..61ecf874f337d 100644 --- a/pytorch_lightning/testing/lm_test_module.py +++ b/pytorch_lightning/testing/lm_test_module.py @@ -48,11 +48,13 @@ def __build_model(self): Layout model :return: """ - self.c_d1 = nn.Linear(in_features=self.hparams.in_features, out_features=self.hparams.hidden_dim) + self.c_d1 = nn.Linear(in_features=self.hparams.in_features, + out_features=self.hparams.hidden_dim) self.c_d1_bn = nn.BatchNorm1d(self.hparams.hidden_dim) self.c_d1_drop = nn.Dropout(self.hparams.drop_prob) - self.c_d2 = nn.Linear(in_features=self.hparams.hidden_dim, out_features=self.hparams.out_features) + self.c_d2 = nn.Linear(in_features=self.hparams.hidden_dim, + out_features=self.hparams.out_features) # --------------------- # TRAINING @@ -191,8 +193,10 @@ def configure_optimizers(self): def __dataloader(self, train): # init data generators - transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.5,), (1.0,))]) - dataset = MNIST(root=self.hparams.data_root, train=train, transform=transform, download=True) + transform = transforms.Compose([transforms.ToTensor(), + transforms.Normalize((0.5,), (1.0,))]) + dataset = MNIST(root=self.hparams.data_root, train=train, + transform=transform, download=True) # when using multi-node we need to add the datasampler train_sampler = None @@ -202,7 +206,7 @@ def __dataloader(self, train): if self.on_gpu and not self.force_remove_distributed_sampler: train_sampler = DistributedSampler(dataset, rank=self.trainer.proc_rank) batch_size = batch_size // self.trainer.world_size # scale batch size - except Exception as e: + except Exception: pass should_shuffle = train_sampler is None @@ -242,19 +246,24 @@ def add_model_specific_args(parent_parser, root_dir): # network params parser.opt_list('--drop_prob', default=0.2, options=[0.2, 0.5], type=float, tunable=False) - parser.add_argument('--in_features', default=28*28, type=int) + parser.add_argument('--in_features', default=28 * 28, type=int) parser.add_argument('--out_features', default=10, type=int) - parser.add_argument('--hidden_dim', default=50000, type=int) # use 500 for CPU, 50000 for GPU to see speed difference + # use 500 for CPU, 50000 for GPU to see speed difference + parser.add_argument('--hidden_dim', default=50000, type=int) # data parser.add_argument('--data_root', default=os.path.join(root_dir, 'mnist'), type=str) # training params (opt) - parser.opt_list('--learning_rate', default=0.001*8, type=float, options=[0.0001, 0.0005, 0.001, 0.005], + parser.opt_list('--learning_rate', default=0.001 * 8, type=float, + options=[0.0001, 0.0005, 0.001, 0.005], tunable=False) - parser.opt_list('--optimizer_name', default='adam', type=str, options=['adam'], tunable=False) - - # if using 2 nodes with 4 gpus each the batch size here (256) will be 256 / (2*8) = 16 per gpu - parser.opt_list('--batch_size', default=256*8, type=int, options=[32, 64, 128, 256], tunable=False, - help='batch size will be divided over all the gpus being used across all nodes') + parser.opt_list('--optimizer_name', default='adam', type=str, + options=['adam'], tunable=False) + + # if using 2 nodes with 4 gpus each the batch size here + # (256) will be 256 / (2*8) = 16 per gpu + parser.opt_list('--batch_size', default=256 * 8, type=int, + options=[32, 64, 128, 256], tunable=False, + help='batch size will be divided over all gpus being used across all nodes') return parser diff --git a/pytorch_lightning/utilities/arg_parse.py 
b/pytorch_lightning/utilities/arg_parse.py index f274e751c3b6d..39d4ec81f9486 100644 --- a/pytorch_lightning/utilities/arg_parse.py +++ b/pytorch_lightning/utilities/arg_parse.py @@ -3,32 +3,46 @@ Might need to update with the new flags """ +import os + + def add_default_args(parser, root_dir, rand_seed=None, possible_model_names=None): # tng, test, val check intervals - parser.add_argument('--eval_test_set', dest='eval_test_set', action='store_true', help='true = run test set also') - parser.add_argument('--check_val_every_n_epoch', default=1, type=int, help='check val every n epochs') + parser.add_argument('--eval_test_set', dest='eval_test_set', action='store_true', + help='true = run test set also') + parser.add_argument('--check_val_every_n_epoch', default=1, type=int, + help='check val every n epochs') parser.opt_list('--accumulate_grad_batches', default=1, type=int, tunable=False, - help='accumulates gradients k times before applying update. Simulates huge batch size') + help='accumulates gradients k times before applying update.' + ' Simulates huge batch size') parser.add_argument('--max_nb_epochs', default=200, type=int, help='cap epochs') parser.add_argument('--min_nb_epochs', default=2, type=int, help='min epochs') - parser.add_argument('--train_percent_check', default=1.0, type=float, help='how much of tng set to check') - parser.add_argument('--val_percent_check', default=1.0, type=float, help='how much of val set to check') - parser.add_argument('--test_percent_check', default=1.0, type=float, help='how much of test set to check') - - parser.add_argument('--val_check_interval', default=0.95, type=float, help='how much within 1 epoch to check val') - parser.add_argument('--log_save_interval', default=100, type=int, help='how many batches between log saves') - parser.add_argument('--add_log_row_interval', default=100, type=int, help='add log every k batches') + parser.add_argument('--train_percent_check', default=1.0, type=float, + help='how much of tng set to check') + parser.add_argument('--val_percent_check', default=1.0, type=float, + help='how much of val set to check') + parser.add_argument('--test_percent_check', default=1.0, type=float, + help='how much of test set to check') + + parser.add_argument('--val_check_interval', default=0.95, type=float, + help='how much within 1 epoch to check val') + parser.add_argument('--log_save_interval', default=100, type=int, + help='how many batches between log saves') + parser.add_argument('--add_log_row_interval', default=100, type=int, + help='add log every k batches') # early stopping parser.add_argument('--disable_early_stop', dest='enable_early_stop', action='store_false') parser.add_argument('--early_stop_metric', default='val_acc', type=str) parser.add_argument('--early_stop_mode', default='min', type=str) - parser.add_argument('--early_stop_patience', default=3, type=int, help='number of epochs until stop') + parser.add_argument('--early_stop_patience', default=3, type=int, + help='number of epochs until stop') # gradient handling parser.add_argument('--gradient_clip', default=-1, type=int) - parser.add_argument('--track_grad_norm', default=-1, type=int, help='if > 0, will track this grad norm') + parser.add_argument('--track_grad_norm', default=-1, type=int, + help='if > 0, will track this grad norm') # model saving parser.add_argument('--model_save_path', default=root_dir + '/model_weights') @@ -44,7 +58,8 @@ def add_default_args(parser, root_dir, rand_seed=None, possible_model_names=None # test_tube settings 
parser.add_argument('-en', '--tt_name', default='pt_test') parser.add_argument('-td', '--tt_description', default='pytorch lightning test') - parser.add_argument('--tt_save_path', default=root_dir + '/test_tube_logs', help='logging dir') + parser.add_argument('--tt_save_path', default=os.path.join(root_dir, 'test_tube_logs'), + help='logging dir') parser.add_argument('--enable_single_run', dest='single_run', action='store_true') parser.add_argument('--nb_hopt_trials', default=1, type=int) parser.add_argument('--log_stdout', dest='log_stdout', action='store_true') @@ -55,25 +70,30 @@ def add_default_args(parser, root_dir, rand_seed=None, possible_model_names=None parser.add_argument('--default_tensor_type', default='torch.cuda.FloatTensor', type=str) parser.add_argument('--use_amp', dest='use_amp', action='store_true') parser.add_argument('--check_grad_nans', dest='check_grad_nans', action='store_true') - parser.add_argument('--amp_level', default='O2',type=str) - + parser.add_argument('--amp_level', default='O2', type=str) # run on hpc parser.add_argument('--on_cluster', dest='on_cluster', action='store_true') # FAST training # use these settings to make sure network has no bugs without running a full dataset - parser.add_argument('--fast_dev_run', dest='fast_dev_run', default=False, action='store_true', help='runs validation after 1 tng step') - parser.add_argument('--enable_tqdm', dest='enable_tqdm', default=False, action='store_true', help='false removes the prog bar') - parser.add_argument('--overfit', default=-1, type=float, help='% of dataset to use with this option. float, or -1 for none') + parser.add_argument('--fast_dev_run', dest='fast_dev_run', default=False, action='store_true', + help='runs validation after 1 tng step') + parser.add_argument('--enable_tqdm', dest='enable_tqdm', default=False, action='store_true', + help='false removes the prog bar') + parser.add_argument('--overfit', default=-1, type=float, + help='% of dataset to use with this option. 
float, or -1 for none') # debug args if rand_seed is not None: parser.add_argument('--random_seed', default=rand_seed, type=int) - parser.add_argument('--interactive', dest='interactive', action='store_true', help='runs on gpu without cluster') - parser.add_argument('--debug', dest='debug', action='store_true', help='enables/disables test tube') - parser.add_argument('--local', dest='local', action='store_true', help='enables local tng') + parser.add_argument('--interactive', dest='interactive', action='store_true', + help='runs on gpu without cluster') + parser.add_argument('--debug', dest='debug', action='store_true', + help='enables/disables test tube') + parser.add_argument('--local', dest='local', action='store_true', + help='enables local tng') # optimizer parser.add_argument('--lr_scheduler_milestones', default=None, type=str) diff --git a/requirements.txt b/requirements.txt index aba573c0c6085..86c02573bf48e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,5 @@ tqdm==4.32.1 twine==1.13.0 numpy==1.16.4 torch>=1.1.0 -torchvision==0.3.0 +torchvision>=0.3.0 +pandas \ No newline at end of file diff --git a/setup.py b/setup.py index e680ef4301207..a8f0d370834cc 100755 --- a/setup.py +++ b/setup.py @@ -1,29 +1,55 @@ #!/usr/bin/env python -from setuptools import setup, find_packages +from setuptools import setup + +import pytorch_lightning # https://packaging.python.org/guides/single-sourcing-package-version/ # http://blog.ionelmc.ro/2014/05/25/python-packaging/ setup( name="pytorch-lightning", - version='0.3.6.9', - description="The Keras for ML researchers using PyTorch", - author="William Falcon", - author_email="waf2107@columbia.edu", - url="https://github.com/williamFalcon/pytorch-lightning", - download_url="https://github.com/williamFalcon/pytorch-lightning", - license="MIT", + version=pytorch_lightning.__version__, + description=pytorch_lightning.__doc__, + author=pytorch_lightning.__author__, + author_email=pytorch_lightning.__author_email__, + url=pytorch_lightning.__homepage__, + license=pytorch_lightning.__license__, + packages=['pytorch_lightning'], + + long_description=open("README.md", encoding="utf-8").read(), + long_description_content_type='text/markdown', + + include_package_data=True, + zip_safe=False, + keywords=["deep learning", "pytorch", "AI"], - python_requires=">=3.5", + python_requires=">=3.6", install_requires=[ "torch>=1.1.0", "tqdm", "test-tube>=0.6.7.6", ], - packages=find_packages(), - long_description=open("README.md", encoding="utf-8").read(), - long_description_content_type='text/markdown', - include_package_data=True, - zip_safe=False, + + classifiers=[ + 'Environment :: Console', + 'Natural Language :: English', + # How mature is this project? Common values are + # 3 - Alpha, 4 - Beta, 5 - Production/Stable + 'Development Status :: 4 - Beta', + # Indicate who your project is intended for + 'Intended Audience :: Developers', + 'Topic :: Scientific/Engineering :: Artificial Intelligence', + 'Topic :: Scientific/Engineering :: Image Recognition', + 'Topic :: Scientific/Engineering :: Information Analysis', + # Pick your license as you wish + 'License :: OSI Approved :: BSD License', + 'Operating System :: OS Independent', + # Specify the Python versions you support here. In particular, ensure + # that you indicate whether you support Python 2, Python 3 or both. 
+ 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', + ], ) diff --git a/tests/README.md b/tests/README.md index a121ff14145e1..032cf04f1d2de 100644 --- a/tests/README.md +++ b/tests/README.md @@ -17,7 +17,7 @@ pip install -e . pip install -r requirements.txt # run tests -py.test +py.test -v ``` To test models that require GPU make sure to run the above command on a GPU machine. @@ -50,10 +50,14 @@ cd pytorch-lightning # generate coverage pip install coverage -coverage run tests/test_models.py +coverage run --source pytorch_lightning -m py.test pytorch_lightning tests examples -v --doctest-modules # print coverage stats -coverage report -m +coverage report -m + +# exporting results +coverage xml +codecov -t 17327163-8cca-4a5d-86c8-ca5f2ef700bc -v ``` diff --git a/tests/debug.py b/tests/debug.py index 09e9186b9ccc1..6a5efbecfa2ca 100644 --- a/tests/debug.py +++ b/tests/debug.py @@ -61,8 +61,8 @@ def get_model(): root_dir = os.path.dirname(os.path.realpath(__file__)) hparams = Namespace(**{'drop_prob': 0.2, 'batch_size': 32, - 'in_features': 28*28, - 'learning_rate': 0.001*8, + 'in_features': 28 * 28, + 'learning_rate': 0.001 * 8, 'optimizer_name': 'adam', 'data_root': os.path.join(root_dir, 'mnist'), 'out_features': 10, @@ -107,7 +107,8 @@ def load_model(exp, save_dir): checkpoints = [x for x in os.listdir(save_dir) if '.ckpt' in x] weights_dir = os.path.join(save_dir, checkpoints[0]) - trained_model = LightningTemplateModel.load_from_metrics(weights_path=weights_dir, tags_csv=tags_path, on_gpu=True) + trained_model = LightningTemplateModel.load_from_metrics(weights_path=weights_dir, + tags_csv=tags_path, on_gpu=True) assert trained_model is not None, 'loading model failed' @@ -132,7 +133,7 @@ def run_prediction(dataloader, trained_model): print(val_acc) - assert val_acc > 0.70, f'this model is expected to get > 0.7 in test set (it got {val_acc})' + assert val_acc > 0.70, 'this model is expected to get > 0.7 in test set (it got %f)' % val_acc def main(): diff --git a/tests/requirements.txt b/tests/requirements.txt index c16efc2add252..076bfd6500639 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -1,2 +1,8 @@ -coverage==4.5.3 -pytest==5.0.1 +tox +coverage +codecov +pytest>=3.0.5 +pytest-cov +flake8 +check-manifest +test_tube \ No newline at end of file diff --git a/tests/test_models.py b/tests/test_models.py index 73ba2e43c1209..cd03d8b411399 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -1,20 +1,22 @@ import os import shutil import warnings +from argparse import Namespace import pytest import numpy as np import torch +from test_tube import Experiment, SlurmCluster + +# sys.path += [os.path.abspath('..'), os.path.abspath('../..')] from pytorch_lightning import Trainer -from examples import LightningTemplateModel from pytorch_lightning.testing.lm_test_module import LightningTestModel -from argparse import Namespace -from test_tube import Experiment, SlurmCluster from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping from pytorch_lightning.utilities.debugging import MisconfigurationException from pytorch_lightning.root_module import memory from pytorch_lightning.models.trainer import reduce_distributed_output from pytorch_lightning.root_module import model_saving +from examples import LightningTemplateModel SEED = 2334 torch.manual_seed(SEED) @@ -30,10 +32,12 @@ def test_amp_gpu_ddp(): """ :return: """
if not torch.cuda.is_available(): - warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a GPU node to run this test') + warnings.warn('test_amp_gpu_ddp cannot run.' + ' Rerun on a GPU node to run this test') return if not torch.cuda.device_count() > 1: - warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a node with 2+ GPUs to run this test') + warnings.warn('test_amp_gpu_ddp cannot run.' + ' Rerun on a node with 2+ GPUs to run this test') return os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0]) @@ -105,7 +109,8 @@ def test_cpu_slurm_save_load(): # wipe-out trainer and model # retrain with not much data... this simulates picking training back up after slurm # we want to see if the weights come back correctly - continue_tng_hparams = get_hparams(continue_training=True, hpc_exp_number=cluster_a.hpc_exp_number) + continue_tng_hparams = get_hparams(continue_training=True, + hpc_exp_number=cluster_a.hpc_exp_number) trainer_options = dict( max_nb_epochs=1, cluster=SlurmCluster(continue_tng_hparams), @@ -136,11 +141,9 @@ def assert_pred_same(): def test_loading_meta_tags(): hparams = get_hparams() - save_dir = init_save_dir() - # save tags exp = get_exp(False) - exp.tag({'some_str':'a_str', 'an_int': 1, 'a_float': 2.0}) + exp.tag({'some_str': 'a_str', 'an_int': 1, 'a_float': 2.0}) exp.argparse(hparams) exp.save() @@ -221,7 +224,8 @@ def test_model_saving_loading(): # load new model tags_path = exp.get_data_path(exp.name, exp.version) tags_path = os.path.join(tags_path, 'meta_tags.csv') - model_2 = LightningTestModel.load_from_metrics(weights_path=new_weights_path, tags_csv=tags_path, on_gpu=False) + model_2 = LightningTestModel.load_from_metrics(weights_path=new_weights_path, + tags_csv=tags_path, on_gpu=False) model_2.eval() # make prediction @@ -246,10 +250,12 @@ def test_amp_gpu_ddp_slurm_managed(): :return: """ if not torch.cuda.is_available(): - warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a GPU node to run this test') + warnings.warn('test_amp_gpu_ddp cannot run.' + ' Rerun on a GPU node to run this test') return if not torch.cuda.device_count() > 1: - warnings.warn('test_amp_gpu_ddp cannot run. Rerun on a node with 2+ GPUs to run this test') + warnings.warn('test_amp_gpu_ddp cannot run.' + ' Rerun on a node with 2+ GPUs to run this test') return # simulate setting slurm flags @@ -413,7 +419,8 @@ def test_single_gpu_model(): :return: """ if not torch.cuda.is_available(): - warnings.warn('test_single_gpu_model cannot run. Rerun on a GPU node to run this test') + warnings.warn('test_single_gpu_model cannot run.' + ' Rerun on a GPU node to run this test') return model, hparams = get_model() @@ -434,10 +441,12 @@ def test_multi_gpu_model_dp(): :return: """ if not torch.cuda.is_available(): - warnings.warn('test_multi_gpu_model_dp cannot run. Rerun on a GPU node to run this test') + warnings.warn('test_multi_gpu_model_dp cannot run.' + ' Rerun on a GPU node to run this test') return if not torch.cuda.device_count() > 1: - warnings.warn('test_multi_gpu_model_dp cannot run. Rerun on a node with 2+ GPUs to run this test') + warnings.warn('test_multi_gpu_model_dp cannot run.' + ' Rerun on a node with 2+ GPUs to run this test') return model, hparams = get_model() trainer_options = dict( @@ -460,10 +469,12 @@ def test_amp_gpu_dp(): :return: """ if not torch.cuda.is_available(): - warnings.warn('test_amp_gpu_dp cannot run. Rerun on a GPU node to run this test') + warnings.warn('test_amp_gpu_dp cannot run.'
+ ' Rerun on a GPU node to run this test') return if not torch.cuda.device_count() > 1: - warnings.warn('test_amp_gpu_dp cannot run. Rerun on a node with 2+ GPUs to run this test') + warnings.warn('test_amp_gpu_dp cannot run.' + ' Rerun on a node with 2+ GPUs to run this test') return model, hparams = get_model() trainer_options = dict( @@ -482,10 +493,12 @@ def test_multi_gpu_model_ddp(): :return: """ if not torch.cuda.is_available(): - warnings.warn('test_multi_gpu_model_ddp cannot run. Rerun on a GPU node to run this test') + warnings.warn('test_multi_gpu_model_ddp cannot run.' + ' Rerun on a GPU node to run this test') return if not torch.cuda.device_count() > 1: - warnings.warn('test_multi_gpu_model_ddp cannot run. Rerun on a node with 2+ GPUs to run this test') + warnings.warn('test_multi_gpu_model_ddp cannot run.' + ' Rerun on a node with 2+ GPUs to run this test') return os.environ['MASTER_PORT'] = str(np.random.randint(12000, 19000, 1)[0]) @@ -502,7 +515,6 @@ def test_multi_gpu_model_ddp(): run_gpu_model_test(trainer_options, model, hparams) - def test_ddp_sampler_error(): """ Make sure DDP + AMP work @@ -587,8 +599,8 @@ def get_hparams(continue_training=False, hpc_exp_number=0): args = { 'drop_prob': 0.2, 'batch_size': 32, - 'in_features': 28*28, - 'learning_rate': 0.001*8, + 'in_features': 28 * 28, + 'learning_rate': 0.001 * 8, 'optimizer_name': 'adam', 'data_root': os.path.join(root_dir, 'mnist'), 'out_features': 10, @@ -673,13 +685,13 @@ def run_prediction(dataloader, trained_model): print(val_acc) - assert val_acc > 0.50, f'this model is expected to get > 0.50 in test set (it got {val_acc})' + assert val_acc > 0.50, 'this model is expected to get > 0.50 in test set (it got %f)' % val_acc def assert_ok_acc(trainer): # this model should get 0.80+ acc acc = trainer.tng_tqdm_dic['val_acc'] - assert acc > 0.50, f'model failed to get expected 0.50 validation accuracy. Got: {acc}' + assert acc > 0.50, 'model failed to get expected 0.50 validation accuracy. Got: %f' % acc if __name__ == '__main__': diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000000000..a91ac5efce9d5 --- /dev/null +++ b/tox.ini @@ -0,0 +1,47 @@ +# this file is *not* meant to cover or endorse the use of tox or pytest or testing in general, +# +# It's meant to show the use of: +# +# - check-manifest +# confirm items checked into vcs are in your sdist +# - python setup.py check +# confirm required package meta-data in setup.py +# - readme_renderer (when using a ReStructuredText README) +# confirms your long_description will render correctly on PyPI. +# +# and also to help confirm pull requests to this project. + +[tox] +envlist = py{35,36,37} + +[pytest] +log_cli = 0 +log_cli_level = CRITICAL +log_cli_format = %(message)s +log_file = pytest.log +log_file_level = DEBUG +log_file_format = %(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(lineno)s) +log_file_date_format=%Y-%m-%d %H:%M:%S + +[testenv] +basepython = + py35: python3.5 + py36: python3.6 + py37: python3.7 +deps = + -r requirements.txt + -r ./tests/requirements.txt +commands = + check-manifest --ignore tox.ini + python setup.py check -m -s + coverage run --source pytorch_lightning -m py.test pytorch_lightning tests examples -v --doctest-modules + flake8 .
+ +[flake8] +exclude = .tox,*.egg,build,temp +select = E,W,F +doctests = True +verbose = 2 +# https://pep8.readthedocs.io/en/latest/intro.html#error-codes +format = pylint +max-line-length = 100 diff --git a/update.sh b/update.sh index 4eaf1149d083f..40fcc22d6b79b 100644 --- a/update.sh +++ b/update.sh @@ -11,10 +11,7 @@ rm -rf ./dist/* python3 setup.py sdist twine upload dist/* - - # to update docs # cd to root dir # mkdocs gh-deploy -
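Editor's note on the setup.py change above: the new `version=pytorch_lightning.__version__`, `author=pytorch_lightning.__author__`, etc. lines assume the package's top-level `__init__.py` exposes those attributes, which is not shown in this diff. Below is only a minimal sketch of what such a module would need to define; the attribute values are placeholders (taken from the old hard-coded setup.py where available), not the project's actual metadata.

```python
# Hypothetical sketch of pytorch_lightning/__init__.py -- not part of this diff.
# setup.py imports the package and reads these module-level attributes, so they
# must be defined at import time. All values here are placeholders.

"""The Keras for ML researchers using PyTorch."""  # read by setup() as pytorch_lightning.__doc__

__version__ = '0.3.6.9'       # placeholder; single source of truth for the release version
__author__ = 'William Falcon'
__author_email__ = 'waf2107@columbia.edu'
__homepage__ = 'https://github.com/williamFalcon/pytorch-lightning'
__license__ = 'MIT'           # placeholder; keep consistent with the License classifier in setup.py
```

One trade-off of this single-sourcing pattern is that `setup.py` must be able to import `pytorch_lightning` in a clean build environment, so the top-level `__init__.py` should avoid unconditionally importing heavy dependencies such as `torch`, or installation from source can fail before the requirements are installed.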