diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ff7a69713a..740cc5442d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,7 +1,7 @@ exclude: ^(docs/logos|pymc3/tests/data)/ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v3.4.0 + rev: v4.0.1 hooks: - id: check-merge-conflict - id: check-toml @@ -14,44 +14,43 @@ repos: exclude: ^requirements-dev\.txt$ - id: trailing-whitespace - repo: https://github.com/PyCQA/isort - rev: 5.7.0 + rev: 5.10.1 hooks: - id: isort name: isort - repo: https://github.com/asottile/pyupgrade - rev: v2.10.0 + rev: v2.29.1 hooks: - id: pyupgrade args: [--py37-plus] - repo: https://github.com/psf/black - rev: 20.8b1 + rev: 21.12b0 hooks: - id: black - repo: https://github.com/PyCQA/pylint - rev: pylint-2.6.0 + rev: v2.12.2 hooks: - id: pylint args: [--rcfile=.pylintrc] - files: ^pymc3/ + files: ^pymc/ +- repo: https://github.com/MarcoGorelli/madforhooks + rev: 0.2.1 + hooks: + - id: conda-env-sorter + files: ^conda-envs/environment-dev-py.+\.yml$ - repo: local hooks: - id: check-no-tests-are-ignored + additional_dependencies: [pandas,pyyaml] entry: python scripts/check_all_tests_are_covered.py files: ^\.github/workflows/pytest\.yml$ language: python name: Check no tests are ignored pass_filenames: false - - id: conda-env-sort - additional_dependencies: [pyyaml] - entry: python scripts/sort_conda_envs.py - files: ^conda-envs/environment-dev-py37\.yml$ - language: python - name: Sort dependencies in conda envs - types: [yaml] - id: pip-from-conda additional_dependencies: [pyyaml] entry: python scripts/generate_pip_deps_from_conda.py - files: ^conda-envs/ + files: ^conda-envs/environment-dev-py.+.yml$ language: python name: Generate pip dependency from conda - id: no-relative-imports diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md index 98b349cc03..558bfe4cf7 100644 --- a/RELEASE-NOTES.md +++ b/RELEASE-NOTES.md @@ -3,6 +3,8 @@ ## PyMC 3.11.5 (TBD) ### Backports + The `pm.logp(rv, x)` syntax is now available and recommended to make your model code `v4`-ready. Note that this backport is just an alias and much less capable than what's available with `pymc >=4` (see [#5083](https://github.com/pymc-devs/pymc/pulls/5083)). ++ The `pm.Distribution(testval=...)` kwarg was deprecated and will be replaced by `pm.Distribution(initval=...)`in `pymc >=4` (see [#5226](https://github.com/pymc-devs/pymc/pulls/5226)). ++ The `pm.sample(start=...)` kwarg was deprecated and will be replaced by `pm.sample(initvals=...)`in `pymc >=4` (see [#5226](https://github.com/pymc-devs/pymc/pulls/5226)). 
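A minimal usage sketch (editor's illustration, not part of the diff) of the forward-compatible spellings described in the bullets above. The model, variable name, and values are made up; the old `testval`/`start` spellings keep working on 3.11.5 but emit deprecation warnings, and whether a particular distribution subclass forwards `initval` depends on its own signature:

```python
import pymc3 as pm

with pm.Model():
    # v4-ready spelling: `initval` replaces the deprecated `testval` kwarg
    x = pm.Normal("x", mu=0.0, sigma=1.0, initval=0.5)

    # v4-ready spelling: `initvals` replaces the deprecated `start` kwarg
    trace = pm.sample(draws=500, tune=500, initvals={"x": 0.5}, return_inferencedata=False)
```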
## PyMC3 3.11.4 (20 August 2021) diff --git a/conda-envs/environment-dev-py37.yml b/conda-envs/environment-dev-py37.yml index c944732928..311e42d2c9 100644 --- a/conda-envs/environment-dev-py37.yml +++ b/conda-envs/environment-dev-py37.yml @@ -11,6 +11,7 @@ dependencies: - numpy=1.15 - numpydoc>=0.9 - pandas=0.24 +- pip - pre-commit>=2.8.0 - pytest-cov>=2.5 - pytest>=3.0 @@ -21,3 +22,5 @@ dependencies: - sphinx-autobuild>=0.7 - sphinx>=1.5 - watermark +- pip: + - deprecat diff --git a/conda-envs/environment-dev-py38.yml b/conda-envs/environment-dev-py38.yml index 24e9040cb3..c204bb6d01 100644 --- a/conda-envs/environment-dev-py38.yml +++ b/conda-envs/environment-dev-py38.yml @@ -9,6 +9,7 @@ dependencies: - mkl-service - nbsphinx>=0.4 - numpydoc>=0.9 +- pip - pre-commit>=2.8.0 - pytest-cov>=2.5 - pytest>=3.0 @@ -18,3 +19,5 @@ dependencies: - sphinx-autobuild>=0.7 - sphinx>=1.5 - watermark +- pip: + - deprecat diff --git a/conda-envs/environment-dev-py39.yml b/conda-envs/environment-dev-py39.yml index 6f6db58e67..77606a297e 100644 --- a/conda-envs/environment-dev-py39.yml +++ b/conda-envs/environment-dev-py39.yml @@ -9,6 +9,7 @@ dependencies: - mkl-service - nbsphinx>=0.4 - numpydoc>=0.9 +- pip - pre-commit>=2.8.0 - pytest-cov>=2.5 - pytest>=3.0 @@ -18,3 +19,5 @@ dependencies: - sphinx-autobuild>=0.7 - sphinx>=1.5 - watermark +- pip: + - deprecat diff --git a/conda-envs/windows-environment-dev-py38.yml b/conda-envs/windows-environment-dev-py38.yml index ea8b0622c0..92aed0febb 100644 --- a/conda-envs/windows-environment-dev-py38.yml +++ b/conda-envs/windows-environment-dev-py38.yml @@ -24,3 +24,5 @@ dependencies: - sphinx-autobuild>=0.7 - sphinx>=1.5 - watermark +- pip: + - deprecat diff --git a/docs/source/_static/main.css b/docs/source/_static/main.css index 16fffc59dc..87c4f842ec 100644 --- a/docs/source/_static/main.css +++ b/docs/source/_static/main.css @@ -3,3 +3,130 @@ margin: 0.5em; content: ":"; } + +:root { + --pst-color-primary: 19, 6, 84; + --pst-color-success: 40, 167, 69; + --pst-color-info: 0, 123, 255; /*23, 162, 184;*/ + --pst-color-warning: 255, 193, 7; + --pst-color-danger: 220, 53, 69; + --pst-color-text-base: 51, 51, 51; + + --pst-color-admonition-default: var(--pst-color-info); + --pst-color-admonition-note: var(--pst-color-info); + --pst-color-admonition-attention: var(--pst-color-warning); + --pst-color-admonition-caution: var(--pst-color-warning); + --pst-color-admonition-warning: var(--pst-color-warning); + --pst-color-admonition-danger: var(--pst-color-danger); + --pst-color-admonition-error: var(--pst-color-danger); + --pst-color-admonition-hint: var(--pst-color-success); + --pst-color-admonition-tip: var(--pst-color-success); + --pst-color-admonition-important: var(--pst-color-success); +} + +.admonition, +div.admonition { + margin:1.5625em auto; + padding:0 .6rem .8rem; + overflow:hidden; + page-break-inside:avoid; + border-left:.2rem solid; + border-left-color:rgba(var(--pst-color-admonition-default),1); + border-bottom-color:rgba(var(--pst-color-admonition-default),1); + border-right-color:rgba(var(--pst-color-admonition-default),1); + border-top-color:rgba(var(--pst-color-admonition-default),1); + border-radius:.2rem; + box-shadow:0 .2rem .5rem rgba(0,0,0,.05),0 0 .0625rem rgba(0,0,0,.1); + transition:color .25s,background-color .25s,border-color .25s +} +.admonition :last-child, +div.admonition :last-child { + margin-bottom:0 +} +.admonition p.admonition-title~*, +div.admonition p.admonition-title~* { + padding:0 1.4rem +} +.admonition>ol, +.admonition>ul, 
+div.admonition>ol, +div.admonition>ul { + margin-left:1em +} +.admonition>.admonition-title, +div.admonition>.admonition-title { + position:relative; + margin:0 -.6rem; + padding:.4rem .6rem .4rem 2rem; + font-weight:700; + background-color:rgba(var(--pst-color-admonition-default),.1) +} +.admonition>.admonition-title+*, +div.admonition>.admonition-title+* { + margin-top:.4em +} +.admonition.attention, +div.admonition.attention { + border-color:rgba(var(--pst-color-admonition-attention),1) +} +.admonition.attention>.admonition-title, +div.admonition.attention>.admonition-title { + background-color:rgba(var(--pst-color-admonition-attention),.1) +} +.admonition.caution, +div.admonition.caution { + border-color:rgba(var(--pst-color-admonition-caution),1) +} +.admonition.caution>.admonition-title, +div.admonition.caution>.admonition-title { + background-color:rgba(var(--pst-color-admonition-caution),.1) +} +div.admonition.warning { + border-color:rgba(var(--pst-color-admonition-warning),1) +} +.admonition.warning>.admonition-title, +div.admonition.warning>.admonition-title { + background-color:rgba(var(--pst-color-admonition-warning),.1) +} +div.admonition.danger { + border-color:rgba(var(--pst-color-admonition-danger),1) +} +.admonition.danger>.admonition-title, +div.admonition.danger>.admonition-title { + background-color:rgba(var(--pst-color-admonition-danger),.1) +} +div.admonition.error { + border-color:rgba(var(--pst-color-admonition-error),1) +} +.admonition.error>.admonition-title, +div.admonition.error>.admonition-title { + background-color:rgba(var(--pst-color-admonition-error),.1) +} +div.admonition.hint { + border-color:rgba(var(--pst-color-admonition-hint),1) +} +.admonition.hint>.admonition-title, +div.admonition.hint>.admonition-title { + background-color:rgba(var(--pst-color-admonition-hint),.1) +} +div.admonition.tip { + border-color:rgba(var(--pst-color-admonition-tip),1) +} +.admonition.tip>.admonition-title, +div.admonition.tip>.admonition-title { + background-color:rgba(var(--pst-color-admonition-tip),.1) +} +div.admonition.important { + border-color:rgba(var(--pst-color-admonition-important),1) +} +.admonition.important>.admonition-title, +div.admonition.important>.admonition-title { + background-color:rgba(var(--pst-color-admonition-important),.1) +} +div.admonition.note { + border-color:rgba(var(--pst-color-admonition-note),1) +} +.admonition.note>.admonition-title, +div.admonition.note>.admonition-title { + background-color:rgba(var(--pst-color-admonition-note),.1) +} diff --git a/docs/source/sphinxext/gallery_generator.py b/docs/source/sphinxext/gallery_generator.py index 46273e04d6..a04bfce673 100644 --- a/docs/source/sphinxext/gallery_generator.py +++ b/docs/source/sphinxext/gallery_generator.py @@ -194,7 +194,7 @@ def build_gallery(srcdir, gallery): with open(table_of_contents_file) as toc: table_of_contents = toc.read() - js_contents = "Gallery.examples = {}\n{}".format(json.dumps(data), table_of_contents) + js_contents = f"Gallery.examples = {json.dumps(data)}\n{table_of_contents}" with open(js_file, "w") as js: js.write(js_contents) diff --git a/pymc3/__init__.py b/pymc3/__init__.py index 131bd5b394..53ca2d3977 100644 --- a/pymc3/__init__.py +++ b/pymc3/__init__.py @@ -63,7 +63,7 @@ def __set_compiler_flags(): def _hotfix_theano_printing(): - """ This is a workaround for https://github.com/pymc-devs/aesara/issues/309 """ + """This is a workaround for https://github.com/pymc-devs/aesara/issues/309""" try: import pydot import theano.printing diff --git 
a/pymc3/backends/base.py b/pymc3/backends/base.py index 8b52c3e09c..41801cc472 100644 --- a/pymc3/backends/base.py +++ b/pymc3/backends/base.py @@ -364,7 +364,7 @@ def __getattr__(self, name): return self.get_values(name) if name in self.stat_names: return self.get_sampler_stats(name) - raise AttributeError("'{}' object has no attribute '{}'".format(type(self).__name__, name)) + raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'") def __len__(self): chain = self.chains[-1] diff --git a/pymc3/distributions/distribution.py b/pymc3/distributions/distribution.py index 3ba2d0a040..8ec1a3ea7c 100644 --- a/pymc3/distributions/distribution.py +++ b/pymc3/distributions/distribution.py @@ -24,6 +24,8 @@ import dill +from deprecat.sphinx import deprecat + if TYPE_CHECKING: from typing import Optional, Callable @@ -67,6 +69,7 @@ ) # type: contextvars.ContextVar[Optional[Callable]] PLATFORM = sys.platform +UNSET = object() class _Unpickling: @@ -95,7 +98,7 @@ def __new__(cls, name, *args, **kwargs): data = kwargs.pop("observed", None) cls.data = data if isinstance(data, ObservedRV) or isinstance(data, FreeRV): - raise TypeError("observed needs to be data but got: {}".format(type(data))) + raise TypeError(f"observed needs to be data but got: {type(data)}") total_size = kwargs.pop("total_size", None) dims = kwargs.pop("dims", None) @@ -130,15 +133,54 @@ def dist(cls, *args, **kwargs): dist.__init__(*args, **kwargs) return dist + @deprecat( + deprecated_args={ + "testval": dict(version="3.11.5", reason="replaced by `initval` in PyMC 4.0.0"), + } + ) def __init__( - self, shape, dtype, testval=None, defaults=(), transform=None, broadcastable=None, dims=None + self, + shape, + dtype, + initval=None, + defaults=(), + transform=None, + broadcastable=None, + dims=None, + *, + testval=UNSET, ): + """Creates a PyMC distribution object. + + Parameters + ---------- + shape : tuple + Output shape of the RV. + Forwarded to the Theano TensorType of this RV. + dtype + Forwarded to the Theano TensorType of this RV. + initval : np.ndarray + Initial value for this RV. + In PyMC 4.0.0 this will no longer assign test values to the tensors. + defaults : tuple + transform : pm.Transform + broadcastable : tuple + Forwarded to the Theano TensorType of this RV. + dims : tuple + Ignored. + testval : np.ndarray + The old way of specifying initial values assigning test-values. + """ + # Handle deprecated kwargs + if testval is not UNSET: + initval = testval + self.shape = np.atleast_1d(shape) if False in (np.floor(self.shape) == self.shape): raise TypeError("Expected int elements in shape") self.dtype = dtype self.type = TensorType(self.dtype, self.shape, broadcastable) - self.testval = testval + self.testval = initval self.defaults = defaults self.transform = transform @@ -288,7 +330,7 @@ def __init__( **kwargs, ): super().__init__( - shape=shape, dtype=dtype, testval=testval, defaults=defaults, *args, **kwargs + shape=shape, dtype=dtype, initval=testval, defaults=defaults, *args, **kwargs ) self.parent_dist = parent_dist @@ -353,16 +395,22 @@ class DensityDist(Distribution): """ + @deprecat( + deprecated_args={ + "testval": dict(version="3.11.5", reason="replaced by `initval` in PyMC 4.0.0"), + } + ) def __init__( self, logp, shape=(), dtype=None, - testval=0, + initval=0, random=None, wrap_random_with_dist_shape=True, check_shape_in_random=True, *args, + testval=UNSET, **kwargs, ): """ @@ -379,8 +427,8 @@ def __init__( a value here. dtype: None, str (Optional) The dtype of the distribution. 
- testval: number or array (Optional) - The ``testval`` of the RV's tensor that follow the ``DensityDist`` + initval: number or array (Optional) + The ``initval`` of the RV's tensor that follow the ``DensityDist`` distribution. random: None or callable (Optional) If ``None``, no random method is attached to the ``DensityDist`` @@ -403,6 +451,8 @@ def __init__( If ``True``, the shape of the random samples generate in the ``random`` method is checked with the expected return shape. This test is only performed if ``wrap_random_with_dist_shape is False``. + testval : np.ndarray + The old way of specifying initial values assigning test-values. args, kwargs: (Optional) These are passed to the parent class' ``__init__``. @@ -525,9 +575,13 @@ def __init__( assert prior.shape == (10, 100, 3) """ + # Handle deprecated kwargs + if testval is not UNSET: + initval = testval + if dtype is None: dtype = theano.config.floatX - super().__init__(shape, dtype, testval, *args, **kwargs) + super().__init__(shape, dtype, initval, *args, **kwargs) self.logp = logp if type(self.logp) == types.MethodType: if PLATFORM != "linux": diff --git a/pymc3/distributions/posterior_predictive.py b/pymc3/distributions/posterior_predictive.py index 470f0f02a7..17ee6b1839 100644 --- a/pymc3/distributions/posterior_predictive.py +++ b/pymc3/distributions/posterior_predictive.py @@ -342,7 +342,7 @@ def __init__(self, vars, trace: _TraceDict, samples, model: Model | None, size=N self.size = size self.logger = logging.getLogger("posterior_predictive") - def __enter__(self) -> "_PosteriorPredictiveSampler": + def __enter__(self) -> _PosteriorPredictiveSampler: self._tok = vectorized_ppc.set(posterior_predictive_draw_values) return self diff --git a/pymc3/gp/cov.py b/pymc3/gp/cov.py index a696c30b8a..c0b676daba 100644 --- a/pymc3/gp/cov.py +++ b/pymc3/gp/cov.py @@ -155,7 +155,7 @@ def __array_wrap__(self, result): class Combination(Covariance): def __init__(self, factor_list): input_dim = max( - [factor.input_dim for factor in factor_list if isinstance(factor, Covariance)] + factor.input_dim for factor in factor_list if isinstance(factor, Covariance) ) super().__init__(input_dim=input_dim) self.factor_list = [] diff --git a/pymc3/gp/util.py b/pymc3/gp/util.py index b0875b905a..54cac03367 100644 --- a/pymc3/gp/util.py +++ b/pymc3/gp/util.py @@ -36,7 +36,7 @@ def infer_shape(X, n_points=None): def stabilize(K): - """ adds small diagonal to a covariance matrix """ + """adds small diagonal to a covariance matrix""" return K + 1e-6 * tt.identity_like(K) @@ -62,7 +62,7 @@ def kmeans_inducing_points(n_inducing, X): def conditioned_vars(varnames): - """ Decorator for validating attrs that are conditioned on. """ + """Decorator for validating attrs that are conditioned on.""" def gp_wrapper(cls): def make_getter(name): diff --git a/pymc3/model.py b/pymc3/model.py index dff1e3b78b..c469a2355c 100644 --- a/pymc3/model.py +++ b/pymc3/model.py @@ -723,7 +723,7 @@ def __call__(self, array, grad_out=None, extra_vars=None): if array.shape != (self.size,): raise ValueError( - "Invalid shape for array. Must be {} but is {}.".format((self.size,), array.shape) + f"Invalid shape for array. Must be {(self.size,)} but is {array.shape}." 
) if grad_out is None: diff --git a/pymc3/model_graph.py b/pymc3/model_graph.py index cd3feb3070..a67706f707 100644 --- a/pymc3/model_graph.py +++ b/pymc3/model_graph.py @@ -84,7 +84,7 @@ def _filter_parents(self, var, parents) -> Set[VarName]: if self.transform_map[p] != var.name: keep.add(self.transform_map[p]) else: - raise AssertionError("Do not know what to do with {}".format(get_var_name(p))) + raise AssertionError(f"Do not know what to do with {get_var_name(p)}") return keep def get_parents(self, var: Tensor) -> Set[VarName]: diff --git a/pymc3/ode/ode.py b/pymc3/ode/ode.py index 2eba398404..6534d5c95f 100644 --- a/pymc3/ode/ode.py +++ b/pymc3/ode/ode.py @@ -136,7 +136,7 @@ def _simulate(self, y0, theta): def make_node(self, y0, theta): inputs = (y0, theta) - _log.debug("make_node for inputs {}".format(hash(inputs))) + _log.debug(f"make_node for inputs {hash(inputs)}") states = self._otypes[0]() sens = self._otypes[1]() @@ -220,7 +220,7 @@ def infer_shape(self, fgraph, node, input_shapes): return output_shapes def grad(self, inputs, output_grads): - _log.debug("grad w.r.t. inputs {}".format(hash(tuple(inputs)))) + _log.debug(f"grad w.r.t. inputs {hash(tuple(inputs))}") # fetch symbolic sensitivity output node from cache ihash = hash(tuple(inputs)) diff --git a/pymc3/sampling.py b/pymc3/sampling.py index 22bf8ff53a..3e63412f37 100644 --- a/pymc3/sampling.py +++ b/pymc3/sampling.py @@ -23,7 +23,7 @@ from collections import defaultdict from copy import copy, deepcopy -from typing import Any, Dict, Iterable, List, Optional, Set, Union, cast +from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Union, cast import arviz import numpy as np @@ -32,6 +32,7 @@ import xarray from arviz import InferenceData +from deprecat.sphinx import deprecat from fastprogress.fastprogress import progress_bar import pymc3 as pm @@ -232,12 +233,18 @@ def _print_step_hierarchy(s: Step, level=0) -> None: _log.info(">" * level + f"{s.__class__.__name__}: [{varnames}]") +@deprecat( + deprecated_args={ + "start": dict(version="3.11.5", reason="renamed to `initvals` in PyMC v4.0.0"), + "pickle_backend": dict(version="3.11.5", reason="removed in PyMC v4.0.0"), + } +) def sample( draws=1000, step=None, init="auto", n_init=200000, - start=None, + initvals: Optional[Union[PointType, Sequence[Optional[PointType]]]] = None, trace=None, chain_idx=0, chains=None, @@ -251,6 +258,7 @@ def sample( callback=None, jitter_max_retries=10, *, + start=None, return_inferencedata=None, idata_kwargs: dict = None, mp_ctx=None, @@ -294,11 +302,10 @@ def sample( users. n_init : int Number of iterations of initializer. Only works for 'ADVI' init methods. - start : dict, or array of dict - Starting point in parameter space (or partial point) - Defaults to ``trace.point(-1))`` if there is a trace provided and model.test_point if not - (defaults to empty dict). Initialization methods for NUTS (see ``init`` keyword) can - overwrite the default. + initvals : optional, dict, array of dict + Dict or list of dicts with initial values to use instead of the defaults. + The keys should be names of transformed random variables. + Initialization methods for NUTS (see ``init`` keyword) can overwrite the default. trace : backend, list, or MultiTrace This should be a backend instance, a list of variables to track, or a MultiTrace object with past values. 
If a MultiTrace object is given, it must contain samples for the chain @@ -339,6 +346,11 @@ def sample( Maximum number of repeated attempts (per chain) at creating an initial matrix with uniform jitter that yields a finite probability. This applies to ``jitter+adapt_diag`` and ``jitter+adapt_full`` init methods. + start : dict, or array of dict + Starting point in parameter space (or partial point) + Defaults to ``trace.point(-1))`` if there is a trace provided and model.test_point if not + (defaults to empty dict). Initialization methods for NUTS (see ``init`` keyword) can + overwrite the default. return_inferencedata : bool, default=False Whether to return the trace as an :class:`arviz:arviz.InferenceData` (True) object or a `MultiTrace` (False) Defaults to `False`, but we'll switch to `True` in an upcoming release. @@ -422,6 +434,10 @@ def sample( mean sd hdi_3% hdi_97% p 0.609 0.047 0.528 0.699 """ + # Handle deprecated/forwards-compatible kwargs + if initvals is not None: + start = initvals + model = modelcontext(model) start = deepcopy(start) if start is None: diff --git a/pymc3/step_methods/hmc/quadpotential.py b/pymc3/step_methods/hmc/quadpotential.py index 4c2e6acc7a..7fec886d96 100644 --- a/pymc3/step_methods/hmc/quadpotential.py +++ b/pymc3/step_methods/hmc/quadpotential.py @@ -161,13 +161,9 @@ def __init__( if initial_mean.ndim != 1: raise ValueError("Initial mean must be one-dimensional.") if initial_diag is not None and len(initial_diag) != n: - raise ValueError( - "Wrong shape for initial_diag: expected {} got {}".format(n, len(initial_diag)) - ) + raise ValueError(f"Wrong shape for initial_diag: expected {n} got {len(initial_diag)}") if len(initial_mean) != n: - raise ValueError( - "Wrong shape for initial_mean: expected {} got {}".format(n, len(initial_mean)) - ) + raise ValueError(f"Wrong shape for initial_mean: expected {n} got {len(initial_mean)}") if dtype is None: dtype = theano.config.floatX @@ -520,9 +516,7 @@ def __init__( if initial_cov is not None and initial_cov.shape != (n, n): raise ValueError(f"Wrong shape for initial_cov: expected {n} got {initial_cov.shape}") if len(initial_mean) != n: - raise ValueError( - "Wrong shape for initial_mean: expected {} got {}".format(n, len(initial_mean)) - ) + raise ValueError(f"Wrong shape for initial_mean: expected {n} got {len(initial_mean)}") if dtype is None: dtype = theano.config.floatX diff --git a/pymc3/step_methods/mlda.py b/pymc3/step_methods/mlda.py index c000af1e34..a438b5fd45 100644 --- a/pymc3/step_methods/mlda.py +++ b/pymc3/step_methods/mlda.py @@ -883,24 +883,16 @@ def update_error_estimate(self, accepted, skipped_logp): pm.set_data( { "mu_B": sum( - [ - bias.get_mu() - for bias in self.bias_all[ - : len(self.bias_all) - self.num_levels + 2 - ] - ] + bias.get_mu() + for bias in self.bias_all[: len(self.bias_all) - self.num_levels + 2] ) } ) pm.set_data( { "Sigma_B": sum( - [ - bias.get_sigma() - for bias in self.bias_all[ - : len(self.bias_all) - self.num_levels + 2 - ] - ] + bias.get_sigma() + for bias in self.bias_all[: len(self.bias_all) - self.num_levels + 2] ) } ) @@ -979,7 +971,7 @@ def extract_Q_estimate(trace, levels): Q_0_raw = trace.get_sampler_stats("Q_0") # total number of base level samples from all iterations - total_base_level_samples = sum([it.shape[0] for it in Q_0_raw]) + total_base_level_samples = sum(it.shape[0] for it in Q_0_raw) Q_0 = np.concatenate(Q_0_raw).reshape((1, total_base_level_samples)) ess_Q_0 = az.ess(np.array(Q_0, np.float64)) Q_0_var = Q_0.var() / ess_Q_0 @@ -989,7 +981,7 
@@ def extract_Q_estimate(trace, levels): for l in range(1, levels): Q_diff_raw = trace.get_sampler_stats(f"Q_{l}_{l-1}") # total number of samples from all iterations - total_level_samples = sum([it.shape[0] for it in Q_diff_raw]) + total_level_samples = sum(it.shape[0] for it in Q_diff_raw) Q_diff = np.concatenate(Q_diff_raw).reshape((1, total_level_samples)) ess_diff = az.ess(np.array(Q_diff, np.float64)) diff --git a/pymc3/tests/test_distributions.py b/pymc3/tests/test_distributions.py index 98baccb28f..351c65f7ab 100644 --- a/pymc3/tests/test_distributions.py +++ b/pymc3/tests/test_distributions.py @@ -280,7 +280,7 @@ def multinomial_logpdf(value, n, p): def dirichlet_multinomial_logpmf(value, n, a): - value, n, a = [np.asarray(x) for x in [value, n, a]] + value, n, a = (np.asarray(x) for x in [value, n, a]) assert value.ndim == 1 assert n.ndim == 0 assert a.shape == value.shape @@ -1875,7 +1875,7 @@ def test_multinomial_vec(self): ) assert_almost_equal( - sum([model_single.fastlogp({"m": val}) for val in vals]), + sum(model_single.fastlogp({"m": val}) for val in vals), model_many.fastlogp({"m": vals}), decimal=4, ) @@ -1889,7 +1889,7 @@ def test_multinomial_vec_1d_n(self): Multinomial("m", n=ns, p=p, shape=vals.shape) assert_almost_equal( - sum([multinomial_logpdf(val, n, p) for val, n in zip(vals, ns)]), + sum(multinomial_logpdf(val, n, p) for val, n in zip(vals, ns)), model.fastlogp({"m": vals}), decimal=4, ) @@ -1903,7 +1903,7 @@ def test_multinomial_vec_1d_n_2d_p(self): Multinomial("m", n=ns, p=ps, shape=vals.shape) assert_almost_equal( - sum([multinomial_logpdf(val, n, p) for val, n, p in zip(vals, ns, ps)]), + sum(multinomial_logpdf(val, n, p) for val, n, p in zip(vals, ns, ps)), model.fastlogp({"m": vals}), decimal=4, ) @@ -1917,7 +1917,7 @@ def test_multinomial_vec_2d_p(self): Multinomial("m", n=n, p=ps, shape=vals.shape) assert_almost_equal( - sum([multinomial_logpdf(val, n, p) for val, p in zip(vals, ps)]), + sum(multinomial_logpdf(val, n, p) for val, p in zip(vals, ps)), model.fastlogp({"m": vals}), decimal=4, ) @@ -2009,7 +2009,7 @@ def test_dirichlet_multinomial_vec(self): ) assert_almost_equal( - sum([model_single.fastlogp({"m": val}) for val in vals]), + sum(model_single.fastlogp({"m": val}) for val in vals), model_many.fastlogp({"m": vals}), decimal=4, ) @@ -2023,7 +2023,7 @@ def test_dirichlet_multinomial_vec_1d_n(self): DirichletMultinomial("m", n=ns, a=a, shape=vals.shape) assert_almost_equal( - sum([dirichlet_multinomial_logpmf(val, n, a) for val, n in zip(vals, ns)]), + sum(dirichlet_multinomial_logpmf(val, n, a) for val, n in zip(vals, ns)), model.fastlogp({"m": vals}), decimal=4, ) @@ -2037,7 +2037,7 @@ def test_dirichlet_multinomial_vec_1d_n_2d_a(self): DirichletMultinomial("m", n=ns, a=as_, shape=vals.shape) assert_almost_equal( - sum([dirichlet_multinomial_logpmf(val, n, a) for val, n, a in zip(vals, ns, as_)]), + sum(dirichlet_multinomial_logpmf(val, n, a) for val, n, a in zip(vals, ns, as_)), model.fastlogp({"m": vals}), decimal=4, ) @@ -2051,7 +2051,7 @@ def test_dirichlet_multinomial_vec_2d_a(self): DirichletMultinomial("m", n=n, a=as_, shape=vals.shape) assert_almost_equal( - sum([dirichlet_multinomial_logpmf(val, n, a) for val, a in zip(vals, as_)]), + sum(dirichlet_multinomial_logpmf(val, n, a) for val, a in zip(vals, as_)), model.fastlogp({"m": vals}), decimal=4, ) diff --git a/pymc3/tests/test_distributions_random.py b/pymc3/tests/test_distributions_random.py index fff795c1fb..a5e3c254f6 100644 --- a/pymc3/tests/test_distributions_random.py +++ 
b/pymc3/tests/test_distributions_random.py @@ -216,7 +216,7 @@ def setup_method(self, *args, **kwargs): self.model = pm.Model() def get_random_variable(self, shape, with_vector_params=False, name=None): - """ Creates a RandomVariable of the parametrized distribution. """ + """Creates a RandomVariable of the parametrized distribution.""" if with_vector_params: params = { key: value * np.ones(self.shape, dtype=np.dtype(type(value))) @@ -240,7 +240,7 @@ def get_random_variable(self, shape, with_vector_params=False, name=None): @staticmethod def sample_random_variable(random_variable, size): - """ Draws samples from a RandomVariable using its .random() method. """ + """Draws samples from a RandomVariable using its .random() method.""" try: if size is None: return random_variable.random() @@ -255,7 +255,7 @@ def sample_random_variable(random_variable, size): @pytest.mark.parametrize("size", [None, (), 1, (1,), 5, (4, 5)], ids=str) @pytest.mark.parametrize("shape", [None, ()], ids=str) def test_scalar_distribution_shape(self, shape, size): - """ Draws samples of different [size] from a scalar [shape] RV. """ + """Draws samples of different [size] from a scalar [shape] RV.""" rv = self.get_random_variable(shape) exp_shape = self.default_shape if shape is None else tuple(np.atleast_1d(shape)) exp_size = self.default_size if size is None else tuple(np.atleast_1d(size)) @@ -275,7 +275,7 @@ def test_scalar_distribution_shape(self, shape, size): "shape", [None, (), (1,), (1, 1), (1, 2), (10, 11, 1), (9, 10, 2)], ids=str ) def test_scalar_sample_shape(self, shape, size): - """ Draws samples of scalar [size] from a [shape] RV. """ + """Draws samples of scalar [size] from a [shape] RV.""" rv = self.get_random_variable(shape) exp_shape = self.default_shape if shape is None else tuple(np.atleast_1d(shape)) exp_size = self.default_size if size is None else tuple(np.atleast_1d(size)) @@ -934,6 +934,10 @@ def ref_rand_uchol(size, mu, rowchol, colchol): ref_rand=ref_rand_chol_transpose, ) + @pytest.mark.xfail( + condition=sys.platform.startswith("win"), + reason="Compilation problems. See https://github.com/pymc-devs/pymc/issues/5253", + ) def test_kronecker_normal(self): def ref_rand(size, mu, covs, sigma): cov = pm.math.kronecker(covs[0], covs[1]).eval() diff --git a/pymc3/tests/test_modelcontext.py b/pymc3/tests/test_modelcontext.py index b7d44ca63c..1e11a9a411 100644 --- a/pymc3/tests/test_modelcontext.py +++ b/pymc3/tests/test_modelcontext.py @@ -34,7 +34,7 @@ def test_thread_safety(self): that thread A enters the context manager first, then B, then A attempts to declare a variable while B is still in the context manager. 
""" - aInCtxt, bInCtxt, aDone = [threading.Event() for _ in range(3)] + aInCtxt, bInCtxt, aDone = (threading.Event() for _ in range(3)) modelA = Model() modelB = Model() diff --git a/pymc3/tests/test_random.py b/pymc3/tests/test_random.py index 7a4ae42ce2..dfdfee4f63 100644 --- a/pymc3/tests/test_random.py +++ b/pymc3/tests/test_random.py @@ -44,8 +44,8 @@ def test_draw_value(): assert _draw_value(5) == 5 assert _draw_value(5.0) == 5 - assert isinstance(_draw_value(5.0), type(5.0)) - assert isinstance(_draw_value(5), type(5)) + assert isinstance(_draw_value(5.0), float) + assert isinstance(_draw_value(5), int) with pm.Model(): mu = 2 * tt.constant(np.array([5.0, 6.0])) + theano.shared(np.array(5)) diff --git a/pymc3/tests/test_variational_inference.py b/pymc3/tests/test_variational_inference.py index 486db7d291..ac23171e51 100644 --- a/pymc3/tests/test_variational_inference.py +++ b/pymc3/tests/test_variational_inference.py @@ -145,7 +145,7 @@ def test_init_groups(three_var_model, raises, grouping): ({}, {MeanFieldGroup: (["one"], {}), FullRankGroup: (["two", "three"], {})}), ({}, {MeanFieldGroup: (["one"], {}), EmpiricalGroup: (["two", "three"], {"size": 100})}), ], - ids=lambda t: ", ".join("{}: {}".format(k.__name__, v[0]) for k, v in t[1].items()), + ids=lambda t: ", ".join(f"{k.__name__}: {v[0]}" for k, v in t[1].items()), ) def three_var_groups(request, three_var_model): kw, grouping = request.param @@ -199,7 +199,7 @@ def aevb_initial(): (NormalizingFlowGroup, {"flow": "radial"}), (NormalizingFlowGroup, {"flow": "radial-loc"}), ], - ids=lambda t: "{c}: {d}".format(c=t[0].__name__, d=t[1]), + ids=lambda t: f"{t[0].__name__}: {t[1]}", ) def parametric_grouped_approxes(request): return request.param diff --git a/pymc3/theanof.py b/pymc3/theanof.py index 7f22ce6f38..b2edccd80d 100644 --- a/pymc3/theanof.py +++ b/pymc3/theanof.py @@ -212,7 +212,7 @@ def grad(self, inp, grads): return grads def c_code(self, node, name, inp, out, sub): - return "{z} = {x};".format(x=inp[0], z=out[0]) + return f"{out[0]} = {inp[0]};" def __eq__(self, other): return isinstance(self, type(other)) diff --git a/pymc3/variational/inference.py b/pymc3/variational/inference.py index 85eb08e65c..2937dfa5ca 100644 --- a/pymc3/variational/inference.py +++ b/pymc3/variational/inference.py @@ -266,7 +266,7 @@ def _infmean(input_array): ) else: if n < 10: - logger.info("Finished [100%]: Loss = {:,.5g}".format(scores[-1])) + logger.info(f"Finished [100%]: Loss = {scores[-1]:,.5g}") else: avg_loss = _infmean(scores[max(0, i - 1000) : i + 1]) logger.info(f"Finished [100%]: Average Loss = {avg_loss:,.5g}") diff --git a/requirements-dev.txt b/requirements-dev.txt index ec92f04f0d..0f17952f5a 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,6 +1,7 @@ # This file is auto-generated by scripts/generate_pip_deps_from_conda.py, do not modify. # See that file for comments about the need/usage of each dependency. +deprecat h5py>=2.7 ipython>=7.16 nbsphinx>=0.4 diff --git a/requirements.txt b/requirements.txt index d0900a22d4..80f31124bc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ arviz>=0.11.0 cachetools>=4.2.1 +deprecat dill fastprogress>=0.2.0 numpy>=1.15.0 diff --git a/scripts/check_all_tests_are_covered.py b/scripts/check_all_tests_are_covered.py index 058a867d02..90a6a99d7a 100644 --- a/scripts/check_all_tests_are_covered.py +++ b/scripts/check_all_tests_are_covered.py @@ -6,23 +6,93 @@ This is intended to be used as a pre-commit hook, see `.pre-commit-config.yaml`. 
You can run it manually with `pre-commit run check-no-tests-are-ignored --all`. """ - -import re +import itertools +import logging +import os from pathlib import Path +import pandas +import yaml + +_log = logging.getLogger(__file__) +logging.basicConfig(level=logging.DEBUG) + + +def find_testfiles(): + dp_repo = Path(__file__).parent.parent + all_tests = { + str(fp.relative_to(dp_repo)).replace(os.sep, "/") + for fp in (dp_repo / "pymc3" / "tests").glob("**/test_*.py") + } + _log.info("Found %i tests in total.", len(all_tests)) + return all_tests + + +def from_yaml(): + """Determins how often each test file is run per platform and floatX setting. + + An exception is raised if tests run multiple times with the same configuration. + """ + # First collect the matrix definitions from testing workflows + matrices = {} + for wf in ["pytest.yml", "arviz_compat.yml"]: + wfname = wf.strip(".yml") + wfdef = yaml.safe_load(open(Path(".github", "workflows", wf))) + for jobname, jobdef in wfdef["jobs"].items(): + matrix = jobdef.get("strategy", {}).get("matrix", {}) + if matrix: + matrices[(wfname, jobname)] = matrix + else: + _log.warning("No matrix in %s/%s", wf, jobname) + + # Now create an empty DataFrame to count based on OS/floatX/testfile + all_os = [] + all_floatX = [] + for matrix in matrices.values(): + all_os += matrix["os"] + all_floatX += matrix["floatx"] + all_os = tuple(sorted(set(all_os))) + all_floatX = tuple(sorted(set(all_floatX))) + all_tests = find_testfiles() + + df = pandas.DataFrame( + columns=pandas.MultiIndex.from_product( + [sorted(all_floatX), sorted(all_os)], names=["floatX", "os"] + ), + index=pandas.Index(sorted(all_tests), name="testfile"), + ) + df.loc[:, :] = 0 + + # Count how often the testfiles are included in job definitions + for matrix in matrices.values(): + for os_, floatX, subset in itertools.product( + matrix["os"], matrix["floatx"], matrix["test-subset"] + ): + testfiles = subset.split("\n") + ignored = {item.strip("--ignore=") for item in testfiles if item.startswith("--ignore")} + included = {item for item in testfiles if item and not item.startswith("--ignore")} + if ignored and not included: + # if no testfile is specified explicitly pytest runs all except the ignored ones + included = all_tests - ignored + + for testfile in included: + df.loc[testfile, (floatX, os_)] += 1 + + ignored_by_all = set(df[df.eq(0).all(axis=1)].index) + run_multiple_times = set(df[df.gt(1).any(axis=1)].index) + + # Print summary, warnings and raise errors on unwanted configurations + _log.info("Number of test runs (❌=0, ✅=once)\n%s", df.replace(0, "❌").replace(1, "✅")) + + if ignored_by_all: + _log.warning("%i tests are completely ignored:\n%s", len(ignored_by_all), ignored_by_all) + if run_multiple_times: + raise Exception( + f"{len(run_multiple_times)} tests are run multiple times with the same OS and floatX setting:\n{run_multiple_times}" + ) + return + + if __name__ == "__main__": - testing_workflows = ["pytest.yml"] - ignored = set() - non_ignored = set() - for wfyml in testing_workflows: - pytest_ci_job = Path(".github") / "workflows" / wfyml - txt = pytest_ci_job.read_text() - ignored = set(re.findall(r"(?<=--ignore=)(pymc3/tests.*\.py)", txt)) - non_ignored = non_ignored.union(set(re.findall(r"(?
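The deprecations in this changeset all follow the same recurring pattern: the new keyword takes over the old one's position, the old keyword becomes keyword-only with an `UNSET` sentinel default, and the `deprecat` decorator documents and warns about it. Below is a condensed, self-contained sketch of that pattern, using the same `deprecat.sphinx.deprecat` import and `deprecated_args` mapping that the diff itself uses; the `Example` class and its arguments are illustrative, not taken from the diff:

```python
from deprecat.sphinx import deprecat

UNSET = object()  # sentinel: tells "not passed" apart from an explicit None


class Example:
    @deprecat(
        deprecated_args={
            "testval": dict(version="3.11.5", reason="replaced by `initval` in PyMC 4.0.0"),
        }
    )
    def __init__(self, initval=None, *, testval=UNSET):
        # Forward the deprecated kwarg onto its replacement, mirroring what
        # `Distribution.__init__` and `pm.sample` do in this changeset.
        if testval is not UNSET:
            initval = testval
        self.initval = initval


Example(initval=1.0)  # forward-compatible spelling
Example(testval=1.0)  # still accepted; `deprecat` should emit a DeprecationWarning
```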