diff --git a/HISTORY.md b/HISTORY.md
index fdb5125cb2..387d7f5ddf 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,3 +1,28 @@
+# Release 0.39.0
+
+## Update to the AdvancedVI interface
+
+Turing's variational inference interface was updated to match the version 0.4 interface of AdvancedVI.jl.
+
+AdvancedVI v0.4 introduces various new features:
+
+  - location-scale families with dense scale matrices,
+  - parameter-free stochastic optimization algorithms like `DoG` and `DoWG`,
+  - proximal operators for stable optimization,
+  - the sticking-the-landing control variate for faster convergence, and
+  - the score gradient estimator for non-differentiable targets.
+
+Please see the [Turing API documentation](https://turinglang.org/Turing.jl/stable/api/#Variational-inference) and [AdvancedVI's documentation](https://turinglang.org/AdvancedVI.jl/stable/) for more details.
+
+## Removal of Turing.Essential
+
+The Turing.Essential module has been removed.
+Anything exported from there can be imported from either `Turing` or `DynamicPPL`.
+
+## `@addlogprob!`
+
+The `@addlogprob!` macro is now exported from Turing, making it officially part of the public interface.
+
 # Release 0.38.6
 
 Added compatibility with AdvancedHMC 0.8.
diff --git a/Project.toml b/Project.toml
index 040ddac256..6fcb779e33 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "Turing"
 uuid = "fce5fe82-541a-59a6-adf8-730c64b5f9a0"
-version = "0.38.6"
+version = "0.39.0"
 
 [deps]
 ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
@@ -53,7 +53,7 @@ Accessors = "0.1"
 AdvancedHMC = "0.3.0, 0.4.0, 0.5.2, 0.6, 0.7, 0.8"
 AdvancedMH = "0.8"
 AdvancedPS = "0.6.0"
-AdvancedVI = "0.2"
+AdvancedVI = "0.4"
 BangBang = "0.4.2"
 Bijectors = "0.14, 0.15"
 Compat = "4.15.0"
diff --git a/docs/make.jl b/docs/make.jl
index 978e5881b3..af24e7b1ec 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -23,8 +23,11 @@ makedocs(;
     pages=[
         "Home" => "index.md",
         "API" => "api.md",
-        "Submodule APIs" =>
-            ["Inference" => "api/Inference.md", "Optimisation" => "api/Optimisation.md"],
+        "Submodule APIs" => [
+            "Inference" => "api/Inference.md",
+            "Optimisation" => "api/Optimisation.md",
+            "Variational" => "api/Variational.md",
+        ],
     ],
     checkdocs=:exports,
     doctest=false,
diff --git a/docs/src/api.md b/docs/src/api.md
index 01f022e7e5..0b8351eb3b 100644
--- a/docs/src/api.md
+++ b/docs/src/api.md
@@ -42,6 +42,7 @@ even though [`Prior()`](@ref) is actually defined in the `Turing.Inference` modu
 | `to_submodel` | [`DynamicPPL.to_submodel`](@extref) | Define a submodel |
 | `prefix` | [`DynamicPPL.prefix`](@extref) | Prefix all variable names in a model with a given VarName |
 | `LogDensityFunction` | [`DynamicPPL.LogDensityFunction`](@extref) | A struct containing all information about how to evaluate a model. Mostly for advanced users |
+| `@addlogprob!` | [`DynamicPPL.@addlogprob!`](@extref) | Add arbitrary log-probability terms during model evaluation |
 
 ### Inference
 
@@ -76,12 +77,14 @@ even though [`Prior()`](@ref) is actually defined in the `Turing.Inference` modu
 
 ### Variational inference
 
-See the [variational inference tutorial](https://turinglang.org/docs/tutorials/09-variational-inference/) for a walkthrough on how to use these.
+See the [docs of AdvancedVI.jl](https://turinglang.org/AdvancedVI.jl/stable/) for detailed usage and the [variational inference tutorial](https://turinglang.org/docs/tutorials/09-variational-inference/) for a basic walkthrough.
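For orientation, a minimal sketch of the updated workflow follows; the `demo` model, its synthetic data, and the iteration count are illustrative placeholders.

```julia
using Turing

@model function demo(x)
    s ~ InverseGamma(2, 3)
    m ~ Normal(0, sqrt(s))
    for i in eachindex(x)
        x[i] ~ Normal(m, sqrt(s))
    end
end

model = demo(randn(100))

# Numerically non-degenerate mean-field Gaussian initialization.
q_init = q_meanfield_gaussian(model)

# Run 1000 optimization steps; `vi` returns the last iterate, the averaged
# iterate, the optimizer state, and per-iteration diagnostics.
q_last, q_avg, state, info = vi(model, q_init, 1000)
```

Under the defaults in this release, `vi` optimizes a `RepGradELBO` objective with the parameter-free `DoWG` rule, polynomial iterate averaging, and a proximal entropy operator, so `q_avg` is typically the approximation to report.
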
-| Exported symbol | Documentation | Description | -|:--------------- |:---------------------------- |:--------------------------------------- | -| `vi` | [`AdvancedVI.vi`](@extref) | Perform variational inference | -| `ADVI` | [`AdvancedVI.ADVI`](@extref) | Construct an instance of a VI algorithm | +| Exported symbol | Documentation | Description | +|:---------------------- |:------------------------------------------------- |:---------------------------------------------------------------------------------------- | +| `vi` | [`Turing.vi`](@ref) | Perform variational inference | +| `q_locationscale` | [`Turing.Variational.q_locationscale`](@ref) | Find a numerically non-degenerate initialization for a location-scale variational family | +| `q_meanfield_gaussian` | [`Turing.Variational.q_meanfield_gaussian`](@ref) | Find a numerically non-degenerate initialization for a mean-field Gaussian family | +| `q_fullrank_gaussian` | [`Turing.Variational.q_fullrank_gaussian`](@ref) | Find a numerically non-degenerate initialization for a full-rank Gaussian family | ### Automatic differentiation types diff --git a/docs/src/api/Variational.md b/docs/src/api/Variational.md new file mode 100644 index 0000000000..382efe7e18 --- /dev/null +++ b/docs/src/api/Variational.md @@ -0,0 +1,6 @@ +# API: `Turing.Variational` + +```@autodocs +Modules = [Turing.Variational] +Order = [:type, :function] +``` diff --git a/src/Turing.jl b/src/Turing.jl index aa5fbe8500..1ff2310174 100644 --- a/src/Turing.jl +++ b/src/Turing.jl @@ -23,7 +23,7 @@ using Printf: Printf using Random: Random using LinearAlgebra: I -using ADTypes: ADTypes +using ADTypes: ADTypes, AutoForwardDiff, AutoReverseDiff, AutoMooncake const DEFAULT_ADTYPE = ADTypes.AutoForwardDiff() @@ -39,16 +39,12 @@ function setprogress!(progress::Bool) @info "[Turing]: progress logging is $(progress ? "enabled" : "disabled") globally" PROGRESS[] = progress AbstractMCMC.setprogress!(progress; silent=true) - # TODO: `AdvancedVI.turnprogress` is removed in AdvancedVI v0.3 - AdvancedVI.turnprogress(progress) return progress end # Random probability measures. include("stdlib/distributions.jl") include("stdlib/RandomMeasures.jl") -include("essential/Essential.jl") -using .Essential include("mcmc/Inference.jl") # inference algorithms using .Inference include("variational/VariationalInference.jl") @@ -57,13 +53,13 @@ using .Variational include("optimisation/Optimisation.jl") using .Optimisation -include("deprecated.jl") # to be removed in the next minor version release - ########### # Exports # ########### # `using` statements for stuff to re-export using DynamicPPL: + @model, + @varname, pointwise_loglikelihoods, generated_quantities, returned, @@ -73,9 +69,12 @@ using DynamicPPL: decondition, fix, unfix, + prefix, conditioned, + @submodel, to_submodel, - LogDensityFunction + LogDensityFunction, + @addlogprob! 
using StatsBase: predict using OrderedCollections: OrderedDict @@ -90,6 +89,7 @@ export to_submodel, prefix, LogDensityFunction, + @addlogprob!, # Sampling - AbstractMCMC sample, MCMCThreads, @@ -116,6 +116,9 @@ export # Variational inference - AdvancedVI vi, ADVI, + q_locationscale, + q_meanfield_gaussian, + q_fullrank_gaussian, # ADTypes AutoForwardDiff, AutoReverseDiff, diff --git a/src/deprecated.jl b/src/deprecated.jl deleted file mode 100644 index 34305d4423..0000000000 --- a/src/deprecated.jl +++ /dev/null @@ -1,39 +0,0 @@ -export setadbackend, setchunksize, setadsafe - -function setadbackend(::Union{Symbol,Val}) - Base.depwarn( - "`ADBACKEND` and `setbackend` are deprecated. Please specify the chunk size directly in the sampler constructor, e.g., `HMC(0.1, 5; adtype=AutoForwardDiff())`.\n This function has no effects.", - :setbackend; - force=true, - ) - return nothing -end - -function setchunksize(::Int) - Base.depwarn( - "`CHUNKSIZE` and `setchunksize` are deprecated. Please specify the chunk size directly in the sampler constructor, e.g., `HMC(0.1, 5; adtype=AutoForwardDiff())`.\n This function has no effects.", - :setchunksize; - force=true, - ) - return nothing -end - -function setrdcache(::Union{Bool,Val}) - Base.depwarn( - "`RDCACHE` and `setrdcache` are deprecated. Please specify if you wish to use compiled tape for ReverseDiff directly in the sampler constructor, e.g., `HMC(0.1, 5; adtype=AutoReverseDiff(false))`.\n This function has no effects.", - :setrdcache; - force=true, - ) - return nothing -end - -function setadsafe(::Bool) - Base.depwarn( - "`ADSAFE` and `setadsafe` are outdated and no longer in use.", - :setadsafe; - force=true, - ) - return nothing -end - -Base.@deprecate_binding Core Essential false diff --git a/src/essential/Essential.jl b/src/essential/Essential.jl deleted file mode 100644 index b045e4a857..0000000000 --- a/src/essential/Essential.jl +++ /dev/null @@ -1,24 +0,0 @@ -module Essential - -using DistributionsAD, Bijectors -using Libtask, ForwardDiff, Random -using Distributions, LinearAlgebra -using Reexport -using ..Turing: Turing -using DynamicPPL: Model, AbstractSampler, Sampler, SampleFromPrior -using LinearAlgebra: copytri! -using Bijectors: PDMatDistribution -using AdvancedVI -using StatsFuns: logsumexp, softmax -@reexport using DynamicPPL -using ADTypes: ADTypes, AutoForwardDiff, AutoReverseDiff, AutoMooncake - -using AdvancedPS: AdvancedPS - -include("container.jl") - -export @model -export @varname -export AutoForwardDiff, AutoReverseDiff, AutoMooncake - -end # module diff --git a/src/essential/container.jl b/src/essential/container.jl deleted file mode 100644 index a1012d471f..0000000000 --- a/src/essential/container.jl +++ /dev/null @@ -1,70 +0,0 @@ -struct TracedModel{S<:AbstractSampler,V<:AbstractVarInfo,M<:Model,E<:Tuple} <: - AdvancedPS.AbstractGenericModel - model::M - sampler::S - varinfo::V - evaluator::E -end - -function TracedModel( - model::Model, - sampler::AbstractSampler, - varinfo::AbstractVarInfo, - rng::Random.AbstractRNG, -) - context = SamplingContext(rng, sampler, DefaultContext()) - args, kwargs = DynamicPPL.make_evaluate_args_and_kwargs(model, varinfo, context) - if kwargs !== nothing && !isempty(kwargs) - error( - "Sampling with `$(sampler.alg)` does not support models with keyword arguments. See issue #2007 for more details.", - ) - end - return TracedModel{AbstractSampler,AbstractVarInfo,Model,Tuple}( - model, sampler, varinfo, (model.f, args...) 
- ) -end - -function AdvancedPS.advance!( - trace::AdvancedPS.Trace{<:AdvancedPS.LibtaskModel{<:TracedModel}}, isref::Bool=false -) - # Make sure we load/reset the rng in the new replaying mechanism - DynamicPPL.increment_num_produce!(trace.model.f.varinfo) - isref ? AdvancedPS.load_state!(trace.rng) : AdvancedPS.save_state!(trace.rng) - score = consume(trace.model.ctask) - if score === nothing - return nothing - else - return score + DynamicPPL.getlogp(trace.model.f.varinfo) - end -end - -function AdvancedPS.delete_retained!(trace::TracedModel) - DynamicPPL.set_retained_vns_del!(trace.varinfo) - return trace -end - -function AdvancedPS.reset_model(trace::TracedModel) - DynamicPPL.reset_num_produce!(trace.varinfo) - return trace -end - -function AdvancedPS.reset_logprob!(trace::TracedModel) - DynamicPPL.resetlogp!!(trace.model.varinfo) - return trace -end - -function AdvancedPS.update_rng!( - trace::AdvancedPS.Trace{<:AdvancedPS.LibtaskModel{<:TracedModel}} -) - # Extract the `args`. - args = trace.model.ctask.args - # From `args`, extract the `SamplingContext`, which contains the RNG. - sampling_context = args[3] - rng = sampling_context.rng - trace.rng = rng - return trace -end - -function Libtask.TapedTask(model::TracedModel, ::Random.AbstractRNG, args...; kwargs...) # RNG ? - return Libtask.TapedTask(model.evaluator[1], model.evaluator[2:end]...; kwargs...) -end diff --git a/src/mcmc/Inference.jl b/src/mcmc/Inference.jl index 3140215215..a58ead419e 100644 --- a/src/mcmc/Inference.jl +++ b/src/mcmc/Inference.jl @@ -1,7 +1,7 @@ module Inference -using ..Essential using DynamicPPL: + @model, Metadata, VarInfo, # TODO(mhauru) all_varnames_grouped_by_symbol isn't exported by DPPL, because it is only diff --git a/src/mcmc/particle_mcmc.jl b/src/mcmc/particle_mcmc.jl index afa3a8a6fb..ac5cd76488 100644 --- a/src/mcmc/particle_mcmc.jl +++ b/src/mcmc/particle_mcmc.jl @@ -2,6 +2,79 @@ ### Particle Filtering and Particle MCMC Samplers. ### +### AdvancedPS models and interface + +struct TracedModel{S<:AbstractSampler,V<:AbstractVarInfo,M<:Model,E<:Tuple} <: + AdvancedPS.AbstractGenericModel + model::M + sampler::S + varinfo::V + evaluator::E +end + +function TracedModel( + model::Model, + sampler::AbstractSampler, + varinfo::AbstractVarInfo, + rng::Random.AbstractRNG, +) + context = SamplingContext(rng, sampler, DefaultContext()) + args, kwargs = DynamicPPL.make_evaluate_args_and_kwargs(model, varinfo, context) + if kwargs !== nothing && !isempty(kwargs) + error( + "Sampling with `$(sampler.alg)` does not support models with keyword arguments. See issue #2007 for more details.", + ) + end + return TracedModel{AbstractSampler,AbstractVarInfo,Model,Tuple}( + model, sampler, varinfo, (model.f, args...) + ) +end + +function AdvancedPS.advance!( + trace::AdvancedPS.Trace{<:AdvancedPS.LibtaskModel{<:TracedModel}}, isref::Bool=false +) + # Make sure we load/reset the rng in the new replaying mechanism + DynamicPPL.increment_num_produce!(trace.model.f.varinfo) + isref ? 
AdvancedPS.load_state!(trace.rng) : AdvancedPS.save_state!(trace.rng) + score = consume(trace.model.ctask) + if score === nothing + return nothing + else + return score + DynamicPPL.getlogp(trace.model.f.varinfo) + end +end + +function AdvancedPS.delete_retained!(trace::TracedModel) + DynamicPPL.set_retained_vns_del!(trace.varinfo) + return trace +end + +function AdvancedPS.reset_model(trace::TracedModel) + DynamicPPL.reset_num_produce!(trace.varinfo) + return trace +end + +function AdvancedPS.reset_logprob!(trace::TracedModel) + DynamicPPL.resetlogp!!(trace.model.varinfo) + return trace +end + +function AdvancedPS.update_rng!( + trace::AdvancedPS.Trace{<:AdvancedPS.LibtaskModel{<:TracedModel}} +) + # Extract the `args`. + args = trace.model.ctask.args + # From `args`, extract the `SamplingContext`, which contains the RNG. + sampling_context = args[3] + rng = sampling_context.rng + trace.rng = rng + return trace +end + +function Libtask.TapedTask(model::TracedModel, ::Random.AbstractRNG, args...; kwargs...) # RNG ? + return Libtask.TapedTask(model.evaluator[1], model.evaluator[2:end]...; kwargs...) +end + abstract type ParticleInference <: InferenceAlgorithm end #### @@ -410,7 +483,7 @@ function AdvancedPS.Trace( newvarinfo = deepcopy(varinfo) DynamicPPL.reset_num_produce!(newvarinfo) - tmodel = Turing.Essential.TracedModel(model, sampler, newvarinfo, rng) + tmodel = TracedModel(model, sampler, newvarinfo, rng) newtrace = AdvancedPS.Trace(tmodel, rng) AdvancedPS.addreference!(newtrace.model.ctask.task, newtrace) return newtrace diff --git a/src/variational/VariationalInference.jl b/src/variational/VariationalInference.jl index 189d3f7001..b9428af112 100644 --- a/src/variational/VariationalInference.jl +++ b/src/variational/VariationalInference.jl @@ -1,50 +1,329 @@ + module Variational -using DistributionsAD: DistributionsAD -using DynamicPPL: DynamicPPL -using StatsBase: StatsBase -using StatsFuns: StatsFuns -using LogDensityProblems: LogDensityProblems +using DynamicPPL +using ADTypes using Distributions +using LinearAlgebra +using LogDensityProblems +using Random -using Random: Random +import ..Turing: DEFAULT_ADTYPE, PROGRESS import AdvancedVI import Bijectors -# Reexports -using AdvancedVI: vi, ADVI, ELBO, elbo, TruncatedADAGrad, DecayedADAGrad -export vi, ADVI, ELBO, elbo, TruncatedADAGrad, DecayedADAGrad +export vi, q_locationscale, q_meanfield_gaussian, q_fullrank_gaussian + +include("deprecated.jl") + +function make_logdensity(model::DynamicPPL.Model) + weight = 1.0 + ctx = DynamicPPL.MiniBatchContext(DynamicPPL.DefaultContext(), weight) + return DynamicPPL.LogDensityFunction(model, DynamicPPL.VarInfo(model), ctx) +end """ - make_logjoint(model::Model; weight = 1.0) -Constructs the logjoint as a function of latent variables, i.e. the map z → p(x ∣ z) p(z). -The weight used to scale the likelihood, e.g. when doing stochastic gradient descent one needs to -use `DynamicPPL.MiniBatch` context to run the `Model` with a weight `num_total_obs / batch_size`. -## Notes -- For sake of efficiency, the returned function is closes over an instance of `VarInfo`. This means that you *might* run into some weird behaviour if you call this method sequentially using different types; if that's the case, just generate a new one for each type using `make_logjoint`. 
+ q_initialize_scale( + [rng::Random.AbstractRNG,] + model::DynamicPPL.Model, + location::AbstractVector, + scale::AbstractMatrix, + basedist::Distributions.UnivariateDistribution; + num_samples::Int = 10, + num_max_trials::Int = 10, + reduce_factor::Real = one(eltype(scale)) / 2 + ) + +Given an initial location-scale distribution `q` formed by `location`, `scale`, and `basedist`, shrink `scale` until the expectation of log-densities of `model` taken over `q` are finite. +If the log-densities are not finite even after `num_max_trials`, throw an error. + +For reference, a location-scale distribution \$q\$ formed by `location`, `scale`, and `basedist` is a distribution where its sampling process \$z \\sim q\$ can be represented as +```julia +u = rand(basedist, d) +z = scale * u + location +``` + +# Arguments +- `model`: The target `DynamicPPL.Model`. +- `location`: The location parameter of the initialization. +- `scale`: The scale parameter of the initialization. +- `basedist`: The base distribution of the location-scale family. + +# Keyword Arguments +- `num_samples`: Number of samples used to compute the average log-density at each trial. +- `num_max_trials`: Number of trials until throwing an error. +- `reduce_factor`: Factor for shrinking the scale. After `n` trials, the scale is then `scale*reduce_factor^n`. + +# Returns +- `scale_adj`: The adjusted scale matrix matching the type of `scale`. """ -function make_logjoint(model::DynamicPPL.Model; weight=1.0) - # setup - ctx = DynamicPPL.MiniBatchContext(DynamicPPL.DefaultContext(), weight) - f = DynamicPPL.LogDensityFunction(model, DynamicPPL.VarInfo(model), ctx) - return Base.Fix1(LogDensityProblems.logdensity, f) +function q_initialize_scale( + rng::Random.AbstractRNG, + model::DynamicPPL.Model, + location::AbstractVector, + scale::AbstractMatrix, + basedist::Distributions.UnivariateDistribution; + num_samples::Int=10, + num_max_trials::Int=10, + reduce_factor::Real=one(eltype(scale)) / 2, +) + prob = make_logdensity(model) + ℓπ = Base.Fix1(LogDensityProblems.logdensity, prob) + varinfo = DynamicPPL.VarInfo(model) + + n_trial = 0 + while true + q = AdvancedVI.MvLocationScale(location, scale, basedist) + b = Bijectors.bijector(model; varinfo=varinfo) + q_trans = Bijectors.transformed(q, Bijectors.inverse(b)) + energy = mean(ℓπ, eachcol(rand(rng, q_trans, num_samples))) + + if isfinite(energy) + return scale + elseif n_trial == num_max_trials + error("Could not find an initial") + end + + scale = reduce_factor * scale + n_trial += 1 + end end -# objectives -function (elbo::ELBO)( +""" + q_locationscale( + [rng::Random.AbstractRNG,] + model::DynamicPPL.Model; + location::Union{Nothing,<:AbstractVector} = nothing, + scale::Union{Nothing,<:Diagonal,<:LowerTriangular} = nothing, + meanfield::Bool = true, + basedist::Distributions.UnivariateDistribution = Normal() + ) + +Find a numerically non-degenerate variational distribution `q` for approximating the target `model` within the location-scale variational family formed by the type of `scale` and `basedist`. + +The distribution can be manually specified by setting `location`, `scale`, and `basedist`. +Otherwise, it chooses a standard Gaussian by default. +Whether the default choice is used or not, the `scale` may be adjusted via `q_initialize_scale` so that the log-densities of `model` are finite over the samples from `q`. +If `meanfield` is set as `true`, the scale of `q` is restricted to be a diagonal matrix and only the diagonal of `scale` is used. 
+ +For reference, a location-scale distribution \$q\$ formed by `location`, `scale`, and `basedist` is a distribution where its sampling process \$z \\sim q\$ can be represented as +```julia +u = rand(basedist, d) +z = scale * u + location +``` + +# Arguments +- `model`: The target `DynamicPPL.Model`. + +# Keyword Arguments +- `location`: The location parameter of the initialization. If `nothing`, a vector of zeros is used. +- `scale`: The scale parameter of the initialization. If `nothing`, an identity matrix is used. +- `meanfield`: Whether to use the mean-field approximation. If `true`, `scale` is converted into a `Diagonal` matrix. Otherwise, it is converted into a `LowerTriangular` matrix. +- `basedist`: The base distribution of the location-scale family. + +The remaining keywords are passed to `q_initialize_scale`. + +# Returns +- `q::Bijectors.TransformedDistribution`: A `AdvancedVI.LocationScale` distribution matching the support of `model`. +""" +function q_locationscale( + rng::Random.AbstractRNG, + model::DynamicPPL.Model; + location::Union{Nothing,<:AbstractVector}=nothing, + scale::Union{Nothing,<:Diagonal,<:LowerTriangular}=nothing, + meanfield::Bool=true, + basedist::Distributions.UnivariateDistribution=Normal(), + kwargs..., +) + varinfo = DynamicPPL.VarInfo(model) + # Use linked `varinfo` to determine the correct number of parameters. + # TODO: Replace with `length` once this is implemented for `VarInfo`. + varinfo_linked = DynamicPPL.link(varinfo, model) + num_params = length(varinfo_linked[:]) + + μ = if isnothing(location) + zeros(num_params) + else + @assert length(location) == num_params "Length of the provided location vector, $(length(location)), does not match dimension of the target distribution, $(num_params)." + location + end + + L = if isnothing(scale) + if meanfield + q_initialize_scale(rng, model, μ, Diagonal(ones(num_params)), basedist; kwargs...) + else + L0 = LowerTriangular(Matrix{Float64}(I, num_params, num_params)) + q_initialize_scale(rng, model, μ, L0, basedist; kwargs...) + end + else + @assert size(scale) == (num_params, num_params) "Dimensions of the provided scale matrix, $(size(scale)), does not match the dimension of the target distribution, $(num_params)." + if meanfield + Diagonal(diag(scale)) + else + LowerTriangular(Matrix(scale)) + end + end + q = AdvancedVI.MvLocationScale(μ, L, basedist) + b = Bijectors.bijector(model; varinfo=varinfo) + return Bijectors.transformed(q, Bijectors.inverse(b)) +end + +function q_locationscale(model::DynamicPPL.Model; kwargs...) + return q_locationscale(Random.default_rng(), model; kwargs...) +end + +""" + q_meanfield_gaussian( + [rng::Random.AbstractRNG,] + model::DynamicPPL.Model; + location::Union{Nothing,<:AbstractVector} = nothing, + scale::Union{Nothing,<:Diagonal} = nothing, + kwargs... + ) + +Find a numerically non-degenerate mean-field Gaussian `q` for approximating the target `model`. + +# Arguments +- `model`: The target `DynamicPPL.Model`. + +# Keyword Arguments +- `location`: The location parameter of the initialization. If `nothing`, a vector of zeros is used. +- `scale`: The scale parameter of the initialization. If `nothing`, an identity matrix is used. + +The remaining keyword arguments are passed to `q_locationscale`. + +# Returns +- `q::Bijectors.TransformedDistribution`: A `AdvancedVI.LocationScale` distribution matching the support of `model`. 
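A brief illustrative sketch (the one-line `demo` model below is an assumed placeholder): because the result is wrapped in the model's bijector, draws from it already lie in the support of the target.

```julia
using Turing

@model demo() = x ~ LogNormal()
model = demo()

# Mean-field Gaussian initialization in the unconstrained space, wrapped so
# that samples are mapped back to the constrained space of `model`.
q = q_meanfield_gaussian(model)

rand(q)  # 1-element vector; the entry is positive, matching `x ~ LogNormal()`
```
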
+""" +function q_meanfield_gaussian( + rng::Random.AbstractRNG, + model::DynamicPPL.Model; + location::Union{Nothing,<:AbstractVector}=nothing, + scale::Union{Nothing,<:Diagonal}=nothing, + kwargs..., +) + return q_locationscale( + rng, model; location, scale, meanfield=true, basedist=Normal(), kwargs... + ) +end + +function q_meanfield_gaussian(model::DynamicPPL.Model; kwargs...) + return q_meanfield_gaussian(Random.default_rng(), model; kwargs...) +end + +""" + q_fullrank_gaussian( + [rng::Random.AbstractRNG,] + model::DynamicPPL.Model; + location::Union{Nothing,<:AbstractVector} = nothing, + scale::Union{Nothing,<:LowerTriangular} = nothing, + kwargs... + ) + +Find a numerically non-degenerate Gaussian `q` with a scale with full-rank factors (traditionally referred to as a "full-rank family") for approximating the target `model`. + +# Arguments +- `model`: The target `DynamicPPL.Model`. + +# Keyword Arguments +- `location`: The location parameter of the initialization. If `nothing`, a vector of zeros is used. +- `scale`: The scale parameter of the initialization. If `nothing`, an identity matrix is used. + +The remaining keyword arguments are passed to `q_locationscale`. + +# Returns +- `q::Bijectors.TransformedDistribution`: A `AdvancedVI.LocationScale` distribution matching the support of `model`. +""" +function q_fullrank_gaussian( + rng::Random.AbstractRNG, + model::DynamicPPL.Model; + location::Union{Nothing,<:AbstractVector}=nothing, + scale::Union{Nothing,<:LowerTriangular}=nothing, + kwargs..., +) + return q_locationscale( + rng, model; location, scale, meanfield=false, basedist=Normal(), kwargs... + ) +end + +function q_fullrank_gaussian(model::DynamicPPL.Model; kwargs...) + return q_fullrank_gaussian(Random.default_rng(), model; kwargs...) +end + +""" + vi( + [rng::Random.AbstractRNG,] + model::DynamicPPL.Model; + q, + n_iterations::Int; + objective::AdvancedVI.AbstractVariationalObjective = AdvancedVI.RepGradELBO( + 10; entropy = AdvancedVI.ClosedFormEntropyZeroGradient() + ), + show_progress::Bool = Turing.PROGRESS[], + optimizer::Optimisers.AbstractRule = AdvancedVI.DoWG(), + averager::AdvancedVI.AbstractAverager = AdvancedVI.PolynomialAveraging(), + operator::AdvancedVI.AbstractOperator = AdvancedVI.ProximalLocationScaleEntropy(), + adtype::ADTypes.AbstractADType = Turing.DEFAULT_ADTYPE, + kwargs... + ) + +Approximating the target `model` via variational inference by optimizing `objective` with the initialization `q`. +This is a thin wrapper around `AdvancedVI.optimize`. + +# Arguments +- `model`: The target `DynamicPPL.Model`. +- `q`: The initial variational approximation. +- `n_iterations`: Number of optimization steps. + +# Keyword Arguments +- `objective`: Variational objective to be optimized. +- `show_progress`: Whether to show the progress bar. +- `optimizer`: Optimization algorithm. +- `averager`: Parameter averaging strategy. +- `operator`: Operator applied after each optimization step. +- `adtype`: Automatic differentiation backend. + +See the docs of `AdvancedVI.optimize` for additional keyword arguments. + +# Returns +- `q`: Variational distribution formed by the last iterate of the optimization run. +- `q_avg`: Variational distribution formed by the averaged iterates according to `averager`. +- `state`: Collection of states used for optimization. This can be used to resume from a past call to `vi`. +- `info`: Information generated during the optimization run. 
+""" +function vi( rng::Random.AbstractRNG, - alg::AdvancedVI.VariationalInference, - q, model::DynamicPPL.Model, - num_samples; - weight=1.0, + q, + n_iterations::Int; + objective=AdvancedVI.RepGradELBO( + 10; entropy=AdvancedVI.ClosedFormEntropyZeroGradient() + ), + show_progress::Bool=PROGRESS[], + optimizer=AdvancedVI.DoWG(), + averager=AdvancedVI.PolynomialAveraging(), + operator=AdvancedVI.ProximalLocationScaleEntropy(), + adtype::ADTypes.AbstractADType=DEFAULT_ADTYPE, kwargs..., ) - return elbo(rng, alg, q, make_logjoint(model; weight=weight), num_samples; kwargs...) + return AdvancedVI.optimize( + rng, + make_logdensity(model), + objective, + q, + n_iterations; + show_progress=show_progress, + adtype, + optimizer, + averager, + operator, + kwargs..., + ) end -# VI algorithms -include("advi.jl") +function vi(model::DynamicPPL.Model, q, n_iterations::Int; kwargs...) + return vi(Random.default_rng(), model, q, n_iterations; kwargs...) +end end diff --git a/src/variational/advi.jl b/src/variational/advi.jl deleted file mode 100644 index 3819109a09..0000000000 --- a/src/variational/advi.jl +++ /dev/null @@ -1,70 +0,0 @@ -""" - meanfield([rng, ]model::Model) - -Creates a mean-field approximation with multivariate normal as underlying distribution. -""" -meanfield(model::DynamicPPL.Model) = meanfield(Random.default_rng(), model) -function meanfield(rng::Random.AbstractRNG, model::DynamicPPL.Model) - # Setup. - varinfo = DynamicPPL.VarInfo(model) - # Use linked `varinfo` to determine the correct number of parameters. - # TODO: Replace with `length` once this is implemented for `VarInfo`. - varinfo_linked = DynamicPPL.link(varinfo, model) - num_params = length(varinfo_linked[:]) - - # initial params - μ = randn(rng, num_params) - σ = StatsFuns.softplus.(randn(rng, num_params)) - - # Construct the base family. - d = DistributionsAD.TuringDiagMvNormal(μ, σ) - - # Construct the bijector constrained → unconstrained. - b = Bijectors.bijector(model; varinfo=varinfo) - - # We want to transform from unconstrained space to constrained, - # hence we need the inverse of `b`. - return Bijectors.transformed(d, Bijectors.inverse(b)) -end - -# Overloading stuff from `AdvancedVI` to specialize for Turing -function AdvancedVI.update(d::DistributionsAD.TuringDiagMvNormal, μ, σ) - return DistributionsAD.TuringDiagMvNormal(μ, σ) -end -function AdvancedVI.update(td::Bijectors.TransformedDistribution, θ...) - return Bijectors.transformed(AdvancedVI.update(td.dist, θ...), td.transform) -end -function AdvancedVI.update( - td::Bijectors.TransformedDistribution{<:DistributionsAD.TuringDiagMvNormal}, - θ::AbstractArray, -) - # `length(td.dist) != length(td)` if `td.transform` changes the dimensionality, - # so we need to use the length of the underlying distribution `td.dist` here. - # TODO: Check if we can get away with `view` instead of `getindex` for all AD backends. 
- μ, ω = θ[begin:(begin + length(td.dist) - 1)], θ[(begin + length(td.dist)):end] - return AdvancedVI.update(td, μ, StatsFuns.softplus.(ω)) -end - -function AdvancedVI.vi( - model::DynamicPPL.Model, alg::AdvancedVI.ADVI; optimizer=AdvancedVI.TruncatedADAGrad() -) - q = meanfield(model) - return AdvancedVI.vi(model, alg, q; optimizer=optimizer) -end - -function AdvancedVI.vi( - model::DynamicPPL.Model, - alg::AdvancedVI.ADVI, - q::Bijectors.TransformedDistribution{<:DistributionsAD.TuringDiagMvNormal}; - optimizer=AdvancedVI.TruncatedADAGrad(), -) - # Initial parameters for mean-field approx - μ, σs = StatsBase.params(q) - θ = vcat(μ, StatsFuns.invsoftplus.(σs)) - - # Optimize - AdvancedVI.optimize!(elbo, alg, q, make_logjoint(model), θ; optimizer=optimizer) - - # Return updated `Distribution` - return AdvancedVI.update(q, θ) -end diff --git a/src/variational/deprecated.jl b/src/variational/deprecated.jl new file mode 100644 index 0000000000..9a9f4777b5 --- /dev/null +++ b/src/variational/deprecated.jl @@ -0,0 +1,61 @@ + +import DistributionsAD +export ADVI + +Base.@deprecate meanfield(model) q_meanfield_gaussian(model) + +struct ADVI{AD} + "Number of samples used to estimate the ELBO in each optimization step." + samples_per_step::Int + "Maximum number of gradient steps." + max_iters::Int + "AD backend used for automatic differentiation." + adtype::AD +end + +function ADVI( + samples_per_step::Int=1, + max_iters::Int=1000; + adtype::ADTypes.AbstractADType=ADTypes.AutoForwardDiff(), +) + Base.depwarn( + "The type ADVI will be removed in future releases. Please refer to the new interface for `vi`", + :ADVI; + force=true, + ) + return ADVI{typeof(adtype)}(samples_per_step, max_iters, adtype) +end + +function vi(model::DynamicPPL.Model, alg::ADVI; kwargs...) + Base.depwarn( + "This specialization along with the type `ADVI` will be deprecated in future releases. Please refer to the new interface for `vi`.", + :vi; + force=true, + ) + q = q_meanfield_gaussian(Random.default_rng(), model) + objective = AdvancedVI.RepGradELBO( + alg.samples_per_step; entropy=AdvancedVI.ClosedFormEntropy() + ) + operator = AdvancedVI.IdentityOperator() + _, q_avg, _, _ = vi(model, q, alg.max_iters; objective, operator, kwargs...) + return q_avg +end + +function vi( + model::DynamicPPL.Model, + alg::ADVI, + q::Bijectors.TransformedDistribution{<:DistributionsAD.TuringDiagMvNormal}; + kwargs..., +) + Base.depwarn( + "This specialization along with the type `ADVI` will be deprecated in future releases. Please refer to the new interface for `vi`.", + :vi; + force=true, + ) + objective = AdvancedVI.RepGradELBO( + alg.samples_per_step; entropy=AdvancedVI.ClosedFormEntropy() + ) + operator = AdvancedVI.IdentityOperator() + _, q_avg, _, _ = vi(model, q, alg.max_iters; objective, operator, kwargs...) 
+ return q_avg +end diff --git a/test/Project.toml b/test/Project.toml index 5587f46d7d..7cab77a01a 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -44,7 +44,7 @@ AbstractMCMC = "5" AbstractPPL = "0.9, 0.10, 0.11" AdvancedMH = "0.6, 0.7, 0.8" AdvancedPS = "=0.6.0" -AdvancedVI = "0.2" +AdvancedVI = "0.4" Aqua = "0.8" BangBang = "0.4" Bijectors = "0.14, 0.15" diff --git a/test/ad.jl b/test/ad.jl index befef5af33..2f645fab5d 100644 --- a/test/ad.jl +++ b/test/ad.jl @@ -25,8 +25,8 @@ const always_valid_eltypes = (AbstractFloat, AbstractIrrational, Integer, Ration """A dictionary mapping ADTypes to the element types they use.""" eltypes_by_adtype = Dict( - Turing.AutoForwardDiff => (ForwardDiff.Dual,), - Turing.AutoReverseDiff => ( + AutoForwardDiff => (ForwardDiff.Dual,), + AutoReverseDiff => ( ReverseDiff.TrackedArray, ReverseDiff.TrackedMatrix, ReverseDiff.TrackedReal, @@ -37,7 +37,7 @@ eltypes_by_adtype = Dict( ), ) if INCLUDE_MOONCAKE - eltypes_by_adtype[Turing.AutoMooncake] = (Mooncake.CoDual,) + eltypes_by_adtype[AutoMooncake] = (Mooncake.CoDual,) end """ @@ -189,9 +189,9 @@ end """ All the ADTypes on which we want to run the tests. """ -ADTYPES = [Turing.AutoForwardDiff(), Turing.AutoReverseDiff(; compile=false)] +ADTYPES = [AutoForwardDiff(), AutoReverseDiff(; compile=false)] if INCLUDE_MOONCAKE - push!(ADTYPES, Turing.AutoMooncake(; config=nothing)) + push!(ADTYPES, AutoMooncake(; config=nothing)) end # Check that ADTypeCheckContext itself works as expected. @@ -199,12 +199,12 @@ end @model test_model() = x ~ Normal(0, 1) tm = test_model() adtypes = ( - Turing.AutoForwardDiff(), - Turing.AutoReverseDiff(), + AutoForwardDiff(), + AutoReverseDiff(), # Don't need to test Mooncake as it doesn't use tracer types ) for actual_adtype in adtypes - sampler = Turing.HMC(0.1, 5; adtype=actual_adtype) + sampler = HMC(0.1, 5; adtype=actual_adtype) for expected_adtype in adtypes contextualised_tm = DynamicPPL.contextualize( tm, ADTypeCheckContext(expected_adtype, tm.context) @@ -212,9 +212,9 @@ end @testset "Expected: $expected_adtype, Actual: $actual_adtype" begin if actual_adtype == expected_adtype # Check that this does not throw an error. 
- Turing.sample(contextualised_tm, sampler, 2) + sample(contextualised_tm, sampler, 2) else - @test_throws AbstractWrongADBackendError Turing.sample( + @test_throws AbstractWrongADBackendError sample( contextualised_tm, sampler, 2 ) end diff --git a/test/mcmc/Inference.jl b/test/mcmc/Inference.jl index bc31ebfa90..b055a441ae 100644 --- a/test/mcmc/Inference.jl +++ b/test/mcmc/Inference.jl @@ -122,8 +122,8 @@ using Turing return global loglike = getlogp(__varinfo__) - lp1 end model = testmodel1([1.0]) - varinfo = Turing.VarInfo(model) - model(varinfo, Turing.SampleFromPrior(), Turing.LikelihoodContext()) + varinfo = DynamicPPL.VarInfo(model) + model(varinfo, DynamicPPL.SampleFromPrior(), DynamicPPL.LikelihoodContext()) @test getlogp(varinfo) == loglike # Test MiniBatchContext @@ -132,13 +132,13 @@ using Turing return x[1] ~ Bernoulli(a) end model = testmodel2([1.0]) - varinfo1 = Turing.VarInfo(model) + varinfo1 = DynamicPPL.VarInfo(model) varinfo2 = deepcopy(varinfo1) - model(varinfo1, Turing.SampleFromPrior(), Turing.LikelihoodContext()) + model(varinfo1, DynamicPPL.SampleFromPrior(), DynamicPPL.LikelihoodContext()) model( varinfo2, - Turing.SampleFromPrior(), - Turing.MiniBatchContext(Turing.LikelihoodContext(), 10), + DynamicPPL.SampleFromPrior(), + DynamicPPL.MiniBatchContext(DynamicPPL.LikelihoodContext(), 10), ) @test isapprox(getlogp(varinfo2) / getlogp(varinfo1), 10) end @@ -623,20 +623,20 @@ using Turing @model function e(x=1.0) return x ~ Normal() end - evi = Turing.VarInfo(e()) + evi = DynamicPPL.VarInfo(e()) @test isempty(Turing.Inference.getparams(e(), evi)) @model function f() return x ~ Normal() end - fvi = Turing.VarInfo(f()) + fvi = DynamicPPL.VarInfo(f()) @test only(Turing.Inference.getparams(f(), fvi)) == (@varname(x), fvi[@varname(x)]) @model function g() x ~ Normal() return y ~ Poisson() end - gvi = Turing.VarInfo(g()) + gvi = DynamicPPL.VarInfo(g()) gparams = Turing.Inference.getparams(g(), gvi) @test gparams[1] == (@varname(x), gvi[@varname(x)]) @test gparams[2] == (@varname(y), gvi[@varname(y)]) diff --git a/test/mcmc/external_sampler.jl b/test/mcmc/external_sampler.jl index 95cf304378..d008d800db 100644 --- a/test/mcmc/external_sampler.jl +++ b/test/mcmc/external_sampler.jl @@ -6,7 +6,6 @@ using Distributions: sample using Distributions.FillArrays: Zeros using DynamicPPL: DynamicPPL using ForwardDiff: ForwardDiff -using LinearAlgebra: I using LogDensityProblems: LogDensityProblems using Random: Random using ReverseDiff: ReverseDiff @@ -15,7 +14,7 @@ using Test: @test, @test_throws, @testset using Turing using Turing.Inference: AdvancedHMC -function initialize_nuts(model::Turing.Model) +function initialize_nuts(model::DynamicPPL.Model) # Create a linked varinfo vi = DynamicPPL.VarInfo(model) linked_vi = DynamicPPL.link!!(vi, model) diff --git a/test/mcmc/gibbs.jl b/test/mcmc/gibbs.jl index c0b2abba88..19b015367c 100644 --- a/test/mcmc/gibbs.jl +++ b/test/mcmc/gibbs.jl @@ -74,7 +74,7 @@ end target_vns = collect(target_vns) local_varinfo = DynamicPPL.subset(global_varinfo, target_vns) ctx = Turing.Inference.GibbsContext( - target_vns, Ref(global_varinfo), Turing.DefaultContext() + target_vns, Ref(global_varinfo), DynamicPPL.DefaultContext() ) # Check that the correct varnames are conditioned, and that getting their @@ -151,7 +151,7 @@ end model::DynamicPPL.Model, sampler::DynamicPPL.Sampler{<:AlgWrapper}, state, - params::Turing.AbstractVarInfo, + params::DynamicPPL.AbstractVarInfo, ) return Inference.setparams_varinfo!!(model, unwrap_sampler(sampler), state, params) 
end @@ -183,7 +183,7 @@ end return AbstractMCMC.step(rng, model, unwrap_sampler(sampler), args...; kwargs...) end - function Turing.DynamicPPL.initialstep( + function DynamicPPL.initialstep( rng::Random.AbstractRNG, model::DynamicPPL.Model, sampler::DynamicPPL.Sampler{<:AlgWrapper}, @@ -191,7 +191,7 @@ end kwargs..., ) capture_targets_and_algs(sampler.alg.inner, model.context) - return Turing.DynamicPPL.initialstep( + return DynamicPPL.initialstep( rng, model, unwrap_sampler(sampler), args...; kwargs... ) end @@ -398,7 +398,7 @@ end @varname(m) => RepeatSampler(PG(10), 2), ) for s in (s1, s2, s3, s4, s5, s6) - @test DynamicPPL.alg_str(Turing.Sampler(s)) == "Gibbs" + @test DynamicPPL.alg_str(DynamicPPL.Sampler(s)) == "Gibbs" end @test sample(gdemo_default, s1, N) isa MCMCChains.Chains @@ -408,7 +408,7 @@ end @test sample(gdemo_default, s5, N) isa MCMCChains.Chains @test sample(gdemo_default, s6, N) isa MCMCChains.Chains - g = Turing.Sampler(s3) + g = DynamicPPL.Sampler(s3) @test sample(gdemo_default, g, N) isa MCMCChains.Chains end @@ -493,7 +493,7 @@ end @nospecialize function AbstractMCMC.bundle_samples( samples::Vector, ::typeof(model), - ::Turing.Sampler{<:Gibbs}, + ::DynamicPPL.Sampler{<:Gibbs}, state, ::Type{MCMCChains.Chains}; kwargs..., @@ -594,11 +594,11 @@ end @testset verbose = true "$(model.f)" for model in DynamicPPL.TestUtils.DEMO_MODELS vns = DynamicPPL.TestUtils.varnames(model) samplers = [ - Turing.Gibbs(@varname(s) => NUTS(), @varname(m) => NUTS()), - Turing.Gibbs(@varname(s) => NUTS(), @varname(m) => HMC(0.01, 4)), - Turing.Gibbs(@varname(s) => NUTS(), @varname(m) => ESS()), - Turing.Gibbs(@varname(s) => HMC(0.01, 4), @varname(m) => MH()), - Turing.Gibbs(@varname(s) => MH(), @varname(m) => HMC(0.01, 4)), + Gibbs(@varname(s) => NUTS(), @varname(m) => NUTS()), + Gibbs(@varname(s) => NUTS(), @varname(m) => HMC(0.01, 4)), + Gibbs(@varname(s) => NUTS(), @varname(m) => ESS()), + Gibbs(@varname(s) => HMC(0.01, 4), @varname(m) => MH()), + Gibbs(@varname(s) => MH(), @varname(m) => HMC(0.01, 4)), ] @testset "$sampler" for sampler in samplers @@ -634,7 +634,7 @@ end # Sampler to use for Gibbs components. 
hmc = HMC(0.1, 32) - sampler = Turing.Gibbs(@varname(s) => hmc, @varname(m) => hmc) + sampler = Gibbs(@varname(s) => hmc, @varname(m) => hmc) Random.seed!(42) chain = sample( model, @@ -685,7 +685,7 @@ end @testset "with both `s` and `m` as random" begin model = gdemo(1.5, 2.0) vns = (@varname(s), @varname(m)) - alg = Turing.Gibbs(vns => MH()) + alg = Gibbs(vns => MH()) # `step` transition, state = AbstractMCMC.step(rng, model, DynamicPPL.Sampler(alg)) @@ -706,7 +706,7 @@ end @testset "without `m` as random" begin model = gdemo(1.5, 2.0) | (m=7 / 6,) vns = (@varname(s),) - alg = Turing.Gibbs(vns => MH()) + alg = Gibbs(vns => MH()) # `step` transition, state = AbstractMCMC.step(rng, model, DynamicPPL.Sampler(alg)) @@ -756,7 +756,7 @@ end @testset "CSMC + ESS" begin rng = Random.default_rng() model = MoGtest_default - alg = Turing.Gibbs( + alg = Gibbs( (@varname(z1), @varname(z2), @varname(z3), @varname(z4)) => CSMC(15), @varname(mu1) => ESS(), @varname(mu2) => ESS(), @@ -788,9 +788,7 @@ end @testset "CSMC + ESS (usage of implicit varname)" begin rng = Random.default_rng() model = MoGtest_default_z_vector - alg = Turing.Gibbs( - @varname(z) => CSMC(15), @varname(mu1) => ESS(), @varname(mu2) => ESS() - ) + alg = Gibbs(@varname(z) => CSMC(15), @varname(mu1) => ESS(), @varname(mu2) => ESS()) vns = ( @varname(z[1]), @varname(z[2]), @@ -836,9 +834,7 @@ end ), ] @testset "$(sampler_inner)" for sampler_inner in samplers_inner - sampler = Turing.Gibbs( - @varname(m1) => sampler_inner, @varname(m2) => sampler_inner - ) + sampler = Gibbs(@varname(m1) => sampler_inner, @varname(m2) => sampler_inner) Random.seed!(42) chain = sample( model, sampler, 1000; discard_initial=1000, thinning=10, n_adapts=0 diff --git a/test/mcmc/hmc.jl b/test/mcmc/hmc.jl index 84d0b58233..0b57657484 100644 --- a/test/mcmc/hmc.jl +++ b/test/mcmc/hmc.jl @@ -147,7 +147,7 @@ using Turing @test DynamicPPL.alg_str(sampler) == "HMCDA" @test isa(alg, HMCDA) - @test isa(sampler, Sampler{<:Turing.Hamiltonian}) + @test isa(sampler, Sampler{<:Turing.Inference.Hamiltonian}) end @testset "nuts inference" begin @@ -216,7 +216,7 @@ using Turing @model function demo_warn_initial_params() x ~ Normal() if (attempt += 1) < 30 - Turing.@addlogprob! -Inf + @addlogprob! -Inf end end @@ -231,7 +231,7 @@ using Turing @testset "error for impossible model" begin @model function demo_impossible() x ~ Normal() - Turing.@addlogprob! -Inf + @addlogprob! -Inf end @test_throws ErrorException sample(demo_impossible(), NUTS(), 5) @@ -260,7 +260,7 @@ using Turing ) catch e if e isa DomainError - Turing.@addlogprob! -Inf + @addlogprob! 
-Inf return nothing else rethrow() diff --git a/test/mcmc/mh.jl b/test/mcmc/mh.jl index d190e589a5..2e9f108470 100644 --- a/test/mcmc/mh.jl +++ b/test/mcmc/mh.jl @@ -118,7 +118,7 @@ GKernel(var) = (x) -> Normal(x, sqrt.(var)) model = M(zeros(2), I, 1) sampler = Inference.Sampler(MH()) - dt, vt = Inference.dist_val_tuple(sampler, Turing.VarInfo(model)) + dt, vt = Inference.dist_val_tuple(sampler, DynamicPPL.VarInfo(model)) @test dt[:z] isa AdvancedMH.StaticProposal{false,<:MvNormal} @test dt[:m] isa diff --git a/test/mcmc/sghmc.jl b/test/mcmc/sghmc.jl index 3796ea915a..1671362ed4 100644 --- a/test/mcmc/sghmc.jl +++ b/test/mcmc/sghmc.jl @@ -17,13 +17,13 @@ using Turing @testset "sghmc constructor" begin alg = SGHMC(; learning_rate=0.01, momentum_decay=0.1) @test alg isa SGHMC - sampler = Turing.Sampler(alg) - @test sampler isa Turing.Sampler{<:SGHMC} + sampler = DynamicPPL.Sampler(alg) + @test sampler isa DynamicPPL.Sampler{<:SGHMC} alg = SGHMC(; learning_rate=0.01, momentum_decay=0.1) @test alg isa SGHMC - sampler = Turing.Sampler(alg) - @test sampler isa Turing.Sampler{<:SGHMC} + sampler = DynamicPPL.Sampler(alg) + @test sampler isa DynamicPPL.Sampler{<:SGHMC} end @testset "sghmc inference" begin @@ -38,13 +38,13 @@ end @testset "sgld constructor" begin alg = SGLD(; stepsize=PolynomialStepsize(0.25)) @test alg isa SGLD - sampler = Turing.Sampler(alg) - @test sampler isa Turing.Sampler{<:SGLD} + sampler = DynamicPPL.Sampler(alg) + @test sampler isa DynamicPPL.Sampler{<:SGLD} alg = SGLD(; stepsize=PolynomialStepsize(0.25)) @test alg isa SGLD - sampler = Turing.Sampler(alg) - @test sampler isa Turing.Sampler{<:SGLD} + sampler = DynamicPPL.Sampler(alg) + @test sampler isa DynamicPPL.Sampler{<:SGLD} end @testset "sgld inference" begin rng = StableRNG(1) diff --git a/test/optimisation/Optimisation.jl b/test/optimisation/Optimisation.jl index d081fbd0d7..2acb7edc55 100644 --- a/test/optimisation/Optimisation.jl +++ b/test/optimisation/Optimisation.jl @@ -647,7 +647,7 @@ using Turing @model function saddle_model() x ~ Normal(0, 1) y ~ Normal(x, 1) - Turing.@addlogprob! x^2 - y^2 + @addlogprob! x^2 - y^2 return nothing end m = saddle_model() diff --git a/test/runtests.jl b/test/runtests.jl index dd443d7923..9fec2f737e 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -70,10 +70,6 @@ end end end - @testset "variational optimisers" begin - @timeit_include("variational/optimisers.jl") - end - @testset "stdlib" verbose = true begin @timeit_include("stdlib/distributions.jl") @timeit_include("stdlib/RandomMeasures.jl") diff --git a/test/skipped/advi_demo.jl b/test/skipped/advi_demo.jl deleted file mode 100644 index a9cbabd0fd..0000000000 --- a/test/skipped/advi_demo.jl +++ /dev/null @@ -1,161 +0,0 @@ -using Random - -using Bijectors -using Turing -using Turing: Variational - -using StatsFuns: softplus, invsoftplus -f = softplus -f⁻¹ = invsoftplus - -# setup for plotting -using Plots, StatsPlots, LaTeXStrings -pyplot() - -# used to compute closed form expression of posterior -using ConjugatePriors - -# define Turing model -@model function model(x) - s ~ InverseGamma(2, 3) - m ~ Normal(0.0, sqrt(s)) # `Normal(μ, σ)` has mean μ and variance σ², i.e. parametrize with std. 
not variance - - for i in 1:length(x) - x[i] ~ Normal(m, sqrt(s)) - end -end - -const seeds = [125, 245, 3] -const ad_modes = [:forwarddiff, :reversediff, :tracker] - -for seed in seeds - @info seed - - for ad_mode in ad_modes - @info ad_mode - setadbackend(ad_mode) - - # set random seed - Random.seed!(seed) - - # generate data - x = randn(1, 2000) - - # construct model - m = model(x) - - # ADVI - opt = Variational.TruncatedADAGrad() # optimizer - - advi = ADVI(10, 100) # <: VariationalInference - q = Variational.meanfield(m) # => <: VariationalPosterior - - elbo = Variational.ELBO() # <: VariationalObjective - - μ, σs = params(q) - θ = vcat(μ, f⁻¹.(σs)) - - history = [elbo(advi, q, m, 1000)] - - # construct animation - anim = @animate for j in 1:25 - # global q - Variational.optimize!(elbo, advi, q, m, θ; optimizer=opt) - μ, ω = θ[1:length(q)], θ[(length(q) + 1):end] - - q = Variational.update(q, μ, f.(ω)) - samples = rand(q, 2000) - - # quick check - println([mean(samples; dims=2), [var(x), mean(x)]]) - - # plotting code assumes (samples, dim) shape so we just transpose - samples = transpose(samples) - - # closed form computation - # notation mapping has been verified by explicitly computing expressions - # in "Conjugate Bayesian analysis of the Gaussian distribution" by Murphy - μ₀ = 0.0 # => μ - κ₀ = 1.0 # => ν, which scales the precision of the Normal - α₀ = 2.0 # => "shape" - β₀ = 3.0 # => "rate", which is 1 / θ, where θ is "scale" - - # prior - pri = NormalGamma(μ₀, κ₀, α₀, β₀) - - # posterior - post = posterior(pri, Normal, x) - - # marginal distribution of τ = 1 / σ² - # Eq. (90) in "Conjugate Bayesian analysis of the Gaussian distribution" by Murphy - # `scale(post)` = θ - p_τ = Gamma(post.shape, scale(post)) - p_σ²_pdf = z -> pdf(p_τ, 1 / z) # τ => 1 / σ² - - # marginal of μ - # Eq. (91) in "Conjugate Bayesian analysis of the Gaussian distribution" by Murphy - p_μ = TDist(2 * post.shape) - - μₙ = post.mu # μ → μ - κₙ = post.nu # κ → ν - αₙ = post.shape # α → shape - βₙ = post.rate # β → rate - - # numerically more stable but doesn't seem to have effect; issue is probably internal to - # `pdf` which needs to compute ≈ Γ(1000) - p_μ_pdf = - z -> exp( - logpdf( - p_μ, - (z - μₙ) * exp(-0.5 * log(βₙ) + 0.5 * log(αₙ) + 0.5 * log(κₙ)), - ), - ) - # p_μ_pdf1 = z -> pdf(p_μ, (z - μₙ) / √(βₙ / (αₙ * κₙ))) - - # posterior plots - p1 = plot() - density!(samples[:, 1]; label="s (ADVI)", color=:blue, linestyle=:dash) - histogram!(samples[:, 1]; label="", normed=true, alpha=0.3, color=:blue) - - # normalize using Riemann approx. because of (almost certainly) numerical issues - Δ = 0.001 - r = 0.75:0.001:1.50 - norm_const = sum(p_σ²_pdf.(r) .* Δ) - plot!(r, p_σ²_pdf; label="s (posterior)", color=:red) - vline!([var(x)]; label="s (data)", linewidth=1.5, color=:black, alpha=0.7) - xlims!(0.5, 1.5) - title!("$(j * advi.max_iters) steps") - - p2 = plot() - density!(samples[:, 2]; label="m (ADVI)", color=:blue, linestyle=:dash) - histogram!(samples[:, 2]; label="", normed=true, alpha=0.3, color=:blue) - - # normalize using Riemann approx. because of (almost certainly) numerical issues - Δ = 0.0001 - r = (-0.1 + mean(x)):Δ:(0.1 + mean(x)) - norm_const = sum(p_μ_pdf.(r) .* Δ) - plot!(r, z -> p_μ_pdf(z) / norm_const; label="m (posterior)", color=:red) - vline!([mean(x)]; label="m (data)", linewidth=1.5, color=:black, alpha=0.7) - - xlims!(-0.25, 0.25) - - # visualize evolution of objective wrt. 
optimization iterations - obj = elbo(advi, q, m, 1000) - @info "ELBO" obj - push!(history, obj) - p3 = plot() - plot!(1:(advi.max_iters):(length(history) * advi.max_iters), history; label="") - title!("ELBO = $obj") - - # plot the latest 25 objective evaluations to visualize trend - p4 = plot() - plot!(history[max(1, end - 10):end]; label="") - - p = plot(p1, p2, p3, p4; layout=(4, 1)) - - @info "[$j] Done" - p - end - gif(anim, "advi_w_elbo_fps15_$(seed)_$(ad_mode).gif"; fps=15) - end -end diff --git a/test/skipped/bayes_lr.jl b/test/skipped/bayes_lr.jl deleted file mode 100644 index 6b9b4d31e7..0000000000 --- a/test/skipped/bayes_lr.jl +++ /dev/null @@ -1,80 +0,0 @@ -using ReverseDiff: GradientTape, GradientConfig, gradient, gradient!, compile -using Turing: _hmc_step -using Turing - -@model function bayes_lr(xs, ys) - N = length(xs) - @assert N == length(ys) - - s = 1 - β ~ Normal(0, 1) - - for n in 1:N - ys[n] ~ Normal(xs[n] * β, sqrt(s)) - end -end - -N = 100 -xs = collect(range(-10; stop=10, length=N)) -s = 1 -β = rand(Normal(0, 1)) -ys = xs * β + rand(Normal(0, sqrt(s)), N) - -println("s=$s, β=$β") - -mf = bayes_lr(xs, ys) -chn = sample(mf, HMC(0.005, 3), 2000) - -println("mean of β: ", mean(chn[1000:end, :β])) - -θ_dim = 1 -function lj_func(θ) - N = length(xs) - _lj = zero(Real) - - s = 1 - - β = θ[1] - _lj += logpdf(Normal(0, 1), β) - for n in 1:N - _lj += logpdf(Normal(xs[n] * β, sqrt(s)), ys[n]) - end - - return _lj -end - -neg_lj_func(θ) = -lj_func(θ) -const f_tape = GradientTape(neg_lj_func, randn(θ_dim)) -const compiled_f_tape = compile(f_tape) - -function grad_func(θ) - inputs = θ - results = similar(θ) - all_results = DiffResults.GradientResult(results) - - ReverseDiff.gradient!(all_results, compiled_f_tape, inputs) - - neg_lj = all_results.value - grad, = all_results.derivs - - return -neg_lj, grad -end - -std = ones(θ_dim) -θ = randn(θ_dim) -lj = lj_func(θ) - -chn = [] -accept_num = 1 - -total_num = 2000 -for iter in 1:total_num - global θ, chn, lj, lj_func, grad_func, std, accept_num - push!(chn, θ) - θ, lj, is_accept, τ_valid, α = _hmc_step(θ, lj, lj_func, grad_func, 3, 0.005, std) - accept_num += is_accept - # if (iter % 50 == 0) println(θ) end -end - -@show mean(chn[1000:end]), lj -@show accept_num / total_num diff --git a/test/skipped/dual_averaging.jl b/test/skipped/dual_averaging.jl deleted file mode 100644 index 897abe8d90..0000000000 --- a/test/skipped/dual_averaging.jl +++ /dev/null @@ -1,10 +0,0 @@ -function _adapt_ϵ(logϵ, Hbar, logϵbar, da_stat, m, M_adapt, δ, μ; γ=0.05, t0=10, κ=0.75) - if m <= M_adapt - Hbar = (1.0 - 1.0 / (m + t0)) * Hbar + (1 / (m + t0)) * (δ - da_stat) - logϵ = μ - sqrt(m) / γ * Hbar - logϵbar = m^(-κ) * logϵ + (1 - m^(-κ)) * logϵbar - else - logϵ = logϵbar - end - return logϵ, Hbar, logϵbar -end diff --git a/test/skipped/explicit_ret.jl b/test/skipped/explicit_ret.jl deleted file mode 100644 index 7472e8c319..0000000000 --- a/test/skipped/explicit_ret.jl +++ /dev/null @@ -1,19 +0,0 @@ -using Turing -using Test - -@model function test_ex_rt() - x ~ Normal(10, 1) - y ~ Normal(x / 2, 1) - z = y + 1 - x = x - 1 - return x, y, z -end - -mf = test_ex_rt() - -for alg in - [HMC(0.2, 3), PG(20, 2000), SMC(), IS(10000), Gibbs(:x => PG(20, 1), :y => HMC(0.2, 3))] - chn = sample(mf, alg) - @test mean(chn[:x]) ≈ 10.0 atol = 0.2 - @test mean(chn[:y]) ≈ 5.0 atol = 0.2 -end diff --git a/test/skipped/gdemo.jl b/test/skipped/gdemo.jl deleted file mode 100644 index 84242067eb..0000000000 --- a/test/skipped/gdemo.jl +++ /dev/null @@ -1,55 +0,0 @@ 
-include("unit_test_helper.jl") - -# Turing - -using Turing - -@model function gdemo() - s ~ InverseGamma(2, 3) - m ~ Normal(0, sqrt(s)) - 1.5 ~ Normal(m, sqrt(s)) - 2.0 ~ Normal(m, sqrt(s)) - return s, m -end - -# Plain Julia - -using ReverseDiff: GradientTape, GradientConfig, gradient, gradient!, compile -using Turing: invlink, logpdf - -θ_dim = 2 -function lj_func(θ) - _lj = zero(Real) - - d_s = InverseGamma(2, 3) - s = invlink(d_s, θ[1]) - _lj += logpdf(d_s, s, true) - m = θ[2] - _lj += logpdf(Normal(0, sqrt(s)), m) - - _lj += logpdf(Normal(m, sqrt(s)), 1.5) - _lj += logpdf(Normal(m, sqrt(s)), 2.0) - - return _lj -end - -neg_lj_func(θ) = -lj_func(θ) -const f_tape = GradientTape(neg_lj_func, randn(θ_dim)) -const compiled_f_tape = compile(f_tape) - -function grad_func(θ) - inputs = θ - results = similar(θ) - all_results = DiffResults.GradientResult(results) - - gradient!(all_results, compiled_f_tape, inputs) - - neg_lj = all_results.value - grad, = all_results.derivs - - return -neg_lj, grad -end - -# Unit test for gradient - -test_grad(gdemo, grad_func; trans=[1]) diff --git a/test/skipped/gdemo_hmc.jl b/test/skipped/gdemo_hmc.jl deleted file mode 100644 index 855b06f894..0000000000 --- a/test/skipped/gdemo_hmc.jl +++ /dev/null @@ -1,44 +0,0 @@ -using Turing: _hmc_step - -include("unit_test_helper.jl") -include("gdemo.jl") - -# Turing - -mf = gdemo() -chn = sample(mf, HMC(0.05, 5), 2000) - -println("mean of s: ", mean(chn[1000:end, :s])) -println("mean of m: ", mean(chn[1000:end, :m])) - -# Plain Julia - -std = ones(θ_dim) -θ = randn(θ_dim) -lj = lj_func(θ) - -chn = Dict(:θ => Vector{Vector{Float64}}(), :logϵ => Vector{Float64}()) -accept_num = 0 - -function dummy_print(args...) - return nothing -end - -totla_num = 5000 -for iter in 1:totla_num - push!(chn[:θ], θ) - θ, lj, is_accept, τ_valid, α = _hmc_step(θ, lj, lj_func, grad_func, 5, 0.05, std) - accept_num += is_accept -end - -@show lj -samples_s = exp.(map(x -> x[1], chn[:θ])) -samples_m = map(x -> x[2], chn[:θ]) -@show mean(samples_s[1000:end]) -@show mean(samples_m[1000:end]) -@show std(samples_s[1000:end]) -@show std(samples_m[1000:end]) - -@show accept_num / totla_num - -# Unit tests diff --git a/test/skipped/gdemo_nuts.jl b/test/skipped/gdemo_nuts.jl deleted file mode 100644 index ecc4d5c7f8..0000000000 --- a/test/skipped/gdemo_nuts.jl +++ /dev/null @@ -1,70 +0,0 @@ -using Main.Test -using Turing: _nuts_step - -include("unit_test_helper.jl") -include("gdemo.jl") - -include("dual_averaging.jl") - -# Turing - -# mf = gdemo() -# chn = sample(mf, HMC(0.05, 5), 5000) - -# println("mean of m: $(mean(chn[:m][1000:end]))") - -# Plain Julia - -M_adapt = 1000 -ϵ0 = 0.25 -logϵ = log(ϵ0) -μ = log(10 * ϵ0) -logϵbar = log(1) -Hbar = 0 - -δ = 0.75 - -for test_id in 1:2 - test_name = "$test_id. NUTS " * (test_id == 1 ? "without DA" : "with DA") - - @testset "$test_name" begin - std = ones(θ_dim) - θ = randn(θ_dim) - lj = lj_func(θ) - - chn = Dict(:θ => Vector{Vector{Float64}}(), :logϵ => Vector{Float64}()) - - function dummy_print(args...) 
- return nothing - end - - println("Start to run NUTS") - - totla_num = 10000 - for iter in 1:totla_num - global logϵ, lj_func, grad_func, M_adapt, δ, μ - θ, da_stat = _nuts_step(θ, exp(logϵ), lj_func, grad_func, std) - if test_id == 1 - logϵ, Hbar, logϵbar = _adapt_ϵ( - logϵ, Hbar, logϵbar, da_stat, iter, M_adapt, δ, μ - ) - end - - push!(chn[:θ], θ) - push!(chn[:logϵ], logϵ) - # if (iter % 50 == 0) println(θ) end - end - - samples_s = exp.(map(x -> x[1], chn[:θ])) - samples_m = map(x -> x[2], chn[:θ]) - @show mean(samples_s[1000:end]) - @test mean(samples_s[1000:end]) ≈ 49 / 24 atol = 0.2 - @show mean(samples_m[1000:end]) - @test mean(samples_m[1000:end]) ≈ 7 / 6 atol = 0.2 - - @show std(samples_s[1000:end]) - @show std(samples_m[1000:end]) - - @show mean(exp.(chn[:logϵ])) - end -end diff --git a/test/skipped/hmcda_geweke.jl b/test/skipped/hmcda_geweke.jl deleted file mode 100644 index 8e2f209864..0000000000 --- a/test/skipped/hmcda_geweke.jl +++ /dev/null @@ -1,98 +0,0 @@ -using StatsBase, Turing -using Gadfly - -# include("ASCIIPlot.jl") -import Gadfly.ElementOrFunction - -# First add a method to the basic Gadfly.plot function for QQPair types (generated by Distributions.qqbuild()) -function Gadfly.plot(qq::QQPair, elements::ElementOrFunction...) - return Gadfly.plot(; - x=qq.qx, y=qq.qy, Geom.point, Theme(; highlight_width=0px), elements... - ) -end - -# Now some shorthand functions -qqplot(x, y, elements::ElementOrFunction...) = Gadfly.plot(qqbuild(x, y), elements...) -function qqnorm(x, elements::ElementOrFunction...) - return qqplot( - Normal(), - x, - Guide.xlabel("Theoretical Normal quantiles"), - Guide.ylabel("Observed quantiles"), - elements..., - ) -end - -NSamples = 5000 - -@model function gdemo_fw() - # s ~ InverseGamma(2,3) - s = 1 - m ~ Normal(0, sqrt(s)) - return y ~ MvNormal([m; m; m], [sqrt(s) 0 0; 0 sqrt(s) 0; 0 0 sqrt(s)]) -end - -@model function gdemo_bk(x) - # Backward Step 1: theta ~ theta | x - # s ~ InverseGamma(2,3) - s = 1 - m ~ Normal(0, sqrt(s)) - x ~ MvNormal([m; m; m], [sqrt(s) 0 0; 0 sqrt(s) 0; 0 0 sqrt(s)]) - # Backward Step 2: x ~ x | theta - return y ~ MvNormal([m; m; m], [sqrt(s) 0 0; 0 sqrt(s) 0; 0 0 sqrt(s)]) -end - -fw = PG(50, NSamples) -# bk = Gibbs(10, PG(10,10, :s, :y), HMC(0.25, 5, :m)); -bk = HMCDA(0.65, 0.2); - -s = sample(gdemo_fw(), fw); -# describe(s) - -N = div(NSamples, 50) - -x = s[1, :y] -s_bk = Array{Turing.Chain}(undef, N) - -simple_logger = Base.CoreLogging.SimpleLogger(stderr, Base.CoreLogging.Debug) -Base.CoreLogging.with_logger(simple_logger) do - global x, bk, s_bk - i = 1 - while i <= N - s_bk[i] = sample(gdemo_bk(x), bk) - x = s_bk[i][end, :y] - i += 1 - end -end - -s2 = reduce(vcat, s_bk); -# describe(s2) - -using UnicodePlots - -qqm = qqbuild(s[:m], s2[:m]) -show(scatterplot(qqm.qx, qqm.qy; title="QQ plot for m", canvas=DotCanvas)) -show( - scatterplot( - qqm.qx[51:(end - 50)], - qqm.qy[51:(end - 50)]; - title="QQ plot for m (removing first and last 50 quantiles):", - canvas=DotCanvas, - ), -) - -qqm = qqbuild(s[:m], s2[:m]) -X = qqm.qx -y = qqm.qy -slope = (1 / (transpose(X) * X)[1] * transpose(X) * y)[1] - -print(" slopeₛ = $slope ≈ 1 (ϵ = 0.1)") -ans1 = abs(slope - 1.0) <= 0.1 -if ans1 - printstyled(" ✓\n"; color=:green) -else - printstyled(" X\n"; color=:red) - printstyled(" slope = $slope, diff = $(slope - 1.0)\n"; color=:red) -end - -# qqs = qqbuild(s[:s], s2[:s]) diff --git a/test/skipped/nuts_geweke.jl b/test/skipped/nuts_geweke.jl deleted file mode 100644 index d312be8841..0000000000 --- 
a/test/skipped/nuts_geweke.jl +++ /dev/null @@ -1,120 +0,0 @@ -using StatsBase, Turing -using Gadfly - -# include("ASCIIPlot.jl") -import Gadfly.ElementOrFunction - -# First add a method to the basic Gadfly.plot function for QQPair types (generated by Distributions.qqbuild()) -function Gadfly.plot(qq::QQPair, elements::ElementOrFunction...) - return Gadfly.plot(; - x=qq.qx, y=qq.qy, Geom.point, Theme(; highlight_width=0px), elements... - ) -end - -# Now some shorthand functions -qqplot(x, y, elements::ElementOrFunction...) = Gadfly.plot(qqbuild(x, y), elements...) -function qqnorm(x, elements::ElementOrFunction...) - return qqplot( - Normal(), - x, - Guide.xlabel("Theoretical Normal quantiles"), - Guide.ylabel("Observed quantiles"), - elements..., - ) -end - -NSamples = 5000 - -@model function gdemo_fw() - s ~ InverseGamma(2, 3) - # s = 1 - m ~ Normal(0, sqrt(s)) - return y ~ MvNormal([m; m; m], [sqrt(s) 0 0; 0 sqrt(s) 0; 0 0 sqrt(s)]) -end - -@model function gdemo_bk(x) - # Backward Step 1: theta ~ theta | x - s ~ InverseGamma(2, 3) - # s = 1 - m ~ Normal(0, sqrt(s)) - x ~ MvNormal([m; m; m], [sqrt(s) 0 0; 0 sqrt(s) 0; 0 0 sqrt(s)]) - # Backward Step 2: x ~ x | theta - return y ~ MvNormal([m; m; m], [sqrt(s) 0 0; 0 sqrt(s) 0; 0 0 sqrt(s)]) -end - -fw = IS(NSamples) -# bk = Gibbs(10, PG(10,10, :s, :y), HMC(1, 0.25, 5, :m)); -bk_sample_n = 20 -bk = NUTS(bk_sample_n, 100, 0.65); - -s = sample(gdemo_fw(), fw); -# describe(s) - -N = div(NSamples, bk_sample_n) - -x = s[1, :y] -s_bk = Array{MCMCChains.Chains}(undef, N) - -simple_logger = Base.CoreLogging.SimpleLogger(stderr, Base.CoreLogging.Debug) -with_logger(simple_logger) do - i = 1 - while i <= N - s_bk[i] = sample(gdemo_bk(x), bk) - x = s_bk[i][end, :y] - i += 1 - end -end - -s2 = chainscat(s_bk...) 
-# describe(s2) - -# qqplot(s[:m], s2[:m]) -# qqplot(s[:s], s2[:s]) - -qqm = qqbuild(s[:m], s2[:m]) - -using UnicodePlots - -qqm = qqbuild(s[:m], s2[:m]) -show(scatterplot(qqm.qx, qqm.qy; title="QQ plot for m", canvas=DotCanvas)) -show( - scatterplot( - qqm.qx[51:(end - 50)], - qqm.qy[51:(end - 50)]; - title="QQ plot for m (removing first and last 50 quantiles):", - canvas=DotCanvas, - ), -) -show(scatterplot(qqm.qx, qqm.qy; title="QQ plot for m")) -show( - scatterplot( - qqm.qx[51:(end - 50)], - qqm.qy[51:(end - 50)]; - title="QQ plot for m (removing first and last 50 quantiles):", - ), -) - -qqs = qqbuild(s[:s].value, s2[:s].value) -show(scatterplot(qqs.qx, qqs.qy; title="QQ plot for s")) -show( - scatterplot( - qqs.qx[51:(end - 50)], - qqs.qy[51:(end - 50)]; - title="QQ plot for s (removing first and last 50 quantiles):", - ), -) - -X = qqm.qx -y = qqm.qy -slope = (1 / (transpose(X) * X)[1] * transpose(X) * y)[1] - -print(" slopeₛ = $slope ≈ 1 (ϵ = 0.1)") -ans1 = abs(slope - 1.0) <= 0.1 -if ans1 - printstyled(" ✓\n"; color=:green) -else - printstyled(" X\n"; color=:red) - printstyled(" slope = $slope, diff = $(slope - 1.0)\n"; color=:red) -end - -# qqs = qqbuild(s[:s], s2[:s]) diff --git a/test/skipped/opt_param_of_dist.jl b/test/skipped/opt_param_of_dist.jl deleted file mode 100644 index 3c8e4d2dcd..0000000000 --- a/test/skipped/opt_param_of_dist.jl +++ /dev/null @@ -1,20 +0,0 @@ -using Turing -using Test - -@model testassume begin - x ~ Bernoulli(1; :static=true) - y ~ Bernoulli(x / 2; :param=true) - 1 ~ Normal(0, 1; :static=true) - 2 ~ Normal(0, 1; :param=true) - y, x -end - -s = SMC() - -res = sample(testassume, s) - -# check that x is always 1 -@test all(isone, res[:x]) - -# check that the mean of y is between 0.4 and 0.6 -@test mean(res[:y]) ≈ 0.5 atol = 0.1 diff --git a/test/skipped/run_all.jl b/test/skipped/run_all.jl deleted file mode 100644 index 6ffe6e34b7..0000000000 --- a/test/skipped/run_all.jl +++ /dev/null @@ -1,5 +0,0 @@ -include("leapfrog.jl") - -include("simple_gauss_hmc.jl") -include("gdemo_hmc.jl") -include("gdemo_nuts.jl") diff --git a/test/skipped/simple_gauss.jl b/test/skipped/simple_gauss.jl deleted file mode 100644 index 14f36dce1f..0000000000 --- a/test/skipped/simple_gauss.jl +++ /dev/null @@ -1,52 +0,0 @@ -include("unit_test_helper.jl") - -# Turing - -using Turing - -@model function simple_gauss() - s = 1 - m ~ Normal(0, sqrt(s)) - 2.0 ~ Normal(m, sqrt(s)) - return 2.5 ~ Normal(m, sqrt(s)) -end - -# Plain Julia - -using ReverseDiff: GradientTape, GradientConfig, gradient, gradient!, compile - -θ_dim = 1 -function lj_func(θ) - _lj = zero(Real) - - s = 1 - - m = θ[1] - _lj += logpdf(Normal(0, sqrt(s)), m) - - _lj += logpdf(Normal(m, sqrt(s)), 2.0) - _lj += logpdf(Normal(m, sqrt(s)), 2.5) - - return _lj -end - -neg_lj_func(θ) = -lj_func(θ) -const f_tape = GradientTape(neg_lj_func, randn(θ_dim)) -const compiled_f_tape = compile(f_tape) - -function grad_func(θ) - inputs = θ - results = similar(θ) - all_results = DiffResults.GradientResult(results) - - gradient!(all_results, compiled_f_tape, inputs) - - neg_lj = all_results.value - grad, = all_results.derivs - - return -neg_lj, grad -end - -# Unit test for gradient - -test_grad(simple_gauss, grad_func) diff --git a/test/skipped/simple_gauss_hmc.jl b/test/skipped/simple_gauss_hmc.jl deleted file mode 100644 index ba1f9dd231..0000000000 --- a/test/skipped/simple_gauss_hmc.jl +++ /dev/null @@ -1,36 +0,0 @@ -using Turing: _hmc_step - -include("unit_test_helper.jl") -include("simple_gauss.jl") - -# Turing - -mf = 
simple_gauss() -chn = sample(mf, HMC(10000, 0.05, 10)) - -println("mean of m: $(mean(chn[1000:end, :m]))") - -# Plain Julia - -std = ones(θ_dim) -θ = randn(θ_dim) -lj = lj_func(θ) - -chn = Dict(:θ => Vector{Vector{Float64}}(), :logϵ => Vector{Float64}()) -accept_num = 1 - -totla_num = 10000 -for iter in 1:totla_num - push!(chn[:θ], θ) - θ, lj, is_accept, τ_valid, α = _hmc_step(θ, lj, lj_func, grad_func, 10, 0.05, std) - accept_num += is_accept -end - -@show lj -@show mean(chn[:θ]) -samples_first_dim = map(x -> x[1], chn[:θ]) -@show std(samples_first_dim) - -@show accept_num / totla_num - -# Unit tests diff --git a/test/skipped/sv.jl b/test/skipped/sv.jl deleted file mode 100644 index b5fe22e715..0000000000 --- a/test/skipped/sv.jl +++ /dev/null @@ -1,76 +0,0 @@ -using ReverseDiff: GradientTape, GradientConfig, gradient, gradient!, compile -using Turing: _hmc_step -using Turing -using HDF5, JLD -sv_data = load(TPATH * "/example-models/nips-2017/sv-data.jld.data")["data"] - -@model function sv_model(T, y) - ϕ ~ Uniform(-1, 1) - σ ~ truncated(Cauchy(0, 5), 0, Inf) - μ ~ Cauchy(0, 10) - - h = Vector{Real}(T) - h[1] ~ Normal(μ, σ / sqrt(1 - ϕ^2)) - y[1] ~ Normal(0, exp.(h[1] / 2)) - for t in 2:T - h[t] ~ Normal(μ + ϕ * (h[t - 1] - μ), σ) - y[t] ~ Normal(0, exp.(h[t] / 2)) - end -end - -mf = sv_model(; data=sv_data[1]) -chain_nuts = sample(mf, HMC(0.05, 10), 2000) - -println("mean of m: $(mean(chn[1000:end, :μ]))") - -# θ_dim = 1 -# function lj_func(θ) -# _lj = zero(Real) - -# s = 1 - -# m = θ[1] -# _lj += logpdf(Normal(0, sqrt(s)), m) - -# _lj += logpdf(Normal(m, sqrt(s)), 2.0) -# _lj += logpdf(Normal(m, sqrt(s)), 2.5) - -# return _lj -# end - -# neg_lj_func(θ) = -lj_func(θ) -# const f_tape = GradientTape(neg_lj_func, randn(θ_dim)) -# const compiled_f_tape = compile(f_tape) - -# function grad_func(θ) - -# inputs = θ -# results = similar(θ) -# all_results = DiffResults.GradientResult(results) - -# gradient!(all_results, compiled_f_tape, inputs) - -# neg_lj = all_results.value -# grad, = all_results.derivs - -# return -neg_lj, grad - -# end - -# std = ones(θ_dim) -# θ = randn(θ_dim) -# lj = lj_func(θ) - -# chn = [] -# accept_num = 1 - -# totla_num = 5000 -# for iter = 1:totla_num -# push!(chn, θ) -# θ, lj, is_accept, τ_valid, α = _hmc_step(θ, lj, lj_func, grad_func, 5, 0.05, std) -# accept_num += is_accept -# # if (iter % 50 == 0) println(θ) end -# end - -# @show mean(chn), lj -# @show accept_num / totla_num diff --git a/test/skipped/unit_test_helper.jl b/test/skipped/unit_test_helper.jl deleted file mode 100644 index 7dec7757bb..0000000000 --- a/test/skipped/unit_test_helper.jl +++ /dev/null @@ -1,24 +0,0 @@ -using Test - -function test_grad(turing_model, grad_f; trans=Dict()) - model_f = turing_model() - vi = model_f() - for i in trans - vi.flags["trans"][i] = true - end - d = length(vi.vals) - @testset "Gradient using random inputs" begin - ℓ = LogDensityProblemsAD.ADgradient( - Turing.AutoTracker(), - DynamicPPL.LogDensityFunction( - model_f, - vi, - DynamicPPL.SamplingContext(SampleFromPrior(), DynamicPPL.DefaultContext()), - ), - ) - for _ in 1:10000 - theta = rand(d) - @test LogDensityProblems.logdensity_and_gradient(ℓ, theta) == grad_f(theta)[2] - end - end -end diff --git a/test/skipped/vec_assume_mat.jl b/test/skipped/vec_assume_mat.jl deleted file mode 100644 index b9efc94d0a..0000000000 --- a/test/skipped/vec_assume_mat.jl +++ /dev/null @@ -1,20 +0,0 @@ -using Turing, Test - -N = 5 -alg = HMC(0.2, 4) - -@model function vdemo(::Type{T}=Float64) where {T} - v = Vector{Matrix{T}}(undef, 
N) - @. v ~ Wishart(7, [1 0.5; 0.5 1]) -end - -t_vec = @elapsed res_vec = sample(vdemo(), alg, 1000) - -@model function vdemo() - v = Vector{Matrix{Real}}(undef, N) - for i in 1:N - v[i] ~ Wishart(7, [1 0.5; 0.5 1]) - end -end - -t_loop = @elapsed res = sample(vdemo(), alg, 1000) diff --git a/test/skipped/vec_assume_mv.jl b/test/skipped/vec_assume_mv.jl deleted file mode 100644 index ba373b124f..0000000000 --- a/test/skipped/vec_assume_mv.jl +++ /dev/null @@ -1,42 +0,0 @@ - -using Turing, Test - -N = 10 -beta = [0.5, 0.5] -alg = HMC(0.2, 4) - -# Test for vectorize UnivariateDistribution -@model function vdemo() - phi = Vector{Vector{Real}}(undef, N) - @> phi ~ Dirichlet(beta) -end - -ch_vec, t_vec, m_vec, gctime, memallocs = @timed res_vec = sample(vdemo(), alg) - -@model function vdemo() - phi = Matrix(undef, 2, N) - @. phi ~ Dirichlet(beta) -end - -ch_vec_mat, t_vec_mat, m_vec_mat, gctime, memallocs = @timed res_vec_mat = sample( - vdemo(), alg -) - -@model function vdemo() - phi = Vector{Vector{Real}}(undef, N) - for i in 1:N - phi[i] ~ Dirichlet(beta) - end -end - -ch_loop, t_loop, m_loop, gctime, memallocs = @timed res = sample(vdemo(), alg) - -println("Time for") -println(" Loop : $(sum(ch_loop[:elapsed]))") -println(" Vec : $(sum(ch_vec[:elapsed]))") -println(" Vec2 : $(sum(ch_vec_mat[:elapsed]))") - -println("Mem for") -println(" Loop : $m_loop") -println(" Vec : $m_vec") -println(" Vec2 : $m_vec_mat") diff --git a/test/variational/advi.jl b/test/variational/advi.jl index c2abacb675..ed8f745df2 100644 --- a/test/variational/advi.jl +++ b/test/variational/advi.jl @@ -1,71 +1,133 @@ + module AdvancedVITests using ..Models: gdemo_default using ..NumericalTests: check_gdemo -import AdvancedVI -using AdvancedVI: TruncatedADAGrad, DecayedADAGrad + +using AdvancedVI using Bijectors: Bijectors using Distributions: Dirichlet, Normal -using LinearAlgebra: I +using LinearAlgebra using MCMCChains: Chains -import Random +using Random +using StableRNGs: StableRNG using Test: @test, @testset using Turing -using Turing.Essential: TuringDiagMvNormal +using Turing.Variational + +@testset "ADVI" begin + @testset "q initialization" begin + m = gdemo_default + d = length(Turing.DynamicPPL.VarInfo(m)[:]) + for q in [q_meanfield_gaussian(m), q_fullrank_gaussian(m)] + rand(q) + end + + μ = ones(d) + q = q_meanfield_gaussian(m; location=μ) + @assert mean(q.dist) ≈ μ -@testset "advi.jl" begin - @testset "advi constructor" begin - Random.seed!(0) - N = 500 + q = q_fullrank_gaussian(m; location=μ) + @assert mean(q.dist) ≈ μ - s1 = ADVI() - q = vi(gdemo_default, s1) - c1 = rand(q, N) + L = Diagonal(fill(0.1, d)) + q = q_meanfield_gaussian(m; scale=L) + @assert cov(q.dist) ≈ L * L + + L = LowerTriangular(tril(0.01 * ones(d, d) + I)) + q = q_fullrank_gaussian(m; scale=L) + @assert cov(q.dist) ≈ L * L' end - @testset "advi inference" begin - @testset for opt in [TruncatedADAGrad(), DecayedADAGrad()] - Random.seed!(1) - N = 500 - - alg = ADVI(10, 5000) - q = vi(gdemo_default, alg; optimizer=opt) - samples = transpose(rand(q, N)) - chn = Chains(reshape(samples, size(samples)..., 1), ["s", "m"]) - # TODO: uhmm, seems like a large `eps` here... 
-            check_gdemo(chn; atol=0.5)
+    @testset "default interface" begin
+        for q0 in [q_meanfield_gaussian(gdemo_default), q_fullrank_gaussian(gdemo_default)]
+            _, q, _, _ = vi(gdemo_default, q0, 100; show_progress=Turing.PROGRESS[])
+            c1 = rand(q, 10)
         end
     end
 
-    @testset "advi different interfaces" begin
-        Random.seed!(1234)
-
-        target = MvNormal(zeros(2), I)
-        logπ(z) = logpdf(target, z)
-        advi = ADVI(10, 1000)
-
-        # Using a function z ↦ q(⋅∣z)
-        getq(θ) = TuringDiagMvNormal(θ[1:2], exp.(θ[3:4]))
-        q = vi(logπ, advi, getq, randn(4))
+    @testset "custom interface $name" for (name, objective, operator, optimizer) in [
+        (
+            "ADVI with closed-form entropy",
+            AdvancedVI.RepGradELBO(10),
+            AdvancedVI.ClipScale(),
+            AdvancedVI.DoG(),
+        ),
+        (
+            "ADVI with proximal entropy",
+            AdvancedVI.RepGradELBO(10; entropy=AdvancedVI.ClosedFormEntropyZeroGradient()),
+            AdvancedVI.ProximalLocationScaleEntropy(),
+            AdvancedVI.DoG(),
+        ),
+        (
+            "ADVI with STL entropy",
+            AdvancedVI.RepGradELBO(10; entropy=AdvancedVI.StickingTheLandingEntropy()),
+            AdvancedVI.ClipScale(),
+            AdvancedVI.DoG(),
+        ),
+    ]
+        T = 1000
+        q, q_avg, _, _ = vi(
+            gdemo_default,
+            q_meanfield_gaussian(gdemo_default),
+            T;
+            objective,
+            optimizer,
+            operator,
+            show_progress=Turing.PROGRESS[],
+        )
+
+        N = 1000
+        c1 = rand(q_avg, N)
+        c2 = rand(q, N)
+    end
 
-        xs = rand(target, 10)
-        @test mean(abs2, logpdf(q, xs) - logpdf(target, xs)) ≤ 0.07
+    @testset "inference $name" for (name, objective, operator, optimizer) in [
+        (
+            "ADVI with closed-form entropy",
+            AdvancedVI.RepGradELBO(10),
+            AdvancedVI.ClipScale(),
+            AdvancedVI.DoG(),
+        ),
+        (
+            "ADVI with proximal entropy",
+            AdvancedVI.RepGradELBO(10; entropy=AdvancedVI.ClosedFormEntropyZeroGradient()),
+            AdvancedVI.ProximalLocationScaleEntropy(),
+            AdvancedVI.DoG(),
+        ),
+        (
+            "ADVI with STL entropy",
+            AdvancedVI.RepGradELBO(10; entropy=AdvancedVI.StickingTheLandingEntropy()),
+            AdvancedVI.ClipScale(),
+            AdvancedVI.DoG(),
+        ),
+    ]
+        rng = StableRNG(0x517e1d9bf89bf94f)
+
+        T = 1000
+        q, q_avg, _, _ = vi(
+            rng,
+            gdemo_default,
+            q_meanfield_gaussian(gdemo_default),
+            T;
+            optimizer,
+            show_progress=Turing.PROGRESS[],
+        )
+
+        N = 1000
+        for q_out in [q_avg, q]
+            samples = transpose(rand(rng, q_out, N))
+            chn = Chains(reshape(samples, size(samples)..., 1), ["s", "m"])
 
-        # OR: implement `update` and pass a `Distribution`
-        function AdvancedVI.update(d::TuringDiagMvNormal, θ::AbstractArray{<:Real})
-            return TuringDiagMvNormal(θ[1:length(q)], exp.(θ[(length(q) + 1):end]))
+            check_gdemo(chn; atol=0.5)
         end
-
-        q0 = TuringDiagMvNormal(zeros(2), ones(2))
-        q = vi(logπ, advi, q0, randn(4))
-
-        xs = rand(target, 10)
-        @test mean(abs2, logpdf(q, xs) - logpdf(target, xs)) ≤ 0.05
     end
 
     # regression test for:
     # https://github.com/TuringLang/Turing.jl/issues/2065
     @testset "simplex bijector" begin
+        rng = StableRNG(0x517e1d9bf89bf94f)
+
         @model function dirichlet()
             x ~ Dirichlet([1.0, 1.0])
             return x
@@ -81,25 +143,27 @@ using Turing.Essential: TuringDiagMvNormal
         @test all(x0 .≈ x0_inv)
 
         # And regression for https://github.com/TuringLang/Turing.jl/issues/2160. 
- q = vi(m, ADVI(10, 1000)) - x = rand(q, 1000) + _, q, _, _ = vi(rng, m, q_meanfield_gaussian(m), 1000) + x = rand(rng, q, 1000) @test mean(eachcol(x)) ≈ [0.5, 0.5] atol = 0.1 end # Ref: https://github.com/TuringLang/Turing.jl/issues/2205 @testset "with `condition` (issue #2205)" begin + rng = StableRNG(0x517e1d9bf89bf94f) + @model function demo_issue2205() x ~ Normal() return y ~ Normal(x, 1) end model = demo_issue2205() | (y=1.0,) - q = vi(model, ADVI(10, 1000)) + _, q, _, _ = vi(rng, model, q_meanfield_gaussian(model), 1000) # True mean. mean_true = 1 / 2 var_true = 1 / 2 # Check the mean and variance of the posterior. - samples = rand(q, 1000) + samples = rand(rng, q, 1000) mean_est = mean(samples) var_est = var(samples) @test mean_est ≈ mean_true atol = 0.2 diff --git a/test/variational/optimisers.jl b/test/variational/optimisers.jl deleted file mode 100644 index 6f64d5fb1f..0000000000 --- a/test/variational/optimisers.jl +++ /dev/null @@ -1,29 +0,0 @@ -module VariationalOptimisersTests - -using AdvancedVI: DecayedADAGrad, TruncatedADAGrad, apply! -import ForwardDiff -import ReverseDiff -using Test: @test, @testset -using Turing - -function test_opt(ADPack, opt) - θ = randn(10, 10) - θ_fit = randn(10, 10) - loss(x, θ_) = mean(sum(abs2, θ * x - θ_ * x; dims=1)) - for t in 1:(10^4) - x = rand(10) - Δ = ADPack.gradient(θ_ -> loss(x, θ_), θ_fit) - Δ = apply!(opt, θ_fit, Δ) - @. θ_fit = θ_fit - Δ - end - @test loss(rand(10, 100), θ_fit) < 0.01 - @test length(opt.acc) == 1 -end -for opt in [TruncatedADAGrad(), DecayedADAGrad(1e-2)] - test_opt(ForwardDiff, opt) -end -for opt in [TruncatedADAGrad(), DecayedADAGrad(1e-2)] - test_opt(ReverseDiff, opt) -end - -end