diff --git a/HISTORY.md b/HISTORY.md
index fdb5125cb2..387d7f5ddf 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,3 +1,28 @@
+# Release 0.39.0
+
+## Update to the AdvancedVI interface
+
+Turing's variational inference interface was updated to match the version 0.4 interface of AdvancedVI.jl.
+
+AdvancedVI v0.4 introduces various new features:
+
+  - location-scale families with dense scale matrices,
+  - parameter-free stochastic optimization algorithms like `DoG` and `DoWG`,
+  - proximal operators for stable optimization,
+  - the sticking-the-landing control variate for faster convergence, and
+  - the score gradient estimator for non-differentiable targets.
+
+Please see the [Turing API documentation](https://turinglang.org/Turing.jl/stable/api/#Variational-inference) and [AdvancedVI's documentation](https://turinglang.org/AdvancedVI.jl/stable/) for more details.
+
+## Removal of Turing.Essential
+
+The Turing.Essential module has been removed.
+Anything exported from there can be imported from either `Turing` or `DynamicPPL`.
+
+## `@addlogprob!`
+
+The `@addlogprob!` macro is now exported from Turing, making it officially part of the public interface.
+
 # Release 0.38.6
 
 Added compatibility with AdvancedHMC 0.8.
diff --git a/Project.toml b/Project.toml
index 040ddac256..6fcb779e33 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "Turing"
 uuid = "fce5fe82-541a-59a6-adf8-730c64b5f9a0"
-version = "0.38.6"
+version = "0.39.0"
 
 [deps]
 ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
@@ -53,7 +53,7 @@ Accessors = "0.1"
 AdvancedHMC = "0.3.0, 0.4.0, 0.5.2, 0.6, 0.7, 0.8"
 AdvancedMH = "0.8"
 AdvancedPS = "0.6.0"
-AdvancedVI = "0.2"
+AdvancedVI = "0.4"
 BangBang = "0.4.2"
 Bijectors = "0.14, 0.15"
 Compat = "4.15.0"
diff --git a/docs/make.jl b/docs/make.jl
index 978e5881b3..af24e7b1ec 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -23,8 +23,11 @@ makedocs(;
     pages=[
         "Home" => "index.md",
         "API" => "api.md",
-        "Submodule APIs" =>
-            ["Inference" => "api/Inference.md", "Optimisation" => "api/Optimisation.md"],
+        "Submodule APIs" => [
+            "Inference" => "api/Inference.md",
+            "Optimisation" => "api/Optimisation.md",
+            "Variational" => "api/Variational.md",
+        ],
     ],
     checkdocs=:exports,
     doctest=false,
diff --git a/docs/src/api.md b/docs/src/api.md
index 01f022e7e5..0b8351eb3b 100644
--- a/docs/src/api.md
+++ b/docs/src/api.md
@@ -42,6 +42,7 @@ even though [`Prior()`](@ref) is actually defined in the `Turing.Inference` modu
 | `to_submodel` | [`DynamicPPL.to_submodel`](@extref) | Define a submodel |
 | `prefix` | [`DynamicPPL.prefix`](@extref) | Prefix all variable names in a model with a given VarName |
 | `LogDensityFunction` | [`DynamicPPL.LogDensityFunction`](@extref) | A struct containing all information about how to evaluate a model. Mostly for advanced users |
+| `@addlogprob!` | [`DynamicPPL.@addlogprob!`](@extref) | Add arbitrary log-probability terms during model evaluation |
 
 ### Inference
 
@@ -76,12 +77,14 @@ even though [`Prior()`](@ref) is actually defined in the `Turing.Inference` modu
 
 ### Variational inference
 
-See the [variational inference tutorial](https://turinglang.org/docs/tutorials/09-variational-inference/) for a walkthrough on how to use these.
+See the [docs of AdvancedVI.jl](https://turinglang.org/AdvancedVI.jl/stable/) for detailed usage and the [variational inference tutorial](https://turinglang.org/docs/tutorials/09-variational-inference/) for a basic walkthrough.
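For orientation, a minimal sketch of the updated workflow follows; the `demo` model, its synthetic data, and the iteration count are illustrative placeholders.

```julia
using Turing

@model function demo(x)
    s ~ InverseGamma(2, 3)
    m ~ Normal(0, sqrt(s))
    for i in eachindex(x)
        x[i] ~ Normal(m, sqrt(s))
    end
end

model = demo(randn(100))

# Numerically non-degenerate mean-field Gaussian initialization.
q_init = q_meanfield_gaussian(model)

# Run 1000 optimization steps; `vi` returns the last iterate, the averaged
# iterate, the optimizer state, and per-iteration diagnostics.
q_last, q_avg, state, info = vi(model, q_init, 1000)
```

Under the defaults in this release, `vi` optimizes a `RepGradELBO` objective with the parameter-free `DoWG` rule, polynomial iterate averaging, and a proximal entropy operator, so `q_avg` is typically the approximation to report.
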
-| Exported symbol | Documentation | Description | -|:--------------- |:---------------------------- |:--------------------------------------- | -| `vi` | [`AdvancedVI.vi`](@extref) | Perform variational inference | -| `ADVI` | [`AdvancedVI.ADVI`](@extref) | Construct an instance of a VI algorithm | +| Exported symbol | Documentation | Description | +|:---------------------- |:------------------------------------------------- |:---------------------------------------------------------------------------------------- | +| `vi` | [`Turing.vi`](@ref) | Perform variational inference | +| `q_locationscale` | [`Turing.Variational.q_locationscale`](@ref) | Find a numerically non-degenerate initialization for a location-scale variational family | +| `q_meanfield_gaussian` | [`Turing.Variational.q_meanfield_gaussian`](@ref) | Find a numerically non-degenerate initialization for a mean-field Gaussian family | +| `q_fullrank_gaussian` | [`Turing.Variational.q_fullrank_gaussian`](@ref) | Find a numerically non-degenerate initialization for a full-rank Gaussian family | ### Automatic differentiation types diff --git a/docs/src/api/Variational.md b/docs/src/api/Variational.md new file mode 100644 index 0000000000..382efe7e18 --- /dev/null +++ b/docs/src/api/Variational.md @@ -0,0 +1,6 @@ +# API: `Turing.Variational` + +```@autodocs +Modules = [Turing.Variational] +Order = [:type, :function] +``` diff --git a/src/Turing.jl b/src/Turing.jl index aa5fbe8500..1ff2310174 100644 --- a/src/Turing.jl +++ b/src/Turing.jl @@ -23,7 +23,7 @@ using Printf: Printf using Random: Random using LinearAlgebra: I -using ADTypes: ADTypes +using ADTypes: ADTypes, AutoForwardDiff, AutoReverseDiff, AutoMooncake const DEFAULT_ADTYPE = ADTypes.AutoForwardDiff() @@ -39,16 +39,12 @@ function setprogress!(progress::Bool) @info "[Turing]: progress logging is $(progress ? "enabled" : "disabled") globally" PROGRESS[] = progress AbstractMCMC.setprogress!(progress; silent=true) - # TODO: `AdvancedVI.turnprogress` is removed in AdvancedVI v0.3 - AdvancedVI.turnprogress(progress) return progress end # Random probability measures. include("stdlib/distributions.jl") include("stdlib/RandomMeasures.jl") -include("essential/Essential.jl") -using .Essential include("mcmc/Inference.jl") # inference algorithms using .Inference include("variational/VariationalInference.jl") @@ -57,13 +53,13 @@ using .Variational include("optimisation/Optimisation.jl") using .Optimisation -include("deprecated.jl") # to be removed in the next minor version release - ########### # Exports # ########### # `using` statements for stuff to re-export using DynamicPPL: + @model, + @varname, pointwise_loglikelihoods, generated_quantities, returned, @@ -73,9 +69,12 @@ using DynamicPPL: decondition, fix, unfix, + prefix, conditioned, + @submodel, to_submodel, - LogDensityFunction + LogDensityFunction, + @addlogprob! 
using StatsBase: predict using OrderedCollections: OrderedDict @@ -90,6 +89,7 @@ export to_submodel, prefix, LogDensityFunction, + @addlogprob!, # Sampling - AbstractMCMC sample, MCMCThreads, @@ -116,6 +116,9 @@ export # Variational inference - AdvancedVI vi, ADVI, + q_locationscale, + q_meanfield_gaussian, + q_fullrank_gaussian, # ADTypes AutoForwardDiff, AutoReverseDiff, diff --git a/src/deprecated.jl b/src/deprecated.jl deleted file mode 100644 index 34305d4423..0000000000 --- a/src/deprecated.jl +++ /dev/null @@ -1,39 +0,0 @@ -export setadbackend, setchunksize, setadsafe - -function setadbackend(::Union{Symbol,Val}) - Base.depwarn( - "`ADBACKEND` and `setbackend` are deprecated. Please specify the chunk size directly in the sampler constructor, e.g., `HMC(0.1, 5; adtype=AutoForwardDiff())`.\n This function has no effects.", - :setbackend; - force=true, - ) - return nothing -end - -function setchunksize(::Int) - Base.depwarn( - "`CHUNKSIZE` and `setchunksize` are deprecated. Please specify the chunk size directly in the sampler constructor, e.g., `HMC(0.1, 5; adtype=AutoForwardDiff())`.\n This function has no effects.", - :setchunksize; - force=true, - ) - return nothing -end - -function setrdcache(::Union{Bool,Val}) - Base.depwarn( - "`RDCACHE` and `setrdcache` are deprecated. Please specify if you wish to use compiled tape for ReverseDiff directly in the sampler constructor, e.g., `HMC(0.1, 5; adtype=AutoReverseDiff(false))`.\n This function has no effects.", - :setrdcache; - force=true, - ) - return nothing -end - -function setadsafe(::Bool) - Base.depwarn( - "`ADSAFE` and `setadsafe` are outdated and no longer in use.", - :setadsafe; - force=true, - ) - return nothing -end - -Base.@deprecate_binding Core Essential false diff --git a/src/essential/Essential.jl b/src/essential/Essential.jl deleted file mode 100644 index b045e4a857..0000000000 --- a/src/essential/Essential.jl +++ /dev/null @@ -1,24 +0,0 @@ -module Essential - -using DistributionsAD, Bijectors -using Libtask, ForwardDiff, Random -using Distributions, LinearAlgebra -using Reexport -using ..Turing: Turing -using DynamicPPL: Model, AbstractSampler, Sampler, SampleFromPrior -using LinearAlgebra: copytri! -using Bijectors: PDMatDistribution -using AdvancedVI -using StatsFuns: logsumexp, softmax -@reexport using DynamicPPL -using ADTypes: ADTypes, AutoForwardDiff, AutoReverseDiff, AutoMooncake - -using AdvancedPS: AdvancedPS - -include("container.jl") - -export @model -export @varname -export AutoForwardDiff, AutoReverseDiff, AutoMooncake - -end # module diff --git a/src/essential/container.jl b/src/essential/container.jl deleted file mode 100644 index a1012d471f..0000000000 --- a/src/essential/container.jl +++ /dev/null @@ -1,70 +0,0 @@ -struct TracedModel{S<:AbstractSampler,V<:AbstractVarInfo,M<:Model,E<:Tuple} <: - AdvancedPS.AbstractGenericModel - model::M - sampler::S - varinfo::V - evaluator::E -end - -function TracedModel( - model::Model, - sampler::AbstractSampler, - varinfo::AbstractVarInfo, - rng::Random.AbstractRNG, -) - context = SamplingContext(rng, sampler, DefaultContext()) - args, kwargs = DynamicPPL.make_evaluate_args_and_kwargs(model, varinfo, context) - if kwargs !== nothing && !isempty(kwargs) - error( - "Sampling with `$(sampler.alg)` does not support models with keyword arguments. See issue #2007 for more details.", - ) - end - return TracedModel{AbstractSampler,AbstractVarInfo,Model,Tuple}( - model, sampler, varinfo, (model.f, args...) 
- ) -end - -function AdvancedPS.advance!( - trace::AdvancedPS.Trace{<:AdvancedPS.LibtaskModel{<:TracedModel}}, isref::Bool=false -) - # Make sure we load/reset the rng in the new replaying mechanism - DynamicPPL.increment_num_produce!(trace.model.f.varinfo) - isref ? AdvancedPS.load_state!(trace.rng) : AdvancedPS.save_state!(trace.rng) - score = consume(trace.model.ctask) - if score === nothing - return nothing - else - return score + DynamicPPL.getlogp(trace.model.f.varinfo) - end -end - -function AdvancedPS.delete_retained!(trace::TracedModel) - DynamicPPL.set_retained_vns_del!(trace.varinfo) - return trace -end - -function AdvancedPS.reset_model(trace::TracedModel) - DynamicPPL.reset_num_produce!(trace.varinfo) - return trace -end - -function AdvancedPS.reset_logprob!(trace::TracedModel) - DynamicPPL.resetlogp!!(trace.model.varinfo) - return trace -end - -function AdvancedPS.update_rng!( - trace::AdvancedPS.Trace{<:AdvancedPS.LibtaskModel{<:TracedModel}} -) - # Extract the `args`. - args = trace.model.ctask.args - # From `args`, extract the `SamplingContext`, which contains the RNG. - sampling_context = args[3] - rng = sampling_context.rng - trace.rng = rng - return trace -end - -function Libtask.TapedTask(model::TracedModel, ::Random.AbstractRNG, args...; kwargs...) # RNG ? - return Libtask.TapedTask(model.evaluator[1], model.evaluator[2:end]...; kwargs...) -end diff --git a/src/mcmc/Inference.jl b/src/mcmc/Inference.jl index 3140215215..a58ead419e 100644 --- a/src/mcmc/Inference.jl +++ b/src/mcmc/Inference.jl @@ -1,7 +1,7 @@ module Inference -using ..Essential using DynamicPPL: + @model, Metadata, VarInfo, # TODO(mhauru) all_varnames_grouped_by_symbol isn't exported by DPPL, because it is only diff --git a/src/mcmc/particle_mcmc.jl b/src/mcmc/particle_mcmc.jl index afa3a8a6fb..ac5cd76488 100644 --- a/src/mcmc/particle_mcmc.jl +++ b/src/mcmc/particle_mcmc.jl @@ -2,6 +2,79 @@ ### Particle Filtering and Particle MCMC Samplers. ### +### AdvancedPS models and interface + +struct TracedModel{S<:AbstractSampler,V<:AbstractVarInfo,M<:Model,E<:Tuple} <: + AdvancedPS.AbstractGenericModel + model::M + sampler::S + varinfo::V + evaluator::E +end + +function TracedModel( + model::Model, + sampler::AbstractSampler, + varinfo::AbstractVarInfo, + rng::Random.AbstractRNG, +) + context = SamplingContext(rng, sampler, DefaultContext()) + args, kwargs = DynamicPPL.make_evaluate_args_and_kwargs(model, varinfo, context) + if kwargs !== nothing && !isempty(kwargs) + error( + "Sampling with `$(sampler.alg)` does not support models with keyword arguments. See issue #2007 for more details.", + ) + end + return TracedModel{AbstractSampler,AbstractVarInfo,Model,Tuple}( + model, sampler, varinfo, (model.f, args...) + ) +end + +function AdvancedPS.advance!( + trace::AdvancedPS.Trace{<:AdvancedPS.LibtaskModel{<:TracedModel}}, isref::Bool=false +) + # Make sure we load/reset the rng in the new replaying mechanism + DynamicPPL.increment_num_produce!(trace.model.f.varinfo) + isref ? 
AdvancedPS.load_state!(trace.rng) : AdvancedPS.save_state!(trace.rng) + score = consume(trace.model.ctask) + if score === nothing + return nothing + else + return score + DynamicPPL.getlogp(trace.model.f.varinfo) + end +end + +function AdvancedPS.delete_retained!(trace::TracedModel) + DynamicPPL.set_retained_vns_del!(trace.varinfo) + return trace +end + +function AdvancedPS.reset_model(trace::TracedModel) + DynamicPPL.reset_num_produce!(trace.varinfo) + return trace +end + +function AdvancedPS.reset_logprob!(trace::TracedModel) + DynamicPPL.resetlogp!!(trace.model.varinfo) + return trace +end + +function AdvancedPS.update_rng!( + trace::AdvancedPS.Trace{<:AdvancedPS.LibtaskModel{<:TracedModel}} +) + # Extract the `args`. + args = trace.model.ctask.args + # From `args`, extract the `SamplingContext`, which contains the RNG. + sampling_context = args[3] + rng = sampling_context.rng + trace.rng = rng + return trace +end + +function Libtask.TapedTask(model::TracedModel, ::Random.AbstractRNG, args...; kwargs...) # RNG ? + return Libtask.TapedTask(model.evaluator[1], model.evaluator[2:end]...; kwargs...) +end + abstract type ParticleInference <: InferenceAlgorithm end #### @@ -410,7 +483,7 @@ function AdvancedPS.Trace( newvarinfo = deepcopy(varinfo) DynamicPPL.reset_num_produce!(newvarinfo) - tmodel = Turing.Essential.TracedModel(model, sampler, newvarinfo, rng) + tmodel = TracedModel(model, sampler, newvarinfo, rng) newtrace = AdvancedPS.Trace(tmodel, rng) AdvancedPS.addreference!(newtrace.model.ctask.task, newtrace) return newtrace diff --git a/src/variational/VariationalInference.jl b/src/variational/VariationalInference.jl index 189d3f7001..b9428af112 100644 --- a/src/variational/VariationalInference.jl +++ b/src/variational/VariationalInference.jl @@ -1,50 +1,329 @@ + module Variational -using DistributionsAD: DistributionsAD -using DynamicPPL: DynamicPPL -using StatsBase: StatsBase -using StatsFuns: StatsFuns -using LogDensityProblems: LogDensityProblems +using DynamicPPL +using ADTypes using Distributions +using LinearAlgebra +using LogDensityProblems +using Random -using Random: Random +import ..Turing: DEFAULT_ADTYPE, PROGRESS import AdvancedVI import Bijectors -# Reexports -using AdvancedVI: vi, ADVI, ELBO, elbo, TruncatedADAGrad, DecayedADAGrad -export vi, ADVI, ELBO, elbo, TruncatedADAGrad, DecayedADAGrad +export vi, q_locationscale, q_meanfield_gaussian, q_fullrank_gaussian + +include("deprecated.jl") + +function make_logdensity(model::DynamicPPL.Model) + weight = 1.0 + ctx = DynamicPPL.MiniBatchContext(DynamicPPL.DefaultContext(), weight) + return DynamicPPL.LogDensityFunction(model, DynamicPPL.VarInfo(model), ctx) +end """ - make_logjoint(model::Model; weight = 1.0) -Constructs the logjoint as a function of latent variables, i.e. the map z → p(x ∣ z) p(z). -The weight used to scale the likelihood, e.g. when doing stochastic gradient descent one needs to -use `DynamicPPL.MiniBatch` context to run the `Model` with a weight `num_total_obs / batch_size`. -## Notes -- For sake of efficiency, the returned function is closes over an instance of `VarInfo`. This means that you *might* run into some weird behaviour if you call this method sequentially using different types; if that's the case, just generate a new one for each type using `make_logjoint`. 
+ q_initialize_scale( + [rng::Random.AbstractRNG,] + model::DynamicPPL.Model, + location::AbstractVector, + scale::AbstractMatrix, + basedist::Distributions.UnivariateDistribution; + num_samples::Int = 10, + num_max_trials::Int = 10, + reduce_factor::Real = one(eltype(scale)) / 2 + ) + +Given an initial location-scale distribution `q` formed by `location`, `scale`, and `basedist`, shrink `scale` until the expectation of log-densities of `model` taken over `q` are finite. +If the log-densities are not finite even after `num_max_trials`, throw an error. + +For reference, a location-scale distribution \$q\$ formed by `location`, `scale`, and `basedist` is a distribution where its sampling process \$z \\sim q\$ can be represented as +```julia +u = rand(basedist, d) +z = scale * u + location +``` + +# Arguments +- `model`: The target `DynamicPPL.Model`. +- `location`: The location parameter of the initialization. +- `scale`: The scale parameter of the initialization. +- `basedist`: The base distribution of the location-scale family. + +# Keyword Arguments +- `num_samples`: Number of samples used to compute the average log-density at each trial. +- `num_max_trials`: Number of trials until throwing an error. +- `reduce_factor`: Factor for shrinking the scale. After `n` trials, the scale is then `scale*reduce_factor^n`. + +# Returns +- `scale_adj`: The adjusted scale matrix matching the type of `scale`. """ -function make_logjoint(model::DynamicPPL.Model; weight=1.0) - # setup - ctx = DynamicPPL.MiniBatchContext(DynamicPPL.DefaultContext(), weight) - f = DynamicPPL.LogDensityFunction(model, DynamicPPL.VarInfo(model), ctx) - return Base.Fix1(LogDensityProblems.logdensity, f) +function q_initialize_scale( + rng::Random.AbstractRNG, + model::DynamicPPL.Model, + location::AbstractVector, + scale::AbstractMatrix, + basedist::Distributions.UnivariateDistribution; + num_samples::Int=10, + num_max_trials::Int=10, + reduce_factor::Real=one(eltype(scale)) / 2, +) + prob = make_logdensity(model) + ℓπ = Base.Fix1(LogDensityProblems.logdensity, prob) + varinfo = DynamicPPL.VarInfo(model) + + n_trial = 0 + while true + q = AdvancedVI.MvLocationScale(location, scale, basedist) + b = Bijectors.bijector(model; varinfo=varinfo) + q_trans = Bijectors.transformed(q, Bijectors.inverse(b)) + energy = mean(ℓπ, eachcol(rand(rng, q_trans, num_samples))) + + if isfinite(energy) + return scale + elseif n_trial == num_max_trials + error("Could not find an initial") + end + + scale = reduce_factor * scale + n_trial += 1 + end end -# objectives -function (elbo::ELBO)( +""" + q_locationscale( + [rng::Random.AbstractRNG,] + model::DynamicPPL.Model; + location::Union{Nothing,<:AbstractVector} = nothing, + scale::Union{Nothing,<:Diagonal,<:LowerTriangular} = nothing, + meanfield::Bool = true, + basedist::Distributions.UnivariateDistribution = Normal() + ) + +Find a numerically non-degenerate variational distribution `q` for approximating the target `model` within the location-scale variational family formed by the type of `scale` and `basedist`. + +The distribution can be manually specified by setting `location`, `scale`, and `basedist`. +Otherwise, it chooses a standard Gaussian by default. +Whether the default choice is used or not, the `scale` may be adjusted via `q_initialize_scale` so that the log-densities of `model` are finite over the samples from `q`. +If `meanfield` is set as `true`, the scale of `q` is restricted to be a diagonal matrix and only the diagonal of `scale` is used. 
+ +For reference, a location-scale distribution \$q\$ formed by `location`, `scale`, and `basedist` is a distribution where its sampling process \$z \\sim q\$ can be represented as +```julia +u = rand(basedist, d) +z = scale * u + location +``` + +# Arguments +- `model`: The target `DynamicPPL.Model`. + +# Keyword Arguments +- `location`: The location parameter of the initialization. If `nothing`, a vector of zeros is used. +- `scale`: The scale parameter of the initialization. If `nothing`, an identity matrix is used. +- `meanfield`: Whether to use the mean-field approximation. If `true`, `scale` is converted into a `Diagonal` matrix. Otherwise, it is converted into a `LowerTriangular` matrix. +- `basedist`: The base distribution of the location-scale family. + +The remaining keywords are passed to `q_initialize_scale`. + +# Returns +- `q::Bijectors.TransformedDistribution`: A `AdvancedVI.LocationScale` distribution matching the support of `model`. +""" +function q_locationscale( + rng::Random.AbstractRNG, + model::DynamicPPL.Model; + location::Union{Nothing,<:AbstractVector}=nothing, + scale::Union{Nothing,<:Diagonal,<:LowerTriangular}=nothing, + meanfield::Bool=true, + basedist::Distributions.UnivariateDistribution=Normal(), + kwargs..., +) + varinfo = DynamicPPL.VarInfo(model) + # Use linked `varinfo` to determine the correct number of parameters. + # TODO: Replace with `length` once this is implemented for `VarInfo`. + varinfo_linked = DynamicPPL.link(varinfo, model) + num_params = length(varinfo_linked[:]) + + μ = if isnothing(location) + zeros(num_params) + else + @assert length(location) == num_params "Length of the provided location vector, $(length(location)), does not match dimension of the target distribution, $(num_params)." + location + end + + L = if isnothing(scale) + if meanfield + q_initialize_scale(rng, model, μ, Diagonal(ones(num_params)), basedist; kwargs...) + else + L0 = LowerTriangular(Matrix{Float64}(I, num_params, num_params)) + q_initialize_scale(rng, model, μ, L0, basedist; kwargs...) + end + else + @assert size(scale) == (num_params, num_params) "Dimensions of the provided scale matrix, $(size(scale)), does not match the dimension of the target distribution, $(num_params)." + if meanfield + Diagonal(diag(scale)) + else + LowerTriangular(Matrix(scale)) + end + end + q = AdvancedVI.MvLocationScale(μ, L, basedist) + b = Bijectors.bijector(model; varinfo=varinfo) + return Bijectors.transformed(q, Bijectors.inverse(b)) +end + +function q_locationscale(model::DynamicPPL.Model; kwargs...) + return q_locationscale(Random.default_rng(), model; kwargs...) +end + +""" + q_meanfield_gaussian( + [rng::Random.AbstractRNG,] + model::DynamicPPL.Model; + location::Union{Nothing,<:AbstractVector} = nothing, + scale::Union{Nothing,<:Diagonal} = nothing, + kwargs... + ) + +Find a numerically non-degenerate mean-field Gaussian `q` for approximating the target `model`. + +# Arguments +- `model`: The target `DynamicPPL.Model`. + +# Keyword Arguments +- `location`: The location parameter of the initialization. If `nothing`, a vector of zeros is used. +- `scale`: The scale parameter of the initialization. If `nothing`, an identity matrix is used. + +The remaining keyword arguments are passed to `q_locationscale`. + +# Returns +- `q::Bijectors.TransformedDistribution`: A `AdvancedVI.LocationScale` distribution matching the support of `model`. 
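A brief illustrative sketch (the one-line `demo` model below is an assumed placeholder): because the result is wrapped in the model's bijector, draws from it already lie in the support of the target.

```julia
using Turing

@model demo() = x ~ LogNormal()
model = demo()

# Mean-field Gaussian initialization in the unconstrained space, wrapped so
# that samples are mapped back to the constrained space of `model`.
q = q_meanfield_gaussian(model)

rand(q)  # 1-element vector; the entry is positive, matching `x ~ LogNormal()`
```
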
+""" +function q_meanfield_gaussian( + rng::Random.AbstractRNG, + model::DynamicPPL.Model; + location::Union{Nothing,<:AbstractVector}=nothing, + scale::Union{Nothing,<:Diagonal}=nothing, + kwargs..., +) + return q_locationscale( + rng, model; location, scale, meanfield=true, basedist=Normal(), kwargs... + ) +end + +function q_meanfield_gaussian(model::DynamicPPL.Model; kwargs...) + return q_meanfield_gaussian(Random.default_rng(), model; kwargs...) +end + +""" + q_fullrank_gaussian( + [rng::Random.AbstractRNG,] + model::DynamicPPL.Model; + location::Union{Nothing,<:AbstractVector} = nothing, + scale::Union{Nothing,<:LowerTriangular} = nothing, + kwargs... + ) + +Find a numerically non-degenerate Gaussian `q` with a scale with full-rank factors (traditionally referred to as a "full-rank family") for approximating the target `model`. + +# Arguments +- `model`: The target `DynamicPPL.Model`. + +# Keyword Arguments +- `location`: The location parameter of the initialization. If `nothing`, a vector of zeros is used. +- `scale`: The scale parameter of the initialization. If `nothing`, an identity matrix is used. + +The remaining keyword arguments are passed to `q_locationscale`. + +# Returns +- `q::Bijectors.TransformedDistribution`: A `AdvancedVI.LocationScale` distribution matching the support of `model`. +""" +function q_fullrank_gaussian( + rng::Random.AbstractRNG, + model::DynamicPPL.Model; + location::Union{Nothing,<:AbstractVector}=nothing, + scale::Union{Nothing,<:LowerTriangular}=nothing, + kwargs..., +) + return q_locationscale( + rng, model; location, scale, meanfield=false, basedist=Normal(), kwargs... + ) +end + +function q_fullrank_gaussian(model::DynamicPPL.Model; kwargs...) + return q_fullrank_gaussian(Random.default_rng(), model; kwargs...) +end + +""" + vi( + [rng::Random.AbstractRNG,] + model::DynamicPPL.Model; + q, + n_iterations::Int; + objective::AdvancedVI.AbstractVariationalObjective = AdvancedVI.RepGradELBO( + 10; entropy = AdvancedVI.ClosedFormEntropyZeroGradient() + ), + show_progress::Bool = Turing.PROGRESS[], + optimizer::Optimisers.AbstractRule = AdvancedVI.DoWG(), + averager::AdvancedVI.AbstractAverager = AdvancedVI.PolynomialAveraging(), + operator::AdvancedVI.AbstractOperator = AdvancedVI.ProximalLocationScaleEntropy(), + adtype::ADTypes.AbstractADType = Turing.DEFAULT_ADTYPE, + kwargs... + ) + +Approximating the target `model` via variational inference by optimizing `objective` with the initialization `q`. +This is a thin wrapper around `AdvancedVI.optimize`. + +# Arguments +- `model`: The target `DynamicPPL.Model`. +- `q`: The initial variational approximation. +- `n_iterations`: Number of optimization steps. + +# Keyword Arguments +- `objective`: Variational objective to be optimized. +- `show_progress`: Whether to show the progress bar. +- `optimizer`: Optimization algorithm. +- `averager`: Parameter averaging strategy. +- `operator`: Operator applied after each optimization step. +- `adtype`: Automatic differentiation backend. + +See the docs of `AdvancedVI.optimize` for additional keyword arguments. + +# Returns +- `q`: Variational distribution formed by the last iterate of the optimization run. +- `q_avg`: Variational distribution formed by the averaged iterates according to `averager`. +- `state`: Collection of states used for optimization. This can be used to resume from a past call to `vi`. +- `info`: Information generated during the optimization run. 
+""" +function vi( rng::Random.AbstractRNG, - alg::AdvancedVI.VariationalInference, - q, model::DynamicPPL.Model, - num_samples; - weight=1.0, + q, + n_iterations::Int; + objective=AdvancedVI.RepGradELBO( + 10; entropy=AdvancedVI.ClosedFormEntropyZeroGradient() + ), + show_progress::Bool=PROGRESS[], + optimizer=AdvancedVI.DoWG(), + averager=AdvancedVI.PolynomialAveraging(), + operator=AdvancedVI.ProximalLocationScaleEntropy(), + adtype::ADTypes.AbstractADType=DEFAULT_ADTYPE, kwargs..., ) - return elbo(rng, alg, q, make_logjoint(model; weight=weight), num_samples; kwargs...) + return AdvancedVI.optimize( + rng, + make_logdensity(model), + objective, + q, + n_iterations; + show_progress=show_progress, + adtype, + optimizer, + averager, + operator, + kwargs..., + ) end -# VI algorithms -include("advi.jl") +function vi(model::DynamicPPL.Model, q, n_iterations::Int; kwargs...) + return vi(Random.default_rng(), model, q, n_iterations; kwargs...) +end end diff --git a/src/variational/advi.jl b/src/variational/advi.jl deleted file mode 100644 index 3819109a09..0000000000 --- a/src/variational/advi.jl +++ /dev/null @@ -1,70 +0,0 @@ -""" - meanfield([rng, ]model::Model) - -Creates a mean-field approximation with multivariate normal as underlying distribution. -""" -meanfield(model::DynamicPPL.Model) = meanfield(Random.default_rng(), model) -function meanfield(rng::Random.AbstractRNG, model::DynamicPPL.Model) - # Setup. - varinfo = DynamicPPL.VarInfo(model) - # Use linked `varinfo` to determine the correct number of parameters. - # TODO: Replace with `length` once this is implemented for `VarInfo`. - varinfo_linked = DynamicPPL.link(varinfo, model) - num_params = length(varinfo_linked[:]) - - # initial params - μ = randn(rng, num_params) - σ = StatsFuns.softplus.(randn(rng, num_params)) - - # Construct the base family. - d = DistributionsAD.TuringDiagMvNormal(μ, σ) - - # Construct the bijector constrained → unconstrained. - b = Bijectors.bijector(model; varinfo=varinfo) - - # We want to transform from unconstrained space to constrained, - # hence we need the inverse of `b`. - return Bijectors.transformed(d, Bijectors.inverse(b)) -end - -# Overloading stuff from `AdvancedVI` to specialize for Turing -function AdvancedVI.update(d::DistributionsAD.TuringDiagMvNormal, μ, σ) - return DistributionsAD.TuringDiagMvNormal(μ, σ) -end -function AdvancedVI.update(td::Bijectors.TransformedDistribution, θ...) - return Bijectors.transformed(AdvancedVI.update(td.dist, θ...), td.transform) -end -function AdvancedVI.update( - td::Bijectors.TransformedDistribution{<:DistributionsAD.TuringDiagMvNormal}, - θ::AbstractArray, -) - # `length(td.dist) != length(td)` if `td.transform` changes the dimensionality, - # so we need to use the length of the underlying distribution `td.dist` here. - # TODO: Check if we can get away with `view` instead of `getindex` for all AD backends. 
- μ, ω = θ[begin:(begin + length(td.dist) - 1)], θ[(begin + length(td.dist)):end] - return AdvancedVI.update(td, μ, StatsFuns.softplus.(ω)) -end - -function AdvancedVI.vi( - model::DynamicPPL.Model, alg::AdvancedVI.ADVI; optimizer=AdvancedVI.TruncatedADAGrad() -) - q = meanfield(model) - return AdvancedVI.vi(model, alg, q; optimizer=optimizer) -end - -function AdvancedVI.vi( - model::DynamicPPL.Model, - alg::AdvancedVI.ADVI, - q::Bijectors.TransformedDistribution{<:DistributionsAD.TuringDiagMvNormal}; - optimizer=AdvancedVI.TruncatedADAGrad(), -) - # Initial parameters for mean-field approx - μ, σs = StatsBase.params(q) - θ = vcat(μ, StatsFuns.invsoftplus.(σs)) - - # Optimize - AdvancedVI.optimize!(elbo, alg, q, make_logjoint(model), θ; optimizer=optimizer) - - # Return updated `Distribution` - return AdvancedVI.update(q, θ) -end diff --git a/src/variational/deprecated.jl b/src/variational/deprecated.jl new file mode 100644 index 0000000000..9a9f4777b5 --- /dev/null +++ b/src/variational/deprecated.jl @@ -0,0 +1,61 @@ + +import DistributionsAD +export ADVI + +Base.@deprecate meanfield(model) q_meanfield_gaussian(model) + +struct ADVI{AD} + "Number of samples used to estimate the ELBO in each optimization step." + samples_per_step::Int + "Maximum number of gradient steps." + max_iters::Int + "AD backend used for automatic differentiation." + adtype::AD +end + +function ADVI( + samples_per_step::Int=1, + max_iters::Int=1000; + adtype::ADTypes.AbstractADType=ADTypes.AutoForwardDiff(), +) + Base.depwarn( + "The type ADVI will be removed in future releases. Please refer to the new interface for `vi`", + :ADVI; + force=true, + ) + return ADVI{typeof(adtype)}(samples_per_step, max_iters, adtype) +end + +function vi(model::DynamicPPL.Model, alg::ADVI; kwargs...) + Base.depwarn( + "This specialization along with the type `ADVI` will be deprecated in future releases. Please refer to the new interface for `vi`.", + :vi; + force=true, + ) + q = q_meanfield_gaussian(Random.default_rng(), model) + objective = AdvancedVI.RepGradELBO( + alg.samples_per_step; entropy=AdvancedVI.ClosedFormEntropy() + ) + operator = AdvancedVI.IdentityOperator() + _, q_avg, _, _ = vi(model, q, alg.max_iters; objective, operator, kwargs...) + return q_avg +end + +function vi( + model::DynamicPPL.Model, + alg::ADVI, + q::Bijectors.TransformedDistribution{<:DistributionsAD.TuringDiagMvNormal}; + kwargs..., +) + Base.depwarn( + "This specialization along with the type `ADVI` will be deprecated in future releases. Please refer to the new interface for `vi`.", + :vi; + force=true, + ) + objective = AdvancedVI.RepGradELBO( + alg.samples_per_step; entropy=AdvancedVI.ClosedFormEntropy() + ) + operator = AdvancedVI.IdentityOperator() + _, q_avg, _, _ = vi(model, q, alg.max_iters; objective, operator, kwargs...) 
+ return q_avg +end diff --git a/test/Project.toml b/test/Project.toml index 5587f46d7d..7cab77a01a 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -44,7 +44,7 @@ AbstractMCMC = "5" AbstractPPL = "0.9, 0.10, 0.11" AdvancedMH = "0.6, 0.7, 0.8" AdvancedPS = "=0.6.0" -AdvancedVI = "0.2" +AdvancedVI = "0.4" Aqua = "0.8" BangBang = "0.4" Bijectors = "0.14, 0.15" diff --git a/test/ad.jl b/test/ad.jl index befef5af33..2f645fab5d 100644 --- a/test/ad.jl +++ b/test/ad.jl @@ -25,8 +25,8 @@ const always_valid_eltypes = (AbstractFloat, AbstractIrrational, Integer, Ration """A dictionary mapping ADTypes to the element types they use.""" eltypes_by_adtype = Dict( - Turing.AutoForwardDiff => (ForwardDiff.Dual,), - Turing.AutoReverseDiff => ( + AutoForwardDiff => (ForwardDiff.Dual,), + AutoReverseDiff => ( ReverseDiff.TrackedArray, ReverseDiff.TrackedMatrix, ReverseDiff.TrackedReal, @@ -37,7 +37,7 @@ eltypes_by_adtype = Dict( ), ) if INCLUDE_MOONCAKE - eltypes_by_adtype[Turing.AutoMooncake] = (Mooncake.CoDual,) + eltypes_by_adtype[AutoMooncake] = (Mooncake.CoDual,) end """ @@ -189,9 +189,9 @@ end """ All the ADTypes on which we want to run the tests. """ -ADTYPES = [Turing.AutoForwardDiff(), Turing.AutoReverseDiff(; compile=false)] +ADTYPES = [AutoForwardDiff(), AutoReverseDiff(; compile=false)] if INCLUDE_MOONCAKE - push!(ADTYPES, Turing.AutoMooncake(; config=nothing)) + push!(ADTYPES, AutoMooncake(; config=nothing)) end # Check that ADTypeCheckContext itself works as expected. @@ -199,12 +199,12 @@ end @model test_model() = x ~ Normal(0, 1) tm = test_model() adtypes = ( - Turing.AutoForwardDiff(), - Turing.AutoReverseDiff(), + AutoForwardDiff(), + AutoReverseDiff(), # Don't need to test Mooncake as it doesn't use tracer types ) for actual_adtype in adtypes - sampler = Turing.HMC(0.1, 5; adtype=actual_adtype) + sampler = HMC(0.1, 5; adtype=actual_adtype) for expected_adtype in adtypes contextualised_tm = DynamicPPL.contextualize( tm, ADTypeCheckContext(expected_adtype, tm.context) @@ -212,9 +212,9 @@ end @testset "Expected: $expected_adtype, Actual: $actual_adtype" begin if actual_adtype == expected_adtype # Check that this does not throw an error. 
- Turing.sample(contextualised_tm, sampler, 2) + sample(contextualised_tm, sampler, 2) else - @test_throws AbstractWrongADBackendError Turing.sample( + @test_throws AbstractWrongADBackendError sample( contextualised_tm, sampler, 2 ) end diff --git a/test/mcmc/Inference.jl b/test/mcmc/Inference.jl index bc31ebfa90..b055a441ae 100644 --- a/test/mcmc/Inference.jl +++ b/test/mcmc/Inference.jl @@ -122,8 +122,8 @@ using Turing return global loglike = getlogp(__varinfo__) - lp1 end model = testmodel1([1.0]) - varinfo = Turing.VarInfo(model) - model(varinfo, Turing.SampleFromPrior(), Turing.LikelihoodContext()) + varinfo = DynamicPPL.VarInfo(model) + model(varinfo, DynamicPPL.SampleFromPrior(), DynamicPPL.LikelihoodContext()) @test getlogp(varinfo) == loglike # Test MiniBatchContext @@ -132,13 +132,13 @@ using Turing return x[1] ~ Bernoulli(a) end model = testmodel2([1.0]) - varinfo1 = Turing.VarInfo(model) + varinfo1 = DynamicPPL.VarInfo(model) varinfo2 = deepcopy(varinfo1) - model(varinfo1, Turing.SampleFromPrior(), Turing.LikelihoodContext()) + model(varinfo1, DynamicPPL.SampleFromPrior(), DynamicPPL.LikelihoodContext()) model( varinfo2, - Turing.SampleFromPrior(), - Turing.MiniBatchContext(Turing.LikelihoodContext(), 10), + DynamicPPL.SampleFromPrior(), + DynamicPPL.MiniBatchContext(DynamicPPL.LikelihoodContext(), 10), ) @test isapprox(getlogp(varinfo2) / getlogp(varinfo1), 10) end @@ -623,20 +623,20 @@ using Turing @model function e(x=1.0) return x ~ Normal() end - evi = Turing.VarInfo(e()) + evi = DynamicPPL.VarInfo(e()) @test isempty(Turing.Inference.getparams(e(), evi)) @model function f() return x ~ Normal() end - fvi = Turing.VarInfo(f()) + fvi = DynamicPPL.VarInfo(f()) @test only(Turing.Inference.getparams(f(), fvi)) == (@varname(x), fvi[@varname(x)]) @model function g() x ~ Normal() return y ~ Poisson() end - gvi = Turing.VarInfo(g()) + gvi = DynamicPPL.VarInfo(g()) gparams = Turing.Inference.getparams(g(), gvi) @test gparams[1] == (@varname(x), gvi[@varname(x)]) @test gparams[2] == (@varname(y), gvi[@varname(y)]) diff --git a/test/mcmc/external_sampler.jl b/test/mcmc/external_sampler.jl index 95cf304378..d008d800db 100644 --- a/test/mcmc/external_sampler.jl +++ b/test/mcmc/external_sampler.jl @@ -6,7 +6,6 @@ using Distributions: sample using Distributions.FillArrays: Zeros using DynamicPPL: DynamicPPL using ForwardDiff: ForwardDiff -using LinearAlgebra: I using LogDensityProblems: LogDensityProblems using Random: Random using ReverseDiff: ReverseDiff @@ -15,7 +14,7 @@ using Test: @test, @test_throws, @testset using Turing using Turing.Inference: AdvancedHMC -function initialize_nuts(model::Turing.Model) +function initialize_nuts(model::DynamicPPL.Model) # Create a linked varinfo vi = DynamicPPL.VarInfo(model) linked_vi = DynamicPPL.link!!(vi, model) diff --git a/test/mcmc/gibbs.jl b/test/mcmc/gibbs.jl index c0b2abba88..19b015367c 100644 --- a/test/mcmc/gibbs.jl +++ b/test/mcmc/gibbs.jl @@ -74,7 +74,7 @@ end target_vns = collect(target_vns) local_varinfo = DynamicPPL.subset(global_varinfo, target_vns) ctx = Turing.Inference.GibbsContext( - target_vns, Ref(global_varinfo), Turing.DefaultContext() + target_vns, Ref(global_varinfo), DynamicPPL.DefaultContext() ) # Check that the correct varnames are conditioned, and that getting their @@ -151,7 +151,7 @@ end model::DynamicPPL.Model, sampler::DynamicPPL.Sampler{<:AlgWrapper}, state, - params::Turing.AbstractVarInfo, + params::DynamicPPL.AbstractVarInfo, ) return Inference.setparams_varinfo!!(model, unwrap_sampler(sampler), state, params) 
end @@ -183,7 +183,7 @@ end return AbstractMCMC.step(rng, model, unwrap_sampler(sampler), args...; kwargs...) end - function Turing.DynamicPPL.initialstep( + function DynamicPPL.initialstep( rng::Random.AbstractRNG, model::DynamicPPL.Model, sampler::DynamicPPL.Sampler{<:AlgWrapper}, @@ -191,7 +191,7 @@ end kwargs..., ) capture_targets_and_algs(sampler.alg.inner, model.context) - return Turing.DynamicPPL.initialstep( + return DynamicPPL.initialstep( rng, model, unwrap_sampler(sampler), args...; kwargs... ) end @@ -398,7 +398,7 @@ end @varname(m) => RepeatSampler(PG(10), 2), ) for s in (s1, s2, s3, s4, s5, s6) - @test DynamicPPL.alg_str(Turing.Sampler(s)) == "Gibbs" + @test DynamicPPL.alg_str(DynamicPPL.Sampler(s)) == "Gibbs" end @test sample(gdemo_default, s1, N) isa MCMCChains.Chains @@ -408,7 +408,7 @@ end @test sample(gdemo_default, s5, N) isa MCMCChains.Chains @test sample(gdemo_default, s6, N) isa MCMCChains.Chains - g = Turing.Sampler(s3) + g = DynamicPPL.Sampler(s3) @test sample(gdemo_default, g, N) isa MCMCChains.Chains end @@ -493,7 +493,7 @@ end @nospecialize function AbstractMCMC.bundle_samples( samples::Vector, ::typeof(model), - ::Turing.Sampler{<:Gibbs}, + ::DynamicPPL.Sampler{<:Gibbs}, state, ::Type{MCMCChains.Chains}; kwargs..., @@ -594,11 +594,11 @@ end @testset verbose = true "$(model.f)" for model in DynamicPPL.TestUtils.DEMO_MODELS vns = DynamicPPL.TestUtils.varnames(model) samplers = [ - Turing.Gibbs(@varname(s) => NUTS(), @varname(m) => NUTS()), - Turing.Gibbs(@varname(s) => NUTS(), @varname(m) => HMC(0.01, 4)), - Turing.Gibbs(@varname(s) => NUTS(), @varname(m) => ESS()), - Turing.Gibbs(@varname(s) => HMC(0.01, 4), @varname(m) => MH()), - Turing.Gibbs(@varname(s) => MH(), @varname(m) => HMC(0.01, 4)), + Gibbs(@varname(s) => NUTS(), @varname(m) => NUTS()), + Gibbs(@varname(s) => NUTS(), @varname(m) => HMC(0.01, 4)), + Gibbs(@varname(s) => NUTS(), @varname(m) => ESS()), + Gibbs(@varname(s) => HMC(0.01, 4), @varname(m) => MH()), + Gibbs(@varname(s) => MH(), @varname(m) => HMC(0.01, 4)), ] @testset "$sampler" for sampler in samplers @@ -634,7 +634,7 @@ end # Sampler to use for Gibbs components. 
hmc = HMC(0.1, 32) - sampler = Turing.Gibbs(@varname(s) => hmc, @varname(m) => hmc) + sampler = Gibbs(@varname(s) => hmc, @varname(m) => hmc) Random.seed!(42) chain = sample( model, @@ -685,7 +685,7 @@ end @testset "with both `s` and `m` as random" begin model = gdemo(1.5, 2.0) vns = (@varname(s), @varname(m)) - alg = Turing.Gibbs(vns => MH()) + alg = Gibbs(vns => MH()) # `step` transition, state = AbstractMCMC.step(rng, model, DynamicPPL.Sampler(alg)) @@ -706,7 +706,7 @@ end @testset "without `m` as random" begin model = gdemo(1.5, 2.0) | (m=7 / 6,) vns = (@varname(s),) - alg = Turing.Gibbs(vns => MH()) + alg = Gibbs(vns => MH()) # `step` transition, state = AbstractMCMC.step(rng, model, DynamicPPL.Sampler(alg)) @@ -756,7 +756,7 @@ end @testset "CSMC + ESS" begin rng = Random.default_rng() model = MoGtest_default - alg = Turing.Gibbs( + alg = Gibbs( (@varname(z1), @varname(z2), @varname(z3), @varname(z4)) => CSMC(15), @varname(mu1) => ESS(), @varname(mu2) => ESS(), @@ -788,9 +788,7 @@ end @testset "CSMC + ESS (usage of implicit varname)" begin rng = Random.default_rng() model = MoGtest_default_z_vector - alg = Turing.Gibbs( - @varname(z) => CSMC(15), @varname(mu1) => ESS(), @varname(mu2) => ESS() - ) + alg = Gibbs(@varname(z) => CSMC(15), @varname(mu1) => ESS(), @varname(mu2) => ESS()) vns = ( @varname(z[1]), @varname(z[2]), @@ -836,9 +834,7 @@ end ), ] @testset "$(sampler_inner)" for sampler_inner in samplers_inner - sampler = Turing.Gibbs( - @varname(m1) => sampler_inner, @varname(m2) => sampler_inner - ) + sampler = Gibbs(@varname(m1) => sampler_inner, @varname(m2) => sampler_inner) Random.seed!(42) chain = sample( model, sampler, 1000; discard_initial=1000, thinning=10, n_adapts=0 diff --git a/test/mcmc/hmc.jl b/test/mcmc/hmc.jl index 84d0b58233..0b57657484 100644 --- a/test/mcmc/hmc.jl +++ b/test/mcmc/hmc.jl @@ -147,7 +147,7 @@ using Turing @test DynamicPPL.alg_str(sampler) == "HMCDA" @test isa(alg, HMCDA) - @test isa(sampler, Sampler{<:Turing.Hamiltonian}) + @test isa(sampler, Sampler{<:Turing.Inference.Hamiltonian}) end @testset "nuts inference" begin @@ -216,7 +216,7 @@ using Turing @model function demo_warn_initial_params() x ~ Normal() if (attempt += 1) < 30 - Turing.@addlogprob! -Inf + @addlogprob! -Inf end end @@ -231,7 +231,7 @@ using Turing @testset "error for impossible model" begin @model function demo_impossible() x ~ Normal() - Turing.@addlogprob! -Inf + @addlogprob! -Inf end @test_throws ErrorException sample(demo_impossible(), NUTS(), 5) @@ -260,7 +260,7 @@ using Turing ) catch e if e isa DomainError - Turing.@addlogprob! -Inf + @addlogprob! 
-Inf return nothing else rethrow() diff --git a/test/mcmc/mh.jl b/test/mcmc/mh.jl index d190e589a5..2e9f108470 100644 --- a/test/mcmc/mh.jl +++ b/test/mcmc/mh.jl @@ -118,7 +118,7 @@ GKernel(var) = (x) -> Normal(x, sqrt.(var)) model = M(zeros(2), I, 1) sampler = Inference.Sampler(MH()) - dt, vt = Inference.dist_val_tuple(sampler, Turing.VarInfo(model)) + dt, vt = Inference.dist_val_tuple(sampler, DynamicPPL.VarInfo(model)) @test dt[:z] isa AdvancedMH.StaticProposal{false,<:MvNormal} @test dt[:m] isa diff --git a/test/mcmc/sghmc.jl b/test/mcmc/sghmc.jl index 3796ea915a..1671362ed4 100644 --- a/test/mcmc/sghmc.jl +++ b/test/mcmc/sghmc.jl @@ -17,13 +17,13 @@ using Turing @testset "sghmc constructor" begin alg = SGHMC(; learning_rate=0.01, momentum_decay=0.1) @test alg isa SGHMC - sampler = Turing.Sampler(alg) - @test sampler isa Turing.Sampler{<:SGHMC} + sampler = DynamicPPL.Sampler(alg) + @test sampler isa DynamicPPL.Sampler{<:SGHMC} alg = SGHMC(; learning_rate=0.01, momentum_decay=0.1) @test alg isa SGHMC - sampler = Turing.Sampler(alg) - @test sampler isa Turing.Sampler{<:SGHMC} + sampler = DynamicPPL.Sampler(alg) + @test sampler isa DynamicPPL.Sampler{<:SGHMC} end @testset "sghmc inference" begin @@ -38,13 +38,13 @@ end @testset "sgld constructor" begin alg = SGLD(; stepsize=PolynomialStepsize(0.25)) @test alg isa SGLD - sampler = Turing.Sampler(alg) - @test sampler isa Turing.Sampler{<:SGLD} + sampler = DynamicPPL.Sampler(alg) + @test sampler isa DynamicPPL.Sampler{<:SGLD} alg = SGLD(; stepsize=PolynomialStepsize(0.25)) @test alg isa SGLD - sampler = Turing.Sampler(alg) - @test sampler isa Turing.Sampler{<:SGLD} + sampler = DynamicPPL.Sampler(alg) + @test sampler isa DynamicPPL.Sampler{<:SGLD} end @testset "sgld inference" begin rng = StableRNG(1) diff --git a/test/optimisation/Optimisation.jl b/test/optimisation/Optimisation.jl index d081fbd0d7..2acb7edc55 100644 --- a/test/optimisation/Optimisation.jl +++ b/test/optimisation/Optimisation.jl @@ -647,7 +647,7 @@ using Turing @model function saddle_model() x ~ Normal(0, 1) y ~ Normal(x, 1) - Turing.@addlogprob! x^2 - y^2 + @addlogprob! x^2 - y^2 return nothing end m = saddle_model() diff --git a/test/runtests.jl b/test/runtests.jl index dd443d7923..9fec2f737e 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -70,10 +70,6 @@ end end end - @testset "variational optimisers" begin - @timeit_include("variational/optimisers.jl") - end - @testset "stdlib" verbose = true begin @timeit_include("stdlib/distributions.jl") @timeit_include("stdlib/RandomMeasures.jl") diff --git a/test/skipped/advi_demo.jl b/test/skipped/advi_demo.jl deleted file mode 100644 index a9cbabd0fd..0000000000 --- a/test/skipped/advi_demo.jl +++ /dev/null @@ -1,161 +0,0 @@ -using Random - -using Bijectors -using Turing -using Turing: Variational - -using StatsFuns: softplus, invsoftplus -f = softplus -f⁻¹ = invsoftplus - -# setup for plotting -using Plots, StatsPlots, LaTeXStrings -pyplot() - -# used to compute closed form expression of posterior -using ConjugatePriors - -# define Turing model -@model function model(x) - s ~ InverseGamma(2, 3) - m ~ Normal(0.0, sqrt(s)) # `Normal(μ, σ)` has mean μ and variance σ², i.e. parametrize with std. 
not variance - - for i in 1:length(x) - x[i] ~ Normal(m, sqrt(s)) - end -end - -const seeds = [125, 245, 3] -const ad_modes = [:forwarddiff, :reversediff, :tracker] - -for seed in seeds - @info seed - - for ad_mode in ad_modes - @info ad_mode - setadbackend(ad_mode) - - # set random seed - Random.seed!(seed) - - # generate data - x = randn(1, 2000) - - # construct model - m = model(x) - - # ADVI - opt = Variational.TruncatedADAGrad() # optimizer - - advi = ADVI(10, 100) # <: VariationalInference - q = Variational.meanfield(m) # => <: VariationalPosterior - - elbo = Variational.ELBO() # <: VariationalObjective - - μ, σs = params(q) - θ = vcat(μ, f⁻¹.(σs)) - - history = [elbo(advi, q, m, 1000)] - - # construct animation - anim = @animate for j in 1:25 - # global q - Variational.optimize!(elbo, advi, q, m, θ; optimizer=opt) - μ, ω = θ[1:length(q)], θ[(length(q) + 1):end] - - q = Variational.update(q, μ, f.(ω)) - samples = rand(q, 2000) - - # quick check - println([mean(samples; dims=2), [var(x), mean(x)]]) - - # plotting code assumes (samples, dim) shape so we just transpose - samples = transpose(samples) - - # closed form computation - # notation mapping has been verified by explicitly computing expressions - # in "Conjugate Bayesian analysis of the Gaussian distribution" by Murphy - μ₀ = 0.0 # => μ - κ₀ = 1.0 # => ν, which scales the precision of the Normal - α₀ = 2.0 # => "shape" - β₀ = 3.0 # => "rate", which is 1 / θ, where θ is "scale" - - # prior - pri = NormalGamma(μ₀, κ₀, α₀, β₀) - - # posterior - post = posterior(pri, Normal, x) - - # marginal distribution of τ = 1 / σ² - # Eq. (90) in "Conjugate Bayesian analysis of the Gaussian distribution" by Murphy - # `scale(post)` = θ - p_τ = Gamma(post.shape, scale(post)) - p_σ²_pdf = z -> pdf(p_τ, 1 / z) # τ => 1 / σ² - - # marginal of μ - # Eq. (91) in "Conjugate Bayesian analysis of the Gaussian distribution" by Murphy - p_μ = TDist(2 * post.shape) - - μₙ = post.mu # μ → μ - κₙ = post.nu # κ → ν - αₙ = post.shape # α → shape - βₙ = post.rate # β → rate - - # numerically more stable but doesn't seem to have effect; issue is probably internal to - # `pdf` which needs to compute ≈ Γ(1000) - p_μ_pdf = - z -> exp( - logpdf( - p_μ, - (z - μₙ) * exp(-0.5 * log(βₙ) + 0.5 * log(αₙ) + 0.5 * log(κₙ)), - ), - ) - # p_μ_pdf1 = z -> pdf(p_μ, (z - μₙ) / √(βₙ / (αₙ * κₙ))) - - # posterior plots - p1 = plot() - density!(samples[:, 1]; label="s (ADVI)", color=:blue, linestyle=:dash) - histogram!(samples[:, 1]; label="", normed=true, alpha=0.3, color=:blue) - - # normalize using Riemann approx. because of (almost certainly) numerical issues - Δ = 0.001 - r = 0.75:0.001:1.50 - norm_const = sum(p_σ²_pdf.(r) .* Δ) - plot!(r, p_σ²_pdf; label="s (posterior)", color=:red) - vline!([var(x)]; label="s (data)", linewidth=1.5, color=:black, alpha=0.7) - xlims!(0.5, 1.5) - title!("$(j * advi.max_iters) steps") - - p2 = plot() - density!(samples[:, 2]; label="m (ADVI)", color=:blue, linestyle=:dash) - histogram!(samples[:, 2]; label="", normed=true, alpha=0.3, color=:blue) - - # normalize using Riemann approx. because of (almost certainly) numerical issues - Δ = 0.0001 - r = (-0.1 + mean(x)):Δ:(0.1 + mean(x)) - norm_const = sum(p_μ_pdf.(r) .* Δ) - plot!(r, z -> p_μ_pdf(z) / norm_const; label="m (posterior)", color=:red) - vline!([mean(x)]; label="m (data)", linewidth=1.5, color=:black, alpha=0.7) - - xlims!(-0.25, 0.25) - - # visualize evolution of objective wrt. 
optimization iterations - obj = elbo(advi, q, m, 1000) - @info "ELBO" obj - push!(history, obj) - p3 = plot() - plot!(1:(advi.max_iters):(length(history) * advi.max_iters), history; label="") - title!("ELBO = $obj") - - # plot the latest 25 objective evaluations to visualize trend - p4 = plot() - plot!(history[max(1, end - 10):end]; label="") - - p = plot(p1, p2, p3, p4; layout=(4, 1)) - - @info "[$j] Done" - p - end - gif(anim, "advi_w_elbo_fps15_$(seed)_$(ad_mode).gif"; fps=15) - end -end diff --git a/test/skipped/bayes_lr.jl b/test/skipped/bayes_lr.jl deleted file mode 100644 index 6b9b4d31e7..0000000000 --- a/test/skipped/bayes_lr.jl +++ /dev/null @@ -1,80 +0,0 @@ -using ReverseDiff: GradientTape, GradientConfig, gradient, gradient!, compile -using Turing: _hmc_step -using Turing - -@model function bayes_lr(xs, ys) - N = length(xs) - @assert N == length(ys) - - s = 1 - β ~ Normal(0, 1) - - for n in 1:N - ys[n] ~ Normal(xs[n] * β, sqrt(s)) - end -end - -N = 100 -xs = collect(range(-10; stop=10, length=N)) -s = 1 -β = rand(Normal(0, 1)) -ys = xs * β + rand(Normal(0, sqrt(s)), N) - -println("s=$s, β=$β") - -mf = bayes_lr(xs, ys) -chn = sample(mf, HMC(0.005, 3), 2000) - -println("mean of β: ", mean(chn[1000:end, :β])) - -θ_dim = 1 -function lj_func(θ) - N = length(xs) - _lj = zero(Real) - - s = 1 - - β = θ[1] - _lj += logpdf(Normal(0, 1), β) - for n in 1:N - _lj += logpdf(Normal(xs[n] * β, sqrt(s)), ys[n]) - end - - return _lj -end - -neg_lj_func(θ) = -lj_func(θ) -const f_tape = GradientTape(neg_lj_func, randn(θ_dim)) -const compiled_f_tape = compile(f_tape) - -function grad_func(θ) - inputs = θ - results = similar(θ) - all_results = DiffResults.GradientResult(results) - - ReverseDiff.gradient!(all_results, compiled_f_tape, inputs) - - neg_lj = all_results.value - grad, = all_results.derivs - - return -neg_lj, grad -end - -std = ones(θ_dim) -θ = randn(θ_dim) -lj = lj_func(θ) - -chn = [] -accept_num = 1 - -total_num = 2000 -for iter in 1:total_num - global θ, chn, lj, lj_func, grad_func, std, accept_num - push!(chn, θ) - θ, lj, is_accept, τ_valid, α = _hmc_step(θ, lj, lj_func, grad_func, 3, 0.005, std) - accept_num += is_accept - # if (iter % 50 == 0) println(θ) end -end - -@show mean(chn[1000:end]), lj -@show accept_num / total_num diff --git a/test/skipped/dual_averaging.jl b/test/skipped/dual_averaging.jl deleted file mode 100644 index 897abe8d90..0000000000 --- a/test/skipped/dual_averaging.jl +++ /dev/null @@ -1,10 +0,0 @@ -function _adapt_ϵ(logϵ, Hbar, logϵbar, da_stat, m, M_adapt, δ, μ; γ=0.05, t0=10, κ=0.75) - if m <= M_adapt - Hbar = (1.0 - 1.0 / (m + t0)) * Hbar + (1 / (m + t0)) * (δ - da_stat) - logϵ = μ - sqrt(m) / γ * Hbar - logϵbar = m^(-κ) * logϵ + (1 - m^(-κ)) * logϵbar - else - logϵ = logϵbar - end - return logϵ, Hbar, logϵbar -end diff --git a/test/skipped/explicit_ret.jl b/test/skipped/explicit_ret.jl deleted file mode 100644 index 7472e8c319..0000000000 --- a/test/skipped/explicit_ret.jl +++ /dev/null @@ -1,19 +0,0 @@ -using Turing -using Test - -@model function test_ex_rt() - x ~ Normal(10, 1) - y ~ Normal(x / 2, 1) - z = y + 1 - x = x - 1 - return x, y, z -end - -mf = test_ex_rt() - -for alg in - [HMC(0.2, 3), PG(20, 2000), SMC(), IS(10000), Gibbs(:x => PG(20, 1), :y => HMC(0.2, 3))] - chn = sample(mf, alg) - @test mean(chn[:x]) ≈ 10.0 atol = 0.2 - @test mean(chn[:y]) ≈ 5.0 atol = 0.2 -end diff --git a/test/skipped/gdemo.jl b/test/skipped/gdemo.jl deleted file mode 100644 index 84242067eb..0000000000 --- a/test/skipped/gdemo.jl +++ /dev/null @@ -1,55 +0,0 @@ 
-include("unit_test_helper.jl") - -# Turing - -using Turing - -@model function gdemo() - s ~ InverseGamma(2, 3) - m ~ Normal(0, sqrt(s)) - 1.5 ~ Normal(m, sqrt(s)) - 2.0 ~ Normal(m, sqrt(s)) - return s, m -end - -# Plain Julia - -using ReverseDiff: GradientTape, GradientConfig, gradient, gradient!, compile -using Turing: invlink, logpdf - -θ_dim = 2 -function lj_func(θ) - _lj = zero(Real) - - d_s = InverseGamma(2, 3) - s = invlink(d_s, θ[1]) - _lj += logpdf(d_s, s, true) - m = θ[2] - _lj += logpdf(Normal(0, sqrt(s)), m) - - _lj += logpdf(Normal(m, sqrt(s)), 1.5) - _lj += logpdf(Normal(m, sqrt(s)), 2.0) - - return _lj -end - -neg_lj_func(θ) = -lj_func(θ) -const f_tape = GradientTape(neg_lj_func, randn(θ_dim)) -const compiled_f_tape = compile(f_tape) - -function grad_func(θ) - inputs = θ - results = similar(θ) - all_results = DiffResults.GradientResult(results) - - gradient!(all_results, compiled_f_tape, inputs) - - neg_lj = all_results.value - grad, = all_results.derivs - - return -neg_lj, grad -end - -# Unit test for gradient - -test_grad(gdemo, grad_func; trans=[1]) diff --git a/test/skipped/gdemo_hmc.jl b/test/skipped/gdemo_hmc.jl deleted file mode 100644 index 855b06f894..0000000000 --- a/test/skipped/gdemo_hmc.jl +++ /dev/null @@ -1,44 +0,0 @@ -using Turing: _hmc_step - -include("unit_test_helper.jl") -include("gdemo.jl") - -# Turing - -mf = gdemo() -chn = sample(mf, HMC(0.05, 5), 2000) - -println("mean of s: ", mean(chn[1000:end, :s])) -println("mean of m: ", mean(chn[1000:end, :m])) - -# Plain Julia - -std = ones(θ_dim) -θ = randn(θ_dim) -lj = lj_func(θ) - -chn = Dict(:θ => Vector{Vector{Float64}}(), :logϵ => Vector{Float64}()) -accept_num = 0 - -function dummy_print(args...) - return nothing -end - -totla_num = 5000 -for iter in 1:totla_num - push!(chn[:θ], θ) - θ, lj, is_accept, τ_valid, α = _hmc_step(θ, lj, lj_func, grad_func, 5, 0.05, std) - accept_num += is_accept -end - -@show lj -samples_s = exp.(map(x -> x[1], chn[:θ])) -samples_m = map(x -> x[2], chn[:θ]) -@show mean(samples_s[1000:end]) -@show mean(samples_m[1000:end]) -@show std(samples_s[1000:end]) -@show std(samples_m[1000:end]) - -@show accept_num / totla_num - -# Unit tests diff --git a/test/skipped/gdemo_nuts.jl b/test/skipped/gdemo_nuts.jl deleted file mode 100644 index ecc4d5c7f8..0000000000 --- a/test/skipped/gdemo_nuts.jl +++ /dev/null @@ -1,70 +0,0 @@ -using Main.Test -using Turing: _nuts_step - -include("unit_test_helper.jl") -include("gdemo.jl") - -include("dual_averaging.jl") - -# Turing - -# mf = gdemo() -# chn = sample(mf, HMC(0.05, 5), 5000) - -# println("mean of m: $(mean(chn[:m][1000:end]))") - -# Plain Julia - -M_adapt = 1000 -ϵ0 = 0.25 -logϵ = log(ϵ0) -μ = log(10 * ϵ0) -logϵbar = log(1) -Hbar = 0 - -δ = 0.75 - -for test_id in 1:2 - test_name = "$test_id. NUTS " * (test_id == 1 ? "without DA" : "with DA") - - @testset "$test_name" begin - std = ones(θ_dim) - θ = randn(θ_dim) - lj = lj_func(θ) - - chn = Dict(:θ => Vector{Vector{Float64}}(), :logϵ => Vector{Float64}()) - - function dummy_print(args...) 
- return nothing - end - - println("Start to run NUTS") - - totla_num = 10000 - for iter in 1:totla_num - global logϵ, lj_func, grad_func, M_adapt, δ, μ - θ, da_stat = _nuts_step(θ, exp(logϵ), lj_func, grad_func, std) - if test_id == 1 - logϵ, Hbar, logϵbar = _adapt_ϵ( - logϵ, Hbar, logϵbar, da_stat, iter, M_adapt, δ, μ - ) - end - - push!(chn[:θ], θ) - push!(chn[:logϵ], logϵ) - # if (iter % 50 == 0) println(θ) end - end - - samples_s = exp.(map(x -> x[1], chn[:θ])) - samples_m = map(x -> x[2], chn[:θ]) - @show mean(samples_s[1000:end]) - @test mean(samples_s[1000:end]) ≈ 49 / 24 atol = 0.2 - @show mean(samples_m[1000:end]) - @test mean(samples_m[1000:end]) ≈ 7 / 6 atol = 0.2 - - @show std(samples_s[1000:end]) - @show std(samples_m[1000:end]) - - @show mean(exp.(chn[:logϵ])) - end -end diff --git a/test/skipped/hmcda_geweke.jl b/test/skipped/hmcda_geweke.jl deleted file mode 100644 index 8e2f209864..0000000000 --- a/test/skipped/hmcda_geweke.jl +++ /dev/null @@ -1,98 +0,0 @@ -using StatsBase, Turing -using Gadfly - -# include("ASCIIPlot.jl") -import Gadfly.ElementOrFunction - -# First add a method to the basic Gadfly.plot function for QQPair types (generated by Distributions.qqbuild()) -function Gadfly.plot(qq::QQPair, elements::ElementOrFunction...) - return Gadfly.plot(; - x=qq.qx, y=qq.qy, Geom.point, Theme(; highlight_width=0px), elements... - ) -end - -# Now some shorthand functions -qqplot(x, y, elements::ElementOrFunction...) = Gadfly.plot(qqbuild(x, y), elements...) -function qqnorm(x, elements::ElementOrFunction...) - return qqplot( - Normal(), - x, - Guide.xlabel("Theoretical Normal quantiles"), - Guide.ylabel("Observed quantiles"), - elements..., - ) -end - -NSamples = 5000 - -@model function gdemo_fw() - # s ~ InverseGamma(2,3) - s = 1 - m ~ Normal(0, sqrt(s)) - return y ~ MvNormal([m; m; m], [sqrt(s) 0 0; 0 sqrt(s) 0; 0 0 sqrt(s)]) -end - -@model function gdemo_bk(x) - # Backward Step 1: theta ~ theta | x - # s ~ InverseGamma(2,3) - s = 1 - m ~ Normal(0, sqrt(s)) - x ~ MvNormal([m; m; m], [sqrt(s) 0 0; 0 sqrt(s) 0; 0 0 sqrt(s)]) - # Backward Step 2: x ~ x | theta - return y ~ MvNormal([m; m; m], [sqrt(s) 0 0; 0 sqrt(s) 0; 0 0 sqrt(s)]) -end - -fw = PG(50, NSamples) -# bk = Gibbs(10, PG(10,10, :s, :y), HMC(0.25, 5, :m)); -bk = HMCDA(0.65, 0.2); - -s = sample(gdemo_fw(), fw); -# describe(s) - -N = div(NSamples, 50) - -x = s[1, :y] -s_bk = Array{Turing.Chain}(undef, N) - -simple_logger = Base.CoreLogging.SimpleLogger(stderr, Base.CoreLogging.Debug) -Base.CoreLogging.with_logger(simple_logger) do - global x, bk, s_bk - i = 1 - while i <= N - s_bk[i] = sample(gdemo_bk(x), bk) - x = s_bk[i][end, :y] - i += 1 - end -end - -s2 = reduce(vcat, s_bk); -# describe(s2) - -using UnicodePlots - -qqm = qqbuild(s[:m], s2[:m]) -show(scatterplot(qqm.qx, qqm.qy; title="QQ plot for m", canvas=DotCanvas)) -show( - scatterplot( - qqm.qx[51:(end - 50)], - qqm.qy[51:(end - 50)]; - title="QQ plot for m (removing first and last 50 quantiles):", - canvas=DotCanvas, - ), -) - -qqm = qqbuild(s[:m], s2[:m]) -X = qqm.qx -y = qqm.qy -slope = (1 / (transpose(X) * X)[1] * transpose(X) * y)[1] - -print(" slopeₛ = $slope ≈ 1 (ϵ = 0.1)") -ans1 = abs(slope - 1.0) <= 0.1 -if ans1 - printstyled(" ✓\n"; color=:green) -else - printstyled(" X\n"; color=:red) - printstyled(" slope = $slope, diff = $(slope - 1.0)\n"; color=:red) -end - -# qqs = qqbuild(s[:s], s2[:s]) diff --git a/test/skipped/nuts_geweke.jl b/test/skipped/nuts_geweke.jl deleted file mode 100644 index d312be8841..0000000000 --- 
a/test/skipped/nuts_geweke.jl +++ /dev/null @@ -1,120 +0,0 @@ -using StatsBase, Turing -using Gadfly - -# include("ASCIIPlot.jl") -import Gadfly.ElementOrFunction - -# First add a method to the basic Gadfly.plot function for QQPair types (generated by Distributions.qqbuild()) -function Gadfly.plot(qq::QQPair, elements::ElementOrFunction...) - return Gadfly.plot(; - x=qq.qx, y=qq.qy, Geom.point, Theme(; highlight_width=0px), elements... - ) -end - -# Now some shorthand functions -qqplot(x, y, elements::ElementOrFunction...) = Gadfly.plot(qqbuild(x, y), elements...) -function qqnorm(x, elements::ElementOrFunction...) - return qqplot( - Normal(), - x, - Guide.xlabel("Theoretical Normal quantiles"), - Guide.ylabel("Observed quantiles"), - elements..., - ) -end - -NSamples = 5000 - -@model function gdemo_fw() - s ~ InverseGamma(2, 3) - # s = 1 - m ~ Normal(0, sqrt(s)) - return y ~ MvNormal([m; m; m], [sqrt(s) 0 0; 0 sqrt(s) 0; 0 0 sqrt(s)]) -end - -@model function gdemo_bk(x) - # Backward Step 1: theta ~ theta | x - s ~ InverseGamma(2, 3) - # s = 1 - m ~ Normal(0, sqrt(s)) - x ~ MvNormal([m; m; m], [sqrt(s) 0 0; 0 sqrt(s) 0; 0 0 sqrt(s)]) - # Backward Step 2: x ~ x | theta - return y ~ MvNormal([m; m; m], [sqrt(s) 0 0; 0 sqrt(s) 0; 0 0 sqrt(s)]) -end - -fw = IS(NSamples) -# bk = Gibbs(10, PG(10,10, :s, :y), HMC(1, 0.25, 5, :m)); -bk_sample_n = 20 -bk = NUTS(bk_sample_n, 100, 0.65); - -s = sample(gdemo_fw(), fw); -# describe(s) - -N = div(NSamples, bk_sample_n) - -x = s[1, :y] -s_bk = Array{MCMCChains.Chains}(undef, N) - -simple_logger = Base.CoreLogging.SimpleLogger(stderr, Base.CoreLogging.Debug) -with_logger(simple_logger) do - i = 1 - while i <= N - s_bk[i] = sample(gdemo_bk(x), bk) - x = s_bk[i][end, :y] - i += 1 - end -end - -s2 = chainscat(s_bk...) 
-# describe(s2) - -# qqplot(s[:m], s2[:m]) -# qqplot(s[:s], s2[:s]) - -qqm = qqbuild(s[:m], s2[:m]) - -using UnicodePlots - -qqm = qqbuild(s[:m], s2[:m]) -show(scatterplot(qqm.qx, qqm.qy; title="QQ plot for m", canvas=DotCanvas)) -show( - scatterplot( - qqm.qx[51:(end - 50)], - qqm.qy[51:(end - 50)]; - title="QQ plot for m (removing first and last 50 quantiles):", - canvas=DotCanvas, - ), -) -show(scatterplot(qqm.qx, qqm.qy; title="QQ plot for m")) -show( - scatterplot( - qqm.qx[51:(end - 50)], - qqm.qy[51:(end - 50)]; - title="QQ plot for m (removing first and last 50 quantiles):", - ), -) - -qqs = qqbuild(s[:s].value, s2[:s].value) -show(scatterplot(qqs.qx, qqs.qy; title="QQ plot for s")) -show( - scatterplot( - qqs.qx[51:(end - 50)], - qqs.qy[51:(end - 50)]; - title="QQ plot for s (removing first and last 50 quantiles):", - ), -) - -X = qqm.qx -y = qqm.qy -slope = (1 / (transpose(X) * X)[1] * transpose(X) * y)[1] - -print(" slopeₛ = $slope ≈ 1 (ϵ = 0.1)") -ans1 = abs(slope - 1.0) <= 0.1 -if ans1 - printstyled(" ✓\n"; color=:green) -else - printstyled(" X\n"; color=:red) - printstyled(" slope = $slope, diff = $(slope - 1.0)\n"; color=:red) -end - -# qqs = qqbuild(s[:s], s2[:s]) diff --git a/test/skipped/opt_param_of_dist.jl b/test/skipped/opt_param_of_dist.jl deleted file mode 100644 index 3c8e4d2dcd..0000000000 --- a/test/skipped/opt_param_of_dist.jl +++ /dev/null @@ -1,20 +0,0 @@ -using Turing -using Test - -@model testassume begin - x ~ Bernoulli(1; :static=true) - y ~ Bernoulli(x / 2; :param=true) - 1 ~ Normal(0, 1; :static=true) - 2 ~ Normal(0, 1; :param=true) - y, x -end - -s = SMC() - -res = sample(testassume, s) - -# check that x is always 1 -@test all(isone, res[:x]) - -# check that the mean of y is between 0.4 and 0.6 -@test mean(res[:y]) ≈ 0.5 atol = 0.1 diff --git a/test/skipped/run_all.jl b/test/skipped/run_all.jl deleted file mode 100644 index 6ffe6e34b7..0000000000 --- a/test/skipped/run_all.jl +++ /dev/null @@ -1,5 +0,0 @@ -include("leapfrog.jl") - -include("simple_gauss_hmc.jl") -include("gdemo_hmc.jl") -include("gdemo_nuts.jl") diff --git a/test/skipped/simple_gauss.jl b/test/skipped/simple_gauss.jl deleted file mode 100644 index 14f36dce1f..0000000000 --- a/test/skipped/simple_gauss.jl +++ /dev/null @@ -1,52 +0,0 @@ -include("unit_test_helper.jl") - -# Turing - -using Turing - -@model function simple_gauss() - s = 1 - m ~ Normal(0, sqrt(s)) - 2.0 ~ Normal(m, sqrt(s)) - return 2.5 ~ Normal(m, sqrt(s)) -end - -# Plain Julia - -using ReverseDiff: GradientTape, GradientConfig, gradient, gradient!, compile - -θ_dim = 1 -function lj_func(θ) - _lj = zero(Real) - - s = 1 - - m = θ[1] - _lj += logpdf(Normal(0, sqrt(s)), m) - - _lj += logpdf(Normal(m, sqrt(s)), 2.0) - _lj += logpdf(Normal(m, sqrt(s)), 2.5) - - return _lj -end - -neg_lj_func(θ) = -lj_func(θ) -const f_tape = GradientTape(neg_lj_func, randn(θ_dim)) -const compiled_f_tape = compile(f_tape) - -function grad_func(θ) - inputs = θ - results = similar(θ) - all_results = DiffResults.GradientResult(results) - - gradient!(all_results, compiled_f_tape, inputs) - - neg_lj = all_results.value - grad, = all_results.derivs - - return -neg_lj, grad -end - -# Unit test for gradient - -test_grad(simple_gauss, grad_func) diff --git a/test/skipped/simple_gauss_hmc.jl b/test/skipped/simple_gauss_hmc.jl deleted file mode 100644 index ba1f9dd231..0000000000 --- a/test/skipped/simple_gauss_hmc.jl +++ /dev/null @@ -1,36 +0,0 @@ -using Turing: _hmc_step - -include("unit_test_helper.jl") -include("simple_gauss.jl") - -# Turing - -mf = 
simple_gauss() -chn = sample(mf, HMC(10000, 0.05, 10)) - -println("mean of m: $(mean(chn[1000:end, :m]))") - -# Plain Julia - -std = ones(θ_dim) -θ = randn(θ_dim) -lj = lj_func(θ) - -chn = Dict(:θ => Vector{Vector{Float64}}(), :logϵ => Vector{Float64}()) -accept_num = 1 - -totla_num = 10000 -for iter in 1:totla_num - push!(chn[:θ], θ) - θ, lj, is_accept, τ_valid, α = _hmc_step(θ, lj, lj_func, grad_func, 10, 0.05, std) - accept_num += is_accept -end - -@show lj -@show mean(chn[:θ]) -samples_first_dim = map(x -> x[1], chn[:θ]) -@show std(samples_first_dim) - -@show accept_num / totla_num - -# Unit tests diff --git a/test/skipped/sv.jl b/test/skipped/sv.jl deleted file mode 100644 index b5fe22e715..0000000000 --- a/test/skipped/sv.jl +++ /dev/null @@ -1,76 +0,0 @@ -using ReverseDiff: GradientTape, GradientConfig, gradient, gradient!, compile -using Turing: _hmc_step -using Turing -using HDF5, JLD -sv_data = load(TPATH * "/example-models/nips-2017/sv-data.jld.data")["data"] - -@model function sv_model(T, y) - ϕ ~ Uniform(-1, 1) - σ ~ truncated(Cauchy(0, 5), 0, Inf) - μ ~ Cauchy(0, 10) - - h = Vector{Real}(T) - h[1] ~ Normal(μ, σ / sqrt(1 - ϕ^2)) - y[1] ~ Normal(0, exp.(h[1] / 2)) - for t in 2:T - h[t] ~ Normal(μ + ϕ * (h[t - 1] - μ), σ) - y[t] ~ Normal(0, exp.(h[t] / 2)) - end -end - -mf = sv_model(; data=sv_data[1]) -chain_nuts = sample(mf, HMC(0.05, 10), 2000) - -println("mean of m: $(mean(chn[1000:end, :μ]))") - -# θ_dim = 1 -# function lj_func(θ) -# _lj = zero(Real) - -# s = 1 - -# m = θ[1] -# _lj += logpdf(Normal(0, sqrt(s)), m) - -# _lj += logpdf(Normal(m, sqrt(s)), 2.0) -# _lj += logpdf(Normal(m, sqrt(s)), 2.5) - -# return _lj -# end - -# neg_lj_func(θ) = -lj_func(θ) -# const f_tape = GradientTape(neg_lj_func, randn(θ_dim)) -# const compiled_f_tape = compile(f_tape) - -# function grad_func(θ) - -# inputs = θ -# results = similar(θ) -# all_results = DiffResults.GradientResult(results) - -# gradient!(all_results, compiled_f_tape, inputs) - -# neg_lj = all_results.value -# grad, = all_results.derivs - -# return -neg_lj, grad - -# end - -# std = ones(θ_dim) -# θ = randn(θ_dim) -# lj = lj_func(θ) - -# chn = [] -# accept_num = 1 - -# totla_num = 5000 -# for iter = 1:totla_num -# push!(chn, θ) -# θ, lj, is_accept, τ_valid, α = _hmc_step(θ, lj, lj_func, grad_func, 5, 0.05, std) -# accept_num += is_accept -# # if (iter % 50 == 0) println(θ) end -# end - -# @show mean(chn), lj -# @show accept_num / totla_num diff --git a/test/skipped/unit_test_helper.jl b/test/skipped/unit_test_helper.jl deleted file mode 100644 index 7dec7757bb..0000000000 --- a/test/skipped/unit_test_helper.jl +++ /dev/null @@ -1,24 +0,0 @@ -using Test - -function test_grad(turing_model, grad_f; trans=Dict()) - model_f = turing_model() - vi = model_f() - for i in trans - vi.flags["trans"][i] = true - end - d = length(vi.vals) - @testset "Gradient using random inputs" begin - ℓ = LogDensityProblemsAD.ADgradient( - Turing.AutoTracker(), - DynamicPPL.LogDensityFunction( - model_f, - vi, - DynamicPPL.SamplingContext(SampleFromPrior(), DynamicPPL.DefaultContext()), - ), - ) - for _ in 1:10000 - theta = rand(d) - @test LogDensityProblems.logdensity_and_gradient(ℓ, theta) == grad_f(theta)[2] - end - end -end diff --git a/test/skipped/vec_assume_mat.jl b/test/skipped/vec_assume_mat.jl deleted file mode 100644 index b9efc94d0a..0000000000 --- a/test/skipped/vec_assume_mat.jl +++ /dev/null @@ -1,20 +0,0 @@ -using Turing, Test - -N = 5 -alg = HMC(0.2, 4) - -@model function vdemo(::Type{T}=Float64) where {T} - v = Vector{Matrix{T}}(undef, 
N) - @. v ~ Wishart(7, [1 0.5; 0.5 1]) -end - -t_vec = @elapsed res_vec = sample(vdemo(), alg, 1000) - -@model function vdemo() - v = Vector{Matrix{Real}}(undef, N) - for i in 1:N - v[i] ~ Wishart(7, [1 0.5; 0.5 1]) - end -end - -t_loop = @elapsed res = sample(vdemo(), alg, 1000) diff --git a/test/skipped/vec_assume_mv.jl b/test/skipped/vec_assume_mv.jl deleted file mode 100644 index ba373b124f..0000000000 --- a/test/skipped/vec_assume_mv.jl +++ /dev/null @@ -1,42 +0,0 @@ - -using Turing, Test - -N = 10 -beta = [0.5, 0.5] -alg = HMC(0.2, 4) - -# Test for vectorize UnivariateDistribution -@model function vdemo() - phi = Vector{Vector{Real}}(undef, N) - @> phi ~ Dirichlet(beta) -end - -ch_vec, t_vec, m_vec, gctime, memallocs = @timed res_vec = sample(vdemo(), alg) - -@model function vdemo() - phi = Matrix(undef, 2, N) - @. phi ~ Dirichlet(beta) -end - -ch_vec_mat, t_vec_mat, m_vec_mat, gctime, memallocs = @timed res_vec_mat = sample( - vdemo(), alg -) - -@model function vdemo() - phi = Vector{Vector{Real}}(undef, N) - for i in 1:N - phi[i] ~ Dirichlet(beta) - end -end - -ch_loop, t_loop, m_loop, gctime, memallocs = @timed res = sample(vdemo(), alg) - -println("Time for") -println(" Loop : $(sum(ch_loop[:elapsed]))") -println(" Vec : $(sum(ch_vec[:elapsed]))") -println(" Vec2 : $(sum(ch_vec_mat[:elapsed]))") - -println("Mem for") -println(" Loop : $m_loop") -println(" Vec : $m_vec") -println(" Vec2 : $m_vec_mat") diff --git a/test/variational/advi.jl b/test/variational/advi.jl index c2abacb675..ed8f745df2 100644 --- a/test/variational/advi.jl +++ b/test/variational/advi.jl @@ -1,71 +1,133 @@ + module AdvancedVITests using ..Models: gdemo_default using ..NumericalTests: check_gdemo -import AdvancedVI -using AdvancedVI: TruncatedADAGrad, DecayedADAGrad + +using AdvancedVI using Bijectors: Bijectors using Distributions: Dirichlet, Normal -using LinearAlgebra: I +using LinearAlgebra using MCMCChains: Chains -import Random +using Random +using StableRNGs: StableRNG using Test: @test, @testset using Turing -using Turing.Essential: TuringDiagMvNormal +using Turing.Variational + +@testset "ADVI" begin + @testset "q initialization" begin + m = gdemo_default + d = length(Turing.DynamicPPL.VarInfo(m)[:]) + for q in [q_meanfield_gaussian(m), q_fullrank_gaussian(m)] + rand(q) + end + + μ = ones(d) + q = q_meanfield_gaussian(m; location=μ) + @assert mean(q.dist) ≈ μ -@testset "advi.jl" begin - @testset "advi constructor" begin - Random.seed!(0) - N = 500 + q = q_fullrank_gaussian(m; location=μ) + @assert mean(q.dist) ≈ μ - s1 = ADVI() - q = vi(gdemo_default, s1) - c1 = rand(q, N) + L = Diagonal(fill(0.1, d)) + q = q_meanfield_gaussian(m; scale=L) + @assert cov(q.dist) ≈ L * L + + L = LowerTriangular(tril(0.01 * ones(d, d) + I)) + q = q_fullrank_gaussian(m; scale=L) + @assert cov(q.dist) ≈ L * L' end - @testset "advi inference" begin - @testset for opt in [TruncatedADAGrad(), DecayedADAGrad()] - Random.seed!(1) - N = 500 - - alg = ADVI(10, 5000) - q = vi(gdemo_default, alg; optimizer=opt) - samples = transpose(rand(q, N)) - chn = Chains(reshape(samples, size(samples)..., 1), ["s", "m"]) - # TODO: uhmm, seems like a large `eps` here... 
-            check_gdemo(chn; atol=0.5)
+    @testset "default interface" begin
+        for q0 in [q_meanfield_gaussian(gdemo_default), q_fullrank_gaussian(gdemo_default)]
+            _, q, _, _ = vi(gdemo_default, q0, 100; show_progress=Turing.PROGRESS[])
+            c1 = rand(q, 10)
         end
     end
 
-    @testset "advi different interfaces" begin
-        Random.seed!(1234)
-
-        target = MvNormal(zeros(2), I)
-        logπ(z) = logpdf(target, z)
-        advi = ADVI(10, 1000)
-
-        # Using a function z ↦ q(⋅∣z)
-        getq(θ) = TuringDiagMvNormal(θ[1:2], exp.(θ[3:4]))
-        q = vi(logπ, advi, getq, randn(4))
+    @testset "custom interface $name" for (name, objective, operator, optimizer) in [
+        (
+            "ADVI with closed-form entropy",
+            AdvancedVI.RepGradELBO(10),
+            AdvancedVI.ClipScale(),
+            AdvancedVI.DoG(),
+        ),
+        (
+            "ADVI with proximal entropy",
+            AdvancedVI.RepGradELBO(10; entropy=AdvancedVI.ClosedFormEntropyZeroGradient()),
+            AdvancedVI.ProximalLocationScaleEntropy(),
+            AdvancedVI.DoG(),
+        ),
+        (
+            "ADVI with STL entropy",
+            AdvancedVI.RepGradELBO(10; entropy=AdvancedVI.StickingTheLandingEntropy()),
+            AdvancedVI.ClipScale(),
+            AdvancedVI.DoG(),
+        ),
+    ]
+        T = 1000
+        q, q_avg, _, _ = vi(
+            gdemo_default,
+            q_meanfield_gaussian(gdemo_default),
+            T;
+            objective,
+            optimizer,
+            operator,
+            show_progress=Turing.PROGRESS[],
+        )
+
+        N = 1000
+        c1 = rand(q_avg, N)
+        c2 = rand(q, N)
+    end
 
-        xs = rand(target, 10)
-        @test mean(abs2, logpdf(q, xs) - logpdf(target, xs)) ≤ 0.07
+    @testset "inference $name" for (name, objective, operator, optimizer) in [
+        (
+            "ADVI with closed-form entropy",
+            AdvancedVI.RepGradELBO(10),
+            AdvancedVI.ClipScale(),
+            AdvancedVI.DoG(),
+        ),
+        (
+            "ADVI with proximal entropy",
+            AdvancedVI.RepGradELBO(10; entropy=AdvancedVI.ClosedFormEntropyZeroGradient()),
+            AdvancedVI.ProximalLocationScaleEntropy(),
+            AdvancedVI.DoG(),
+        ),
+        (
+            "ADVI with STL entropy",
+            AdvancedVI.RepGradELBO(10; entropy=AdvancedVI.StickingTheLandingEntropy()),
+            AdvancedVI.ClipScale(),
+            AdvancedVI.DoG(),
+        ),
+    ]
+        rng = StableRNG(0x517e1d9bf89bf94f)
+
+        T = 1000
+        q, q_avg, _, _ = vi(
+            rng,
+            gdemo_default,
+            q_meanfield_gaussian(gdemo_default),
+            T;
+            optimizer,
+            show_progress=Turing.PROGRESS[],
+        )
+
+        N = 1000
+        for q_out in [q_avg, q]
+            samples = transpose(rand(rng, q_out, N))
+            chn = Chains(reshape(samples, size(samples)..., 1), ["s", "m"])
 
-        # OR: implement `update` and pass a `Distribution`
-        function AdvancedVI.update(d::TuringDiagMvNormal, θ::AbstractArray{<:Real})
-            return TuringDiagMvNormal(θ[1:length(q)], exp.(θ[(length(q) + 1):end]))
+            check_gdemo(chn; atol=0.5)
         end
-
-        q0 = TuringDiagMvNormal(zeros(2), ones(2))
-        q = vi(logπ, advi, q0, randn(4))
-
-        xs = rand(target, 10)
-        @test mean(abs2, logpdf(q, xs) - logpdf(target, xs)) ≤ 0.05
     end
 
     # regression test for:
     # https://github.com/TuringLang/Turing.jl/issues/2065
     @testset "simplex bijector" begin
+        rng = StableRNG(0x517e1d9bf89bf94f)
+
         @model function dirichlet()
             x ~ Dirichlet([1.0, 1.0])
             return x
@@ -81,25 +143,27 @@ using Turing.Essential: TuringDiagMvNormal
         @test all(x0 .≈ x0_inv)
 
         # And regression for https://github.com/TuringLang/Turing.jl/issues/2160. 
- q = vi(m, ADVI(10, 1000)) - x = rand(q, 1000) + _, q, _, _ = vi(rng, m, q_meanfield_gaussian(m), 1000) + x = rand(rng, q, 1000) @test mean(eachcol(x)) ≈ [0.5, 0.5] atol = 0.1 end # Ref: https://github.com/TuringLang/Turing.jl/issues/2205 @testset "with `condition` (issue #2205)" begin + rng = StableRNG(0x517e1d9bf89bf94f) + @model function demo_issue2205() x ~ Normal() return y ~ Normal(x, 1) end model = demo_issue2205() | (y=1.0,) - q = vi(model, ADVI(10, 1000)) + _, q, _, _ = vi(rng, model, q_meanfield_gaussian(model), 1000) # True mean. mean_true = 1 / 2 var_true = 1 / 2 # Check the mean and variance of the posterior. - samples = rand(q, 1000) + samples = rand(rng, q, 1000) mean_est = mean(samples) var_est = var(samples) @test mean_est ≈ mean_true atol = 0.2 diff --git a/test/variational/optimisers.jl b/test/variational/optimisers.jl deleted file mode 100644 index 6f64d5fb1f..0000000000 --- a/test/variational/optimisers.jl +++ /dev/null @@ -1,29 +0,0 @@ -module VariationalOptimisersTests - -using AdvancedVI: DecayedADAGrad, TruncatedADAGrad, apply! -import ForwardDiff -import ReverseDiff -using Test: @test, @testset -using Turing - -function test_opt(ADPack, opt) - θ = randn(10, 10) - θ_fit = randn(10, 10) - loss(x, θ_) = mean(sum(abs2, θ * x - θ_ * x; dims=1)) - for t in 1:(10^4) - x = rand(10) - Δ = ADPack.gradient(θ_ -> loss(x, θ_), θ_fit) - Δ = apply!(opt, θ_fit, Δ) - @. θ_fit = θ_fit - Δ - end - @test loss(rand(10, 100), θ_fit) < 0.01 - @test length(opt.acc) == 1 -end -for opt in [TruncatedADAGrad(), DecayedADAGrad(1e-2)] - test_opt(ForwardDiff, opt) -end -for opt in [TruncatedADAGrad(), DecayedADAGrad(1e-2)] - test_opt(ReverseDiff, opt) -end - -end