diff --git a/docs/.gitignore b/docs/.gitignore
index ec44e567b..909052372 100644
--- a/docs/.gitignore
+++ b/docs/.gitignore
@@ -1,3 +1,3 @@
 build/
 site/
-src/examples/
\ No newline at end of file
+src/examples/
diff --git a/docs/Project.toml b/docs/Project.toml
index 907292917..8eb475354 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -1,4 +1,6 @@
 [deps]
+AbstractGPs = "99985d1d-32ba-4be9-9821-2ec096f28918"
+Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 KernelFunctions = "ec8451be-7e33-11e9-00cf-bbf324bd1392"
 PDMats = "90014a1f-27ba-587c-ab20-58faa44d9150"
diff --git a/examples/deepkernellearning.jl b/examples/deepkernellearning.jl
index 94346eea2..ebe279618 100644
--- a/examples/deepkernellearning.jl
+++ b/examples/deepkernellearning.jl
@@ -1,40 +1,78 @@
+# # Deep Kernel Learning with Flux
+# ## Package loading
+# We load a few additional packages for plotting and for optimizing
+# the kernel hyper-parameters.
 using KernelFunctions
-using MLDataUtils
-using Zygote
 using Flux
 using Distributions, LinearAlgebra
 using Plots
+using ProgressMeter
+using AbstractGPs
+pyplot();
+default(; legendfontsize=15.0, linewidth=3.0);
 
-Flux.@functor SqExponentialKernel
-Flux.@functor KernelSum
-Flux.@functor Matern32Kernel
-Flux.@functor FunctionTransform
-
-neuralnet = Chain(Dense(1, 3), Dense(3, 2))
-k = SqExponentialKernel(FunctionTransform(neuralnet))
+# ## Data creation
+# We create a simple 1D problem whose target function varies strongly over the domain
 
 xmin = -3;
-xmax = 3;
-x = range(xmin, xmax; length=100)
-x_test = rand(Uniform(xmin, xmax), 200)
-x, y = noisy_function(sinc, x; noise=0.1)
-X = reshape(x, :, 1)
-λ = [0.1]
-function f(x, k, λ)
-    return kernelmatrix(k, X, x; obsdim=1) *
-           inv(kernelmatrix(k, X; obsdim=1) + exp(λ[1]) * I) *
-           y
-end
-f(X, k, 1.0)
-loss(k, λ) = ŷ -> sum(y - ŷ) / length(y) + exp(λ[1]) * norm(ŷ)(f(X, k, λ))
-loss(k, λ)
+xmax = 3; # Limits
+N = 150
+noise = 0.01
+x_train = collect(eachrow(rand(Uniform(xmin, xmax), N))) # Training dataset
+target_f(x) = sinc(abs(x)^abs(x)) # A sinc-based target with strongly varying behaviour
+target_f(x::AbstractArray) = target_f(first(x))
+y_train = target_f.(x_train) + randn(N) * noise
+x_test = collect(eachrow(range(xmin, xmax; length=200))) # Testing dataset
+# ## Model definition
+# We create a neural network with three dense layers mapping the 1D input to a 5D feature space
+# The data is passed through this network before being fed to the kernel
+neuralnet = Chain(Dense(1, 20), Dense(20, 30), Dense(30, 5))
+# On top of this learned feature map we use
+# a squared-exponential kernel:
+k = transform(SqExponentialKernel(), FunctionTransform(neuralnet))
+
+# We use AbstractGPs.jl to define our model
+gpprior = GP(k) # GP prior
+fx = AbstractGPs.FiniteGP(gpprior, x_train, noise) # Prior at the training inputs
+fp = posterior(fx, y_train) # Posterior of f
+
+# The loss is the negative log evidence of `y`,
+# which we will use as the training objective
loss(y) = -logpdf(fx, y)
+
+@info "Initial loss = $(loss(y_train))"
+
+# Flux will automatically extract all the parameters of the kernel
 ps = Flux.params(k)
-# push!(ps,λ)
-opt = Flux.Momentum(1.0)
-##
-for i in 1:10
-    grads = Zygote.gradient(() -> loss(k, λ), ps)
+
+# We show the initial prediction with the untrained model
+p_init = Plots.plot(
+    vcat(x_test...), target_f; lab="true f", title="Loss = $(loss(y_train))"
+)
+Plots.scatter!(vcat(x_train...), y_train; lab="data")
+pred = marginals(fp(x_test))
+Plots.plot!(vcat(x_test...), mean.(pred); ribbon=std.(pred), lab="Prediction")
+# ## Training
+anim = Animation()
+nmax = 1000
+opt = Flux.ADAM(0.1)
+@showprogress for i in 1:nmax
+    global grads = gradient(ps) do
+        loss(y_train)
+    end
     Flux.Optimise.update!(opt, ps, grads)
-    p = Plots.scatter(x, y; lab="data", title="Loss = $(loss(k,λ))")
-    Plots.plot!(x, f(X, k, λ); lab="Prediction", lw=3.0)
-    display(p)
+    if i % 100 == 0
+        @info "$i/$nmax"
+        L = loss(y_train)
+        p = Plots.plot(
+            vcat(x_test...), target_f; lab="true f", title="Loss = $L"
+        )
+        p = Plots.scatter!(vcat(x_train...), y_train; lab="data")
+        pred = marginals(posterior(fx, y_train)(x_test))
+        Plots.plot!(vcat(x_test...), mean.(pred); ribbon=std.(pred), lab="Prediction")
+        frame(anim)
+        display(p)
+    end
 end
+gif(anim; fps=5)
diff --git a/examples/gaussianprocesspriors.jl b/examples/gaussianprocesspriors.jl
new file mode 100644
index 000000000..38da13a9a
--- /dev/null
+++ b/examples/gaussianprocesspriors.jl
@@ -0,0 +1,90 @@
+# -*- coding: utf-8 -*-
+# # Gaussian process prior samples
+#
+# The kernels defined in this package can also be used to specify the
+# covariance of a Gaussian process prior.
+# A Gaussian process (GP) is defined by its mean function $m(\cdot)$ and its covariance function or kernel $k(\cdot, \cdot')$:
+# ```math
+# f \sim \mathcal{GP}\big(m(\cdot), k(\cdot, \cdot')\big)
+# ```
+# The function values of the GP at a finite number of points $X = \{x_n\}_{n=1}^N$ follow a multivariate normal distribution with mean vector $\mathbf{m}$ and covariance matrix $\mathrm{K}$, where
+# ```math
+# \begin{aligned}
+# \mathbf{m}_i &= m(x_i) \\
+# \mathrm{K}_{i,j} &= k(x_i, x_j)
+# \end{aligned}
+# ```
+# for $1 \le i, j \le N$.
+#
+# In this notebook we show samples from zero-mean GPs with different kernels.
+
+## Load required packages
+using KernelFunctions
+using LinearAlgebra
+using Distributions
+using Plots
+default(; lw=1.0, legendfontsize=15.0)
+using Random: seed!
+seed!(42);
+
+# We now define a function that visualizes a kernel
+# for us. We use the same randomness to obtain
+# comparable samples.
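+
+# As a brief aside (a small illustrative sketch; `x_example`, `K_example`, and
+# `f_example` are only used here), multiplying the Cholesky factor of the kernel
+# matrix by standard normal draws, as done below, is equivalent to sampling from a
+# multivariate normal distribution whose covariance is the kernel matrix:
+#
+# ```julia
+# x_example = range(-5, 5; length=101)
+# K_example = kernelmatrix(SqExponentialKernel(), x_example)
+# f_example = rand(MvNormal(zeros(length(x_example)), K_example + 1e-6 * I))
+# ```
+#
+# The small jitter `1e-6 * I` keeps the covariance numerically positive definite.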
+ +num_inputs = 101 +xlim = (-5, 5) +X = reshape(collect(range(xlim...; length=num_inputs)), :, 1) +num_samples = 11 +v = randn(num_inputs, num_samples); + +function visualize(k::Kernel; xref=0.0) + K = kernelmatrix(k, X; obsdim=1) + L = cholesky(K + 1e-6 * I) + f = L.L * v + + p_kernel_2d = heatmap( + K; + yflip=true, + colorbar=false, + ylabel=string(k), + framestyle=:none, + #color=:blues, + vlim=(0, 1), + title=raw"$k(x, x')$", + ) + + p_kernel_cut = plot( + X, + k.(X, xref); + title=string(raw"$k(x, ", xref, raw")$"), + xlim=xlim, + xticks=(xlim, xlim), + label=nothing, + ) + + p_samples = plot( + X, + f; + c="blue", + title=raw"$f(x)$", + ylim=(-3, 3), + xlim=xlim, + xticks=(xlim, xlim), + label=nothing, + ) + + return plot(p_kernel_2d, p_kernel_cut, p_samples; layout=(1, 3), xlabel=raw"$x$") +end + +# We can now visualize a kernel and show samples from +# a Gaussian process with this kernel: + +visualize(SqExponentialKernel()) + +# This also allows us to compare different kernels: + +kernel_classes = [Matern12Kernel, Matern32Kernel, Matern52Kernel, SqExponentialKernel] +plot( + [visualize(k()) for k in kernel_classes]..., + #layout=(length(kernel_classes), 1) +) diff --git a/examples/kernel_ridge_regression.jl b/examples/kernel_ridge_regression.jl new file mode 100644 index 000000000..a76905462 --- /dev/null +++ b/examples/kernel_ridge_regression.jl @@ -0,0 +1,159 @@ +# # Kernel Ridge Regression +# +# Building on linear regression, we can fit non-linear data sets by introducing a feature space. In a higher-dimensional feature space, we can overfit the data; ridge regression introduces regularization to avoid this. In this notebook we show how we can use KernelFunctions.jl for *kernel* ridge regression. + +## Loading and setup of required packages +using KernelFunctions +using LinearAlgebra +using Distributions + +## Plotting +using Plots; +default(; lw=2.0, legendfontsize=15.0); + +using Random: seed! +seed!(42); + +# ## From linear regression to ridge regression +# Here we use a one-dimensional toy problem. We generate data using the fourth-order polynomial $f(x) = (x+4)(x+1)(x-1)(x-3)$: + +f_truth(x) = (x + 4) * (x + 1) * (x - 1) * (x - 3) + +x_train = collect(-5:0.5:5) +x_test = collect(-5:0.1:5) + +noise = rand(Uniform(-10, 10), size(x_train)) +y_train = f_truth.(x_train) + noise +y_test = f_truth.(x_test) + +plot(x_test, y_test; label=raw"$f(x)$") +scatter!(x_train, y_train; label="observations") + +# For training inputs $\mathrm{X}=(\mathbf{x}_n)_{n=1}^N$ and observations $\mathbf{y}=(y_n)_{n=1}^N$, the linear regression weights $\mathbf{w}$ using the least-squares estimator are given by +# ```math +# \mathbf{w} = (\mathrm{X}^\top \mathrm{X})^{-1} \mathrm{X}^\top \mathbf{y} +# ``` +# We predict at test inputs $\mathbf{x}_*$ using +# ```math +# \hat{y}_* = \mathbf{x}_*^\top \mathbf{w} +# ``` +# This is implemented by `linear_regression`: + +function linear_regression(X, y, Xstar) + weights = (X' * X) \ (X' * y) + return Xstar * weights +end + +# A linear regression fit to the above data set: + +y_pred = linear_regression(x_train, y_train, x_test) +scatter(x_train, y_train; label="observations") +plot!(x_test, y_pred; label="linear fit") + +# We can improve the fit by including additional features, i.e. generalizing to $\mathrm{X} = (\phi(x_n))_{n=1}^N$, where $\phi(x)$ constructs a feature vector for each input $x$. 
Here we include powers of the input, $\phi(x) = (1, x, x^2, \dots, x^d)$: + +function featurize_poly(x; degree=1) + xcols = [x .^ d for d in 0:degree] + return hcat(xcols...) +end + +function featurized_fit_and_plot(degree) + X = featurize_poly(x_train; degree=degree) + Xstar = featurize_poly(x_test; degree=degree) + y_pred = linear_regression(X, y_train, Xstar) + scatter(x_train, y_train; legend=false, title="fit of order $degree") + return plot!(x_test, y_pred) +end + +plot([featurized_fit_and_plot(degree) for degree in 1:4]...) + +# Note that the fit becomes perfect when we include exactly as many orders in the features as we have in the underlying polynomial (4). +# +# However, when increasing the number of features, we can quickly overfit to noise in the data set: + +featurized_fit_and_plot(18) + +# To counteract this unwanted behaviour, we can introduce regularization. This leads to *ridge regression* with $L_2$ regularization of the weights ([Tikhonov regularization](https://en.wikipedia.org/wiki/Tikhonov_regularization)). +# Instead of the weights in linear regression, +# $$ +# \mathbf{w} = (\mathrm{X}^\top \mathrm{X})^{-1} \mathrm{X}^\top \mathbf{y} +# $$ +# we introduce the ridge parameter $\lambda$: +# $$ +# \mathbf{w} = (\mathrm{X}^\top \mathrm{X} + \lambda \mathbb{1})^{-1} \mathrm{X}^\top \mathbf{y} +# $$ +# As before, we predict at test inputs $\mathbf{x}_*$ using +# ```math +# \hat{y}_* = \mathbf{x}_*^\top \mathbf{w} +# ``` +# This is implemented by `ridge_regression`: + +function ridge_regression(X, y, Xstar, lambda) + weights = (X' * X + lambda * I) \ (X' * y) + return Xstar * weights +end + +function regularized_fit_and_plot(degree, lambda) + X = featurize_poly(x_train; degree=degree) + Xstar = featurize_poly(x_test; degree=degree) + y_pred = ridge_regression(X, y_train, Xstar, lambda) + scatter(x_train, y_train; legend=false, title="\$\\lambda=$lambda\$") + return plot!(x_test, y_pred) +end + +plot([regularized_fit_and_plot(18, lambda) for lambda in [1e-4, 1e-2, 0.1, 10]]...) + +# Instead of constructing the feature matrix explicitly, we can use *kernels* to replace inner products of feature vectors with a kernel evaluation: $\langle \phi(x), \phi(x') \rangle = k(x, x')$ or $\mathrm{X} \mathrm{X}^\top = \mathrm{K}$, where $\mathrm{K}_{ij} = k(x_i, x_j)$. +# +# To apply this "kernel trick" to ridge regression, we can rewrite the ridge estimate for the weights +# $$ +# \mathbf{w} = (\mathrm{X}^\top \mathrm{X} + \lambda \mathbb{1})^{-1} \mathrm{X}^\top \mathbf{y} +# $$ +# using the [matrix inversion lemma](https://tlienart.github.io/pub/csml/mtheory/matinvlem.html#basic_lemmas) +# as +# $$ +# \mathbf{w} = \mathrm{X}^\top (\mathrm{X} \mathrm{X}^\top + \lambda \mathbb{1})^{-1} \mathbf{y} +# $$ +# where we can now replace the inner product with the kernel matrix, +# $$ +# \mathbf{w} = \mathrm{X}^\top (\mathrm{K} + \lambda \mathbb{1})^{-1} \mathbf{y} +# $$ +# And the prediction yields another inner product, +# ```math +# \hat{y}_* = \mathbf{x}_*^\top \mathbf{w} = \langle \mathbf{x}_*, \mathbf{w} \rangle = \mathbf{k}_* (\mathrm{K} + \lambda \mathbb{1})^{-1} \mathbf{y} +# ``` +# where $(\mathbf{k}_*)_n = k(x_*, x_n)$. 
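+#
+# One way to read this result: writing $\alpha := (\mathrm{K} + \lambda \mathbb{1})^{-1} \mathbf{y}$, the prediction is simply a weighted sum of kernel evaluations,
+# ```math
+# \hat{y}_* = \sum_{n=1}^N \alpha_n k(x_*, x_n),
+# ```
+# so the (possibly infinite-dimensional) feature vectors never have to be constructed explicitly. In the code below, $\alpha$ is computed by solving the linear system with `\` instead of forming a matrix inverse.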
+#
+# This is implemented by `kernel_ridge_regression`:
+
+function kernel_ridge_regression(k, X, y, Xstar, lambda)
+    K = kernelmatrix(k, X)
+    kstar = kernelmatrix(k, Xstar, X)
+    return kstar * ((K + lambda * I) \ y)
+end
+
+# Now, instead of constructing the features explicitly, we can simply pass in a `PolynomialKernel` object:
+
+function kernelized_fit_and_plot(kernel, lambda=1e-4)
+    y_pred = kernel_ridge_regression(kernel, x_train, y_train, x_test, lambda)
+    if kernel isa PolynomialKernel
+        title = string("order ", kernel.degree)
+    else
+        title = string(kernel)
+    end
+    scatter(x_train, y_train; label=nothing)
+    p = plot!(
+        x_test,
+        y_pred;
+        label=nothing,
+        title=title,
+        #title=string(raw"$\lambda=", lambda, raw"$")
+    )
+    return p
+end
+
+plot([kernelized_fit_and_plot(PolynomialKernel(; degree=degree, c=1)) for degree in 1:4]...)
+
+# However, we can now also use kernels that would have an infinite-dimensional feature expansion, such as the squared exponential kernel:
+
+kernelized_fit_and_plot(SqExponentialKernel())
diff --git a/examples/kernelridgeregression.jl b/examples/kernelridgeregression.jl
deleted file mode 100644
index 0bc02f25a..000000000
--- a/examples/kernelridgeregression.jl
+++ /dev/null
@@ -1,40 +0,0 @@
-using KernelFunctions
-using MLDataUtils
-using Zygote
-using Flux
-using Distributions, LinearAlgebra
-using Plots
-
-Flux.@functor SqExponentialKernel
-Flux.@functor ScaleTransform
-Flux.@functor KernelSum
-Flux.@functor Matern32Kernel
-
-xmin = -3;
-xmax = 3;
-x = range(xmin, xmax; length=100)
-x_test = range(xmin, xmax; length=300)
-x, y = noisy_function(sinc, x; noise=0.1)
-X = reshape(x, :, 1)
-X_test = reshape(x_test, :, 1)
-k = SqExponentialKernel(1.0)#+Matern32Kernel(2.0)
-λ = [-1.0]
-function f(x, k, λ)
-    return kernelmatrix(k, x, X; obsdim=1) *
-           inv(kernelmatrix(k, X; obsdim=1) + exp(λ[1]) * I) *
-           y
-end
-f(X, k, 1.0)
-loss(k, λ) = ŷ -> sum(y - ŷ) / length(y) + exp(λ[1]) * norm(ŷ)(f(X, k, λ))
-loss(k, λ)
-ps = Flux.params(k)
-push!(ps, λ)
-opt = Flux.Momentum(0.1)
-##
-for i in 1:10
-    grads = Zygote.gradient(() -> loss(k, λ), ps)
-    Flux.Optimise.update!(opt, ps, grads)
-    p = Plots.scatter(x, y; lab="data", title="Loss = $(loss(k,λ))")
-    Plots.plot!(x_test, f(X_test, k, λ); lab="Prediction", lw=3.0)
-    display(p)
-end
diff --git a/examples/svm.jl b/examples/svm.jl
index e07c7f6f3..6e738c55a 100644
--- a/examples/svm.jl
+++ b/examples/svm.jl
@@ -1,11 +1,13 @@
-# # Support vector machine
+# # Support Vector Machines
 
-using KernelFunctions
-using Distributions
-using Plots
+# TODO: introduction
+#
+# We first load the packages we will need in this notebook:
 
-using LinearAlgebra
-using Random
+using KernelFunctions
+using Distributions, LinearAlgebra, Random
+using Plots;
+default(; legendfontsize=15.0, ms=5.0);
 
 ## Set plotting theme
 theme(:wong)
@@ -13,27 +15,84 @@
 ## Set seed
 Random.seed!(1234);
 
+# ## Data Generation
+#
+# We first generate a mixture of two Gaussians in two dimensions.
+
+xmin = -3;
+xmax = 3; # Limits for sampling μ₁ and μ₂
+μ = rand(Uniform(xmin, xmax), 2, 2) # Sample 2 random centers
+
+# We then sample both the input $x$ and the class $y$:
+
+N = 100 # Number of samples
+y = rand((-1, 1), N) # Select randomly between the two classes
+x = Vector{Vector{Float64}}(undef, N) # We preallocate x
+x[y .== 1] = [rand(MvNormal(μ[:, 1], I)) for _ in 1:count(y .== 1)] # Features for samples of class 1
+x[y .== -1] = [rand(MvNormal(μ[:, 2], I)) for _ in 1:count(y .== -1)] # Features for samples of class 2
+scatter(getindex.(x[y .== 1], 1), getindex.(x[y .== 1], 2); label="y = 1", title="Data")
+scatter!(getindex.(x[y .== -1], 1), getindex.(x[y .== -1], 2); label="y = -1")
+
 # Number of samples:
-N = 100;
+#N = 100;
 
 # Select randomly between two classes:
-y = rand([-1, 1], N);
+#y = rand([-1, 1], N);
 
 # Random attributes for both classes:
-X = Matrix{Float64}(undef, 2, N)
-rand!(MvNormal(randn(2), I), view(X, :, y .== 1))
-rand!(MvNormal(randn(2), I), view(X, :, y .== -1));
+#X = Matrix{Float64}(undef, 2, N)
+#rand!(MvNormal(randn(2), I), view(X, :, y .== 1))
+#rand!(MvNormal(randn(2), I), view(X, :, y .== -1));
 
 # Create a 2D grid:
-xgrid = range(floor(Int, minimum(X)), ceil(Int, maximum(X)); length=100)
-Xgrid = ColVecs(mapreduce(collect, hcat, Iterators.product(xgrid, xgrid)));
+#xgrid = range(floor(Int, minimum(X)), ceil(Int, maximum(X)); length=100)
+#Xgrid = ColVecs(mapreduce(collect, hcat, Iterators.product(xgrid, xgrid)));
+
+# ## Model Definition
+# TODO Write theory here
+
+# ### Kernel
 
 # Create kernel function:
 k = SqExponentialKernel() ∘ ScaleTransform(2.0)
+λ = 1.0 # Regularization parameter
+
+# ### Predictor
+# We create a function returning the optimal prediction for new inputs `x`
 
 # Optimal prediction:
-f(x, X, k, λ) = kernelmatrix(k, x, X) / (kernelmatrix(k, X) + exp(λ) * I) * y
+f(x, X, y, k, λ) = kernelmatrix(k, x, X) / (kernelmatrix(k, X) + λ * I) * y
+# f(x, X, y, k, λ) = kernelmatrix(k, x, X) * ((kernelmatrix(k, X) + λ * I) \ y)
+
+# ### Loss
+# We also compute the total loss of the model that we want to minimize
+
+hingeloss(y, ŷ) = max(zero(ŷ), 1 - y * ŷ) # Hinge loss for a single sample
+function reg_hingeloss(k, x, y, λ)
+    ŷ = f(x, x, y, k, λ)
+    return sum(hingeloss.(y, ŷ)) + λ * norm(ŷ) # Total SVM loss with regularisation
+end
+
+# ### Testing the model
+# We create a 2D grid based on the extrema of the data
+
+N_test = 100 # Size of the grid
+xgrid = range(extrema(vcat(x...)) .* 1.1...; length=N_test) # Create a 1D grid
+xgrid_v = vec(collect.(Iterators.product(xgrid, xgrid))) # Combine into a 2D grid
+
+# We predict the value of y on this grid and plot it against the data
 
-# Compute prediction on a grid:
-contourf(xgrid, xgrid, f(Xgrid, ColVecs(X), k, 0.1))
-scatter!(X[1, :], X[2, :]; color=y, lab="data", widen=false)
+y_grid = f(xgrid_v, x, y, k, λ) # Compute prediction on a grid
+contourf(
+    xgrid,
+    xgrid,
+    reshape(y_grid, N_test, N_test)';
+    label="Predictions",
+    title="Model predictions",
+)
+scatter!(getindex.(x[y .== 1], 1), getindex.(x[y .== 1], 2); label="y = 1")
+scatter!(getindex.(x[y .== -1], 1), getindex.(x[y .== -1], 2); label="y = -1")
+xlims!(extrema(xgrid))
+ylims!(extrema(xgrid))
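+
+# As a quick sanity check (a small illustrative addition; `y_fit` and `accuracy` are
+# names local to this snippet), we can threshold the predictions at zero and compare
+# them with the training labels:
+
+y_fit = f(x, x, y, k, λ) # Predictions at the training points
+accuracy = count(sign.(y_fit) .== y) / N # Fraction of correctly classified training samples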
diff --git a/examples/train_kernel_parameters.jl b/examples/train_kernel_parameters.jl
new file mode 100644
index 000000000..c3cc5d1dd
--- /dev/null
+++ b/examples/train_kernel_parameters.jl
@@ -0,0 +1,84 @@
+# # Training Kernel Parameters
+
+# ## Loading the necessary packages
+
+using KernelFunctions
+using LinearAlgebra
+using Distributions
+using Plots;
+default(; lw=2.0, legendfontsize=15.0);
+using Flux: Optimise
+using ForwardDiff
+using Random: seed!
+seed!(42)
+
+# ## Data Generation
+# We generate noisy observations of the sinc function in one dimension
+
+xmin = -3;
+xmax = 3; # Bounds of the data
+N = 50 # Number of samples
+x_train = rand(Uniform(xmin, xmax), N) # We sample N random training inputs
+σ = 0.1
+y_train = sinc.(x_train) + randn(N) * σ # Noisy evaluations of the sinc function
+x_test = range(xmin - 0.1, xmax + 0.1; length=300)
+
+# Plot the data
+
+scatter(x_train, y_train; lab="data")
+plot!(x_test, sinc; lab="true function")
+
+# ## Kernel training
+# To train the kernel parameters via ForwardDiff.jl
+# we need a function that builds a kernel from an array of parameters
+kernelcall(θ) = transform(
+    exp(θ[1]) * SqExponentialKernel(),# + exp(θ[2]) * Matern32Kernel(),
+    exp(θ[3]),
+)
+
+# From theory we know the prediction for a test set x given
+# the kernel parameters and the regularization (noise) parameter
+
+function f(x, x_train, y_train, θ)
+    k = kernelcall(θ[1:3])
+    return kernelmatrix(k, x, x_train) *
+           ((kernelmatrix(k, x_train) + exp(θ[4]) * I) \ y_train)
+end
+
+# Let us see what the prediction looks like
+# with the starting parameters `[1.0, 1.0, 1.0, 1.0]`:
+
+ŷ = f(x_test, x_train, y_train, log.(ones(4)))
+scatter(x_train, y_train; lab="data")
+plot!(x_test, sinc; lab="true function")
+plot!(x_test, ŷ; lab="prediction")
+
+# We define the loss as the squared error on the training data
+# plus a norm-based regularization term
+
+function loss(θ)
+    ŷ = f(x_train, x_train, y_train, θ)
+    return sum(abs2, y_train - ŷ) + exp(θ[4]) * norm(ŷ)
+end
+
+# The loss at our starting point:
+
+loss(log.(ones(4)))
+
+# ## Training the model
+
+θ = log.([1.0, 0.0, 0.01, 0.001]) # Initial (log-)parameter vector
+anim = Animation()
+opt = Optimise.ADAGrad(0.5)
+for i in 1:30
+    grads = ForwardDiff.gradient(loss, θ) # We compute the gradients given the kernel parameters and regularization
+    Optimise.update!(opt, θ, grads)
+    scatter(
+        x_train, y_train; lab="data", title="i = $(i), Loss = $(round(loss(θ), digits = 4))"
+    )
+    plot!(x_test, sinc; lab="true function")
+    plot!(x_test, f(x_test, x_train, y_train, θ); lab="Prediction", lw=3.0)
+    frame(anim)
+end
+gif(anim)
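+
+# As a final check (a small illustrative addition reusing `loss` and `θ` from above),
+# we print the loss after training together with the fitted parameters, which are
+# stored on a log scale in `θ`:
+
+@info "Final loss = $(loss(θ)), parameters = $(exp.(θ))"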