diff --git a/docs/.gitignore b/docs/.gitignore
index ec44e567b..909052372 100644
--- a/docs/.gitignore
+++ b/docs/.gitignore
@@ -1,3 +1,3 @@
 build/
 site/
-src/examples/
\ No newline at end of file
+src/examples/
diff --git a/docs/Project.toml b/docs/Project.toml
index 907292917..8eb475354 100644
--- a/docs/Project.toml
+++ b/docs/Project.toml
@@ -1,4 +1,6 @@
 [deps]
+AbstractGPs = "99985d1d-32ba-4be9-9821-2ec096f28918"
+Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
 KernelFunctions = "ec8451be-7e33-11e9-00cf-bbf324bd1392"
 PDMats = "90014a1f-27ba-587c-ab20-58faa44d9150"
diff --git a/examples/deepkernellearning.jl b/examples/deepkernellearning.jl
index 94346eea2..ebe279618 100644
--- a/examples/deepkernellearning.jl
+++ b/examples/deepkernellearning.jl
@@ -1,40 +1,78 @@
+# # Deep Kernel Learning with Flux
+# ## Package loading
+# We load a few additional packages for plotting and for optimizing
+# the kernel hyper-parameters.
 using KernelFunctions
-using MLDataUtils
-using Zygote
 using Flux
 using Distributions, LinearAlgebra
 using Plots
+using ProgressMeter
+using AbstractGPs
+pyplot();
+default(; legendfontsize=15.0, linewidth=3.0);
 
-Flux.@functor SqExponentialKernel
-Flux.@functor KernelSum
-Flux.@functor Matern32Kernel
-Flux.@functor FunctionTransform
-
-neuralnet = Chain(Dense(1, 3), Dense(3, 2))
-k = SqExponentialKernel(FunctionTransform(neuralnet))
+# ## Data creation
+# We create a simple 1D problem whose target function varies strongly over the domain
 
 xmin = -3;
-xmax = 3;
-x = range(xmin, xmax; length=100)
-x_test = rand(Uniform(xmin, xmax), 200)
-x, y = noisy_function(sinc, x; noise=0.1)
-X = reshape(x, :, 1)
-λ = [0.1]
-function f(x, k, λ)
-    return kernelmatrix(k, X, x; obsdim=1) *
-           inv(kernelmatrix(k, X; obsdim=1) + exp(λ[1]) * I) *
-           y
-end
-f(X, k, 1.0)
-loss(k, λ) = ŷ -> sum(y - ŷ) / length(y) + exp(λ[1]) * norm(ŷ)(f(X, k, λ))
-loss(k, λ)
+xmax = 3; # Limits
+N = 150
+noise = 0.01
+x_train = collect(eachrow(rand(Uniform(xmin, xmax), N))) # Training dataset
+target_f(x) = sinc(abs(x)^abs(x)) # A sinc-based target with strongly varying behaviour
+target_f(x::AbstractArray) = target_f(first(x))
+y_train = target_f.(x_train) + randn(N) * noise
+x_test = collect(eachrow(range(xmin, xmax; length=200))) # Testing dataset
+# ## Model definition
+# We create a neural network with three dense layers mapping the 1D input to a 5D feature space
+# The data is passed through this network before being fed to the kernel
+neuralnet = Chain(Dense(1, 20), Dense(20, 30), Dense(30, 5))
+# On top of this learned feature map we use
+# a squared-exponential kernel:
+k = transform(SqExponentialKernel(), FunctionTransform(neuralnet))
+
+# We use AbstractGPs.jl to define our model
+gpprior = GP(k) # GP prior
+fx = AbstractGPs.FiniteGP(gpprior, x_train, noise) # Prior at the training inputs
+fp = posterior(fx, y_train) # Posterior of f
+
+# The loss is the negative log evidence of `y`,
+# which we will use as the training objective
loss(y) = -logpdf(fx, y)
+
+@info "Initial loss = $(loss(y_train))"
+
+# Flux will automatically extract all the parameters of the kernel
 ps = Flux.params(k)
-# push!(ps,λ)
-opt = Flux.Momentum(1.0)
-##
-for i in 1:10
-    grads = Zygote.gradient(() -> loss(k, λ), ps)
+
+# We show the initial prediction with the untrained model
+p_init = Plots.plot(
+    vcat(x_test...), target_f; lab="true f", title="Loss = $(loss(y_train))"
+)
+Plots.scatter!(vcat(x_train...), y_train; lab="data")
+pred = marginals(fp(x_test))
+Plots.plot!(vcat(x_test...), mean.(pred); ribbon=std.(pred), lab="Prediction")
+# ## Training
+anim = Animation()
+nmax = 1000
+opt = Flux.ADAM(0.1)
+@showprogress for i in 1:nmax
+    global grads = gradient(ps) do
+        loss(y_train)
+    end
     Flux.Optimise.update!(opt, ps, grads)
-    p = Plots.scatter(x, y; lab="data", title="Loss = $(loss(k,λ))")
-    Plots.plot!(x, f(X, k, λ); lab="Prediction", lw=3.0)
-    display(p)
+    if i % 100 == 0
+        @info "$i/$nmax"
+        L = loss(y_train)
+        p = Plots.plot(
+            vcat(x_test...), target_f; lab="true f", title="Loss = $L"
+        )
+        p = Plots.scatter!(vcat(x_train...), y_train; lab="data")
+        pred = marginals(posterior(fx, y_train)(x_test))
+        Plots.plot!(vcat(x_test...), mean.(pred); ribbon=std.(pred), lab="Prediction")
+        frame(anim)
+        display(p)
+    end
 end
+gif(anim; fps=5)
diff --git a/examples/gaussianprocesspriors.jl b/examples/gaussianprocesspriors.jl
new file mode 100644
index 000000000..38da13a9a
--- /dev/null
+++ b/examples/gaussianprocesspriors.jl
@@ -0,0 +1,90 @@
+# -*- coding: utf-8 -*-
+# # Gaussian process prior samples
+#
+# The kernels defined in this package can also be used to specify the
+# covariance of a Gaussian process prior.
+# A Gaussian process (GP) is defined by its mean function $m(\cdot)$ and its covariance function or kernel $k(\cdot, \cdot')$:
+# ```math
+# f \sim \mathcal{GP}\big(m(\cdot), k(\cdot, \cdot')\big)
+# ```
+# The function values of the GP at a finite number of points $X = \{x_n\}_{n=1}^N$ follow a multivariate normal distribution with mean vector $\mathbf{m}$ and covariance matrix $\mathrm{K}$, where
+# ```math
+# \begin{aligned}
+# \mathbf{m}_i &= m(x_i) \\
+# \mathrm{K}_{i,j} &= k(x_i, x_j)
+# \end{aligned}
+# ```
+# for $1 \le i, j \le N$.
+#
+# In this notebook we show samples from zero-mean GPs with different kernels.
+
+## Load required packages
+using KernelFunctions
+using LinearAlgebra
+using Distributions
+using Plots
+default(; lw=1.0, legendfontsize=15.0)
+using Random: seed!
+seed!(42);
+
+# We now define a function that visualizes a kernel
+# for us. We use the same randomness to obtain
+# comparable samples.
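+
+# As a brief aside (a small illustrative sketch; `x_example`, `K_example`, and
+# `f_example` are only used here), multiplying the Cholesky factor of the kernel
+# matrix by standard normal draws, as done below, is equivalent to sampling from a
+# multivariate normal distribution whose covariance is the kernel matrix:
+#
+# ```julia
+# x_example = range(-5, 5; length=101)
+# K_example = kernelmatrix(SqExponentialKernel(), x_example)
+# f_example = rand(MvNormal(zeros(length(x_example)), K_example + 1e-6 * I))
+# ```
+#
+# The small jitter `1e-6 * I` keeps the covariance numerically positive definite.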
+ +num_inputs = 101 +xlim = (-5, 5) +X = reshape(collect(range(xlim...; length=num_inputs)), :, 1) +num_samples = 11 +v = randn(num_inputs, num_samples); + +function visualize(k::Kernel; xref=0.0) + K = kernelmatrix(k, X; obsdim=1) + L = cholesky(K + 1e-6 * I) + f = L.L * v + + p_kernel_2d = heatmap( + K; + yflip=true, + colorbar=false, + ylabel=string(k), + framestyle=:none, + #color=:blues, + vlim=(0, 1), + title=raw"$k(x, x')$", + ) + + p_kernel_cut = plot( + X, + k.(X, xref); + title=string(raw"$k(x, ", xref, raw")$"), + xlim=xlim, + xticks=(xlim, xlim), + label=nothing, + ) + + p_samples = plot( + X, + f; + c="blue", + title=raw"$f(x)$", + ylim=(-3, 3), + xlim=xlim, + xticks=(xlim, xlim), + label=nothing, + ) + + return plot(p_kernel_2d, p_kernel_cut, p_samples; layout=(1, 3), xlabel=raw"$x$") +end + +# We can now visualize a kernel and show samples from +# a Gaussian process with this kernel: + +visualize(SqExponentialKernel()) + +# This also allows us to compare different kernels: + +kernel_classes = [Matern12Kernel, Matern32Kernel, Matern52Kernel, SqExponentialKernel] +plot( + [visualize(k()) for k in kernel_classes]..., + #layout=(length(kernel_classes), 1) +) diff --git a/examples/kernel_ridge_regression.jl b/examples/kernel_ridge_regression.jl new file mode 100644 index 000000000..a76905462 --- /dev/null +++ b/examples/kernel_ridge_regression.jl @@ -0,0 +1,159 @@ +# # Kernel Ridge Regression +# +# Building on linear regression, we can fit non-linear data sets by introducing a feature space. In a higher-dimensional feature space, we can overfit the data; ridge regression introduces regularization to avoid this. In this notebook we show how we can use KernelFunctions.jl for *kernel* ridge regression. + +## Loading and setup of required packages +using KernelFunctions +using LinearAlgebra +using Distributions + +## Plotting +using Plots; +default(; lw=2.0, legendfontsize=15.0); + +using Random: seed! +seed!(42); + +# ## From linear regression to ridge regression +# Here we use a one-dimensional toy problem. We generate data using the fourth-order polynomial $f(x) = (x+4)(x+1)(x-1)(x-3)$: + +f_truth(x) = (x + 4) * (x + 1) * (x - 1) * (x - 3) + +x_train = collect(-5:0.5:5) +x_test = collect(-5:0.1:5) + +noise = rand(Uniform(-10, 10), size(x_train)) +y_train = f_truth.(x_train) + noise +y_test = f_truth.(x_test) + +plot(x_test, y_test; label=raw"$f(x)$") +scatter!(x_train, y_train; label="observations") + +# For training inputs $\mathrm{X}=(\mathbf{x}_n)_{n=1}^N$ and observations $\mathbf{y}=(y_n)_{n=1}^N$, the linear regression weights $\mathbf{w}$ using the least-squares estimator are given by +# ```math +# \mathbf{w} = (\mathrm{X}^\top \mathrm{X})^{-1} \mathrm{X}^\top \mathbf{y} +# ``` +# We predict at test inputs $\mathbf{x}_*$ using +# ```math +# \hat{y}_* = \mathbf{x}_*^\top \mathbf{w} +# ``` +# This is implemented by `linear_regression`: + +function linear_regression(X, y, Xstar) + weights = (X' * X) \ (X' * y) + return Xstar * weights +end + +# A linear regression fit to the above data set: + +y_pred = linear_regression(x_train, y_train, x_test) +scatter(x_train, y_train; label="observations") +plot!(x_test, y_pred; label="linear fit") + +# We can improve the fit by including additional features, i.e. generalizing to $\mathrm{X} = (\phi(x_n))_{n=1}^N$, where $\phi(x)$ constructs a feature vector for each input $x$. 
Here we include powers of the input, $\phi(x) = (1, x, x^2, \dots, x^d)$: + +function featurize_poly(x; degree=1) + xcols = [x .^ d for d in 0:degree] + return hcat(xcols...) +end + +function featurized_fit_and_plot(degree) + X = featurize_poly(x_train; degree=degree) + Xstar = featurize_poly(x_test; degree=degree) + y_pred = linear_regression(X, y_train, Xstar) + scatter(x_train, y_train; legend=false, title="fit of order $degree") + return plot!(x_test, y_pred) +end + +plot([featurized_fit_and_plot(degree) for degree in 1:4]...) + +# Note that the fit becomes perfect when we include exactly as many orders in the features as we have in the underlying polynomial (4). +# +# However, when increasing the number of features, we can quickly overfit to noise in the data set: + +featurized_fit_and_plot(18) + +# To counteract this unwanted behaviour, we can introduce regularization. This leads to *ridge regression* with $L_2$ regularization of the weights ([Tikhonov regularization](https://en.wikipedia.org/wiki/Tikhonov_regularization)). +# Instead of the weights in linear regression, +# $$ +# \mathbf{w} = (\mathrm{X}^\top \mathrm{X})^{-1} \mathrm{X}^\top \mathbf{y} +# $$ +# we introduce the ridge parameter $\lambda$: +# $$ +# \mathbf{w} = (\mathrm{X}^\top \mathrm{X} + \lambda \mathbb{1})^{-1} \mathrm{X}^\top \mathbf{y} +# $$ +# As before, we predict at test inputs $\mathbf{x}_*$ using +# ```math +# \hat{y}_* = \mathbf{x}_*^\top \mathbf{w} +# ``` +# This is implemented by `ridge_regression`: + +function ridge_regression(X, y, Xstar, lambda) + weights = (X' * X + lambda * I) \ (X' * y) + return Xstar * weights +end + +function regularized_fit_and_plot(degree, lambda) + X = featurize_poly(x_train; degree=degree) + Xstar = featurize_poly(x_test; degree=degree) + y_pred = ridge_regression(X, y_train, Xstar, lambda) + scatter(x_train, y_train; legend=false, title="\$\\lambda=$lambda\$") + return plot!(x_test, y_pred) +end + +plot([regularized_fit_and_plot(18, lambda) for lambda in [1e-4, 1e-2, 0.1, 10]]...) + +# Instead of constructing the feature matrix explicitly, we can use *kernels* to replace inner products of feature vectors with a kernel evaluation: $\langle \phi(x), \phi(x') \rangle = k(x, x')$ or $\mathrm{X} \mathrm{X}^\top = \mathrm{K}$, where $\mathrm{K}_{ij} = k(x_i, x_j)$. +# +# To apply this "kernel trick" to ridge regression, we can rewrite the ridge estimate for the weights +# $$ +# \mathbf{w} = (\mathrm{X}^\top \mathrm{X} + \lambda \mathbb{1})^{-1} \mathrm{X}^\top \mathbf{y} +# $$ +# using the [matrix inversion lemma](https://tlienart.github.io/pub/csml/mtheory/matinvlem.html#basic_lemmas) +# as +# $$ +# \mathbf{w} = \mathrm{X}^\top (\mathrm{X} \mathrm{X}^\top + \lambda \mathbb{1})^{-1} \mathbf{y} +# $$ +# where we can now replace the inner product with the kernel matrix, +# $$ +# \mathbf{w} = \mathrm{X}^\top (\mathrm{K} + \lambda \mathbb{1})^{-1} \mathbf{y} +# $$ +# And the prediction yields another inner product, +# ```math +# \hat{y}_* = \mathbf{x}_*^\top \mathbf{w} = \langle \mathbf{x}_*, \mathbf{w} \rangle = \mathbf{k}_* (\mathrm{K} + \lambda \mathbb{1})^{-1} \mathbf{y} +# ``` +# where $(\mathbf{k}_*)_n = k(x_*, x_n)$. 
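+#
+# One way to read this result: writing $\alpha := (\mathrm{K} + \lambda \mathbb{1})^{-1} \mathbf{y}$, the prediction is simply a weighted sum of kernel evaluations,
+# ```math
+# \hat{y}_* = \sum_{n=1}^N \alpha_n k(x_*, x_n),
+# ```
+# so the (possibly infinite-dimensional) feature vectors never have to be constructed explicitly. In the code below, $\alpha$ is computed by solving the linear system with `\` instead of forming a matrix inverse.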
+#
+# This is implemented by `kernel_ridge_regression`:
+
+function kernel_ridge_regression(k, X, y, Xstar, lambda)
+    K = kernelmatrix(k, X)
+    kstar = kernelmatrix(k, Xstar, X)
+    return kstar * ((K + lambda * I) \ y)
+end
+
+# Now, instead of constructing the features explicitly, we can simply pass in a `PolynomialKernel` object:
+
+function kernelized_fit_and_plot(kernel, lambda=1e-4)
+    y_pred = kernel_ridge_regression(kernel, x_train, y_train, x_test, lambda)
+    if kernel isa PolynomialKernel
+        title = string("order ", kernel.degree)
+    else
+        title = string(kernel)
+    end
+    scatter(x_train, y_train; label=nothing)
+    p = plot!(
+        x_test,
+        y_pred;
+        label=nothing,
+        title=title,
+        #title=string(raw"$\lambda=", lambda, raw"$")
+    )
+    return p
+end
+
+plot([kernelized_fit_and_plot(PolynomialKernel(; degree=degree, c=1)) for degree in 1:4]...)
+
+# However, we can now also use kernels that would have an infinite-dimensional feature expansion, such as the squared exponential kernel:
+
+kernelized_fit_and_plot(SqExponentialKernel())
diff --git a/examples/kernelridgeregression.jl b/examples/kernelridgeregression.jl
deleted file mode 100644
index 0bc02f25a..000000000
--- a/examples/kernelridgeregression.jl
+++ /dev/null
@@ -1,40 +0,0 @@
-using KernelFunctions
-using MLDataUtils
-using Zygote
-using Flux
-using Distributions, LinearAlgebra
-using Plots
-
-Flux.@functor SqExponentialKernel
-Flux.@functor ScaleTransform
-Flux.@functor KernelSum
-Flux.@functor Matern32Kernel
-
-xmin = -3;
-xmax = 3;
-x = range(xmin, xmax; length=100)
-x_test = range(xmin, xmax; length=300)
-x, y = noisy_function(sinc, x; noise=0.1)
-X = reshape(x, :, 1)
-X_test = reshape(x_test, :, 1)
-k = SqExponentialKernel(1.0)#+Matern32Kernel(2.0)
-λ = [-1.0]
-function f(x, k, λ)
-    return kernelmatrix(k, x, X; obsdim=1) *
-           inv(kernelmatrix(k, X; obsdim=1) + exp(λ[1]) * I) *
-           y
-end
-f(X, k, 1.0)
-loss(k, λ) = ŷ -> sum(y - ŷ) / length(y) + exp(λ[1]) * norm(ŷ)(f(X, k, λ))
-loss(k, λ)
-ps = Flux.params(k)
-push!(ps, λ)
-opt = Flux.Momentum(0.1)
-##
-for i in 1:10
-    grads = Zygote.gradient(() -> loss(k, λ), ps)
-    Flux.Optimise.update!(opt, ps, grads)
-    p = Plots.scatter(x, y; lab="data", title="Loss = $(loss(k,λ))")
-    Plots.plot!(x_test, f(X_test, k, λ); lab="Prediction", lw=3.0)
-    display(p)
-end
diff --git a/examples/svm.jl b/examples/svm.jl
index e07c7f6f3..6e738c55a 100644
--- a/examples/svm.jl
+++ b/examples/svm.jl
@@ -1,11 +1,13 @@
-# # Support vector machine
+# # Support Vector Machines
 
-using KernelFunctions
-using Distributions
-using Plots
+# TODO: introduction
+#
+# We first load the packages we will need in this notebook:
 
-using LinearAlgebra
-using Random
+using KernelFunctions
+using Distributions, LinearAlgebra, Random
+using Plots;
+default(; legendfontsize=15.0, ms=5.0);
 
 ## Set plotting theme
 theme(:wong)
@@ -13,27 +15,84 @@
 ## Set seed
 Random.seed!(1234);
 
+# ## Data Generation
+#
+# We first generate a mixture of two Gaussians in two dimensions.
+
+xmin = -3;
+xmax = 3; # Limits for sampling μ₁ and μ₂
+μ = rand(Uniform(xmin, xmax), 2, 2) # Sample 2 random centers
+
+# We then sample both the input $x$ and the class $y$:
+
+N = 100 # Number of samples
+y = rand((-1, 1), N) # Select randomly between the two classes
+x = Vector{Vector{Float64}}(undef, N) # We preallocate x
+x[y .== 1] = [rand(MvNormal(μ[:, 1], I)) for _ in 1:count(y .== 1)] # Features for samples of class 1
+x[y .== -1] = [rand(MvNormal(μ[:, 2], I)) for _ in 1:count(y .== -1)] # Features for samples of class 2
+scatter(getindex.(x[y .== 1], 1), getindex.(x[y .== 1], 2); label="y = 1", title="Data")
+scatter!(getindex.(x[y .== -1], 1), getindex.(x[y .== -1], 2); label="y = -1")
+
 # Number of samples:
-N = 100;
+#N = 100;
 
 # Select randomly between two classes:
-y = rand([-1, 1], N);
+#y = rand([-1, 1], N);
 
 # Random attributes for both classes:
-X = Matrix{Float64}(undef, 2, N)
-rand!(MvNormal(randn(2), I), view(X, :, y .== 1))
-rand!(MvNormal(randn(2), I), view(X, :, y .== -1));
+#X = Matrix{Float64}(undef, 2, N)
+#rand!(MvNormal(randn(2), I), view(X, :, y .== 1))
+#rand!(MvNormal(randn(2), I), view(X, :, y .== -1));
 
 # Create a 2D grid:
-xgrid = range(floor(Int, minimum(X)), ceil(Int, maximum(X)); length=100)
-Xgrid = ColVecs(mapreduce(collect, hcat, Iterators.product(xgrid, xgrid)));
+#xgrid = range(floor(Int, minimum(X)), ceil(Int, maximum(X)); length=100)
+#Xgrid = ColVecs(mapreduce(collect, hcat, Iterators.product(xgrid, xgrid)));
+
+# ## Model Definition
+# TODO Write theory here
+
+# ### Kernel
 
 # Create kernel function:
 k = SqExponentialKernel() ∘ ScaleTransform(2.0)
+λ = 1.0 # Regularization parameter
+
+# ### Predictor
+# We create a function returning the optimal prediction for new inputs `x`
 
 # Optimal prediction:
-f(x, X, k, λ) = kernelmatrix(k, x, X) / (kernelmatrix(k, X) + exp(λ) * I) * y
+f(x, X, y, k, λ) = kernelmatrix(k, x, X) / (kernelmatrix(k, X) + λ * I) * y
+# f(x, X, y, k, λ) = kernelmatrix(k, x, X) * ((kernelmatrix(k, X) + λ * I) \ y)
+
+# ### Loss
+# We also compute the total loss of the model that we want to minimize
+
+hingeloss(y, ŷ) = max(zero(ŷ), 1 - y * ŷ) # Hinge loss for a single sample
+function reg_hingeloss(k, x, y, λ)
+    ŷ = f(x, x, y, k, λ)
+    return sum(hingeloss.(y, ŷ)) + λ * norm(ŷ) # Total SVM loss with regularisation
+end
+
+# ### Testing the model
+# We create a 2D grid based on the extrema of the data
+
+N_test = 100 # Size of the grid
+xgrid = range(extrema(vcat(x...)) .* 1.1...; length=N_test) # Create a 1D grid
+xgrid_v = vec(collect.(Iterators.product(xgrid, xgrid))) # Combine into a 2D grid
+
+# We predict the value of y on this grid and plot it against the data
 
-# Compute prediction on a grid:
-contourf(xgrid, xgrid, f(Xgrid, ColVecs(X), k, 0.1))
-scatter!(X[1, :], X[2, :]; color=y, lab="data", widen=false)
+y_grid = f(xgrid_v, x, y, k, λ) # Compute prediction on a grid
+contourf(
+    xgrid,
+    xgrid,
+    reshape(y_grid, N_test, N_test)';
+    label="Predictions",
+    title="Model predictions",
+)
+scatter!(getindex.(x[y .== 1], 1), getindex.(x[y .== 1], 2); label="y = 1")
+scatter!(getindex.(x[y .== -1], 1), getindex.(x[y .== -1], 2); label="y = -1")
+xlims!(extrema(xgrid))
+ylims!(extrema(xgrid))
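+
+# As a quick sanity check (a small illustrative addition; `y_fit` and `accuracy` are
+# names local to this snippet), we can threshold the predictions at zero and compare
+# them with the training labels:
+
+y_fit = f(x, x, y, k, λ) # Predictions at the training points
+accuracy = count(sign.(y_fit) .== y) / N # Fraction of correctly classified training samples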
diff --git a/examples/train_kernel_parameters.jl b/examples/train_kernel_parameters.jl
new file mode 100644
index 000000000..c3cc5d1dd
--- /dev/null
+++ b/examples/train_kernel_parameters.jl
@@ -0,0 +1,84 @@
+# # Training Kernel Parameters
+
+# ## Loading the necessary packages
+
+using KernelFunctions
+using LinearAlgebra
+using Distributions
+using Plots;
+default(; lw=2.0, legendfontsize=15.0);
+using Flux: Optimise
+using ForwardDiff
+using Random: seed!
+seed!(42)
+
+# ## Data Generation
+# We generate noisy observations of the sinc function in one dimension
+
+xmin = -3;
+xmax = 3; # Bounds of the data
+N = 50 # Number of samples
+x_train = rand(Uniform(xmin, xmax), N) # We sample N random training inputs
+σ = 0.1
+y_train = sinc.(x_train) + randn(N) * σ # Noisy evaluations of the sinc function
+x_test = range(xmin - 0.1, xmax + 0.1; length=300)
+
+# Plot the data
+
+scatter(x_train, y_train; lab="data")
+plot!(x_test, sinc; lab="true function")
+
+# ## Kernel training
+# To train the kernel parameters via ForwardDiff.jl
+# we need a function that builds a kernel from an array of parameters
+kernelcall(θ) = transform(
+    exp(θ[1]) * SqExponentialKernel(),# + exp(θ[2]) * Matern32Kernel(),
+    exp(θ[3]),
+)
+
+# From theory we know the prediction for a test set x given
+# the kernel parameters and the regularization (noise) parameter
+
+function f(x, x_train, y_train, θ)
+    k = kernelcall(θ[1:3])
+    return kernelmatrix(k, x, x_train) *
+           ((kernelmatrix(k, x_train) + exp(θ[4]) * I) \ y_train)
+end
+
+# Let us see what the prediction looks like
+# with the starting parameters `[1.0, 1.0, 1.0, 1.0]`:
+
+ŷ = f(x_test, x_train, y_train, log.(ones(4)))
+scatter(x_train, y_train; lab="data")
+plot!(x_test, sinc; lab="true function")
+plot!(x_test, ŷ; lab="prediction")
+
+# We define the loss as the squared error on the training data
+# plus a norm-based regularization term
+
+function loss(θ)
+    ŷ = f(x_train, x_train, y_train, θ)
+    return sum(abs2, y_train - ŷ) + exp(θ[4]) * norm(ŷ)
+end
+
+# The loss at our starting point:
+
+loss(log.(ones(4)))
+
+# ## Training the model
+
+θ = log.([1.0, 0.0, 0.01, 0.001]) # Initial (log-)parameter vector
+anim = Animation()
+opt = Optimise.ADAGrad(0.5)
+for i in 1:30
+    grads = ForwardDiff.gradient(loss, θ) # We compute the gradients given the kernel parameters and regularization
+    Optimise.update!(opt, θ, grads)
+    scatter(
+        x_train, y_train; lab="data", title="i = $(i), Loss = $(round(loss(θ), digits = 4))"
+    )
+    plot!(x_test, sinc; lab="true function")
+    plot!(x_test, f(x_test, x_train, y_train, θ); lab="Prediction", lw=3.0)
+    frame(anim)
+end
+gif(anim)
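+
+# As a final check (a small illustrative addition reusing `loss` and `θ` from above),
+# we print the loss after training together with the fitted parameters, which are
+# stored on a log scale in `θ`:
+
+@info "Final loss = $(loss(θ)), parameters = $(exp.(θ))"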