From fcda0c6785afd6ea8c20509c4bb4321a1340d30d Mon Sep 17 00:00:00 2001 From: Theo Galy-Fajou Date: Thu, 4 Jun 2020 17:59:57 +0200 Subject: [PATCH 01/19] Draft for example scripts --- examples/deepkernellearning.jl | 73 +++++++++++++++++-------- examples/kernelridgeregression.jl | 91 ++++++++++++++++++++----------- examples/svm.jl | 40 ++++++++------ src/kernels/transformedkernel.jl | 6 +- 4 files changed, 136 insertions(+), 74 deletions(-) diff --git a/examples/deepkernellearning.jl b/examples/deepkernellearning.jl index b77f0d13e..50c433705 100644 --- a/examples/deepkernellearning.jl +++ b/examples/deepkernellearning.jl @@ -3,33 +3,58 @@ using MLDataUtils using Zygote using Flux using Distributions, LinearAlgebra -using Plots +using Plots; +pyplot(); default(legendfontsize = 15.0, linewidth = 3.0) +using SliceMap -Flux.@functor SqExponentialKernel -Flux.@functor KernelSum -Flux.@functor Matern32Kernel -Flux.@functor FunctionTransform +neuralnet = Chain(Dense(1, 10), Dense(10, 2)) + +Base.map( + t::KernelFunctions.FunctionTransform, + X::AbstractVector; + obsdim::Int = defaultobs, +) where {T} = + slicemap(X, dims = 1) do x + t.f(x) + end -neuralnet = Chain(Dense(1,3),Dense(3,2)) -k = SqExponentialKernel(FunctionTransform(neuralnet)) xmin = -3; xmax = 3 -x = range(xmin,xmax,length=100) -x_test = rand(Uniform(xmin,xmax),200) -x,y = noisy_function(sinc,x;noise=0.1) -X = reshape(x,:,1) -λ = [0.1] -f(x,k,λ) = kernelmatrix(k,X,x,obsdim=1)*inv(kernelmatrix(k,X,obsdim=1)+exp(λ[1])*I)*y -f(X,k,1.0) -loss(k,λ) = f(X,k,λ) |> ŷ ->sum(y-ŷ)/length(y)+exp(λ[1])*norm(ŷ) -loss(k,λ) + +x_train = range(xmin, xmax, length = 100) +x_test = rand(Uniform(xmin, xmax), 200) +x_train, y = noisy_function(x_train; noise = 0.01) do x + sinc(abs(x) ^ abs(x)) +end +X = reshape(x_train, :, 1) +k = transform(SqExponentialKernel(), FunctionTransform(neuralnet)) + +λ = log.([1.0]) +function f(x, k, λ) + K = kernelmatrix(k, X, x, obsdim = 1) + return K * inv(K + exp(λ[1]) * I) * y +end +f(X, k, 1.0) +loss(k, λ) = f(X, k, λ) |> ŷ -> sum(abs2, y - ŷ) / length(y) + exp(λ[1]) * norm(ŷ) + +@info "Init Loss = $(loss(k, λ))" + ps = Flux.params(k) -# push!(ps,λ) -opt = Flux.Momentum(1.0) +push!(ps,λ) + +p = Plots.scatter(x_train, y, lab = "data", title = "Loss = $(loss(k, λ))") +Plots.plot!(x_train, f(X, k, λ), lab = "Prediction", lw = 3.0) |> display ## -for i in 1:10 - grads = Zygote.gradient(()->loss(k,λ),ps) - Flux.Optimise.update!(opt,ps,grads) - p = Plots.scatter(x,y,lab="data",title="Loss = $(loss(k,λ))") - Plots.plot!(x,f(X,k,λ),lab="Prediction",lw=3.0) - display(p) +anim = Animation() +nmax= 10 +opt = Flux.ADAGrad(0.001) +@progress for i = 1:nmax + grads = Zygote.gradient(() -> loss(k, λ), ps) + Flux.Optimise.update!(opt, ps, grads) + if i % 100 == 0 + @info "$i/$nmax" + p = Plots.scatter(x, y, lab = "data", title = "Loss = $(loss(k,λ))") + Plots.plot!(x, f(X, k, λ), lab = "Prediction", lw = 3.0) + frame(anim) + end end +gif(anim, fps = 5) diff --git a/examples/kernelridgeregression.jl b/examples/kernelridgeregression.jl index 5c1d61d86..d6e999e06 100644 --- a/examples/kernelridgeregression.jl +++ b/examples/kernelridgeregression.jl @@ -1,35 +1,64 @@ +# Kernel Ridge Regression using KernelFunctions -using MLDataUtils -using Zygote -using Flux -using Distributions, LinearAlgebra +using LinearAlgebra +using Distributions using Plots +using Flux: Optimise +using ForwardDiff -Flux.@functor SqExponentialKernel -Flux.@functor ScaleTransform -Flux.@functor KernelSum -Flux.@functor Matern32Kernel - -xmin = -3; xmax = 3 -x = 
range(xmin,xmax,length=100) -x_test = range(xmin,xmax,length=300) -x,y = noisy_function(sinc,x;noise=0.1) -X = reshape(x,:,1) -X_test = reshape(x_test,:,1) -k = SqExponentialKernel(1.0)#+Matern32Kernel(2.0) -λ = [-1.0] -f(x,k,λ) = kernelmatrix(k,x,X,obsdim=1)*inv(kernelmatrix(k,X,obsdim=1)+exp(λ[1])*I)*y -f(X,k,1.0) -loss(k,λ) = f(X,k,λ) |> ŷ ->sum(y-ŷ)/length(y)+exp(λ[1])*norm(ŷ) -loss(k,λ) -ps = Flux.params(k) -push!(ps,λ) -opt = Flux.Momentum(0.1) -## -for i in 1:10 - grads = Zygote.gradient(()->loss(k,λ),ps) - Flux.Optimise.update!(opt,ps,grads) - p = Plots.scatter(x,y,lab="data",title="Loss = $(loss(k,λ))") - Plots.plot!(x_test,f(X_test,k,λ),lab="Prediction",lw=3.0) - display(p) +# We generate our data : +xmin = -3; xmax = 3 # Bounds of the data +N = 50 # Number of samples +σ = 0.1 + +x_train = rand(Uniform(xmin, xmax), N) # We sample 100 random samples +x_test = range(xmin, xmax, length = 300) # We use x_test to show the prediction function +y = sinc.(x_train) + randn(N) * σ # We create a function and add some noise + +# Plot the data +scatter(x_train, y, lab = "data") +plot!(x_test, sinc, lab = "true function") + +# Create a function taking kernel parameters and creating a kernel +kernelcall(θ) = transform( + exp(θ[1]) * SqExponentialKernel(),# + exp(θ[2]) * Matern32Kernel(), + exp(θ[3]), +) + +# Return the prediction given the normalization value λ +function f(x, θ) + k = kernelcall(θ[1:3]) + kernelmatrix(k, x, x_train) * + inv(kernelmatrix(k, x_train) + exp(θ[4]) * I) * y +end + +# Starting with parameters [1.0, 1.0, 1.0, 1.0] we get : +ŷ = f(x_test, log.(ones(4))) +scatter(x_train, y, lab = "data") +plot!(x_test, sinc, lab = "true function") +plot!(x_test, ŷ, lab = "prediction") + +# Create a loss on the training data +function loss(θ) + ŷ = f(x_train, θ) + sum(abs2, y - ŷ) + exp(θ[4]) * norm(ŷ) +end + +# The loss with our starting point : +loss(log.(ones(4))) + +## Training model + +θ = vcat(log.([1.0, 0.0, 0.01]), log(0.001)) +anim = Animation() +opt = ADAGrad(0.5) +@progress for i = 1:30 + grads = ForwardDiff.gradient(x -> loss(x), θ) # We compute the gradients given the kernel parameters and regularization + Δ = Optimise.apply!(opt, θ, grads) + θ .-= Δ # We apply a simple Gradient descent algorithm + p = scatter(x_train, y, lab = "data", title = "i = $(i), Loss = $(round(loss(θ), digits = 4))") + plot!(x_test, sinc, lab = "true function") + plot!(x_test, f(x_test, θ), lab = "Prediction", lw = 3.0) + frame(anim) end +gif(anim) diff --git a/examples/svm.jl b/examples/svm.jl index 823b2e1d4..30efc9519 100644 --- a/examples/svm.jl +++ b/examples/svm.jl @@ -5,19 +5,27 @@ using Flux using Distributions, LinearAlgebra using Plots -N = 100 #Number of samples -μ = randn(2,2) # Random Centers -xgrid = range(-3,3,length=100) # Create a grid -Xgrid = hcat(collect.(Iterators.product(xgrid,xgrid))...)' #Combine into a 2D grid -y = rand([-1,1],N) # Select randomly between the two classes -X = zeros(N,2) -X[y.==1,:] = rand(MvNormal(μ[:,1],I),count(y.==1))' #Attribute samples from class 1 -X[y.==-1,:] = rand(MvNormal(μ[:,2],I),count(y.==-1))' # Attribute samples from class 2 - - -k = SqExponentialKernel(2.0) # Create kernel function -f(x,k,λ) = kernelmatrix(k,x,X,obsdim=1)*inv(kernelmatrix(k,X,obsdim=1)+exp(λ[1])*I)*y # Optimal prediction f -svmloss(y,ŷ)= f(X,k,λ) |> ŷ -> sum(maximum.(0.0,1-y*ŷ)) - λ*norm(ŷ) # Total svm loss with regularisation -pred = f(Xgrid,k,λ) #Compute prediction on a grid -contourf(xgrid,xgrid,pred) -scatter!(eachcol(X)...,color=y,lab="data") +N = 100 # Number of 
samples +N_test = 200 # Size of the grid +xmin = -3; xmax = 3 +μ = rand(Uniform(xmin, xmax), 2, 2) # Random Centers +xgrid = range(-xmin, xmax, length=N_test) # Create a grid +Xgrid = hcat(collect.(Iterators.product(xgrid, xgrid))...) #Combine into a 2D grid +y = rand((-1, 1), N) # Select randomly between the two classes +X_train = zeros(2, N) +X_train[:, y .== 1] = rand(MvNormal(μ[:, 1], I), count(y.==1)) #Attribute samples from class 1 +X_train[:, y .== -1] = rand(MvNormal(μ[:, 2], I), count(y.==-1)) # Attribute samples from class 2 +scatter(eachrow(X_train)..., zcolor= y) +## Compute predictions +k = SqExponentialKernel() # Create kernel function +function f(x, k, λ) + kernelmatrix(k, x, X_train, obsdim=2) * inv(kernelmatrix(k, X_train, obsdim=2) + exp(λ[1]) * I) * y # Optimal prediction f +end +λ = log.([1.0]) +function reg_hingeloss(k, λ) + ŷ = f(X, k, λ) + return sum(maximum.(0.0, 1 - y * ŷ)) - exp(λ[1]) * norm(ŷ) # Total svm loss with regularisation +end +y_grid = f(Xgrid, k, λ) #Compute prediction on a grid +contourf(xgrid, xgrid, reshape(y_grid, N_test, N_test)) +scatter!(eachrow(X_train)..., zcolor=y,lab="data") diff --git a/src/kernels/transformedkernel.jl b/src/kernels/transformedkernel.jl index b96208308..3ae038b75 100644 --- a/src/kernels/transformedkernel.jl +++ b/src/kernels/transformedkernel.jl @@ -41,11 +41,11 @@ _scale(t::ScaleTransform, metric, x, y) = evaluate(metric, t(x), t(y)) """ transform -transform(k::BaseKernel, t::Transform) = TransformedKernel(k, t) +transform(k::Kernel, t::Transform) = TransformedKernel(k, t) -transform(k::BaseKernel, ρ::Real) = TransformedKernel(k, ScaleTransform(ρ)) +transform(k::Kernel, ρ::Real) = TransformedKernel(k, ScaleTransform(ρ)) -transform(k::BaseKernel,ρ::AbstractVector) = TransformedKernel(k, ARDTransform(ρ)) +transform(k::Kernel,ρ::AbstractVector) = TransformedKernel(k, ARDTransform(ρ)) kernel(κ) = κ.kernel From 2982f7ce0baa860387804a95c553bf60e9c36c63 Mon Sep 17 00:00:00 2001 From: Theo Galy-Fajou Date: Fri, 12 Jun 2020 18:58:59 +0200 Subject: [PATCH 02/19] Work on examples, but need some fixes on transform --- examples/deepkernellearning.jl | 45 ++++++++++++++++++++-------------- examples/svm.jl | 6 +++-- 2 files changed, 31 insertions(+), 20 deletions(-) diff --git a/examples/deepkernellearning.jl b/examples/deepkernellearning.jl index 50c433705..ac4de77c2 100644 --- a/examples/deepkernellearning.jl +++ b/examples/deepkernellearning.jl @@ -1,13 +1,17 @@ +# # Loading dataset +# We use a couple of useful datasets to plot and optimize +# The different hyper-parameters using KernelFunctions using MLDataUtils using Zygote using Flux using Distributions, LinearAlgebra -using Plots; +using Plots +using AbstractGPs pyplot(); default(legendfontsize = 15.0, linewidth = 3.0) using SliceMap -neuralnet = Chain(Dense(1, 10), Dense(10, 2)) + Base.map( t::KernelFunctions.FunctionTransform, @@ -18,28 +22,33 @@ Base.map( t.f(x) end -xmin = -3; xmax = 3 +# ## Data creation +# We create a simple 1D Problem with very different variations +xmin = -3; xmax = 3 # Limits +noise = 0.01 +x_train = rand(Uniform(xmin, xmax), 100) # Training dataset +x_test = range(xmin, xmax, length=200) # Testing dataset +target_f(x) = sinc(abs(x) ^ abs(x)) # We use sinc with a highly varying value +x_train, y = noisy_function(target_f, x_train; noise = 0.01) -x_train = range(xmin, xmax, length = 100) -x_test = rand(Uniform(xmin, xmax), 200) -x_train, y = noisy_function(x_train; noise = 0.01) do x - sinc(abs(x) ^ abs(x)) -end -X = reshape(x_train, :, 1) +# ## Model 
definition +# We create a neural net with 2 layers and 10 units each +# The data is passed through the NN before being used in the kernel +neuralnet = Chain(Dense(1, 10), Dense(10, 2)) +# We use a Squared Exponential Kernel k = transform(SqExponentialKernel(), FunctionTransform(neuralnet)) -λ = log.([1.0]) -function f(x, k, λ) - K = kernelmatrix(k, X, x, obsdim = 1) - return K * inv(K + exp(λ[1]) * I) * y -end -f(X, k, 1.0) -loss(k, λ) = f(X, k, λ) |> ŷ -> sum(abs2, y - ŷ) / length(y) + exp(λ[1]) * norm(ŷ) +f = GP(k) +fx = f(x_train, noise) +fp = posterior(fx, y) +# This compute the log evidence of `y`, +# which is going to be used as the objective +loss(y) = logpdf(fx, y) +pred -@info "Init Loss = $(loss(k, λ))" +@info "Init Loss = $(loss(y))" ps = Flux.params(k) -push!(ps,λ) p = Plots.scatter(x_train, y, lab = "data", title = "Loss = $(loss(k, λ))") Plots.plot!(x_train, f(X, k, λ), lab = "Prediction", lw = 3.0) |> display diff --git a/examples/svm.jl b/examples/svm.jl index 30efc9519..c92600938 100644 --- a/examples/svm.jl +++ b/examples/svm.jl @@ -1,14 +1,16 @@ using KernelFunctions -using MLDataUtils using Zygote using Flux using Distributions, LinearAlgebra using Plots +# # + N = 100 # Number of samples N_test = 200 # Size of the grid xmin = -3; xmax = 3 -μ = rand(Uniform(xmin, xmax), 2, 2) # Random Centers + +μ = rand(Uniform(xmin, xmax), 2, 2) # Sample 2 Random Centers xgrid = range(-xmin, xmax, length=N_test) # Create a grid Xgrid = hcat(collect.(Iterators.product(xgrid, xgrid))...) #Combine into a 2D grid y = rand((-1, 1), N) # Select randomly between the two classes From c83829a912b969f622843e877535404a06de2aba Mon Sep 17 00:00:00 2001 From: Theo Galy-Fajou Date: Mon, 6 Jul 2020 18:22:18 +0200 Subject: [PATCH 03/19] Formatting for Literate.jl --- examples/svm.jl | 65 +++++++++++++++++++++++--------------- src/matrix/kernelmatrix.jl | 2 +- 2 files changed, 40 insertions(+), 27 deletions(-) diff --git a/examples/svm.jl b/examples/svm.jl index c92600938..0e10e0622 100644 --- a/examples/svm.jl +++ b/examples/svm.jl @@ -1,33 +1,46 @@ +# # Support Vector Machines +# ## We first load some needed packages using KernelFunctions -using Zygote -using Flux using Distributions, LinearAlgebra -using Plots - -# # - -N = 100 # Number of samples -N_test = 200 # Size of the grid -xmin = -3; xmax = 3 +using Plots; default(legendfontsize = 15.0, ms = 5.0) +# ## Data Generation +# ### We first generate a mixture of two Gaussians in 2 dimensions +xmin = -3; xmax = 3 # Limits for sampling μ₁ and μ₂ μ = rand(Uniform(xmin, xmax), 2, 2) # Sample 2 Random Centers -xgrid = range(-xmin, xmax, length=N_test) # Create a grid -Xgrid = hcat(collect.(Iterators.product(xgrid, xgrid))...) 
#Combine into a 2D grid +# ### We then sample both y and x +N = 100 # Number of samples y = rand((-1, 1), N) # Select randomly between the two classes -X_train = zeros(2, N) -X_train[:, y .== 1] = rand(MvNormal(μ[:, 1], I), count(y.==1)) #Attribute samples from class 1 -X_train[:, y .== -1] = rand(MvNormal(μ[:, 2], I), count(y.==-1)) # Attribute samples from class 2 -scatter(eachrow(X_train)..., zcolor= y) -## Compute predictions -k = SqExponentialKernel() # Create kernel function -function f(x, k, λ) - kernelmatrix(k, x, X_train, obsdim=2) * inv(kernelmatrix(k, X_train, obsdim=2) + exp(λ[1]) * I) * y # Optimal prediction f +x = Vector{Vector{Float64}}(undef, N) # We preallocate x +x[y .== 1] = [rand(MvNormal(μ[:, 1], I)) for _ in 1:count(y.==1)] # Features for samples of class 1 +x[y .== -1] = [rand(MvNormal(μ[:, 2], I)) for _ in 1:count(y.==-1)] # Features for samples of class 2 +scatter(getindex.(x[y .== 1], 1), getindex.(x[y .== 1], 2), label = "y = 1", title = "Data") +scatter!(getindex.(x[y .== -1], 1), getindex.(x[y .== -1], 2), label = "y = 2") + +# ## Model Definition +# TODO Write theory here +# ### We create a kernel k +k = SqExponentialKernel() # SqExponentialKernel or RBFKernel +λ = 1.0 # Regularization parameter + +# ### We create a function to return the optimal prediction for a +# test data `x_new` +function f(x_new, x, y, k, λ) + kernelmatrix(k, x_new, x) * inv(kernelmatrix(k, x) + λ * I) * y # Optimal prediction f end -λ = log.([1.0]) -function reg_hingeloss(k, λ) - ŷ = f(X, k, λ) - return sum(maximum.(0.0, 1 - y * ŷ)) - exp(λ[1]) * norm(ŷ) # Total svm loss with regularisation + +# ### We also compute the total loss of the model that we want to minimize +hingeloss(y, ŷ) = maximum(zero(ŷ), 1 - y * ŷ) # hingeloss function +function reg_hingeloss(k, x, y, λ) + ŷ = f(x, x, y, k, λ) + return sum(hingeloss.(y, ŷ)) - λ * norm(ŷ) # Total svm loss with regularisation end -y_grid = f(Xgrid, k, λ) #Compute prediction on a grid -contourf(xgrid, xgrid, reshape(y_grid, N_test, N_test)) -scatter!(eachrow(X_train)..., zcolor=y,lab="data") +# ### We create a 2D grid based on the maximum values of the data +N_test = 200 # Size of the grid +xgrid = range(extrema(vcat(x...)).*1.1..., length=N_test) # Create a 1D grid +xgrid = vec(collect.(Iterators.product(xgrid, xgrid))) #Combine into a 2D grid +# ### We predict the value of y on this grid on plot it against the data +y_grid = f(xgrid, x, y, k, λ) #Compute prediction on a grid +contourf(xgrid, xgrid, reshape(y_grid, N_test, N_test)', label = "Predictions", title="Trained model") +scatter!(getindex.(x[y .== 1], 1), getindex.(x[y .== 1], 2), label = "y = 1") +scatter!(getindex.(x[y .== -1], 1), getindex.(x[y .== -1], 2), label = "y = 2") diff --git a/src/matrix/kernelmatrix.jl b/src/matrix/kernelmatrix.jl index bc24f7354..8b07841f9 100644 --- a/src/matrix/kernelmatrix.jl +++ b/src/matrix/kernelmatrix.jl @@ -89,7 +89,7 @@ function kernelmatrix(κ::SimpleKernel, x::AbstractVector) end function kernelmatrix(κ::SimpleKernel, x::AbstractVector, y::AbstractVector) - validate_dims(x, y) + # validate_dims(x, y) return map(d -> kappa(κ, d), pairwise(metric(κ), x, y)) end From 118e9b80055b0109061ae3948001d6af66b8660d Mon Sep 17 00:00:00 2001 From: Theo Galy-Fajou Date: Wed, 8 Jul 2020 19:59:14 +0200 Subject: [PATCH 04/19] Added examples to make.jl --- docs/.gitignore | 3 +- docs/Project.toml | 13 ++++++ docs/make.jl | 25 ++++++++++- examples/automaticstatistician.jl | 0 examples/deepkernellearning.jl | 72 ++++++++++++++++++------------- 
examples/kernelridgeregression.jl | 57 +++++++++++++----------- examples/svm.jl | 2 +- 7 files changed, 111 insertions(+), 61 deletions(-) create mode 100644 examples/automaticstatistician.jl diff --git a/docs/.gitignore b/docs/.gitignore index 90fb8a549..909052372 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -1,4 +1,3 @@ build/ site/ - -#Temp to avoid to many changes +src/examples/ diff --git a/docs/Project.toml b/docs/Project.toml index 9e80a3254..0252eb1e6 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,5 +1,18 @@ [deps] +AbstractGPs = "99985d1d-32ba-4be9-9821-2ec096f28918" +Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" +ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306" +Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" [compat] +Distributions = "0.23" Documenter = "0.23, 0.24, 0.25" +Flux = "0.8" +ForwardDiff = "0.10" +Literate = "2" +Plots = "1" diff --git a/docs/make.jl b/docs/make.jl index e0366f01d..03772733c 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,13 +1,33 @@ using Documenter +using Literate using KernelFunctions +if ispath(joinpath(@__DIR__, "src", "examples")) + rm(joinpath(@__DIR__, "src", "examples"), recursive=true) +end + +for filename in readdir(joinpath(@__DIR__, "..", "examples")) + endswith(filename, ".jl") || continue + name = splitext(filename)[1] + Literate.markdown( + joinpath(@__DIR__, "..", "examples", filename), + joinpath(@__DIR__, "src", "examples"), + name = name, + documenter = true, + ) +end + makedocs( sitename = "KernelFunctions", format = Documenter.HTML(), modules = [KernelFunctions], pages = ["Home"=>"index.md", "User Guide" => "userguide.md", - "Examples"=>"example.md", + "Examples"=> + ["SVM" => "examples/svm.md", + # "Kernel Ridge Regression" => "examples/kernelridgeregression.md", + # "Deep Kernel Learning" => "examples/deepkernellearning.md", + ], "Kernel Functions"=>"kernels.md", "Transform"=>"transform.md", "Metrics"=>"metrics.md", @@ -22,5 +42,6 @@ makedocs( deploydocs( deps = Deps.pip("mkdocs", "python-markdown-math"), repo = "github.com/JuliaGaussianProcesses/KernelFunctions.jl.git", - target = "build" + target = "build", + push_preview = true, ) diff --git a/examples/automaticstatistician.jl b/examples/automaticstatistician.jl new file mode 100644 index 000000000..e69de29bb diff --git a/examples/deepkernellearning.jl b/examples/deepkernellearning.jl index ac4de77c2..6f858e8ea 100644 --- a/examples/deepkernellearning.jl +++ b/examples/deepkernellearning.jl @@ -1,35 +1,36 @@ -# # Loading dataset -# We use a couple of useful datasets to plot and optimize -# The different hyper-parameters +# # Deep Kernel Learning with Flux +# ## Package loading +# We use a couple of useful packages to plot and optimize +# the different hyper-parameters using KernelFunctions -using MLDataUtils -using Zygote using Flux using Distributions, LinearAlgebra using Plots using AbstractGPs pyplot(); default(legendfontsize = 15.0, linewidth = 3.0) -using SliceMap +# using SliceMap -Base.map( - t::KernelFunctions.FunctionTransform, - X::AbstractVector; - obsdim::Int = defaultobs, -) where {T} = - slicemap(X, dims = 1) do x - t.f(x) - end +# Base.map( +# t::KernelFunctions.FunctionTransform, +# X::AbstractVector; +# obsdim::Int = defaultobs, +# ) where {T} = +# slicemap(X, dims = 1) 
do x +# t.f(x) +# end # ## Data creation # We create a simple 1D Problem with very different variations xmin = -3; xmax = 3 # Limits +N = 100 noise = 0.01 -x_train = rand(Uniform(xmin, xmax), 100) # Training dataset -x_test = range(xmin, xmax, length=200) # Testing dataset +x_train = collect(eachrow(rand(Uniform(xmin, xmax), N))) # Training dataset target_f(x) = sinc(abs(x) ^ abs(x)) # We use sinc with a highly varying value -x_train, y = noisy_function(target_f, x_train; noise = 0.01) +target_f(x::AbstractArray) = target_f(first(x)) +y_train = target_f.(x_train) + randn(N) * noise +x_test = collect(eachrow(range(xmin, xmax, length=200))) # Testing dataset # ## Model definition # We create a neural net with 2 layers and 10 units each @@ -38,32 +39,41 @@ neuralnet = Chain(Dense(1, 10), Dense(10, 2)) # We use a Squared Exponential Kernel k = transform(SqExponentialKernel(), FunctionTransform(neuralnet)) -f = GP(k) -fx = f(x_train, noise) -fp = posterior(fx, y) +# We use AbstractGPs.jl to define our model +gpprior = GP(k) # GP Prior +fx = AbstractGPs.FiniteGP(gpprior, x_train, noise) # Prior on f +fp = posterior(fx, y_train) # Posterior of f + # This compute the log evidence of `y`, # which is going to be used as the objective loss(y) = logpdf(fx, y) -pred -@info "Init Loss = $(loss(y))" +@info "Init Loss = $(loss(y_train))" +# Flux will automatically extract all the parameters of the kernel ps = Flux.params(k) -p = Plots.scatter(x_train, y, lab = "data", title = "Loss = $(loss(k, λ))") -Plots.plot!(x_train, f(X, k, λ), lab = "Prediction", lw = 3.0) |> display -## +# We show the initial prediction with the untrained model +p = Plots.plot(vcat(x_test...), target_f, lab = "true f", title = "Loss = $(loss(y_train))") +Plots.scatter!(vcat(x_train...), y_train, lab = "data") +pred = marginals(fp(x_test)) +Plots.plot!(vcat(x_test...), mean.(pred), ribbon = std.(pred), lab = "Prediction") +# ## Training anim = Animation() nmax= 10 opt = Flux.ADAGrad(0.001) -@progress for i = 1:nmax - grads = Zygote.gradient(() -> loss(k, λ), ps) +for i = 1:nmax + grads = gradient(ps) do + loss(y_train) + end Flux.Optimise.update!(opt, ps, grads) - if i % 100 == 0 + # if i % 100 == 0 @info "$i/$nmax" - p = Plots.scatter(x, y, lab = "data", title = "Loss = $(loss(k,λ))") - Plots.plot!(x, f(X, k, λ), lab = "Prediction", lw = 3.0) + p = Plots.plot(vcat(x_test...), target_f, lab = "true f", title = "Loss = $(loss(y_train))") + p = Plots.scatter!(vcat(x_train...), y_train, lab = "data") + pred = marginals(fp(x_test)) + Plots.plot!(vcat(x_test...), mean.(pred), ribbon = std.(pred), lab = "Prediction") frame(anim) - end + # end end gif(anim, fps = 5) diff --git a/examples/kernelridgeregression.jl b/examples/kernelridgeregression.jl index d6e999e06..2efa474bd 100644 --- a/examples/kernelridgeregression.jl +++ b/examples/kernelridgeregression.jl @@ -1,64 +1,71 @@ -# Kernel Ridge Regression +# # Kernel Ridge Regression +# ## We load KernelFunctions and some other packages using KernelFunctions using LinearAlgebra using Distributions -using Plots +using Plots; default(lw = 2.0, legendfontsize = 15.0) using Flux: Optimise using ForwardDiff +using Random: seed! 
+seed!(42) -# We generate our data : +# ## Data Generation +# We generated data in 1 dimension xmin = -3; xmax = 3 # Bounds of the data N = 50 # Number of samples -σ = 0.1 - x_train = rand(Uniform(xmin, xmax), N) # We sample 100 random samples -x_test = range(xmin, xmax, length = 300) # We use x_test to show the prediction function -y = sinc.(x_train) + randn(N) * σ # We create a function and add some noise +σ = 0.1 +y_train = sinc.(x_train) + randn(N) * σ # We create a function and add some noise +x_test = range(xmin-0.1, xmax+0.1, length=300) # Plot the data -scatter(x_train, y, lab = "data") +scatter(x_train, y_train, lab = "data") plot!(x_test, sinc, lab = "true function") -# Create a function taking kernel parameters and creating a kernel +# ## Kernel training +# To train the kernel parameters via ForwardDiff.jl +# we need to create a function creating a kernel from an array kernelcall(θ) = transform( exp(θ[1]) * SqExponentialKernel(),# + exp(θ[2]) * Matern32Kernel(), exp(θ[3]), ) -# Return the prediction given the normalization value λ -function f(x, θ) +# From theory we know the prediction for a test set x given +# the kernel parameters and normalization constant +function f(x, x_train, y_train, θ) k = kernelcall(θ[1:3]) kernelmatrix(k, x, x_train) * - inv(kernelmatrix(k, x_train) + exp(θ[4]) * I) * y + inv(kernelmatrix(k, x_train) + exp(θ[4]) * I) * y_train end -# Starting with parameters [1.0, 1.0, 1.0, 1.0] we get : -ŷ = f(x_test, log.(ones(4))) -scatter(x_train, y, lab = "data") +# We look how the prediction looks like +# with starting parameters [1.0, 1.0, 1.0, 1.0] we get : +ŷ = f(x_test, x_train, y_train, log.(ones(4))) +scatter(x_train, y_train, lab = "data") plot!(x_test, sinc, lab = "true function") plot!(x_test, ŷ, lab = "prediction") -# Create a loss on the training data +# We define the loss based on the L2 norm both +# for the loss and the regularization function loss(θ) - ŷ = f(x_train, θ) - sum(abs2, y - ŷ) + exp(θ[4]) * norm(ŷ) + ŷ = f(x_train, x_train, y_train, θ) + sum(abs2, y_train - ŷ) + exp(θ[4]) * norm(ŷ) end # The loss with our starting point : loss(log.(ones(4))) -## Training model - -θ = vcat(log.([1.0, 0.0, 0.01]), log(0.001)) +# ## Training the model +θ = vcat(log.([1.0, 0.0, 0.01]), log(0.001)) # Initial vector anim = Animation() -opt = ADAGrad(0.5) -@progress for i = 1:30 +opt = Optimise.ADAGrad(0.5) +for i = 1:30 grads = ForwardDiff.gradient(x -> loss(x), θ) # We compute the gradients given the kernel parameters and regularization Δ = Optimise.apply!(opt, θ, grads) θ .-= Δ # We apply a simple Gradient descent algorithm - p = scatter(x_train, y, lab = "data", title = "i = $(i), Loss = $(round(loss(θ), digits = 4))") + p = scatter(x_train, y_train, lab = "data", title = "i = $(i), Loss = $(round(loss(θ), digits = 4))") plot!(x_test, sinc, lab = "true function") - plot!(x_test, f(x_test, θ), lab = "Prediction", lw = 3.0) + plot!(x_test, f(x_test, x_train, y_train, θ), lab = "Prediction", lw = 3.0) frame(anim) end gif(anim) diff --git a/examples/svm.jl b/examples/svm.jl index 0e10e0622..5200c626e 100644 --- a/examples/svm.jl +++ b/examples/svm.jl @@ -1,5 +1,5 @@ # # Support Vector Machines -# ## We first load some needed packages +# ## Package loading using KernelFunctions using Distributions, LinearAlgebra using Plots; default(legendfontsize = 15.0, ms = 5.0) From 438fcee0582bf553e3d27c8c62e31ffbf13e19ef Mon Sep 17 00:00:00 2001 From: Theo Galy-Fajou Date: Thu, 9 Jul 2020 00:16:20 +0200 Subject: [PATCH 05/19] Small corrections --- 
examples/deepkernellearning.jl | 4 ++-- examples/svm.jl | 10 ++++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/examples/deepkernellearning.jl b/examples/deepkernellearning.jl index 6f858e8ea..01771e53b 100644 --- a/examples/deepkernellearning.jl +++ b/examples/deepkernellearning.jl @@ -67,13 +67,13 @@ for i = 1:nmax loss(y_train) end Flux.Optimise.update!(opt, ps, grads) - # if i % 100 == 0 + if i % 100 == 0 @info "$i/$nmax" p = Plots.plot(vcat(x_test...), target_f, lab = "true f", title = "Loss = $(loss(y_train))") p = Plots.scatter!(vcat(x_train...), y_train, lab = "data") pred = marginals(fp(x_test)) Plots.plot!(vcat(x_test...), mean.(pred), ribbon = std.(pred), lab = "Prediction") frame(anim) - # end + end end gif(anim, fps = 5) diff --git a/examples/svm.jl b/examples/svm.jl index 5200c626e..9bb262496 100644 --- a/examples/svm.jl +++ b/examples/svm.jl @@ -20,7 +20,7 @@ scatter!(getindex.(x[y .== -1], 1), getindex.(x[y .== -1], 2), label = "y = 2") # ## Model Definition # TODO Write theory here # ### We create a kernel k -k = SqExponentialKernel() # SqExponentialKernel or RBFKernel +k = SqExponentialKernel() # SqExponentialKernel/RBFKernel λ = 1.0 # Regularization parameter # ### We create a function to return the optimal prediction for a @@ -36,11 +36,13 @@ function reg_hingeloss(k, x, y, λ) return sum(hingeloss.(y, ŷ)) - λ * norm(ŷ) # Total svm loss with regularisation end # ### We create a 2D grid based on the maximum values of the data -N_test = 200 # Size of the grid +N_test = 100 # Size of the grid xgrid = range(extrema(vcat(x...)).*1.1..., length=N_test) # Create a 1D grid -xgrid = vec(collect.(Iterators.product(xgrid, xgrid))) #Combine into a 2D grid +xgrid_v = vec(collect.(Iterators.product(xgrid, xgrid))) #Combine into a 2D grid # ### We predict the value of y on this grid on plot it against the data -y_grid = f(xgrid, x, y, k, λ) #Compute prediction on a grid +y_grid = f(xgrid_v, x, y, k, λ) #Compute prediction on a grid contourf(xgrid, xgrid, reshape(y_grid, N_test, N_test)', label = "Predictions", title="Trained model") scatter!(getindex.(x[y .== 1], 1), getindex.(x[y .== 1], 2), label = "y = 1") scatter!(getindex.(x[y .== -1], 1), getindex.(x[y .== -1], 2), label = "y = 2") +xlims!(extrema(xgrid)) +ylims!(extrema(xgrid)) From 0aaf2bb72e45687bc1cd5324ba9fb97f69902803 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Galy-Fajou?= Date: Fri, 6 Nov 2020 11:47:13 +0100 Subject: [PATCH 06/19] Moved examples to docs --- {examples => docs/examples}/automaticstatistician.jl | 0 {examples => docs/examples}/deepkernellearning.jl | 0 {examples => docs/examples}/kernelridgeregression.jl | 0 {examples => docs/examples}/svm.jl | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename {examples => docs/examples}/automaticstatistician.jl (100%) rename {examples => docs/examples}/deepkernellearning.jl (100%) rename {examples => docs/examples}/kernelridgeregression.jl (100%) rename {examples => docs/examples}/svm.jl (100%) diff --git a/examples/automaticstatistician.jl b/docs/examples/automaticstatistician.jl similarity index 100% rename from examples/automaticstatistician.jl rename to docs/examples/automaticstatistician.jl diff --git a/examples/deepkernellearning.jl b/docs/examples/deepkernellearning.jl similarity index 100% rename from examples/deepkernellearning.jl rename to docs/examples/deepkernellearning.jl diff --git a/examples/kernelridgeregression.jl b/docs/examples/kernelridgeregression.jl similarity index 100% rename from 
examples/kernelridgeregression.jl rename to docs/examples/kernelridgeregression.jl diff --git a/examples/svm.jl b/docs/examples/svm.jl similarity index 100% rename from examples/svm.jl rename to docs/examples/svm.jl From cfa203fc5798b3e87188a01bbdcdccf027d6419c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Galy-Fajou?= Date: Fri, 6 Nov 2020 18:30:02 +0100 Subject: [PATCH 07/19] Small updates --- docs/examples/deepkernellearning.jl | 41 ++++++++++++----------------- src/kernels/transformedkernel.jl | 2 -- 2 files changed, 17 insertions(+), 26 deletions(-) diff --git a/docs/examples/deepkernellearning.jl b/docs/examples/deepkernellearning.jl index 01771e53b..f90dc07dd 100644 --- a/docs/examples/deepkernellearning.jl +++ b/docs/examples/deepkernellearning.jl @@ -6,38 +6,28 @@ using KernelFunctions using Flux using Distributions, LinearAlgebra using Plots +using ProgressMeter using AbstractGPs pyplot(); default(legendfontsize = 15.0, linewidth = 3.0) -# using SliceMap - - - -# Base.map( -# t::KernelFunctions.FunctionTransform, -# X::AbstractVector; -# obsdim::Int = defaultobs, -# ) where {T} = -# slicemap(X, dims = 1) do x -# t.f(x) -# end # ## Data creation # We create a simple 1D Problem with very different variations xmin = -3; xmax = 3 # Limits -N = 100 +N = 150 noise = 0.01 x_train = collect(eachrow(rand(Uniform(xmin, xmax), N))) # Training dataset target_f(x) = sinc(abs(x) ^ abs(x)) # We use sinc with a highly varying value target_f(x::AbstractArray) = target_f(first(x)) y_train = target_f.(x_train) + randn(N) * noise x_test = collect(eachrow(range(xmin, xmax, length=200))) # Testing dataset - +spectral_mixture_kernel() # ## Model definition # We create a neural net with 2 layers and 10 units each # The data is passed through the NN before being used in the kernel -neuralnet = Chain(Dense(1, 10), Dense(10, 2)) -# We use a Squared Exponential Kernel -k = transform(SqExponentialKernel(), FunctionTransform(neuralnet)) +neuralnet = Chain(Dense(1, 20), Dense(20, 30), Dense(30, 5)) +# We use two cases : +# - The Squared Exponential Kernel +k = transform(SqExponentialKernel(), FunctionTransform(neuralnet) ) # We use AbstractGPs.jl to define our model gpprior = GP(k) # GP Prior @@ -46,7 +36,7 @@ fp = posterior(fx, y_train) # Posterior of f # This compute the log evidence of `y`, # which is going to be used as the objective -loss(y) = logpdf(fx, y) +loss(y) = -logpdf(fx, y) @info "Init Loss = $(loss(y_train))" @@ -54,26 +44,29 @@ loss(y) = logpdf(fx, y) ps = Flux.params(k) # We show the initial prediction with the untrained model -p = Plots.plot(vcat(x_test...), target_f, lab = "true f", title = "Loss = $(loss(y_train))") +p_init = Plots.plot(vcat(x_test...), target_f, lab = "true f", title = "Loss = $(loss(y_train))") Plots.scatter!(vcat(x_train...), y_train, lab = "data") pred = marginals(fp(x_test)) Plots.plot!(vcat(x_test...), mean.(pred), ribbon = std.(pred), lab = "Prediction") # ## Training anim = Animation() -nmax= 10 -opt = Flux.ADAGrad(0.001) -for i = 1:nmax - grads = gradient(ps) do +nmax= 1000 +opt = Flux.ADAM(0.1) +@showprogress for i = 1:nmax + global grads = gradient(ps) do loss(y_train) end Flux.Optimise.update!(opt, ps, grads) if i % 100 == 0 @info "$i/$nmax" + L = loss(y_train) + # @info "Loss = $L" p = Plots.plot(vcat(x_test...), target_f, lab = "true f", title = "Loss = $(loss(y_train))") p = Plots.scatter!(vcat(x_train...), y_train, lab = "data") - pred = marginals(fp(x_test)) + pred = marginals(posterior(fx, y_train)(x_test)) Plots.plot!(vcat(x_test...), mean.(pred), 
ribbon = std.(pred), lab = "Prediction") frame(anim) + display(p) end end gif(anim, fps = 5) diff --git a/src/kernels/transformedkernel.jl b/src/kernels/transformedkernel.jl index ec9eddc8b..3d1928cd3 100644 --- a/src/kernels/transformedkernel.jl +++ b/src/kernels/transformedkernel.jl @@ -52,8 +52,6 @@ end transform(k::Kernel,ρ::AbstractVector) = TransformedKernel(k, ARDTransform(ρ)) -transform(k::Kernel, ρ::AbstractVector) = transform(k, ARDTransform(ρ)) - kernel(κ) = κ.kernel Base.show(io::IO, κ::TransformedKernel) = printshifted(io, κ, 0) From f438d9811ae4a6bd30a4e2f6d5bad494063769f9 Mon Sep 17 00:00:00 2001 From: ST John Date: Thu, 7 Jan 2021 22:26:04 +0000 Subject: [PATCH 08/19] fix paths --- docs/make.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/make.jl b/docs/make.jl index afed32030..892c34209 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -6,11 +6,11 @@ if ispath(joinpath(@__DIR__, "src", "examples")) rm(joinpath(@__DIR__, "src", "examples"), recursive=true) end -for filename in readdir(joinpath(@__DIR__, "..", "examples")) +for filename in readdir(joinpath(@__DIR__, "examples")) endswith(filename, ".jl") || continue name = splitext(filename)[1] Literate.markdown( - joinpath(@__DIR__, "..", "examples", filename), + joinpath(@__DIR__, "examples", filename), joinpath(@__DIR__, "src", "examples"), name = name, documenter = true, From 768e721af5319447f787497a3d892d3d3b5e57e3 Mon Sep 17 00:00:00 2001 From: ST John Date: Fri, 8 Jan 2021 11:31:09 +0000 Subject: [PATCH 09/19] move examples out of docs --- docs/make.jl | 4 ++-- {docs/examples => examples}/automaticstatistician.jl | 0 {docs/examples => examples}/deepkernellearning.jl | 0 {docs/examples => examples}/kernelridgeregression.jl | 0 {docs/examples => examples}/svm.jl | 0 5 files changed, 2 insertions(+), 2 deletions(-) rename {docs/examples => examples}/automaticstatistician.jl (100%) rename {docs/examples => examples}/deepkernellearning.jl (100%) rename {docs/examples => examples}/kernelridgeregression.jl (100%) rename {docs/examples => examples}/svm.jl (100%) diff --git a/docs/make.jl b/docs/make.jl index 892c34209..afed32030 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -6,11 +6,11 @@ if ispath(joinpath(@__DIR__, "src", "examples")) rm(joinpath(@__DIR__, "src", "examples"), recursive=true) end -for filename in readdir(joinpath(@__DIR__, "examples")) +for filename in readdir(joinpath(@__DIR__, "..", "examples")) endswith(filename, ".jl") || continue name = splitext(filename)[1] Literate.markdown( - joinpath(@__DIR__, "examples", filename), + joinpath(@__DIR__, "..", "examples", filename), joinpath(@__DIR__, "src", "examples"), name = name, documenter = true, diff --git a/docs/examples/automaticstatistician.jl b/examples/automaticstatistician.jl similarity index 100% rename from docs/examples/automaticstatistician.jl rename to examples/automaticstatistician.jl diff --git a/docs/examples/deepkernellearning.jl b/examples/deepkernellearning.jl similarity index 100% rename from docs/examples/deepkernellearning.jl rename to examples/deepkernellearning.jl diff --git a/docs/examples/kernelridgeregression.jl b/examples/kernelridgeregression.jl similarity index 100% rename from docs/examples/kernelridgeregression.jl rename to examples/kernelridgeregression.jl diff --git a/docs/examples/svm.jl b/examples/svm.jl similarity index 100% rename from docs/examples/svm.jl rename to examples/svm.jl From 0eb6f36c161d08491a82a9735b0d0b453b947968 Mon Sep 17 00:00:00 2001 From: ST John Date: Fri, 8 Jan 2021 11:38:22 
+0000 Subject: [PATCH 10/19] cleanup svm.jl --- examples/svm.jl | 39 +++++++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/examples/svm.jl b/examples/svm.jl index 9bb262496..906d4f03b 100644 --- a/examples/svm.jl +++ b/examples/svm.jl @@ -1,14 +1,23 @@ +# -*- coding: utf-8 -*- # # Support Vector Machines -# ## Package loading + +# TODO: introduction +# +# We first load the packages we will need in this notebook: + using KernelFunctions using Distributions, LinearAlgebra using Plots; default(legendfontsize = 15.0, ms = 5.0) # ## Data Generation -# ### We first generate a mixture of two Gaussians in 2 dimensions +# +# We first generate a mixture of two Gaussians in 2 dimensions + xmin = -3; xmax = 3 # Limits for sampling μ₁ and μ₂ μ = rand(Uniform(xmin, xmax), 2, 2) # Sample 2 Random Centers -# ### We then sample both y and x + +# We then sample both the input $x$ and the class $y$: + N = 100 # Number of samples y = rand((-1, 1), N) # Select randomly between the two classes x = Vector{Vector{Float64}}(undef, N) # We preallocate x @@ -19,27 +28,37 @@ scatter!(getindex.(x[y .== -1], 1), getindex.(x[y .== -1], 2), label = "y = 2") # ## Model Definition # TODO Write theory here -# ### We create a kernel k -k = SqExponentialKernel() # SqExponentialKernel/RBFKernel + +# ### Kernel + +k = SqExponentialKernel() # SqExponentialKernel/RBFKernel λ = 1.0 # Regularization parameter -# ### We create a function to return the optimal prediction for a -# test data `x_new` +# ### Predictor +# We create a function to return the optimal prediction for a test data `x_new` + function f(x_new, x, y, k, λ) kernelmatrix(k, x_new, x) * inv(kernelmatrix(k, x) + λ * I) * y # Optimal prediction f end -# ### We also compute the total loss of the model that we want to minimize +# ### Loss +# We also compute the total loss of the model that we want to minimize + hingeloss(y, ŷ) = maximum(zero(ŷ), 1 - y * ŷ) # hingeloss function function reg_hingeloss(k, x, y, λ) ŷ = f(x, x, y, k, λ) return sum(hingeloss.(y, ŷ)) - λ * norm(ŷ) # Total svm loss with regularisation end -# ### We create a 2D grid based on the maximum values of the data + +# ### Testing the trained model +# We create a 2D grid based on the maximum values of the data + N_test = 100 # Size of the grid xgrid = range(extrema(vcat(x...)).*1.1..., length=N_test) # Create a 1D grid xgrid_v = vec(collect.(Iterators.product(xgrid, xgrid))) #Combine into a 2D grid -# ### We predict the value of y on this grid on plot it against the data + +# We predict the value of y on this grid on plot it against the data + y_grid = f(xgrid_v, x, y, k, λ) #Compute prediction on a grid contourf(xgrid, xgrid, reshape(y_grid, N_test, N_test)', label = "Predictions", title="Trained model") scatter!(getindex.(x[y .== 1], 1), getindex.(x[y .== 1], 2), label = "y = 1") From bdd0800756ea2386ea556c805f3f80a29eed1638 Mon Sep 17 00:00:00 2001 From: ST John Date: Fri, 8 Jan 2021 12:21:32 +0000 Subject: [PATCH 11/19] constants for paths --- docs/make.jl | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/docs/make.jl b/docs/make.jl index afed32030..7b3d01a0f 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -2,16 +2,19 @@ using Documenter using Literate using KernelFunctions -if ispath(joinpath(@__DIR__, "src", "examples")) - rm(joinpath(@__DIR__, "src", "examples"), recursive=true) +const EXAMPLES_SRC = joinpath(@__DIR__, "..", "examples") +const EXAMPLES_OUT = joinpath(@__DIR__, "src", "examples") + +if ispath(EXAMPLES_OUT) 
+ rm(EXAMPLES_OUT, recursive=true) end -for filename in readdir(joinpath(@__DIR__, "..", "examples")) +for filename in readdir(EXAMPLES_SRC) endswith(filename, ".jl") || continue name = splitext(filename)[1] Literate.markdown( - joinpath(@__DIR__, "..", "examples", filename), - joinpath(@__DIR__, "src", "examples"), + joinpath(EXAMPLES_SRC, filename), + EXAMPLES_OUT, name = name, documenter = true, ) From 125480fe66adabf16680955932c53a92d509bec8 Mon Sep 17 00:00:00 2001 From: ST John Date: Mon, 18 Jan 2021 16:47:43 +0000 Subject: [PATCH 12/19] break cells notebook-style --- examples/kernelridgeregression.jl | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/examples/kernelridgeregression.jl b/examples/kernelridgeregression.jl index 212e3ada6..bfa968ee5 100644 --- a/examples/kernelridgeregression.jl +++ b/examples/kernelridgeregression.jl @@ -1,5 +1,8 @@ +# -*- coding: utf-8 -*- # # Kernel Ridge Regression + # ## We load KernelFunctions and some other packages + using KernelFunctions using LinearAlgebra using Distributions @@ -12,6 +15,7 @@ seed!(42) # ## Data Generation # We generated data in 1 dimension + xmin = -3; xmax = 3; # Bounds of the data N = 50 # Number of samples @@ -21,12 +25,14 @@ y_train = sinc.(x_train) + randn(N) * σ # We create a function and add some noi x_test = range(xmin - 0.1, xmax + 0.1; length=300) # Plot the data + scatter(x_train, y_train; lab="data") plot!(x_test, sinc; lab="true function") # ## Kernel training # To train the kernel parameters via ForwardDiff.jl # we need to create a function creating a kernel from an array + kernelcall(θ) = transform( exp(θ[1]) * SqExponentialKernel(),# + exp(θ[2]) * Matern32Kernel(), exp(θ[3]), @@ -34,6 +40,7 @@ kernelcall(θ) = transform( # From theory we know the prediction for a test set x given # the kernel parameters and normalization constant + function f(x, x_train, y_train, θ) k = kernelcall(θ[1:3]) return kernelmatrix(k, x, x_train) * @@ -43,6 +50,7 @@ end # We look how the prediction looks like # with starting parameters [1.0, 1.0, 1.0, 1.0] we get : + ŷ = f(x_test, x_train, y_train, log.(ones(4))) scatter(x_train, y_train; lab="data") plot!(x_test, sinc; lab="true function") @@ -50,15 +58,18 @@ plot!(x_test, ŷ; lab="prediction") # We define the loss based on the L2 norm both # for the loss and the regularization + function loss(θ) ŷ = f(x_train, x_train, y_train, θ) return sum(abs2, y_train - ŷ) + exp(θ[4]) * norm(ŷ) end # The loss with our starting point : + loss(log.(ones(4))) # ## Training the model + θ = vcat(log.([1.0, 0.0, 0.01]), log(0.001)) # Initial vector anim = Animation() opt = Optimise.ADAGrad(0.5) From e188bee381f658f85bca464f6504c3b52d782620 Mon Sep 17 00:00:00 2001 From: ST John Date: Mon, 18 Jan 2021 16:49:27 +0000 Subject: [PATCH 13/19] add GP prior samples notebook --- examples/gaussianprocesspriors.jl | 81 +++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 examples/gaussianprocesspriors.jl diff --git a/examples/gaussianprocesspriors.jl b/examples/gaussianprocesspriors.jl new file mode 100644 index 000000000..ba7e647c6 --- /dev/null +++ b/examples/gaussianprocesspriors.jl @@ -0,0 +1,81 @@ +# -*- coding: utf-8 -*- +# # Gaussian process prior samples +# +# The kernels defined in this package can also be used to specify the +# covariance of a Gaussian process prior. 
+# A Gaussian process (GP) is defined by its mean function $m(\cdot)$ and its covariance function or kernel $k(\cdot, \cdot')$: +# $$ +# f \sim \mathcal{GP}(m(\cdot), k(\cdot, \cdot') +# $$ +# The function values of the GP at a finite number of points $X = \{x_n\}_{n=1}^N$ follow a multivariate normal distribution with mean vector $\mathbf{m}$ and covariance matrix $\mathrm{K}$, where +# $$ +# \begin{align} +# \mathbf{m}_i &= m(x_i) \\ +# \mathrm{K}_{i,j} &= k(x_i, x_j) +# \end{align} +# $$ +# where $1 \le i, j \le N$. +# +# In this notebook we show samples from zero-mean GPs with different kernels. + +# Load required packages +using KernelFunctions +using LinearAlgebra +using Distributions +using Plots +default(; lw=1.0, legendfontsize=15.0) +using Random: seed! +seed!(42); + +num_inputs = 101 +xlim = (-5, 5) +X = reshape(collect(range(xlim...; length=num_inputs)), :, 1) +num_samples = 11 +v = randn(num_inputs, num_samples); + +function visualize(k::Kernel; xref=0.0) + K = kernelmatrix(k, X; obsdim=1) + L = cholesky(K + 1e-6 * I) + f = L.L * v + + p_kernel_2d = heatmap( + K; + yflip=true, + colorbar=false, + ylabel=string(k), + framestyle=:none, + #color=:blues, + vlim=(0, 1), + title=raw"$k(x, x')$", + ) + + p_kernel_cut = plot( + X, + k.(X, xref); + title=string(raw"$k(x, ", xref, raw")$"), + xlim=xlim, + xticks=(xlim, xlim), + label=nothing, + ) + + p_samples = plot( + X, + f; + c="blue", + title=raw"$f(x)$", + ylim=(-3, 3), + xlim=xlim, + xticks=(xlim, xlim), + label=nothing, + ) + + return plot(p_kernel_2d, p_kernel_cut, p_samples; layout=(1, 3), xlabel=raw"$x$") +end + +visualize(SqExponentialKernel()) + +kernel_classes = [Matern12Kernel, Matern32Kernel, Matern52Kernel, SqExponentialKernel] +plot( + [visualize(k()) for k in kernel_classes]..., + #layout=(length(kernel_classes), 1) +) From c7ecd3139d67741e24391b912fcea3b6deb0ad39 Mon Sep 17 00:00:00 2001 From: ST John Date: Mon, 18 Jan 2021 19:00:56 +0000 Subject: [PATCH 14/19] rename kernelridgeregression.jl -> train_kernel_parameters.jl --- examples/{kernelridgeregression.jl => train_kernel_parameters.jl} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/{kernelridgeregression.jl => train_kernel_parameters.jl} (100%) diff --git a/examples/kernelridgeregression.jl b/examples/train_kernel_parameters.jl similarity index 100% rename from examples/kernelridgeregression.jl rename to examples/train_kernel_parameters.jl From a2f8a5e4a2f198bf1f4eff7a7ce17af2c4cd64c0 Mon Sep 17 00:00:00 2001 From: ST John Date: Mon, 18 Jan 2021 19:02:45 +0000 Subject: [PATCH 15/19] new kernel_ridge_regression.jl tutorial --- examples/kernel_ridge_regression.jl | 184 ++++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100644 examples/kernel_ridge_regression.jl diff --git a/examples/kernel_ridge_regression.jl b/examples/kernel_ridge_regression.jl new file mode 100644 index 000000000..96bdfaf3d --- /dev/null +++ b/examples/kernel_ridge_regression.jl @@ -0,0 +1,184 @@ +# -*- coding: utf-8 -*- +# # Kernel Ridge Regression +# +# Building on linear regression, we can fit non-linear data sets by introducing a feature space. In a higher-dimensional feature space, we can overfit the data; ridge regression introduces regularization to avoid this. In this notebook we show how we can use KernelFunctions.jl for *kernel* ridge regression. 
+ +# + +# Loading and setup of required packages +using KernelFunctions +using LinearAlgebra +using Distributions + +# Plotting +using Plots; +default(; lw=2.0, legendfontsize=15.0); + +# Gradient-based optimization +using Flux: Optimise +using ForwardDiff + +using Random: seed! +seed!(42); +# - + +# ## From linear regression to ridge regression +# Here we use a one-dimensional toy problem. We generate data using the fourth-order polynomial $f(x) = (x+4)(x+1)(x-1)(x-3)$: + +# + +f_truth(x) = (x + 4) * (x + 1) * (x - 1) * (x - 3) + +x_train = collect(-5:0.5:5) +x_test = collect(-5:0.1:5) + +noise = rand(Uniform(-10, 10), size(x_train)) +y_train = f_truth.(x_train) + noise +y_test = f_truth.(x_test) + +plot(x_test, y_test; label=raw"$f(x)$") +scatter!(x_train, y_train; label="observations") +# - + +# For training inputs $\mathrm{X}=(\mathbf{x}_n)_{n=1}^N$ and observations $\mathbf{y}=(y_n)_{n=1}^N$, the linear regression weights $\mathbf{w}$ using the least-squares estimator are given by +# $$ +# \mathbf{w} = (\mathrm{X}^\top \mathrm{X})^{-1} \mathrm{X}^\top \mathbf{y} +# $$ +# We predict at test inputs $\mathbf{x}_*$ using +# $$ +# \hat{y}_* = \mathbf{x}_*^\top \mathbf{w} +# $$ +# This is implemented by `linear_regression`: + +function linear_regression(X::AbstractVecOrMat, y::AbstractVector, Xstar::AbstractVecOrMat) + weights = inv(X' * X) * X' * y + return Xstar * weights +end + +# A linear regression fit to the above data set: + +y_pred = linear_regression(x_train, y_train, x_test) +scatter(x_train, y_train; label="observations") +plot!(x_test, y_pred; label="linear fit") + +# We can improve the fit by including additional features, i.e. generalizing to $\mathrm{X} = (\phi(x_n))_{n=1}^N$, where $\phi(x)$ constructs a feature vector for each input $x$. Here we include powers of the input, $\phi(x) = (1, x, x^2, \dots, x^d)$: + +function featurize_poly(x; degree=1) + xcols = [x .^ d for d in 0:degree] + return hcat(xcols...) +end + +# + +function featurized_fit_and_plot(degree) + X = featurize_poly(x_train; degree=degree) + Xstar = featurize_poly(x_test; degree=degree) + y_pred = linear_regression(X, y_train, Xstar) + scatter(x_train, y_train; label=nothing) + p = plot!(x_test, y_pred; label=nothing, title="fit of order $degree") + return p +end + +plot([featurized_fit_and_plot(degree) for degree in 1:4]...) +# - + +# Note that the fit becomes perfect when we include exactly as many orders in the features as we have in the underlying polynomial (4). +# +# However, when increasing the number of features, we can quickly overfit to noise in the data set: + +featurized_fit_and_plot(18) + +# To counteract this unwanted behaviour, we can introduce regularization. This leads to *ridge regression* with $L_2$ regularization of the weights ([Tikhonov regularization](https://en.wikipedia.org/wiki/Tikhonov_regularization)). 
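# More precisely, ridge regression chooses the weights that minimize the regularized least-squares objective
# $$
# \mathbf{w} = \operatorname{argmin}_{\mathbf{w}'} \; \|\mathbf{y} - \mathrm{X} \mathbf{w}'\|_2^2 + \lambda \|\mathbf{w}'\|_2^2
# $$
# Setting the gradient with respect to the weights to zero yields the closed-form solution given next.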
+# Instead of the weights in linear regression, +# $$ +# \mathbf{w} = (\mathrm{X}^\top \mathrm{X})^{-1} \mathrm{X}^\top \mathbf{y} +# $$ +# we introduce the ridge parameter $\lambda$: +# $$ +# \mathbf{w} = (\mathrm{X}^\top \mathrm{X} + \lambda \mathbb{1})^{-1} \mathrm{X}^\top \mathbf{y} +# $$ +# As before, we predict at test inputs $\mathbf{x}_*$ using +# $$ +# \hat{y}_* = \mathbf{x}_*^\top \mathbf{w} +# $$ +# This is implemented by `ridge_regression`: + +function ridge_regression( + X::AbstractVecOrMat, y::AbstractVector, Xstar::AbstractVecOrMat, lambda::Float64 +) + weights = inv(X' * X + lambda * I) * X' * y + return Xstar * weights +end + +# + +function regularized_fit_and_plot(degree, lambda) + X = featurize_poly(x_train; degree=degree) + Xstar = featurize_poly(x_test; degree=degree) + y_pred = ridge_regression(X, y_train, Xstar, lambda) + scatter(x_train, y_train; label=nothing) + p = plot!(x_test, y_pred; label=nothing, title=string(raw"$\lambda=", lambda, raw"$")) + return p +end + +plot([regularized_fit_and_plot(18, lambda) for lambda in [1e-4, 1e-2, 0.1, 10]]...) +# - + +# Instead of constructing the feature matrix explicitly, we can use *kernels* to replace inner products of feature vectors with a kernel evaluation: $\langle \phi(x), \phi(x') \rangle = k(x, x')$ or $\mathrm{X} \mathrm{X}^\top = \mathrm{K}$, where $\mathrm{K}_{ij} = k(x_i, x_j)$. +# +# To apply this "kernel trick" to ridge regression, we can rewrite the ridge estimate for the weights +# $$ +# \mathbf{w} = (\mathrm{X}^\top \mathrm{X} + \lambda \mathbb{1})^{-1} \mathrm{X}^\top \mathbf{y} +# $$ +# using the [matrix inversion lemma](https://tlienart.github.io/pub/csml/mtheory/matinvlem.html#basic_lemmas) +# as +# $$ +# \mathbf{w} = \mathrm{X}^\top (\mathrm{X} \mathrm{X}^\top + \lambda \mathbb{1})^{-1} \mathbf{y} +# $$ +# where we can now replace the inner product with the kernel matrix, +# $$ +# \mathbf{w} = \mathrm{X}^\top (\mathrm{K} + \lambda \mathbb{1})^{-1} \mathbf{y} +# $$ +# And the prediction yields another inner product, +# $$ +# \hat{y}_* = \mathbf{x}_*^\top \mathbf{w} = \langle \mathbf{x}_*, \mathbf{w} \rangle = \mathbf{k}_* (\mathrm{K} + \lambda \mathbb{1})^{-1} \mathbf{y} +# $$ +# where $(\mathbf{k}_*)_n = k(x_*, x_n)$. +# +# This is implemented by `kernel_ridge_regression`: + +function kernel_ridge_regression( + k::Kernel, + X::AbstractVecOrMat, + y::AbstractVector, + Xstar::AbstractVecOrMat, + lambda::Float64, +) + K = kernelmatrix(k, X) + kstar = kernelmatrix(k, Xstar, X) + return kstar * inv(K + lambda * I) * y +end + +# Now, instead of explicitly constructing features, we can simply pass in a `PolynomialKernel` object: + +# + +function kernelized_fit_and_plot(kernel, lambda=1e-4) + y_pred = kernel_ridge_regression(kernel, x_train, y_train, x_test, lambda) + if kernel isa PolynomialKernel + title = string("order ", kernel.degree) + else + title = string(kernel) + end + scatter(x_train, y_train; label=nothing) + p = plot!( + x_test, + y_pred; + label=nothing, + title=title, + #title=string(raw"$\lambda=", lambda, raw"$") + ) + return p +end + +plot([kernelized_fit_and_plot(PolynomialKernel(; degree=degree, c=1)) for degree in 1:4]...) 
+# - + +# However, we can now also use kernels that would have an infinite-dimensional feature expansion, such as the squared exponential kernel: + +kernelized_fit_and_plot(SqExponentialKernel()) From 542f93230a4cd850d00515f4ec2f631c44b036e3 Mon Sep 17 00:00:00 2001 From: ST John Date: Mon, 18 Jan 2021 19:06:16 +0000 Subject: [PATCH 16/19] update make.jl for example notebooks --- docs/make.jl | 6 ++++-- examples/automaticstatistician.jl | 1 - 2 files changed, 4 insertions(+), 3 deletions(-) delete mode 100644 examples/automaticstatistician.jl diff --git a/docs/make.jl b/docs/make.jl index 0908576b6..6801a0a14 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -32,9 +32,11 @@ makedocs(; "Home" => "index.md", "User Guide" => "userguide.md", "Examples" => [ + "Kernel Ridge Regression" => "examples/kernel_ridge_regression.md", + "Training kernel parameters" => "examples/train_kernel_parameters.md", + "Gaussian process priors" => "examples/gaussianprocesspriors.md", "SVM" => "examples/svm.md", - # "Kernel Ridge Regression" => "examples/kernelridgeregression.md", - # "Deep Kernel Learning" => "examples/deepkernellearning.md", + "Deep Kernel Learning" => "examples/deepkernellearning.md", ], "Kernel Functions" => "kernels.md", "Input Transforms" => "transform.md", diff --git a/examples/automaticstatistician.jl b/examples/automaticstatistician.jl deleted file mode 100644 index 8b1378917..000000000 --- a/examples/automaticstatistician.jl +++ /dev/null @@ -1 +0,0 @@ - From 2d84c74da733ab74665c141b0698da01a08cd1f2 Mon Sep 17 00:00:00 2001 From: ST John Date: Mon, 18 Jan 2021 22:30:30 +0000 Subject: [PATCH 17/19] update format --- examples/gaussianprocesspriors.jl | 11 ++++++++++- examples/kernel_ridge_regression.jl | 18 ++---------------- 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/examples/gaussianprocesspriors.jl b/examples/gaussianprocesspriors.jl index ba7e647c6..cd34c4724 100644 --- a/examples/gaussianprocesspriors.jl +++ b/examples/gaussianprocesspriors.jl @@ -18,7 +18,7 @@ # # In this notebook we show samples from zero-mean GPs with different kernels. -# Load required packages +## Load required packages using KernelFunctions using LinearAlgebra using Distributions @@ -27,6 +27,10 @@ default(; lw=1.0, legendfontsize=15.0) using Random: seed! seed!(42); +# We now define a function that visualizes a kernel +# for us. We use the same randomness to obtain +# comparable samples. + num_inputs = 101 xlim = (-5, 5) X = reshape(collect(range(xlim...; length=num_inputs)), :, 1) @@ -72,8 +76,13 @@ function visualize(k::Kernel; xref=0.0) return plot(p_kernel_2d, p_kernel_cut, p_samples; layout=(1, 3), xlabel=raw"$x$") end +# We can now visualize a kernel and show samples from +# a Gaussian process with this kernel: + visualize(SqExponentialKernel()) +# This also allows us to compare different kernels: + kernel_classes = [Matern12Kernel, Matern32Kernel, Matern52Kernel, SqExponentialKernel] plot( [visualize(k()) for k in kernel_classes]..., diff --git a/examples/kernel_ridge_regression.jl b/examples/kernel_ridge_regression.jl index 96bdfaf3d..f242a0d4a 100644 --- a/examples/kernel_ridge_regression.jl +++ b/examples/kernel_ridge_regression.jl @@ -3,28 +3,21 @@ # # Building on linear regression, we can fit non-linear data sets by introducing a feature space. In a higher-dimensional feature space, we can overfit the data; ridge regression introduces regularization to avoid this. In this notebook we show how we can use KernelFunctions.jl for *kernel* ridge regression. 
-# + -# Loading and setup of required packages +## Loading and setup of required packages using KernelFunctions using LinearAlgebra using Distributions -# Plotting +## Plotting using Plots; default(; lw=2.0, legendfontsize=15.0); -# Gradient-based optimization -using Flux: Optimise -using ForwardDiff - using Random: seed! seed!(42); -# - # ## From linear regression to ridge regression # Here we use a one-dimensional toy problem. We generate data using the fourth-order polynomial $f(x) = (x+4)(x+1)(x-1)(x-3)$: -# + f_truth(x) = (x + 4) * (x + 1) * (x - 1) * (x - 3) x_train = collect(-5:0.5:5) @@ -36,7 +29,6 @@ y_test = f_truth.(x_test) plot(x_test, y_test; label=raw"$f(x)$") scatter!(x_train, y_train; label="observations") -# - # For training inputs $\mathrm{X}=(\mathbf{x}_n)_{n=1}^N$ and observations $\mathbf{y}=(y_n)_{n=1}^N$, the linear regression weights $\mathbf{w}$ using the least-squares estimator are given by # $$ @@ -66,7 +58,6 @@ function featurize_poly(x; degree=1) return hcat(xcols...) end -# + function featurized_fit_and_plot(degree) X = featurize_poly(x_train; degree=degree) Xstar = featurize_poly(x_test; degree=degree) @@ -77,7 +68,6 @@ function featurized_fit_and_plot(degree) end plot([featurized_fit_and_plot(degree) for degree in 1:4]...) -# - # Note that the fit becomes perfect when we include exactly as many orders in the features as we have in the underlying polynomial (4). # @@ -107,7 +97,6 @@ function ridge_regression( return Xstar * weights end -# + function regularized_fit_and_plot(degree, lambda) X = featurize_poly(x_train; degree=degree) Xstar = featurize_poly(x_test; degree=degree) @@ -118,7 +107,6 @@ function regularized_fit_and_plot(degree, lambda) end plot([regularized_fit_and_plot(18, lambda) for lambda in [1e-4, 1e-2, 0.1, 10]]...) -# - # Instead of constructing the feature matrix explicitly, we can use *kernels* to replace inner products of feature vectors with a kernel evaluation: $\langle \phi(x), \phi(x') \rangle = k(x, x')$ or $\mathrm{X} \mathrm{X}^\top = \mathrm{K}$, where $\mathrm{K}_{ij} = k(x_i, x_j)$. # @@ -157,7 +145,6 @@ end # Now, instead of explicitly constructing features, we can simply pass in a `PolynomialKernel` object: -# + function kernelized_fit_and_plot(kernel, lambda=1e-4) y_pred = kernel_ridge_regression(kernel, x_train, y_train, x_test, lambda) if kernel isa PolynomialKernel @@ -177,7 +164,6 @@ function kernelized_fit_and_plot(kernel, lambda=1e-4) end plot([kernelized_fit_and_plot(PolynomialKernel(; degree=degree, c=1)) for degree in 1:4]...) 
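# --- Editor's note (not part of the patch): why a `PolynomialKernel` can stand in for
# the explicit `featurize_poly` features used earlier. For degree 2 and offset c = 1 in
# the scalar case, (x*y + 1)^2 is the inner product of the weighted monomial features
# ϕ(x) = [1, √2·x, x²], so kernel ridge regression with this kernel is ridge regression
# in that (implicitly weighted) feature space; the fits are therefore close to, but not
# identical with, the unweighted explicit construction. `ϕ`, `x1`, `x2` are
# illustrative names. A quick check:
using LinearAlgebra
ϕ(x) = [1.0, sqrt(2) * x, x^2]
x1, x2 = 0.7, -1.3
(x1 * x2 + 1)^2 ≈ dot(ϕ(x1), ϕ(x2))  # true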
-# - # However, we can now also use kernels that would have an infinite-dimensional feature expansion, such as the squared exponential kernel: From 9e4e144ad97c169b6c1b8b292ed320cc145f4590 Mon Sep 17 00:00:00 2001 From: st-- Date: Wed, 30 Jun 2021 16:06:11 +0300 Subject: [PATCH 18/19] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: David Widmann Co-authored-by: Théo Galy-Fajou --- examples/gaussianprocesspriors.jl | 14 ++++----- examples/kernel_ridge_regression.jl | 47 +++++++++++------------------ examples/svm.jl | 1 - examples/train_kernel_parameters.jl | 13 +++----- 4 files changed, 30 insertions(+), 45 deletions(-) diff --git a/examples/gaussianprocesspriors.jl b/examples/gaussianprocesspriors.jl index cd34c4724..38da13a9a 100644 --- a/examples/gaussianprocesspriors.jl +++ b/examples/gaussianprocesspriors.jl @@ -4,16 +4,16 @@ # The kernels defined in this package can also be used to specify the # covariance of a Gaussian process prior. # A Gaussian process (GP) is defined by its mean function $m(\cdot)$ and its covariance function or kernel $k(\cdot, \cdot')$: -# $$ -# f \sim \mathcal{GP}(m(\cdot), k(\cdot, \cdot') -# $$ +# ```math +# f \sim \mathcal{GP}\big(m(\cdot), k(\cdot, \cdot')\big) +# ``` # The function values of the GP at a finite number of points $X = \{x_n\}_{n=1}^N$ follow a multivariate normal distribution with mean vector $\mathbf{m}$ and covariance matrix $\mathrm{K}$, where -# $$ -# \begin{align} +# ```math +# \begin{aligned} # \mathbf{m}_i &= m(x_i) \\ # \mathrm{K}_{i,j} &= k(x_i, x_j) -# \end{align} -# $$ +# \end{aligned} +# ``` # where $1 \le i, j \le N$. # # In this notebook we show samples from zero-mean GPs with different kernels. diff --git a/examples/kernel_ridge_regression.jl b/examples/kernel_ridge_regression.jl index f242a0d4a..a76905462 100644 --- a/examples/kernel_ridge_regression.jl +++ b/examples/kernel_ridge_regression.jl @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Kernel Ridge Regression # # Building on linear regression, we can fit non-linear data sets by introducing a feature space. In a higher-dimensional feature space, we can overfit the data; ridge regression introduces regularization to avoid this. In this notebook we show how we can use KernelFunctions.jl for *kernel* ridge regression. 
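# --- Editor's sketch (not part of the patch): the remark above, that the squared
# exponential kernel corresponds to an infinite-dimensional feature expansion, can be
# made concrete in the scalar, unit-lengthscale case via
# exp(-(x - y)^2 / 2) = exp(-x^2 / 2) * exp(-y^2 / 2) * sum_j (x*y)^j / j!,
# i.e. an inner product of infinitely many weighted monomials. Truncating the series
# already reproduces the kernel to machine precision for moderate inputs (`k_se` and
# `k_truncated` are illustrative names; the package's exact scaling convention may differ).
k_se(x, y) = exp(-(x - y)^2 / 2)
k_truncated(x, y; nterms=20) =
    exp(-x^2 / 2) * exp(-y^2 / 2) * sum((x * y)^j / factorial(j) for j in 0:nterms)
k_se(0.3, -0.8) ≈ k_truncated(0.3, -0.8)  # true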
@@ -31,17 +30,17 @@ plot(x_test, y_test; label=raw"$f(x)$") scatter!(x_train, y_train; label="observations") # For training inputs $\mathrm{X}=(\mathbf{x}_n)_{n=1}^N$ and observations $\mathbf{y}=(y_n)_{n=1}^N$, the linear regression weights $\mathbf{w}$ using the least-squares estimator are given by -# $$ +# ```math # \mathbf{w} = (\mathrm{X}^\top \mathrm{X})^{-1} \mathrm{X}^\top \mathbf{y} -# $$ +# ``` # We predict at test inputs $\mathbf{x}_*$ using -# $$ +# ```math # \hat{y}_* = \mathbf{x}_*^\top \mathbf{w} -# $$ +# ``` # This is implemented by `linear_regression`: -function linear_regression(X::AbstractVecOrMat, y::AbstractVector, Xstar::AbstractVecOrMat) - weights = inv(X' * X) * X' * y +function linear_regression(X, y, Xstar) + weights = (X' * X) \ (X' * y) return Xstar * weights end @@ -62,9 +61,8 @@ function featurized_fit_and_plot(degree) X = featurize_poly(x_train; degree=degree) Xstar = featurize_poly(x_test; degree=degree) y_pred = linear_regression(X, y_train, Xstar) - scatter(x_train, y_train; label=nothing) - p = plot!(x_test, y_pred; label=nothing, title="fit of order $degree") - return p + scatter(x_train, y_train; legend=false, title="fit of order $degree") + return plot!(x_test, y_pred) end plot([featurized_fit_and_plot(degree) for degree in 1:4]...) @@ -85,15 +83,13 @@ featurized_fit_and_plot(18) # \mathbf{w} = (\mathrm{X}^\top \mathrm{X} + \lambda \mathbb{1})^{-1} \mathrm{X}^\top \mathbf{y} # $$ # As before, we predict at test inputs $\mathbf{x}_*$ using -# $$ +# ```math # \hat{y}_* = \mathbf{x}_*^\top \mathbf{w} -# $$ +# ``` # This is implemented by `ridge_regression`: -function ridge_regression( - X::AbstractVecOrMat, y::AbstractVector, Xstar::AbstractVecOrMat, lambda::Float64 -) - weights = inv(X' * X + lambda * I) * X' * y +function ridge_regression(X, y, Xstar, lambda) + weights = (X' * X + lambda * I) \ (X' * y) return Xstar * weights end @@ -101,9 +97,8 @@ function regularized_fit_and_plot(degree, lambda) X = featurize_poly(x_train; degree=degree) Xstar = featurize_poly(x_test; degree=degree) y_pred = ridge_regression(X, y_train, Xstar, lambda) - scatter(x_train, y_train; label=nothing) - p = plot!(x_test, y_pred; label=nothing, title=string(raw"$\lambda=", lambda, raw"$")) - return p + scatter(x_train, y_train; legend=false, title="\$\\lambda=$lambda\$") + return plot!(x_test, y_pred) end plot([regularized_fit_and_plot(18, lambda) for lambda in [1e-4, 1e-2, 0.1, 10]]...) @@ -124,23 +119,17 @@ plot([regularized_fit_and_plot(18, lambda) for lambda in [1e-4, 1e-2, 0.1, 10]]. # \mathbf{w} = \mathrm{X}^\top (\mathrm{K} + \lambda \mathbb{1})^{-1} \mathbf{y} # $$ # And the prediction yields another inner product, -# $$ +# ```math # \hat{y}_* = \mathbf{x}_*^\top \mathbf{w} = \langle \mathbf{x}_*, \mathbf{w} \rangle = \mathbf{k}_* (\mathrm{K} + \lambda \mathbb{1})^{-1} \mathbf{y} -# $$ +# ``` # where $(\mathbf{k}_*)_n = k(x_*, x_n)$. 
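# --- Editor's note (not part of the patch): in the prediction formula above, the solve
# against the training data does not depend on the test inputs, so in practice one
# would factorize K + λI once and reuse the result for every prediction. A minimal
# sketch under that design choice (`kernel_ridge_fit`, `kernel_ridge_predict` and `α`
# are illustrative names, not part of the example):
using KernelFunctions, LinearAlgebra
function kernel_ridge_fit(k, x, y, lambda)
    K = kernelmatrix(k, x)
    return cholesky(Symmetric(K + lambda * I)) \ y  # α = (K + λI)⁻¹ y, computed once
end
kernel_ridge_predict(k, x, xstar, α) = kernelmatrix(k, xstar, x) * α
# e.g.: α = kernel_ridge_fit(SqExponentialKernel(), x_train, y_train, 1e-4)
#       ŷ = kernel_ridge_predict(SqExponentialKernel(), x_train, x_test, α)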
# # This is implemented by `kernel_ridge_regression`: -function kernel_ridge_regression( - k::Kernel, - X::AbstractVecOrMat, - y::AbstractVector, - Xstar::AbstractVecOrMat, - lambda::Float64, -) +function kernel_ridge_regression(k, X, y, Xstar, lambda) K = kernelmatrix(k, X) kstar = kernelmatrix(k, Xstar, X) - return kstar * inv(K + lambda * I) * y + return kstar * ((K + lambda * I) \ y) end # Now, instead of explicitly constructing features, we can simply pass in a `PolynomialKernel` object: diff --git a/examples/svm.jl b/examples/svm.jl index fa9be7b15..aecd218c9 100644 --- a/examples/svm.jl +++ b/examples/svm.jl @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Support Vector Machines # TODO: introduction diff --git a/examples/train_kernel_parameters.jl b/examples/train_kernel_parameters.jl index bfa968ee5..c3cc5d1dd 100644 --- a/examples/train_kernel_parameters.jl +++ b/examples/train_kernel_parameters.jl @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Kernel Ridge Regression # ## We load KernelFunctions and some other packages @@ -44,8 +43,7 @@ kernelcall(θ) = transform( function f(x, x_train, y_train, θ) k = kernelcall(θ[1:3]) return kernelmatrix(k, x, x_train) * - inv(kernelmatrix(k, x_train) + exp(θ[4]) * I) * - y_train + ((kernelmatrix(k, x_train) + exp(θ[4]) * I) \ y_train) end # We look how the prediction looks like @@ -70,14 +68,13 @@ loss(log.(ones(4))) # ## Training the model -θ = vcat(log.([1.0, 0.0, 0.01]), log(0.001)) # Initial vector +θ = log.([1.0, 0.0, 0.01, 0.001]) # Initial vector anim = Animation() opt = Optimise.ADAGrad(0.5) for i in 1:30 - grads = ForwardDiff.gradient(x -> loss(x), θ) # We compute the gradients given the kernel parameters and regularization - Δ = Optimise.apply!(opt, θ, grads) - θ .-= Δ # We apply a simple Gradient descent algorithm - p = scatter( + grads = ForwardDiff.gradient(loss, θ) # We compute the gradients given the kernel parameters and regularization + Optimise.update!(opt, θ, grads) + scatter( x_train, y_train; lab="data", title="i = $(i), Loss = $(round(loss(θ), digits = 4))" ) plot!(x_test, sinc; lab="true function") From 3195326ccf29b230e615d8d872145e64302554bd Mon Sep 17 00:00:00 2001 From: st-- Date: Wed, 30 Jun 2021 17:03:08 +0300 Subject: [PATCH 19/19] Update docs/make.jl --- docs/make.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/make.jl b/docs/make.jl index dbd74d723..ab0da26c1 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -27,7 +27,6 @@ isempty(processes) || success(processes) || error("some examples were not run su # Build documentation using KernelFunctions using Documenter -using Literate # Print `@debug` statements (https://github.com/JuliaDocs/Documenter.jl/issues/955) if haskey(ENV, "GITHUB_ACTIONS")