From fcda0c6785afd6ea8c20509c4bb4321a1340d30d Mon Sep 17 00:00:00 2001 From: Theo Galy-Fajou Date: Thu, 4 Jun 2020 17:59:57 +0200 Subject: [PATCH 01/19] Draft for example scripts --- examples/deepkernellearning.jl | 73 +++++++++++++++++-------- examples/kernelridgeregression.jl | 91 ++++++++++++++++++++----------- examples/svm.jl | 40 ++++++++------ src/kernels/transformedkernel.jl | 6 +- 4 files changed, 136 insertions(+), 74 deletions(-) diff --git a/examples/deepkernellearning.jl b/examples/deepkernellearning.jl index b77f0d13e..50c433705 100644 --- a/examples/deepkernellearning.jl +++ b/examples/deepkernellearning.jl @@ -3,33 +3,58 @@ using MLDataUtils using Zygote using Flux using Distributions, LinearAlgebra -using Plots +using Plots; +pyplot(); default(legendfontsize = 15.0, linewidth = 3.0) +using SliceMap -Flux.@functor SqExponentialKernel -Flux.@functor KernelSum -Flux.@functor Matern32Kernel -Flux.@functor FunctionTransform +neuralnet = Chain(Dense(1, 10), Dense(10, 2)) + +Base.map( + t::KernelFunctions.FunctionTransform, + X::AbstractVector; + obsdim::Int = defaultobs, +) where {T} = + slicemap(X, dims = 1) do x + t.f(x) + end -neuralnet = Chain(Dense(1,3),Dense(3,2)) -k = SqExponentialKernel(FunctionTransform(neuralnet)) xmin = -3; xmax = 3 -x = range(xmin,xmax,length=100) -x_test = rand(Uniform(xmin,xmax),200) -x,y = noisy_function(sinc,x;noise=0.1) -X = reshape(x,:,1) -λ = [0.1] -f(x,k,λ) = kernelmatrix(k,X,x,obsdim=1)*inv(kernelmatrix(k,X,obsdim=1)+exp(λ[1])*I)*y -f(X,k,1.0) -loss(k,λ) = f(X,k,λ) |> ŷ ->sum(y-ŷ)/length(y)+exp(λ[1])*norm(ŷ) -loss(k,λ) + +x_train = range(xmin, xmax, length = 100) +x_test = rand(Uniform(xmin, xmax), 200) +x_train, y = noisy_function(x_train; noise = 0.01) do x + sinc(abs(x) ^ abs(x)) +end +X = reshape(x_train, :, 1) +k = transform(SqExponentialKernel(), FunctionTransform(neuralnet)) + +λ = log.([1.0]) +function f(x, k, λ) + K = kernelmatrix(k, X, x, obsdim = 1) + return K * inv(K + exp(λ[1]) * I) * y +end +f(X, k, 1.0) +loss(k, λ) = f(X, k, λ) |> ŷ -> sum(abs2, y - ŷ) / length(y) + exp(λ[1]) * norm(ŷ) + +@info "Init Loss = $(loss(k, λ))" + ps = Flux.params(k) -# push!(ps,λ) -opt = Flux.Momentum(1.0) +push!(ps,λ) + +p = Plots.scatter(x_train, y, lab = "data", title = "Loss = $(loss(k, λ))") +Plots.plot!(x_train, f(X, k, λ), lab = "Prediction", lw = 3.0) |> display ## -for i in 1:10 - grads = Zygote.gradient(()->loss(k,λ),ps) - Flux.Optimise.update!(opt,ps,grads) - p = Plots.scatter(x,y,lab="data",title="Loss = $(loss(k,λ))") - Plots.plot!(x,f(X,k,λ),lab="Prediction",lw=3.0) - display(p) +anim = Animation() +nmax= 10 +opt = Flux.ADAGrad(0.001) +@progress for i = 1:nmax + grads = Zygote.gradient(() -> loss(k, λ), ps) + Flux.Optimise.update!(opt, ps, grads) + if i % 100 == 0 + @info "$i/$nmax" + p = Plots.scatter(x, y, lab = "data", title = "Loss = $(loss(k,λ))") + Plots.plot!(x, f(X, k, λ), lab = "Prediction", lw = 3.0) + frame(anim) + end end +gif(anim, fps = 5) diff --git a/examples/kernelridgeregression.jl b/examples/kernelridgeregression.jl index 5c1d61d86..d6e999e06 100644 --- a/examples/kernelridgeregression.jl +++ b/examples/kernelridgeregression.jl @@ -1,35 +1,64 @@ +# Kernel Ridge Regression using KernelFunctions -using MLDataUtils -using Zygote -using Flux -using Distributions, LinearAlgebra +using LinearAlgebra +using Distributions using Plots +using Flux: Optimise +using ForwardDiff -Flux.@functor SqExponentialKernel -Flux.@functor ScaleTransform -Flux.@functor KernelSum -Flux.@functor Matern32Kernel - -xmin = -3; xmax = 3 -x = 
range(xmin,xmax,length=100) -x_test = range(xmin,xmax,length=300) -x,y = noisy_function(sinc,x;noise=0.1) -X = reshape(x,:,1) -X_test = reshape(x_test,:,1) -k = SqExponentialKernel(1.0)#+Matern32Kernel(2.0) -λ = [-1.0] -f(x,k,λ) = kernelmatrix(k,x,X,obsdim=1)*inv(kernelmatrix(k,X,obsdim=1)+exp(λ[1])*I)*y -f(X,k,1.0) -loss(k,λ) = f(X,k,λ) |> ŷ ->sum(y-ŷ)/length(y)+exp(λ[1])*norm(ŷ) -loss(k,λ) -ps = Flux.params(k) -push!(ps,λ) -opt = Flux.Momentum(0.1) -## -for i in 1:10 - grads = Zygote.gradient(()->loss(k,λ),ps) - Flux.Optimise.update!(opt,ps,grads) - p = Plots.scatter(x,y,lab="data",title="Loss = $(loss(k,λ))") - Plots.plot!(x_test,f(X_test,k,λ),lab="Prediction",lw=3.0) - display(p) +# We generate our data : +xmin = -3; xmax = 3 # Bounds of the data +N = 50 # Number of samples +σ = 0.1 + +x_train = rand(Uniform(xmin, xmax), N) # We sample 100 random samples +x_test = range(xmin, xmax, length = 300) # We use x_test to show the prediction function +y = sinc.(x_train) + randn(N) * σ # We create a function and add some noise + +# Plot the data +scatter(x_train, y, lab = "data") +plot!(x_test, sinc, lab = "true function") + +# Create a function taking kernel parameters and creating a kernel +kernelcall(θ) = transform( + exp(θ[1]) * SqExponentialKernel(),# + exp(θ[2]) * Matern32Kernel(), + exp(θ[3]), +) + +# Return the prediction given the normalization value λ +function f(x, θ) + k = kernelcall(θ[1:3]) + kernelmatrix(k, x, x_train) * + inv(kernelmatrix(k, x_train) + exp(θ[4]) * I) * y +end + +# Starting with parameters [1.0, 1.0, 1.0, 1.0] we get : +ŷ = f(x_test, log.(ones(4))) +scatter(x_train, y, lab = "data") +plot!(x_test, sinc, lab = "true function") +plot!(x_test, ŷ, lab = "prediction") + +# Create a loss on the training data +function loss(θ) + ŷ = f(x_train, θ) + sum(abs2, y - ŷ) + exp(θ[4]) * norm(ŷ) +end + +# The loss with our starting point : +loss(log.(ones(4))) + +## Training model + +θ = vcat(log.([1.0, 0.0, 0.01]), log(0.001)) +anim = Animation() +opt = ADAGrad(0.5) +@progress for i = 1:30 + grads = ForwardDiff.gradient(x -> loss(x), θ) # We compute the gradients given the kernel parameters and regularization + Δ = Optimise.apply!(opt, θ, grads) + θ .-= Δ # We apply a simple Gradient descent algorithm + p = scatter(x_train, y, lab = "data", title = "i = $(i), Loss = $(round(loss(θ), digits = 4))") + plot!(x_test, sinc, lab = "true function") + plot!(x_test, f(x_test, θ), lab = "Prediction", lw = 3.0) + frame(anim) end +gif(anim) diff --git a/examples/svm.jl b/examples/svm.jl index 823b2e1d4..30efc9519 100644 --- a/examples/svm.jl +++ b/examples/svm.jl @@ -5,19 +5,27 @@ using Flux using Distributions, LinearAlgebra using Plots -N = 100 #Number of samples -μ = randn(2,2) # Random Centers -xgrid = range(-3,3,length=100) # Create a grid -Xgrid = hcat(collect.(Iterators.product(xgrid,xgrid))...)' #Combine into a 2D grid -y = rand([-1,1],N) # Select randomly between the two classes -X = zeros(N,2) -X[y.==1,:] = rand(MvNormal(μ[:,1],I),count(y.==1))' #Attribute samples from class 1 -X[y.==-1,:] = rand(MvNormal(μ[:,2],I),count(y.==-1))' # Attribute samples from class 2 - - -k = SqExponentialKernel(2.0) # Create kernel function -f(x,k,λ) = kernelmatrix(k,x,X,obsdim=1)*inv(kernelmatrix(k,X,obsdim=1)+exp(λ[1])*I)*y # Optimal prediction f -svmloss(y,ŷ)= f(X,k,λ) |> ŷ -> sum(maximum.(0.0,1-y*ŷ)) - λ*norm(ŷ) # Total svm loss with regularisation -pred = f(Xgrid,k,λ) #Compute prediction on a grid -contourf(xgrid,xgrid,pred) -scatter!(eachcol(X)...,color=y,lab="data") +N = 100 # Number of 
samples +N_test = 200 # Size of the grid +xmin = -3; xmax = 3 +μ = rand(Uniform(xmin, xmax), 2, 2) # Random Centers +xgrid = range(-xmin, xmax, length=N_test) # Create a grid +Xgrid = hcat(collect.(Iterators.product(xgrid, xgrid))...) #Combine into a 2D grid +y = rand((-1, 1), N) # Select randomly between the two classes +X_train = zeros(2, N) +X_train[:, y .== 1] = rand(MvNormal(μ[:, 1], I), count(y.==1)) #Attribute samples from class 1 +X_train[:, y .== -1] = rand(MvNormal(μ[:, 2], I), count(y.==-1)) # Attribute samples from class 2 +scatter(eachrow(X_train)..., zcolor= y) +## Compute predictions +k = SqExponentialKernel() # Create kernel function +function f(x, k, λ) + kernelmatrix(k, x, X_train, obsdim=2) * inv(kernelmatrix(k, X_train, obsdim=2) + exp(λ[1]) * I) * y # Optimal prediction f +end +λ = log.([1.0]) +function reg_hingeloss(k, λ) + ŷ = f(X, k, λ) + return sum(maximum.(0.0, 1 - y * ŷ)) - exp(λ[1]) * norm(ŷ) # Total svm loss with regularisation +end +y_grid = f(Xgrid, k, λ) #Compute prediction on a grid +contourf(xgrid, xgrid, reshape(y_grid, N_test, N_test)) +scatter!(eachrow(X_train)..., zcolor=y,lab="data") diff --git a/src/kernels/transformedkernel.jl b/src/kernels/transformedkernel.jl index b96208308..3ae038b75 100644 --- a/src/kernels/transformedkernel.jl +++ b/src/kernels/transformedkernel.jl @@ -41,11 +41,11 @@ _scale(t::ScaleTransform, metric, x, y) = evaluate(metric, t(x), t(y)) """ transform -transform(k::BaseKernel, t::Transform) = TransformedKernel(k, t) +transform(k::Kernel, t::Transform) = TransformedKernel(k, t) -transform(k::BaseKernel, ρ::Real) = TransformedKernel(k, ScaleTransform(ρ)) +transform(k::Kernel, ρ::Real) = TransformedKernel(k, ScaleTransform(ρ)) -transform(k::BaseKernel,ρ::AbstractVector) = TransformedKernel(k, ARDTransform(ρ)) +transform(k::Kernel,ρ::AbstractVector) = TransformedKernel(k, ARDTransform(ρ)) kernel(κ) = κ.kernel From 2982f7ce0baa860387804a95c553bf60e9c36c63 Mon Sep 17 00:00:00 2001 From: Theo Galy-Fajou Date: Fri, 12 Jun 2020 18:58:59 +0200 Subject: [PATCH 02/19] Work on examples, but need some fixes on transform --- examples/deepkernellearning.jl | 45 ++++++++++++++++++++-------------- examples/svm.jl | 6 +++-- 2 files changed, 31 insertions(+), 20 deletions(-) diff --git a/examples/deepkernellearning.jl b/examples/deepkernellearning.jl index 50c433705..ac4de77c2 100644 --- a/examples/deepkernellearning.jl +++ b/examples/deepkernellearning.jl @@ -1,13 +1,17 @@ +# # Loading dataset +# We use a couple of useful datasets to plot and optimize +# The different hyper-parameters using KernelFunctions using MLDataUtils using Zygote using Flux using Distributions, LinearAlgebra -using Plots; +using Plots +using AbstractGPs pyplot(); default(legendfontsize = 15.0, linewidth = 3.0) using SliceMap -neuralnet = Chain(Dense(1, 10), Dense(10, 2)) + Base.map( t::KernelFunctions.FunctionTransform, @@ -18,28 +22,33 @@ Base.map( t.f(x) end -xmin = -3; xmax = 3 +# ## Data creation +# We create a simple 1D Problem with very different variations +xmin = -3; xmax = 3 # Limits +noise = 0.01 +x_train = rand(Uniform(xmin, xmax), 100) # Training dataset +x_test = range(xmin, xmax, length=200) # Testing dataset +target_f(x) = sinc(abs(x) ^ abs(x)) # We use sinc with a highly varying value +x_train, y = noisy_function(target_f, x_train; noise = 0.01) -x_train = range(xmin, xmax, length = 100) -x_test = rand(Uniform(xmin, xmax), 200) -x_train, y = noisy_function(x_train; noise = 0.01) do x - sinc(abs(x) ^ abs(x)) -end -X = reshape(x_train, :, 1) +# ## Model 
definition +# We create a neural net with 2 layers and 10 units each +# The data is passed through the NN before being used in the kernel +neuralnet = Chain(Dense(1, 10), Dense(10, 2)) +# We use a Squared Exponential Kernel k = transform(SqExponentialKernel(), FunctionTransform(neuralnet)) -λ = log.([1.0]) -function f(x, k, λ) - K = kernelmatrix(k, X, x, obsdim = 1) - return K * inv(K + exp(λ[1]) * I) * y -end -f(X, k, 1.0) -loss(k, λ) = f(X, k, λ) |> ŷ -> sum(abs2, y - ŷ) / length(y) + exp(λ[1]) * norm(ŷ) +f = GP(k) +fx = f(x_train, noise) +fp = posterior(fx, y) +# This compute the log evidence of `y`, +# which is going to be used as the objective +loss(y) = logpdf(fx, y) +pred -@info "Init Loss = $(loss(k, λ))" +@info "Init Loss = $(loss(y))" ps = Flux.params(k) -push!(ps,λ) p = Plots.scatter(x_train, y, lab = "data", title = "Loss = $(loss(k, λ))") Plots.plot!(x_train, f(X, k, λ), lab = "Prediction", lw = 3.0) |> display diff --git a/examples/svm.jl b/examples/svm.jl index 30efc9519..c92600938 100644 --- a/examples/svm.jl +++ b/examples/svm.jl @@ -1,14 +1,16 @@ using KernelFunctions -using MLDataUtils using Zygote using Flux using Distributions, LinearAlgebra using Plots +# # + N = 100 # Number of samples N_test = 200 # Size of the grid xmin = -3; xmax = 3 -μ = rand(Uniform(xmin, xmax), 2, 2) # Random Centers + +μ = rand(Uniform(xmin, xmax), 2, 2) # Sample 2 Random Centers xgrid = range(-xmin, xmax, length=N_test) # Create a grid Xgrid = hcat(collect.(Iterators.product(xgrid, xgrid))...) #Combine into a 2D grid y = rand((-1, 1), N) # Select randomly between the two classes From c83829a912b969f622843e877535404a06de2aba Mon Sep 17 00:00:00 2001 From: Theo Galy-Fajou Date: Mon, 6 Jul 2020 18:22:18 +0200 Subject: [PATCH 03/19] Formatting for Literate.jl --- examples/svm.jl | 65 +++++++++++++++++++++++--------------- src/matrix/kernelmatrix.jl | 2 +- 2 files changed, 40 insertions(+), 27 deletions(-) diff --git a/examples/svm.jl b/examples/svm.jl index c92600938..0e10e0622 100644 --- a/examples/svm.jl +++ b/examples/svm.jl @@ -1,33 +1,46 @@ +# # Support Vector Machines +# ## We first load some needed packages using KernelFunctions -using Zygote -using Flux using Distributions, LinearAlgebra -using Plots - -# # - -N = 100 # Number of samples -N_test = 200 # Size of the grid -xmin = -3; xmax = 3 +using Plots; default(legendfontsize = 15.0, ms = 5.0) +# ## Data Generation +# ### We first generate a mixture of two Gaussians in 2 dimensions +xmin = -3; xmax = 3 # Limits for sampling μ₁ and μ₂ μ = rand(Uniform(xmin, xmax), 2, 2) # Sample 2 Random Centers -xgrid = range(-xmin, xmax, length=N_test) # Create a grid -Xgrid = hcat(collect.(Iterators.product(xgrid, xgrid))...) 
#Combine into a 2D grid +# ### We then sample both y and x +N = 100 # Number of samples y = rand((-1, 1), N) # Select randomly between the two classes -X_train = zeros(2, N) -X_train[:, y .== 1] = rand(MvNormal(μ[:, 1], I), count(y.==1)) #Attribute samples from class 1 -X_train[:, y .== -1] = rand(MvNormal(μ[:, 2], I), count(y.==-1)) # Attribute samples from class 2 -scatter(eachrow(X_train)..., zcolor= y) -## Compute predictions -k = SqExponentialKernel() # Create kernel function -function f(x, k, λ) - kernelmatrix(k, x, X_train, obsdim=2) * inv(kernelmatrix(k, X_train, obsdim=2) + exp(λ[1]) * I) * y # Optimal prediction f +x = Vector{Vector{Float64}}(undef, N) # We preallocate x +x[y .== 1] = [rand(MvNormal(μ[:, 1], I)) for _ in 1:count(y.==1)] # Features for samples of class 1 +x[y .== -1] = [rand(MvNormal(μ[:, 2], I)) for _ in 1:count(y.==-1)] # Features for samples of class 2 +scatter(getindex.(x[y .== 1], 1), getindex.(x[y .== 1], 2), label = "y = 1", title = "Data") +scatter!(getindex.(x[y .== -1], 1), getindex.(x[y .== -1], 2), label = "y = 2") + +# ## Model Definition +# TODO Write theory here +# ### We create a kernel k +k = SqExponentialKernel() # SqExponentialKernel or RBFKernel +λ = 1.0 # Regularization parameter + +# ### We create a function to return the optimal prediction for a +# test data `x_new` +function f(x_new, x, y, k, λ) + kernelmatrix(k, x_new, x) * inv(kernelmatrix(k, x) + λ * I) * y # Optimal prediction f end -λ = log.([1.0]) -function reg_hingeloss(k, λ) - ŷ = f(X, k, λ) - return sum(maximum.(0.0, 1 - y * ŷ)) - exp(λ[1]) * norm(ŷ) # Total svm loss with regularisation + +# ### We also compute the total loss of the model that we want to minimize +hingeloss(y, ŷ) = maximum(zero(ŷ), 1 - y * ŷ) # hingeloss function +function reg_hingeloss(k, x, y, λ) + ŷ = f(x, x, y, k, λ) + return sum(hingeloss.(y, ŷ)) - λ * norm(ŷ) # Total svm loss with regularisation end -y_grid = f(Xgrid, k, λ) #Compute prediction on a grid -contourf(xgrid, xgrid, reshape(y_grid, N_test, N_test)) -scatter!(eachrow(X_train)..., zcolor=y,lab="data") +# ### We create a 2D grid based on the maximum values of the data +N_test = 200 # Size of the grid +xgrid = range(extrema(vcat(x...)).*1.1..., length=N_test) # Create a 1D grid +xgrid = vec(collect.(Iterators.product(xgrid, xgrid))) #Combine into a 2D grid +# ### We predict the value of y on this grid on plot it against the data +y_grid = f(xgrid, x, y, k, λ) #Compute prediction on a grid +contourf(xgrid, xgrid, reshape(y_grid, N_test, N_test)', label = "Predictions", title="Trained model") +scatter!(getindex.(x[y .== 1], 1), getindex.(x[y .== 1], 2), label = "y = 1") +scatter!(getindex.(x[y .== -1], 1), getindex.(x[y .== -1], 2), label = "y = 2") diff --git a/src/matrix/kernelmatrix.jl b/src/matrix/kernelmatrix.jl index bc24f7354..8b07841f9 100644 --- a/src/matrix/kernelmatrix.jl +++ b/src/matrix/kernelmatrix.jl @@ -89,7 +89,7 @@ function kernelmatrix(κ::SimpleKernel, x::AbstractVector) end function kernelmatrix(κ::SimpleKernel, x::AbstractVector, y::AbstractVector) - validate_dims(x, y) + # validate_dims(x, y) return map(d -> kappa(κ, d), pairwise(metric(κ), x, y)) end From 118e9b80055b0109061ae3948001d6af66b8660d Mon Sep 17 00:00:00 2001 From: Theo Galy-Fajou Date: Wed, 8 Jul 2020 19:59:14 +0200 Subject: [PATCH 04/19] Added examples to make.jl --- docs/.gitignore | 3 +- docs/Project.toml | 13 ++++++ docs/make.jl | 25 ++++++++++- examples/automaticstatistician.jl | 0 examples/deepkernellearning.jl | 72 ++++++++++++++++++------------- 
examples/kernelridgeregression.jl | 57 +++++++++++++----------- examples/svm.jl | 2 +- 7 files changed, 111 insertions(+), 61 deletions(-) create mode 100644 examples/automaticstatistician.jl diff --git a/docs/.gitignore b/docs/.gitignore index 90fb8a549..909052372 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -1,4 +1,3 @@ build/ site/ - -#Temp to avoid to many changes +src/examples/ diff --git a/docs/Project.toml b/docs/Project.toml index 9e80a3254..0252eb1e6 100644 --- a/docs/Project.toml +++ b/docs/Project.toml @@ -1,5 +1,18 @@ [deps] +AbstractGPs = "99985d1d-32ba-4be9-9821-2ec096f28918" +Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4" +Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" +ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +Literate = "98b081ad-f1c9-55d3-8b20-4c87d4299306" +Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" +Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" [compat] +Distributions = "0.23" Documenter = "0.23, 0.24, 0.25" +Flux = "0.8" +ForwardDiff = "0.10" +Literate = "2" +Plots = "1" diff --git a/docs/make.jl b/docs/make.jl index e0366f01d..03772733c 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -1,13 +1,33 @@ using Documenter +using Literate using KernelFunctions +if ispath(joinpath(@__DIR__, "src", "examples")) + rm(joinpath(@__DIR__, "src", "examples"), recursive=true) +end + +for filename in readdir(joinpath(@__DIR__, "..", "examples")) + endswith(filename, ".jl") || continue + name = splitext(filename)[1] + Literate.markdown( + joinpath(@__DIR__, "..", "examples", filename), + joinpath(@__DIR__, "src", "examples"), + name = name, + documenter = true, + ) +end + makedocs( sitename = "KernelFunctions", format = Documenter.HTML(), modules = [KernelFunctions], pages = ["Home"=>"index.md", "User Guide" => "userguide.md", - "Examples"=>"example.md", + "Examples"=> + ["SVM" => "examples/svm.md", + # "Kernel Ridge Regression" => "examples/kernelridgeregression.md", + # "Deep Kernel Learning" => "examples/deepkernellearning.md", + ], "Kernel Functions"=>"kernels.md", "Transform"=>"transform.md", "Metrics"=>"metrics.md", @@ -22,5 +42,6 @@ makedocs( deploydocs( deps = Deps.pip("mkdocs", "python-markdown-math"), repo = "github.com/JuliaGaussianProcesses/KernelFunctions.jl.git", - target = "build" + target = "build", + push_preview = true, ) diff --git a/examples/automaticstatistician.jl b/examples/automaticstatistician.jl new file mode 100644 index 000000000..e69de29bb diff --git a/examples/deepkernellearning.jl b/examples/deepkernellearning.jl index ac4de77c2..6f858e8ea 100644 --- a/examples/deepkernellearning.jl +++ b/examples/deepkernellearning.jl @@ -1,35 +1,36 @@ -# # Loading dataset -# We use a couple of useful datasets to plot and optimize -# The different hyper-parameters +# # Deep Kernel Learning with Flux +# ## Package loading +# We use a couple of useful packages to plot and optimize +# the different hyper-parameters using KernelFunctions -using MLDataUtils -using Zygote using Flux using Distributions, LinearAlgebra using Plots using AbstractGPs pyplot(); default(legendfontsize = 15.0, linewidth = 3.0) -using SliceMap +# using SliceMap -Base.map( - t::KernelFunctions.FunctionTransform, - X::AbstractVector; - obsdim::Int = defaultobs, -) where {T} = - slicemap(X, dims = 1) do x - t.f(x) - end +# Base.map( +# t::KernelFunctions.FunctionTransform, +# X::AbstractVector; +# obsdim::Int = defaultobs, +# ) where {T} = +# slicemap(X, dims = 1) 
do x +# t.f(x) +# end # ## Data creation # We create a simple 1D Problem with very different variations xmin = -3; xmax = 3 # Limits +N = 100 noise = 0.01 -x_train = rand(Uniform(xmin, xmax), 100) # Training dataset -x_test = range(xmin, xmax, length=200) # Testing dataset +x_train = collect(eachrow(rand(Uniform(xmin, xmax), N))) # Training dataset target_f(x) = sinc(abs(x) ^ abs(x)) # We use sinc with a highly varying value -x_train, y = noisy_function(target_f, x_train; noise = 0.01) +target_f(x::AbstractArray) = target_f(first(x)) +y_train = target_f.(x_train) + randn(N) * noise +x_test = collect(eachrow(range(xmin, xmax, length=200))) # Testing dataset # ## Model definition # We create a neural net with 2 layers and 10 units each @@ -38,32 +39,41 @@ neuralnet = Chain(Dense(1, 10), Dense(10, 2)) # We use a Squared Exponential Kernel k = transform(SqExponentialKernel(), FunctionTransform(neuralnet)) -f = GP(k) -fx = f(x_train, noise) -fp = posterior(fx, y) +# We use AbstractGPs.jl to define our model +gpprior = GP(k) # GP Prior +fx = AbstractGPs.FiniteGP(gpprior, x_train, noise) # Prior on f +fp = posterior(fx, y_train) # Posterior of f + # This compute the log evidence of `y`, # which is going to be used as the objective loss(y) = logpdf(fx, y) -pred -@info "Init Loss = $(loss(y))" +@info "Init Loss = $(loss(y_train))" +# Flux will automatically extract all the parameters of the kernel ps = Flux.params(k) -p = Plots.scatter(x_train, y, lab = "data", title = "Loss = $(loss(k, λ))") -Plots.plot!(x_train, f(X, k, λ), lab = "Prediction", lw = 3.0) |> display -## +# We show the initial prediction with the untrained model +p = Plots.plot(vcat(x_test...), target_f, lab = "true f", title = "Loss = $(loss(y_train))") +Plots.scatter!(vcat(x_train...), y_train, lab = "data") +pred = marginals(fp(x_test)) +Plots.plot!(vcat(x_test...), mean.(pred), ribbon = std.(pred), lab = "Prediction") +# ## Training anim = Animation() nmax= 10 opt = Flux.ADAGrad(0.001) -@progress for i = 1:nmax - grads = Zygote.gradient(() -> loss(k, λ), ps) +for i = 1:nmax + grads = gradient(ps) do + loss(y_train) + end Flux.Optimise.update!(opt, ps, grads) - if i % 100 == 0 + # if i % 100 == 0 @info "$i/$nmax" - p = Plots.scatter(x, y, lab = "data", title = "Loss = $(loss(k,λ))") - Plots.plot!(x, f(X, k, λ), lab = "Prediction", lw = 3.0) + p = Plots.plot(vcat(x_test...), target_f, lab = "true f", title = "Loss = $(loss(y_train))") + p = Plots.scatter!(vcat(x_train...), y_train, lab = "data") + pred = marginals(fp(x_test)) + Plots.plot!(vcat(x_test...), mean.(pred), ribbon = std.(pred), lab = "Prediction") frame(anim) - end + # end end gif(anim, fps = 5) diff --git a/examples/kernelridgeregression.jl b/examples/kernelridgeregression.jl index d6e999e06..2efa474bd 100644 --- a/examples/kernelridgeregression.jl +++ b/examples/kernelridgeregression.jl @@ -1,64 +1,71 @@ -# Kernel Ridge Regression +# # Kernel Ridge Regression +# ## We load KernelFunctions and some other packages using KernelFunctions using LinearAlgebra using Distributions -using Plots +using Plots; default(lw = 2.0, legendfontsize = 15.0) using Flux: Optimise using ForwardDiff +using Random: seed! 
+seed!(42) -# We generate our data : +# ## Data Generation +# We generated data in 1 dimension xmin = -3; xmax = 3 # Bounds of the data N = 50 # Number of samples -σ = 0.1 - x_train = rand(Uniform(xmin, xmax), N) # We sample 100 random samples -x_test = range(xmin, xmax, length = 300) # We use x_test to show the prediction function -y = sinc.(x_train) + randn(N) * σ # We create a function and add some noise +σ = 0.1 +y_train = sinc.(x_train) + randn(N) * σ # We create a function and add some noise +x_test = range(xmin-0.1, xmax+0.1, length=300) # Plot the data -scatter(x_train, y, lab = "data") +scatter(x_train, y_train, lab = "data") plot!(x_test, sinc, lab = "true function") -# Create a function taking kernel parameters and creating a kernel +# ## Kernel training +# To train the kernel parameters via ForwardDiff.jl +# we need to create a function creating a kernel from an array kernelcall(θ) = transform( exp(θ[1]) * SqExponentialKernel(),# + exp(θ[2]) * Matern32Kernel(), exp(θ[3]), ) -# Return the prediction given the normalization value λ -function f(x, θ) +# From theory we know the prediction for a test set x given +# the kernel parameters and normalization constant +function f(x, x_train, y_train, θ) k = kernelcall(θ[1:3]) kernelmatrix(k, x, x_train) * - inv(kernelmatrix(k, x_train) + exp(θ[4]) * I) * y + inv(kernelmatrix(k, x_train) + exp(θ[4]) * I) * y_train end -# Starting with parameters [1.0, 1.0, 1.0, 1.0] we get : -ŷ = f(x_test, log.(ones(4))) -scatter(x_train, y, lab = "data") +# We look how the prediction looks like +# with starting parameters [1.0, 1.0, 1.0, 1.0] we get : +ŷ = f(x_test, x_train, y_train, log.(ones(4))) +scatter(x_train, y_train, lab = "data") plot!(x_test, sinc, lab = "true function") plot!(x_test, ŷ, lab = "prediction") -# Create a loss on the training data +# We define the loss based on the L2 norm both +# for the loss and the regularization function loss(θ) - ŷ = f(x_train, θ) - sum(abs2, y - ŷ) + exp(θ[4]) * norm(ŷ) + ŷ = f(x_train, x_train, y_train, θ) + sum(abs2, y_train - ŷ) + exp(θ[4]) * norm(ŷ) end # The loss with our starting point : loss(log.(ones(4))) -## Training model - -θ = vcat(log.([1.0, 0.0, 0.01]), log(0.001)) +# ## Training the model +θ = vcat(log.([1.0, 0.0, 0.01]), log(0.001)) # Initial vector anim = Animation() -opt = ADAGrad(0.5) -@progress for i = 1:30 +opt = Optimise.ADAGrad(0.5) +for i = 1:30 grads = ForwardDiff.gradient(x -> loss(x), θ) # We compute the gradients given the kernel parameters and regularization Δ = Optimise.apply!(opt, θ, grads) θ .-= Δ # We apply a simple Gradient descent algorithm - p = scatter(x_train, y, lab = "data", title = "i = $(i), Loss = $(round(loss(θ), digits = 4))") + p = scatter(x_train, y_train, lab = "data", title = "i = $(i), Loss = $(round(loss(θ), digits = 4))") plot!(x_test, sinc, lab = "true function") - plot!(x_test, f(x_test, θ), lab = "Prediction", lw = 3.0) + plot!(x_test, f(x_test, x_train, y_train, θ), lab = "Prediction", lw = 3.0) frame(anim) end gif(anim) diff --git a/examples/svm.jl b/examples/svm.jl index 0e10e0622..5200c626e 100644 --- a/examples/svm.jl +++ b/examples/svm.jl @@ -1,5 +1,5 @@ # # Support Vector Machines -# ## We first load some needed packages +# ## Package loading using KernelFunctions using Distributions, LinearAlgebra using Plots; default(legendfontsize = 15.0, ms = 5.0) From 438fcee0582bf553e3d27c8c62e31ffbf13e19ef Mon Sep 17 00:00:00 2001 From: Theo Galy-Fajou Date: Thu, 9 Jul 2020 00:16:20 +0200 Subject: [PATCH 05/19] Small corrections --- 
examples/deepkernellearning.jl | 4 ++-- examples/svm.jl | 10 ++++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/examples/deepkernellearning.jl b/examples/deepkernellearning.jl index 6f858e8ea..01771e53b 100644 --- a/examples/deepkernellearning.jl +++ b/examples/deepkernellearning.jl @@ -67,13 +67,13 @@ for i = 1:nmax loss(y_train) end Flux.Optimise.update!(opt, ps, grads) - # if i % 100 == 0 + if i % 100 == 0 @info "$i/$nmax" p = Plots.plot(vcat(x_test...), target_f, lab = "true f", title = "Loss = $(loss(y_train))") p = Plots.scatter!(vcat(x_train...), y_train, lab = "data") pred = marginals(fp(x_test)) Plots.plot!(vcat(x_test...), mean.(pred), ribbon = std.(pred), lab = "Prediction") frame(anim) - # end + end end gif(anim, fps = 5) diff --git a/examples/svm.jl b/examples/svm.jl index 5200c626e..9bb262496 100644 --- a/examples/svm.jl +++ b/examples/svm.jl @@ -20,7 +20,7 @@ scatter!(getindex.(x[y .== -1], 1), getindex.(x[y .== -1], 2), label = "y = 2") # ## Model Definition # TODO Write theory here # ### We create a kernel k -k = SqExponentialKernel() # SqExponentialKernel or RBFKernel +k = SqExponentialKernel() # SqExponentialKernel/RBFKernel λ = 1.0 # Regularization parameter # ### We create a function to return the optimal prediction for a @@ -36,11 +36,13 @@ function reg_hingeloss(k, x, y, λ) return sum(hingeloss.(y, ŷ)) - λ * norm(ŷ) # Total svm loss with regularisation end # ### We create a 2D grid based on the maximum values of the data -N_test = 200 # Size of the grid +N_test = 100 # Size of the grid xgrid = range(extrema(vcat(x...)).*1.1..., length=N_test) # Create a 1D grid -xgrid = vec(collect.(Iterators.product(xgrid, xgrid))) #Combine into a 2D grid +xgrid_v = vec(collect.(Iterators.product(xgrid, xgrid))) #Combine into a 2D grid # ### We predict the value of y on this grid on plot it against the data -y_grid = f(xgrid, x, y, k, λ) #Compute prediction on a grid +y_grid = f(xgrid_v, x, y, k, λ) #Compute prediction on a grid contourf(xgrid, xgrid, reshape(y_grid, N_test, N_test)', label = "Predictions", title="Trained model") scatter!(getindex.(x[y .== 1], 1), getindex.(x[y .== 1], 2), label = "y = 1") scatter!(getindex.(x[y .== -1], 1), getindex.(x[y .== -1], 2), label = "y = 2") +xlims!(extrema(xgrid)) +ylims!(extrema(xgrid)) From 0aaf2bb72e45687bc1cd5324ba9fb97f69902803 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Galy-Fajou?= Date: Fri, 6 Nov 2020 11:47:13 +0100 Subject: [PATCH 06/19] Moved examples to docs --- {examples => docs/examples}/automaticstatistician.jl | 0 {examples => docs/examples}/deepkernellearning.jl | 0 {examples => docs/examples}/kernelridgeregression.jl | 0 {examples => docs/examples}/svm.jl | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename {examples => docs/examples}/automaticstatistician.jl (100%) rename {examples => docs/examples}/deepkernellearning.jl (100%) rename {examples => docs/examples}/kernelridgeregression.jl (100%) rename {examples => docs/examples}/svm.jl (100%) diff --git a/examples/automaticstatistician.jl b/docs/examples/automaticstatistician.jl similarity index 100% rename from examples/automaticstatistician.jl rename to docs/examples/automaticstatistician.jl diff --git a/examples/deepkernellearning.jl b/docs/examples/deepkernellearning.jl similarity index 100% rename from examples/deepkernellearning.jl rename to docs/examples/deepkernellearning.jl diff --git a/examples/kernelridgeregression.jl b/docs/examples/kernelridgeregression.jl similarity index 100% rename from 
examples/kernelridgeregression.jl rename to docs/examples/kernelridgeregression.jl diff --git a/examples/svm.jl b/docs/examples/svm.jl similarity index 100% rename from examples/svm.jl rename to docs/examples/svm.jl From cfa203fc5798b3e87188a01bbdcdccf027d6419c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Galy-Fajou?= Date: Fri, 6 Nov 2020 18:30:02 +0100 Subject: [PATCH 07/19] Small updates --- docs/examples/deepkernellearning.jl | 41 ++++++++++++----------------- src/kernels/transformedkernel.jl | 2 -- 2 files changed, 17 insertions(+), 26 deletions(-) diff --git a/docs/examples/deepkernellearning.jl b/docs/examples/deepkernellearning.jl index 01771e53b..f90dc07dd 100644 --- a/docs/examples/deepkernellearning.jl +++ b/docs/examples/deepkernellearning.jl @@ -6,38 +6,28 @@ using KernelFunctions using Flux using Distributions, LinearAlgebra using Plots +using ProgressMeter using AbstractGPs pyplot(); default(legendfontsize = 15.0, linewidth = 3.0) -# using SliceMap - - - -# Base.map( -# t::KernelFunctions.FunctionTransform, -# X::AbstractVector; -# obsdim::Int = defaultobs, -# ) where {T} = -# slicemap(X, dims = 1) do x -# t.f(x) -# end # ## Data creation # We create a simple 1D Problem with very different variations xmin = -3; xmax = 3 # Limits -N = 100 +N = 150 noise = 0.01 x_train = collect(eachrow(rand(Uniform(xmin, xmax), N))) # Training dataset target_f(x) = sinc(abs(x) ^ abs(x)) # We use sinc with a highly varying value target_f(x::AbstractArray) = target_f(first(x)) y_train = target_f.(x_train) + randn(N) * noise x_test = collect(eachrow(range(xmin, xmax, length=200))) # Testing dataset - +spectral_mixture_kernel() # ## Model definition # We create a neural net with 2 layers and 10 units each # The data is passed through the NN before being used in the kernel -neuralnet = Chain(Dense(1, 10), Dense(10, 2)) -# We use a Squared Exponential Kernel -k = transform(SqExponentialKernel(), FunctionTransform(neuralnet)) +neuralnet = Chain(Dense(1, 20), Dense(20, 30), Dense(30, 5)) +# We use two cases : +# - The Squared Exponential Kernel +k = transform(SqExponentialKernel(), FunctionTransform(neuralnet) ) # We use AbstractGPs.jl to define our model gpprior = GP(k) # GP Prior @@ -46,7 +36,7 @@ fp = posterior(fx, y_train) # Posterior of f # This compute the log evidence of `y`, # which is going to be used as the objective -loss(y) = logpdf(fx, y) +loss(y) = -logpdf(fx, y) @info "Init Loss = $(loss(y_train))" @@ -54,26 +44,29 @@ loss(y) = logpdf(fx, y) ps = Flux.params(k) # We show the initial prediction with the untrained model -p = Plots.plot(vcat(x_test...), target_f, lab = "true f", title = "Loss = $(loss(y_train))") +p_init = Plots.plot(vcat(x_test...), target_f, lab = "true f", title = "Loss = $(loss(y_train))") Plots.scatter!(vcat(x_train...), y_train, lab = "data") pred = marginals(fp(x_test)) Plots.plot!(vcat(x_test...), mean.(pred), ribbon = std.(pred), lab = "Prediction") # ## Training anim = Animation() -nmax= 10 -opt = Flux.ADAGrad(0.001) -for i = 1:nmax - grads = gradient(ps) do +nmax= 1000 +opt = Flux.ADAM(0.1) +@showprogress for i = 1:nmax + global grads = gradient(ps) do loss(y_train) end Flux.Optimise.update!(opt, ps, grads) if i % 100 == 0 @info "$i/$nmax" + L = loss(y_train) + # @info "Loss = $L" p = Plots.plot(vcat(x_test...), target_f, lab = "true f", title = "Loss = $(loss(y_train))") p = Plots.scatter!(vcat(x_train...), y_train, lab = "data") - pred = marginals(fp(x_test)) + pred = marginals(posterior(fx, y_train)(x_test)) Plots.plot!(vcat(x_test...), mean.(pred), 
ribbon = std.(pred), lab = "Prediction") frame(anim) + display(p) end end gif(anim, fps = 5) diff --git a/src/kernels/transformedkernel.jl b/src/kernels/transformedkernel.jl index ec9eddc8b..3d1928cd3 100644 --- a/src/kernels/transformedkernel.jl +++ b/src/kernels/transformedkernel.jl @@ -52,8 +52,6 @@ end transform(k::Kernel,ρ::AbstractVector) = TransformedKernel(k, ARDTransform(ρ)) -transform(k::Kernel, ρ::AbstractVector) = transform(k, ARDTransform(ρ)) - kernel(κ) = κ.kernel Base.show(io::IO, κ::TransformedKernel) = printshifted(io, κ, 0) From f438d9811ae4a6bd30a4e2f6d5bad494063769f9 Mon Sep 17 00:00:00 2001 From: ST John Date: Thu, 7 Jan 2021 22:26:04 +0000 Subject: [PATCH 08/19] fix paths --- docs/make.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/make.jl b/docs/make.jl index afed32030..892c34209 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -6,11 +6,11 @@ if ispath(joinpath(@__DIR__, "src", "examples")) rm(joinpath(@__DIR__, "src", "examples"), recursive=true) end -for filename in readdir(joinpath(@__DIR__, "..", "examples")) +for filename in readdir(joinpath(@__DIR__, "examples")) endswith(filename, ".jl") || continue name = splitext(filename)[1] Literate.markdown( - joinpath(@__DIR__, "..", "examples", filename), + joinpath(@__DIR__, "examples", filename), joinpath(@__DIR__, "src", "examples"), name = name, documenter = true, From 768e721af5319447f787497a3d892d3d3b5e57e3 Mon Sep 17 00:00:00 2001 From: ST John Date: Fri, 8 Jan 2021 11:31:09 +0000 Subject: [PATCH 09/19] move examples out of docs --- docs/make.jl | 4 ++-- {docs/examples => examples}/automaticstatistician.jl | 0 {docs/examples => examples}/deepkernellearning.jl | 0 {docs/examples => examples}/kernelridgeregression.jl | 0 {docs/examples => examples}/svm.jl | 0 5 files changed, 2 insertions(+), 2 deletions(-) rename {docs/examples => examples}/automaticstatistician.jl (100%) rename {docs/examples => examples}/deepkernellearning.jl (100%) rename {docs/examples => examples}/kernelridgeregression.jl (100%) rename {docs/examples => examples}/svm.jl (100%) diff --git a/docs/make.jl b/docs/make.jl index 892c34209..afed32030 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -6,11 +6,11 @@ if ispath(joinpath(@__DIR__, "src", "examples")) rm(joinpath(@__DIR__, "src", "examples"), recursive=true) end -for filename in readdir(joinpath(@__DIR__, "examples")) +for filename in readdir(joinpath(@__DIR__, "..", "examples")) endswith(filename, ".jl") || continue name = splitext(filename)[1] Literate.markdown( - joinpath(@__DIR__, "examples", filename), + joinpath(@__DIR__, "..", "examples", filename), joinpath(@__DIR__, "src", "examples"), name = name, documenter = true, diff --git a/docs/examples/automaticstatistician.jl b/examples/automaticstatistician.jl similarity index 100% rename from docs/examples/automaticstatistician.jl rename to examples/automaticstatistician.jl diff --git a/docs/examples/deepkernellearning.jl b/examples/deepkernellearning.jl similarity index 100% rename from docs/examples/deepkernellearning.jl rename to examples/deepkernellearning.jl diff --git a/docs/examples/kernelridgeregression.jl b/examples/kernelridgeregression.jl similarity index 100% rename from docs/examples/kernelridgeregression.jl rename to examples/kernelridgeregression.jl diff --git a/docs/examples/svm.jl b/examples/svm.jl similarity index 100% rename from docs/examples/svm.jl rename to examples/svm.jl From 0eb6f36c161d08491a82a9735b0d0b453b947968 Mon Sep 17 00:00:00 2001 From: ST John Date: Fri, 8 Jan 2021 11:38:22 
+0000 Subject: [PATCH 10/19] cleanup svm.jl --- examples/svm.jl | 39 +++++++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/examples/svm.jl b/examples/svm.jl index 9bb262496..906d4f03b 100644 --- a/examples/svm.jl +++ b/examples/svm.jl @@ -1,14 +1,23 @@ +# -*- coding: utf-8 -*- # # Support Vector Machines -# ## Package loading + +# TODO: introduction +# +# We first load the packages we will need in this notebook: + using KernelFunctions using Distributions, LinearAlgebra using Plots; default(legendfontsize = 15.0, ms = 5.0) # ## Data Generation -# ### We first generate a mixture of two Gaussians in 2 dimensions +# +# We first generate a mixture of two Gaussians in 2 dimensions + xmin = -3; xmax = 3 # Limits for sampling μ₁ and μ₂ μ = rand(Uniform(xmin, xmax), 2, 2) # Sample 2 Random Centers -# ### We then sample both y and x + +# We then sample both the input $x$ and the class $y$: + N = 100 # Number of samples y = rand((-1, 1), N) # Select randomly between the two classes x = Vector{Vector{Float64}}(undef, N) # We preallocate x @@ -19,27 +28,37 @@ scatter!(getindex.(x[y .== -1], 1), getindex.(x[y .== -1], 2), label = "y = 2") # ## Model Definition # TODO Write theory here -# ### We create a kernel k -k = SqExponentialKernel() # SqExponentialKernel/RBFKernel + +# ### Kernel + +k = SqExponentialKernel() # SqExponentialKernel/RBFKernel λ = 1.0 # Regularization parameter -# ### We create a function to return the optimal prediction for a -# test data `x_new` +# ### Predictor +# We create a function to return the optimal prediction for a test data `x_new` + function f(x_new, x, y, k, λ) kernelmatrix(k, x_new, x) * inv(kernelmatrix(k, x) + λ * I) * y # Optimal prediction f end -# ### We also compute the total loss of the model that we want to minimize +# ### Loss +# We also compute the total loss of the model that we want to minimize + hingeloss(y, ŷ) = maximum(zero(ŷ), 1 - y * ŷ) # hingeloss function function reg_hingeloss(k, x, y, λ) ŷ = f(x, x, y, k, λ) return sum(hingeloss.(y, ŷ)) - λ * norm(ŷ) # Total svm loss with regularisation end -# ### We create a 2D grid based on the maximum values of the data + +# ### Testing the trained model +# We create a 2D grid based on the maximum values of the data + N_test = 100 # Size of the grid xgrid = range(extrema(vcat(x...)).*1.1..., length=N_test) # Create a 1D grid xgrid_v = vec(collect.(Iterators.product(xgrid, xgrid))) #Combine into a 2D grid -# ### We predict the value of y on this grid on plot it against the data + +# We predict the value of y on this grid on plot it against the data + y_grid = f(xgrid_v, x, y, k, λ) #Compute prediction on a grid contourf(xgrid, xgrid, reshape(y_grid, N_test, N_test)', label = "Predictions", title="Trained model") scatter!(getindex.(x[y .== 1], 1), getindex.(x[y .== 1], 2), label = "y = 1") From bdd0800756ea2386ea556c805f3f80a29eed1638 Mon Sep 17 00:00:00 2001 From: ST John Date: Fri, 8 Jan 2021 12:21:32 +0000 Subject: [PATCH 11/19] constants for paths --- docs/make.jl | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/docs/make.jl b/docs/make.jl index afed32030..7b3d01a0f 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -2,16 +2,19 @@ using Documenter using Literate using KernelFunctions -if ispath(joinpath(@__DIR__, "src", "examples")) - rm(joinpath(@__DIR__, "src", "examples"), recursive=true) +const EXAMPLES_SRC = joinpath(@__DIR__, "..", "examples") +const EXAMPLES_OUT = joinpath(@__DIR__, "src", "examples") + +if ispath(EXAMPLES_OUT) 
+ rm(EXAMPLES_OUT, recursive=true) end -for filename in readdir(joinpath(@__DIR__, "..", "examples")) +for filename in readdir(EXAMPLES_SRC) endswith(filename, ".jl") || continue name = splitext(filename)[1] Literate.markdown( - joinpath(@__DIR__, "..", "examples", filename), - joinpath(@__DIR__, "src", "examples"), + joinpath(EXAMPLES_SRC, filename), + EXAMPLES_OUT, name = name, documenter = true, ) From 125480fe66adabf16680955932c53a92d509bec8 Mon Sep 17 00:00:00 2001 From: ST John Date: Mon, 18 Jan 2021 16:47:43 +0000 Subject: [PATCH 12/19] break cells notebook-style --- examples/kernelridgeregression.jl | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/examples/kernelridgeregression.jl b/examples/kernelridgeregression.jl index 212e3ada6..bfa968ee5 100644 --- a/examples/kernelridgeregression.jl +++ b/examples/kernelridgeregression.jl @@ -1,5 +1,8 @@ +# -*- coding: utf-8 -*- # # Kernel Ridge Regression + # ## We load KernelFunctions and some other packages + using KernelFunctions using LinearAlgebra using Distributions @@ -12,6 +15,7 @@ seed!(42) # ## Data Generation # We generated data in 1 dimension + xmin = -3; xmax = 3; # Bounds of the data N = 50 # Number of samples @@ -21,12 +25,14 @@ y_train = sinc.(x_train) + randn(N) * σ # We create a function and add some noi x_test = range(xmin - 0.1, xmax + 0.1; length=300) # Plot the data + scatter(x_train, y_train; lab="data") plot!(x_test, sinc; lab="true function") # ## Kernel training # To train the kernel parameters via ForwardDiff.jl # we need to create a function creating a kernel from an array + kernelcall(θ) = transform( exp(θ[1]) * SqExponentialKernel(),# + exp(θ[2]) * Matern32Kernel(), exp(θ[3]), @@ -34,6 +40,7 @@ kernelcall(θ) = transform( # From theory we know the prediction for a test set x given # the kernel parameters and normalization constant + function f(x, x_train, y_train, θ) k = kernelcall(θ[1:3]) return kernelmatrix(k, x, x_train) * @@ -43,6 +50,7 @@ end # We look how the prediction looks like # with starting parameters [1.0, 1.0, 1.0, 1.0] we get : + ŷ = f(x_test, x_train, y_train, log.(ones(4))) scatter(x_train, y_train; lab="data") plot!(x_test, sinc; lab="true function") @@ -50,15 +58,18 @@ plot!(x_test, ŷ; lab="prediction") # We define the loss based on the L2 norm both # for the loss and the regularization + function loss(θ) ŷ = f(x_train, x_train, y_train, θ) return sum(abs2, y_train - ŷ) + exp(θ[4]) * norm(ŷ) end # The loss with our starting point : + loss(log.(ones(4))) # ## Training the model + θ = vcat(log.([1.0, 0.0, 0.01]), log(0.001)) # Initial vector anim = Animation() opt = Optimise.ADAGrad(0.5) From e188bee381f658f85bca464f6504c3b52d782620 Mon Sep 17 00:00:00 2001 From: ST John Date: Mon, 18 Jan 2021 16:49:27 +0000 Subject: [PATCH 13/19] add GP prior samples notebook --- examples/gaussianprocesspriors.jl | 81 +++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 examples/gaussianprocesspriors.jl diff --git a/examples/gaussianprocesspriors.jl b/examples/gaussianprocesspriors.jl new file mode 100644 index 000000000..ba7e647c6 --- /dev/null +++ b/examples/gaussianprocesspriors.jl @@ -0,0 +1,81 @@ +# -*- coding: utf-8 -*- +# # Gaussian process prior samples +# +# The kernels defined in this package can also be used to specify the +# covariance of a Gaussian process prior. 
+# A Gaussian process (GP) is defined by its mean function $m(\cdot)$ and its covariance function or kernel $k(\cdot, \cdot')$: +# $$ +# f \sim \mathcal{GP}(m(\cdot), k(\cdot, \cdot') +# $$ +# The function values of the GP at a finite number of points $X = \{x_n\}_{n=1}^N$ follow a multivariate normal distribution with mean vector $\mathbf{m}$ and covariance matrix $\mathrm{K}$, where +# $$ +# \begin{align} +# \mathbf{m}_i &= m(x_i) \\ +# \mathrm{K}_{i,j} &= k(x_i, x_j) +# \end{align} +# $$ +# where $1 \le i, j \le N$. +# +# In this notebook we show samples from zero-mean GPs with different kernels. + +# Load required packages +using KernelFunctions +using LinearAlgebra +using Distributions +using Plots +default(; lw=1.0, legendfontsize=15.0) +using Random: seed! +seed!(42); + +num_inputs = 101 +xlim = (-5, 5) +X = reshape(collect(range(xlim...; length=num_inputs)), :, 1) +num_samples = 11 +v = randn(num_inputs, num_samples); + +function visualize(k::Kernel; xref=0.0) + K = kernelmatrix(k, X; obsdim=1) + L = cholesky(K + 1e-6 * I) + f = L.L * v + + p_kernel_2d = heatmap( + K; + yflip=true, + colorbar=false, + ylabel=string(k), + framestyle=:none, + #color=:blues, + vlim=(0, 1), + title=raw"$k(x, x')$", + ) + + p_kernel_cut = plot( + X, + k.(X, xref); + title=string(raw"$k(x, ", xref, raw")$"), + xlim=xlim, + xticks=(xlim, xlim), + label=nothing, + ) + + p_samples = plot( + X, + f; + c="blue", + title=raw"$f(x)$", + ylim=(-3, 3), + xlim=xlim, + xticks=(xlim, xlim), + label=nothing, + ) + + return plot(p_kernel_2d, p_kernel_cut, p_samples; layout=(1, 3), xlabel=raw"$x$") +end + +visualize(SqExponentialKernel()) + +kernel_classes = [Matern12Kernel, Matern32Kernel, Matern52Kernel, SqExponentialKernel] +plot( + [visualize(k()) for k in kernel_classes]..., + #layout=(length(kernel_classes), 1) +) From c7ecd3139d67741e24391b912fcea3b6deb0ad39 Mon Sep 17 00:00:00 2001 From: ST John Date: Mon, 18 Jan 2021 19:00:56 +0000 Subject: [PATCH 14/19] rename kernelridgeregression.jl -> train_kernel_parameters.jl --- examples/{kernelridgeregression.jl => train_kernel_parameters.jl} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/{kernelridgeregression.jl => train_kernel_parameters.jl} (100%) diff --git a/examples/kernelridgeregression.jl b/examples/train_kernel_parameters.jl similarity index 100% rename from examples/kernelridgeregression.jl rename to examples/train_kernel_parameters.jl From a2f8a5e4a2f198bf1f4eff7a7ce17af2c4cd64c0 Mon Sep 17 00:00:00 2001 From: ST John Date: Mon, 18 Jan 2021 19:02:45 +0000 Subject: [PATCH 15/19] new kernel_ridge_regression.jl tutorial --- examples/kernel_ridge_regression.jl | 184 ++++++++++++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100644 examples/kernel_ridge_regression.jl diff --git a/examples/kernel_ridge_regression.jl b/examples/kernel_ridge_regression.jl new file mode 100644 index 000000000..96bdfaf3d --- /dev/null +++ b/examples/kernel_ridge_regression.jl @@ -0,0 +1,184 @@ +# -*- coding: utf-8 -*- +# # Kernel Ridge Regression +# +# Building on linear regression, we can fit non-linear data sets by introducing a feature space. In a higher-dimensional feature space, we can overfit the data; ridge regression introduces regularization to avoid this. In this notebook we show how we can use KernelFunctions.jl for *kernel* ridge regression. 
+ +# + +# Loading and setup of required packages +using KernelFunctions +using LinearAlgebra +using Distributions + +# Plotting +using Plots; +default(; lw=2.0, legendfontsize=15.0); + +# Gradient-based optimization +using Flux: Optimise +using ForwardDiff + +using Random: seed! +seed!(42); +# - + +# ## From linear regression to ridge regression +# Here we use a one-dimensional toy problem. We generate data using the fourth-order polynomial $f(x) = (x+4)(x+1)(x-1)(x-3)$: + +# + +f_truth(x) = (x + 4) * (x + 1) * (x - 1) * (x - 3) + +x_train = collect(-5:0.5:5) +x_test = collect(-5:0.1:5) + +noise = rand(Uniform(-10, 10), size(x_train)) +y_train = f_truth.(x_train) + noise +y_test = f_truth.(x_test) + +plot(x_test, y_test; label=raw"$f(x)$") +scatter!(x_train, y_train; label="observations") +# - + +# For training inputs $\mathrm{X}=(\mathbf{x}_n)_{n=1}^N$ and observations $\mathbf{y}=(y_n)_{n=1}^N$, the linear regression weights $\mathbf{w}$ using the least-squares estimator are given by +# $$ +# \mathbf{w} = (\mathrm{X}^\top \mathrm{X})^{-1} \mathrm{X}^\top \mathbf{y} +# $$ +# We predict at test inputs $\mathbf{x}_*$ using +# $$ +# \hat{y}_* = \mathbf{x}_*^\top \mathbf{w} +# $$ +# This is implemented by `linear_regression`: + +function linear_regression(X::AbstractVecOrMat, y::AbstractVector, Xstar::AbstractVecOrMat) + weights = inv(X' * X) * X' * y + return Xstar * weights +end + +# A linear regression fit to the above data set: + +y_pred = linear_regression(x_train, y_train, x_test) +scatter(x_train, y_train; label="observations") +plot!(x_test, y_pred; label="linear fit") + +# We can improve the fit by including additional features, i.e. generalizing to $\mathrm{X} = (\phi(x_n))_{n=1}^N$, where $\phi(x)$ constructs a feature vector for each input $x$. Here we include powers of the input, $\phi(x) = (1, x, x^2, \dots, x^d)$: + +function featurize_poly(x; degree=1) + xcols = [x .^ d for d in 0:degree] + return hcat(xcols...) +end + +# + +function featurized_fit_and_plot(degree) + X = featurize_poly(x_train; degree=degree) + Xstar = featurize_poly(x_test; degree=degree) + y_pred = linear_regression(X, y_train, Xstar) + scatter(x_train, y_train; label=nothing) + p = plot!(x_test, y_pred; label=nothing, title="fit of order $degree") + return p +end + +plot([featurized_fit_and_plot(degree) for degree in 1:4]...) +# - + +# Note that the fit becomes perfect when we include exactly as many orders in the features as we have in the underlying polynomial (4). +# +# However, when increasing the number of features, we can quickly overfit to noise in the data set: + +featurized_fit_and_plot(18) + +# To counteract this unwanted behaviour, we can introduce regularization. This leads to *ridge regression* with $L_2$ regularization of the weights ([Tikhonov regularization](https://en.wikipedia.org/wiki/Tikhonov_regularization)). 
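# More precisely, ridge regression chooses the weights that minimize the regularized least-squares objective
# $$
# \mathbf{w} = \operatorname{argmin}_{\mathbf{w}'} \; \|\mathbf{y} - \mathrm{X} \mathbf{w}'\|_2^2 + \lambda \|\mathbf{w}'\|_2^2
# $$
# Setting the gradient with respect to the weights to zero yields the closed-form solution given next.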
+# Instead of the weights in linear regression, +# $$ +# \mathbf{w} = (\mathrm{X}^\top \mathrm{X})^{-1} \mathrm{X}^\top \mathbf{y} +# $$ +# we introduce the ridge parameter $\lambda$: +# $$ +# \mathbf{w} = (\mathrm{X}^\top \mathrm{X} + \lambda \mathbb{1})^{-1} \mathrm{X}^\top \mathbf{y} +# $$ +# As before, we predict at test inputs $\mathbf{x}_*$ using +# $$ +# \hat{y}_* = \mathbf{x}_*^\top \mathbf{w} +# $$ +# This is implemented by `ridge_regression`: + +function ridge_regression( + X::AbstractVecOrMat, y::AbstractVector, Xstar::AbstractVecOrMat, lambda::Float64 +) + weights = inv(X' * X + lambda * I) * X' * y + return Xstar * weights +end + +# + +function regularized_fit_and_plot(degree, lambda) + X = featurize_poly(x_train; degree=degree) + Xstar = featurize_poly(x_test; degree=degree) + y_pred = ridge_regression(X, y_train, Xstar, lambda) + scatter(x_train, y_train; label=nothing) + p = plot!(x_test, y_pred; label=nothing, title=string(raw"$\lambda=", lambda, raw"$")) + return p +end + +plot([regularized_fit_and_plot(18, lambda) for lambda in [1e-4, 1e-2, 0.1, 10]]...) +# - + +# Instead of constructing the feature matrix explicitly, we can use *kernels* to replace inner products of feature vectors with a kernel evaluation: $\langle \phi(x), \phi(x') \rangle = k(x, x')$ or $\mathrm{X} \mathrm{X}^\top = \mathrm{K}$, where $\mathrm{K}_{ij} = k(x_i, x_j)$. +# +# To apply this "kernel trick" to ridge regression, we can rewrite the ridge estimate for the weights +# $$ +# \mathbf{w} = (\mathrm{X}^\top \mathrm{X} + \lambda \mathbb{1})^{-1} \mathrm{X}^\top \mathbf{y} +# $$ +# using the [matrix inversion lemma](https://tlienart.github.io/pub/csml/mtheory/matinvlem.html#basic_lemmas) +# as +# $$ +# \mathbf{w} = \mathrm{X}^\top (\mathrm{X} \mathrm{X}^\top + \lambda \mathbb{1})^{-1} \mathbf{y} +# $$ +# where we can now replace the inner product with the kernel matrix, +# $$ +# \mathbf{w} = \mathrm{X}^\top (\mathrm{K} + \lambda \mathbb{1})^{-1} \mathbf{y} +# $$ +# And the prediction yields another inner product, +# $$ +# \hat{y}_* = \mathbf{x}_*^\top \mathbf{w} = \langle \mathbf{x}_*, \mathbf{w} \rangle = \mathbf{k}_* (\mathrm{K} + \lambda \mathbb{1})^{-1} \mathbf{y} +# $$ +# where $(\mathbf{k}_*)_n = k(x_*, x_n)$. +# +# This is implemented by `kernel_ridge_regression`: + +function kernel_ridge_regression( + k::Kernel, + X::AbstractVecOrMat, + y::AbstractVector, + Xstar::AbstractVecOrMat, + lambda::Float64, +) + K = kernelmatrix(k, X) + kstar = kernelmatrix(k, Xstar, X) + return kstar * inv(K + lambda * I) * y +end + +# Now, instead of explicitly constructing features, we can simply pass in a `PolynomialKernel` object: + +# + +function kernelized_fit_and_plot(kernel, lambda=1e-4) + y_pred = kernel_ridge_regression(kernel, x_train, y_train, x_test, lambda) + if kernel isa PolynomialKernel + title = string("order ", kernel.degree) + else + title = string(kernel) + end + scatter(x_train, y_train; label=nothing) + p = plot!( + x_test, + y_pred; + label=nothing, + title=title, + #title=string(raw"$\lambda=", lambda, raw"$") + ) + return p +end + +plot([kernelized_fit_and_plot(PolynomialKernel(; degree=degree, c=1)) for degree in 1:4]...) 
+# - + +# However, we can now also use kernels that would have an infinite-dimensional feature expansion, such as the squared exponential kernel: + +kernelized_fit_and_plot(SqExponentialKernel()) From 542f93230a4cd850d00515f4ec2f631c44b036e3 Mon Sep 17 00:00:00 2001 From: ST John Date: Mon, 18 Jan 2021 19:06:16 +0000 Subject: [PATCH 16/19] update make.jl for example notebooks --- docs/make.jl | 6 ++++-- examples/automaticstatistician.jl | 1 - 2 files changed, 4 insertions(+), 3 deletions(-) delete mode 100644 examples/automaticstatistician.jl diff --git a/docs/make.jl b/docs/make.jl index 0908576b6..6801a0a14 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -32,9 +32,11 @@ makedocs(; "Home" => "index.md", "User Guide" => "userguide.md", "Examples" => [ + "Kernel Ridge Regression" => "examples/kernel_ridge_regression.md", + "Training kernel parameters" => "examples/train_kernel_parameters.md", + "Gaussian process priors" => "examples/gaussianprocesspriors.md", "SVM" => "examples/svm.md", - # "Kernel Ridge Regression" => "examples/kernelridgeregression.md", - # "Deep Kernel Learning" => "examples/deepkernellearning.md", + "Deep Kernel Learning" => "examples/deepkernellearning.md", ], "Kernel Functions" => "kernels.md", "Input Transforms" => "transform.md", diff --git a/examples/automaticstatistician.jl b/examples/automaticstatistician.jl deleted file mode 100644 index 8b1378917..000000000 --- a/examples/automaticstatistician.jl +++ /dev/null @@ -1 +0,0 @@ - From 2d84c74da733ab74665c141b0698da01a08cd1f2 Mon Sep 17 00:00:00 2001 From: ST John Date: Mon, 18 Jan 2021 22:30:30 +0000 Subject: [PATCH 17/19] update format --- examples/gaussianprocesspriors.jl | 11 ++++++++++- examples/kernel_ridge_regression.jl | 18 ++---------------- 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/examples/gaussianprocesspriors.jl b/examples/gaussianprocesspriors.jl index ba7e647c6..cd34c4724 100644 --- a/examples/gaussianprocesspriors.jl +++ b/examples/gaussianprocesspriors.jl @@ -18,7 +18,7 @@ # # In this notebook we show samples from zero-mean GPs with different kernels. -# Load required packages +## Load required packages using KernelFunctions using LinearAlgebra using Distributions @@ -27,6 +27,10 @@ default(; lw=1.0, legendfontsize=15.0) using Random: seed! seed!(42); +# We now define a function that visualizes a kernel +# for us. We use the same randomness to obtain +# comparable samples. + num_inputs = 101 xlim = (-5, 5) X = reshape(collect(range(xlim...; length=num_inputs)), :, 1) @@ -72,8 +76,13 @@ function visualize(k::Kernel; xref=0.0) return plot(p_kernel_2d, p_kernel_cut, p_samples; layout=(1, 3), xlabel=raw"$x$") end +# We can now visualize a kernel and show samples from +# a Gaussian process with this kernel: + visualize(SqExponentialKernel()) +# This also allows us to compare different kernels: + kernel_classes = [Matern12Kernel, Matern32Kernel, Matern52Kernel, SqExponentialKernel] plot( [visualize(k()) for k in kernel_classes]..., diff --git a/examples/kernel_ridge_regression.jl b/examples/kernel_ridge_regression.jl index 96bdfaf3d..f242a0d4a 100644 --- a/examples/kernel_ridge_regression.jl +++ b/examples/kernel_ridge_regression.jl @@ -3,28 +3,21 @@ # # Building on linear regression, we can fit non-linear data sets by introducing a feature space. In a higher-dimensional feature space, we can overfit the data; ridge regression introduces regularization to avoid this. In this notebook we show how we can use KernelFunctions.jl for *kernel* ridge regression. 
-# + -# Loading and setup of required packages +## Loading and setup of required packages using KernelFunctions using LinearAlgebra using Distributions -# Plotting +## Plotting using Plots; default(; lw=2.0, legendfontsize=15.0); -# Gradient-based optimization -using Flux: Optimise -using ForwardDiff - using Random: seed! seed!(42); -# - # ## From linear regression to ridge regression # Here we use a one-dimensional toy problem. We generate data using the fourth-order polynomial $f(x) = (x+4)(x+1)(x-1)(x-3)$: -# + f_truth(x) = (x + 4) * (x + 1) * (x - 1) * (x - 3) x_train = collect(-5:0.5:5) @@ -36,7 +29,6 @@ y_test = f_truth.(x_test) plot(x_test, y_test; label=raw"$f(x)$") scatter!(x_train, y_train; label="observations") -# - # For training inputs $\mathrm{X}=(\mathbf{x}_n)_{n=1}^N$ and observations $\mathbf{y}=(y_n)_{n=1}^N$, the linear regression weights $\mathbf{w}$ using the least-squares estimator are given by # $$ @@ -66,7 +58,6 @@ function featurize_poly(x; degree=1) return hcat(xcols...) end -# + function featurized_fit_and_plot(degree) X = featurize_poly(x_train; degree=degree) Xstar = featurize_poly(x_test; degree=degree) @@ -77,7 +68,6 @@ function featurized_fit_and_plot(degree) end plot([featurized_fit_and_plot(degree) for degree in 1:4]...) -# - # Note that the fit becomes perfect when we include exactly as many orders in the features as we have in the underlying polynomial (4). # @@ -107,7 +97,6 @@ function ridge_regression( return Xstar * weights end -# + function regularized_fit_and_plot(degree, lambda) X = featurize_poly(x_train; degree=degree) Xstar = featurize_poly(x_test; degree=degree) @@ -118,7 +107,6 @@ function regularized_fit_and_plot(degree, lambda) end plot([regularized_fit_and_plot(18, lambda) for lambda in [1e-4, 1e-2, 0.1, 10]]...) -# - # Instead of constructing the feature matrix explicitly, we can use *kernels* to replace inner products of feature vectors with a kernel evaluation: $\langle \phi(x), \phi(x') \rangle = k(x, x')$ or $\mathrm{X} \mathrm{X}^\top = \mathrm{K}$, where $\mathrm{K}_{ij} = k(x_i, x_j)$. # @@ -157,7 +145,6 @@ end # Now, instead of explicitly constructing features, we can simply pass in a `PolynomialKernel` object: -# + function kernelized_fit_and_plot(kernel, lambda=1e-4) y_pred = kernel_ridge_regression(kernel, x_train, y_train, x_test, lambda) if kernel isa PolynomialKernel @@ -177,7 +164,6 @@ function kernelized_fit_and_plot(kernel, lambda=1e-4) end plot([kernelized_fit_and_plot(PolynomialKernel(; degree=degree, c=1)) for degree in 1:4]...) 
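# --- Editor's note (not part of the patch): why a `PolynomialKernel` can stand in for
# the explicit `featurize_poly` features used earlier. For degree 2 and offset c = 1 in
# the scalar case, (x*y + 1)^2 is the inner product of the weighted monomial features
# ϕ(x) = [1, √2·x, x²], so kernel ridge regression with this kernel is ridge regression
# in that (implicitly weighted) feature space; the fits are therefore close to, but not
# identical with, the unweighted explicit construction. `ϕ`, `x1`, `x2` are
# illustrative names. A quick check:
using LinearAlgebra
ϕ(x) = [1.0, sqrt(2) * x, x^2]
x1, x2 = 0.7, -1.3
(x1 * x2 + 1)^2 ≈ dot(ϕ(x1), ϕ(x2))  # true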
-# - # However, we can now also use kernels that would have an infinite-dimensional feature expansion, such as the squared exponential kernel: From 9e4e144ad97c169b6c1b8b292ed320cc145f4590 Mon Sep 17 00:00:00 2001 From: st-- Date: Wed, 30 Jun 2021 16:06:11 +0300 Subject: [PATCH 18/19] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: David Widmann Co-authored-by: Théo Galy-Fajou --- examples/gaussianprocesspriors.jl | 14 ++++----- examples/kernel_ridge_regression.jl | 47 +++++++++++------------------ examples/svm.jl | 1 - examples/train_kernel_parameters.jl | 13 +++----- 4 files changed, 30 insertions(+), 45 deletions(-) diff --git a/examples/gaussianprocesspriors.jl b/examples/gaussianprocesspriors.jl index cd34c4724..38da13a9a 100644 --- a/examples/gaussianprocesspriors.jl +++ b/examples/gaussianprocesspriors.jl @@ -4,16 +4,16 @@ # The kernels defined in this package can also be used to specify the # covariance of a Gaussian process prior. # A Gaussian process (GP) is defined by its mean function $m(\cdot)$ and its covariance function or kernel $k(\cdot, \cdot')$: -# $$ -# f \sim \mathcal{GP}(m(\cdot), k(\cdot, \cdot') -# $$ +# ```math +# f \sim \mathcal{GP}\big(m(\cdot), k(\cdot, \cdot')\big) +# ``` # The function values of the GP at a finite number of points $X = \{x_n\}_{n=1}^N$ follow a multivariate normal distribution with mean vector $\mathbf{m}$ and covariance matrix $\mathrm{K}$, where -# $$ -# \begin{align} +# ```math +# \begin{aligned} # \mathbf{m}_i &= m(x_i) \\ # \mathrm{K}_{i,j} &= k(x_i, x_j) -# \end{align} -# $$ +# \end{aligned} +# ``` # where $1 \le i, j \le N$. # # In this notebook we show samples from zero-mean GPs with different kernels. diff --git a/examples/kernel_ridge_regression.jl b/examples/kernel_ridge_regression.jl index f242a0d4a..a76905462 100644 --- a/examples/kernel_ridge_regression.jl +++ b/examples/kernel_ridge_regression.jl @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Kernel Ridge Regression # # Building on linear regression, we can fit non-linear data sets by introducing a feature space. In a higher-dimensional feature space, we can overfit the data; ridge regression introduces regularization to avoid this. In this notebook we show how we can use KernelFunctions.jl for *kernel* ridge regression. 
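# --- Editor's sketch (not part of the patch): the remark above, that the squared
# exponential kernel corresponds to an infinite-dimensional feature expansion, can be
# made concrete in the scalar, unit-lengthscale case via
# exp(-(x - y)^2 / 2) = exp(-x^2 / 2) * exp(-y^2 / 2) * sum_j (x*y)^j / j!,
# i.e. an inner product of infinitely many weighted monomials. Truncating the series
# already reproduces the kernel to machine precision for moderate inputs (`k_se` and
# `k_truncated` are illustrative names; the package's exact scaling convention may differ).
k_se(x, y) = exp(-(x - y)^2 / 2)
k_truncated(x, y; nterms=20) =
    exp(-x^2 / 2) * exp(-y^2 / 2) * sum((x * y)^j / factorial(j) for j in 0:nterms)
k_se(0.3, -0.8) ≈ k_truncated(0.3, -0.8)  # true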
@@ -31,17 +30,17 @@ plot(x_test, y_test; label=raw"$f(x)$") scatter!(x_train, y_train; label="observations") # For training inputs $\mathrm{X}=(\mathbf{x}_n)_{n=1}^N$ and observations $\mathbf{y}=(y_n)_{n=1}^N$, the linear regression weights $\mathbf{w}$ using the least-squares estimator are given by -# $$ +# ```math # \mathbf{w} = (\mathrm{X}^\top \mathrm{X})^{-1} \mathrm{X}^\top \mathbf{y} -# $$ +# ``` # We predict at test inputs $\mathbf{x}_*$ using -# $$ +# ```math # \hat{y}_* = \mathbf{x}_*^\top \mathbf{w} -# $$ +# ``` # This is implemented by `linear_regression`: -function linear_regression(X::AbstractVecOrMat, y::AbstractVector, Xstar::AbstractVecOrMat) - weights = inv(X' * X) * X' * y +function linear_regression(X, y, Xstar) + weights = (X' * X) \ (X' * y) return Xstar * weights end @@ -62,9 +61,8 @@ function featurized_fit_and_plot(degree) X = featurize_poly(x_train; degree=degree) Xstar = featurize_poly(x_test; degree=degree) y_pred = linear_regression(X, y_train, Xstar) - scatter(x_train, y_train; label=nothing) - p = plot!(x_test, y_pred; label=nothing, title="fit of order $degree") - return p + scatter(x_train, y_train; legend=false, title="fit of order $degree") + return plot!(x_test, y_pred) end plot([featurized_fit_and_plot(degree) for degree in 1:4]...) @@ -85,15 +83,13 @@ featurized_fit_and_plot(18) # \mathbf{w} = (\mathrm{X}^\top \mathrm{X} + \lambda \mathbb{1})^{-1} \mathrm{X}^\top \mathbf{y} # $$ # As before, we predict at test inputs $\mathbf{x}_*$ using -# $$ +# ```math # \hat{y}_* = \mathbf{x}_*^\top \mathbf{w} -# $$ +# ``` # This is implemented by `ridge_regression`: -function ridge_regression( - X::AbstractVecOrMat, y::AbstractVector, Xstar::AbstractVecOrMat, lambda::Float64 -) - weights = inv(X' * X + lambda * I) * X' * y +function ridge_regression(X, y, Xstar, lambda) + weights = (X' * X + lambda * I) \ (X' * y) return Xstar * weights end @@ -101,9 +97,8 @@ function regularized_fit_and_plot(degree, lambda) X = featurize_poly(x_train; degree=degree) Xstar = featurize_poly(x_test; degree=degree) y_pred = ridge_regression(X, y_train, Xstar, lambda) - scatter(x_train, y_train; label=nothing) - p = plot!(x_test, y_pred; label=nothing, title=string(raw"$\lambda=", lambda, raw"$")) - return p + scatter(x_train, y_train; legend=false, title="\$\\lambda=$lambda\$") + return plot!(x_test, y_pred) end plot([regularized_fit_and_plot(18, lambda) for lambda in [1e-4, 1e-2, 0.1, 10]]...) @@ -124,23 +119,17 @@ plot([regularized_fit_and_plot(18, lambda) for lambda in [1e-4, 1e-2, 0.1, 10]]. # \mathbf{w} = \mathrm{X}^\top (\mathrm{K} + \lambda \mathbb{1})^{-1} \mathbf{y} # $$ # And the prediction yields another inner product, -# $$ +# ```math # \hat{y}_* = \mathbf{x}_*^\top \mathbf{w} = \langle \mathbf{x}_*, \mathbf{w} \rangle = \mathbf{k}_* (\mathrm{K} + \lambda \mathbb{1})^{-1} \mathbf{y} -# $$ +# ``` # where $(\mathbf{k}_*)_n = k(x_*, x_n)$. 
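# --- Editor's note (not part of the patch): in the prediction formula above, the solve
# against the training data does not depend on the test inputs, so in practice one
# would factorize K + λI once and reuse the result for every prediction. A minimal
# sketch under that design choice (`kernel_ridge_fit`, `kernel_ridge_predict` and `α`
# are illustrative names, not part of the example):
using KernelFunctions, LinearAlgebra
function kernel_ridge_fit(k, x, y, lambda)
    K = kernelmatrix(k, x)
    return cholesky(Symmetric(K + lambda * I)) \ y  # α = (K + λI)⁻¹ y, computed once
end
kernel_ridge_predict(k, x, xstar, α) = kernelmatrix(k, xstar, x) * α
# e.g.: α = kernel_ridge_fit(SqExponentialKernel(), x_train, y_train, 1e-4)
#       ŷ = kernel_ridge_predict(SqExponentialKernel(), x_train, x_test, α)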
# # This is implemented by `kernel_ridge_regression`: -function kernel_ridge_regression( - k::Kernel, - X::AbstractVecOrMat, - y::AbstractVector, - Xstar::AbstractVecOrMat, - lambda::Float64, -) +function kernel_ridge_regression(k, X, y, Xstar, lambda) K = kernelmatrix(k, X) kstar = kernelmatrix(k, Xstar, X) - return kstar * inv(K + lambda * I) * y + return kstar * ((K + lambda * I) \ y) end # Now, instead of explicitly constructing features, we can simply pass in a `PolynomialKernel` object: diff --git a/examples/svm.jl b/examples/svm.jl index fa9be7b15..aecd218c9 100644 --- a/examples/svm.jl +++ b/examples/svm.jl @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Support Vector Machines # TODO: introduction diff --git a/examples/train_kernel_parameters.jl b/examples/train_kernel_parameters.jl index bfa968ee5..c3cc5d1dd 100644 --- a/examples/train_kernel_parameters.jl +++ b/examples/train_kernel_parameters.jl @@ -1,4 +1,3 @@ -# -*- coding: utf-8 -*- # # Kernel Ridge Regression # ## We load KernelFunctions and some other packages @@ -44,8 +43,7 @@ kernelcall(θ) = transform( function f(x, x_train, y_train, θ) k = kernelcall(θ[1:3]) return kernelmatrix(k, x, x_train) * - inv(kernelmatrix(k, x_train) + exp(θ[4]) * I) * - y_train + ((kernelmatrix(k, x_train) + exp(θ[4]) * I) \ y_train) end # We look how the prediction looks like @@ -70,14 +68,13 @@ loss(log.(ones(4))) # ## Training the model -θ = vcat(log.([1.0, 0.0, 0.01]), log(0.001)) # Initial vector +θ = log.([1.0, 0.0, 0.01, 0.001]) # Initial vector anim = Animation() opt = Optimise.ADAGrad(0.5) for i in 1:30 - grads = ForwardDiff.gradient(x -> loss(x), θ) # We compute the gradients given the kernel parameters and regularization - Δ = Optimise.apply!(opt, θ, grads) - θ .-= Δ # We apply a simple Gradient descent algorithm - p = scatter( + grads = ForwardDiff.gradient(loss, θ) # We compute the gradients given the kernel parameters and regularization + Optimise.update!(opt, θ, grads) + scatter( x_train, y_train; lab="data", title="i = $(i), Loss = $(round(loss(θ), digits = 4))" ) plot!(x_test, sinc; lab="true function") From 3195326ccf29b230e615d8d872145e64302554bd Mon Sep 17 00:00:00 2001 From: st-- Date: Wed, 30 Jun 2021 17:03:08 +0300 Subject: [PATCH 19/19] Update docs/make.jl --- docs/make.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/make.jl b/docs/make.jl index dbd74d723..ab0da26c1 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -27,7 +27,6 @@ isempty(processes) || success(processes) || error("some examples were not run su # Build documentation using KernelFunctions using Documenter -using Literate # Print `@debug` statements (https://github.com/JuliaDocs/Documenter.jl/issues/955) if haskey(ENV, "GITHUB_ACTIONS")