# # Kernel Ridge Regression
#
# Building on linear regression, we can fit non-linear data sets by introducing a feature space. In a higher-dimensional feature space, we can overfit the data; ridge regression introduces regularization to avoid this. In this notebook we show how we can use KernelFunctions.jl for *kernel* ridge regression.

## Loading and setup of required packages
using KernelFunctions
using LinearAlgebra
using Distributions

## Plotting
using Plots;
default(; lw=2.0, legendfontsize=15.0);

using Random: seed!
seed!(42);

# ## From linear regression to ridge regression
# Here we use a one-dimensional toy problem. We generate data using the fourth-order polynomial $f(x) = (x+4)(x+1)(x-1)(x-3)$:

f_truth(x) = (x + 4) * (x + 1) * (x - 1) * (x - 3)

x_train = collect(-5:0.5:5)
x_test = collect(-5:0.1:5)

noise = rand(Uniform(-10, 10), size(x_train))
y_train = f_truth.(x_train) + noise
y_test = f_truth.(x_test)

plot(x_test, y_test; label=raw"$f(x)$")
scatter!(x_train, y_train; label="observations")

# For training inputs $\mathrm{X}=(\mathbf{x}_n)_{n=1}^N$ and observations $\mathbf{y}=(y_n)_{n=1}^N$, the linear regression weights $\mathbf{w}$ using the least-squares estimator are given by
# ```math
# \mathbf{w} = (\mathrm{X}^\top \mathrm{X})^{-1} \mathrm{X}^\top \mathbf{y}
# ```
# We predict at test inputs $\mathbf{x}_*$ using
# ```math
# \hat{y}_* = \mathbf{x}_*^\top \mathbf{w}
# ```
# This is implemented by `linear_regression`:

function linear_regression(X, y, Xstar)
    weights = (X' * X) \ (X' * y)
    return Xstar * weights
end

# A linear regression fit to the above data set:

y_pred = linear_regression(x_train, y_train, x_test)
scatter(x_train, y_train; label="observations")
plot!(x_test, y_pred; label="linear fit")

# We can improve the fit by including additional features, i.e. generalizing to $\mathrm{X} = (\phi(x_n))_{n=1}^N$, where $\phi(x)$ constructs a feature vector for each input $x$. Here we include powers of the input, $\phi(x) = (1, x, x^2, \dots, x^d)$:

function featurize_poly(x; degree=1)
    xcols = [x .^ d for d in 0:degree]
    return hcat(xcols...)
end
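
# For example, for the inputs $1, 2, 3$ and degree 2, each row of the feature matrix contains $(x^0, x^1, x^2)$ for one input:

featurize_poly([1.0, 2.0, 3.0]; degree=2)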

function featurized_fit_and_plot(degree)
    X = featurize_poly(x_train; degree=degree)
    Xstar = featurize_poly(x_test; degree=degree)
    y_pred = linear_regression(X, y_train, Xstar)
    scatter(x_train, y_train; legend=false, title="fit of order $degree")
    return plot!(x_test, y_pred)
end

plot([featurized_fit_and_plot(degree) for degree in 1:4]...)

# Note that the fit becomes perfect when we include exactly as many orders in the features as we have in the underlying polynomial (4).
#
# However, when increasing the number of features, we can quickly overfit to noise in the data set:

featurized_fit_and_plot(18)
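
# We can also quantify the overfitting by comparing the root-mean-square error of the fit on the (noisy) training observations with its error against the noise-free function on the test inputs; this is only a small illustrative sketch, and `rmse` is a helper defined just for this comparison:

rmse(a, b) = sqrt(sum(abs2, a - b) / length(a))

map((4, 18)) do degree
    X = featurize_poly(x_train; degree=degree)
    Xstar = featurize_poly(x_test; degree=degree)
    train_rmse = rmse(linear_regression(X, y_train, X), y_train)    # error on the training points
    test_rmse = rmse(linear_regression(X, y_train, Xstar), y_test)  # error against the noise-free f
    (degree=degree, train_rmse=train_rmse, test_rmse=test_rmse)
end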

# To counteract this unwanted behaviour, we can introduce regularization. This leads to *ridge regression* with $L_2$ regularization of the weights ([Tikhonov regularization](https://en.wikipedia.org/wiki/Tikhonov_regularization)).
# Instead of the weights in linear regression,
# ```math
# \mathbf{w} = (\mathrm{X}^\top \mathrm{X})^{-1} \mathrm{X}^\top \mathbf{y}
# ```
# we introduce the ridge parameter $\lambda$:
# ```math
# \mathbf{w} = (\mathrm{X}^\top \mathrm{X} + \lambda \mathbb{1})^{-1} \mathrm{X}^\top \mathbf{y}
# ```
# As before, we predict at test inputs $\mathbf{x}_*$ using
# ```math
# \hat{y}_* = \mathbf{x}_*^\top \mathbf{w}
# ```
# This is implemented by `ridge_regression`:

function ridge_regression(X, y, Xstar, lambda)
    weights = (X' * X + lambda * I) \ (X' * y)
    return Xstar * weights
end
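
# The penalty shrinks the weights: for the same features, the ridge weights have a smaller norm than the unregularized least-squares weights. A small illustrative sketch, using the degree-4 features and an arbitrarily chosen $\lambda = 10$:

let X = featurize_poly(x_train; degree=4), lambda = 10.0
    w_ls = (X' * X) \ (X' * y_train)                  # least-squares weights
    w_ridge = (X' * X + lambda * I) \ (X' * y_train)  # ridge weights
    norm(w_ls), norm(w_ridge)
end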

function regularized_fit_and_plot(degree, lambda)
    X = featurize_poly(x_train; degree=degree)
    Xstar = featurize_poly(x_test; degree=degree)
    y_pred = ridge_regression(X, y_train, Xstar, lambda)
    scatter(x_train, y_train; legend=false, title="\$\\lambda=$lambda\$")
    return plot!(x_test, y_pred)
end

plot([regularized_fit_and_plot(18, lambda) for lambda in [1e-4, 1e-2, 0.1, 10]]...)

# Instead of constructing the feature matrix explicitly, we can use *kernels* to replace inner products of feature vectors with a kernel evaluation: $\langle \phi(x), \phi(x') \rangle = k(x, x')$ or $\mathrm{X} \mathrm{X}^\top = \mathrm{K}$, where $\mathrm{K}_{ij} = k(x_i, x_j)$.
#
# To apply this "kernel trick" to ridge regression, we can rewrite the ridge estimate for the weights
# ```math
# \mathbf{w} = (\mathrm{X}^\top \mathrm{X} + \lambda \mathbb{1})^{-1} \mathrm{X}^\top \mathbf{y}
# ```
# using the [matrix inversion lemma](https://tlienart.github.io/pub/csml/mtheory/matinvlem.html#basic_lemmas)
# as
# ```math
# \mathbf{w} = \mathrm{X}^\top (\mathrm{X} \mathrm{X}^\top + \lambda \mathbb{1})^{-1} \mathbf{y}
# ```
# where we can now replace the inner product with the kernel matrix,
# ```math
# \mathbf{w} = \mathrm{X}^\top (\mathrm{K} + \lambda \mathbb{1})^{-1} \mathbf{y}
# ```
# The prediction then corresponds to another inner product,
# ```math
# \hat{y}_* = \mathbf{x}_*^\top \mathbf{w} = \langle \mathbf{x}_*, \mathbf{w} \rangle = \mathbf{k}_* (\mathrm{K} + \lambda \mathbb{1})^{-1} \mathbf{y}
# ```
# where $(\mathbf{k}_*)_n = k(x_*, x_n)$.
#
# This is implemented by `kernel_ridge_regression`:

function kernel_ridge_regression(k, X, y, Xstar, lambda)
    K = kernelmatrix(k, X)             # kernel matrix of the training inputs
    kstar = kernelmatrix(k, Xstar, X)  # kernel evaluations between test and training inputs
    return kstar * ((K + lambda * I) \ y)
end
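
# As a quick sanity check of the kernel trick, kernel ridge regression with a `LinearKernel` (the plain inner product $k(x, x') = x^\top x'$) on the explicit polynomial features should reproduce ridge regression on those same features. This is only a small sketch; `RowVecs` wraps the feature matrices so that their rows are treated as the individual inputs:

let lambda = 1e-3, degree = 3
    X = featurize_poly(x_train; degree=degree)
    Xstar = featurize_poly(x_test; degree=degree)
    explicit = ridge_regression(X, y_train, Xstar, lambda)
    kernelized = kernel_ridge_regression(
        LinearKernel(), RowVecs(X), y_train, RowVecs(Xstar), lambda
    )
    maximum(abs, explicit - kernelized)  # zero up to numerical round-off
end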

# Now, instead of explicitly constructing features, we can simply pass in a `PolynomialKernel` object:

function kernelized_fit_and_plot(kernel, lambda=1e-4)
    y_pred = kernel_ridge_regression(kernel, x_train, y_train, x_test, lambda)
    if kernel isa PolynomialKernel
        title = string("order ", kernel.degree)
    else
        title = string(kernel)
    end
    scatter(x_train, y_train; label=nothing)
    p = plot!(
        x_test,
        y_pred;
        label=nothing,
        title=title,
        #title=string(raw"$\lambda=", lambda, raw"$")
    )
    return p
end

plot([kernelized_fit_and_plot(PolynomialKernel(; degree=degree, c=1)) for degree in 1:4]...)

# However, we can now also use kernels that would have an infinite-dimensional feature expansion, such as the squared exponential kernel:

kernelized_fit_and_plot(SqExponentialKernel())