Machine Learning and Pattern Recognition Minimal GP Demo
import numpy as np
import matplotlib.pyplot as plt
# We create kernel functions that take NxD and MxD design matrices, to give NxM
# kernel values. rbf_fn is a fixed unit bandwidth RBF kernel, and
# gauss_kernel_fn rescales that using free parameters ell (1xD or 1x1) and
# sigma_f (1x1).
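# The kernel definitions themselves aren't included in this extract. The code
# below is a minimal sketch consistent with the comment above; the grid spacing
# and the particular ell and sigma_f values passed to k_fn are illustrative
# assumptions, not necessarily the original settings.
def rbf_fn(X1, X2):
    # Unit-bandwidth RBF kernel: exp(-||x1 - x2||^2) for every pair of rows.
    sq_dists = np.sum(X1**2, 1)[:, None] + np.sum(X2**2, 1)[None, :] \
            - 2*np.dot(X1, X2.T)
    return np.exp(-sq_dists)

def gauss_kernel_fn(X1, X2, ell, sigma_f):
    # Gaussian kernel with lengthscale(s) ell (1x1 or 1xD) and scale sigma_f.
    return sigma_f**2 * rbf_fn(X1/(np.sqrt(2)*ell), X2/(np.sqrt(2)*ell))

k_fn = lambda X1, X2: gauss_kernel_fn(X1, X2, 3.0, 10.0)

# Input locations at which we will consider the function (a dense 1D grid):
X_grid = np.arange(0, 10, 0.02)[:, None]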
# Compute covariance of function values for those points. The Gaussian kernel
# gives positive definite matrices in theory. In practice, we need to add a tiny
# amount of noise to the process (a "jitter" or "nugget" term) if we need a
# matrix that is positive definite given finite numerical precision.
N_grid = X_grid.shape[0]
K_grid = k_fn(X_grid, X_grid) + 1e-9*np.eye(N_grid)
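# K_grid isn't used again in this extract. Presumably the full demo used it to
# draw and plot one sample of the whole function over the grid, along these
# lines (the use of figure 1 here is an assumption):
L_grid = np.linalg.cholesky(K_grid)
f_grid = np.dot(L_grid, np.random.randn(N_grid))
plt.figure(1)
plt.clf()
plt.plot(X_grid, f_grid, '-')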
# Pick four of the original grid points, just sample a setting of those and plot
# as crosses:
idx = np.round(N_grid*np.array([0.2,0.4,0.6,0.8])).astype(int)
X_locs1 = X_grid[idx, :]
N_locs1 = idx.size
K_locs1 = k_fn(X_locs1, X_locs1)
L_locs1 = np.linalg.cholesky(K_locs1)
plt.figure(2)
plt.clf()
noise_var = 0.0 # add no noise to the samples, we want to look at function values
#noise_var = 1.0 # add some noise to the samples, we want to simulate data
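# Why the next line works: if nu ~ N(0, I) then np.dot(L_locs1, nu) has
# covariance L_locs1 L_locs1^T = K_locs1, so f_locs1 is a draw from the GP at
# X_locs1, plus (optionally) independent observation noise.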
f_locs1 = np.dot(L_locs1, np.random.randn(N_locs1)) + \
        np.sqrt(noise_var)*np.random.randn(N_locs1)
plt.plot(X_locs1, f_locs1, 'x', markersize=20, markeredgewidth=2)
# We could later choose to sample any of the other function values, conditioned
# on the ones we already picked. We compute all the relevant covariances, and
# apply standard formulae to get the mean and covariance of the Gaussian over
# the remaining function values, conditioned on the ones we've already picked.
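# For reference, the standard Gaussian conditioning formulae applied below,
# writing X1 = X_locs1, X* = X_rest, and M = K(X1,X1) + noise_var*I:
#     mean = K(X*,X1) M^{-1} f
#     cov  = K(X*,X*) - K(X*,X1) M^{-1} K(X1,X*)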
X_rest = np.delete(X_grid, idx, 0)
K_rest = k_fn(X_rest, X_rest)
K_rest_locs1 = k_fn(X_rest, X_locs1)
M = K_locs1 + noise_var*np.eye(N_locs1)
# We could compute cholesky(M) once and use it to solve both the linear systems
# in the next two lines (a sketch of this reuse follows after them).
rest_cond_mu = np.dot(K_rest_locs1, np.linalg.solve(M, f_locs1))
rest_cond_cov = K_rest - np.dot(K_rest_locs1, np.linalg.solve(M, K_rest_locs1.T))
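# A sketch of that reuse, using scipy.linalg (which this listing doesn't
# otherwise import). It recomputes the same two quantities from a single
# Cholesky factorization of M:
from scipy.linalg import cho_factor, cho_solve
C = cho_factor(M)
rest_cond_mu = np.dot(K_rest_locs1, cho_solve(C, f_locs1))
rest_cond_cov = K_rest - np.dot(K_rest_locs1, cho_solve(C, K_rest_locs1.T))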
# If you change noise_var from 0.0 to 1.0 above, you can simulate first looking
# at noisy values of the function. The conditional distribution is then the
# posterior over some function values given some noisy observations.
# If you'd rather plot the mean prediction and error bars than samples, you can
# do that too. The thick black solid line shows the mean, and dashed thick
# black lines show +/- 2 standard deviations -- at any particular location, we
# have ~95% belief the function will lie in this range.
plt.plot(X_rest, rest_cond_mu, '-k', linewidth=2)
rest_cond_std = np.sqrt(np.diag(rest_cond_cov))
plt.plot(X_rest, rest_cond_mu + 2*rest_cond_std, '--k', linewidth=2)
plt.plot(X_rest, rest_cond_mu - 2*rest_cond_std, '--k', linewidth=2)
# The mean and error bars can be computed more cheaply if you don't want to know
# the whole posterior. See the notes; a rough sketch of the idea follows below.
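# Rough sketch of the cheaper route (an assumption about what the notes cover):
# for error bars we only need the marginal variances, so the full
# N_rest x N_rest posterior covariance never has to be formed.
k_rest_diag = np.array([k_fn(x[None, :], x[None, :])[0, 0] for x in X_rest])
tmp = np.linalg.solve(M, K_rest_locs1.T)       # M^{-1} K(X1, X*)
cheap_var = k_rest_diag - np.sum(K_rest_locs1 * tmp.T, axis=1)
cheap_std = np.sqrt(np.maximum(cheap_var, 0))  # clip tiny negative round-off
# cheap_std matches rest_cond_std above, up to numerical error.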
# Final remarks:
# --------------
# - None of the points have to be on a grid.
# - The initial training points X_locs1 could be anywhere.
# - The remaining test points X_rest could be anywhere.
# - If the points aren't sampled densely and in order, we wouldn't
# join them with straight lines. We'd just plot individual points.
# - The inputs can be in D-dimensions. The kernel function is the
# only part of the code that looks at the original input features.
# None of the rest of the code cares where the scalar function
# values are located in input space.