Define constants. Adjust these values (for example, the data path) to match your own setup.


In [1]:
# Relative path to the gzipped CSV that is passed to splitter.load in this notebook.
# NOTE(review): presumably resolved against the kernel's working directory — adjust for your checkout.
DATA_PATH = '../../data/sdss_dr7_photometry_source.csv.gz'

In [2]:
import sys

import matplotlib.pyplot as plt
import numpy as np
import sklearn.gaussian_process

sys.path.insert(1, '..')
import splitter

%matplotlib inline

In [9]:
# Load the dataset from DATA_PATH, then split it into (features, targets) pairs
# for training and testing.
# NOTE(review): 10000 and 100 are presumably the train and test set sizes —
# confirm against splitter.split.
data = splitter.load(DATA_PATH)
(train_X, train_y), (test_X, test_y) = splitter.split(data, 10000, 100)

In [10]:
def preprocess_sgd(train_X, train_y, test_X, n_landmarks=2000):
    """Map train and test features into a Matern-kernel feature space.

    The first ``n_landmarks`` rows of ``train_X`` serve as reference points.
    Each sample is then represented by its Matern kernel values (default
    kernel hyper-parameters) against those reference points.

    Parameters
    ----------
    train_X : array-like
        Training features, one sample per row.
    train_y : array-like
        Training targets. Not used by this function; the parameter is kept
        so that existing three-argument calls continue to work.
    test_X : array-like
        Test features, with the same columns as ``train_X``.
    n_landmarks : int, optional
        Number of leading rows of ``train_X`` to use as reference points.
        Defaults to 2000, the value that used to be hard-coded. If
        ``train_X`` has fewer rows, all of them are used.

    Returns
    -------
    sgd_train_X, sgd_test_X
        Kernel evaluations of ``train_X`` and ``test_X`` against the
        reference rows; one row per input sample and one column per
        reference point.

    Raises
    ------
    ValueError
        If ``n_landmarks`` is smaller than 1. A negative value would
        otherwise silently slice from the end of ``train_X``.
    """
    if n_landmarks < 1:
        raise ValueError(
            "n_landmarks must be a positive integer, got %r" % (n_landmarks,))

    kernel = sklearn.gaussian_process.kernels.Matern()
    subset = train_X[:n_landmarks]
    sgd_train_X = kernel(train_X, subset)
    sgd_test_X = kernel(test_X, subset)
    return sgd_train_X, sgd_test_X

In [11]:
# Build the kernel features for the train and test sets.
# train_y is passed to match the signature; preprocess_sgd does not read it.
sgd_train_X, sgd_test_X = preprocess_sgd(train_X, train_y, test_X)

In [19]:
# Gram matrix of the test-set kernel features (features times their transpose),
# shown as a heat map with a colour scale.
approximate_kernel = sgd_test_X @ sgd_test_X.T
plt.matshow(approximate_kernel)
plt.colorbar()


Out[19]:
<matplotlib.colorbar.Colorbar at 0x11bdf20f0>

In [18]:
# Exact Matern kernel evaluated between all pairs of test points, followed by
# its matrix square, shown as a heat map with a colour scale.
actual_kernel = sklearn.gaussian_process.kernels.Matern()(test_X, test_X)
Ksquared = np.matmul(actual_kernel, actual_kernel)
plt.matshow(Ksquared)
plt.colorbar()


Out[18]:
<matplotlib.colorbar.Colorbar at 0x112bbe438>

In [17]:
# Norm of the difference between the squared exact kernel and the approximation.
# With default arguments np.linalg.norm gives the Frobenius norm for 2-D input.
np.linalg.norm(Ksquared - approximate_kernel)


Out[17]:
5932.9763453908045

In [23]:
# Eigenvalue spectrum of the exact kernel matrix.
# The matrix is a kernel evaluated between a point set and itself, so it is
# symmetric. np.linalg.eigvalsh is the solver for that case: it returns real
# eigenvalues and does not compute the eigenvectors that np.linalg.eig(...)[0]
# was discarding. np.linalg.eig may also return complex values in no
# guaranteed order, which makes the plotted curve hard to read.
# eigvalsh returns ascending order; reverse it so the plot runs from the
# largest eigenvalue to the smallest.
actual_eig = np.linalg.eigvalsh(actual_kernel)[::-1]
plt.plot(range(len(actual_eig)), actual_eig)


Out[23]:
[<matplotlib.lines.Line2D at 0x10a13de48>]

In [24]:
# Eigenvalue spectrum of the approximate kernel matrix.
# approximate_kernel is a feature matrix times its own transpose, so it is
# symmetric. np.linalg.eigvalsh is the solver for that case: it returns real
# eigenvalues and does not compute the eigenvectors that np.linalg.eig(...)[0]
# was discarding. np.linalg.eig may also return complex values in no
# guaranteed order, which makes the plotted curve hard to read.
# eigvalsh returns ascending order; reverse it so the plot runs from the
# largest eigenvalue to the smallest.
approximate_eig = np.linalg.eigvalsh(approximate_kernel)[::-1]
plt.plot(range(len(approximate_eig)), approximate_eig)


Out[24]:
[<matplotlib.lines.Line2D at 0x10a249710>]

In [ ]: