Define constants. Adjust these values (for example, the data path) to match your local setup.
In [1]:
# Relative path to the gzipped CSV that is passed to splitter.load below.
DATA_PATH = '../../data/sdss_dr7_photometry_source.csv.gz'
In [2]:
import sys
import matplotlib.pyplot as plt
import numpy as np
import sklearn.gaussian_process
sys.path.insert(1, '..')
import splitter
%matplotlib inline
In [9]:
# Read the dataset, then carve it into train and test partitions.
# NOTE(review): 10000 and 100 look like the train/test sizes -- confirm
# against splitter.split, which is not visible from this notebook.
data = splitter.load(DATA_PATH)
train_split, test_split = splitter.split(data, 10000, 100)
train_X, train_y = train_split
test_X, test_y = test_split
In [10]:
def preprocess_sgd(train_X, train_y, test_X, n_landmarks=2000):
    """Turn train and test inputs into Matern-kernel features.

    Every sample is represented by its Matern kernel evaluations against
    the first ``n_landmarks`` rows of ``train_X``.

    Parameters
    ----------
    train_X : array-like
        Training inputs. Its leading rows also serve as the reference
        subset the kernel is evaluated against.
    train_y : any
        Not read by this function; kept so existing callers keep working.
    test_X : array-like
        Test inputs.
    n_landmarks : int, optional
        Number of leading rows of ``train_X`` used as the reference subset.
        Defaults to 2000, the value that used to be hard-coded.

    Returns
    -------
    tuple
        ``(sgd_train_X, sgd_test_X)``: the kernel evaluated between
        ``train_X`` / ``test_X`` and the reference subset.
    """
    kernel = sklearn.gaussian_process.kernels.Matern()
    # Slicing past the end is safe: a short train_X simply yields fewer rows.
    subset = train_X[:n_landmarks]
    sgd_train_X = kernel(train_X, subset)
    sgd_test_X = kernel(test_X, subset)
    return sgd_train_X, sgd_test_X
In [11]:
# Compute the kernel features for the train and test inputs.
sgd_train_X, sgd_test_X = preprocess_sgd(train_X, train_y, test_X)
In [19]:
# Gram matrix of the test-set kernel features, shown as a heat map.
approximate_kernel = np.dot(sgd_test_X, sgd_test_X.T)
approximate_image = plt.matshow(approximate_kernel)
plt.colorbar(approximate_image)
Out[19]:
In [18]:
# Exact Matern kernel on the test inputs, and its matrix square for
# comparison with the feature-based Gram matrix.
matern_kernel = sklearn.gaussian_process.kernels.Matern()
actual_kernel = matern_kernel(test_X, test_X)
Ksquared = np.matmul(actual_kernel, actual_kernel)
squared_image = plt.matshow(Ksquared)
plt.colorbar(squared_image)
Out[18]:
In [17]:
# Size of the gap between the squared exact kernel and the approximation
# (np.linalg.norm defaults to the Frobenius norm for a 2-D input).
kernel_residual = Ksquared - approximate_kernel
np.linalg.norm(kernel_residual)
Out[17]:
In [23]:
# The kernel matrix is symmetric, so use eigvalsh: it returns real
# eigenvalues in ascending order, whereas the general-purpose eig gives no
# ordering guarantee and may return complex values with round-off-sized
# imaginary parts. Reverse to plot the spectrum from largest to smallest.
actual_eig = np.linalg.eigvalsh(actual_kernel)[::-1]
plt.plot(actual_eig)
plt.xlabel('eigenvalue index')
plt.ylabel('eigenvalue')
plt.title('Spectrum of the exact kernel');
Out[23]:
In [24]:
# approximate_kernel is a Gram matrix (A times its transpose), so it is
# symmetric. eigvalsh returns real eigenvalues in ascending order, whereas
# the general-purpose eig gives no ordering guarantee and may return complex
# values. Reverse to plot the spectrum from largest to smallest.
approximate_eig = np.linalg.eigvalsh(approximate_kernel)[::-1]
plt.plot(approximate_eig)
plt.xlabel('eigenvalue index')
plt.ylabel('eigenvalue')
plt.title('Spectrum of the approximate kernel');
Out[24]:
In [ ]: