In [1]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline
sns.set_context('poster')
sns.set_style('darkgrid')
In [2]:
# set random seed, for reproducibility of all the simulated data drawn below
np.random.seed(12345)
Adapted from http://www.astroml.org/book_figures/chapter8/fig_gp_example.html.
In [3]:
# noiseless ground truth: y = cos(x) on a dense grid over [0, 10]
x_true = np.linspace(0, 10, num=500)
y_true = np.cos(x_true)

# simulate noisy training observations at a random subset of the grid points
x_train = np.random.choice(x_true, size=25, replace=False)
dy = 0.5  # observation noise standard deviation
y_train = np.random.normal(loc=np.cos(x_train), scale=dy)
In [4]:
# overlay the noisy training points ('s' = square markers) on the true curve
for args, kwargs in [
        ((x_true, y_true), dict(label='true')),
        ((x_train, y_train, 's'), dict(label='observed')),
]:
    plt.plot(*args, **kwargs)
plt.legend()
Out[4]:
In [5]:
from sklearn.gaussian_process import GaussianProcess
In [6]:
# NOTE(review): sklearn.gaussian_process.GaussianProcess is the legacy GPR API;
# it was deprecated in scikit-learn 0.18 and removed in 0.20 (its replacement,
# GaussianProcessRegressor, has a different interface, e.g. `nugget` -> `alpha`).
# Running this notebook therefore requires an old scikit-learn (< 0.20).
# nugget=.5 matches the simulated noise level dy = 0.5 set above.
gp = GaussianProcess(theta0=100, nugget=.5)
# fit expects a 2-D feature matrix, so the 1-D x values become a (25, 1) column
gp.fit(x_train[:, None], y_train)
y_pred = gp.predict(x_true[:, None])
In [7]:
# x_train[:, None] adds a trailing axis, turning the length-25 vector into a
# (25, 1) column matrix -- exactly what reshape((25, 1)) produces.
as_column = x_train.reshape((25, 1))
assert np.all(as_column == x_train[:, None]), 'x_train[:,None] is a cryptic way to reshape the array'
In [8]:
# How does the GPR prediction compare to the truth and the training data?
for args, kwargs in [
        ((x_true, y_true), dict(label='true')),
        ((x_train, y_train, 's'), dict(label='observed')),
        ((x_true, y_pred), dict(label='predicted')),
]:
    plt.plot(*args, **kwargs)
plt.legend()
Out[8]:
We can measure the training error (RMSE):
In [9]:
# in-sample (training) error: predict at the same points the model was fit on
y_pred = gp.predict(x_train[:, None])
rmse = np.sqrt(np.mean((y_pred - y_train)**2))
# FIX: the Python-2-only `print` statement is replaced with a form that
# produces the same output under both Python 2 and Python 3
print('(in-sample) rmse = %s' % rmse)
To measure the "test" error, we need different data. Since this is a simulation, we can have as much as we want:
In [10]:
# fresh test set, drawn from the same generative process as the training data
x_test = np.random.uniform(0, 10, size=1000)
y_test = np.random.normal(np.cos(x_test), dy)
# out-of-sample prediction
y_pred = gp.predict(x_test[:, None])
rmse = np.sqrt(np.mean((y_pred - y_test)**2))
# FIX: Python-2-only `print` statement replaced with a 2/3-compatible form
print('(out-of-sample) rmse = %s' % rmse)
Refactor this into a function:
In [11]:
def test_train_error(theta0):
    """Fit a GPR with the given theta0 and measure train/test RMSE.

    Uses the module-level x_train/y_train and x_test/y_test arrays.
    Returns a dict with keys 'theta0', 'err_train', and 'err_test'.
    """
    model = GaussianProcess(theta0=theta0, nugget=.5)
    model.fit(x_train[:, None], y_train)

    def rmse(X, y):
        # root-mean-squared error of the model's predictions at X against y
        return np.sqrt(np.mean((model.predict(X) - y)**2))

    return dict(theta0=theta0,
                err_train=rmse(x_train[:, None], y_train),
                err_test=rmse(x_test[:, None], y_test))
test_train_error(100)
Out[11]:
I didn't tell you about dict last time! This is one of my favorite Python things. It is like an array, but you can use (almost) whatever you want as the index — any hashable value, such as a string or a number. In CS lingo, it is an associative array. You can create one with curly brackets:
d1 = {}
d1['abie'] = 'forgetful'
d2 = {'yes':1, 'no':0}
d2['yes'] # what value will this produce?
In [12]:
# sweep theta0 over a log-spaced grid and collect train/test errors
theta0_grid = np.exp(np.linspace(-3, 5, 50))
results = pd.DataFrame([test_train_error(theta0) for theta0 in theta0_grid])
In [13]:
# train vs. test error as a function of theta0 (log-scaled x axis)
plt.semilogx(results.theta0, results.err_train, label='Train Error')
plt.plot(results.theta0, results.err_test, label='Test Error')
plt.legend()
# BUG FIX: '$\theta_0$' is not a raw string, so Python parses the leading
# '\t' as a TAB character and the label renders as a tab followed by 'heta_0$'.
# A raw string preserves the backslash for matplotlib's mathtext.
plt.xlabel(r'$\theta_0$')
plt.ylabel('RMSE')
Out[13]:
So the best value is around $\theta_0 = 1.1$:
In [14]:
# refit with (roughly) the best theta0 found in the sweep above
# NOTE(review): legacy GaussianProcess API, removed in scikit-learn 0.20
gp = GaussianProcess(theta0=1.1, nugget=.5)
gp.fit(x_train[:, None], y_train) # x_train[:, None] reshapes the 1-D array into a (25, 1) column matrix
y_pred = gp.predict(x_true[:, None])
In [15]:
# the fit at the chosen theta0, against the truth and the training data
for args, kwargs in [
        ((x_true, y_true), dict(label='true')),
        ((x_train, y_train, 's'), dict(label='observed')),
        ((x_true, y_pred), dict(label='predicted')),
]:
    plt.plot(*args, **kwargs)
plt.legend()
Out[15]:
In [16]:
# for comparison: refit with a much smaller theta0 and plot the result
gp = GaussianProcess(nugget=.5, theta0=.1)
gp.fit(x_train[:, None], y_train)
y_pred = gp.predict(x_true[:, None])
for args, kwargs in [
        ((x_true, y_true), dict(label='true')),
        ((x_train, y_train, 's'), dict(label='observed')),
        ((x_true, y_pred), dict(label='predicted')),
]:
    plt.plot(*args, **kwargs)
plt.legend()
Out[16]:
In [17]:
# simulated two-class classification data from ISL Figure 2.9;
# the first CSV column holds the row index
df = pd.read_csv('ISL_Fig_2_9_data.csv', index_col=0)
In [18]:
# peek at the first rows to confirm the columns loaded as expected
df.head()
Out[18]:
In [19]:
# translate the 'Y' class labels into the colors used for plotting
label_to_color = {'red': 'orange', 'blue': 'purple'}
df['color'] = df['Y'].map(label_to_color)
In [20]:
# scatter the two classes, one group per color
plt.figure(figsize=(8, 8))
for color, group in df.groupby('color'):
    plt.plot(group['X.1'], group['X.2'], 's', color=color, ms=10, alpha=.6)
In [21]:
# load the k-NN library
import sklearn.neighbors
In [22]:
# create the k-NN training data
# feature matrix: the two coordinate columns
feature_cols = ['X.1', 'X.2']
X_train = np.array(df.filter(feature_cols))
# corresponding 0/1 labels
# NOTE(review): this rebinds y_train, clobbering the GPR training targets
# defined earlier in the notebook -- a hidden-state hazard on partial re-runs
y_train = np.array(df.color.map({'orange': 0, 'purple': 1}))
In [23]:
n_neighbors = 20
weights = 'uniform'
# FIX: pass the arguments by keyword -- passing `weights` positionally relies
# on the estimator's parameter order, which newer scikit-learn versions reject
# (estimator constructor parameters became keyword-only)
clf = sklearn.neighbors.KNeighborsClassifier(n_neighbors=n_neighbors,
                                             weights=weights)
clf.fit(X_train, y_train)
Out[23]:
In [24]:
# explore the decision boundary by classifying every point of a dense
# 500x500 grid covering [-3, 4] x [-3, 4]
grid = np.linspace(-3, 4, 500)
xx, yy = np.meshgrid(grid, grid)
mesh_points = np.c_[xx.ravel(), yy.ravel()]
Z = clf.predict(mesh_points).reshape(xx.shape)
In [25]:
# scatter the training classes with the decision boundary on top
plt.figure(figsize=(8, 8))
for color, group in df.groupby('color'):
    plt.plot(group['X.1'], group['X.2'], 's', color=color, ms=10, alpha=.6)
# the boundary is the 0.5 level set of the predicted 0/1 class
plt.contour(xx, yy, Z, levels=[.5], colors='k', zorder=10)
Out[25]:
In [26]:
# in-sample concordance: fraction of training points classified correctly
y_pred = clf.predict(X_train)
c = np.mean(y_train == y_pred)
# FIX: Python-2-only `print` statement replaced with a 2/3-compatible form
print('(in-sample) concordance = %s' % c)
To measure out-of-sample error, we need to know the data-generating distribution, or approximate it somehow. One way (to which we will return) is the train/test split. In this simulation environment, however, we can simply generate more data:
In [27]:
# component means of the two Gaussian mixtures that generated the ISL Fig 2.9
# data (one CSV per class); the first CSV column holds the row index
orange_means = pd.read_csv('ISL_Fig_2_9_orange_mean.csv', index_col=0)
purple_means = pd.read_csv('ISL_Fig_2_9_purple_mean.csv', index_col=0)
To simulate data of either color, first choose a mean uniformly at random, and then draw from a normal centered at that mean, with standard deviation 0.5:
In [28]:
def rmixture(means):
    """Draw one point from the Gaussian mixture whose component means are
    the rows of `means`: pick a row uniformly at random, then sample from
    a normal centered at that row with standard deviation 0.5."""
    which = np.random.randint(len(means.index))
    center = means.iloc[which]
    return np.random.normal(loc=center, scale=.5)
# sanity check: one simulated draw of each color (displays a tuple)
rmixture(orange_means), rmixture(purple_means)
Out[28]:
In [29]:
# simulate a large labeled test set: n_test points of each color,
# orange first, then purple
n_test = 1000
X_test = np.array([rmixture(orange_means) for _ in range(n_test)]
                  + [rmixture(purple_means) for _ in range(n_test)])
# labels: orange -> 0 for the first n_test rows, purple -> 1 for the rest
y_test = np.array([0] * n_test + [1] * n_test)
Have a look at this newly simulated data:
In [30]:
# the simulated test points, with the decision boundary from the training fit
plt.figure(figsize=(8, 8))
orange_pts, purple_pts = X_test[:n_test], X_test[n_test:]
plt.plot(orange_pts[:, 0], orange_pts[:, 1], 's', color='orange', ms=10, alpha=.6)
plt.plot(purple_pts[:, 0], purple_pts[:, 1], 's', color='purple', ms=10, alpha=.6)
plt.contour(xx, yy, Z, levels=[.5], colors='k', zorder=10)
Out[30]:
In [31]:
# classify the simulated test points with the k=20 classifier fit above
y_pred = clf.predict(X_test)
In [32]:
# out-of-sample concordance: fraction of test points classified correctly
c = np.mean(y_test == y_pred)
# FIX: Python-2-only `print` statement replaced with a 2/3-compatible form
print('(out-of-sample) concordance = %s' % c)
Now refactor the test-/train-error measurement into a function that takes the $k$ in $k$-NN as a parameter, and sweep over a range of $k$ values to find the best $k$, and reproduce ISL Figure 2.17.
In [33]:
def test_train_error(k):
    """Calculate the misclassification rate on the train and test data
    for k-NN with the given number of neighbors k.

    Uses the module-level X_train/y_train and X_test/y_test arrays.
    Returns a dict with keys 'k', 'err_train', and 'err_test'.

    NOTE(review): this rebinds the name test_train_error, shadowing the
    GPR version defined earlier in the notebook.
    """
    # FIX: arguments passed by keyword -- positional `weights` relies on a
    # parameter order that newer scikit-learn versions reject
    clf = sklearn.neighbors.KNeighborsClassifier(n_neighbors=k,
                                                 weights='uniform')
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_train)
    err_train = np.mean(y_pred != y_train)
    y_pred = clf.predict(X_test)
    err_test = np.mean(y_pred != y_test)
    return dict(k=k, err_train=err_train, err_test=err_test)
test_train_error(1)
Out[33]:
In [34]:
# sweep k from 1 to 99 and collect the train/test error rates
results = pd.DataFrame([test_train_error(k) for k in range(1, 100)])
In [35]:
# reproduce ISL Figure 2.17: error rate vs. model flexibility (1/k, log scale)
flexibility = 1/results.k
plt.semilogx(flexibility, results.err_train, label='Train Error')
plt.plot(flexibility, results.err_test, label='Test Error')
plt.legend()
plt.xlabel('$1/k$')
plt.ylabel('Error Rate')
Out[35]: