In [1]:
!date
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline
sns.set_context('poster')
sns.set_style('darkgrid')
In [2]:
# set random seed, for reproducibility
np.random.seed(12345)
In [3]:
x_true = np.linspace(0,15,1000)
y_true = np.cos(x_true)
sigma_true = .3
x_train = np.random.choice(x_true, size=100)
y_train = np.random.laplace(np.cos(x_train), sigma_true)
In [4]:
# remember what this looks like?
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.legend()
Out[4]:
In [5]:
import sklearn.linear_model
In [6]:
clf = sklearn.linear_model.LinearRegression()
In [7]:
X_train = x_train[:,None]
clf.fit(X_train, y_train)
Out[7]:
In [8]:
X_true = x_true[:,None]
y_pred = clf.predict(X_true)
In [9]:
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.plot(x_true, y_pred, '-', label='Predicted')
plt.legend()
Out[9]:
In [10]:
import sklearn.preprocessing
In [11]:
poly = sklearn.preprocessing.PolynomialFeatures(degree=2)
In [12]:
poly.fit_transform(X_train)
Out[12]:
In [13]:
clf = sklearn.linear_model.LinearRegression()
clf.fit(poly.transform(X_train), y_train)
Out[13]:
In [14]:
X_true = x_true[:,None]
y_pred = clf.predict(poly.transform(X_true))
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.plot(x_true, y_pred, '-', label='Predicted')
plt.legend()
Out[14]:
Adding squared terms doesn't do much, but how about a higher-degree polynomial?
In [15]:
poly = sklearn.preprocessing.PolynomialFeatures(degree=10)
clf = sklearn.linear_model.LinearRegression()
clf.fit(poly.fit_transform(X_train), y_train)
X_true = x_true[:,None]
y_pred = clf.predict(poly.transform(X_true))
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.plot(x_true, y_pred, '-', label='Predicted')
plt.legend()
Out[15]:
Not so good if you have to extrapolate, though...
In [16]:
x_true = np.linspace(-2,17,1000)
y_true = np.cos(x_true)
X_true = x_true[:,None]
y_pred = clf.predict(poly.transform(X_true))
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.plot(x_true, y_pred, '-', label='Predicted')
plt.legend()
Out[16]:
How about if you know that this is some sort of periodic function?
In [17]:
def my_transform(X, phases=[0,.5], freqs=[1., 2.]):
    """Return sinusoids (cosine waves) at a number of different phases and frequencies"""
    n, p = X.shape
    assert p == 1, "designed for univariate feature space only"
    X = X.reshape(n)
    X_new = np.empty((n, len(phases)*len(freqs)))
    col = 0
    for a in phases:
        for b in freqs:
            # one column per (phase, frequency) pair
            X_new[:, col] = np.cos(a + b*X)
            col += 1
    assert np.all(np.isfinite(X_new))
    return X_new
If you really know the phase and frequency, you can get it:
In [18]:
phases=[0, np.pi]
freqs=[1, 2]
clf = sklearn.linear_model.LinearRegression()
clf.fit(my_transform(X_train, phases, freqs), y_train)
X_true = x_true[:,None]
y_pred = clf.predict(my_transform(X_true, phases, freqs))
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.plot(x_true, y_pred, '-', label='Predicted')
plt.legend()
Out[18]:
But suppose that you do not have it so narrowed down:
In [19]:
phases=[0,1,2]
freqs=np.linspace(0,6,7)
clf = sklearn.linear_model.LinearRegression()
clf.fit(my_transform(X_train, phases, freqs), y_train)
X_true = x_true[:,None]
y_pred = clf.predict(my_transform(X_true, phases, freqs))
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.plot(x_true, y_pred, '-', label='Predicted')
plt.legend()
Out[19]:
And there is that whole small $n$ large $p$ thing you need to worry about:
In [20]:
phases=np.linspace(0,6,10)
freqs=np.linspace(0,6,10)
clf = sklearn.linear_model.LinearRegression()
clf.fit(my_transform(X_train, phases, freqs), y_train)
X_true = x_true[:,None]
y_pred = clf.predict(my_transform(X_true, phases, freqs))
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.plot(x_true, y_pred, '-', label='Predicted')
plt.legend()
Out[20]:
Which can end up totally breaking if $n$ is too small and $p$ is too large.
In [21]:
phases=np.linspace(0,6,25)
freqs=np.linspace(0,6,25)
clf = sklearn.linear_model.LinearRegression()
clf.fit(my_transform(X_train, phases, freqs), y_train)
X_true = x_true[:,None]
y_pred = clf.predict(my_transform(X_true, phases, freqs))
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.plot(x_true, y_pred, '-', label='Predicted')
plt.legend()
Out[21]:
Enter penalty functions, e.g. sklearn.linear_model.Ridge:
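For reference (hedging on sklearn's exact scaling conventions), Ridge keeps the least-squares objective and adds an $\ell_2$ penalty on the coefficients,
$$\min_{\beta}\; \|y - X\beta\|_2^2 + \alpha \|\beta\|_2^2,$$
so larger alpha shrinks the coefficients toward zero.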
In [22]:
phases=np.linspace(0,6,25)
freqs=np.linspace(0,6,25)
clf = sklearn.linear_model.Ridge(alpha=1)
clf.fit(my_transform(X_train, phases, freqs), y_train)
X_true = x_true[:,None]
y_pred = clf.predict(my_transform(X_true, phases, freqs))
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.plot(x_true, y_pred, '-', label='Predicted')
plt.legend()
Out[22]:
Or for this sort of application, sklearn.linear_model.Lasso:
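Lasso uses an $\ell_1$ penalty instead, which tends to set coefficients exactly to zero (sklearn also rescales the data-fit term by the sample size, so its alpha is not directly comparable to Ridge's):
$$\min_{\beta}\; \tfrac{1}{2n}\|y - X\beta\|_2^2 + \alpha \|\beta\|_1.$$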
In [23]:
phases=np.linspace(0,6,25)
freqs=np.linspace(0,6,25)
clf = sklearn.linear_model.Lasso(alpha=.05)
clf.fit(my_transform(X_train, phases, freqs), y_train)
X_true = x_true[:,None]
y_pred = clf.predict(my_transform(X_true, phases, freqs))
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.plot(x_true, y_pred, '-', label='Predicted')
plt.legend()
Out[23]:
Amazing, in my humble opinion!
In [24]:
phases=np.linspace(0,6,100)
freqs=np.linspace(0,6,100)
clf = sklearn.linear_model.Lasso(alpha=.05)
clf.fit(my_transform(X_train, phases, freqs), y_train)
X_true = x_true[:,None]
y_pred = clf.predict(my_transform(X_true, phases, freqs))
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.plot(x_true, y_pred, '-', label='Predicted')
plt.legend()
Out[24]:
In [25]:
# One thing that is so great about this? You can understand the model:
clf.coef_.shape
Out[25]:
In [26]:
clf.coef_
Out[26]:
In [27]:
np.where(clf.coef_>.01)
Out[27]:
In [28]:
print(phases[2:5])
print(freqs[15:18])
One more penalty function, because I love the name: Elastic Nets
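As the name suggests, the elastic net blends the two penalties; in sklearn's parameterization (up to the exact scaling constants), l1_ratio $= \rho$ controls the mix:
$$\min_{\beta}\; \tfrac{1}{2n}\|y - X\beta\|_2^2 + \alpha\rho\|\beta\|_1 + \tfrac{\alpha(1-\rho)}{2}\|\beta\|_2^2.$$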
In [29]:
phases=np.linspace(0,6,100)
freqs=np.linspace(0,6,100)
clf = sklearn.linear_model.ElasticNet(alpha=.1, l1_ratio=.25)
clf.fit(my_transform(X_train, phases, freqs), y_train)
X_true = x_true[:,None]
y_pred = clf.predict(my_transform(X_true, phases, freqs))
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.plot(x_true, y_pred, '-', label='Predicted')
plt.legend()
Out[29]:
In [30]:
np.where(clf.coef_>.01)
Out[30]:
How should you find appropriate values of alpha (and of l1_ratio in the elastic-net case)? Cross-validation, of course. There are some special tricks for making this fast, and sklearn has implementations you can use if this is relevant to your projects.
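A minimal sketch of what that might look like, assuming a recent scikit-learn (LassoCV does the cross-validated search over alpha; ElasticNetCV is the analogue for elastic nets); the alpha grid and cv=5 below are arbitrary choices:
import numpy as np
import sklearn.linear_model

alphas = np.logspace(-3, 1, 20)  # candidate penalties, spread over a few orders of magnitude
cv_clf = sklearn.linear_model.LassoCV(alphas=alphas, cv=5)
cv_clf.fit(my_transform(X_train, phases, freqs), y_train)
cv_clf.alpha_  # the penalty chosen by 5-fold cross-validation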
In [31]:
# set random seed for reproducibility
np.random.seed(12345)
x_true = np.linspace(0,15,1000)
y_true_numeric = np.cos(x_true)
y_true = (np.random.uniform(low=-1, high=1, size=1000) <= y_true_numeric)
train = np.random.choice(range(1000), size=100)
x_train = x_true[train]
y_train = y_true[train]
Not as pretty...
In [32]:
plt.plot(x_true, y_true_numeric, '-', label='Propensity')
plt.plot(x_train, np.random.normal(0,.01,size=100)+y_train, 's', label='Jittered Train')
plt.legend()
Out[32]:
That would look better if the class labels were $\pm1$:
In [33]:
y_true = 2*y_true - 1
y_train = y_true[train]
In [34]:
plt.plot(x_true, y_true_numeric, '-', label='Propensity')
plt.plot(x_train, np.random.normal(0,.01,size=100)+y_train, 's', label='Jittered Train')
plt.legend()
Out[34]:
In [35]:
clf = sklearn.linear_model.LinearRegression()
X_train = x_train[:,None]
clf.fit(X_train, y_train)
Out[35]:
In [36]:
y_pred = (clf.predict(X_train) >= 0)
In [37]:
np.mean(y_pred == y_train)
Out[37]:
There must be something wrong with that, right???
In [38]:
y_pred
Out[38]:
In [39]:
y_train
Out[39]:
In [40]:
y_pred = 2*y_pred -1
In [41]:
np.mean(y_pred == y_train)
Out[41]:
Not very good, and that is in-sample fit!
But we know how to fix it:
In [42]:
phases=np.linspace(0,6,100)
freqs=np.linspace(0,6,100)
clf = sklearn.linear_model.ElasticNet(alpha=.1, l1_ratio=.25)
clf.fit(my_transform(X_train, phases, freqs), y_train)
y_pred = clf.predict(my_transform(X_train, phases, freqs))
y_pred = (y_pred >= 0)
y_pred = 2*y_pred -1
In [43]:
np.mean(y_pred == y_train)
Out[43]:
You should not be impressed by this until you test it out-of-sample... what do you think will happen in that case?
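A minimal sketch of how to check, assuming a recent scikit-learn (train_test_split lives in sklearn.model_selection); the 25% holdout is an arbitrary choice:
import sklearn.model_selection

# hold out a quarter of the training points and refit on the rest
X_tr, X_te, y_tr, y_te = sklearn.model_selection.train_test_split(
    X_train, y_train, test_size=0.25, random_state=0)
clf_oos = sklearn.linear_model.ElasticNet(alpha=.1, l1_ratio=.25)
clf_oos.fit(my_transform(X_tr, phases, freqs), y_tr)
y_te_pred = 2*(clf_oos.predict(my_transform(X_te, phases, freqs)) >= 0) - 1
np.mean(y_te_pred == y_te)  # held-out accuracy; expect it to be lower than the in-sample number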
Using linear regression for this is a little silly, though. We should consider a loss function that is more appropriate to the task at hand:
In [44]:
clf = sklearn.linear_model.LogisticRegression(penalty='l1', C=.05, solver='liblinear')  # liblinear supports the l1 penalty
clf.fit(my_transform(X_train, phases, freqs), y_train)
y_pred = clf.predict(my_transform(X_train, phases, freqs))
# don't need to mess around with the results this time...
np.mean(y_pred == y_train)
Out[44]:
Surprising that it is worse, not better. Worth testing this out of sample, too, huh?
The hinge loss may be less familiar:
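For a label $y \in \{-1, +1\}$ and a raw score $\hat y$, the hinge loss is $\max(0,\, 1 - y\hat y)$: zero once an example is classified correctly with margin at least one, and growing linearly otherwise.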
In [45]:
import sklearn.svm
In [46]:
clf = sklearn.svm.SVC(kernel='linear')
clf.fit(my_transform(X_train, phases, freqs), y_train)
y_pred = clf.predict(my_transform(X_train, phases, freqs))
# don't need to mess around with the results this time...
np.mean(y_pred == y_train)
Out[46]:
The other cool thing about the SVM is the kernel trick. It is possible to use the kernel trick in a lot of settings, but it is particularly central to SVM. Let us first remove all of the transformed covariate columns we added:
In [47]:
clf = sklearn.svm.SVC(kernel='linear', C=1.)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_train)
np.mean(y_pred == y_train)
Out[47]:
Not good at all. But with the kernel trick:
In [48]:
clf = sklearn.svm.SVC(kernel='poly', C=1., degree=6)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_train)
np.mean(y_pred == y_train)
Out[48]:
In [49]:
clf = sklearn.svm.SVC(kernel='rbf', C=1., gamma=0.1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_train)
np.mean(y_pred == y_train)
Out[49]:
Homework: use out-of-sample cross-validation to find the best values for C, gamma, and degree. Use a train/validation/test split to estimate the accuracy of the best method. Bonus (extra-hard theory-and-application problem): figure out the best accuracy possible for this toy problem.
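A starting-point sketch only, not a solution: in recent scikit-learn versions, sklearn.model_selection.GridSearchCV does the cross-validated search; the grids below are arbitrary placeholders:
import sklearn.model_selection, sklearn.svm

param_grid = {'C': [.1, 1, 10, 100], 'gamma': [.01, .1, 1]}  # placeholder grids, widen as needed
search = sklearn.model_selection.GridSearchCV(sklearn.svm.SVC(kernel='rbf'), param_grid, cv=5)
search.fit(X_train, y_train)
search.best_params_, search.best_score_  # then confirm accuracy on a held-out test set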