In [ ]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline
sns.set_context('poster')
sns.set_style('darkgrid')
In [ ]:
# set random seed, for reproducibility
np.random.seed(12345)
In [ ]:
x_true = np.linspace(0,15,1000)
y_true = np.cos(x_true)
sigma_true = .3
x_train = np.random.choice(x_true, size=100)
y_train = np.random.laplace(np.cos(x_train), sigma_true)
In [ ]:
# remember what this looks like?
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.legend()
In [ ]:
import sklearn.linear_model
In [ ]:
clf = sklearn.linear_model.____________________
In [ ]:
X_train = x_train[:,None]
clf.fit(_____________________, _____________________)
In [ ]:
X_true = x_true[:,None]
y_pred = clf.predict(_____________________)
In [ ]:
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.plot(x_true, y_pred, '-', label='Predicted')
plt.legend()
In [ ]:
import sklearn.preprocessing
In [ ]:
poly = sklearn.preprocessing.PolynomialFeatures(degree=_____________________)
In [ ]:
poly.fit_transform(X_train) # What does fit_transform seem to do? Is poly.fit different? What about poly.transform?
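As a quick check (a minimal sketch, assuming you have filled in a degree above), fitting and then transforming should give the same matrix as fit_transform:
In [ ]:
# poly.fit learns the expansion (here, just the degree and the number of
# input features); poly.transform applies it; fit_transform does both at once
Z1 = poly.fit_transform(X_train)
Z2 = poly.fit(X_train).transform(X_train)
np.allclose(Z1, Z2)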
In [ ]:
clf = sklearn.linear_model.LinearRegression()
clf.fit(poly.transform(X_train), y_train)
In [ ]:
X_true = x_true[:,None]
y_pred = clf.predict(poly.transform(X_true))
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.plot(x_true, y_pred, '-', label='Predicted')
plt.legend()
Adding squared terms doesn't do much, but how about a higher-degree polynomial?
In [ ]:
poly = sklearn.preprocessing.PolynomialFeatures(degree=_____________________)
clf = sklearn.linear_model.LinearRegression()
clf.fit(poly._____________________(X_train), y_train)
X_true = x_true[:,None]
y_pred = clf.predict(poly._____________________(X_true))
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.plot(x_true, y_pred, '-', label='Predicted')
plt.legend()
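For reference, a minimal sketch of one way the blanks above could be filled in (degree 10 is an assumption; any reasonably high degree shows the same behavior):
In [ ]:
# one possible completion, assuming degree=10
poly = sklearn.preprocessing.PolynomialFeatures(degree=10)
clf = sklearn.linear_model.LinearRegression()
clf.fit(poly.fit_transform(X_train), y_train)
y_pred = clf.predict(poly.transform(X_true))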
Not so good if you have to extrapolate, though...
In [ ]:
x_true = np.linspace(-2,17,1000)
y_true = np.cos(x_true)
X_true = x_true[:,None]
y_pred = clf.predict(poly.transform(X_true))
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.plot(x_true, y_pred, '-', label='Predicted')
plt.legend()
How about if you know that this is some sort of periodic function?
In [ ]:
def my_transform(X, phases=[0,.5], freqs=[1., 2.]):
    """ Return sine waves of a number of different phases and frequencies

    Parameters
    ----------
    X : _____________________
    phases : _____________________
    freqs : _____________________

    Returns
    -------
    Returns _____________________
    """
    n,p = X.shape
    assert p == 1, "designed for univariate feature space only"
    X = X.reshape(n)
    X_new = np.empty((n,len(phases)*len(freqs)))
    col = 0
    for a in phases:
        for b in freqs:
            X_new[:,col] = np.cos(a + b*X)
            col += 1
    assert np.all(np.isfinite(X_new))
    return X_new
If you really know the phase and frequency, you can get it:
In [ ]:
phases = ________________________
freqs = ________________________
clf = sklearn.linear_model.LinearRegression()
clf.fit(my_transform(X_train, phases, freqs), y_train)
X_true = x_true[:,None]
y_pred = clf.predict(my_transform(X_true, phases, freqs))
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.plot(x_true, y_pred, '-', label='Predicted')
plt.legend()
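Since the training data were generated from $\cos(x)$, the true phase is 0 and the true frequency is 1, so one natural way to fill in the blanks above is:
In [ ]:
# the truth is cos(x) = cos(0 + 1*x), so a single phase and frequency suffice
phases = [0.]
freqs = [1.]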
But suppose that you do not have it so narrowed down:
In [ ]:
phases=[0,1,2]
freqs=np.linspace(0,6,7)
clf = sklearn.linear_model.LinearRegression()
clf.fit(my_transform(X_train, phases, freqs), y_train)
X_true = x_true[:,None]
y_pred = clf.predict(my_transform(X_true, phases, freqs))
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.plot(x_true, y_pred, '-', label='Predicted')
plt.legend()
And there is the whole small-$n$, large-$p$ issue you need to worry about:
In [ ]:
phases=np.linspace(0,6,10)
freqs=np.linspace(0,6,10)
clf = sklearn.linear_model.LinearRegression()
clf.fit(my_transform(X_train, phases, freqs), y_train)
X_true = x_true[:,None]
y_pred = clf.predict(my_transform(X_true, phases, freqs))
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.plot(x_true, y_pred, '-', label='Predicted')
plt.legend()
Which can end up totally breaking if $n$ is too small and $p$ is too large: with more features than observations, ordinary least squares is underdetermined, and the fit can do almost anything between the training points.
In [ ]:
phases=np.linspace(0,6,25)
freqs=np.linspace(0,6,25)
clf = sklearn.linear_model.LinearRegression()
clf.fit(my_transform(X_train, phases, freqs), y_train)
X_true = x_true[:,None]
y_pred = clf.predict(my_transform(X_true, phases, freqs))
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.plot(x_true, y_pred, '-', label='Predicted')
plt.legend()
Enter penalty functions, e.g. sklearn.linear_model.Ridge:
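To be concrete, sklearn's Ridge minimizes the usual squared error plus an $\ell_2$ penalty on the coefficient vector $w$:
$$\min_w \; \|y - Xw\|_2^2 + \alpha \|w\|_2^2$$
Larger values of alpha shrink the coefficients more aggressively toward zero.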
In [ ]:
phases=np.linspace(0,6,25)
freqs=np.linspace(0,6,25)
clf = sklearn.linear_model.Ridge(alpha=________________________)
clf.fit(my_transform(X_train, phases, freqs), y_train)
X_true = x_true[:,None]
y_pred = clf.predict(my_transform(X_true, phases, freqs))
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.plot(x_true, y_pred, '-', label='Predicted')
plt.legend()
Or for this sort of application, sklearn.linear_model.Lasso:
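Lasso swaps the $\ell_2$ penalty for an $\ell_1$ penalty, which tends to push many coefficients to exactly zero; sklearn's objective (with $n$ the number of samples) is
$$\min_w \; \frac{1}{2n} \|y - Xw\|_2^2 + \alpha \|w\|_1$$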
In [ ]:
phases=np.linspace(0,6,25)
freqs=np.linspace(0,6,25)
clf = sklearn.linear_model.Lasso(alpha=________________________)
clf.fit(my_transform(X_train, phases, freqs), y_train)
X_true = x_true[:,None]
y_pred = clf.predict(my_transform(X_true, phases, freqs))
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.plot(x_true, y_pred, '-', label='Predicted')
plt.legend()
Amazing, in my humble opinion!
In [ ]:
phases=np.linspace(0,6,100)
freqs=np.linspace(0,6,100)
clf = sklearn.linear_model.Lasso(alpha=.05)
clf.fit(my_transform(X_train, phases, freqs), y_train)
X_true = x_true[:,None]
y_pred = clf.predict(my_transform(X_true, phases, freqs))
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.plot(x_true, y_pred, '-', label='Predicted')
plt.legend()
In [ ]:
clf.______________________
In [ ]:
clf.coef_.________________________
In [ ]:
np.where(clf.coef_>.01)
In [ ]:
print(phases[2:5])
print(freqs[15:18])
What are the phase and frequency of the sine wave with a coefficient weight greater than 0.01?
In [ ]:
________________________, ________________________
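One way to answer this is to decode a flat column index back into a (phase, frequency) pair; a minimal sketch, relying on how my_transform orders its columns (outer loop over phases, inner loop over freqs):
In [ ]:
# column index = phase_index * len(freqs) + freq_index, by construction
cols = np.where(clf.coef_ > .01)[0]
for col in cols:
    i, j = divmod(col, len(freqs))
    print(phases[i], freqs[j])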
One more penalty function, because I love the name: Elastic Nets
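The elastic net simply mixes the two penalties; with $\rho$ denoting l1_ratio, sklearn's ElasticNet minimizes
$$\min_w \; \frac{1}{2n}\|y - Xw\|_2^2 + \alpha \rho \|w\|_1 + \frac{\alpha (1-\rho)}{2}\|w\|_2^2$$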
In [ ]:
phases=np.linspace(0,6,100)
freqs=np.linspace(0,6,100)
clf = sklearn.linear_model.ElasticNet(alpha=________________________, l1_ratio=________________________)
clf.fit(my_transform(X_train, phases, freqs), y_train)
X_true = x_true[:,None]
y_pred = clf.predict(my_transform(X_true, phases, freqs))
plt.plot(x_true, y_true, '-', label='Truth')
plt.plot(x_train, y_train, 's', label='Train')
plt.plot(x_true, y_pred, '-', label='Predicted')
plt.legend()
In [ ]:
np.where(clf.coef_>.01)
How should you find appropriate values of alpha (and l1_ratio, in the elastic net case)? Cross-validation, of course. There are some special tricks for making this fast, and sklearn has implementations you can use if this is relevant to your projects.
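For example, a minimal sketch using sklearn's built-in LassoCV, which selects alpha by cross-validation (the candidate grid below is an arbitrary assumption, not a recommendation):
In [ ]:
# LassoCV refits the lasso along a path of alphas and keeps the one with the
# best cross-validated score
cv_clf = sklearn.linear_model.LassoCV(alphas=np.logspace(-3, 0, 20), cv=5)
cv_clf.fit(my_transform(X_train, phases, freqs), y_train)
cv_clf.alpha_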
In [ ]:
# set random seed for reproducibility
np.random.seed(12345)
x_true = np.linspace(0,15,1000)
y_true_numeric = np.cos(x_true)
y_true = (np.random.uniform(low=-1, high=1, size=1000) <= y_true_numeric)
train = np.random.choice(range(1000), size=100)
x_train = x_true[train]
y_train = y_true[train]
Not as pretty...
In [ ]:
plt.plot(x_true, y_true_numeric, '-', label='Propensity')
plt.plot(x_train, np.random.normal(0,________________________,size=100)+y_train, 's', label='Jittered Train')
plt.legend()
That would look better if the class labels were $\pm1$:
In [ ]:
y_true = 2*y_true - 1
y_train = y_true[train]
In [ ]:
plt.plot(x_true, y_true_numeric, '-', label='Propensity')
plt.plot(x_train, np.random.normal(0,.01,size=100)+y_train, 's', label='Jittered Train')
plt.legend()
In [ ]:
# fit the linear model
clf = sklearn.linear_model.LinearRegression()
X_train = x_train[:,None]
clf.fit(X_train, y_train)
In [ ]:
# make a categorical prediction with a linear regression
y_pred = (clf.predict(X_train) >= ________________________)
In [ ]:
# accuracy
np.mean(________________________ == ________________________)
There must be something wrong with that, right??? (Hint: y_pred is a boolean array, while y_train is now $\pm1$.)
In [ ]:
y_pred = ________________________
In [ ]:
np.mean(y_pred == y_train)
Not very good, and that is the in-sample fit!
But we know how to fix it:
In [ ]:
phases=np.linspace(0,6,100)
freqs=np.linspace(0,6,100)
clf = sklearn.linear_model.ElasticNet(alpha=.1, l1_ratio=________________________)
clf.fit(my_transform(X_train, phases, freqs), y_train)
y_pred = clf.predict(my_transform(X_train, phases, freqs))
y_pred = (y_pred >= 0)
y_pred = 2*y_pred -1
In [ ]:
np.mean(y_pred == y_train)
You should not be impressed by this until you test it out-of-sample... what do you think will happen in that case?
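A minimal sketch of such a check, holding out 20% of the training points (the split fraction, random_state, and l1_ratio=.5 are assumptions):
In [ ]:
import sklearn.model_selection
# refit on 80% of the points and score on the held-out 20%
X_tr, X_te, y_tr, y_te = sklearn.model_selection.train_test_split(
    X_train, y_train, test_size=0.2, random_state=0)
clf_oos = sklearn.linear_model.ElasticNet(alpha=.1, l1_ratio=.5)  # l1_ratio is an assumption
clf_oos.fit(my_transform(X_tr, phases, freqs), y_tr)
y_te_pred = 2*(clf_oos.predict(my_transform(X_te, phases, freqs)) >= 0) - 1
np.mean(y_te_pred == y_te)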
Using linear regression for this is a little silly, though. We should consider a loss function that is more appropriate to the task at hand:
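With labels $y \in \{-1, +1\}$ and linear score $f(x)$, logistic regression minimizes the logistic (log) loss
$$\log\left(1 + e^{-y f(x)}\right)$$
plus a penalty; note that in sklearn the parameter C is the inverse of the regularization strength, so smaller C means more regularization.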
In [ ]:
clf = sklearn.linear_model.LogisticRegression(penalty=________________________, C=________________________)
clf.fit(my_transform(X_train, phases, freqs), y_train)
y_pred = clf.predict(my_transform(X_train, phases, freqs))
# don't need to mess around with the results this time...
np.mean(y_pred == y_train)
Surprising that it is worse, not better. Worth testing this out of sample, too, huh?
The hinge loss may be less familiar:
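For the same $\pm 1$ labels, the hinge loss is
$$\max(0,\; 1 - y f(x)),$$
which is zero once a point is on the correct side of the decision boundary with a margin of at least 1.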
In [ ]:
import sklearn.svm
In [ ]:
clf = sklearn.svm.SVC(kernel='linear')
clf.fit(my_transform(X_train, phases, freqs), y_train)
y_pred = clf.predict(my_transform(X_train, phases, freqs))
# don't need to mess around with the results this time...
np.mean(y_pred == y_train)
The other cool thing about the SVM is the kernel trick. It is possible to use the kernel trick in a lot of settings, but it is particularly central to SVM. Let us first remove all of the transformed covariate columns we added:
In [ ]:
clf = sklearn.svm.SVC(kernel='linear', C=________________________)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_train)
np.mean(y_pred == y_train)
Not good at all. But with the kernel trick:
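For reference, the two kernels used below are the polynomial and RBF (Gaussian) kernels:
$$k_{\mathrm{poly}}(x, x') = (\gamma \langle x, x'\rangle + r)^d, \qquad k_{\mathrm{rbf}}(x, x') = \exp(-\gamma \|x - x'\|^2)$$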
In [ ]:
clf = sklearn.svm.SVC(kernel='poly', C=_____________________, degree=_____________________)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_train)
np.mean(y_pred == y_train)
In [ ]:
clf = sklearn.svm.SVC(kernel='rbf', C=_____________________, gamma=_____________________)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_train)
np.mean(y_pred == y_train)
Homework: use out-of-sample cross-validation to find the best values for C, gamma, and degree. Use a train/validation/test split to estimate the accuracy of the best method. Bonus (extra-hard, theory-and-application) problem: figure out the best accuracy possible for this toy problem.
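A minimal sketch of the kind of search the homework asks for (the parameter grid, split fraction, and random_state are assumptions, not answers):
In [ ]:
import sklearn.model_selection
# hold out a final test set, then grid-search C and gamma for the RBF kernel
# with 5-fold cross-validation on the remaining points
X_tr, X_te, y_tr, y_te = sklearn.model_selection.train_test_split(
    X_train, y_train, test_size=0.25, random_state=0)
param_grid = {'C': np.logspace(-2, 2, 5), 'gamma': np.logspace(-2, 2, 5)}
search = sklearn.model_selection.GridSearchCV(sklearn.svm.SVC(kernel='rbf'),
                                              param_grid, cv=5)
search.fit(X_tr, y_tr)
search.best_params_, search.best_estimator_.score(X_te, y_te)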