In [3]:
%matplotlib inline
import seaborn as sns
sns.set()
In [4]:
import numpy as np
import matplotlib.pyplot as plt
In [5]:
from IPython.display import Image, YouTubeVideo
Machine learning is a subfield of computer science that uses statistics and mathematical optimization to learn generalizable patterns from data. In machine learning, we build models with adjustable parameters and then learn the parameter values that best fit the data. In this tutorial, I will cover five commonly used supervised learning algorithms: k-nearest neighbors, logistic regression, support vector machines, decision trees, and random forests.
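All five algorithms are used here through scikit-learn's common estimator interface: construct a model with its hyperparameters, call fit to learn parameters from the data, and call predict to apply them. A minimal sketch of that pattern, using k-nearest neighbors on the iris data (this cell is illustrative and was not part of the original walkthrough):
In [ ]:
# the fit/predict pattern used by every classifier in this tutorial
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier

iris = load_iris()
model = KNeighborsClassifier(n_neighbors=3)  # hyperparameter chosen by hand
model.fit(iris.data, iris.target)            # learn from the labeled data
print(model.predict(iris.data[:5]))          # apply the fitted model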
In [10]:
YouTubeVideo("IFACrIx5SZ0", start = 85, end = 95)
Out[10]:
In [5]:
from sklearn.datasets import load_iris
iris = load_iris()
In [6]:
x_ind = 0
y_ind = 1
In [7]:
X = iris.data[:,(x_ind, y_ind)]
labels = iris.target
In [8]:
print(X.shape)
print(labels.shape)
In [9]:
# this formatter will label the colorbar with the correct target names
formatter = plt.FuncFormatter(lambda i, *args: iris.target_names[int(i)])
plt.scatter(X[:, x_ind], X[:, y_ind],
            c=labels, cmap=plt.cm.get_cmap('RdYlBu', 3))
plt.colorbar(ticks=[0, 1, 2], format=formatter)
plt.clim(-0.5, 2.5)
plt.xlabel(iris.feature_names[x_ind])
plt.ylabel(iris.feature_names[y_ind]);
In [10]:
from sklearn import neighbors
In [11]:
sample_size = 25
sample_size_zeroInd = sample_size - 1  # zero-based index of the last sample point, used as the query
In [12]:
rand_sample = np.random.randint(0, 150, sample_size)  # 25 random indices (may repeat)
In [13]:
x_sample = X[rand_sample]
label_sample = labels[rand_sample]
In [14]:
# plot the first 24 sampled points with their labels, then overlay the 25th
# point in green as an unlabeled query for the nearest-neighbor vote
plt.scatter(x_sample[:sample_size_zeroInd, 0], x_sample[:sample_size_zeroInd, 1],
            c=label_sample[:sample_size_zeroInd], cmap=plt.cm.get_cmap('RdYlBu', 3), s=40)
plt.colorbar(ticks=[0, 1, 2], format=formatter)
plt.clim(-0.5, 2.5)
plt.xlabel(iris.feature_names[x_ind])
plt.ylabel(iris.feature_names[y_ind])
plt.scatter(x_sample[sample_size_zeroInd, x_ind], x_sample[sample_size_zeroInd, y_ind],
            s=100, c='g', alpha=0.5);
Out[14]:
In [15]:
n_neighbors = 13
h = 0.02  # step size in the decision-boundary mesh
In [16]:
clf = neighbors.KNeighborsClassifier(n_neighbors)
clf.fit(X, labels)
Out[16]:
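Once the classifier is fit, it can be queried directly. A quick sketch (the query point below is a made-up sepal-length/sepal-width measurement, not taken from the tutorial):
In [ ]:
new_point = [[5.0, 3.5]]             # hypothetical (sepal length, sepal width)
print(clf.predict(new_point))        # predicted class index
print(clf.predict_proba(new_point))  # fraction of the 13 neighbors in each class
print(clf.score(X, labels))          # accuracy on the training data (optimistic)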
In [17]:
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.get_cmap('RdYlBu', 3), alpha = 0.2)
# Plot also the training points
plt.scatter(X[:, 0], X[:, 1], c=labels, cmap=plt.cm.get_cmap('RdYlBu', 3), s = 40)
plt.colorbar(ticks=[0, 1, 2], format = formatter)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xlabel(iris.feature_names[x_ind])
plt.ylabel(iris.feature_names[y_ind]);
In [18]:
from sklearn.linear_model import LogisticRegression
In [19]:
clf = LogisticRegression(C=1e5)
clf.fit(X, labels)
Out[19]:
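Because logistic regression is a linear model, the fitted classifier exposes one weight vector and intercept per class, and its predictions come with probabilities. A short, illustrative way to inspect them (not part of the original notebook):
In [ ]:
print(clf.coef_)                        # one weight vector per class
print(clf.intercept_)                   # one intercept per class
print(clf.predict_proba([[5.0, 3.5]]))  # class probabilities for a made-up point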
In [20]:
x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1, figsize=(4, 3))
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.get_cmap('RdYlBu', 3), alpha = 0.2)
# Plot also the training points
plt.scatter(X[:, 0], X[:, 1], c=labels, edgecolors='k', cmap=plt.cm.get_cmap('RdYlBu', 3), s = 40)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks(())
plt.yticks(())
Out[20]:
In [22]:
Image(url = 'http://i.imgur.com/zDBbD.png')
Out[22]:
In [23]:
Image(url ='http://i.imgur.com/aLZlG.png')
Out[23]:
In [24]:
Image(url = 'http://i.imgur.com/kxWgh.png')
Out[24]:
In [25]:
Image(url ='http://i.imgur.com/ePy4V.png')
Out[25]:
In [26]:
Image(url ='http://i.imgur.com/BWYYZ.png')
Out[26]:
In [27]:
Image(url = 'http://i.imgur.com/R9967.png')
Out[27]:
In [28]:
Image(url = 'http://i.imgur.com/WuxyO.png')
Out[28]:
In [29]:
Image(url = 'http://i.imgur.com/gWdPX.png')
Out[29]:
In [30]:
from sklearn.datasets import make_blobs
In [31]:
X, y = make_blobs(n_samples=50, centers=2,
                  random_state=0, cluster_std=0.60)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='spring');
In [32]:
xfit = np.linspace(-1, 3.5)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='spring')
for m, b in [(1, 0.65), (0.5, 1.6), (-0.2, 2.9)]:
    plt.plot(xfit, m * xfit + b, '-k')
plt.xlim(-1, 3.5);
In [33]:
xfit = np.linspace(-1, 3.5)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='spring')
for m, b, d in [(1, 0.65, 0.33), (0.5, 1.6, 0.55), (-0.2, 2.9, 0.2)]:
    yfit = m * xfit + b
    plt.plot(xfit, yfit, '-k')
    plt.fill_between(xfit, yfit - d, yfit + d, edgecolor='none', color='#AAAAAA', alpha=0.4)
plt.xlim(-1, 3.5);
In [34]:
from sklearn.svm import SVC # "Support Vector Classifier"
clf = SVC(kernel='linear')
clf.fit(X, y)
Out[34]:
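For a linear kernel, the fitted model is just a line w[0]*x + w[1]*y + b = 0, determined entirely by a handful of training points (the support vectors). A sketch of reading those attributes off the fitted classifier (illustrative, not from the original):
In [ ]:
w = clf.coef_[0]
b = clf.intercept_[0]
print("decision boundary: %.3f*x + %.3f*y + %.3f = 0" % (w[0], w[1], b))
print("support vectors per class:", clf.n_support_)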
In [35]:
def plot_svc_decision_function(clf, ax=None):
    """Plot the decision function for a 2D SVC"""
    if ax is None:
        ax = plt.gca()
    x = np.linspace(plt.xlim()[0], plt.xlim()[1], 30)
    y = np.linspace(plt.ylim()[0], plt.ylim()[1], 30)
    Y, X = np.meshgrid(y, x)
    P = np.zeros_like(X)
    for i, xi in enumerate(x):
        for j, yj in enumerate(y):
            # decision_function expects a 2D array of samples
            P[i, j] = clf.decision_function([[xi, yj]])
    # plot the decision boundary (level 0) and the margins (levels -1, 1)
    ax.contour(X, Y, P, colors='k',
               levels=[-1, 0, 1], alpha=0.5,
               linestyles=['--', '-', '--'])
In [36]:
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='spring')
plot_svc_decision_function(clf);
In [37]:
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='spring')
plot_svc_decision_function(clf)
plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
            s=200, facecolors='none');
In [38]:
from ipywidgets import interact

def plot_svm(N=10):
    X, y = make_blobs(n_samples=200, centers=2,
                      random_state=0, cluster_std=0.60)
    X = X[:N]
    y = y[:N]
    clf = SVC(kernel='linear')
    clf.fit(X, y)
    plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='spring')
    plt.xlim(-1, 4)
    plt.ylim(-1, 6)
    plot_svc_decision_function(clf, plt.gca())
    plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
                s=200, facecolors='none')

interact(plot_svm, N=[10, 200]);
In [39]:
from sklearn.datasets import make_circles
X, y = make_circles(100, factor=.1, noise=.1)
clf = SVC(kernel='linear').fit(X, y)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='spring')
plot_svc_decision_function(clf);
In [40]:
# lift the data into 3D with a radial basis function centered on the circles
r = np.exp(-(X[:, 0] ** 2 + X[:, 1] ** 2))
In [41]:
from mpl_toolkits import mplot3d

def plot_3D(elev=30, azim=30):
    ax = plt.subplot(projection='3d')
    ax.scatter3D(X[:, 0], X[:, 1], r, c=y, s=50, cmap='spring')
    ax.view_init(elev=elev, azim=azim)
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_zlabel('r')

interact(plot_3D, elev=(-90, 90), azim=(-180, 180));
In [42]:
clf = SVC(kernel='rbf')
clf.fit(X, y)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='spring')
plot_svc_decision_function(clf)
plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1],
            s=200, facecolors='none');
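To make the effect of the kernel trick concrete, we can compare training accuracy of the linear and RBF kernels on the same circles data (a quick sketch; X and y are still the make_circles data from above):
In [ ]:
for kernel in ['linear', 'rbf']:
    score = SVC(kernel=kernel).fit(X, y).score(X, y)
    print("%-6s training accuracy: %.3f" % (kernel, score))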
In [43]:
from sklearn.tree import DecisionTreeClassifier
In [46]:
X, y = make_blobs(n_samples=300, centers=4,
                  random_state=0, cluster_std=1.0)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='rainbow');
In [47]:
def visualize_tree(estimator, X, y, boundaries=True,
                   xlim=None, ylim=None):
    estimator.fit(X, y)
    if xlim is None:
        xlim = (X[:, 0].min() - 0.1, X[:, 0].max() + 0.1)
    if ylim is None:
        ylim = (X[:, 1].min() - 0.1, X[:, 1].max() + 0.1)
    x_min, x_max = xlim
    y_min, y_max = ylim
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                         np.linspace(y_min, y_max, 100))
    Z = estimator.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.figure()
    plt.pcolormesh(xx, yy, Z, alpha=0.2, cmap='rainbow')
    plt.clim(y.min(), y.max())

    # Plot also the training points
    plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='rainbow')
    plt.axis('off')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.clim(y.min(), y.max())

    # Plot the decision boundaries by walking the fitted tree recursively
    def plot_boundaries(i, xlim, ylim):
        if i < 0:
            return
        tree = estimator.tree_
        if tree.feature[i] == 0:
            plt.plot([tree.threshold[i], tree.threshold[i]], ylim, '-k')
            plot_boundaries(tree.children_left[i],
                            [xlim[0], tree.threshold[i]], ylim)
            plot_boundaries(tree.children_right[i],
                            [tree.threshold[i], xlim[1]], ylim)
        elif tree.feature[i] == 1:
            plt.plot(xlim, [tree.threshold[i], tree.threshold[i]], '-k')
            plot_boundaries(tree.children_left[i], xlim,
                            [ylim[0], tree.threshold[i]])
            plot_boundaries(tree.children_right[i], xlim,
                            [tree.threshold[i], ylim[1]])

    if boundaries:
        plot_boundaries(0, plt.xlim(), plt.ylim())
In [48]:
clf = DecisionTreeClassifier()
In [49]:
# fit the same tree model to two different halves of the data;
# visualize_tree opens its own figure for each call
visualize_tree(clf, X[:200], y[:200], boundaries=False)
visualize_tree(clf, X[-200:], y[-200:], boundaries=False)
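The two plots above show how strongly an unconstrained tree depends on which half of the data it sees. Capping the depth trades that variance for bias; a quick sketch of the effect on training accuracy (not in the original notebook):
In [ ]:
for depth in [1, 3, 10, None]:
    tree = DecisionTreeClassifier(max_depth=depth, random_state=0)
    tree.fit(X, y)
    print("max_depth=%-4s training accuracy: %.3f" % (depth, tree.score(X, y)))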
In [50]:
from sklearn.ensemble import RandomForestClassifier
In [51]:
def fit_randomized_tree(random_state=0):
    X, y = make_blobs(n_samples=300, centers=4,
                      random_state=0, cluster_std=2.0)
    clf = DecisionTreeClassifier(max_depth=15)
    rng = np.random.RandomState(random_state)
    i = np.arange(len(y))
    rng.shuffle(i)
    visualize_tree(clf, X[i[:250]], y[i[:250]], boundaries=False,
                   xlim=(X[:, 0].min(), X[:, 0].max()),
                   ylim=(X[:, 1].min(), X[:, 1].max()))

from ipywidgets import interact
interact(fit_randomized_tree, random_state=(0, 100));
In [52]:
clf = RandomForestClassifier(n_estimators=100, random_state=0)
visualize_tree(clf, X, y, boundaries=False);
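Averaging many randomized trees smooths out the variance we saw with a single tree. As a rough check (using cross_val_score, which is not part of the original notebook), compare cross-validated accuracy of one tree against the forest:
In [ ]:
from sklearn.model_selection import cross_val_score
single_tree = DecisionTreeClassifier(random_state=0)
forest = RandomForestClassifier(n_estimators=100, random_state=0)
print("tree   cv accuracy: %.3f" % cross_val_score(single_tree, X, y, cv=5).mean())
print("forest cv accuracy: %.3f" % cross_val_score(forest, X, y, cv=5).mean())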
In [ ]: