In [1]:
%matplotlib inline
In [2]:
# Regression can also be used on pockets of the feature space.
# Think of the dataset as the output of several distinct
# data-generating processes; if that holds, training on only the
# similar points can beat a single global fit. This is regression
# in the context of clustering. Because regression is a supervised
# technique, we use k-Nearest Neighbors rather than k-Means to
# decide which points are "nearby".
In [3]:
# We'll use the iris dataset and try to predict the petal length of
# each flower. Clustering by iris species would give better results,
# but we won't use the species labels directly; instead we work
# under the assumption that the X's will be closer together for
# flowers of the same species.
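In [ ]:
# A quick sanity check on that assumption (a sketch, not part of the
# original recipe): compare the average distance between flowers of
# the same species to the average distance between all flowers,
# using the two sepal features. Same-species pairs should be closer.
import numpy as np
from sklearn.datasets import load_iris
from sklearn.metrics import pairwise_distances

iris = load_iris()
dists = pairwise_distances(iris.data[:, :2])
off_diag = ~np.eye(len(iris.target), dtype=bool)
same = (iris.target[:, None] == iris.target[None, :]) & off_diag
print("mean same-species distance: {:.2f}".format(dists[same].mean()))
print("mean overall distance:      {:.2f}".format(dists[off_diag].mean()))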
In [4]:
from sklearn.datasets import load_iris
In [10]:
iris = load_iris()
X = iris.data[:, :2]  # sepal length and sepal width
y = iris.data[:, 2]   # petal length, the regression target
In [11]:
iris.feature_names
Out[11]:
['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']
In [12]:
# Try to predict the petal length based on the sepal length and
# width. First fit a linear regression as a baseline, to see how
# well the k-NN regression does in comparison.
In [14]:
from sklearn.linear_model import LinearRegression
import numpy as np
lr = LinearRegression()
lr.fit(X, y)
Out[14]:
In [16]:
print "The MSE is: {:.2}".format(np.power(y - lr.predict(X),
2).mean())
In [17]:
# For k-NN regression:
from sklearn.neighbors import KNeighborsRegressor
In [19]:
# k-NN regression predicts with the mean of the 10 nearest neighbors
knnr = KNeighborsRegressor(n_neighbors=10)
knnr.fit(X, y)
print("The MSE is: {:.2}".format(np.power(y - knnr.predict(X), 2).mean()))
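In [ ]:
# Under the hood (a sketch, not part of the original recipe): with
# the default uniform weights, the k-NN prediction is just the mean
# of the ten nearest neighbors' petal lengths, so we can reproduce
# predict() by hand through kneighbors().
distances, indices = knnr.kneighbors(X, n_neighbors=10)
manual = y[indices].mean(axis=1)
print("Matches predict(): {}".format(np.allclose(manual, knnr.predict(X))))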
In [20]:
import matplotlib.pyplot as plt
f, ax = plt.subplots(nrows=2, figsize=(7, 10))
ax[0].set_title("Predictions")
ax[0].scatter(X[:, 0], X[:, 1], s=lr.predict(X)*80,
              label='LR Predictions', color='c', edgecolors='black')
ax[1].scatter(X[:, 0], X[:, 1], s=knnr.predict(X)*80,
              label='k-NN Predictions', color='m', edgecolors='black')
ax[0].legend()
ax[1].legend()
Out[20]:
[Figure: two scatter panels of sepal length vs. sepal width, with
point sizes scaled by the LR and k-NN petal-length predictions]
In [24]:
# Isolate the setosa flowers with a boolean mask, then compare the
# actual petal lengths to each model's predictions
setosa_idx = np.where(iris.target_names == 'setosa')
setosa_mask = iris.target == setosa_idx[0]
y[setosa_mask][:5]
Out[24]:
array([ 1.4,  1.4,  1.3,  1.5,  1.4])
In [22]:
# the k-NN predictions for the same five flowers
knnr.predict(X)[setosa_mask][:5]
Out[22]:
In [23]:
# versus the global linear regression's predictions
lr.predict(X)[setosa_mask][:5]
Out[23]:
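In [ ]:
# A sketch (not part of the original recipe) to quantify the gap:
# the MSE restricted to just the setosa flowers. k-NN only averages
# nearby points, so it should track the uniformly short setosa
# petals better than the single global linear fit.
print("LR setosa MSE:   {:.2}".format(
    np.power(y[setosa_mask] - lr.predict(X)[setosa_mask], 2).mean()))
print("k-NN setosa MSE: {:.2}".format(
    np.power(y[setosa_mask] - knnr.predict(X)[setosa_mask], 2).mean()))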