In [ ]:
%pylab inline
import pandas as pd
from matplotlib import pyplot as plt
import seaborn; seaborn.set()
from ipywidgets import interact
pylab.rcParams['figure.figsize'] = (10.0, 8.0)
autumn()
In [ ]:
np.random.seed(13)
n = 100
df = pd.DataFrame(
np.vstack([
np.random.normal(loc=0, scale=1, size=(n, 2)),
np.random.normal(loc=3, scale=2, size=(n, 2))
]), columns=['x1', 'x2'])
df['target'] = np.hstack([np.ones(n), np.zeros(n)]).T
plt.scatter(df.x1, df.x2, c=df.target, s=100, edgecolor='black', linewidth='1');
In [ ]:
from sklearn.neighbors import KNeighborsClassifier as KNN
features = df.drop('target', axis=1)
answer = df['target']
def get_grid(data, border=1., step=.01):
x_min, x_max = data[:, 0].min() - border, data[:, 0].max() + border
y_min, y_max = data[:, 1].min() - border, data[:, 1].max() + border
return np.meshgrid(np.arange(x_min, x_max, step),
np.arange(y_min, y_max, step))
xx, yy = get_grid(features.values, step=0.025)
def show_knn(k=4, proba=True, weights='uniform'):
clf = KNN(n_neighbors=k, weights=weights)
clf.fit(features, answer)
if proba:
predicted = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1].reshape(xx.shape)
else:
predicted = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
plt.pcolormesh(xx, yy, predicted)
plt.scatter(df.x1, df.x2, c=answer, s=100, edgecolor='black', linewidth='1')
plt.xlabel('x1')
plt.ylabel('x2')
plt.axis([xx.min(), xx.max(), yy.min(), yy.max()]);
In [ ]:
interact(show_knn, k=(1, len(df)), weights=['uniform', 'distance'], __manual=True);
In [ ]:
data = pd.read_csv('wine/winequality-red.csv', sep=';')
print("Shape:", data.shape)
data.head(5)
In [ ]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error as score
X = data.drop('quality', axis = 1)
y = data['quality']
clf = KNeighborsRegressor(n_neighbors=1)
clf = clf.fit(X, y)
In [ ]:
score(clf.predict(X), y)
In [ ]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=sum(list(map(ord, 'shad'))))
clf = clf.fit(X_train, y_train)
score(clf.predict(X_test), y_test)
In [ ]:
def get_scores(X_train, X_test, y_train, y_test, max_k=100, clf_class=KNeighborsRegressor):
for k in range(1, max_k):
yield score(y_test, clf_class(n_neighbors=k).fit(X_train, y_train).predict(X_test))
scores = list(get_scores(X_train, X_test, y_train, y_test))
In [ ]:
best_k = min(range(len(scores)), key=scores.__getitem__)
start_point = best_k, scores[best_k]
plt.annotate("k = {}\nScore = {:.4}".format(best_k, scores[best_k]),
xy=start_point,
xytext=(50, -10), textcoords='offset points',
size=20,
bbox=dict(boxstyle="round", fc="1"),
va="center", ha="left",
arrowprops=dict(facecolor='red', width=4,))
plt.plot(scores, linewidth=3.0);
In [ ]:
for idx in range(10):
parts = train_test_split(X, y, test_size=0.3, random_state=idx)
current_scores = list(get_scores(*parts))
plt.plot(current_scores, linewidth=3.0);
In [ ]:
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import StratifiedKFold
params = {'n_neighbors': list(range(1, 100))}
grid_searcher = GridSearchCV(KNeighborsRegressor(),
params,
cv=StratifiedKFold(y, n_folds=5, random_state=sum(list(map(ord, 'knn')))),
scoring="mean_squared_error",
n_jobs=-1,)
grid_searcher.fit(X, y);
In [ ]:
means, stds = list(map(np.array, zip(*[(
np.mean(i.cv_validation_scores),
np.std(i.cv_validation_scores))
for i in grid_searcher.grid_scores_])))
means *= -1
plot(range(len(means)), means)
best_k = grid_searcher.best_params_['n_neighbors']
start_point = best_k, -grid_searcher.best_score_
plt.annotate("k = {}\nScore = {:.4}".format(best_k, -grid_searcher.best_score_),
xy=start_point,
xytext=(10, 70), textcoords='offset points',
size=20,
bbox=dict(boxstyle="round", fc="1"),
va="center", ha="left",
arrowprops=dict(facecolor='red', width=4,))
plt.fill_between(range(len(means)), means + 2 * stds, means - 2 * stds, alpha = 0.2, facecolor='blue');
In [ ]:
X.head(5)
In [ ]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
clf = make_pipeline(StandardScaler(), KNeighborsRegressor())
params = {'kneighborsregressor__n_neighbors': list(range(1, 100))}
grid_searcher = GridSearchCV(clf,
params,
cv=StratifiedKFold(y, n_folds=5, random_state=sum(list(map(ord, 'knn')))),
scoring="mean_squared_error",
n_jobs=-1,)
grid_searcher.fit(X, y);
scaled_means = -np.array([np.mean(i.cv_validation_scores) for i in grid_searcher.grid_scores_])
In [ ]:
plot(range(len(means)), means)
plot(range(len(scaled_means)), scaled_means)
best_point = grid_searcher.best_params_['kneighborsregressor__n_neighbors'], -grid_searcher.best_score_
plt.annotate("k = {}\nScore = {:.4}".format(*best_point),
xy=best_point,
xytext=(20, 60), textcoords='offset points',
size=20,
bbox=dict(boxstyle="round", fc="1"),
va="center", ha="left",
arrowprops=dict(facecolor='red', width=4,))
plt.legend(['Initial data', 'Scaled data'], loc='upper right');