In [ ]:
%matplotlib inline
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn; seaborn.set()
from ipywidgets import interact_manual
plt.rcParams['figure.figsize'] = (10.0, 8.0)
plt.rcParams['image.cmap'] = 'autumn'

KNN

The classifier assigns $x$ the class with the largest weighted vote over the training sample, where the objects are ordered by their distance to $x$ and $w(i, x)$ is the weight of the $i$-th nearest one (plain KNN uses $w(i, x) = [i \le k]$):

$$ a(x, X^l) = \arg \max_{y \in Y} \sum_{i = 1}^{l}[y_i = y] ~ w(i, x) $$
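
A minimal NumPy sketch of this rule with uniform weights and Euclidean distance (the names here are illustrative, not from a library):

In [ ]:
def knn_predict(x, X, y, k=4):
    # order the training objects by distance to x
    order = np.argsort(np.linalg.norm(X - x, axis=1))
    # uniform weights: each of the k nearest objects casts one vote
    nearest = y[order[:k]]
    classes, votes = np.unique(nearest, return_counts=True)
    return classes[np.argmax(votes)]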

In [ ]:
np.random.seed(13)
n = 100

# two 2-D Gaussian blobs with different means and spreads
df = pd.DataFrame(
    np.vstack([
        np.random.normal(loc=0, scale=1, size=(n, 2)),
        np.random.normal(loc=3, scale=2, size=(n, 2))
    ]), columns=['x1', 'x2'])

df['target'] = np.hstack([np.ones(n), np.zeros(n)])
plt.scatter(df.x1, df.x2, c=df.target, s=100, edgecolor='black', linewidth=1);

In [ ]:
from sklearn.neighbors import KNeighborsClassifier as KNN

features = df.drop('target', axis=1)
answer = df['target']

def get_grid(data, border=1., step=.01):
    # mesh covering the data's bounding box, padded by `border` on each side
    x_min, x_max = data[:, 0].min() - border, data[:, 0].max() + border
    y_min, y_max = data[:, 1].min() - border, data[:, 1].max() + border
    return np.meshgrid(np.arange(x_min, x_max, step),
                       np.arange(y_min, y_max, step))

xx, yy = get_grid(features.values, step=0.025)

def show_knn(k=4, proba=True, weights='uniform'):
    clf = KNN(n_neighbors=k, weights=weights)
    clf.fit(features, answer)
    # colour the plane by the predicted probability of class 1 (or by the hard label)
    if proba:
        predicted = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1].reshape(xx.shape)
    else:
        predicted = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    plt.pcolormesh(xx, yy, predicted)
    plt.scatter(df.x1, df.x2, c=answer, s=100, edgecolor='black', linewidth=1)
    plt.xlabel('x1')
    plt.ylabel('x2')
    plt.axis([xx.min(), xx.max(), yy.min(), yy.max()]);

In [ ]:
interact_manual(show_knn, k=(1, len(df)), weights=['uniform', 'distance']);

Wine

The UCI wine-quality data (red wine): eleven physico-chemical measurements per sample plus an integer quality score, which we will predict with KNN regression.

In [ ]:
data = pd.read_csv('wine/winequality-red.csv', sep=';')
print("Shape:", data.shape)
data.head(5)

In [ ]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error as score
X = data.drop('quality', axis=1)
y = data['quality']
clf = KNeighborsRegressor(n_neighbors=1)
clf = clf.fit(X, y)

With k = 1 the model simply memorizes the training sample, so the error on the training data itself is (essentially) zero, which says nothing about generalization:

In [ ]:
score(y, clf.predict(X))

In [ ]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=sum(map(ord, 'shad')))

clf = clf.fit(X_train, y_train)
score(y_test, clf.predict(X_test))

In [ ]:
def get_scores(X_train, X_test, y_train, y_test, max_k=100, clf_class=KNeighborsRegressor):
    for k in range(1, max_k):
        yield score(y_test, clf_class(n_neighbors=k).fit(X_train, y_train).predict(X_test))

scores = list(get_scores(X_train, X_test, y_train, y_test))

In [ ]:
best_k = int(np.argmin(scores)) + 1  # scores[0] corresponds to k = 1
start_point = best_k, scores[best_k - 1]
plt.annotate("k = {}\nScore = {:.4}".format(best_k, scores[best_k - 1]),
            xy=start_point,
            xytext=(50, -10), textcoords='offset points',
            size=20,
            bbox=dict(boxstyle="round", fc="1"),
            va="center", ha="left",
            arrowprops=dict(facecolor='red', width=4))
plt.plot(range(1, len(scores) + 1), scores, linewidth=3.0);

In [ ]:
# the hold-out curve, and hence the optimal k, varies noticeably between random splits
for idx in range(10):
    parts = train_test_split(X, y, test_size=0.3, random_state=idx)
    current_scores = list(get_scores(*parts))
    plt.plot(range(1, len(current_scores) + 1), current_scores, linewidth=3.0);

In [ ]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
params = {'n_neighbors': list(range(1, 100))}
grid_searcher = GridSearchCV(KNeighborsRegressor(),
                             params,
                             cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=sum(map(ord, 'knn'))),
                             scoring='neg_mean_squared_error',
                             n_jobs=-1)
grid_searcher.fit(X, y);

In [ ]:
ks = params['n_neighbors']
means = -grid_searcher.cv_results_['mean_test_score']  # back to positive MSE
stds = grid_searcher.cv_results_['std_test_score']

plt.plot(ks, means)
best_k = grid_searcher.best_params_['n_neighbors']
start_point = best_k, -grid_searcher.best_score_
plt.annotate("k = {}\nScore = {:.4}".format(best_k, -grid_searcher.best_score_),
            xy=start_point,
            xytext=(10, 70), textcoords='offset points',
            size=20,
            bbox=dict(boxstyle="round", fc="1"),
            va="center", ha="left",
            arrowprops=dict(facecolor='red', width=4))
plt.fill_between(ks, means + 2 * stds, means - 2 * stds, alpha=0.2, facecolor='blue');

Why does scaling help? The features live on very different scales, so Euclidean distances are dominated by the largest-magnitude columns:

In [ ]:
X.head(5)

In [ ]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
clf = make_pipeline(StandardScaler(), KNeighborsRegressor())

params = {'kneighborsregressor__n_neighbors': list(range(1, 100))}
grid_searcher = GridSearchCV(clf,
                             params,
                             cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=sum(map(ord, 'knn'))),
                             scoring='neg_mean_squared_error',
                             n_jobs=-1)
grid_searcher.fit(X, y);

scaled_means = -grid_searcher.cv_results_['mean_test_score']  # back to positive MSE

In [ ]:
plt.plot(ks, means)
plt.plot(ks, scaled_means)
best_point = grid_searcher.best_params_['kneighborsregressor__n_neighbors'], -grid_searcher.best_score_
plt.annotate("k = {}\nScore = {:.4}".format(*best_point),
            xy=best_point,
            xytext=(20, 60), textcoords='offset points',
            size=20,
            bbox=dict(boxstyle="round", fc="1"),
            va="center", ha="left",
            arrowprops=dict(facecolor='red', width=4,))
plt.legend(['Initial data', 'Scaled data'], loc='upper right');

Nearest neighbor search

  1. Linear search (brute force); a sketch follows this list
  2. Space partitioning / Branch and bound methodology (KD tree, Ball tree, Cone tree)
  3. Locality sensitive hashing (LSH, ALSH)
  4. Compression / clustering based search (PQ, OPQ, LOPQ, APQ, ...)
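
For reference, a minimal sketch of option 1 with scikit-learn's NearestNeighbors on the wine features from above: the brute-force backend computes the distance to every training point for each query.

In [ ]:
from sklearn.neighbors import NearestNeighbors

nn = NearestNeighbors(n_neighbors=5, algorithm='brute').fit(X)
distances, indices = nn.kneighbors(X[:3])  # exact neighbors, O(n) per query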

Space partitioning / Branch and bound methodology
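
A minimal sketch with scikit-learn's KDTree: the space is split recursively along coordinate axes, and a branch-and-bound descent prunes regions that cannot contain a closer neighbor, which works well for low-dimensional data.

In [ ]:
from sklearn.neighbors import KDTree

tree = KDTree(X, leaf_size=30)               # build the partitioning once
distances, indices = tree.query(X[:3], k=5)  # queries no longer scan all points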

Locality sensitive hashing
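
A toy illustration of the idea, not a production index: hash every vector by the sign pattern of a few random projections; points at a small angle tend to share a bucket, so a query is compared only against its own bucket.

In [ ]:
from collections import defaultdict

rng = np.random.RandomState(13)
planes = rng.normal(size=(8, X.shape[1]))  # 8 random hyperplanes give 8-bit hashes

def lsh_hash(v):
    return tuple((planes @ v > 0).astype(int))

buckets = defaultdict(list)
for i, row in enumerate(X.values):
    buckets[lsh_hash(row)].append(i)

candidates = buckets[lsh_hash(X.values[0])]  # candidate neighbors of the first sample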

Links