In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns; sns.set();
scatter_args = dict(s=100, edgecolor='black', linewidth=1.5, cmap="autumn")

Random forest

Out-of-bag score

Feature importances
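
A minimal sketch (not from the original notebook), assuming scikit-learn: with oob_score=True every tree is scored on the samples left out of its bootstrap, and feature_importances_ reports the impurity-based importance of each column.

In [ ]:
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

# Toy sample, purely to exercise the API
X, y = make_classification(n_samples=500, n_features=4, random_state=17)
rf = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=17)
rf.fit(X, y)
print("OOB accuracy:", rf.oob_score_)                   # estimated on out-of-bag samples
print("Feature importances:", rf.feature_importances_)  # non-negative, sum to 1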

Linear classifiers

$$a(x) = \mathrm{sign}(\left<x, w\right> - w_0)$$
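
In numpy the decision rule is a one-liner; a tiny sketch with made-up weights (w and w0 are assumptions, not fitted values):

In [ ]:
# sign(<x, w> - w0) for two hypothetical objects
w = np.array([1.0, -2.0])
w0 = 0.5
X_demo = np.array([[3.0, 1.0],
                   [0.0, 2.0]])
print(np.sign(X_demo @ w - w0))  # -> [ 1. -1.]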

In [ ]:
def get_grid(data, step=0.1):
    x_min, x_max = data.x.min() - 1, data.x.max() + 1
    y_min, y_max = data.y.min() - 1, data.y.max() + 1
    return np.meshgrid(np.arange(x_min, x_max, step),
                       np.arange(y_min, y_max, step))

from sklearn.model_selection import cross_val_score

def get_score(X, y, cl):
    # cross_val_score maximizes, so the MSE scorer is negated; flip the sign back
    return -cross_val_score(cl, X, y, cv=5, scoring='neg_mean_squared_error').mean()

def plot_linear_border(cl, X, plot, borders=1):
    x_limits = (np.min(X.x) - borders, np.max(X.x) + borders)
    y_limits = (np.min(X.y) - borders, np.max(X.y) + borders)
    line_x = np.linspace(*x_limits, num=2)
    line_y = (-line_x * cl.coef_[0, 0] - cl.intercept_) / cl.coef_[0, 1]
    plot.plot(line_x, line_y, c='r', lw=2)
    plot.fill_between(line_x, line_y, -100, color='r')
    plot.fill_between(line_x, line_y, 100, color='yellow')
    plot.autoscale(tight=True)
    plot.set_xlim(*x_limits)
    plot.set_ylim(*y_limits)

def show_classifier(X, y, cl,
                    feature_modifier=lambda x: x,
                    proba=True,
                    print_score=False,
                    borders=1):
    fig, ax = plt.subplots(1, 1)
    xs, ys = get_grid(X)
    xys = np.c_[np.ravel(xs), np.ravel(ys)]
    cl.fit(feature_modifier(X), y)
    if print_score:
        print("MSE = {}".format(get_score(feature_modifier(X), y, cl)))
    grid_points = feature_modifier(pd.DataFrame(xys, columns=('x', 'y')))
    if proba:
        predicted = cl.predict_proba(grid_points)[:, 1].reshape(xs.shape)
    else:
        predicted = cl.predict(grid_points).reshape(xs.shape)
    plot_linear_border(cl, X, ax, borders=borders)
    # the 0.5 level of the predicted surface is the actual decision boundary
    # (it coincides with the red line unless feature_modifier is non-trivial)
    ax.contour(xs, ys, predicted, levels=[0.5], colors='black')
    ax.scatter(X.x, X.y, c=y, **scatter_args)
    return cl

In [ ]:
n = 200
random = np.random.RandomState(17)
df1 = pd.DataFrame(data=random.multivariate_normal((0,0), [[1, 0.3], [0.3, 0.7]], n), columns=['x', 'y'])
df1['target'] = 0
df2 = pd.DataFrame(data=random.multivariate_normal((1,2), [[1, -0.5], [-0.5, 1.6]], n), columns=['x', 'y'])
df2['target'] = 1
data = pd.concat([df1, df2], ignore_index=True)
features = data[['x', 'y']]
data.plot(kind='scatter', x='x', y='y', c='target', colormap='autumn', alpha=0.75, colorbar=False);

In [ ]:
from sklearn.svm import LinearSVC
show_classifier(features, data.target,
                LinearSVC(),
                proba=False);
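
The same helper works for probabilistic models. For instance (a sketch, not in the original), logistic regression with predict_proba and the cross-validated score from get_score printed:

In [ ]:
from sklearn.linear_model import LogisticRegression
show_classifier(features, data.target,
                LogisticRegression(),
                proba=True,
                print_score=True);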

Gradient descent

$$M_i(w, w_0) = y_i(\left<x_i, w\right> - w_0)$$

$$\sum_{i=1}^{l} \mathscr{L}(M_i(w, w_0)) \to \min_{w, w_0}$$
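
A numpy sketch of these formulas (weights are made up, labels use the ±1 convention): compute the margins $M_i$ and a surrogate loss, here the logistic loss $\mathscr{L}(M) = \log(1 + e^{-M})$:

In [ ]:
# Margins and logistic loss for two hypothetical objects
w, w0 = np.array([1.0, -1.0]), 0.0
X_demo = np.array([[2.0, 0.5],
                   [0.5, 2.0]])
y_demo = np.array([1, -1])
margins = y_demo * (X_demo @ w - w0)     # M_i > 0 means a correct answer
loss = np.log1p(np.exp(-margins)).sum()  # sum of L(M_i), to be minimized
print(margins, loss)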

In [ ]:
from sklearn.linear_model import SGDClassifier

In [ ]:
random = np.random.RandomState(11)
n_iters = 20
plt.figure(figsize=(10, 8 * n_iters))
clf = SGDClassifier(alpha=1, l1_ratio=0)
train_objects = data.loc[random.choice(data.index, n_iters)]
for iteration in range(n_iters):
    new_object = train_objects.iloc[iteration]
    # feed the classifier one object at a time
    clf = clf.partial_fit([[new_object.x, new_object.y]], [new_object.target], classes=[0, 1])
    ax = plt.subplot(n_iters, 1, iteration + 1)
    plt.title("objects count = {}".format(iteration + 1))
    plot_linear_border(clf, features, ax)
    processed_objects = train_objects.head(iteration + 1)
    ax.scatter(processed_objects.x, processed_objects.y, c=processed_objects.target, alpha=0.5, **scatter_args)
    ax.scatter(new_object.x, new_object.y, marker='x', s=200, linewidth=3)
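
For comparison (a sketch, not in the original), the same model fitted on the whole sample at once through the helper above; with the default hinge loss there is no predict_proba, hence proba=False:

In [ ]:
show_classifier(features, data.target,
                SGDClassifier(alpha=1, l1_ratio=0, random_state=11),
                proba=False);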

Links