In [ ]:
%pylab inline
import pandas as pd
from matplotlib import pyplot as plt
import seaborn; seaborn.set()
from ipywidgets import interact
pylab.rcParams['figure.figsize'] = (10.0, 8.0)
scatter_args = dict(s=100, edgecolor='black', linewidth=1.5)
autumn()

Decision Trees


In [ ]:
from sklearn.model_selection import cross_val_score

def get_grid(data, step=0.01):
    x_min, x_max = data.x.min() - 1, data.x.max() + 1
    y_min, y_max = data.y.min() - 1, data.y.max() + 1
    return np.meshgrid(np.arange(x_min, x_max, step),
                         np.arange(y_min, y_max, step))

def get_score(X, y, cl):
    # 'neg_mean_squared_error' returns the negated MSE, so flip the sign back
    return -cross_val_score(cl, X, y, cv=5, scoring='neg_mean_squared_error').mean()

def show_classifier(X, y, cl,
                    feature_modifier=lambda x: x,
                    proba=True,
                    print_score=False,
                    grid=None):
    if grid is None:
        xs, ys = get_grid(X)
    else:
        xs, ys = grid
    xys = c_[ravel(xs), ravel(ys)]
    cl.fit(feature_modifier(X), y)
    if print_score:
        print("MSE = {}".format(get_score(feature_modifier(X), y, cl)))
    if proba:
        predicted = cl.predict_proba(feature_modifier(pd.DataFrame(xys, columns=('x', 'y'))))[:,1].reshape(xs.shape)
    else:
        predicted = cl.predict(feature_modifier(pd.DataFrame(xys, columns=('x', 'y')))).reshape(xs.shape)
    pcolormesh(xs, ys, predicted)
    scatter(X.x, X.y, c=y, alpha=0.5, **scatter_args)
    autoscale(tight=True)
    return cl

np.random.seed(13)
n = 100

df = pd.DataFrame(
    np.vstack([
        np.random.multivariate_normal((0,0), [[1, 0.3], [0.3, 0.7]], n),
        np.random.multivariate_normal((1,2), [[1, -0.5], [-0.5, 1.6]], n)
    ]), columns=['x', 'y'])
df['target'] = np.hstack([np.ones(n), np.zeros(n)])

X = df.drop('target', axis=1)
y = df['target']

plt.scatter(df.x, df.y, c=df.target, **scatter_args);

In [ ]:
from sklearn.tree import DecisionTreeClassifier
simple_tree = show_classifier(X, y, DecisionTreeClassifier(max_depth=3));

In [ ]:
%install_ext https://gist.githubusercontent.com/tempestadept/9322248/raw/8eb1fa343947d9628e71f10d47d1b3939a9df8a8/gvpng.py

In [ ]:
%load_ext gvpng

In [ ]:
# Convert the tree to Graphviz .dot format
try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO
from sklearn import tree
def vis_tree(cl, feature_names=['x', 'y']):
    s = StringIO()
    tree.export_graphviz(cl, out_file=s, feature_names=feature_names)
    return s.getvalue()

In [ ]:
%dotstr vis_tree(simple_tree)
show_classifier(X, y, DecisionTreeClassifier(max_depth=3));
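
As an alternative to the Graphviz-based %dotstr magic (the %install_ext mechanism was removed from later IPython releases), recent scikit-learn versions (0.21+) can render a fitted tree directly; a minimal sketch:

In [ ]:
from sklearn.tree import plot_tree, export_text

# Draw the fitted tree with matplotlib and print a plain-text summary;
# assumes simple_tree was fitted by show_classifier above
plot_tree(simple_tree, feature_names=['x', 'y'], filled=True)
print(export_text(simple_tree, feature_names=['x', 'y']))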

Misclassification rate: $$C(j,t) = \frac{1}{n_{jt}} \sum_{y_i: x_{ij} \gt t} [y_i \ne \hat{y}]$$
Gini: $$C(j,t) = \sum_{k=1}^K p_k (1 - p_k) = 1 - \sum_{k=1}^K p_k^2$$
Entropy: $$H(p) = -\sum_{k=1}^K p_k \log_2(p_k)$$
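
For instance, a node with class proportions $p = (0.25, 0.75)$ has entropy $-0.25\log_2 0.25 - 0.75\log_2 0.75 \approx 0.81$, Gini impurity $1 - (0.25^2 + 0.75^2) = 0.375$, and misclassification rate $\min(p) = 0.25$; a pure node scores $0$ under all three criteria.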


In [ ]:
# Impurity of a binary class distribution p = [p0, p1];
# a node with a zero proportion is pure, so its entropy is 0
entropy = lambda p: -np.sum(p * np.log2(p)) if 0 not in p else 0
gini = lambda p: 1. - (np.array(p)**2).sum()
misclassification = lambda p: min(p)

pvals = np.linspace(0, 1, num=51)        
plt.plot(pvals, [entropy([p,1-p])/2. for p in pvals], label='Entropy (scaled by 1/2)')
plt.plot(pvals, [gini([p,1-p]) for p in pvals], label='Gini')
plt.plot(pvals, [misclassification([p,1-p]) for p in pvals], label='Misclassification rate')
plt.legend()
plt.title('Impurity functions');

In [ ]:
class_size = 80
np.random.seed(111)

points_1d = pd.DataFrame(
    {'x': np.hstack([
            np.random.normal(loc=0, scale=1, size=(class_size // 2)),
            np.random.normal(loc=4, scale=1, size=(class_size)),
            np.random.normal(loc=7, scale=2, size=(class_size // 2))
            ]),
     'label':  np.hstack([
            np.ones(class_size // 2),
            np.zeros(class_size),
            np.ones(class_size // 2)
            ])
})
f, axarr = plt.subplots(4, sharex=True)

axarr[0].set_title("Points")
axarr[0].scatter(points_1d.x, np.random.uniform(high=0.2, size=len(points_1d)), c=points_1d.label, s=40);
steps = np.linspace(np.min(points_1d.x), np.max(points_1d.x), 100)
def get_ratio_by_mask(arr, mask):
    masked = arr[mask]
    return np.sum(masked) / len(masked) if len(masked) else np.nan

axarr[1].set_title("Ratio of class 1")
left_ratios = np.array([get_ratio_by_mask(points_1d.label, points_1d.x < k) for k in steps])
right_ratios = np.array([get_ratio_by_mask(points_1d.label, points_1d.x >= k) for k in steps])
axarr[1].plot(steps, left_ratios, label='Left part')
axarr[1].plot(steps, right_ratios, '--', label='Right part')
axarr[1].set_ylim([0, 1.1])
axarr[1].legend(loc='best')

axarr[2].set_title("Entropy")
# Elementwise binary entropy; nan_to_num maps the 0*log2(0) NaNs at p=0 or p=1 to 0
vec_entropy = lambda p: -np.nan_to_num(p * np.log2(p) + (1 - p) * np.log2(1 - p))
left_entropy = vec_entropy(left_ratios)
axarr[2].plot(steps, left_entropy, label='Left part')
right_entropy = vec_entropy(right_ratios)
axarr[2].plot(steps, right_entropy, '--', label='Right part')
axarr[2].set_ylim([-0.1, 1.1])
axarr[2].legend(loc='best')

axarr[3].set_title("Impurity")
weights = np.array([np.sum(points_1d.x < s) for s in steps]) / len(points_1d)
impurity = weights * left_entropy + (1 - weights) * right_entropy
axarr[3].set_ylim([0.6, 1.05])
axarr[3].plot(steps, impurity, 'r', label='Weighted entropy')
best_split = steps[np.argmin(impurity)]
axarr[3].axvline(best_split, linestyle='--', label='Best split')
axarr[3].legend(loc='best');

In [ ]:
show_classifier(X, y, DecisionTreeClassifier(), print_score=True);

In [ ]:
show_classifier(X, y, DecisionTreeClassifier(max_depth=3), print_score=True);
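
Rather than fixing max_depth=3 by eye, the depth can be chosen by cross-validation; a minimal sketch using the modern sklearn.model_selection API (the 1-10 depth grid is an arbitrary choice):

In [ ]:
from sklearn.model_selection import GridSearchCV

# Score each candidate depth with 5-fold cross-validation on (X, y)
# and report the best depth found
depth_search = GridSearchCV(DecisionTreeClassifier(),
                            {'max_depth': list(range(1, 11))},
                            cv=5)
depth_search.fit(X, y)
print(depth_search.best_params_, depth_search.best_score_)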

In [ ]:
def add_features(X):
    return pd.concat([X, pd.DataFrame({'f1': X.x + X.y, 'f2': X.x - X.y})], axis=1)

show_classifier(X, y, DecisionTreeClassifier(max_depth=3),
                feature_modifier=add_features,
                print_score=True);

Random Forest

A random forest averages many decision trees, each trained on a bootstrap sample of the data (bagging) and choosing each split from a random subset of the features (RSM).


In [ ]:
from sklearn.ensemble import RandomForestClassifier
n_estimators_grid = [1, 10, 50]
figure(figsize=(10, 8 * len(n_estimators_grid)))

for index, n_estimators in enumerate(n_estimators_grid):
    subplot(len(n_estimators_grid), 1, index + 1)
    show_classifier(X, y, 
                    RandomForestClassifier(n_estimators=n_estimators, n_jobs=-1));
    title("n_estimators = {}".format(n_estimators));

In [ ]:
r_data = pd.DataFrame(np.random.normal(size=(300, 2)), columns=['x', 'y'])
r_data['target'] = (r_data.x ** 2 + r_data.y ** 2) ** 0.5
r_features = r_data[['x', 'y']]
r_data.plot(kind='scatter', x='x', y='y', c='target', colormap='autumn', colorbar=False, **scatter_args);

In [ ]:
from sklearn.tree import DecisionTreeRegressor
show_classifier(r_features, r_data.target, 
                DecisionTreeRegressor(), proba=False);

In [ ]:
clfs = []
n_samples = 10
figure(figsize=(10, 8*n_samples))
sample_size = 100
grid = get_grid(r_data, 0.1)

for index in range(n_samples):
    rows = r_data.loc[np.random.RandomState(index).choice(r_data.index, sample_size)]
    subplot(n_samples, 1, index + 1)
    clfs.append(show_classifier(rows[['x', 'y']], rows.target, 
                DecisionTreeRegressor(),
                proba=False,
                grid=grid));

In [ ]:
xs, ys = grid
xys = c_[ravel(xs), ravel(ys)]
predicted = [cl.predict(pd.DataFrame(xys, columns=('x', 'y'))).reshape(xs.shape) for cl in clfs]
pcolormesh(xs, ys, np.mean(predicted, axis=0))
scatter(r_data.x, r_data.y, c=r_data.target, **scatter_args)
autoscale(tight=True)

In [ ]:
from sklearn.ensemble import RandomForestRegressor
show_classifier(r_features, r_data.target, 
                RandomForestRegressor(n_estimators=10),
                proba=False,
                grid=grid);
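
A fitted forest also exposes impurity-based feature importances; a minimal sketch (fit returns the estimator, so the call can be chained):

In [ ]:
# Print the relative importance of each input feature
# (impurity-based; the values sum to 1)
forest = RandomForestRegressor(n_estimators=10).fit(r_features, r_data.target)
for name, importance in zip(r_features.columns, forest.feature_importances_):
    print("{}: {:.3f}".format(name, importance))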

In [ ]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

def get_plot_points(algorithm,
                    n_estimators_grid,
                    params,
                    train_features, test_features,
                    train_answers, test_answers):
    train_errors, test_errors = [], []
    for n_estimators in n_estimators_grid:
        cl = algorithm(n_estimators=n_estimators, **params)
        cl.fit(train_features, train_answers)
        train_errors.append(mean_squared_error(train_answers, cl.predict(train_features)))
        test_errors.append(mean_squared_error(test_answers, cl.predict(test_features)))
    return train_errors, test_errors

split = train_test_split(r_features, r_data.target, test_size=0.33, random_state=42)
n_estimators_grid = range(1, 361, 20)

# Bagging alone (all features considered at each split) vs.
# bagging + RSM (a single random feature per split)
bag_train, bag_test = get_plot_points(RandomForestRegressor, n_estimators_grid,
                                      {'n_jobs': -1}, *split)
rsm_train, rsm_test = get_plot_points(RandomForestRegressor, n_estimators_grid,
                                      {'n_jobs': -1, 'max_features': 1}, *split)
plot(n_estimators_grid, bag_train, 'r-',
     n_estimators_grid, bag_test, 'b-',
     n_estimators_grid, rsm_train, 'r--',
     n_estimators_grid, rsm_test, 'b--')
legend(('Bagging (train)', 'Bagging (test)',
        'Bagging + RSM (train)', 'Bagging + RSM (test)'));
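
Because each tree sees only a bootstrap sample, the held-out ("out-of-bag") rows give a built-in generalization estimate without a separate test split; a minimal sketch:

In [ ]:
# oob_score_ is the R^2 of the out-of-bag predictions: each tree is
# evaluated only on the rows absent from its own bootstrap sample
oob_forest = RandomForestRegressor(n_estimators=100, oob_score=True, n_jobs=-1)
oob_forest.fit(r_features, r_data.target)
print("OOB R^2 = {:.3f}".format(oob_forest.oob_score_))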

In [ ]:
from sklearn.ensemble import RandomForestClassifier
n_estimators_grid = [1, 10, 50]
figure(figsize=(10, 8 * len(n_estimators_grid)))

for index, n_estimators in enumerate(n_estimators_grid):
    subplot(len(n_estimators_grid), 1, index + 1)
    show_classifier(X, y, 
                    RandomForestClassifier(n_estimators=n_estimators, n_jobs=-1),
                    print_score=True);
    title("n_estimators = {}".format(n_estimators));

Links