In [ ]:
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.datasets import make_moons
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

cm_bright = ListedColormap(['#FF0000', '#0000FF'])

In [ ]:
noise = 0.15
rng = np.random.RandomState(1)

scaler = StandardScaler()

X_train, y_train = make_moons(n_samples=30, shuffle=True, noise=noise, random_state=rng)
X_test, y_test = make_moons(n_samples=1000, shuffle=True, noise=noise, random_state=rng)
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build 5 noisy variants of the training set: each stacks the original points
# with 30 extra moon samples whose labels are shuffled (pure label noise).
noised_versions = []
for _ in range(5):
    X_noise, y_noise = make_moons(n_samples=30, shuffle=True, noise=noise, random_state=rng)
    X_noise = scaler.transform(X_noise)
    y_noise = shuffle(y_noise, random_state=rng)
    noised_versions.append(shuffle(np.vstack([X_train, X_noise]),
                                   np.concatenate([y_train, y_noise]),
                                   random_state=rng))
    
X_train_noise, y_train_noise = noised_versions[0]

In [ ]:
def plot_model(ax, clf, xlim=(-2.5, 2.5), ylim=(-2.5, 2.5),
               steps=100, cmap=plt.cm.RdBu, alpha=.5):
    xx, yy = np.meshgrid(np.linspace(xlim[0], xlim[1], steps),
                         np.linspace(ylim[0], ylim[1], steps))
    Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
    Z = Z.reshape(xx.shape)
    ax.contourf(xx, yy, Z, cmap=cmap, alpha=alpha)
    ax.set_xlim(xlim); ax.set_ylim(ylim)
    
    
def plot_model_data(ax, X, y, clf=None,
                    xlim=(-2.5, 2.5), ylim=(-2.5, 2.5),
                    steps=100, cmap=plt.cm.RdBu, alpha=.5):
    if clf is not None:
        plot_model(ax, clf, xlim=xlim, ylim=ylim, steps=steps,
                   cmap=cmap, alpha=alpha)
    ax.scatter(X[:, 0], X[:, 1], c=y, cmap=cm_bright, s=60,
               edgecolors='k', alpha=.7)
    ax.set_xticks([])
    ax.set_yticks([])

In [ ]:
def evaluate(m, X_train, y_train, X_test, y_test):
    print(f"train acc: {m.score(X_train, y_train) * 100:.0f}%,"
          f" test acc: {m.score(X_test, y_test) * 100:.0f}%")

In [ ]:
sgd_params = dict(
    learning_rate_init=0.1,
    learning_rate='constant',
    max_iter=1000,
)

In [ ]:
m_lbfgs = MLPClassifier(solver='lbfgs', hidden_layer_sizes=[256],
                   alpha=0, random_state=0)
m_lbfgs.fit(X_train, y_train)
evaluate(m_lbfgs, X_train, y_train, X_test, y_test)

In [ ]:
m_sgd = MLPClassifier(solver='sgd', hidden_layer_sizes=[256],
                   alpha=0, random_state=0, **sgd_params)
m_sgd.fit(X_train, y_train)
evaluate(m_sgd, X_train, y_train, X_test, y_test)

In [ ]:
fig, ax_list = plt.subplots(nrows=2, ncols=4, figsize=(16, 8))
stats = []
for i, ax in enumerate(ax_list.ravel()):
    hidden_size = 2 ** (i + 2)
    m = MLPClassifier(solver='sgd', hidden_layer_sizes=[hidden_size],
                   alpha=0, random_state=0, **sgd_params)
    m.fit(X_train, y_train)
    train_acc = m.score(X_train, y_train) * 100
    test_acc = m.score(X_test, y_test) * 100
    stats.append(dict(
        n_layers=1,
        hidden_size=hidden_size,
        train_acc=train_acc,
        test_acc=test_acc,
    ))
    ax.set_title(f"hidden: {hidden_size}, train {train_acc:.1f}%, test {test_acc:.1f}%")
    plot_model_data(ax, X_train, y_train, clf=m)
fig.suptitle("1 hidden layer MLP");

In [ ]:
fig, ax_list = plt.subplots(nrows=2, ncols=4, figsize=(16, 8))
for i, ax in enumerate(ax_list.ravel()):
    hidden_size = 2 ** (i + 2)
    m = MLPClassifier(solver='lbfgs', hidden_layer_sizes=[hidden_size] * 2,
                      alpha=0, random_state=0, **sgd_params)
    m.fit(X_train, y_train)
    train_acc = m.score(X_train, y_train) * 100
    test_acc = m.score(X_test, y_test) * 100
    stats.append(dict(
        n_layers=2,
        hidden_size=hidden_size,
        train_acc=train_acc,
        test_acc=test_acc,
    ))
    ax.set_title(f"hidden: {hidden_size}, train {train_acc:.1f}%, test {test_acc:.1f}%")
    plot_model_data(ax, X_train, y_train, clf=m)
fig.suptitle("2 hidden layers MLP");
fig.savefig('moons_lbfgs.svg')

In [ ]:
stats_df = pd.DataFrame.from_dict(stats)
stats_df = stats_df.sort_values('hidden_size')
# stats_df[stats_df['n_layers'] == 1].plot(x='hidden_size', y='train_acc')
# stats_df[stats_df['n_layers'] == 1].plot(x='hidden_size', y='test_acc')

In [ ]:
fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(16, 8))
plot_model_data(ax0, X_train, y_train)
ax0.set_title("Original training set: m=50")

plot_model_data(ax1, X_train_noise, y_train_noise)
ax1.set_title("Original training set + noise point: m=50 + 100")

In [ ]:
%%time
m_smooth = MLPClassifier(solver='lbfgs', hidden_layer_sizes=[256],
                   alpha=0, random_state=0)
m_smooth.fit(X_train, y_train)
evaluate(m_smooth, X_train, y_train, X_test, y_test)

In [ ]:
fig, ax = plt.subplots(figsize=(8, 8))
plot_model_data(ax, X_train, y_train)
fig.savefig("half_moons.svg")
plt.close(fig)

In [ ]:
fig, ax = plt.subplots(figsize=(8, 8))
plot_model_data(ax, X_train, y_train, m_smooth)
fig.savefig("low_complexity_decision_function.svg")
plt.close(fig)

In [ ]:
%%time
m_noise = clone(m_smooth)
m_noise.fit(X_train_noise, y_train_noise)
evaluate(m_noise, X_train, y_train, X_test, y_test)

In [ ]:
i = 0
for X, y in noised_versions:
    m = clone(m_smooth)
    m.fit(X, y)
    if m.score(X_train, y_train) == 1:
        i += 1
        fig, ax = plt.subplots(figsize=(8, 8))
        plot_model_data(ax, X, y, m)
        fig.savefig(f"high_complexity_decision_function_{i:02d}.svg")
        plt.close(fig)

In [ ]:
from sklearn.utils.extmath import safe_sparse_dot
try:
    # scikit-learn >= 0.22 moved ACTIVATIONS to a private module
    from sklearn.neural_network._base import ACTIVATIONS
except ImportError:
    # older scikit-learn versions
    from sklearn.neural_network.multilayer_perceptron import ACTIVATIONS


def logits(m, X):
    """Forward pass through a fitted MLPClassifier, returning the
    pre-activation output of the last layer (the logits)."""
    sigma = ACTIVATIONS[m.activation]
    a = X
    for i in range(m.n_layers_ - 1):
        a = safe_sparse_dot(a, m.coefs_[i])
        a += m.intercepts_[i]
        if (i + 1) != (m.n_layers_ - 1):
            sigma(a)  # sklearn's activation functions operate in place
    return a


def lipschitz(m):
    """Upper bound on the Lipschitz constant of the network (assuming
    1-Lipschitz activations): product of the spectral norms of the weights."""
    return np.prod([np.linalg.svd(w)[1].max() for w in m.coefs_])


def margins(m, X, y):
    """Unnormalized margins: absolute value of the logits (binary case)."""
    preds = logits(m, X).ravel()
#     correct_mask = (preds >= 0) == y
#     return np.abs(preds * correct_mask)
    return np.abs(preds)


def normalized_margins(m, X, y):
    """Margins rescaled by the Lipschitz upper bound of the network."""
    return margins(m, X, y) / lipschitz(m)


def bartlett_complexity(m, X, y):
    """Margin-based complexity measure: Lipschitz upper bound / mean margin."""
    return 1 / normalized_margins(m, X, y).mean()
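
The quantity computed by `bartlett_complexity` above is a simplified, binary-classification variant of a Bartlett-style spectrally normalized margin complexity: writing $\|W_j\|_2$ for the spectral norm of the $j$-th weight matrix and $f(x_i)$ for the logit of training point $x_i$,

$$\mathrm{complexity}(f) = \left(\frac{1}{n}\sum_{i=1}^{n}\frac{|f(x_i)|}{\prod_j \|W_j\|_2}\right)^{-1} = \frac{\prod_j \|W_j\|_2}{\frac{1}{n}\sum_{i=1}^{n}|f(x_i)|}.$$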

In [ ]:
lipschitz(m_smooth)

In [ ]:
lipschitz(m_noise)

In [ ]:
margins(m_smooth, X_train, y_train).mean(), lipschitz(m_smooth)

In [ ]:
margins(m_noise, X_train, y_train).mean(), lipschitz(m_noise)

In [ ]:
from scipy.stats import gaussian_kde


x = np.linspace(0, 0.5, 100)
m_smooth_margins_density = gaussian_kde(normalized_margins(m_smooth, X_train, y_train))
m_smooth_complexity = bartlett_complexity(m_smooth, X_train, y_train)
plt.fill_between(x, m_smooth_margins_density(x), 0, alpha=0.5,
                 label=f'smooth model (complexity: {m_smooth_complexity:0.1f})')
m_noise_margins_density = gaussian_kde(normalized_margins(m_noise, X_train, y_train))
m_noise_complexity = bartlett_complexity(m_noise, X_train, y_train)
plt.fill_between(x, m_noise_margins_density(x), 0, alpha=0.5,
                 label=f'hard/noisy model (complexity: {m_noise_complexity:0.1f})')
plt.xlabel('Normalized margin (training set)')
plt.ylabel('Density')
plt.legend(loc='best');

Complexity measures vs excess risk


In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import json
from gzip import GzipFile


with GzipFile('models.log.gz', 'r') as f:
    records = [json.loads(line.strip()) for line in f]
    models_df = pd.DataFrame.from_dict(records)

with GzipFile('evaluations.log.gz', 'r') as f:
    records = [json.loads(line.strip()) for line in f]
    df = pd.DataFrame.from_dict(records)
    
df = df.merge(models_df)

In [ ]:
# fig, ax = plt.subplots(figsize=(16, 6))
# q = df.query("solver == 'lbfgs' and label_noise_rate == 0.1 and width == 256").sort_values('depth')
# q.plot(x='depth', y='train_acc', kind='line', ax=ax)
# q.plot(x='depth', y='test_acc', kind='line', ax=ax)

In [ ]:
# fig, ax = plt.subplots(figsize=(16, 6))
# q = df.query("solver == 'lbfgs' and label_noise_rate == 0.1 and depth == 1").sort_values('width')
# q.plot(x='width', y='train_acc', kind='line', ax=ax)
# q.plot(x='width', y='test_acc', kind='line', ax=ax)

In [ ]:
# df.query("excess_risk > 0.22 and bartlett_complexity_mean < 7")

In [ ]:
fig, ax = plt.subplots(figsize=(16, 6))
q = df.query("train_acc >= 0.9 and width == 256")
depths = sorted(q['depth'].unique())
colors = plt.cm.tab10(np.linspace(0, 0.5, len(depths)))
for depth, color in zip(depths, colors):
    q.query(f"depth == '{depth}'").plot(
            x='lipschitz', y='excess_risk', kind='scatter',
            c=color, s=50, label=f"{depth} hidden layer", ax=ax)
ax.set_xlim((0, 300))
ax.plot([0, 150], [0.07, 0.75])
fig.suptitle("256 units per hidden-layer MLPs")
fig.savefig("lipschitz_depth.svg")

In [ ]:
fig, ax = plt.subplots(figsize=(16, 6))
q = df.query("train_acc >= 0.9 and depth == 1")
widths = sorted(q['width'].unique())
colors = plt.cm.tab10(np.linspace(0, 0.5, len(widths)))
for width, color in zip(widths, colors):
    q.query(f"width == '{width}'").plot(
            x='lipschitz', y='excess_risk', kind='scatter',
            c=color, s=50, label=f"width: {width} units", ax=ax)
ax.set_xlim((0, 300))
ax.plot([0, 150], [0.07, 0.75])
fig.suptitle("1-hidden layer MLPs");
fig.savefig("lipschitz_width.svg")

In [ ]:
fig, ax = plt.subplots(figsize=(16, 6))
q = df.query("train_acc >= 0.9")
solvers = q['solver'].unique()
colors = plt.cm.tab10(np.linspace(0, 0.2, len(solvers)))

for solver, color in zip(solvers, colors):
    q.query(f"solver == '{solver}'").plot(
            x='lipschitz', y='excess_risk', kind='scatter',
            c=color, s=50, label=solver, ax=ax)
ax.plot([0, 150], [0.07, 0.75])
ax.set_xlim((0, 300));
fig.savefig("lipschitz_optimizer.svg")

In [ ]:
fig, ax = plt.subplots(figsize=(16, 6))
q = df.query("train_acc >= 0.9")
for solver, color in zip(solvers, colors):
    q.query(f"solver == '{solver}'").plot(
            x='bartlett_complexity_mean', y='excess_risk', kind='scatter',
            c=color, s=50, label=solver, ax=ax)
ax.set_xlabel('Lipschitz / margin')
# ax.plot([0, 45], [0.23, 0.75])
ax.set_xlim((0, 100));
fig.savefig("lipschitz_margin_optimizer.svg")

In [ ]:
fig, ax = plt.subplots(figsize=(16, 6))
q = df.query("train_acc >= 1")
q.plot(x='lipschitz', y='excess_risk', kind='scatter',
       c=plt.cm.Blues(q['label_noise_rate']), s=50, ax=ax)
ax.plot([0, 150], [0.07, 0.75], c='k')
ax.set_xlim((0, 300))
fig.suptitle('Impact of label noise (0% to 100%)');

In [ ]:
# for model_id in df['model_id'].unique():
#     df.query('n_samples_train == 30')[df['model_id'] == model_id].plot('label_noise_rate', 'lipschitz')

Implicit regularization by SGD


In [ ]:
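# A minimal sketch (assumption, not from the original notebook): contrast the
# complexity of the solutions reached by SGD and L-BFGS on the same training
# set, reusing m_sgd, m_lbfgs and the helper functions defined above.
for name, m in [("sgd", m_sgd), ("lbfgs", m_lbfgs)]:
    print(f"{name}: lipschitz={lipschitz(m):.1f}, "
          f"bartlett complexity={bartlett_complexity(m, X_train, y_train):.1f}")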