In [5]:
%matplotlib inline
from collections import defaultdict
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.linear_model import LogisticRegression

In [6]:
class Dataset:
    def __init__(self, data):
        X, y = data
        # NB: the scaler is fit on the full dataset before splitting, so test
        # statistics leak into the scaling; acceptable here since this is for visualization.
        X = StandardScaler().fit_transform(X)
        self.X = X
        self.y = y
        self.split()
        self.define_mesh()

    def split(self, test_size=0.4):
        # X0/y0 hold the training split, X1/y1 the held-out test split
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=test_size)
        self.X0 = X_train
        self.y0 = y_train
        self.X1 = X_test
        self.y1 = y_test
        
    def define_mesh(self, h=0.02):
        # h is the step size in the mesh
        x_min, x_max = self.X[:, 0].min() - .5, self.X[:, 0].max() + .5
        y_min, y_max = self.X[:, 1].min() - .5, self.X[:, 1].max() + .5
        self.xx, self.yy = np.meshgrid(np.arange(x_min, x_max, h),
                                       np.arange(y_min, y_max, h))


class ModelCollection:
    def __init__(self):
        self.models = defaultdict(dict)
        self.scores = defaultdict(dict)

    def __str__(self):
        mystr = ''
        for ds_name, ds in self.models.items():
            for model_name, model in ds.items():
                score = self.scores[ds_name][model_name]
                mystr += '%s - %s\t%.2f\n' % (ds_name, model_name, score)
        return mystr
                
    def add_model(self, dataset_name, model_name, model, score):
        self.models[dataset_name][model_name] = model
        self.scores[dataset_name][model_name] = score 

    def get_model(self, dataset_name, model_name):
        return self.models[dataset_name][model_name]

    def get_score(self, dataset_name, model_name):
        return self.scores[dataset_name][model_name]

    def get_num_models(self, dataset_name):
        return len(self.models[dataset_name])
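
Before wiring these together, a quick sanity check of the ModelCollection bookkeeping (the 'Toy'/'Dummy' entries below are placeholders, not real estimators):

In [ ]:
mc = ModelCollection()
mc.add_model('Toy', 'Dummy', None, 0.5)  # store a placeholder in lieu of a fitted estimator
print(mc.get_score('Toy', 'Dummy'))      # 0.5
print(mc.get_num_models('Toy'))          # 1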

In [7]:
X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                           random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)
datasets = {'Moons': Dataset(make_moons(noise=0.3)),
            'Circles': Dataset(make_circles(noise=0.2, factor=0.5)),
            'Linearly Separable': Dataset(linearly_separable)}
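
A quick look at one Dataset confirms the default 60/40 split (make_moons draws 100 samples by default) and the matching mesh grids:

In [ ]:
ds = datasets['Moons']
print(ds.X0.shape, ds.X1.shape)    # (60, 2) (40, 2) with the defaults above
print(ds.xx.shape == ds.yy.shape)  # True: xx and yy cover the same grid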

In [12]:
def make_classifiers():
    """Yield (name, classifier) pairs; a fresh, unfit set on every call."""
    clf_names = ['Nearest Neighbors', 'Linear SVM', 'RBF SVM', 'Decision Tree',
                 'Random Forest', 'AdaBoost', 'Extra Trees', 'Naive Bayes', 'QDA',
                 'Logistic']
    classifiers = [
        KNeighborsClassifier(3),
        SVC(kernel='linear', C=0.025),
        SVC(gamma=2, C=1),
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                           algorithm="SAMME", n_estimators=200),
        ExtraTreesClassifier(),
        GaussianNB(),
        QDA(),
        LogisticRegression()]

    for name, clf in zip(clf_names, classifiers):
        yield name, clf


model_collection = ModelCollection()
for ds_name, ds in datasets.items():
    for clf_name, clf in make_classifiers():
        clf.fit(ds.X0, ds.y0) # fit training data
        score = clf.score(ds.X1, ds.y1) # compute predictive accuracy on test data
        model_collection.add_model(ds_name, clf_name, clf, score)
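
With every classifier fit on every dataset, the collection's __str__ gives a plain-text summary of test accuracies; exact numbers will vary between runs, since neither the synthetic datasets nor the splits are seeded:

In [ ]:
print(model_collection)  # one 'dataset - classifier<TAB>accuracy' line per model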

In [13]:
def plot_panel(ds_name, clf_name, model_collection, datasets, ax):
    ds = datasets[ds_name]
    if clf_name is None:
        ax.set_title(ds_name)
    else:
        model = model_collection.get_model(ds_name, clf_name)
        # Plot the decision boundary
        if hasattr(model, "decision_function"):
            Z = model.decision_function(np.c_[ds.xx.ravel(), ds.yy.ravel()])
        else:
            Z = model.predict_proba(np.c_[ds.xx.ravel(), ds.yy.ravel()])[:, 1]
        Z = Z.reshape(ds.xx.shape)
        ax.contourf(ds.xx, ds.yy, Z, cmap=cm, alpha=0.8)
        score = model_collection.get_score(ds_name, clf_name)
        ax.set_title("%s (%.2f)" % (clf_name, score))

    # Plot the data points
    ax.scatter(ds.X0[:, 0], ds.X0[:, 1], c=ds.y0, cmap=cm_bright)
    ax.scatter(ds.X1[:, 0], ds.X1[:, 1], c=ds.y1, cmap=cm_bright, alpha=0.6)
    ax.set_xlim(ds.xx.min(), ds.xx.max())
    ax.set_ylim(ds.yy.min(), ds.yy.max())
    ax.set_xticks(())
    ax.set_yticks(())


cm = plt.cm.bwr
cm_bright = ListedColormap(['#0000FF', '#FF0000'])

ds_cols = ['Moons', 'Circles', 'Linearly Separable']
# NB: QDA is fit above but left out of the plot grid; append 'QDA' here to plot it too
clf_rows = ['Nearest Neighbors', 'Linear SVM', 'RBF SVM', 'Decision Tree',
            'Random Forest', 'AdaBoost', 'Extra Trees', 'Naive Bayes',
            'Logistic']

figure, axes = plt.subplots(len(clf_rows) + 1, len(ds_cols), figsize=(10, 30))

for j, d in enumerate(ds_cols):
    plot_panel(d, None, model_collection, datasets, axes[0, j])

for i, c in enumerate(clf_rows):
    for j, d in enumerate(ds_cols):
        plot_panel(d, c, model_collection, datasets, axes[i+1, j])

figure.subplots_adjust(left=.02, right=.98)
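
To keep the comparison grid around, matplotlib can write the figure straight to disk (the filename below is arbitrary):

In [ ]:
figure.savefig('classifier_comparison.png', dpi=150, bbox_inches='tight')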