In [1]:
from __future__ import print_function
import os
from os import path
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from classify_base import load_data, enumerate_and_write_predictions
import meta
import numpy as np

In [2]:
# Registry of level-0 (base) models for the stacking ensemble. Each entry:
#   'name'               -- identifier; also the prefix of the cached prediction
#                           files in MODEL_DIR ('{name}_folds.npy' / '{name}_full.npy')
#   'classifier_creator' -- zero-argument factory returning a fresh sklearn
#                           estimator, or None when the model's predictions are
#                           only ever loaded from cache (presumably neural nets
#                           trained elsewhere -- the loop below will crash if the
#                           cache files are missing for a None creator)
#   'preprocessing'      -- list of sklearn transformers applied before fitting,
#                           or None for no preprocessing
#   'persistence'        -- if True, reuse cached predictions when present
# The commented-out entries are sklearn baselines kept for reference; the
# bracketed numbers are their recorded per-fold validation accuracies.
level_0_clfs = [
#     {
#         'name': 'random_forest_1',
#         'classifier_creator': lambda: RandomForestClassifier(
#             n_estimators=200,
#             criterion='gini',
#             max_features='sqrt',
#             max_depth=None,
#             n_jobs=3),
#         'preprocessing': [
#             MinMaxScaler(feature_range=(0, 1), copy=True)
#         ],
#         'persistence': True
#         # [ 0.96573468  0.96679757  0.96332897  0.96594022  0.96665079]
#     },
#     {
#         'name': 'ada_boost_1',
#         'classifier_creator': lambda: AdaBoostClassifier(
#             n_estimators=1000, learning_rate=1.0),
#         'preprocessing': [
#             MinMaxScaler(feature_range=(0, 1), copy=True),
#             PCA(n_components=35, whiten=True)
#         ],
#         'persistence': True
#         # [0.64259369  0.70272522  0.70436957  0.72656901  0.71069557]
#     },
#     {
#         'name': 'svm_1',
#         'classifier_creator': lambda: SVC(
#             C=2.8, kernel='rbf', gamma=0.0073, cache_size=2000, probability=True),
#         'preprocessing': [
#             MinMaxScaler(feature_range=(0, 1), copy=True),
#             PCA(n_components=35, whiten=True)
#         ],
#         'persistence': True
#     },
#     {
#         'name': 'svm_2',
#         'classifier_creator': lambda: SVC(
#             C=1000, kernel='rbf', gamma=0.049, cache_size=2000, probability=True),
#         'preprocessing': [
#             MinMaxScaler(feature_range=(0, 1), copy=True),
#             PCA(n_components=35, whiten=True)
#         ],
#         'persistence': True
#     },
#     {
#         'name': 'knn3',
#         'classifier_creator': lambda: KNeighborsClassifier(
#             n_neighbors=3, n_jobs=-1),
#         'preprocessing': [
#             MinMaxScaler(feature_range=(0, 1), copy=True),
#             PCA(n_components=35, whiten=True)
#         ],
#         'persistence': True
#     },
#     {
#         'name': 'knn7',
#         'classifier_creator': lambda: KNeighborsClassifier(
#             n_neighbors=7, n_jobs=-1),
#         'preprocessing': [
#             MinMaxScaler(feature_range=(0, 1), copy=True),
#             PCA(n_components=35, whiten=True)
#         ],
#         'persistence': True
#     },
    # The entries below have no creator: their predictions come from cache only.
    {
        'name': 'mlp1',
        'classifier_creator': None,
        'preprocessing': None,
        'persistence': True
    },
    {
        'name': 'mlp2',
        'classifier_creator': None,
        'preprocessing': None,
        'persistence': True
    },
    {
        'name': 'mlp2aug',
        'classifier_creator': None,
        'preprocessing': None,
        'persistence': True
    },
    {
        'name': 'cnn1',
        'classifier_creator': None,
        'preprocessing': None,
        'persistence': True
    },
    {
        'name': 'cnn2',
        'classifier_creator': None,
        'preprocessing': None,
        'persistence': True
    },
    {
        'name': 'cnn2_psblb',
        'classifier_creator': None,
        'preprocessing': None,
        'persistence': True
    },
#     {
#         'name': 'xgb1',
#         'classifier_creator': None,
#         'preprocessing': None,
#         'persistence': True
#     },
]

In [3]:
# Directory holding the cached per-model prediction files (.npy).
MODEL_DIR = 'stacking_models'
# makedirs with exist_ok=True is atomic w.r.t. concurrent creation and
# replaces the check-then-create pattern (which races and misses parents).
os.makedirs(MODEL_DIR, exist_ok=True)

In [4]:
# Load the full train/test matrices once; CV splits are derived from indices later.
# NOTE(review): load_data's argument semantics are defined in classify_base --
# presumably None means "no subsampling / default config"; confirm there.
(X_train_original, y_train_original, X_test_original) = load_data(None)

# Number of target classes, shared project-wide via the meta module.
n_classes = meta.N_CLASSES


Loading data...
Data loaded

In [5]:
# Fixed, seeded 5-fold split so cached fold predictions stay aligned across re-runs.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
sample_indices = np.arange(X_train_original.shape[0])
folds = list(kfold.split(sample_indices))

# Sanity check: the train/validation indices of each fold partition the dataset.
for train_idx, val_idx in folds:
    assert np.array_equal(np.sort(np.concatenate((train_idx, val_idx))), sample_indices)

In [6]:
# Level-1 feature matrices: one block of n_classes probability columns per
# level-0 model, for the training rows and the test rows respectively.
n_stack_features = n_classes * len(level_0_clfs)
stacking_train = np.zeros((X_train_original.shape[0], n_stack_features))
stacking_test = np.zeros((X_test_original.shape[0], n_stack_features))

In [7]:
# Build the level-1 feature matrices: for each level-0 model, fill its block of
# n_classes columns with (a) out-of-fold predicted probabilities on the training
# set and (b) full-data predicted probabilities on the test set. Both are cached
# to .npy files in MODEL_DIR and reloaded on subsequent runs when 'persistence'.
for clf_n, clf_item in enumerate(level_0_clfs):
    clf_name = clf_item['name']
    clf_creator = clf_item['classifier_creator']
    preprocessing = clf_item['preprocessing']
    if preprocessing is None:
        preprocessing = []
    persistence = clf_item['persistence']
    # Column block of the stacking matrices owned by this model; computed once
    # instead of repeating the clf_n * n_classes arithmetic at every use site.
    clf_cols = slice(clf_n * n_classes, (clf_n + 1) * n_classes)

    print('Doing', clf_name)

    # Copies so transformers never mutate the original matrices.
    X_train_all = X_train_original.copy()
    X_test = X_test_original.copy()

    # NOTE(review): transformers are fit on the FULL training set before the CV
    # split below, so the per-fold validation scores include mild preprocessing
    # leakage (scaler/PCA saw the validation rows) -- confirm this is intended.
    for prep in preprocessing:
        X_train_all = prep.fit_transform(X_train_all)
        X_test = prep.transform(X_test)

    # --- Out-of-fold probabilities -> level-1 training features ---
    fold_pred_file_name = path.join(MODEL_DIR, '{}_folds.npy'.format(clf_name))
    if persistence and path.exists(fold_pred_file_name):
        stacking_train[:, clf_cols] = np.load(fold_pred_file_name)
    else:
        for fold_n, (train_idxs, val_idxs) in enumerate(folds):
            print("Fold", fold_n)
            X_train = X_train_all[train_idxs]
            y_train = y_train_original[train_idxs]
            X_val = X_train_all[val_idxs]
            y_val = y_train_original[val_idxs]

            # Fresh estimator per fold so no state carries across folds.
            clf = clf_creator()
            clf.fit(X_train, y_train)

            predictions = clf.predict(X_val)
            accuracy = accuracy_score(y_val, predictions)
            print(accuracy)

            stacking_train[val_idxs, clf_cols] = clf.predict_proba(X_val)

        np.save(fold_pred_file_name, stacking_train[:, clf_cols])

    # --- Full-training-set model -> level-1 test features ---
    full_pred_file_name = path.join(MODEL_DIR, '{}_full.npy'.format(clf_name))
    if persistence and path.exists(full_pred_file_name):
        stacking_test[:, clf_cols] = np.load(full_pred_file_name)
    else:
        print("Full")
        clf = clf_creator()
        clf.fit(X_train_all, y_train_original)
        stacking_test[:, clf_cols] = clf.predict_proba(X_test)

        np.save(full_pred_file_name, stacking_test[:, clf_cols])

    print()


Doing mlp1

Doing mlp2

Doing mlp2aug

Doing cnn1

Doing cnn2

Doing cnn2_psblb


In [8]:
# Sanity check: row 0 holds one probability distribution (summing to 1) per
# level-0 model, so the row total should be ~len(level_0_clfs) = 6.
np.sum(stacking_train[0:1,:])


Out[8]:
6.0000000318801421

In [9]:
# Level-1 meta-learner fit on the concatenated level-0 class probabilities.
# (An RBF SVC with C=0.1 was also tried here as an alternative meta-learner.)
level_1_clf = LogisticRegression()
level_1_clf.fit(stacking_train, y_train_original)


Out[9]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [10]:
# Predict test-set labels with the meta-learner and write the submission file;
# the writer expects a 2-D (n_samples, 1) column of labels.
predictions = level_1_clf.predict(stacking_test)
print(predictions)
enumerate_and_write_predictions(predictions[:, np.newaxis], 'stacking.csv')


[2 0 9 ..., 3 9 2]