In [1]:
from __future__ import print_function
import os
from os import path
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from classify_base import load_data, enumerate_and_write_predictions
import meta
import numpy as np
In [2]:
# Level-0 (base) model registry for the stacking ensemble.  Each entry:
#   name               -- key used to name the cached .npy prediction files
#   classifier_creator -- zero-argument factory returning a fresh estimator, or
#                         None when the predictions were produced elsewhere and
#                         are only *loaded* from MODEL_DIR by the training loop
#   preprocessing      -- transformers fit on the training set, applied to test
#   persistence        -- reuse cached prediction files when they exist
# Commented-out entries are earlier experiments, kept with their recorded
# cross-validation accuracies for reference.
level_0_clfs = [
    # {
    #     'name': 'random_forest_1',
    #     'classifier_creator': lambda: RandomForestClassifier(
    #         n_estimators=200,
    #         criterion='gini',
    #         max_features='sqrt',
    #         max_depth=None,
    #         n_jobs=3),
    #     'preprocessing': [
    #         MinMaxScaler(feature_range=(0, 1), copy=True)
    #     ],
    #     'persistence': True
    #     # [ 0.96573468  0.96679757  0.96332897  0.96594022  0.96665079]
    # },
    # {
    #     'name': 'ada_boost_1',
    #     'classifier_creator': lambda: AdaBoostClassifier(
    #         n_estimators=1000, learning_rate=1.0),
    #     'preprocessing': [
    #         MinMaxScaler(feature_range=(0, 1), copy=True),
    #         PCA(n_components=35, whiten=True)
    #     ],
    #     'persistence': True
    #     # [0.64259369  0.70272522  0.70436957  0.72656901  0.71069557]
    # },
    # {
    #     'name': 'svm_1',
    #     'classifier_creator': lambda: SVC(
    #         C=2.8, kernel='rbf', gamma=0.0073, cache_size=2000, probability=True),
    #     'preprocessing': [
    #         MinMaxScaler(feature_range=(0, 1), copy=True),
    #         PCA(n_components=35, whiten=True)
    #     ],
    #     'persistence': True
    # },
    # {
    #     'name': 'svm_2',
    #     'classifier_creator': lambda: SVC(
    #         C=1000, kernel='rbf', gamma=0.049, cache_size=2000, probability=True),
    #     'preprocessing': [
    #         MinMaxScaler(feature_range=(0, 1), copy=True),
    #         PCA(n_components=35, whiten=True)
    #     ],
    #     'persistence': True
    # },
    # {
    #     'name': 'knn3',
    #     'classifier_creator': lambda: KNeighborsClassifier(
    #         n_neighbors=3, n_jobs=-1),
    #     'preprocessing': [
    #         MinMaxScaler(feature_range=(0, 1), copy=True),
    #         PCA(n_components=35, whiten=True)
    #     ],
    #     'persistence': True
    # },
    # {
    #     'name': 'knn7',
    #     'classifier_creator': lambda: KNeighborsClassifier(
    #         n_neighbors=7, n_jobs=-1),
    #     'preprocessing': [
    #         MinMaxScaler(feature_range=(0, 1), copy=True),
    #         PCA(n_components=35, whiten=True)
    #     ],
    #     'persistence': True
    # },
    # Neural-network models: predictions are precomputed outside this notebook,
    # so only the cached files are loaded (classifier_creator is None).
    {
        'name': 'mlp1',
        'classifier_creator': None,
        'preprocessing': None,
        'persistence': True
    },
    {
        'name': 'mlp2',
        'classifier_creator': None,
        'preprocessing': None,
        'persistence': True
    },
    {
        'name': 'mlp2aug',
        'classifier_creator': None,
        'preprocessing': None,
        'persistence': True
    },
    {
        'name': 'cnn1',
        'classifier_creator': None,
        'preprocessing': None,
        'persistence': True
    },
    {
        'name': 'cnn2',
        'classifier_creator': None,
        'preprocessing': None,
        'persistence': True
    },
    {
        'name': 'cnn2_psblb',
        'classifier_creator': None,
        'preprocessing': None,
        'persistence': True
    },
    # {
    #     'name': 'xgb1',
    #     'classifier_creator': None,
    #     'preprocessing': None,
    #     'persistence': True
    # },
]
In [3]:
# Directory where per-model fold/full predictions are cached as .npy files.
MODEL_DIR = 'stacking_models'
# makedirs(..., exist_ok=True) is race-free (no check-then-create window) and
# also handles MODEL_DIR ever being changed to a nested path.
os.makedirs(MODEL_DIR, exist_ok=True)
In [4]:
# Load the raw train/test matrices via the project helper.
# NOTE(review): the meaning of the `None` argument is defined in
# classify_base.load_data — presumably "no limit/subsample"; confirm there.
(X_train_original, y_train_original, X_test_original) = load_data(None)
# Number of target classes, taken from the project's meta module.
n_classes = meta.N_CLASSES
In [5]:
# 5-fold split over training-row indices; fixed seed keeps folds reproducible
# across re-runs (important because fold predictions are cached to disk).
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
n_rows = X_train_original.shape[0]
folds = list(kfold.split(np.arange(0, n_rows)))
# Sanity check: each fold's train/validation indices exactly partition the rows.
for train_part, val_part in folds:
    recombined = np.sort(np.concatenate((train_part, val_part)))
    assert np.array_equal(recombined, np.arange(n_rows))
In [6]:
# Stacked feature matrices: one block of n_classes probability columns per
# level-0 model, filled in by the training loop below.
n_stack_cols = n_classes * len(level_0_clfs)
stacking_train = np.zeros((X_train_original.shape[0], n_stack_cols))
stacking_test = np.zeros((X_test_original.shape[0], n_stack_cols))
In [7]:
# Build the stacking features.  For each level-0 model:
#   * stacking_train gets out-of-fold (OOF) probabilities: the model is trained
#     on 4 folds and predicts the held-out fold, so no row is predicted by a
#     model that saw it during training;
#   * stacking_test gets probabilities from a model refit on all training data.
# With 'persistence' set, both matrices are cached as .npy files in MODEL_DIR;
# entries whose classifier_creator is None rely entirely on those cached files.
for clf_n, clf_item in enumerate(level_0_clfs):
    clf_name = clf_item['name']
    clf_creator = clf_item['classifier_creator']
    preprocessing = clf_item['preprocessing']
    if preprocessing is None:
        preprocessing = []
    persistence = clf_item['persistence']
    print('Doing', clf_name)

    # Fit each transformer on the full training set, then apply to train/test.
    # NOTE(review): fitting scalers/PCA on *all* training rows lets a little
    # information from each fold's validation rows leak into the OOF
    # predictions — confirm this is an accepted trade-off here.
    X_train_all = X_train_original.copy()
    X_test = X_test_original.copy()
    for transformer in preprocessing:
        X_train_all = transformer.fit_transform(X_train_all)
        X_test = transformer.transform(X_test)

    # This model's block of probability columns in the stacked matrices.
    col_block = slice(clf_n * n_classes, (clf_n + 1) * n_classes)

    # --- out-of-fold predictions for the stacking training matrix ---
    fold_pred_file_name = path.join(MODEL_DIR, '{}_folds.npy'.format(clf_name))
    if persistence and path.exists(fold_pred_file_name):
        stacking_train[:, col_block] = np.load(fold_pred_file_name)
    else:
        for fold_n, (train_idxs, val_idxs) in enumerate(folds):
            print("Fold", fold_n)
            X_train = X_train_all[train_idxs]
            y_train = y_train_original[train_idxs]
            X_val = X_train_all[val_idxs]
            y_val = y_train_original[val_idxs]
            clf = clf_creator()
            clf.fit(X_train, y_train)
            # Report held-out accuracy for this fold before storing probabilities.
            val_predictions = clf.predict(X_val)
            print(accuracy_score(y_val, val_predictions))
            stacking_train[val_idxs, col_block] = clf.predict_proba(X_val)
        np.save(fold_pred_file_name, stacking_train[:, col_block])

    # --- full-data model predictions for the stacking test matrix ---
    full_pred_file_name = path.join(MODEL_DIR, '{}_full.npy'.format(clf_name))
    if persistence and path.exists(full_pred_file_name):
        stacking_test[:, col_block] = np.load(full_pred_file_name)
    else:
        print("Full")
        clf = clf_creator()
        clf.fit(X_train_all, y_train_original)
        stacking_test[:, col_block] = clf.predict_proba(X_test)
        np.save(full_pred_file_name, stacking_test[:, col_block])
    print()
In [8]:
np.sum(stacking_train[0:1,:])
Out[8]:
In [9]:
# Level-1 (meta) classifier, trained on the stacked out-of-fold probabilities.
# The commented-out SVC is an alternative meta-model that was tried.
#level_1_clf = SVC(C=0.1, kernel='rbf')
level_1_clf = LogisticRegression()
level_1_clf.fit(stacking_train, y_train_original)
Out[9]:
In [10]:
# Predict test labels with the meta-classifier and write the submission file.
predictions = level_1_clf.predict(stacking_test)
print(predictions)
# The writer expects a 2-D column of labels; reshape(-1, 1) infers the row count.
enumerate_and_write_predictions(predictions.reshape(-1, 1), 'stacking.csv')