In [1]:
import os
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsRegressor
import lightgbm as lgb
from collections import Counter
import joblib
import keras
from keras.models import Model, Sequential
from keras.layers import InputLayer, Dense, Dropout, BatchNormalization
import tflearn
from tqdm import tqdm_notebook
In [54]:
DATA_DIR = "../../hcdr_data/"
Y_train = pd.read_csv(DATA_DIR + 'application_train.csv', usecols=['TARGET'])['TARGET'].values
test_idx = pd.read_csv(DATA_DIR + 'application_test.csv', usecols=['SK_ID_CURR'])
In [55]:
FOLDS_PATH = './folds.pkl'
folds_idx = joblib.load(FOLDS_PATH)
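The CV folds are loaded from a pre-computed pickle rather than built here. A minimal sketch of how such a `folds.pkl` could be produced is below (the later cells imply 10 folds; the `shuffle`/`random_state` settings are assumptions, so the snippet is left commented to avoid overwriting the real split):
In [ ]:
# Hypothetical reconstruction of folds.pkl (assumption) -- not the original split code.
# folds_idx is expected to be a list of (train_idx, valid_idx) index-array pairs.
# skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
# folds_idx = list(skf.split(np.zeros(len(Y_train)), Y_train))
# joblib.dump(folds_idx, FOLDS_PATH)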
In [ ]:
!ls ../blending/
In [57]:
def is_good_file(f):
    # Placeholder filter: accept every prediction file found in ../blending/.
    return True
In [111]:
# blend_files_names = !ls ../blending/*
# blend_files_names = [f
#                      for f in blend_files_names
#                      if is_good_file(f)]
blend_files_names = [
    '../.....pkl',
    ...
]
blend_files = [joblib.load(f)
               for f in blend_files_names]
In [112]:
len(blend_files)
Out[112]:
In [ ]:
# Keep only files with full-length OOF train predictions and 10 per-fold test predictions.
good_names = []
good_files = []
for fname, blend_file in zip(blend_files_names, blend_files):
    print(fname)
    try:
        assert len(blend_file['train']) == 307511
        assert len(blend_file['test']) == 10
        for i in range(10):
            assert len(blend_file['test'][i]) == 48744
        blend_file['train'] = blend_file['train'].ravel()
        for i in range(10):
            blend_file['test'][i] = blend_file['test'][i].ravel()
    except Exception:
        print("bad file -- {}".format(fname))
        continue
    good_names.append(fname)
    good_files.append(blend_file)
blend_files_names = good_names
blend_files = good_files
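Each blend file is expected to hold full-length out-of-fold predictions for the 307511 training rows and a list of 10 per-fold prediction vectors for the 48744 test rows. A minimal sketch of a writer for this schema, assuming the base-model notebooks save their predictions through a helper like this (hypothetical; those notebooks are not part of this file):
In [ ]:
# Hypothetical writer matching the schema checked above (assumption).
def save_blend_file(path, oof_train_preds, per_fold_test_preds):
    assert len(oof_train_preds) == 307511      # one OOF prediction per training row
    assert len(per_fold_test_preds) == 10      # one test prediction vector per fold
    joblib.dump({'train': np.asarray(oof_train_preds),
                 'test': [np.asarray(p) for p in per_fold_test_preds]},
                path)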
In [114]:
X_train = np.concatenate([blend_file['train'][:, np.newaxis]
                          for blend_file in blend_files], axis=-1)
# +
# [sp.stats.rankdata(blend_file['train'])[:, np.newaxis] / len(blend_file['train'])
#  for blend_file in blend_files]
X_test_list = [np.concatenate([blend_file['test'][i][:, np.newaxis]
                               for blend_file in blend_files], axis=-1)
               for i in range(10)]
# +
# [sp.stats.rankdata(blend_file['test'][i])[:, np.newaxis] / len(blend_file['test'][i])
#  for blend_file in blend_files]
In [115]:
# train_df = pd.read_csv('../../hcdr_data/artgor_ann/train_processed_new.csv')
# test_df = pd.read_csv('../../hcdr_data/artgor_ann/test_processed_new.csv')
# train_df.replace([np.inf, -np.inf], -1, inplace=True)
# test_df.replace([np.inf, -np.inf], -1, inplace=True)
# train_df.fillna(-1, inplace=True)
# test_df.fillna(-1, inplace=True)
# for i in range(len(blend_files)):
# train_df['pred_{}'.format(i)] = X_train[:, i]
# features = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
In [116]:
# class MyModel:
# def fit(self, X_train, Y_train):
# n, m = X_train.shape
# w0 = np.random.normal(size=m)
# def fun(w):
# return -roc_auc_score(Y_train, X_train @ w)
# w = sp.optimize.minimize(fun, w0)
# self.w = w['x']
# def predict(self, X, y=None):
# return X @ self.w
# def predict_proba(self, X, y=None):
# return X @ self.w
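For reference, a runnable version of the commented-out weight-optimizing meta-model above. Because ROC AUC depends only on the ordering of scores, the objective is piecewise constant in the weights, so a derivative-free method such as Nelder-Mead is assumed here (a sketch; the stacker actually used below is LogisticRegression):
In [ ]:
# Sketch of the commented-out AUC-weight meta-model, made runnable (assumption:
# Nelder-Mead, since -AUC is non-smooth and gradient-based minimizers stall on it).
from scipy.optimize import minimize

class AucWeightModel:
    def fit(self, X, y):
        n, m = X.shape
        w0 = np.random.normal(size=m)
        def fun(w):
            return -roc_auc_score(y, X @ w)
        self.w = minimize(fun, w0, method='Nelder-Mead').x
        return self

    def predict(self, X, y=None):
        return X @ self.w

    def predict_proba(self, X, y=None):
        # Scores are not calibrated probabilities; AUC only needs their ranks.
        return X @ self.w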
In [117]:
# Per base model and per fold: replace validation-fold predictions with their
# normalized ranks (rounded to 2 decimals), then map that fold's test predictions
# onto the same rank scale with a 1-NN regressor fitted on the raw validation scores.
for i in tqdm_notebook(range(len(blend_files_names))):
    for n_fold, (train_idx, valid_idx) in enumerate(folds_idx):
        ranked_valid = sp.stats.rankdata(X_train[valid_idx, i])
        ranked_valid -= ranked_valid.min()
        ranked_valid /= ranked_valid.max()
        ranked_valid = np.round(ranked_valid, decimals=2)
        rg = KNeighborsRegressor(n_neighbors=1)
        rg.fit(X_train[valid_idx, i][:, np.newaxis], ranked_valid)
        X_train[valid_idx, i] = ranked_valid
        X_test_list[n_fold][:, i] = rg.predict(X_test_list[n_fold][:, i][:, np.newaxis])
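The effect of this transform on one column can be checked on toy data: validation predictions are replaced by their min-max-scaled ranks, and each test prediction is snapped to the rank of its nearest validation score via a 1-NN regressor (a self-contained illustration, not part of the pipeline; the cell above additionally rounds the ranks to two decimals):
In [ ]:
# Toy illustration of the per-fold rank normalization above (self-contained check).
_valid = np.array([0.10, 0.40, 0.20, 0.90])
_ranks = sp.stats.rankdata(_valid)                      # [1., 3., 2., 4.]
_ranks = (_ranks - _ranks.min()) / (_ranks - _ranks.min()).max()
_knn = KNeighborsRegressor(n_neighbors=1).fit(_valid[:, np.newaxis], _ranks)
_knn.predict(np.array([[0.12], [0.85]]))                # -> array([0., 1.])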
In [118]:
X_train
Out[118]:
In [119]:
def get_model():
    # model = Sequential()
    # model.add(InputLayer(input_shape=(X_train.shape[1],)))
    # for i in range(10):
    #     model.add(Dense(32, activation='elu'))
    # model.add(Dense(1, activation='sigmoid'))
    # model.compile('adam', 'binary_crossentropy')
    return LogisticRegression()
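The commented lines above sketch a neural-network alternative to the logistic-regression stacker. An uncommented version, assuming the Keras imports from the first cell (a sketch; the submission below is produced with LogisticRegression):
In [ ]:
# Uncommented MLP variant of get_model (assumption: not the model used for the
# final submission -- that is LogisticRegression).
def get_keras_model(input_dim):
    model = Sequential()
    model.add(InputLayer(input_shape=(input_dim,)))
    for _ in range(10):
        model.add(Dense(32, activation='elu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile('adam', 'binary_crossentropy')
    return model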
In [120]:
clf = get_model()
In [121]:
# Arrays to store out-of-fold predictions, per-fold test predictions and AUC scores
oof_preds = np.zeros(len(Y_train))
test_preds = []
final_preds = np.zeros(len(test_idx))
auc_scores = []
for n_fold, (train_idx, valid_idx) in enumerate(folds_idx):
    train_x, train_y = X_train[train_idx], Y_train[train_idx]
    valid_x, valid_y = X_train[valid_idx], Y_train[valid_idx]
    clf = get_model()
    # LightGBM variant:
    # clf.fit(train_x, train_y,
    #         eval_set=[(train_x, train_y), (valid_x, valid_y)],
    #         eval_metric='auc', verbose=100, early_stopping_rounds=100)
    # oof_preds[valid_idx] = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]
    # y_pred = clf.predict_proba(X_test_list[n_fold], num_iteration=clf.best_iteration_)[:, 1]
    # Keras variant:
    # clf.fit(train_x, train_y, batch_size=128, epochs=10, validation_data=(valid_x, valid_y))
    # oof_preds[valid_idx] = clf.predict_proba(valid_x).ravel()
    # y_pred = clf.predict_proba(X_test_list[n_fold]).ravel()
    clf.fit(train_x, train_y)
    oof_preds[valid_idx] = clf.predict_proba(valid_x)[:, 1]
    y_pred = clf.predict_proba(X_test_list[n_fold])[:, 1]
    final_preds += pd.Series(y_pred).rank().values  # rank-average test predictions across folds
    test_preds.append(y_pred)
    auc_scores.append(roc_auc_score(valid_y, oof_preds[valid_idx]))
    print('Fold %2d AUC : %.6f' % (n_fold + 1, auc_scores[-1]))
print("\n", np.mean(auc_scores), np.std(auc_scores), roc_auc_score(Y_train, oof_preds))
final_preds /= final_preds.max()
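As a quick sanity check (not part of the original pipeline), the stacker's out-of-fold AUC can be compared with the plain mean of the rank-normalized base predictions:
In [ ]:
# Baseline for comparison (assumption/sanity check): AUC of the unweighted mean
# of the rank-normalized base-model predictions.
roc_auc_score(Y_train, X_train.mean(axis=1))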
In [122]:
sub = pd.DataFrame({'SK_ID_CURR': test_idx['SK_ID_CURR'],
                    'TARGET': final_preds})
sub.to_csv('../submit/blending_end.csv', index=False)
In [17]:
import scipy as sp
import matplotlib.pyplot as plt
%matplotlib inline
In [18]:
train_preds = list(X_train.T) + [oof_preds]
test_preds = [sum(sp.stats.rankdata(X_test_list[i][:, j])
                  for i in range(10))
              for j in range(len(blend_files))] + [final_preds]
In [ ]:
train_corr = np.zeros((len(train_preds), len(train_preds)))
test_corr = np.zeros_like(train_corr)
In [ ]:
for i in range(len(train_preds)):
    for j in range(len(train_preds)):
        train_corr[i, j] = sp.stats.spearmanr(train_preds[i], train_preds[j]).correlation
        test_corr[i, j] = sp.stats.spearmanr(test_preds[i], test_preds[j]).correlation
In [ ]:
names = [f.split('/')[-1][:-4] for f in blend_files_names] + ['blend']
In [ ]:
import itertools


def plot_corr_matrix(cm_train, cm_test, names,
                     cmap=plt.cm.Blues, savefig=None):
    def plot_cm(ax, cm, title, draw_y=True):
        ax.imshow(cm, interpolation='nearest', cmap=cmap)
        ax.set_title(title)
        tick_marks = np.arange(len(names))
        ax.set_xticks(tick_marks)
        ax.set_xticklabels(names, rotation=80)
        if draw_y:
            ax.set_yticks(tick_marks)
            ax.set_yticklabels(names)
        fmt = '.2f'
        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            ax.text(j, i, format(cm[i, j], fmt),
                    horizontalalignment="center",
                    color="white" if cm[i, j] > thresh else "black")

    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(30, 15))
    plot_cm(ax1, cm_train, title='train')
    plot_cm(ax2, cm_test, title='test', draw_y=False)
    plt.subplots_adjust(wspace=0.05)
    if savefig is not None:
        plt.savefig(savefig)
In [ ]:
plot_corr_matrix(train_corr, test_corr, names, savefig='corr.svg')
In [ ]:
from IPython.display import FileLink
FileLink('./corr.svg')
In [ ]: