In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import time
import xgboost as xgb
import lightgbm as lgb
# import category_encoders as cat_ed
import gc, mlcrate, glob

# from gplearn.genetic import SymbolicTransformer, SymbolicClassifier
from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, ExtraTreesRegressor
from IPython.display import display

from catboost import CatBoostClassifier
from scipy.cluster import hierarchy as hc
from collections import Counter

from sklearn import metrics
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import  roc_auc_score, log_loss
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA, TruncatedSVD, FastICA, FactorAnalysis
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.cluster import KMeans

from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

# will ignore all warning from sklearn, seaborn etc..
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn

pd.option_context("display.max_rows", 1000);
pd.option_context("display.max_columns", 1000);

In [3]:
PATH = os.getcwd()
PATH


Out[3]:
'D:\\Github\\fastai\\courses\\ml1\\AV_WNS'

In [4]:
df_raw = pd.read_csv(f'{PATH}\\train.csv', low_memory=False)
df_test = pd.read_csv(f'{PATH}\\test.csv', low_memory=False)

df_raw['previous_year_rating'].fillna(0, inplace=True)
df_test['previous_year_rating'].fillna(0, inplace=True)

df_raw['education'].replace('\'s', '', regex=True, inplace=True)
df_test['education'].replace('\'s', '', regex=True, inplace=True)

df_raw.drop('employee_id', axis=1, inplace=True)
df_test.drop('employee_id', axis=1, inplace=True)

In [5]:
for c in df_raw.columns:
    n = df_raw[c].nunique()
    print(c)
    if n <= 8:
        print(n, sorted(df_raw[c].value_counts().to_dict().items()))
    else:
        print(n)
    print(120 * '-')


department
9
------------------------------------------------------------------------------------------------------------------------
region
34
------------------------------------------------------------------------------------------------------------------------
education
3 [('Bachelor', 36669), ('Below Secondary', 805), ('Master & above', 14925)]
------------------------------------------------------------------------------------------------------------------------
gender
2 [('f', 16312), ('m', 38496)]
------------------------------------------------------------------------------------------------------------------------
recruitment_channel
3 [('other', 30446), ('referred', 1142), ('sourcing', 23220)]
------------------------------------------------------------------------------------------------------------------------
no_of_trainings
10
------------------------------------------------------------------------------------------------------------------------
age
41
------------------------------------------------------------------------------------------------------------------------
previous_year_rating
6 [(0.0, 4124), (1.0, 6223), (2.0, 4225), (3.0, 18618), (4.0, 9877), (5.0, 11741)]
------------------------------------------------------------------------------------------------------------------------
length_of_service
35
------------------------------------------------------------------------------------------------------------------------
KPIs_met >80%
2 [(0, 35517), (1, 19291)]
------------------------------------------------------------------------------------------------------------------------
awards_won?
2 [(0, 53538), (1, 1270)]
------------------------------------------------------------------------------------------------------------------------
avg_training_score
61
------------------------------------------------------------------------------------------------------------------------
is_promoted
2 [(0, 50140), (1, 4668)]
------------------------------------------------------------------------------------------------------------------------

In [6]:
df_raw.shape, df_test.shape


Out[6]:
((54808, 13), (23490, 12))

In [7]:
new_cols = ['department', 'region', 'education', 'gender',
       'recruitment_channel', 'no_of_trainings', 'age', 'previous_year_rating',
       'length_of_service', 'KPIs_met_more_than_80_percent', 'awards_won_bool',
       'avg_training_score', 'is_promoted']
#re-naming them
df_raw.columns = new_cols
df_test.columns = new_cols[:-1]

cat_cols = list(df_raw.select_dtypes(include=['object']).columns)
num_cols = list(df_raw.select_dtypes(exclude=['object']).columns)

In [8]:
df_raw_cat, df_test_cat = df_raw[cat_cols], df_test[cat_cols]

In [9]:
df_raw.get_ftype_counts()


Out[9]:
object:dense     5
int64:dense      7
float64:dense    1
dtype: int64

In [10]:
bins = [0., 5., 10., 15., 20., 25., 30., 35., 40]
labels = [i+1 for i in range(len(bins) - 1)]
bin_cols = ['length_of_service']
for col in bin_cols:    
    df_raw[f'bin_{col}'.format(col)]  = pd.cut(df_raw[col] , bins, labels = labels)
    df_test[f'bin_{col}'.format(col)] = pd.cut(df_test[col], bins, labels = labels)

bins = [10, 20., 25., 30., 35., 40., 45., 50., 55., 60., 70]
labels = [i+1 for i in range(len(bins) - 1)]
bin_cols = ['age']
for col in bin_cols:    
    df_raw[f'bin_{col}'.format(col)]  = pd.cut(df_raw[col] ,bins,labels = labels)
    df_test[f'bin_{col}'.format(col)] = pd.cut(df_test[col],bins,labels = labels)

bins = [30, 39., 44., 54., 66., 75., 80., 85., 90., 100]
labels = [i+1 for i in range(len(bins) - 1)]
bin_cols = ['avg_training_score']
for col in bin_cols:    
    df_raw[f'bin_{col}'.format(col)]  = pd.cut(df_raw[col] ,bins,labels = labels)
    df_test[f'bin_{col}'.format(col)] = pd.cut(df_test[col],bins,labels = labels)

df_raw['promotion_chance'] = 'low'
my_query = df_raw.query('avg_training_score>=90').index
df_raw.iloc[my_query, -1] = 'very_high'
my_query = df_raw.query('avg_training_score>=75 and avg_training_score<90').index
df_raw.iloc[my_query, -1] = 'high'
my_query = df_raw.query('avg_training_score>=65 and avg_training_score<75').index
df_raw.iloc[my_query, -1] = 'medium'
my_query = df_raw.query('avg_training_score>=53 and avg_training_score<65').index
df_raw.iloc[my_query, -1] = 'low_medium'

df_test['promotion_chance'] = 'low'
my_query = df_test.query('avg_training_score>=90').index
df_test.iloc[my_query, -1] = 'very_high'
my_query = df_test.query('avg_training_score>=75 and avg_training_score<90').index
df_test.iloc[my_query, -1] = 'high'
my_query = df_test.query('avg_training_score>=65 and avg_training_score<75').index
df_test.iloc[my_query, -1] = 'medium'
my_query = df_test.query('avg_training_score>=53 and avg_training_score<65').index
df_test.iloc[my_query, -1] = 'low_medium'

#######################################################################################

df_raw['new_employee'] = np.zeros(df_raw.shape[0])
my_query = df_raw.query('previous_year_rating == 0').index
df_raw.iloc[my_query, -1] = 1

df_test['new_employee'] = np.zeros(df_test.shape[0])
my_query = df_test.query('previous_year_rating == 0').index
df_test.iloc[my_query, -1] = 1

df_raw['joining_age'] = df_raw['age'] - df_raw['length_of_service']
df_test['joining_age'] = df_test['age'] - df_test['length_of_service']

df_raw['main_training_received_for_promo'] = np.zeros(df_raw.shape[0])
my_query = df_raw.query('no_of_trainings <=2').index
df_raw.iloc[my_query, -1] = 1

df_test['main_training_received_for_promo'] = np.zeros(df_test.shape[0])
my_query = df_test.query('no_of_trainings <=2').index
df_test.iloc[my_query, -1] = 1

df_raw['region'].replace('region_', '', True, None, True)
df_test['region'].replace('region_', '', True, None, True)

df_raw['major_promo_region'] = np.zeros(df_raw.shape[0])
my_query = df_raw.query('region == \'2\' or region == \'22\' or region ==\'7\' or region == \'4\' or region == \'13\' or region ==\'15\'').index
df_raw.iloc[my_query, -1] = 1

df_test['major_promo_region'] = np.zeros(df_test.shape[0])
my_query = df_test.query('region == \'2\' or region == \'22\' or region ==\'7\' or region == \'4\' or region == \'13\' or region ==\'15\'').index
df_test.iloc[my_query, -1] = 1

df_raw['trainings_less_2'] = np.zeros(df_raw.shape[0])
my_query = df_raw.query('no_of_trainings <=2').index
df_raw.iloc[my_query, -1] = 1

df_test['trainings_less_2'] = np.zeros(df_test.shape[0])
my_query = df_test.query('no_of_trainings <=2').index
df_test.iloc[my_query, -1] = 1

def map_(regs, age):
    d = {}
    for i,j in zip(regs, age):
        d[i] = j
    return d

xyz = df_raw.groupby('region').mean().sort_values(by='region')[['age', 'joining_age', 'previous_year_rating', 'length_of_service', 'avg_training_score']]
count = Counter(df_raw['region'])

regs                   = xyz.reset_index()['region'].values
age                    = xyz.reset_index()['age'].values
joining_age            = xyz.reset_index()['joining_age'].values
previous_year_rating   = xyz.reset_index()['previous_year_rating'].values
length_of_service      = xyz.reset_index()['length_of_service'].values 
avg_training_score     = xyz.reset_index()['avg_training_score'].values

df_raw['reg_count'] = df_raw['region'].map(count)
d = map_(regs, age)
df_raw['mean_age_per_region']   = df_raw['region'].map(d)
d = map_(regs, joining_age)
df_raw['mean_joining_age_per_region']   = df_raw['region'].map(d)
d = map_(regs, previous_year_rating)
df_raw['mean_previous_year_rating_per_region']   = df_raw['region'].map(d)
d = map_(regs, avg_training_score)
df_raw['mean_avg_training_score_per_region']   = df_raw['region'].map(d)
d = map_(regs, length_of_service)
df_raw['mean_length_of_service_per_region']   = df_raw['region'].map(d)

xyz = df_test.groupby('region').mean().sort_values(by='region')[['age', 'joining_age', 'previous_year_rating', 'length_of_service', 'avg_training_score']]
count = Counter(df_test['region'])

regs                   = xyz.reset_index()['region'].values
age                    = xyz.reset_index()['age'].values
joining_age            = xyz.reset_index()['joining_age'].values
previous_year_rating   = xyz.reset_index()['previous_year_rating'].values
length_of_service      = xyz.reset_index()['length_of_service'].values 
avg_training_score     = xyz.reset_index()['avg_training_score'].values

df_test['reg_count'] = df_test['region'].map(count)
d = map_(regs, age)
df_test['mean_age_per_region']   = df_test['region'].map(d)
d = map_(regs, joining_age)
df_test['mean_joining_age_per_region']   = df_test['region'].map(d)
d = map_(regs, previous_year_rating)
df_test['mean_previous_year_rating_per_region']   = df_test['region'].map(d)
d = map_(regs, avg_training_score)
df_test['mean_avg_training_score_per_region']   = df_test['region'].map(d)
d = map_(regs, length_of_service)
df_test['mean_length_of_service_per_region']   = df_test['region'].map(d)
####################################################################################

del d, count, regs, joining_age, previous_year_rating, length_of_service, avg_training_score
gc.collect()


Out[10]:
244

In [11]:
df_raw.shape, df_test.shape


Out[11]:
((54808, 28), (23490, 27))

In [12]:
df_raw.drop(cat_cols, axis=1, inplace=True)
df_test.drop(cat_cols, axis=1, inplace=True)

In [13]:
df_raw_cat['dept_rec'] = 'in ' + df_raw_cat['department'] + ' via ' + df_raw_cat['recruitment_channel']
df_test_cat['dept_rec'] = 'in ' + df_test_cat['department'] + ' via ' + df_test_cat['recruitment_channel']

df_raw_cat['gen_edu'] =  df_raw_cat['gender'] + ' has ' + df_raw_cat['education']
df_test_cat['gen_edu'] = df_test_cat['gender'] + ' has ' + df_test_cat['education']

In [14]:
idx_split = df_raw_cat.shape[0]
df_raw_cat.head(1)


Out[14]:
department region education gender recruitment_channel dept_rec gen_edu
0 Sales & Marketing region_7 Master & above f sourcing in Sales & Marketing via sourcing f has Master & above

In [15]:
target = df_raw.is_promoted

In [16]:
full_df = pd.concat((df_raw_cat, df_test_cat), axis = 0, ignore_index=True)

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
full_sites = full_df[:].astype('str')
lst = full_sites.as_matrix().tolist()
flat_list = [' '.join(sublist) for sublist in lst]
vect = TfidfVectorizer(ngram_range=(1,2), max_features=500,analyzer='char_wb')
tfidf_matrix = vect.fit_transform(flat_list)

X_train_tf = tfidf_matrix[:idx_split]
X_test_tf  = tfidf_matrix[idx_split:]

X_train_tf.shape, X_test_tf.shape


Out[17]:
((54808, 194), (23490, 194))

In [18]:
%%time
# Apply SVD, I chose 120 components. 120-200 components are good enough for SVM model.
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=120)
svd.fit(X_train_tf)
X_train_svd = svd.transform(X_train_tf)
X_test_svd = svd.transform(X_test_tf)
# Scale the data obtained from SVD. Renaming variable to reuse without scaling.
scl = preprocessing.StandardScaler()
scl.fit(X_train_svd)
X_train_svd_scl = scl.transform(X_train_svd)
X_test_svd_scl = scl.transform(X_test_svd)


Wall time: 5.59 s

In [36]:
train_cats(df_raw);
apply_cats(df_test, df_raw);

In [23]:
cat_cols = list(df_raw.select_dtypes(include=['category', 'object']).columns)
num_cols = list(df_raw.select_dtypes(exclude=['object', 'category']).columns)

In [28]:
train_dummies = pd.get_dummies(pd.concat((df_raw_cat,df_raw[cat_cols]),axis=1), drop_first=True)
test_dummies = pd.get_dummies(pd.concat((df_test_cat,df_test[cat_cols]),axis=1), drop_first=True)
#df_raw.drop(cat_cols, axis=1, inplace=True)
#df_test.drop(cat_cols), axis=1, inplace=True)
df_raw.shape, df_test.shape, train_dummies.shape, test_dummies.shape


Out[28]:
((54808, 23), (23490, 22), (54808, 105), (23490, 105))

In [29]:
# df_raw.drop(cat_cols, axis=1, inplace=True)
# df_test.drop(cat_cols, axis=1, inplace=True)

In [36]:
X_train_tf.shape, df_raw.shape, train_dummies.shape


Out[36]:
((54808, 194), (54808, 18), (54808, 105))

In [40]:
X_test_tf.shape, df_test.shape, test_dummies.shape


Out[40]:
((23490, 194), (23490, 18), (23490, 105))

In [53]:
train_all, test_all = np.hstack([df_raw.values, train_dummies.values, X_train_tf.todense()]),np.hstack([df_test.values, test_dummies.values, X_test_tf.todense()])
train_all.shape, test_all.shape


Out[53]:
((54808, 317), (23490, 317))

In [65]:
train_all, test_all = pd.concat([df_raw, train_dummies], axis=1), pd.concat([df_test, test_dummies], axis=1)

In [102]:
train_all, target, test_all = np.save('train_all_better.npy', train_all), np.save('target.npy',target), np.save('test_all_better.npy',test_all)

In [4]:
train_all, target, test_all = np.load('train_all_better.npy'), np.load('target.npy'), np.load('test_all_better.npy')

In [5]:
def make_submission(probs):
    sample = pd.read_csv(f'{PATH}\\sample_submission.csv')
    submit = sample.copy()
    submit['is_promoted'] = probs
    return submit

In [6]:
np.count_nonzero(target), target.shape[0]- 4668


Out[6]:
(4668, 50140)

In [7]:
X_train, X_valid, y_train, y_valid = train_test_split(train_all, target, test_size = .2, stratify = target)

In [8]:
%%time
from sklearn.metrics import accuracy_score, roc_curve, confusion_matrix, auc, f1_score
clf_lr = LogisticRegression (random_state = 42,n_jobs=-1,max_iter=1000)
clf_lr.fit (X_train, y_train)
preds_lr = clf_lr.predict_proba (X_valid) [:, 1]
print ('Train test split LogisticRegression score:% s ROC AUC'% round (roc_auc_score (y_valid, preds_lr), 4))
print('F1 Score', f1_score(y_valid, (preds_lr>.61)*1))
print('F1 Score', f1_score(y_valid, (preds_lr>.7)*1))


Train test split LogisticRegression score:0.8535 ROC AUC
F1 Score 0.294964028777
F1 Score 0.267281105991
Wall time: 4.3 s

In [22]:
def show_confusion_matrix ( y_true ,  y_pred ,  title  =  'Confusion matrix' ): 
    table = confusion_matrix ( y_true ,  y_pred )#.values maybe
    fig , ax  =  plt . subplots ( frameon = False ) 
    fig.set_size_inches ( 4 ,  3 ) 
    fig.suptitle (title ,  fontsize  =  20 ) 
    ax.axis( 'off' ) 
    ax.xaxis.set_visible ( False )  
    ax.yaxis.set_visible ( False )
    the_table  =  ax . table ( cellText = table , 
        colWidths = [ 0.5 ] * len ([ 0 ,  1 ]), 
        rowLabels = ['True 0','True 1'],colLabels = [ 'Predicted 0' ,  'Predicted 1' ], 
        cellLoc='center',rowLoc='center',loc="center")
    the_table.set_fontsize(34) 
    the_table.scale(1,4)

In [23]:
show_confusion_matrix(y_valid, clf_lr.predict( X_valid ))



In [27]:
def get_auc_lr_valid(X, y, C=1.0, seed=17, ratio = 0.8):
    # Split the data into the training and validation sets
    idx = int(round(X.shape[0] * ratio))
    # Classifier training
    lr = LogisticRegression(C=C, random_state=seed,max_iter=8000,n_jobs=-1, class_weight='balanced').fit(X[:idx, :], y[:idx])
    # Prediction for validation set
    y_pred = lr.predict(X[idx:, :])
    # Calculate the quality
    score = f1_score(y[idx:], y_pred)
    return score

In [28]:
%%time
from tqdm import tqdm_notebook
# List of possible C-values
Cs = np.logspace(-1, 3, 20)

scores = []

for C in tqdm_notebook(Cs):
     scores.append(get_auc_lr_valid(X_train, y_train, C=C))


Wall time: 2min 42s

In [29]:
max(scores), Cs[np.argmax(scores)]


Out[29]:
(0.37186839722306064, 12.742749857031335)

In [30]:
final_model = LogisticRegression(random_state = 17,max_iter=8000, C = Cs[np.argmax(scores)],n_jobs=-1) 
final_model.fit(X_train,y_train) 
print('F1 on the test sample: {} '.format(round(f1_score(y_valid,final_model.predict(X_valid)),4))) 
show_confusion_matrix(y_valid, final_model.predict(X_valid))


F1 on the test sample: 0.409 

In [8]:
def runXGB(train_X, train_y, test_X, test_y=None):
        params = {}
        params['booster'] = 'gbtree'
        params['tree_method'] = 'gpu_hist'
        params["objective"] = "binary:logistic"
        params['eval_metric'] = 'aucpr'
        params["eta"] = 0.05 #0.03
        params["subsample"] = .8
        params["silent"] = 0
        params['verbose'] = 2
        params["max_depth"] = 9
        params["seed"] = 1
        params["max_delta_step"] = 4
        params['scale_pos_weight'] =  50140/4668
        params["gamma"] = 0.6 #.5 #.1 #.2
        params['colsample_bytree'] = 0.75
        nrounds = 700 #3600 #2000 #4000
        plst = list(params.items())

        xgtrain = xgb.DMatrix(train_X, label=train_y)
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, nrounds)
        pred_test_y = model.predict(xgtest)
        return pred_test_y

In [81]:
val_preds = runXGB(X_train, y_train, X_valid,)

In [82]:
val_preds, max(val_preds)


Out[82]:
(array([ 0.18268,  0.00008,  0.54749, ...,  0.00009,  0.07348,  0.00283], dtype=float32),
 0.99999917)

In [36]:
params = {}
params['booster'] = 'gbtree'
#params['tree_method'] = 'gpu_hist'
params["objective"] = "binary:logistic"
params['eval_metric'] = 'auc'
params["eta"] = 0.05
params["subsample"] = .7
params["silent"] = 0
params['verbose'] = 1
params["max_depth"] = 9
params["seed"] = 1
params["max_delta_step"] = 4
params['scale_pos_weight'] =  50140/4668
params["gamma"] = 1. #.5 #.1 #.2
params['colsample_bytree'] = 0.7
params['nrounds'] = 500 #3600 #2000 #4000

In [37]:
model, p_train, p_test = mlcrate.xgb.train_kfold(params, train_all, target, test_all, folds = 5, stratify=target)


[mlcrate] Training 5 stratified XGBoost models on training set (54808, 317) with test set (23490, 317)
[mlcrate] Running fold 0, 43846 train samples, 10962 validation samples
[0]	train-auc:0.885006	valid-auc:0.855153
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[1]	train-auc:0.902799	valid-auc:0.860806
[2]	train-auc:0.911549	valid-auc:0.880358
[3]	train-auc:0.920037	valid-auc:0.889555
[4]	train-auc:0.919806	valid-auc:0.888714
[5]	train-auc:0.923599	valid-auc:0.889547
[6]	train-auc:0.927261	valid-auc:0.891791
[7]	train-auc:0.929901	valid-auc:0.893423
[8]	train-auc:0.932335	valid-auc:0.894141
[9]	train-auc:0.9314	valid-auc:0.893521
[10]	train-auc:0.932615	valid-auc:0.894238
[11]	train-auc:0.934677	valid-auc:0.894965
[12]	train-auc:0.935865	valid-auc:0.895533
[13]	train-auc:0.937747	valid-auc:0.896421
[14]	train-auc:0.937826	valid-auc:0.896624
[15]	train-auc:0.938114	valid-auc:0.897115
[16]	train-auc:0.938481	valid-auc:0.897217
[17]	train-auc:0.939632	valid-auc:0.897744
[18]	train-auc:0.93999	valid-auc:0.897236
[19]	train-auc:0.939972	valid-auc:0.897355
[20]	train-auc:0.940405	valid-auc:0.898018
[21]	train-auc:0.941062	valid-auc:0.898256
[22]	train-auc:0.941018	valid-auc:0.898035
[23]	train-auc:0.941712	valid-auc:0.898261
[24]	train-auc:0.941989	valid-auc:0.898449
[25]	train-auc:0.942952	valid-auc:0.89876
[26]	train-auc:0.943364	valid-auc:0.898721
[27]	train-auc:0.943756	valid-auc:0.898497
[28]	train-auc:0.944174	valid-auc:0.89893
[29]	train-auc:0.944641	valid-auc:0.899131
[30]	train-auc:0.945114	valid-auc:0.898869
[31]	train-auc:0.945346	valid-auc:0.898896
[32]	train-auc:0.945414	valid-auc:0.898826
[33]	train-auc:0.945832	valid-auc:0.898934
[34]	train-auc:0.946043	valid-auc:0.899
[35]	train-auc:0.946782	valid-auc:0.899312
[36]	train-auc:0.947274	valid-auc:0.899546
[37]	train-auc:0.947777	valid-auc:0.899667
[38]	train-auc:0.947889	valid-auc:0.89975
[39]	train-auc:0.947997	valid-auc:0.899686
[40]	train-auc:0.948082	valid-auc:0.899762
[41]	train-auc:0.94864	valid-auc:0.900186
[42]	train-auc:0.949062	valid-auc:0.900266
[43]	train-auc:0.949193	valid-auc:0.900295
[44]	train-auc:0.949251	valid-auc:0.900233
[45]	train-auc:0.949847	valid-auc:0.900808
[46]	train-auc:0.950263	valid-auc:0.901142
[47]	train-auc:0.950369	valid-auc:0.901064
[48]	train-auc:0.950431	valid-auc:0.901192
[49]	train-auc:0.950787	valid-auc:0.90106
[50]	train-auc:0.950923	valid-auc:0.900885
[51]	train-auc:0.95112	valid-auc:0.900861
[52]	train-auc:0.951535	valid-auc:0.900856
[53]	train-auc:0.951723	valid-auc:0.900923
[54]	train-auc:0.952005	valid-auc:0.901156
[55]	train-auc:0.952314	valid-auc:0.900721
[56]	train-auc:0.952769	valid-auc:0.90064
[57]	train-auc:0.952799	valid-auc:0.900951
[58]	train-auc:0.953139	valid-auc:0.900915
[59]	train-auc:0.953342	valid-auc:0.900848
[60]	train-auc:0.954028	valid-auc:0.900876
[61]	train-auc:0.954043	valid-auc:0.900966
[62]	train-auc:0.954311	valid-auc:0.900886
[63]	train-auc:0.954945	valid-auc:0.900862
[64]	train-auc:0.955207	valid-auc:0.900963
[65]	train-auc:0.955599	valid-auc:0.901108
[66]	train-auc:0.956001	valid-auc:0.901105
[67]	train-auc:0.956048	valid-auc:0.901072
[68]	train-auc:0.956198	valid-auc:0.901089
[69]	train-auc:0.95643	valid-auc:0.901111
[70]	train-auc:0.95668	valid-auc:0.901112
[71]	train-auc:0.956894	valid-auc:0.901221
[72]	train-auc:0.957145	valid-auc:0.901122
[73]	train-auc:0.957239	valid-auc:0.900962
[74]	train-auc:0.957385	valid-auc:0.901055
[75]	train-auc:0.957482	valid-auc:0.901038
[76]	train-auc:0.957902	valid-auc:0.901138
[77]	train-auc:0.958255	valid-auc:0.901314
[78]	train-auc:0.958472	valid-auc:0.901229
[79]	train-auc:0.958647	valid-auc:0.901268
[80]	train-auc:0.958699	valid-auc:0.901316
[81]	train-auc:0.959214	valid-auc:0.901315
[82]	train-auc:0.959332	valid-auc:0.901129
[83]	train-auc:0.959648	valid-auc:0.90107
[84]	train-auc:0.95995	valid-auc:0.901294
[85]	train-auc:0.960196	valid-auc:0.901292
[86]	train-auc:0.960435	valid-auc:0.90123
[87]	train-auc:0.960852	valid-auc:0.901202
[88]	train-auc:0.961286	valid-auc:0.901269
[89]	train-auc:0.961482	valid-auc:0.901262
[90]	train-auc:0.961541	valid-auc:0.901225
[91]	train-auc:0.961762	valid-auc:0.901367
[92]	train-auc:0.962077	valid-auc:0.901489
[93]	train-auc:0.962108	valid-auc:0.901388
[94]	train-auc:0.962267	valid-auc:0.901501
[95]	train-auc:0.962457	valid-auc:0.9017
[96]	train-auc:0.962898	valid-auc:0.901587
[97]	train-auc:0.963156	valid-auc:0.901479
[98]	train-auc:0.963344	valid-auc:0.901446
[99]	train-auc:0.963633	valid-auc:0.901329
[100]	train-auc:0.9637	valid-auc:0.901315
[101]	train-auc:0.963932	valid-auc:0.901418
[102]	train-auc:0.96406	valid-auc:0.901419
[103]	train-auc:0.964293	valid-auc:0.9014
[104]	train-auc:0.96452	valid-auc:0.901495
[105]	train-auc:0.964848	valid-auc:0.901367
[106]	train-auc:0.964985	valid-auc:0.901336
[107]	train-auc:0.965355	valid-auc:0.901136
[108]	train-auc:0.965544	valid-auc:0.901116
[109]	train-auc:0.965765	valid-auc:0.901079
[110]	train-auc:0.966029	valid-auc:0.901078
[111]	train-auc:0.966241	valid-auc:0.90099
[112]	train-auc:0.966571	valid-auc:0.900996
[113]	train-auc:0.966633	valid-auc:0.900974
[114]	train-auc:0.966701	valid-auc:0.900942
[115]	train-auc:0.966778	valid-auc:0.900743
[116]	train-auc:0.967106	valid-auc:0.900643
[117]	train-auc:0.967326	valid-auc:0.900667
[118]	train-auc:0.967354	valid-auc:0.900671
[119]	train-auc:0.96745	valid-auc:0.900622
[120]	train-auc:0.967676	valid-auc:0.900604
[121]	train-auc:0.967767	valid-auc:0.90065
[122]	train-auc:0.967993	valid-auc:0.900724
[123]	train-auc:0.968267	valid-auc:0.900766
[124]	train-auc:0.968458	valid-auc:0.900637
[125]	train-auc:0.96864	valid-auc:0.900547
[126]	train-auc:0.968776	valid-auc:0.900651
[127]	train-auc:0.969217	valid-auc:0.900609
[128]	train-auc:0.969553	valid-auc:0.900455
[129]	train-auc:0.969755	valid-auc:0.900574
[130]	train-auc:0.97003	valid-auc:0.900507
[131]	train-auc:0.970209	valid-auc:0.900564
[132]	train-auc:0.970321	valid-auc:0.900545
[133]	train-auc:0.970447	valid-auc:0.900494
[134]	train-auc:0.970446	valid-auc:0.900523
[135]	train-auc:0.970717	valid-auc:0.90058
[136]	train-auc:0.970933	valid-auc:0.900531
[137]	train-auc:0.971058	valid-auc:0.900555
[138]	train-auc:0.971124	valid-auc:0.900444
[139]	train-auc:0.971219	valid-auc:0.900433
[140]	train-auc:0.971338	valid-auc:0.900288
[141]	train-auc:0.971533	valid-auc:0.9004
[142]	train-auc:0.971909	valid-auc:0.900396
[143]	train-auc:0.972078	valid-auc:0.900412
[144]	train-auc:0.972214	valid-auc:0.900474
[145]	train-auc:0.972453	valid-auc:0.900451
Stopping. Best iteration:
[95]	train-auc:0.962457	valid-auc:0.9017

C:\ProgramData\Anaconda3\lib\site-packages\mlcrate\backend.py:7: UserWarning: Timer.format_elapsed() has been deprecated in favour of Timer.fsince() and will be removed soon
  warn(message)
[mlcrate] Finished training fold 0 - took 2m56s - running score 0.9017
[mlcrate] Running fold 1, 43846 train samples, 10962 validation samples
[0]	train-auc:0.884704	valid-auc:0.860456
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[1]	train-auc:0.910908	valid-auc:0.857344
[2]	train-auc:0.915397	valid-auc:0.868895
[3]	train-auc:0.923977	valid-auc:0.879333
[4]	train-auc:0.923746	valid-auc:0.880443
[5]	train-auc:0.926892	valid-auc:0.885048
[6]	train-auc:0.929348	valid-auc:0.884205
[7]	train-auc:0.931457	valid-auc:0.887898
[8]	train-auc:0.933381	valid-auc:0.890711
[9]	train-auc:0.933289	valid-auc:0.890405
[10]	train-auc:0.93435	valid-auc:0.892082
[11]	train-auc:0.935419	valid-auc:0.892201
[12]	train-auc:0.936506	valid-auc:0.893213
[13]	train-auc:0.938204	valid-auc:0.894204
[14]	train-auc:0.93915	valid-auc:0.895271
[15]	train-auc:0.939859	valid-auc:0.896045
[16]	train-auc:0.940077	valid-auc:0.896377
[17]	train-auc:0.941003	valid-auc:0.897013
[18]	train-auc:0.941587	valid-auc:0.897377
[19]	train-auc:0.941735	valid-auc:0.897814
[20]	train-auc:0.942073	valid-auc:0.897734
[21]	train-auc:0.942304	valid-auc:0.898446
[22]	train-auc:0.942356	valid-auc:0.89845
[23]	train-auc:0.942906	valid-auc:0.898398
[24]	train-auc:0.943544	valid-auc:0.899064
[25]	train-auc:0.943848	valid-auc:0.898815
[26]	train-auc:0.944663	valid-auc:0.899085
[27]	train-auc:0.945549	valid-auc:0.899418
[28]	train-auc:0.946121	valid-auc:0.899521
[29]	train-auc:0.946328	valid-auc:0.899609
[30]	train-auc:0.946774	valid-auc:0.899915
[31]	train-auc:0.946914	valid-auc:0.900311
[32]	train-auc:0.94742	valid-auc:0.900683
[33]	train-auc:0.947745	valid-auc:0.900292
[34]	train-auc:0.948012	valid-auc:0.900601
[35]	train-auc:0.948221	valid-auc:0.900853
[36]	train-auc:0.948437	valid-auc:0.900901
[37]	train-auc:0.94883	valid-auc:0.900761
[38]	train-auc:0.949224	valid-auc:0.901051
[39]	train-auc:0.94938	valid-auc:0.90127
[40]	train-auc:0.94965	valid-auc:0.901271
[41]	train-auc:0.949806	valid-auc:0.901202
[42]	train-auc:0.950118	valid-auc:0.901055
[43]	train-auc:0.950384	valid-auc:0.900932
[44]	train-auc:0.95058	valid-auc:0.90095
[45]	train-auc:0.950791	valid-auc:0.900826
[46]	train-auc:0.951077	valid-auc:0.900851
[47]	train-auc:0.951307	valid-auc:0.90086
[48]	train-auc:0.951752	valid-auc:0.900621
[49]	train-auc:0.951989	valid-auc:0.900612
[50]	train-auc:0.952162	valid-auc:0.900798
[51]	train-auc:0.952329	valid-auc:0.900757
[52]	train-auc:0.95254	valid-auc:0.900888
[53]	train-auc:0.952772	valid-auc:0.900983
[54]	train-auc:0.953182	valid-auc:0.900927
[55]	train-auc:0.953559	valid-auc:0.901014
[56]	train-auc:0.953615	valid-auc:0.900768
[57]	train-auc:0.953925	valid-auc:0.900815
[58]	train-auc:0.954322	valid-auc:0.900982
[59]	train-auc:0.954394	valid-auc:0.900895
[60]	train-auc:0.954663	valid-auc:0.900875
[61]	train-auc:0.955016	valid-auc:0.900889
[62]	train-auc:0.955219	valid-auc:0.900799
[63]	train-auc:0.955473	valid-auc:0.900908
[64]	train-auc:0.955704	valid-auc:0.900753
[65]	train-auc:0.955929	valid-auc:0.900697
[66]	train-auc:0.956167	valid-auc:0.900786
[67]	train-auc:0.956258	valid-auc:0.900849
[68]	train-auc:0.956471	valid-auc:0.900878
[69]	train-auc:0.956591	valid-auc:0.900933
[70]	train-auc:0.956669	valid-auc:0.900919
[71]	train-auc:0.956936	valid-auc:0.900824
[72]	train-auc:0.957105	valid-auc:0.900757
[73]	train-auc:0.957357	valid-auc:0.900498
[74]	train-auc:0.957503	valid-auc:0.900549
[75]	train-auc:0.957753	valid-auc:0.900649
[76]	train-auc:0.95792	valid-auc:0.900639
[77]	train-auc:0.958315	valid-auc:0.900492
[78]	train-auc:0.958587	valid-auc:0.900482
[79]	train-auc:0.958857	valid-auc:0.900435
[80]	train-auc:0.958951	valid-auc:0.900575
[81]	train-auc:0.959329	valid-auc:0.900375
[82]	train-auc:0.959554	valid-auc:0.900432
[83]	train-auc:0.959972	valid-auc:0.900415
[84]	train-auc:0.960206	valid-auc:0.900629
[85]	train-auc:0.960543	valid-auc:0.900575
[86]	train-auc:0.960609	valid-auc:0.900751
[87]	train-auc:0.960646	valid-auc:0.900729
[88]	train-auc:0.960807	valid-auc:0.900714
[89]	train-auc:0.960994	valid-auc:0.900716
[90]	train-auc:0.961113	valid-auc:0.900694
Stopping. Best iteration:
[40]	train-auc:0.94965	valid-auc:0.901271

C:\ProgramData\Anaconda3\lib\site-packages\mlcrate\backend.py:7: UserWarning: Timer.format_elapsed() has been deprecated in favour of Timer.fsince() and will be removed soon
  warn(message)
[mlcrate] Finished training fold 1 - took 1m51s - running score 0.9014854999999999
[mlcrate] Running fold 2, 43846 train samples, 10962 validation samples
[0]	train-auc:0.884611	valid-auc:0.848949
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[1]	train-auc:0.90505	valid-auc:0.854606
[2]	train-auc:0.911651	valid-auc:0.869286
[3]	train-auc:0.919515	valid-auc:0.876459
[4]	train-auc:0.919105	valid-auc:0.878314
[5]	train-auc:0.923362	valid-auc:0.882181
[6]	train-auc:0.925136	valid-auc:0.883227
[7]	train-auc:0.927626	valid-auc:0.886899
[8]	train-auc:0.929015	valid-auc:0.888622
[9]	train-auc:0.928772	valid-auc:0.888498
[10]	train-auc:0.929788	valid-auc:0.890126
[11]	train-auc:0.932214	valid-auc:0.892244
[12]	train-auc:0.933662	valid-auc:0.89358
[13]	train-auc:0.935625	valid-auc:0.894808
[14]	train-auc:0.936021	valid-auc:0.895372
[15]	train-auc:0.936897	valid-auc:0.896093
[16]	train-auc:0.937415	valid-auc:0.896065
[17]	train-auc:0.938579	valid-auc:0.896748
[18]	train-auc:0.939062	valid-auc:0.897132
[19]	train-auc:0.939577	valid-auc:0.897557
[20]	train-auc:0.940157	valid-auc:0.898276
[21]	train-auc:0.940405	valid-auc:0.898594
[22]	train-auc:0.940666	valid-auc:0.898271
[23]	train-auc:0.941387	valid-auc:0.897945
[24]	train-auc:0.94157	valid-auc:0.898579
[25]	train-auc:0.94225	valid-auc:0.898982
[26]	train-auc:0.942786	valid-auc:0.899403
[27]	train-auc:0.943667	valid-auc:0.899136
[28]	train-auc:0.944392	valid-auc:0.899436
[29]	train-auc:0.944903	valid-auc:0.899614
[30]	train-auc:0.945213	valid-auc:0.899534
[31]	train-auc:0.945506	valid-auc:0.899894
[32]	train-auc:0.945797	valid-auc:0.900237
[33]	train-auc:0.946206	valid-auc:0.900148
[34]	train-auc:0.946475	valid-auc:0.900088
[35]	train-auc:0.947179	valid-auc:0.899818
[36]	train-auc:0.947471	valid-auc:0.900106
[37]	train-auc:0.947807	valid-auc:0.900259
[38]	train-auc:0.947873	valid-auc:0.900242
[39]	train-auc:0.948403	valid-auc:0.900424
[40]	train-auc:0.948475	valid-auc:0.900527
[41]	train-auc:0.948785	valid-auc:0.900469
[42]	train-auc:0.949209	valid-auc:0.900502
[43]	train-auc:0.949961	valid-auc:0.901051
[44]	train-auc:0.950097	valid-auc:0.900992
[45]	train-auc:0.950299	valid-auc:0.901022
[46]	train-auc:0.950659	valid-auc:0.901218
[47]	train-auc:0.950843	valid-auc:0.90122
[48]	train-auc:0.950991	valid-auc:0.901088
[49]	train-auc:0.951341	valid-auc:0.901019
[50]	train-auc:0.951331	valid-auc:0.900994
[51]	train-auc:0.951427	valid-auc:0.90098
[52]	train-auc:0.951983	valid-auc:0.901192
[53]	train-auc:0.952117	valid-auc:0.901384
[54]	train-auc:0.952457	valid-auc:0.90141
[55]	train-auc:0.95284	valid-auc:0.901502
[56]	train-auc:0.953235	valid-auc:0.901444
[57]	train-auc:0.953406	valid-auc:0.901545
[58]	train-auc:0.953671	valid-auc:0.901564
[59]	train-auc:0.953852	valid-auc:0.90165
[60]	train-auc:0.953923	valid-auc:0.901576
[61]	train-auc:0.954145	valid-auc:0.90153
[62]	train-auc:0.954385	valid-auc:0.901575
[63]	train-auc:0.954479	valid-auc:0.901688
[64]	train-auc:0.954737	valid-auc:0.901533
[65]	train-auc:0.955059	valid-auc:0.901299
[66]	train-auc:0.955356	valid-auc:0.90133
[67]	train-auc:0.955551	valid-auc:0.901242
[68]	train-auc:0.955919	valid-auc:0.901335
[69]	train-auc:0.95634	valid-auc:0.901098
[70]	train-auc:0.956775	valid-auc:0.901056
[71]	train-auc:0.957074	valid-auc:0.901081
[72]	train-auc:0.957393	valid-auc:0.901252
[73]	train-auc:0.957516	valid-auc:0.901051
[74]	train-auc:0.957576	valid-auc:0.901023
[75]	train-auc:0.957879	valid-auc:0.901038
[76]	train-auc:0.958125	valid-auc:0.901007
[77]	train-auc:0.958284	valid-auc:0.900993
[78]	train-auc:0.958588	valid-auc:0.90099
[79]	train-auc:0.958949	valid-auc:0.90091
[80]	train-auc:0.959243	valid-auc:0.900879
[81]	train-auc:0.959784	valid-auc:0.900548
[82]	train-auc:0.960159	valid-auc:0.900749
[83]	train-auc:0.96017	valid-auc:0.90078
[84]	train-auc:0.960294	valid-auc:0.900792
[85]	train-auc:0.960435	valid-auc:0.90072
[86]	train-auc:0.960558	valid-auc:0.900681
[87]	train-auc:0.960952	valid-auc:0.900679
[88]	train-auc:0.961135	valid-auc:0.900512
[89]	train-auc:0.9613	valid-auc:0.900621
[90]	train-auc:0.961384	valid-auc:0.900693
[91]	train-auc:0.961543	valid-auc:0.900574
[92]	train-auc:0.961727	valid-auc:0.900462
[93]	train-auc:0.961751	valid-auc:0.900471
[94]	train-auc:0.961835	valid-auc:0.900559
[95]	train-auc:0.962128	valid-auc:0.900566
[96]	train-auc:0.962216	valid-auc:0.900635
[97]	train-auc:0.962538	valid-auc:0.900652
[98]	train-auc:0.962826	valid-auc:0.900657
[99]	train-auc:0.963028	valid-auc:0.900866
[100]	train-auc:0.963333	valid-auc:0.900892
[101]	train-auc:0.963632	valid-auc:0.900987
[102]	train-auc:0.963701	valid-auc:0.900994
[103]	train-auc:0.964009	valid-auc:0.900862
[104]	train-auc:0.9642	valid-auc:0.900828
[105]	train-auc:0.964535	valid-auc:0.900937
[106]	train-auc:0.964634	valid-auc:0.900829
[107]	train-auc:0.964717	valid-auc:0.90075
[108]	train-auc:0.965028	valid-auc:0.900626
[109]	train-auc:0.965178	valid-auc:0.900467
[110]	train-auc:0.965223	valid-auc:0.900461
[111]	train-auc:0.965346	valid-auc:0.900362
[112]	train-auc:0.965578	valid-auc:0.90026
[113]	train-auc:0.965896	valid-auc:0.90025
Stopping. Best iteration:
[63]	train-auc:0.954479	valid-auc:0.901688

C:\ProgramData\Anaconda3\lib\site-packages\mlcrate\backend.py:7: UserWarning: Timer.format_elapsed() has been deprecated in favour of Timer.fsince() and will be removed soon
  warn(message)
[mlcrate] Finished training fold 2 - took 2m17s - running score 0.9015529999999999
[mlcrate] Running fold 3, 43847 train samples, 10961 validation samples
[0]	train-auc:0.896536	valid-auc:0.882976
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[1]	train-auc:0.911349	valid-auc:0.883518
[2]	train-auc:0.911323	valid-auc:0.87922
[3]	train-auc:0.919847	valid-auc:0.893189
[4]	train-auc:0.9246	valid-auc:0.900108
[5]	train-auc:0.924649	valid-auc:0.899557
[6]	train-auc:0.927869	valid-auc:0.902185
[7]	train-auc:0.929327	valid-auc:0.903277
[8]	train-auc:0.930638	valid-auc:0.903309
[9]	train-auc:0.93115	valid-auc:0.904018
[10]	train-auc:0.931879	valid-auc:0.904097
[11]	train-auc:0.933364	valid-auc:0.904743
[12]	train-auc:0.9349	valid-auc:0.906244
[13]	train-auc:0.9355	valid-auc:0.906024
[14]	train-auc:0.935555	valid-auc:0.906281
[15]	train-auc:0.936751	valid-auc:0.906844
[16]	train-auc:0.936884	valid-auc:0.906823
[17]	train-auc:0.938309	valid-auc:0.90668
[18]	train-auc:0.938843	valid-auc:0.906034
[19]	train-auc:0.939798	valid-auc:0.906677
[20]	train-auc:0.94015	valid-auc:0.907161
[21]	train-auc:0.94131	valid-auc:0.907234
[22]	train-auc:0.941856	valid-auc:0.907259
[23]	train-auc:0.941839	valid-auc:0.907255
[24]	train-auc:0.941975	valid-auc:0.906706
[25]	train-auc:0.942296	valid-auc:0.906869
[26]	train-auc:0.942331	valid-auc:0.906651
[27]	train-auc:0.942843	valid-auc:0.907122
[28]	train-auc:0.943504	valid-auc:0.907525
[29]	train-auc:0.944357	valid-auc:0.907689
[30]	train-auc:0.945005	valid-auc:0.907882
[31]	train-auc:0.945187	valid-auc:0.907876
[32]	train-auc:0.945328	valid-auc:0.907823
[33]	train-auc:0.945996	valid-auc:0.907694
[34]	train-auc:0.946438	valid-auc:0.907896
[35]	train-auc:0.946998	valid-auc:0.908235
[36]	train-auc:0.947719	valid-auc:0.908579
[37]	train-auc:0.948545	valid-auc:0.908839
[38]	train-auc:0.948585	valid-auc:0.908729
[39]	train-auc:0.948784	valid-auc:0.908956
[40]	train-auc:0.948966	valid-auc:0.908956
[41]	train-auc:0.949149	valid-auc:0.909036
[42]	train-auc:0.949606	valid-auc:0.909152
[43]	train-auc:0.950134	valid-auc:0.909311
[44]	train-auc:0.950242	valid-auc:0.909099
[45]	train-auc:0.950724	valid-auc:0.909398
[46]	train-auc:0.950882	valid-auc:0.909405
[47]	train-auc:0.951163	valid-auc:0.909509
[48]	train-auc:0.951523	valid-auc:0.909595
[49]	train-auc:0.951699	valid-auc:0.909667
[50]	train-auc:0.952245	valid-auc:0.909677
[51]	train-auc:0.95287	valid-auc:0.909801
[52]	train-auc:0.95304	valid-auc:0.910078
[53]	train-auc:0.953244	valid-auc:0.910137
[54]	train-auc:0.953524	valid-auc:0.910085
[55]	train-auc:0.953794	valid-auc:0.909879
[56]	train-auc:0.954031	valid-auc:0.909875
[57]	train-auc:0.954159	valid-auc:0.909733
[58]	train-auc:0.95459	valid-auc:0.909535
[59]	train-auc:0.954887	valid-auc:0.909524
[60]	train-auc:0.95502	valid-auc:0.909401
[61]	train-auc:0.955456	valid-auc:0.909312
[62]	train-auc:0.955445	valid-auc:0.909295
[63]	train-auc:0.95566	valid-auc:0.9093
[64]	train-auc:0.956031	valid-auc:0.909392
[65]	train-auc:0.956178	valid-auc:0.90958
[66]	train-auc:0.956326	valid-auc:0.909578
[67]	train-auc:0.956579	valid-auc:0.909706
[68]	train-auc:0.957068	valid-auc:0.909819
[69]	train-auc:0.9572	valid-auc:0.909787
[70]	train-auc:0.95736	valid-auc:0.909781
[71]	train-auc:0.957589	valid-auc:0.909715
[72]	train-auc:0.95766	valid-auc:0.909731
[73]	train-auc:0.958066	valid-auc:0.90972
[74]	train-auc:0.958159	valid-auc:0.909814
[75]	train-auc:0.958525	valid-auc:0.909915
[76]	train-auc:0.958562	valid-auc:0.9098
[77]	train-auc:0.959034	valid-auc:0.909642
[78]	train-auc:0.959187	valid-auc:0.909672
[79]	train-auc:0.959314	valid-auc:0.909657
[80]	train-auc:0.959467	valid-auc:0.909637
[81]	train-auc:0.959829	valid-auc:0.909781
[82]	train-auc:0.960057	valid-auc:0.909813
[83]	train-auc:0.960409	valid-auc:0.909914
[84]	train-auc:0.960698	valid-auc:0.909871
[85]	train-auc:0.960792	valid-auc:0.909873
[86]	train-auc:0.960904	valid-auc:0.909867
[87]	train-auc:0.961043	valid-auc:0.90981
[88]	train-auc:0.961131	valid-auc:0.909773
[89]	train-auc:0.961263	valid-auc:0.909646
[90]	train-auc:0.961448	valid-auc:0.909709
[91]	train-auc:0.961796	valid-auc:0.909588
[92]	train-auc:0.962048	valid-auc:0.909614
[93]	train-auc:0.962196	valid-auc:0.909636
[94]	train-auc:0.96232	valid-auc:0.909529
[95]	train-auc:0.962681	valid-auc:0.909313
[96]	train-auc:0.962856	valid-auc:0.909304
[97]	train-auc:0.963469	valid-auc:0.909376
[98]	train-auc:0.963669	valid-auc:0.90924
[99]	train-auc:0.964127	valid-auc:0.909224
[100]	train-auc:0.964249	valid-auc:0.909167
[101]	train-auc:0.964479	valid-auc:0.909121
[102]	train-auc:0.964647	valid-auc:0.909134
[103]	train-auc:0.964881	valid-auc:0.908993
Stopping. Best iteration:
[53]	train-auc:0.953244	valid-auc:0.910137

C:\ProgramData\Anaconda3\lib\site-packages\mlcrate\backend.py:7: UserWarning: Timer.format_elapsed() has been deprecated in favour of Timer.fsince() and will be removed soon
  warn(message)
[mlcrate] Finished training fold 3 - took 2m03s - running score 0.903699
[mlcrate] Running fold 4, 43847 train samples, 10961 validation samples
[0]	train-auc:0.896671	valid-auc:0.877186
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[1]	train-auc:0.911342	valid-auc:0.882195
[2]	train-auc:0.912539	valid-auc:0.876501
[3]	train-auc:0.921549	valid-auc:0.890471
[4]	train-auc:0.927369	valid-auc:0.894789
[5]	train-auc:0.927013	valid-auc:0.896552
[6]	train-auc:0.928939	valid-auc:0.897634
[7]	train-auc:0.930304	valid-auc:0.897109
[8]	train-auc:0.931109	valid-auc:0.898099
[9]	train-auc:0.93178	valid-auc:0.898536
[10]	train-auc:0.932369	valid-auc:0.898804
[11]	train-auc:0.933644	valid-auc:0.898658
[12]	train-auc:0.936057	valid-auc:0.900698
[13]	train-auc:0.937297	valid-auc:0.900654
[14]	train-auc:0.937033	valid-auc:0.90019
[15]	train-auc:0.937577	valid-auc:0.901024
[16]	train-auc:0.937377	valid-auc:0.900779
[17]	train-auc:0.938622	valid-auc:0.901267
[18]	train-auc:0.938739	valid-auc:0.900852
[19]	train-auc:0.940229	valid-auc:0.90208
[20]	train-auc:0.941199	valid-auc:0.902462
[21]	train-auc:0.94228	valid-auc:0.903006
[22]	train-auc:0.94258	valid-auc:0.903466
[23]	train-auc:0.94278	valid-auc:0.902905
[24]	train-auc:0.942897	valid-auc:0.90312
[25]	train-auc:0.94358	valid-auc:0.903533
[26]	train-auc:0.943547	valid-auc:0.903253
[27]	train-auc:0.943939	valid-auc:0.903638
[28]	train-auc:0.94462	valid-auc:0.904049
[29]	train-auc:0.946057	valid-auc:0.904249
[30]	train-auc:0.946594	valid-auc:0.90419
[31]	train-auc:0.946608	valid-auc:0.903973
[32]	train-auc:0.947228	valid-auc:0.904135
[33]	train-auc:0.947341	valid-auc:0.904189
[34]	train-auc:0.948074	valid-auc:0.904377
[35]	train-auc:0.948266	valid-auc:0.904561
[36]	train-auc:0.948725	valid-auc:0.904392
[37]	train-auc:0.94931	valid-auc:0.904256
[38]	train-auc:0.949322	valid-auc:0.904292
[39]	train-auc:0.949931	valid-auc:0.904532
[40]	train-auc:0.950255	valid-auc:0.904689
[41]	train-auc:0.950585	valid-auc:0.905005
[42]	train-auc:0.950909	valid-auc:0.904679
[43]	train-auc:0.95115	valid-auc:0.904856
[44]	train-auc:0.95154	valid-auc:0.904975
[45]	train-auc:0.951928	valid-auc:0.904816
[46]	train-auc:0.952523	valid-auc:0.905111
[47]	train-auc:0.952711	valid-auc:0.905173
[48]	train-auc:0.95303	valid-auc:0.905168
[49]	train-auc:0.953251	valid-auc:0.905058
[50]	train-auc:0.953555	valid-auc:0.904776
[51]	train-auc:0.953842	valid-auc:0.904704
[52]	train-auc:0.953901	valid-auc:0.905026
[53]	train-auc:0.954212	valid-auc:0.904969
[54]	train-auc:0.954549	valid-auc:0.905001
[55]	train-auc:0.954863	valid-auc:0.905119
[56]	train-auc:0.955239	valid-auc:0.905307
[57]	train-auc:0.955443	valid-auc:0.905312
[58]	train-auc:0.955677	valid-auc:0.905619
[59]	train-auc:0.955706	valid-auc:0.905822
[60]	train-auc:0.956009	valid-auc:0.905806
[61]	train-auc:0.956144	valid-auc:0.905751
[62]	train-auc:0.956346	valid-auc:0.905823
[63]	train-auc:0.956541	valid-auc:0.905946
[64]	train-auc:0.956791	valid-auc:0.905893
[65]	train-auc:0.95723	valid-auc:0.905957
[66]	train-auc:0.957383	valid-auc:0.905891
[67]	train-auc:0.957434	valid-auc:0.905904
[68]	train-auc:0.95794	valid-auc:0.905744
[69]	train-auc:0.958035	valid-auc:0.905799
[70]	train-auc:0.958226	valid-auc:0.9059
[71]	train-auc:0.958661	valid-auc:0.905741
[72]	train-auc:0.958855	valid-auc:0.905726
[73]	train-auc:0.959415	valid-auc:0.905662
[74]	train-auc:0.959886	valid-auc:0.905704
[75]	train-auc:0.960295	valid-auc:0.905669
[76]	train-auc:0.960324	valid-auc:0.905733
[77]	train-auc:0.960544	valid-auc:0.905579
[78]	train-auc:0.960653	valid-auc:0.90553
[79]	train-auc:0.960927	valid-auc:0.905644
[80]	train-auc:0.961166	valid-auc:0.905834
[81]	train-auc:0.961448	valid-auc:0.905827
[82]	train-auc:0.961646	valid-auc:0.906061
[83]	train-auc:0.962034	valid-auc:0.905726
[84]	train-auc:0.962189	valid-auc:0.905656
[85]	train-auc:0.962307	valid-auc:0.90562
[86]	train-auc:0.962536	valid-auc:0.9056
[87]	train-auc:0.962553	valid-auc:0.905602
[88]	train-auc:0.962911	valid-auc:0.905502
[89]	train-auc:0.962998	valid-auc:0.905582
[90]	train-auc:0.963046	valid-auc:0.905631
[91]	train-auc:0.963269	valid-auc:0.905578
[92]	train-auc:0.963362	valid-auc:0.905535
[93]	train-auc:0.963605	valid-auc:0.905502
[94]	train-auc:0.963681	valid-auc:0.90554
[95]	train-auc:0.963861	valid-auc:0.905551
[96]	train-auc:0.964018	valid-auc:0.905554
[97]	train-auc:0.964191	valid-auc:0.905406
[98]	train-auc:0.964462	valid-auc:0.905509
[99]	train-auc:0.96487	valid-auc:0.905471
[100]	train-auc:0.964923	valid-auc:0.905476
[101]	train-auc:0.965086	valid-auc:0.905315
[102]	train-auc:0.965312	valid-auc:0.905305
[103]	train-auc:0.965586	valid-auc:0.905372
[104]	train-auc:0.965749	valid-auc:0.90523
[105]	train-auc:0.96592	valid-auc:0.905241
[106]	train-auc:0.966097	valid-auc:0.905238
[107]	train-auc:0.966536	valid-auc:0.905213
[108]	train-auc:0.966813	valid-auc:0.905076
[109]	train-auc:0.967163	valid-auc:0.904947
[110]	train-auc:0.967204	valid-auc:0.904909
[111]	train-auc:0.967396	valid-auc:0.90497
[112]	train-auc:0.967482	valid-auc:0.905
[113]	train-auc:0.967688	valid-auc:0.904884
[114]	train-auc:0.967756	valid-auc:0.904738
[115]	train-auc:0.967891	valid-auc:0.904713
[116]	train-auc:0.968043	valid-auc:0.904615
[117]	train-auc:0.968389	valid-auc:0.904449
[118]	train-auc:0.968547	valid-auc:0.904503
[119]	train-auc:0.968698	valid-auc:0.904402
[120]	train-auc:0.968979	valid-auc:0.904395
[121]	train-auc:0.968994	valid-auc:0.90444
[122]	train-auc:0.969157	valid-auc:0.904377
[123]	train-auc:0.969357	valid-auc:0.904331
[124]	train-auc:0.969442	valid-auc:0.904419
[125]	train-auc:0.969626	valid-auc:0.904412
[126]	train-auc:0.969682	valid-auc:0.904391
[127]	train-auc:0.969846	valid-auc:0.904307
[128]	train-auc:0.970097	valid-auc:0.904234
[129]	train-auc:0.970285	valid-auc:0.904297
[130]	train-auc:0.970423	valid-auc:0.904254
[131]	train-auc:0.970602	valid-auc:0.904263
[132]	train-auc:0.970688	valid-auc:0.90431
Stopping. Best iteration:
[82]	train-auc:0.961646	valid-auc:0.906061

C:\ProgramData\Anaconda3\lib\site-packages\mlcrate\backend.py:7: UserWarning: Timer.format_elapsed() has been deprecated in favour of Timer.fsince() and will be removed soon
  warn(message)
[mlcrate] Finished training fold 4 - took 2m42s - running score 0.9041714000000001
C:\ProgramData\Anaconda3\lib\site-packages\mlcrate\backend.py:7: UserWarning: Timer.format_elapsed() has been deprecated in favour of Timer.fsince() and will be removed soon
  warn(message)
[mlcrate] Finished training 5 XGBoost models, took 11m54s

In [38]:
p_test, max(p_test)


Out[38]:
(array([ 0.63538,  0.05006,  0.03389, ...,  0.04417,  0.06983,  0.92965], dtype=float32),
 0.97443736)

In [52]:
pd.Series(target).value_counts()


Out[52]:
0    50140
1     4668
dtype: int64

In [57]:
4668/54808*100


Out[57]:
8.517004816815064

In [53]:
p_test.shape


Out[53]:
(23490,)

In [60]:
4668/54808*23490


Out[60]:
2000.6444314698583

In [66]:
pd.Series((p_test>.7)*1).value_counts()


Out[66]:
0    22095
1     1395
dtype: int64

In [46]:
pd.Series((p_test>.75)*1).value_counts()


Out[46]:
0    22660
1      830
dtype: int64

In [50]:
pd.Series((p_test>.8)*1).value_counts()


Out[50]:
0    22847
1      643
dtype: int64

In [51]:
pd.Series((p_test>.85)*1).value_counts()


Out[51]:
0    22906
1      584
dtype: int64

In [63]:
preds = pd.Series((p_test>.61)*1)

In [64]:
make_submission(preds).to_csv('clipped_at_.8.csv', index=False)

In [ ]:


In [ ]:


In [107]:
import joblib
joblib.dump(model[0],'xgb_best_0th')
joblib.dump(model,'xgb_best')


Out[107]:
['xgb_best']

In [88]:
import numpy as np
from sklearn.metrics import matthews_corrcoef

def find_matthews_threshold(p_valid, y_valid, try_all=False, verbose=False):
    p_valid, y_valid = np.array(p_valid), np.array(y_valid)

    best = 0
    best_score = -2
    totry = np.arange(0.3,1,0.01) if try_all is False else np.unique(p_valid)
    for t in totry:
        score = matthews_corrcoef(y_valid, p_valid > t)
        if score > best_score:
            best_score = score
            best = t
    if verbose is True: 
        print('Best score: ', round(best_score, 5), ' @ threshold ', best)

    return best

def best_threshold_submission(p_valid, y_valid, p_test, try_all=False, verbose=False):
    p_test = np.array(p_test)
    thresh = find_matthews_threshold(p_valid, y_valid, try_all, verbose)
    return (p_test > thresh)*1

submission_values = best_threshold_submission(val_preds, y_valid, p_test, True, True)


C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\classification.py:538: RuntimeWarning: invalid value encountered in double_scalars
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
Best score:  0.50454  @ threshold  0.937723

In [89]:
submission_values


Out[89]:
array([0, 0, 0, ..., 0, 0, 1])

In [90]:
pd.Series(submission_values).value_counts()


Out[90]:
0    23023
1      467
dtype: int64

In [96]:
(p_test>.9) *1


Out[96]:
array([0, 0, 0, ..., 0, 0, 1])

In [101]:
make_submission((p_test>.75) *1).to_csv('day_2.csv', index=None)

In [113]:
def get_xgb_imp(xgb, feat_names):
    imp_vals = xgb.get_fscore()
    feats_imp = pd.DataFrame(imp_vals,index=np.arange(2)).T
    feats_imp.iloc[:,0]= feats_imp.index    
    feats_imp.columns=['feature','importance']
    feats_imp.sort_values('importance',inplace=True,ascending=False)
    feats_imp.reset_index(drop=True,inplace=True)
    return feats_imp

feature_importance_df = get_xgb_imp(model[3], feat_names= train_all.columns)

In [114]:
feature_importance_df


Out[114]:
feature importance
0 f18 10876
1 f19 10526
2 f6 10002
3 f22 9186
4 f23 8264
5 f21 8259
6 f20 8240
7 f1 8132
8 f8 8079
9 f3 6557
10 f2 4699
11 f12 3007
12 f16 2915
13 f15 2700
14 f13 2095
15 f0 1815
16 f4 1521
17 f14 1346
18 f17 1307
19 f69 1004
20 f79 958
21 f70 887
22 f27 723
23 f67 694
24 f80 679
25 f91 673
26 f88 632
27 f28 623
28 f78 610
29 f89 590
... ... ...
60 f68 121
61 f65 118
62 f72 113
63 f42 111
64 f52 97
65 f36 95
66 f63 94
67 f54 86
68 f32 81
69 f48 78
70 f39 67
71 f26 62
72 f11 61
73 f56 58
74 f59 56
75 f41 41
76 f93 40
77 f34 38
78 f44 34
79 f86 32
80 f61 31
81 f47 30
82 f73 29
83 f84 29
84 f60 26
85 f53 10
86 f74 9
87 f58 5
88 f57 3
89 f75 2

90 rows × 2 columns


In [121]:
for i,j in enumerate(train_all.columns):
    print('f{}'.format(i), j)


f0 no_of_trainings
f1 age
f2 previous_year_rating
f3 length_of_service
f4 KPIs_met_more_than_80_percent
f5 awards_won_bool
f6 avg_training_score
f7 new_employee
f8 joining_age
f9 main_training_received_for_promo
f10 major_promo_region
f11 trainings_less_2
f12 reg_count
f13 mean_age_per_region
f14 mean_joining_age_per_region
f15 mean_previous_year_rating_per_region
f16 mean_avg_training_score_per_region
f17 mean_length_of_service_per_region
f18 impact_encoded_department
f19 impact_encoded_region
f20 impact_encoded_education
f21 impact_encoded_gender
f22 impact_encoded_recruitment_channel
f23 impact_encoded_promotion_chance
f24 department_Finance
f25 department_HR
f26 department_Legal
f27 department_Operations
f28 department_Procurement
f29 department_R&D
f30 department_Sales & Marketing
f31 department_Technology
f32 region_region_10
f33 region_region_11
f34 region_region_12
f35 region_region_13
f36 region_region_14
f37 region_region_15
f38 region_region_16
f39 region_region_17
f40 region_region_18
f41 region_region_19
f42 region_region_2
f43 region_region_20
f44 region_region_21
f45 region_region_22
f46 region_region_23
f47 region_region_24
f48 region_region_25
f49 region_region_26
f50 region_region_27
f51 region_region_28
f52 region_region_29
f53 region_region_3
f54 region_region_30
f55 region_region_31
f56 region_region_32
f57 region_region_33
f58 region_region_34
f59 region_region_4
f60 region_region_5
f61 region_region_6
f62 region_region_7
f63 region_region_8
f64 region_region_9
f65 education_Below Secondary
f66 education_Master's & above
f67 gender_m
f68 recruitment_channel_referred
f69 recruitment_channel_sourcing
f70 bin_length_of_service_2
f71 bin_length_of_service_3
f72 bin_length_of_service_4
f73 bin_length_of_service_5
f74 bin_length_of_service_6
f75 bin_length_of_service_7
f76 bin_length_of_service_8
f77 bin_age_2
f78 bin_age_3
f79 bin_age_4
f80 bin_age_5
f81 bin_age_6
f82 bin_age_7
f83 bin_age_8
f84 bin_age_9
f85 bin_age_10
f86 bin_avg_training_score_2
f87 bin_avg_training_score_3
f88 bin_avg_training_score_4
f89 bin_avg_training_score_5
f90 bin_avg_training_score_6
f91 bin_avg_training_score_7
f92 bin_avg_training_score_8
f93 bin_avg_training_score_9

In [ ]: