av_student_datafest_2_preprocessing


imports


In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
import time
import xgboost as xgb
import lightgbm as lgb
import category_encoders as cat_ed
import gc, mlcrate, glob

from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from IPython.display import display
from catboost import CatBoostClassifier
from scipy.cluster import hierarchy as hc
from collections import Counter

from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import  roc_auc_score, log_loss
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA, TruncatedSVD, FastICA, FactorAnalysis
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.cluster import KMeans

from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

# will ignore all warning from sklearn, seaborn etc..
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn

pd.option_context("display.max_rows", 20);
pd.option_context("display.max_columns", 100);

In [3]:
PATH = os.getcwd();
PATH


Out[3]:
'D:\\Github\\fastai\\courses\\ml1'

In [167]:
# df_raw_1 = pd.read_csv(f'{PATH}\\AV_Stud_2\\train.csv', low_memory=False)
# df_test_1 = pd.read_csv(f'{PATH}\\AV_Stud_2\\test.csv', low_memory=False)

# df_raw['last_new_job'] = df_raw_1['last_new_job']
# df_test['last_new_job'] = df_test_1['last_new_job']

# del df_raw_1, df_test_1

In [27]:
# df_raw = pd.read_csv(f'{PATH}\\AV_Stud_2\\train.csv', low_memory=False)
# df_test = pd.read_csv(f'{PATH}\\AV_Stud_2\\test.csv', low_memory=False)

# stack_train = pd.read_csv(f'{PATH}\\AV_Stud_2\\stack_train.csv')
# stack_test = pd.read_csv(f'{PATH}\\AV_Stud_2\\stack_test.csv')

# stack_train_2 = pd.read_csv(f'{PATH}\\AV_Stud_2\\stack_train_2.csv')
# stack_test_2 = pd.read_csv(f'{PATH}\\AV_Stud_2\\stack_test_2.csv')
train_67 = np.load(f'{PATH}\\AV_Stud_2\\train_67.npy')
test_67 = np.load(f'{PATH}\\AV_Stud_2\\test_67.npy')

# drop enrollee id

In [10]:
target = stack_train.target
stack_train.drop('target', axis=1, inplace=True)

cleaning a bit


In [5]:
target = df_raw.target.values
drop_col = ['enrollee_id']
df_raw.drop(drop_col, axis=1,inplace=True)
df_test.drop(drop_col, axis=1, inplace=True)

In [6]:
cols = ['city', 'city_development_index', 'gender',
       'relevent_experience', 'enrolled_university', 'enrolled_university_degree',
       'major_discipline', 'experience', 'company_size', 'company_type',
       'last_new_job', 'training_hours', 'target']
df_raw.columns = cols
df_test.columns = cols[:-1]

In [7]:
df_raw.head(2)


Out[7]:
city city_development_index gender relevent_experience enrolled_university enrolled_university_degree major_discipline experience company_size company_type last_new_job training_hours target
0 city_149 0.689 Male Has relevent experience no_enrollment Graduate STEM 3 100-500 Pvt Ltd 1 106 0
1 city_83 0.923 Male Has relevent experience no_enrollment Graduate STEM 14 <10 Funded Startup 1 69 0

In [8]:
for c in df_raw.columns:
    n = df_raw[c].nunique()
    print(c)
    if n <= 8:
        print(n, sorted(df_raw[c].value_counts().to_dict().items()))
    else:
        print(n)
    print(120 * '-')


city
123
------------------------------------------------------------------------------------------------------------------------
city_development_index
93
------------------------------------------------------------------------------------------------------------------------
gender
3 [('Female', 1188), ('Male', 12884), ('Other', 189)]
------------------------------------------------------------------------------------------------------------------------
relevent_experience
2 [('Has relevent experience', 13596), ('No relevent experience', 4763)]
------------------------------------------------------------------------------------------------------------------------
enrolled_university
3 [('Full time course', 3187), ('Part time course', 1171), ('no_enrollment', 13659)]
------------------------------------------------------------------------------------------------------------------------
enrolled_university_degree
5 [('Graduate', 10769), ('High School', 2032), ('Masters', 4319), ('Phd', 459), ('Primary School', 323)]
------------------------------------------------------------------------------------------------------------------------
major_discipline
6 [('Arts', 239), ('Business Degree', 307), ('Humanities', 688), ('No Major', 206), ('Other', 343), ('STEM', 13738)]
------------------------------------------------------------------------------------------------------------------------
experience
22
------------------------------------------------------------------------------------------------------------------------
company_size
8 [('10/49', 1466), ('100-500', 2698), ('1000-4999', 1399), ('10000+', 2044), ('50-99', 3120), ('500-999', 902), ('5000-9999', 591), ('<10', 1360)]
------------------------------------------------------------------------------------------------------------------------
company_type
6 [('Early Stage Startup', 582), ('Funded Startup', 1038), ('NGO', 534), ('Other', 119), ('Public Sector', 996), ('Pvt Ltd', 10051)]
------------------------------------------------------------------------------------------------------------------------
last_new_job
6 [('1', 7567), ('2', 2835), ('3', 1027), ('4', 1038), ('>4', 3339), ('never', 2186)]
------------------------------------------------------------------------------------------------------------------------
training_hours
241
------------------------------------------------------------------------------------------------------------------------
target
2 [(0, 15934), (1, 2425)]
------------------------------------------------------------------------------------------------------------------------

city and city_dev col's


In [10]:
#clean city split
df_raw['city'] = df_raw['city'].str.split('_',expand=True)[1]
df_raw['city'] = df_raw['city'].astype('int32')

df_test['city'] = df_test['city'].str.split('_',expand=True)[1]
df_test['city'] = df_test['city'].astype('int32')

In [28]:
df_raw['is_city_in_103_21_116_114_160'] = np.full(df_raw.shape[0], 0)
my_query = df_raw[(df_raw['city'] == 103)|(df_raw['city'] == 21) | (df_raw['city'] == 16) | (df_raw['city'] == 114) | (df_raw['city'] == 160)].index
df_raw.iloc[my_query, -1] = 1

In [30]:
df_test['is_city_in_103_21_116_114_160'] = np.full(df_test.shape[0], 0)

my_query = df_test[(df_test['city'] == 103)|(df_test['city'] == 21) | (df_test['city'] == 16) | (df_test['city'] == 114) | (df_test['city'] == 160)].index
df_test.iloc[my_query, -1] = 1

In [32]:
sns.countplot(df_raw[(df_raw['city'] == 103)|(df_raw['city'] == 21) | (df_raw['city'] == 16) |\
       (df_raw['city'] == 114) | (df_raw['city'] == 160)]['city'], data=df_raw, hue='target');



In [43]:
df_raw['city_development_index'].value_counts(ascending=False).head(20).cumsum()


Out[43]:
0.920     5185
0.624     6857
0.910     8511
0.926     9983
0.698    10638
0.897    11262
0.939    11806
0.855    12261
0.924    12579
0.804    12892
0.884    13173
0.887    13444
0.754    13708
0.913    13925
0.899    14119
0.802    14307
0.925    14485
0.893    14660
0.878    14816
0.743    14968
Name: city_development_index, dtype: int64

In [44]:
df_raw['is_dev_20'] = np.zeros(df_raw.shape[0])
my_query = df_raw.query('city_development_index<=.2').index
df_raw.iloc[my_query, -1] = 1

df_raw['is_dev_21_30'] = np.zeros(df_raw.shape[0])
my_query = df_raw.query('city_development_index>=.21 & city_development_index<=.3').index
df_raw.iloc[my_query, -1] = 1

df_raw['is_dev_31_40'] = np.zeros(df_raw.shape[0])
my_query = df_raw.query('city_development_index>=.31 & city_development_index<=.4').index
df_raw.iloc[my_query, -1] = 1

df_raw['is_dev_41_50'] = np.zeros(df_raw.shape[0])
my_query = df_raw.query('city_development_index>=.41 & city_development_index<=.5').index
df_raw.iloc[my_query, -1] = 1

df_raw['is_dev_51_60'] = np.zeros(df_raw.shape[0])
my_query = df_raw.query('city_development_index>=.51 & city_development_index<=.6').index
df_raw.iloc[my_query, -1] = 1

df_raw['is_dev_61_70'] = np.zeros(df_raw.shape[0])
my_query = df_raw.query('city_development_index>=.61 & city_development_index<=.7').index
df_raw.iloc[my_query, -1] = 1

df_raw['is_dev_71_80'] = np.zeros(df_raw.shape[0])
my_query = df_raw.query('city_development_index>=.71 & city_development_index<=.8').index
df_raw.iloc[my_query, -1] = 1

df_raw['is_dev_81_90'] = np.zeros(df_raw.shape[0])
my_query = df_raw.query('city_development_index>=.81 & city_development_index<=.9').index
df_raw.iloc[my_query, -1] = 1

df_raw['is_dev_91'] = np.zeros(df_raw.shape[0])
my_query = df_raw.query('city_development_index>=.91').index
df_raw.iloc[my_query, -1] = 1

In [45]:
df_test['is_dev_20'] = np.zeros(df_test.shape[0])
my_query = df_test.query('city_development_index<=.2').index
df_test.iloc[my_query, -1] = 1

df_test['is_dev_21_30'] = np.zeros(df_test.shape[0])
my_query = df_test.query('city_development_index>=.21 & city_development_index<=.3').index
df_test.iloc[my_query, -1] = 1

df_test['is_dev_31_40'] = np.zeros(df_test.shape[0])
my_query = df_test.query('city_development_index>=.31 & city_development_index<=.4').index
df_test.iloc[my_query, -1] = 1

df_test['is_dev_41_50'] = np.zeros(df_test.shape[0])
my_query = df_test.query('city_development_index>=.41 & city_development_index<=.5').index
df_test.iloc[my_query, -1] = 1

df_test['is_dev_51_60'] = np.zeros(df_test.shape[0])
my_query = df_test.query('city_development_index>=.51 & city_development_index<=.6').index
df_test.iloc[my_query, -1] = 1

df_test['is_dev_61_70'] = np.zeros(df_test.shape[0])
my_query = df_test.query('city_development_index>=.61 & city_development_index<=.7').index
df_test.iloc[my_query, -1] = 1

df_test['is_dev_71_80'] = np.zeros(df_test.shape[0])
my_query = df_test.query('city_development_index>=.71 & city_development_index<=.8').index
df_test.iloc[my_query, -1] = 1

df_test['is_dev_81_90'] = np.zeros(df_test.shape[0])
my_query = df_test.query('city_development_index>=.81 & city_development_index<=.9').index
df_test.iloc[my_query, -1] = 1

df_test['is_dev_91'] = np.zeros(df_test.shape[0])
my_query = df_test.query('city_development_index>=.91').index
df_test.iloc[my_query, -1] = 1

In [49]:
df_raw.shape, df_test.shape


Out[49]:
((18359, 23), (15021, 22))

company_size col


In [56]:
# merge both df's
df_raw['min_company_size'] = np.full(df_raw.shape[0], -1)
df_raw['max_company_size'] = np.full(df_raw.shape[0], -1)

my_query = df_raw[df_raw['company_size'] =='<10'].index
df_raw.iloc[my_query, -2] = 0
df_raw.iloc[my_query, -1] = 9

my_query = df_raw[df_raw['company_size'] =='10/49'].index
df_raw.iloc[my_query, -2] = 10
df_raw.iloc[my_query, -1] = 49

my_query = df_raw[df_raw['company_size'] =='50-99'].index
df_raw.iloc[my_query, -2] = 50
df_raw.iloc[my_query, -1] = 99

my_query = df_raw[df_raw['company_size'] =='100-500'].index
df_raw.iloc[my_query, -2] = 100
df_raw.iloc[my_query, -1] = 500

my_query = df_raw[df_raw['company_size'] =='500-999'].index
df_raw.iloc[my_query, -2] = 500
df_raw.iloc[my_query, -1] = 999

my_query = df_raw[df_raw['company_size'] =='1000-4999'].index
df_raw.iloc[my_query, -2] = 1000
df_raw.iloc[my_query, -1] = 4999

my_query = df_raw[df_raw['company_size'] =='5000-9999'].index
df_raw.iloc[my_query, -2] = 5000
df_raw.iloc[my_query, -1] = 9999

my_query = df_raw[df_raw['company_size'] =='10000+'].index
df_raw.iloc[my_query, -2] = 10000
df_raw.iloc[my_query, -1] = 15000

########################################################################

df_test['min_company_size'] = np.full(df_test.shape[0], -1)
df_test['max_company_size'] = np.full(df_test.shape[0], -1)

my_query = df_test[df_test['company_size'] =='<10'].index
df_test.iloc[my_query, -2] = 0
df_test.iloc[my_query, -1] = 9

my_query = df_test[df_test['company_size'] =='10/49'].index
df_test.iloc[my_query, -2] = 10
df_test.iloc[my_query, -1] = 49

my_query = df_test[df_test['company_size'] =='50-99'].index
df_test.iloc[my_query, -2] = 50
df_test.iloc[my_query, -1] = 99

my_query = df_test[df_test['company_size'] =='100-500'].index
df_test.iloc[my_query, -2] = 100
df_test.iloc[my_query, -1] = 500

my_query = df_test[df_test['company_size'] =='500-999'].index
df_test.iloc[my_query, -2] = 500
df_test.iloc[my_query, -1] = 999

my_query = df_test[df_test['company_size'] =='1000-4999'].index
df_test.iloc[my_query, -2] = 1000
df_test.iloc[my_query, -1] = 4999

my_query = df_test[df_test['company_size'] =='5000-9999'].index
df_test.iloc[my_query, -2] = 5000
df_test.iloc[my_query, -1] = 9999

my_query = df_test[df_test['company_size'] =='10000+'].index
df_test.iloc[my_query, -2] = 10000
df_test.iloc[my_query, -1] = 15000

# df_raw.drop('company_size', axis=1, inplace=True)
# df_test.drop('company_size', axis=1, inplace=True)

# fill na's now wrt to exp level (to do) ####################################################

#df_raw['company_size'].fillna(df_raw.groupby('experience')['company_size'].tranform('median'))
drop_col.append('company_size')

In [57]:
sns.countplot(data=df_raw,hue='target', x = 'min_company_size')


Out[57]:
<matplotlib.axes._subplots.AxesSubplot at 0x23d9e244b38>

In [58]:
sns.countplot(data=df_raw,hue='target', x = 'max_company_size')


Out[58]:
<matplotlib.axes._subplots.AxesSubplot at 0x23d9e26b7f0>

In [ ]:
# df_raw['company_size'].str.split('-', expand=True)

In [ ]:
# df_raw[df_raw['company_size'] =='10/49']['company_size'].str.split('/', expand=True)[1]

last_new_job col


In [62]:
df_raw['last_new_job'].replace('>4',5, inplace = True)
df_raw['last_new_job'].replace('never',0, inplace = True)
df_raw['last_new_job'] = df_raw['last_new_job'].astype('float32')

df_test['last_new_job'].replace('>4',5, inplace = True)
df_test['last_new_job'].replace('never',0, inplace = True)
df_test['last_new_job'] = df_test['last_new_job'].astype('float32')

In [63]:
sns.countplot(df_raw['last_new_job'], hue='target', data=df_raw)


Out[63]:
<matplotlib.axes._subplots.AxesSubplot at 0x23d9e2fa470>

In [265]:
df_test['last_new_job'].fillna(method='ffill', inplace=True)

In [64]:
df_raw['last_new_job'].fillna(method='ffill', inplace=True)
df_test['last_new_job'].fillna(method='ffill', inplace=True)

In [65]:
sns.countplot(df_raw['last_new_job'], hue='target', data=df_raw)


Out[65]:
<matplotlib.axes._subplots.AxesSubplot at 0x23d9e2fd710>

experience col


In [77]:
plt.xticks(rotation=90)
sns.countplot('experience', data=df_raw, hue='target',);



In [73]:
df_raw['experience'].replace('>20',22, inplace = True)
df_test['experience'].replace('>20',22, inplace = True)

df_raw['experience'].replace('<1',.6, inplace = True)
df_test['experience'].replace('<1',.6, inplace = True)

df_raw['experience'] = df_raw['experience'].astype('float32')
df_test['experience'] = df_test['experience'].astype('float32')

In [79]:
df_raw['experience'].fillna(method='ffill',inplace=True)
df_test['experience'].fillna(method='ffill',inplace=True)

company_type col


In [118]:
df_raw['is_startup'] = np.full(df_raw.shape[0], 0)
my_query = df_raw[(df_raw['company_type'] == 'Funded Startup' )|(df_raw['company_type'] == 'Early Stage Startup')].index
df_raw.iloc[my_query, -1] = 1

df_test['is_startup'] = np.full(df_test.shape[0], 0)
my_query = df_test[(df_test['company_type'] == 'Funded Startup' )|(df_test['company_type'] == 'Early Stage Startup')].index
df_test.iloc[my_query, -1] = 1

In [121]:
df_raw['is_ltd'] = np.full(df_raw.shape[0], 0)
my_query = df_raw[(df_raw['company_type'] == 'Pvt Ltd' )|(df_raw['company_type'] == 'Public Sector')].index
df_raw.iloc[my_query, -1] = 1

df_test['is_ltd'] = np.full(df_test.shape[0], 0)
my_query = df_test[(df_test['company_type'] == 'Pvt Ltd' )|(df_test['company_type'] == 'Public Sector')].index
df_test.iloc[my_query, -1] = 1

In [151]:
df_raw.isnull().sum().sort_values(ascending=False).head()/len(df_raw)


Out[151]:
company_size                  0.260308
gender                        0.223215
major_discipline              0.154584
enrolled_university_degree    0.024892
enrolled_university           0.018628
dtype: float64

In [149]:
df_raw['company_type'].fillna(value='Unknown', axis=0, inplace=True)
df_test['company_type'].fillna(value='Unknown', axis=0, inplace=True)

gender


In [165]:
df_raw.isnull().sum().sort_values(ascending=False).head()/len(df_raw)


Out[165]:
company_size                  0.260308
gender                        0.223215
major_discipline              0.154584
enrolled_university_degree    0.024892
enrolled_university           0.018628
dtype: float64

In [161]:
sns.countplot(df_raw['gender'],data=df_raw, hue='target');



In [178]:
df_raw['gender'].value_counts(normalize=True)


Out[178]:
Male       0.701781
Unknown    0.223215
Female     0.064709
Other      0.010295
Name: gender, dtype: float64

In [175]:
df_raw1 = df_raw.copy()
df_test1 = df_test.copy()

In [176]:
df_raw['gender'].fillna(value='Unknown', axis=0, inplace=True)
df_test['gender'].fillna(value='Unknown', axis=0, inplace=True)

In [182]:
df_raw.head(2)


Out[182]:
city city_development_index gender relevent_experience enrolled_university enrolled_university_degree major_discipline experience company_size company_type ... is_dev_41_50 is_dev_51_60 is_dev_61_70 is_dev_71_80 is_dev_81_90 is_dev_91 min_company_size max_company_size is_startup is_ltd
0 149 0.689 Male Has relevent experience no_enrollment Graduate STEM 3.0 100-500 Pvt Ltd ... 0.0 0.0 1.0 0.0 0.0 0.0 100 500 0 1
1 83 0.923 Male Has relevent experience no_enrollment Graduate STEM 14.0 <10 Funded Startup ... 0.0 0.0 0.0 0.0 0.0 1.0 0 9 1 0

2 rows × 27 columns


In [197]:
survived = 'work'
not_survived = 'not work'
fig, axes = plt.subplots(nrows=2, ncols=2,figsize=(10, 10))

women = df_raw[df_raw['gender']=='Female']
men = df_raw[df_raw['gender']=='Male']
other = df_raw[df_raw['gender']== 'Other']
unknown = df_raw[df_raw['gender']== 'Unknown']

ax = sns.distplot(women[women['target']==1].city_development_index, bins=18, label = survived, ax = axes[0][0], kde =False)
ax = sns.distplot(women[women['target']==0].city_development_index, bins=40, label = not_survived, ax = axes[0][0], kde =False)
ax.legend()
ax.set_title('Female')

ax = sns.distplot(men[men['target']==1].city_development_index, bins=18, label = survived, ax = axes[0][1], kde = False)
ax = sns.distplot(men[men['target']==0].city_development_index, bins=40, label = not_survived, ax = axes[0][1], kde = False)
ax.legend()
ax.set_title('Male')

ax = sns.distplot(other[other['target']==1].city_development_index, bins=18, label = survived, ax = axes[1][0], kde =False)
ax = sns.distplot(other[other['target']==0].city_development_index, bins=40, label = not_survived, ax = axes[1][0], kde =False)
ax.legend()
ax.set_title('Other')

ax = sns.distplot(unknown[unknown['target']==1].city_development_index, bins=18, label = survived, ax = axes[1][1], kde =False)
ax = sns.distplot(unknown[unknown['target']==0].city_development_index, bins=40, label = not_survived, ax = axes[1][1], kde =False)
ax.legend()
_ = ax.set_title('Unknown')


major discipline


In [177]:
plt.xticks(rotation=90)
sns.countplot('gender', data=df_raw, hue='company_type')


Out[177]:
<matplotlib.axes._subplots.AxesSubplot at 0x23da4eac278>

In [180]:
plt.xticks(rotation=90)
sns.countplot('major_discipline', data=df_raw, hue='company_type')


Out[180]:
<matplotlib.axes._subplots.AxesSubplot at 0x23da4e8c518>

In [261]:
df_raw['unaffliated_college'] = np.zeros(df_raw.shape[0])
df_test['unaffliated_college'] = np.zeros(df_test.shape[0])

for i in ['Phd', 'Graduate', 'Masters', 'High School', 'Primary School']:

    my_query = df_raw[(df_raw['enrolled_university'] == 'no_enrollment') & (df_raw['enrolled_university_degree'] == i)].index
    df_raw.iloc[my_query, -1] = 1
    
    my_query = df_test[(df_test['enrolled_university'] == 'no_enrollment') & (df_test['enrolled_university_degree'] == i)].index
    df_test.iloc[my_query, -1] = 1

df_raw['enroll_type__enroll_deg__major_in'] = df_raw.enrolled_university+'_'+df_raw.enrolled_university_degree\
                                                +'_'+df_raw.major_discipline
df_test['enroll_type__enroll_deg__major_in'] = df_test.enrolled_university+'_'+df_test.enrolled_university_degree\
                                                +'_'+df_test.major_discipline
    
df_raw['gender__major_in'] = df_raw.gender+'_'+df_raw.major_discipline
df_test['gender__major_in'] = df_test.gender+'_'+df_test.major_discipline

df_raw['enroll_deg__major_in'] = df_raw.enrolled_university_degree+'_'+df_raw.major_discipline
df_test['enroll_deg__major_in'] = df_test.enrolled_university_degree+'_'+df_test.major_discipline
    
df_raw['city_dev__max_comp_siz'] = df_raw.city.astype(str)+'_'+df_raw.max_company_size.astype(str)
df_test['city_dev__max_comp_siz'] = df_test.city.astype(str)+'_'+df_test.max_company_size.astype(str)

In [224]:
for i in ['Phd', 'Graduate', 'Masters', 'High School', 'Primary School'] : # enrolled_university_degree
    for j in ['Arts', 'Business Degree', 'Other', 'STEM', 'Humanities', 'No Major']: #major discipline
        print(i,'&&',j,'\nTrain-', df_raw[(df_raw['major_discipline'] == j ) & (df_raw['enrolled_university'] == 'no_enrollment') & (df_raw['enrolled_university_degree'] == i)].shape[0], \
             'Test-',df_test[(df_raw['major_discipline'] == j ) & (df_test['enrolled_university'] == 'no_enrollment') & (df_test['enrolled_university_degree'] == i)].shape[0])


Phd && Arts 
Train- 3 Test- 3
Phd && Business Degree 
Train- 5 Test- 9
Phd && Other 
Train- 5 Test- 6
Phd && STEM 
Train- 378 Test- 276
Phd && Humanities 
Train- 23 Test- 18
Phd && No Major 
Train- 0 Test- 2
Graduate && Arts 
Train- 179 Test- 88
Graduate && Business Degree 
Train- 179 Test- 111
Graduate && Other 
Train- 197 Test- 119
Graduate && STEM 
Train- 6994 Test- 4924
Graduate && Humanities 
Train- 391 Test- 252
Graduate && No Major 
Train- 153 Test- 81
Masters && Arts 
Train- 32 Test- 33
Masters && Business Degree 
Train- 75 Test- 45
Masters && Other 
Train- 76 Test- 64
Masters && STEM 
Train- 3172 Test- 2152
Masters && Humanities 
Train- 203 Test- 106
Masters && No Major 
Train- 23 Test- 39
High School && Arts 
Train- 0 Test- 12
High School && Business Degree 
Train- 0 Test- 14
High School && Other 
Train- 0 Test- 14
High School && STEM 
Train- 0 Test- 628
High School && Humanities 
Train- 0 Test- 39
High School && No Major 
Train- 0 Test- 10
Primary School && Arts 
Train- 0 Test- 0
Primary School && Business Degree 
Train- 0 Test- 2
Primary School && Other 
Train- 0 Test- 4
Primary School && STEM 
Train- 0 Test- 189
Primary School && Humanities 
Train- 0 Test- 15
Primary School && No Major 
Train- 0 Test- 4

In [172]:
plt.xticks(rotation=90)
sns.countplot('enrolled_university', data=df_raw, hue='company_type')


Out[172]:
<matplotlib.axes._subplots.AxesSubplot at 0x23da4bc0da0>

In [173]:
plt.xticks(rotation=90)
sns.countplot('enrolled_university_degree', data=df_raw, hue='company_type')


Out[173]:
<matplotlib.axes._subplots.AxesSubplot at 0x23da4cb5b70>

In [258]:
df_raw['major_discipline'].fillna('Unknown', axis=0, inplace=True)
df_test['major_discipline'].fillna('Unknown', axis=0, inplace=True)

df_raw['enrolled_university'].fillna('no_enrollment', axis=0, inplace=True)
df_test['enrolled_university'].fillna('no_enrollment', axis=0, inplace=True)

df_raw['enrolled_university_degree'].fillna('Graduate', axis=0, inplace=True)
df_test['enrolled_university_degree'].fillna('Graduate', axis=0, inplace=True)

In [268]:
df_test['last_new_job'].fillna(0,axis=0,inplace=True)

In [269]:
df_test.isnull().sum().sort_values(ascending=False).head(10)/len(df_test)


Out[269]:
company_size                  0.269689
enroll_deg__major_in          0.000000
is_dev_21_30                  0.000000
city_development_index        0.000000
gender                        0.000000
relevent_experience           0.000000
enrolled_university           0.000000
enrolled_university_degree    0.000000
major_discipline              0.000000
experience                    0.000000
dtype: float64

In [263]:
df_raw.isnull().sum().sort_values(ascending=False).head(10)/len(df_raw)


Out[263]:
company_size                  0.260308
enroll_deg__major_in          0.000000
gender__major_in              0.000000
city_development_index        0.000000
gender                        0.000000
relevent_experience           0.000000
enrolled_university           0.000000
enrolled_university_degree    0.000000
major_discipline              0.000000
experience                    0.000000
dtype: float64

start here


In [282]:
X_train = df_raw.drop(categorical_features,axis=1)  #numeric ones
X_test  = df_test.drop(categorical_features,axis=1) #numeric ones

In [283]:
%%time
N_COMP = 10

print("\nStart decomposition process...")
print("PCA")
pca = PCA(n_components=N_COMP, random_state=17)
pca_results_X_train = pca.fit_transform(X_train)
pca_results_X_test = pca.transform(X_test)


Start decomposition process...
PCA
Wall time: 981 ms

In [284]:
%%time
print("Append decomposition components to datasets...")

for i in range(1, N_COMP + 1):
    X_train['pca_' + str(i)] = pca_results_X_train[:, i - 1]
    X_test['pca_' + str(i)] = pca_results_X_test[:, i - 1]


Append decomposition components to datasets...
Wall time: 12 ms

In [288]:
def prepare(data_orig):
    data = pd.DataFrame()
    data['mean'] = data_orig.mean(axis=1)
    data['std'] = data_orig.std(axis=1)
    data['min'] = data_orig.min(axis=1)
    data['max'] = data_orig.max(axis=1)
    data['number_of_different'] = data_orig.nunique(axis=1)               # Number of diferent values in a row.
    data['non_zero_count'] = data_orig.fillna(0).astype(bool).sum(axis=1) # Number of non zero values (e.g. transaction count)
    return data

# Replace 0 with NaN to ignore them.
X_test_stats = prepare(X_test.replace(0,np.nan))
X_train_stats = prepare(X_train.replace(0, np.nan))

In [289]:
from sklearn.cluster import KMeans

flist = [x for x in X_train.columns]

flist_kmeans = []
for ncl in range(2,11):
    cls = KMeans(n_clusters=ncl)
    cls.fit_predict(X_train[flist].values)
    X_train['kmeans_cluster_'+str(ncl)] = cls.predict(X_train[flist].values)
    X_test['kmeans_cluster_'+str(ncl)] = cls.predict(X_test[flist].values)
    flist_kmeans.append('kmeans_cluster_'+str(ncl))
print(flist_kmeans)


['kmeans_cluster_2', 'kmeans_cluster_3', 'kmeans_cluster_4', 'kmeans_cluster_5', 'kmeans_cluster_6', 'kmeans_cluster_7', 'kmeans_cluster_8', 'kmeans_cluster_9', 'kmeans_cluster_10']

In [290]:
X_train_stack, X_test_stack = np.hstack((X_train, X_train_stats)), np.hstack((X_test, X_test_stats))

In [33]:
X_train_stack, X_test_stack = np.hstack((stack_train, train_67)), np.hstack((stack_test, test_67))

In [34]:
params = {}
params['booster'] = 'gbtree'
params["objective"] = "binary:logistic"
# params['eval_metric'] = 'logloss'
params['eval_metric'] = 'auc'
params["eta"] = 0.1 #0.03
params["subsample"] = .7 #.85 was tried before
params["silent"] = 0
params['verbose'] = 1
params["max_depth"] = 8
params["seed"] = 1
params["max_delta_step"] = 4
params['scale_pos_weight'] =  0.13208780434664197
params["gamma"] = .6 #.5 #.1 #.2
params['colsample_bytree'] = 0.7
params['nrounds'] = 1000 #3600 #2000 #4000 #using lower no for demo
#params['max_leaves'] = 511
#params['verbose_eval'] = 50

In [35]:
model_xgb, p_train, p_test  = mlcrate.xgb.train_kfold(params, X_train_stack, target, X_test_stack\
                                                       , folds = 7,skip_checks = True, stratify=target, print_imp='final')


[mlcrate] Training 7 stratified XGBoost models on training set (18359, 32) with test set (15021, 32)
[mlcrate] Running fold 0, 15735 train samples, 2624 validation samples
[0]	train-auc:0.610262	valid-auc:0.603539
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[1]	train-auc:0.611687	valid-auc:0.602776
[2]	train-auc:0.625767	valid-auc:0.627222
[3]	train-auc:0.634295	valid-auc:0.631425
[4]	train-auc:0.644069	valid-auc:0.647621
[5]	train-auc:0.644902	valid-auc:0.648802
[6]	train-auc:0.645907	valid-auc:0.650127
[7]	train-auc:0.646452	valid-auc:0.649835
[8]	train-auc:0.647274	valid-auc:0.6491
[9]	train-auc:0.659563	valid-auc:0.659745
[10]	train-auc:0.660092	valid-auc:0.657501
[11]	train-auc:0.661307	valid-auc:0.662217
[12]	train-auc:0.662083	valid-auc:0.662668
[13]	train-auc:0.661966	valid-auc:0.662526
[14]	train-auc:0.662003	valid-auc:0.662717
[15]	train-auc:0.672621	valid-auc:0.668754
[16]	train-auc:0.673641	valid-auc:0.668203
[17]	train-auc:0.674055	valid-auc:0.668495
[18]	train-auc:0.674714	valid-auc:0.67047
[19]	train-auc:0.676064	valid-auc:0.670505
[20]	train-auc:0.676487	valid-auc:0.670809
[21]	train-auc:0.677122	valid-auc:0.670152
[22]	train-auc:0.67958	valid-auc:0.669344
[23]	train-auc:0.683888	valid-auc:0.670509
[24]	train-auc:0.684824	valid-auc:0.670493
[25]	train-auc:0.686109	valid-auc:0.670122
[26]	train-auc:0.687529	valid-auc:0.670154
[27]	train-auc:0.691846	valid-auc:0.66895
[28]	train-auc:0.692028	valid-auc:0.669587
[29]	train-auc:0.692463	valid-auc:0.670025
[30]	train-auc:0.694789	valid-auc:0.669435
[31]	train-auc:0.696573	valid-auc:0.672287
[32]	train-auc:0.699923	valid-auc:0.673028
[33]	train-auc:0.701121	valid-auc:0.673162
[34]	train-auc:0.703413	valid-auc:0.671912
[35]	train-auc:0.706184	valid-auc:0.672706
[36]	train-auc:0.708399	valid-auc:0.671883
[37]	train-auc:0.710058	valid-auc:0.672823
[38]	train-auc:0.713615	valid-auc:0.672202
[39]	train-auc:0.72106	valid-auc:0.673613
[40]	train-auc:0.72759	valid-auc:0.674335
[41]	train-auc:0.728618	valid-auc:0.672885
[42]	train-auc:0.72901	valid-auc:0.673118
[43]	train-auc:0.734002	valid-auc:0.675069
[44]	train-auc:0.736544	valid-auc:0.672946
[45]	train-auc:0.738491	valid-auc:0.673459
[46]	train-auc:0.740045	valid-auc:0.673226
[47]	train-auc:0.744103	valid-auc:0.671725
[48]	train-auc:0.748051	valid-auc:0.671556
[49]	train-auc:0.74981	valid-auc:0.671744
[50]	train-auc:0.756195	valid-auc:0.671231
[51]	train-auc:0.759144	valid-auc:0.669733
[52]	train-auc:0.763107	valid-auc:0.668618
[53]	train-auc:0.766309	valid-auc:0.6698
[54]	train-auc:0.769572	valid-auc:0.667365
[55]	train-auc:0.771408	valid-auc:0.666162
[56]	train-auc:0.77439	valid-auc:0.665957
[57]	train-auc:0.777875	valid-auc:0.665916
[58]	train-auc:0.780846	valid-auc:0.666111
[59]	train-auc:0.78506	valid-auc:0.666284
[60]	train-auc:0.788171	valid-auc:0.664995
[61]	train-auc:0.790616	valid-auc:0.664987
[62]	train-auc:0.792801	valid-auc:0.662239
[63]	train-auc:0.794599	valid-auc:0.661393
[64]	train-auc:0.796456	valid-auc:0.660057
[65]	train-auc:0.799563	valid-auc:0.660112
[66]	train-auc:0.802881	valid-auc:0.659702
[67]	train-auc:0.804045	valid-auc:0.659149
[68]	train-auc:0.805473	valid-auc:0.658017
[69]	train-auc:0.807417	valid-auc:0.656801
[70]	train-auc:0.809042	valid-auc:0.656888
[71]	train-auc:0.811944	valid-auc:0.65717
[72]	train-auc:0.813658	valid-auc:0.656356
[73]	train-auc:0.814897	valid-auc:0.655961
[74]	train-auc:0.816873	valid-auc:0.654479
[75]	train-auc:0.819127	valid-auc:0.653675
[76]	train-auc:0.819911	valid-auc:0.65325
[77]	train-auc:0.822076	valid-auc:0.654498
[78]	train-auc:0.82333	valid-auc:0.655678
[79]	train-auc:0.825939	valid-auc:0.656533
[80]	train-auc:0.827056	valid-auc:0.657739
[81]	train-auc:0.828796	valid-auc:0.657226
[82]	train-auc:0.832602	valid-auc:0.656261
[83]	train-auc:0.833221	valid-auc:0.65658
[84]	train-auc:0.835059	valid-auc:0.655701
[85]	train-auc:0.836827	valid-auc:0.654556
[86]	train-auc:0.839599	valid-auc:0.652649
[87]	train-auc:0.842034	valid-auc:0.654136
[88]	train-auc:0.845271	valid-auc:0.653275
[89]	train-auc:0.848043	valid-auc:0.653628
[90]	train-auc:0.849345	valid-auc:0.653243
[91]	train-auc:0.851651	valid-auc:0.654417
[92]	train-auc:0.854667	valid-auc:0.656168
[93]	train-auc:0.855906	valid-auc:0.655952
Stopping. Best iteration:
[43]	train-auc:0.734002	valid-auc:0.675069

C:\ProgramData\Anaconda3\lib\site-packages\mlcrate\backend.py:7: UserWarning: Timer.format_elapsed() has been deprecated in favour of Timer.fsince() and will be removed soon
  warn(message)
[mlcrate] Finished training fold 0 - took 5s - running score 0.675069
[mlcrate] Running fold 1, 15735 train samples, 2624 validation samples
[0]	train-auc:0.589674	valid-auc:0.609368
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[1]	train-auc:0.62757	valid-auc:0.654881
[2]	train-auc:0.64091	valid-auc:0.674641
[3]	train-auc:0.642562	valid-auc:0.676968
[4]	train-auc:0.644801	valid-auc:0.68056
[5]	train-auc:0.645237	valid-auc:0.680553
[6]	train-auc:0.645439	valid-auc:0.680698
[7]	train-auc:0.654119	valid-auc:0.686902
[8]	train-auc:0.66123	valid-auc:0.686155
[9]	train-auc:0.661265	valid-auc:0.68564
[10]	train-auc:0.662614	valid-auc:0.685775
[11]	train-auc:0.663505	valid-auc:0.685954
[12]	train-auc:0.663372	valid-auc:0.686443
[13]	train-auc:0.663579	valid-auc:0.686352
[14]	train-auc:0.663454	valid-auc:0.686267
[15]	train-auc:0.663717	valid-auc:0.686787
[16]	train-auc:0.665199	valid-auc:0.685499
[17]	train-auc:0.665458	valid-auc:0.685978
[18]	train-auc:0.667331	valid-auc:0.686586
[19]	train-auc:0.668453	valid-auc:0.686847
[20]	train-auc:0.669465	valid-auc:0.686435
[21]	train-auc:0.673241	valid-auc:0.692203
[22]	train-auc:0.674556	valid-auc:0.693452
[23]	train-auc:0.682984	valid-auc:0.694133
[24]	train-auc:0.683829	valid-auc:0.695238
[25]	train-auc:0.685604	valid-auc:0.695135
[26]	train-auc:0.687536	valid-auc:0.694921
[27]	train-auc:0.68911	valid-auc:0.696034
[28]	train-auc:0.692613	valid-auc:0.696129
[29]	train-auc:0.696045	valid-auc:0.695081
[30]	train-auc:0.697011	valid-auc:0.695409
[31]	train-auc:0.700265	valid-auc:0.694261
[32]	train-auc:0.702377	valid-auc:0.694404
[33]	train-auc:0.705324	valid-auc:0.69296
[34]	train-auc:0.70714	valid-auc:0.692889
[35]	train-auc:0.711811	valid-auc:0.692624
[36]	train-auc:0.715904	valid-auc:0.692423
[37]	train-auc:0.719326	valid-auc:0.694171
[38]	train-auc:0.722499	valid-auc:0.695454
[39]	train-auc:0.727197	valid-auc:0.694023
[40]	train-auc:0.730219	valid-auc:0.693562
[41]	train-auc:0.73275	valid-auc:0.696346
[42]	train-auc:0.736305	valid-auc:0.696248
[43]	train-auc:0.741266	valid-auc:0.696474
[44]	train-auc:0.744444	valid-auc:0.695471
[45]	train-auc:0.7474	valid-auc:0.695569
[46]	train-auc:0.751926	valid-auc:0.694975
[47]	train-auc:0.754534	valid-auc:0.694195
[48]	train-auc:0.758734	valid-auc:0.695303
[49]	train-auc:0.759425	valid-auc:0.694733
[50]	train-auc:0.761662	valid-auc:0.696128
[51]	train-auc:0.764566	valid-auc:0.697004
[52]	train-auc:0.768359	valid-auc:0.695569
[53]	train-auc:0.772684	valid-auc:0.695562
[54]	train-auc:0.775847	valid-auc:0.694704
[55]	train-auc:0.77722	valid-auc:0.694037
[56]	train-auc:0.781119	valid-auc:0.694257
[57]	train-auc:0.785463	valid-auc:0.692138
[58]	train-auc:0.786868	valid-auc:0.691057
[59]	train-auc:0.789484	valid-auc:0.691835
[60]	train-auc:0.792407	valid-auc:0.692105
[61]	train-auc:0.794584	valid-auc:0.691106
[62]	train-auc:0.797718	valid-auc:0.690747
[63]	train-auc:0.800436	valid-auc:0.690958
[64]	train-auc:0.803522	valid-auc:0.69015
[65]	train-auc:0.805085	valid-auc:0.6899
[66]	train-auc:0.808	valid-auc:0.689172
[67]	train-auc:0.811644	valid-auc:0.68963
[68]	train-auc:0.812203	valid-auc:0.687189
[69]	train-auc:0.812819	valid-auc:0.686107
[70]	train-auc:0.815285	valid-auc:0.685365
[71]	train-auc:0.816642	valid-auc:0.685529
[72]	train-auc:0.818658	valid-auc:0.685007
[73]	train-auc:0.820138	valid-auc:0.684464
[74]	train-auc:0.821337	valid-auc:0.684225
[75]	train-auc:0.824232	valid-auc:0.683874
[76]	train-auc:0.824846	valid-auc:0.682828
[77]	train-auc:0.827185	valid-auc:0.681704
[78]	train-auc:0.82833	valid-auc:0.6825
[79]	train-auc:0.830038	valid-auc:0.681769
[80]	train-auc:0.832651	valid-auc:0.680497
[81]	train-auc:0.833655	valid-auc:0.680884
[82]	train-auc:0.834788	valid-auc:0.681032
[83]	train-auc:0.837121	valid-auc:0.680615
[84]	train-auc:0.839648	valid-auc:0.678955
[85]	train-auc:0.840847	valid-auc:0.679395
[86]	train-auc:0.84204	valid-auc:0.67888
[87]	train-auc:0.842822	valid-auc:0.680198
[88]	train-auc:0.844915	valid-auc:0.681095
[89]	train-auc:0.845701	valid-auc:0.681059
[90]	train-auc:0.846371	valid-auc:0.681471
[91]	train-auc:0.846931	valid-auc:0.681273
[92]	train-auc:0.848881	valid-auc:0.679763
[93]	train-auc:0.849658	valid-auc:0.680924
[94]	train-auc:0.851591	valid-auc:0.679633
[95]	train-auc:0.85511	valid-auc:0.679547
[96]	train-auc:0.856715	valid-auc:0.680099
[97]	train-auc:0.858518	valid-auc:0.680341
[98]	train-auc:0.858912	valid-auc:0.679703
[99]	train-auc:0.861437	valid-auc:0.679724
[100]	train-auc:0.863208	valid-auc:0.679434
[101]	train-auc:0.865485	valid-auc:0.67991
Stopping. Best iteration:
[51]	train-auc:0.764566	valid-auc:0.697004

[mlcrate] Finished training fold 1 - took 7s - running score 0.6860364999999999
[mlcrate] Running fold 2, 15736 train samples, 2623 validation samples
[0]	train-auc:0.628293	valid-auc:0.6438
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[1]	train-auc:0.637195	valid-auc:0.650914
[2]	train-auc:0.642778	valid-auc:0.660092
[3]	train-auc:0.642793	valid-auc:0.660163
[4]	train-auc:0.643191	valid-auc:0.661447
[5]	train-auc:0.645297	valid-auc:0.662411
[6]	train-auc:0.645337	valid-auc:0.662417
[7]	train-auc:0.648126	valid-auc:0.66334
[8]	train-auc:0.65966	valid-auc:0.67145
[9]	train-auc:0.660369	valid-auc:0.672066
[10]	train-auc:0.660143	valid-auc:0.672288
[11]	train-auc:0.660179	valid-auc:0.671873
[12]	train-auc:0.660171	valid-auc:0.671609
[13]	train-auc:0.660819	valid-auc:0.671329
[14]	train-auc:0.665374	valid-auc:0.678485
[15]	train-auc:0.665566	valid-auc:0.678558
[16]	train-auc:0.666295	valid-auc:0.679161
[17]	train-auc:0.667891	valid-auc:0.677827
[18]	train-auc:0.671237	valid-auc:0.679325
[19]	train-auc:0.672557	valid-auc:0.679012
[20]	train-auc:0.672791	valid-auc:0.679851
[21]	train-auc:0.674229	valid-auc:0.679563
[22]	train-auc:0.674426	valid-auc:0.67994
[23]	train-auc:0.67695	valid-auc:0.684998
[24]	train-auc:0.677485	valid-auc:0.685707
[25]	train-auc:0.680395	valid-auc:0.685042
[26]	train-auc:0.682009	valid-auc:0.684515
[27]	train-auc:0.68826	valid-auc:0.684741
[28]	train-auc:0.689619	valid-auc:0.683977
[29]	train-auc:0.692285	valid-auc:0.682107
[30]	train-auc:0.694959	valid-auc:0.683638
[31]	train-auc:0.698111	valid-auc:0.682753
[32]	train-auc:0.699537	valid-auc:0.688206
[33]	train-auc:0.701629	valid-auc:0.692058
[34]	train-auc:0.704696	valid-auc:0.691646
[35]	train-auc:0.713155	valid-auc:0.693493
[36]	train-auc:0.717796	valid-auc:0.690986
[37]	train-auc:0.719409	valid-auc:0.690813
[38]	train-auc:0.722136	valid-auc:0.690879
[39]	train-auc:0.726151	valid-auc:0.692104
[40]	train-auc:0.734538	valid-auc:0.692327
[41]	train-auc:0.738845	valid-auc:0.690406
[42]	train-auc:0.74272	valid-auc:0.691255
[43]	train-auc:0.746579	valid-auc:0.69482
[44]	train-auc:0.750615	valid-auc:0.693641
[45]	train-auc:0.752538	valid-auc:0.693909
[46]	train-auc:0.756781	valid-auc:0.694
[47]	train-auc:0.759304	valid-auc:0.69281
[48]	train-auc:0.761493	valid-auc:0.691647
[49]	train-auc:0.763162	valid-auc:0.691932
[50]	train-auc:0.766584	valid-auc:0.691201
[51]	train-auc:0.768461	valid-auc:0.689274
[52]	train-auc:0.771615	valid-auc:0.689332
[53]	train-auc:0.773597	valid-auc:0.689724
[54]	train-auc:0.776027	valid-auc:0.689398
[55]	train-auc:0.779273	valid-auc:0.689523
[56]	train-auc:0.78343	valid-auc:0.688836
[57]	train-auc:0.788529	valid-auc:0.68865
[58]	train-auc:0.788757	valid-auc:0.688175
[59]	train-auc:0.789656	valid-auc:0.687798
[60]	train-auc:0.792313	valid-auc:0.687383
[61]	train-auc:0.79354	valid-auc:0.687935
[62]	train-auc:0.796018	valid-auc:0.686298
[63]	train-auc:0.797406	valid-auc:0.686025
[64]	train-auc:0.800483	valid-auc:0.683773
[65]	train-auc:0.802396	valid-auc:0.683626
[66]	train-auc:0.804029	valid-auc:0.683669
[67]	train-auc:0.807078	valid-auc:0.682786
[68]	train-auc:0.809799	valid-auc:0.683096
[69]	train-auc:0.811915	valid-auc:0.683816
[70]	train-auc:0.813741	valid-auc:0.682789
[71]	train-auc:0.815366	valid-auc:0.6813
[72]	train-auc:0.816098	valid-auc:0.682748
[73]	train-auc:0.817444	valid-auc:0.683336
[74]	train-auc:0.819304	valid-auc:0.681716
[75]	train-auc:0.820631	valid-auc:0.681445
[76]	train-auc:0.821451	valid-auc:0.679747
[77]	train-auc:0.82318	valid-auc:0.679664
[78]	train-auc:0.82393	valid-auc:0.680065
[79]	train-auc:0.827614	valid-auc:0.680367
[80]	train-auc:0.830513	valid-auc:0.678987
[81]	train-auc:0.832272	valid-auc:0.676959
[82]	train-auc:0.833147	valid-auc:0.67669
[83]	train-auc:0.836183	valid-auc:0.676657
[84]	train-auc:0.838192	valid-auc:0.674976
[85]	train-auc:0.839677	valid-auc:0.676337
[86]	train-auc:0.842163	valid-auc:0.675813
[87]	train-auc:0.842686	valid-auc:0.675834
[88]	train-auc:0.847023	valid-auc:0.674002
[89]	train-auc:0.84915	valid-auc:0.673607
[90]	train-auc:0.850807	valid-auc:0.673223
[91]	train-auc:0.852438	valid-auc:0.671735
[92]	train-auc:0.855603	valid-auc:0.669217
[93]	train-auc:0.856955	valid-auc:0.669928
Stopping. Best iteration:
[43]	train-auc:0.746579	valid-auc:0.69482

[mlcrate] Finished training fold 2 - took 5s - running score 0.6889643333333333
[mlcrate] Running fold 3, 15737 train samples, 2622 validation samples
[0]	train-auc:0.619301	valid-auc:0.588993
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[1]	train-auc:0.637228	valid-auc:0.601529
[2]	train-auc:0.650296	valid-auc:0.608337
[3]	train-auc:0.650186	valid-auc:0.608416
[4]	train-auc:0.652464	valid-auc:0.613802
[5]	train-auc:0.652481	valid-auc:0.613687
[6]	train-auc:0.656485	valid-auc:0.615928
[7]	train-auc:0.656491	valid-auc:0.615945
[8]	train-auc:0.658563	valid-auc:0.618833
[9]	train-auc:0.658966	valid-auc:0.618481
[10]	train-auc:0.658888	valid-auc:0.618547
[11]	train-auc:0.660949	valid-auc:0.622344
[12]	train-auc:0.661178	valid-auc:0.622315
[13]	train-auc:0.662469	valid-auc:0.623728
[14]	train-auc:0.664039	valid-auc:0.623203
[15]	train-auc:0.664963	valid-auc:0.62338
[16]	train-auc:0.667624	valid-auc:0.628014
[17]	train-auc:0.667774	valid-auc:0.627874
[18]	train-auc:0.675545	valid-auc:0.636944
[19]	train-auc:0.676151	valid-auc:0.638337
[20]	train-auc:0.679356	valid-auc:0.639531
[21]	train-auc:0.685783	valid-auc:0.645176
[22]	train-auc:0.686007	valid-auc:0.644302
[23]	train-auc:0.686255	valid-auc:0.64297
[24]	train-auc:0.68993	valid-auc:0.644556
[25]	train-auc:0.692744	valid-auc:0.645984
[26]	train-auc:0.693396	valid-auc:0.646333
[27]	train-auc:0.695278	valid-auc:0.645485
[28]	train-auc:0.69735	valid-auc:0.646193
[29]	train-auc:0.700742	valid-auc:0.644838
[30]	train-auc:0.703036	valid-auc:0.643881
[31]	train-auc:0.704878	valid-auc:0.645658
[32]	train-auc:0.710103	valid-auc:0.643456
[33]	train-auc:0.713753	valid-auc:0.638836
[34]	train-auc:0.720681	valid-auc:0.641766
[35]	train-auc:0.724518	valid-auc:0.645125
[36]	train-auc:0.726397	valid-auc:0.645351
[37]	train-auc:0.728658	valid-auc:0.644587
[38]	train-auc:0.730599	valid-auc:0.644292
[39]	train-auc:0.736942	valid-auc:0.643246
[40]	train-auc:0.739711	valid-auc:0.642785
[41]	train-auc:0.7441	valid-auc:0.640951
[42]	train-auc:0.746794	valid-auc:0.637959
[43]	train-auc:0.746392	valid-auc:0.639069
[44]	train-auc:0.748603	valid-auc:0.640957
[45]	train-auc:0.751976	valid-auc:0.64294
[46]	train-auc:0.754369	valid-auc:0.641558
[47]	train-auc:0.75737	valid-auc:0.643225
[48]	train-auc:0.759762	valid-auc:0.643878
[49]	train-auc:0.762698	valid-auc:0.644765
[50]	train-auc:0.76528	valid-auc:0.642723
[51]	train-auc:0.768525	valid-auc:0.641721
[52]	train-auc:0.771662	valid-auc:0.641785
[53]	train-auc:0.774785	valid-auc:0.641525
[54]	train-auc:0.777908	valid-auc:0.642359
[55]	train-auc:0.780472	valid-auc:0.643766
[56]	train-auc:0.782052	valid-auc:0.643843
[57]	train-auc:0.785452	valid-auc:0.643457
[58]	train-auc:0.787279	valid-auc:0.643512
[59]	train-auc:0.791613	valid-auc:0.641507
[60]	train-auc:0.792872	valid-auc:0.642965
[61]	train-auc:0.794534	valid-auc:0.644027
[62]	train-auc:0.795767	valid-auc:0.64458
[63]	train-auc:0.798099	valid-auc:0.645879
[64]	train-auc:0.80013	valid-auc:0.64508
[65]	train-auc:0.801646	valid-auc:0.645241
[66]	train-auc:0.804165	valid-auc:0.644162
[67]	train-auc:0.805208	valid-auc:0.645226
[68]	train-auc:0.807471	valid-auc:0.644985
[69]	train-auc:0.809635	valid-auc:0.644127
[70]	train-auc:0.810738	valid-auc:0.643885
[71]	train-auc:0.811994	valid-auc:0.643744
[72]	train-auc:0.814186	valid-auc:0.643271
[73]	train-auc:0.816091	valid-auc:0.641781
[74]	train-auc:0.819398	valid-auc:0.642025
[75]	train-auc:0.822542	valid-auc:0.642389
[76]	train-auc:0.82375	valid-auc:0.642384
Stopping. Best iteration:
[26]	train-auc:0.693396	valid-auc:0.646333

[mlcrate] Finished training fold 3 - took 6s - running score 0.6783064999999999
[mlcrate] Running fold 4, 15737 train samples, 2622 validation samples
[0]	train-auc:0.618735	valid-auc:0.590751
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[1]	train-auc:0.631431	valid-auc:0.617039
[2]	train-auc:0.631355	valid-auc:0.61779
[3]	train-auc:0.642544	valid-auc:0.626643
[4]	train-auc:0.64833	valid-auc:0.627766
[5]	train-auc:0.648834	valid-auc:0.625968
[6]	train-auc:0.648981	valid-auc:0.625939
[7]	train-auc:0.654237	valid-auc:0.628051
[8]	train-auc:0.654768	valid-auc:0.628542
[9]	train-auc:0.654755	valid-auc:0.628296
[10]	train-auc:0.654797	valid-auc:0.628301
[11]	train-auc:0.654667	valid-auc:0.628669
[12]	train-auc:0.660923	valid-auc:0.633651
[13]	train-auc:0.661408	valid-auc:0.636026
[14]	train-auc:0.663234	valid-auc:0.638201
[15]	train-auc:0.664044	valid-auc:0.637958
[16]	train-auc:0.674395	valid-auc:0.652198
[17]	train-auc:0.674243	valid-auc:0.653212
[18]	train-auc:0.680431	valid-auc:0.65096
[19]	train-auc:0.681193	valid-auc:0.652182
[20]	train-auc:0.682886	valid-auc:0.652472
[21]	train-auc:0.683511	valid-auc:0.653233
[22]	train-auc:0.686464	valid-auc:0.65285
[23]	train-auc:0.691094	valid-auc:0.647903
[24]	train-auc:0.692697	valid-auc:0.648095
[25]	train-auc:0.694354	valid-auc:0.652593
[26]	train-auc:0.698591	valid-auc:0.651678
[27]	train-auc:0.699511	valid-auc:0.651256
[28]	train-auc:0.70143	valid-auc:0.650999
[29]	train-auc:0.702853	valid-auc:0.654227
[30]	train-auc:0.706068	valid-auc:0.655199
[31]	train-auc:0.707145	valid-auc:0.653873
[32]	train-auc:0.710396	valid-auc:0.65257
[33]	train-auc:0.715355	valid-auc:0.648989
[34]	train-auc:0.716974	valid-auc:0.650046
[35]	train-auc:0.718825	valid-auc:0.65378
[36]	train-auc:0.720378	valid-auc:0.6533
[37]	train-auc:0.722213	valid-auc:0.652504
[38]	train-auc:0.725568	valid-auc:0.653131
[39]	train-auc:0.727591	valid-auc:0.653283
[40]	train-auc:0.729701	valid-auc:0.652662
[41]	train-auc:0.733648	valid-auc:0.650894
[42]	train-auc:0.736501	valid-auc:0.650606
[43]	train-auc:0.740604	valid-auc:0.646932
[44]	train-auc:0.743851	valid-auc:0.644705
[45]	train-auc:0.746711	valid-auc:0.647691
[46]	train-auc:0.749439	valid-auc:0.648242
[47]	train-auc:0.751201	valid-auc:0.647647
[48]	train-auc:0.754511	valid-auc:0.649198
[49]	train-auc:0.758101	valid-auc:0.651624
[50]	train-auc:0.759916	valid-auc:0.64842
[51]	train-auc:0.761563	valid-auc:0.649582
[52]	train-auc:0.764822	valid-auc:0.647745
[53]	train-auc:0.766811	valid-auc:0.647044
[54]	train-auc:0.769272	valid-auc:0.645013
[55]	train-auc:0.773006	valid-auc:0.642929
[56]	train-auc:0.775594	valid-auc:0.642002
[57]	train-auc:0.779562	valid-auc:0.644397
[58]	train-auc:0.780768	valid-auc:0.643649
[59]	train-auc:0.783239	valid-auc:0.641816
[60]	train-auc:0.784365	valid-auc:0.641842
[61]	train-auc:0.785627	valid-auc:0.641889
[62]	train-auc:0.789426	valid-auc:0.643334
[63]	train-auc:0.792721	valid-auc:0.640596
[64]	train-auc:0.79546	valid-auc:0.64185
[65]	train-auc:0.797475	valid-auc:0.641846
[66]	train-auc:0.800442	valid-auc:0.642303
[67]	train-auc:0.801676	valid-auc:0.641792
[68]	train-auc:0.803088	valid-auc:0.641251
[69]	train-auc:0.804916	valid-auc:0.640316
[70]	train-auc:0.807089	valid-auc:0.63918
[71]	train-auc:0.808353	valid-auc:0.639486
[72]	train-auc:0.809456	valid-auc:0.639484
[73]	train-auc:0.809994	valid-auc:0.638333
[74]	train-auc:0.811617	valid-auc:0.638785
[75]	train-auc:0.811669	valid-auc:0.638538
[76]	train-auc:0.814724	valid-auc:0.639566
[77]	train-auc:0.816462	valid-auc:0.639927
[78]	train-auc:0.817865	valid-auc:0.638568
[79]	train-auc:0.82006	valid-auc:0.639505
[80]	train-auc:0.821217	valid-auc:0.640835
Stopping. Best iteration:
[30]	train-auc:0.706068	valid-auc:0.655199

[mlcrate] Finished training fold 4 - took 6s - running score 0.673685
[mlcrate] Running fold 5, 15737 train samples, 2622 validation samples
[0]	train-auc:0.617167	valid-auc:0.601804
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[1]	train-auc:0.642762	valid-auc:0.627823
[2]	train-auc:0.644401	valid-auc:0.631204
[3]	train-auc:0.648819	valid-auc:0.634766
[4]	train-auc:0.651324	valid-auc:0.638614
[5]	train-auc:0.651296	valid-auc:0.638046
[6]	train-auc:0.65218	valid-auc:0.637454
[7]	train-auc:0.656175	valid-auc:0.635014
[8]	train-auc:0.657227	valid-auc:0.636454
[9]	train-auc:0.657201	valid-auc:0.636709
[10]	train-auc:0.657615	valid-auc:0.636739
[11]	train-auc:0.659172	valid-auc:0.638671
[12]	train-auc:0.659135	valid-auc:0.638897
[13]	train-auc:0.659367	valid-auc:0.638767
[14]	train-auc:0.659459	valid-auc:0.638405
[15]	train-auc:0.659604	valid-auc:0.638344
[16]	train-auc:0.659857	valid-auc:0.639286
[17]	train-auc:0.662862	valid-auc:0.642837
[18]	train-auc:0.674023	valid-auc:0.663985
[19]	train-auc:0.674165	valid-auc:0.663535
[20]	train-auc:0.674627	valid-auc:0.663881
[21]	train-auc:0.678328	valid-auc:0.662812
[22]	train-auc:0.680307	valid-auc:0.662697
[23]	train-auc:0.681835	valid-auc:0.66309
[24]	train-auc:0.683941	valid-auc:0.663635
[25]	train-auc:0.687007	valid-auc:0.661775
[26]	train-auc:0.69108	valid-auc:0.662496
[27]	train-auc:0.69311	valid-auc:0.662499
[28]	train-auc:0.694363	valid-auc:0.661303
[29]	train-auc:0.694657	valid-auc:0.661025
[30]	train-auc:0.69617	valid-auc:0.660153
[31]	train-auc:0.699898	valid-auc:0.664443
[32]	train-auc:0.701515	valid-auc:0.664007
[33]	train-auc:0.702064	valid-auc:0.664068
[34]	train-auc:0.70749	valid-auc:0.665907
[35]	train-auc:0.711115	valid-auc:0.666763
[36]	train-auc:0.71248	valid-auc:0.665654
[37]	train-auc:0.715044	valid-auc:0.667327
[38]	train-auc:0.719675	valid-auc:0.668562
[39]	train-auc:0.72394	valid-auc:0.668549
[40]	train-auc:0.728855	valid-auc:0.670424
[41]	train-auc:0.733525	valid-auc:0.672317
[42]	train-auc:0.734952	valid-auc:0.671798
[43]	train-auc:0.73865	valid-auc:0.673171
[44]	train-auc:0.740993	valid-auc:0.671687
[45]	train-auc:0.743105	valid-auc:0.669833
[46]	train-auc:0.745393	valid-auc:0.668507
[47]	train-auc:0.748581	valid-auc:0.667825
[48]	train-auc:0.751692	valid-auc:0.66787
[49]	train-auc:0.754954	valid-auc:0.665018
[50]	train-auc:0.755853	valid-auc:0.664252
[51]	train-auc:0.756951	valid-auc:0.662541
[52]	train-auc:0.759497	valid-auc:0.662543
[53]	train-auc:0.762281	valid-auc:0.663859
[54]	train-auc:0.765757	valid-auc:0.664189
[55]	train-auc:0.766801	valid-auc:0.662997
[56]	train-auc:0.768996	valid-auc:0.663174
[57]	train-auc:0.770746	valid-auc:0.662685
[58]	train-auc:0.775539	valid-auc:0.662969
[59]	train-auc:0.777863	valid-auc:0.662697
[60]	train-auc:0.780215	valid-auc:0.661619
[61]	train-auc:0.780867	valid-auc:0.661324
[62]	train-auc:0.783594	valid-auc:0.659724
[63]	train-auc:0.786726	valid-auc:0.659884
[64]	train-auc:0.789418	valid-auc:0.660073
[65]	train-auc:0.791919	valid-auc:0.66003
[66]	train-auc:0.794216	valid-auc:0.658475
[67]	train-auc:0.797184	valid-auc:0.659285
[68]	train-auc:0.797923	valid-auc:0.659307
[69]	train-auc:0.800127	valid-auc:0.658218
[70]	train-auc:0.800897	valid-auc:0.658306
[71]	train-auc:0.801691	valid-auc:0.658713
[72]	train-auc:0.80284	valid-auc:0.659154
[73]	train-auc:0.804369	valid-auc:0.658694
[74]	train-auc:0.806602	valid-auc:0.656755
[75]	train-auc:0.809123	valid-auc:0.656977
[76]	train-auc:0.811567	valid-auc:0.656118
[77]	train-auc:0.813271	valid-auc:0.655926
[78]	train-auc:0.815016	valid-auc:0.656224
[79]	train-auc:0.81616	valid-auc:0.656275
[80]	train-auc:0.817643	valid-auc:0.656707
[81]	train-auc:0.818849	valid-auc:0.657706
[82]	train-auc:0.822441	valid-auc:0.657407
[83]	train-auc:0.823119	valid-auc:0.657549
[84]	train-auc:0.824995	valid-auc:0.657599
[85]	train-auc:0.827808	valid-auc:0.658314
[86]	train-auc:0.830129	valid-auc:0.657772
[87]	train-auc:0.832624	valid-auc:0.657146
[88]	train-auc:0.834077	valid-auc:0.656396
[89]	train-auc:0.835171	valid-auc:0.65681
[90]	train-auc:0.837287	valid-auc:0.656891
[91]	train-auc:0.838765	valid-auc:0.657812
[92]	train-auc:0.839765	valid-auc:0.658104
[93]	train-auc:0.841409	valid-auc:0.657547
Stopping. Best iteration:
[43]	train-auc:0.73865	valid-auc:0.673171

[mlcrate] Finished training fold 5 - took 5s - running score 0.6735993333333333
[mlcrate] Running fold 6, 15737 train samples, 2622 validation samples
[0]	train-auc:0.614568	valid-auc:0.611226
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[1]	train-auc:0.638058	valid-auc:0.643234
[2]	train-auc:0.645265	valid-auc:0.642438
[3]	train-auc:0.646165	valid-auc:0.64351
[4]	train-auc:0.655627	valid-auc:0.648533
[5]	train-auc:0.657473	valid-auc:0.646935
[6]	train-auc:0.659608	valid-auc:0.64917
[7]	train-auc:0.661322	valid-auc:0.650421
[8]	train-auc:0.663886	valid-auc:0.650533
[9]	train-auc:0.663881	valid-auc:0.650457
[10]	train-auc:0.663944	valid-auc:0.65046
[11]	train-auc:0.665206	valid-auc:0.652675
[12]	train-auc:0.665579	valid-auc:0.652947
[13]	train-auc:0.666093	valid-auc:0.652253
[14]	train-auc:0.666882	valid-auc:0.653652
[15]	train-auc:0.667535	valid-auc:0.653031
[16]	train-auc:0.667577	valid-auc:0.653133
[17]	train-auc:0.667751	valid-auc:0.653733
[18]	train-auc:0.669415	valid-auc:0.653561
[19]	train-auc:0.677088	valid-auc:0.657811
[20]	train-auc:0.678154	valid-auc:0.65747
[21]	train-auc:0.679737	valid-auc:0.657464
[22]	train-auc:0.683136	valid-auc:0.657395
[23]	train-auc:0.685249	valid-auc:0.655573
[24]	train-auc:0.686465	valid-auc:0.655759
[25]	train-auc:0.687434	valid-auc:0.656078
[26]	train-auc:0.688686	valid-auc:0.656368
[27]	train-auc:0.691715	valid-auc:0.6542
[28]	train-auc:0.694766	valid-auc:0.656012
[29]	train-auc:0.69655	valid-auc:0.655989
[30]	train-auc:0.702517	valid-auc:0.658963
[31]	train-auc:0.703235	valid-auc:0.659367
[32]	train-auc:0.703809	valid-auc:0.658496
[33]	train-auc:0.705692	valid-auc:0.659471
[34]	train-auc:0.706796	valid-auc:0.661676
[35]	train-auc:0.710808	valid-auc:0.657743
[36]	train-auc:0.714246	valid-auc:0.657883
[37]	train-auc:0.715664	valid-auc:0.658442
[38]	train-auc:0.7171	valid-auc:0.659525
[39]	train-auc:0.72382	valid-auc:0.667035
[40]	train-auc:0.726973	valid-auc:0.667036
[41]	train-auc:0.730504	valid-auc:0.667798
[42]	train-auc:0.733809	valid-auc:0.664366
[43]	train-auc:0.737181	valid-auc:0.665485
[44]	train-auc:0.74189	valid-auc:0.665378
[45]	train-auc:0.743097	valid-auc:0.665339
[46]	train-auc:0.745711	valid-auc:0.665721
[47]	train-auc:0.750234	valid-auc:0.666006
[48]	train-auc:0.754497	valid-auc:0.666261
[49]	train-auc:0.75712	valid-auc:0.667318
[50]	train-auc:0.76116	valid-auc:0.667696
[51]	train-auc:0.765612	valid-auc:0.664916
[52]	train-auc:0.76796	valid-auc:0.666024
[53]	train-auc:0.770412	valid-auc:0.664593
[54]	train-auc:0.77209	valid-auc:0.665844
[55]	train-auc:0.773675	valid-auc:0.665973
[56]	train-auc:0.774595	valid-auc:0.665499
[57]	train-auc:0.776013	valid-auc:0.669245
[58]	train-auc:0.779417	valid-auc:0.669799
[59]	train-auc:0.78121	valid-auc:0.670826
[60]	train-auc:0.783204	valid-auc:0.67062
[61]	train-auc:0.784371	valid-auc:0.670754
[62]	train-auc:0.785739	valid-auc:0.670501
[63]	train-auc:0.788704	valid-auc:0.669534
[64]	train-auc:0.790803	valid-auc:0.671029
[65]	train-auc:0.794323	valid-auc:0.672207
[66]	train-auc:0.797002	valid-auc:0.668739
[67]	train-auc:0.798941	valid-auc:0.667835
[68]	train-auc:0.800274	valid-auc:0.667309
[69]	train-auc:0.801656	valid-auc:0.666381
[70]	train-auc:0.803449	valid-auc:0.666972
[71]	train-auc:0.805201	valid-auc:0.666703
[72]	train-auc:0.808459	valid-auc:0.666504
[73]	train-auc:0.809873	valid-auc:0.666805
[74]	train-auc:0.811064	valid-auc:0.666201
[75]	train-auc:0.812569	valid-auc:0.66594
[76]	train-auc:0.814129	valid-auc:0.665224
[77]	train-auc:0.817674	valid-auc:0.665721
[78]	train-auc:0.820418	valid-auc:0.66505
[79]	train-auc:0.822119	valid-auc:0.664598
[80]	train-auc:0.82485	valid-auc:0.663782
[81]	train-auc:0.827209	valid-auc:0.665227
[82]	train-auc:0.829144	valid-auc:0.664217
[83]	train-auc:0.831815	valid-auc:0.662836
[84]	train-auc:0.832206	valid-auc:0.662424
[85]	train-auc:0.834265	valid-auc:0.662916
[86]	train-auc:0.836778	valid-auc:0.662213
[87]	train-auc:0.838666	valid-auc:0.661718
[88]	train-auc:0.840203	valid-auc:0.661087
[89]	train-auc:0.841724	valid-auc:0.660848
[90]	train-auc:0.843744	valid-auc:0.660664
[91]	train-auc:0.847277	valid-auc:0.661182
[92]	train-auc:0.849616	valid-auc:0.662085
[93]	train-auc:0.851121	valid-auc:0.66248
[94]	train-auc:0.852751	valid-auc:0.662537
[95]	train-auc:0.854391	valid-auc:0.662752
[96]	train-auc:0.855938	valid-auc:0.663607
[97]	train-auc:0.857686	valid-auc:0.664285
[98]	train-auc:0.860355	valid-auc:0.662694
[99]	train-auc:0.862614	valid-auc:0.662459
[100]	train-auc:0.863344	valid-auc:0.662029
[101]	train-auc:0.865265	valid-auc:0.660631
[102]	train-auc:0.866996	valid-auc:0.661501
[103]	train-auc:0.86878	valid-auc:0.660817
[104]	train-auc:0.870956	valid-auc:0.661349
[105]	train-auc:0.87223	valid-auc:0.660619
[106]	train-auc:0.873951	valid-auc:0.661776
[107]	train-auc:0.875289	valid-auc:0.662393
[108]	train-auc:0.876732	valid-auc:0.661577
[109]	train-auc:0.87686	valid-auc:0.661645
[110]	train-auc:0.878495	valid-auc:0.66209
[111]	train-auc:0.880265	valid-auc:0.66352
[112]	train-auc:0.88195	valid-auc:0.664038
[113]	train-auc:0.884231	valid-auc:0.664354
[114]	train-auc:0.886128	valid-auc:0.662974
[115]	train-auc:0.887824	valid-auc:0.663911
Stopping. Best iteration:
[65]	train-auc:0.794323	valid-auc:0.672207

[mlcrate] Finished training fold 6 - took 7s - running score 0.6734004285714287
[mlcrate] Finished training 7 XGBoost models, took 45s

In [36]:
def make_submission(probs):
    sample = pd.read_csv(f'{PATH}\\AV_Stud_2\\sample_submission.csv')
    submit = sample.copy()
    submit['target'] = probs
    return submit

In [37]:
submit = make_submission(p_test)
submit.to_csv(f'{PATH}\\AV_Stud_2\\xgb_remake_2.csv', index=False)
submit.head(2)

# np.save(f'{PATH}\\AV_Stud_2\\xgb_oof_1207.npy', p_train)
# np.save(f'{PATH}\\AV_Stud_2\\raw_train_dummy_impact_train67.npy', X_train_stack)
# np.save(f'{PATH}\\AV_Stud_2\\raw_test_dummy_impact_test67.npy', X_test_stack)

# np.save(f'{PATH}\\AV_Stud_2\\train_no_cat_with_std_pca.npy', df_raw)
# np.save(f'{PATH}\\AV_Stud_2\\test_no_cat_with_std_pca.npy', df_test)


Out[37]:
enrollee_id target
0 16548 0.212668
1 12036 0.021563

In [272]:
df_raw.to_csv(f'{PATH}\\AV_Stud_2\\clean_train_1207.csv', index=False)
df_test.to_csv(f'{PATH}\\AV_Stud_2\\clean_test_1207.csv', index=False)

In [4]:
train = np.load(f'{PATH}\\AV_Stud_2\\train_dummy_impact_train67.npy')

In [5]:
np.savetxt(f'{PATH}\\AV_Stud_2\\train_dummy_impact_train67.csv',train, delimiter=',')

In [6]:
test = np.load(f'{PATH}\\AV_Stud_2\\test_dummy_impact_test67.npy')
np.savetxt(f'{PATH}\\AV_Stud_2\\test_dummy_impact_train67.csv',test, delimiter=',')

In [15]:
clf_et = ExtraTreesClassifier(criterion='entropy',max_leaf_nodes=0,n_estimators=500,\
                             min_impurity_split=0.0001,n_jobs=4,max_features=0.7,max_depth=8,min_samples_leaf=1,\
                             class_weight='balanced')

In [40]:
# Create parameters to search
gridParams = {
    'learning_rate': [0.05],
    'n_estimators': [100, 150, 250, 500],
    'num_leaves': [255,511],
    'boosting_type' : ['gbdt'],
    'objective' : ['binary'],
    'random_state' : [501], # Updated from 'seed'
    'colsample_bytree' : [.7, 0.74, 0.75, 0.76, .85],
    'subsample' : [0.7,0.75, .8],
    'reg_alpha' : [1,1.2],
    'reg_lambda' : [1,1.2,1.4],
    }

# Create classifier to use. Note that parameters have to be input manually
# not as a dict!
mdl = lgb.LGBMClassifier(boosting_type= 'gbdt', 
          objective = 'binary', 
          n_jobs = -1, # Updated from 'nthread' 
          silent = False,
          max_depth = 8,
          max_bin = 128, 
          subsample_for_bin = 1,
          subsample = .7, 
          subsample_freq = 1, 
          min_split_gain = .5,  
          scale_pos_weight = 0.1346)

# To view the default model params:
mdl.get_params().keys()


Out[40]:
dict_keys(['boosting_type', 'class_weight', 'colsample_bytree', 'learning_rate', 'max_depth', 'min_child_samples', 'min_child_weight', 'min_split_gain', 'n_estimators', 'n_jobs', 'num_leaves', 'objective', 'random_state', 'reg_alpha', 'reg_lambda', 'silent', 'subsample', 'subsample_for_bin', 'subsample_freq', 'max_bin', 'scale_pos_weight'])

In [41]:
# Create the grid
grid = GridSearchCV(mdl, gridParams, verbose=1, cv=5, n_jobs=-1)
# Run the grid
grid.fit(X_train_stack, target)

# Print the best parameters found
print(grid.best_params_)
print(grid.best_score_)


Fitting 5 folds for each of 720 candidates, totalling 3600 fits
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   11.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   33.7s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed: 11.8min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 15.4min
[Parallel(n_jobs=-1)]: Done 3600 out of 3600 | elapsed: 18.0min finished
{'boosting_type': 'gbdt', 'colsample_bytree': 0.7, 'learning_rate': 0.05, 'n_estimators': 100, 'num_leaves': 255, 'objective': 'binary', 'random_state': 501, 'reg_alpha': 1, 'reg_lambda': 1, 'subsample': 0.7}
0.867912195653

In [45]:
params = {}

In [46]:
# Using parameters already set above, replace in the best from the grid search
params['colsample_bytree'] = grid.best_params_['colsample_bytree']
params['learning_rate'] = 0.05 
# params['max_bin'] = grid.best_params_['max_bin']
params['num_leaves'] = grid.best_params_['num_leaves']
params['reg_alpha'] = grid.best_params_['reg_alpha']
params['reg_lambda'] = grid.best_params_['reg_lambda']
params['subsample'] = grid.best_params_['subsample']

print('Fitting with params: ')
print(params)

# Kit k models with early-stopping on different training/validation splits
k = 2;
predsValid = 0 
predsTrain = 0
predsTest = 0
l = ['split','gain']
for i in range(0, k): 
    print('Fitting model', i)
    
    # Prepare the data set for fold
    X_train, X_test, y_train, y_test = train_test_split(X_train_stack, target, test_size=0.2, random_state=7, shuffle=True)
    
    d_train = lgb.Dataset(X_train, label=y_train)
    d_test = lgb.Dataset(X_test, label=y_test)
    
    # Train     
    gbm = lgb.train(params,
                    d_train, 
                    1000, 
                    verbose_eval=1)

    # Plot importance
    lgb.plot_importance(gbm,max_num_features=10, importance_type=l[i])
    plt.show()


Fitting with params: 
{'colsample_bytree': 0.7, 'learning_rate': 0.05, 'num_leaves': 255, 'reg_alpha': 1, 'reg_lambda': 1, 'subsample': 0.7}
Fitting model 0
Fitting model 1