In [1]:
# Load the autoreload extension and set it to mode 2 for imported modules.
%load_ext autoreload
%autoreload 2

# Draw matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import time
import xgboost as xgb
import lightgbm as lgb
# import category_encoders as cat_ed
import gc, mlcrate, glob

# from gplearn.genetic import SymbolicTransformer, SymbolicClassifier
from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, ExtraTreesRegressor
from IPython.display import display

from catboost import CatBoostClassifier
from scipy.cluster import hierarchy as hc
from collections import Counter

from sklearn import metrics
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import  roc_auc_score, log_loss
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA, TruncatedSVD, FastICA, FactorAnalysis
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.cluster import KMeans

from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

# Silence every warning raised through warnings.warn (sklearn, seaborn, ...)
# by swapping the function for a no-op.
def ignore_warn(*args, **kwargs):
    """Accept any arguments, do nothing, and return None."""
    return None
warnings.warn = ignore_warn

# Show up to 1000 rows / columns when a frame is displayed.
# pd.option_context only has an effect inside a `with` block, so the bare calls
# changed nothing; pd.set_option applies the setting for the rest of the session.
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 1000)

In [3]:
# Directory the notebook is run from; train.csv and test.csv are read from it below.
PATH = os.getcwd()
PATH


Out[3]:
'D:\\Github\\fastai\\courses\\ml1\\AV_WNS'

In [45]:
# Load the train and test files from the working directory.
# os.path.join builds the path with the platform's own separator; the previous
# hard-coded '\\' only produced a valid path on Windows.
df_raw = pd.read_csv(os.path.join(PATH, 'train.csv'), low_memory=False)
df_test = pd.read_csv(os.path.join(PATH, 'test.csv'), low_memory=False)

In [46]:
# (rows, columns) of the train and test frames as loaded.
df_raw.shape, df_test.shape


Out[46]:
((54808, 14), (23490, 13))

In [47]:
# Column names applied to both frames; `is_promoted` is last and is left off for the test frame.
new_cols = [
    'employee_id', 'department', 'region', 'education', 'gender',
    'recruitment_channel', 'no_of_trainings', 'age', 'previous_year_rating',
    'length_of_service', 'KPIs_met_more_than_80_percent', 'awards_won_bool',
    'avg_training_score', 'is_promoted',
]
df_raw.columns = new_cols
df_test.columns = new_cols[:len(new_cols) - 1]

# Split the training columns by dtype: object columns vs. everything else.
cat_cols = df_raw.select_dtypes(include=['object']).columns.tolist()
num_cols = df_raw.select_dtypes(exclude=['object']).columns.tolist()

In [48]:
# Missing education values are replaced with the most frequent level.
# The mode is taken from the training frame only and reused for the test frame, so both
# frames get the same preprocessing and no statistic is estimated from the test data.
mode_education = df_raw.education.value_counts().index.values[0]
df_raw.education = df_raw.education.fillna(mode_education)
df_test.education = df_test.education.fillna(mode_education)

In [49]:
# Education is an ordinal feature, so encode it as increasing integers
# (the higher the level, the larger the value).
repl = {"Below Secondary":1, "Bachelor's":2, "Master's & above":3}
for frame in (df_raw, df_test):
    frame['education'] = frame['education'].replace(repl)

In [50]:
# A missing previous-year rating is filled with 0 in both frames.
for frame in (df_raw, df_test):
    frame['previous_year_rating'] = frame['previous_year_rating'].fillna(0)

In [51]:
def add_age_features(df):
    """Add the age indicator flags and the coarse ``age_group`` label to ``df`` in place.

    Columns are appended in this order:
      * ``is_age_30``, ``is_age_30_39``, ``is_age_39_45``, ``is_age_45``: 0.0/1.0 flags.
        Both ends of every range are inclusive, so ages 30, 39 and 45 set two flags each.
      * ``age_group``: 'young' (20-30), 'middle_aged' (31-50) or 'senior_aged' (51-60).
        Ages outside 20-60 are left missing (NaN) instead of the float 0.0 placeholder
        that used to be mixed into this string column.

    Rows are selected with boolean masks. The previous ``df.iloc[<index labels>, -1]``
    pattern used index labels as positions, which is only correct while the frame has
    the default 0..n-1 index.
    """
    age = df['age']

    # kind of binning age at trivial level
    df['is_age_30'] = (age <= 30).astype(float)
    df['is_age_30_39'] = ((age >= 30) & (age <= 39)).astype(float)
    df['is_age_39_45'] = ((age >= 39) & (age <= 45)).astype(float)
    df['is_age_45'] = (age >= 45).astype(float)

    # young age (20-30), middle age (31-50) and senior age (51-60)
    age_group = pd.Series(np.nan, index=df.index, dtype=object)
    age_group.loc[(age >= 20) & (age <= 30)] = 'young'
    age_group.loc[(age >= 31) & (age <= 50)] = 'middle_aged'
    age_group.loc[(age >= 51) & (age <= 60)] = 'senior_aged'
    df['age_group'] = age_group


# Same feature engineering for the training and the test frame.
add_age_features(df_raw)
add_age_features(df_test)
###############################################################################

feats_added = []

# Age at joining = current age minus length of service.
for frame in (df_raw, df_test):
    frame['joining_age'] = frame['age'] - frame['length_of_service']
feats_added.append('joining_age')

# Strip the 'region_' prefix so only the part after it remains.
# The result is assigned back to the column. The previous call,
# Series.replace('region_', '', True, None, True), passed inplace/limit/regex
# positionally (keyword-only in current pandas) and mutated a column accessor in place.
for frame in (df_raw, df_test):
    frame['region'] = frame['region'].str.replace('region_', '')
################################################################################3
# Bin edges per column; bin k is labelled with the integer k (1-based).
# NOTE(review): pd.cut is called with its default interval settings, so a value equal to
# the first edge (age 20, score 39) or above the last edge should fall outside every bin
# and become NaN -- confirm that this is intended.
binning = [
    ('age', [20., 25., 30., 35., 40., 45., 50., 55., 60., 70]),
    ('avg_training_score', [39., 44., 54., 66., 75., 80., 85., 90., 95.]),
]
for col, bins in binning:
    labels = list(range(1, len(bins)))
    for frame in (df_raw, df_test):
        frame[f'bin_{col}'] = pd.cut(frame[col], bins, labels=labels)
    feats_added.append(f'bin_{col}')

# Names of the remaining engineered columns.
# extend() keeps feats_added a flat list of strings (append() stored the whole list as a
# single nested element), and the age flags are listed under the names the columns
# actually have: 'is_age_30' and 'is_age_30_39' instead of the non-existent 'is_age_39'.
feats_added.extend([
    'age_group', 'is_age_30', 'is_age_30_39', 'is_age_39_45', 'is_age_45', 'promotion_chance',
    'reg_count', 'mean_age_per_region', 'mean_joining_age_per_region',
    'mean_previous_year_rating_per_region', 'mean_avg_training_score_per_region',
    'mean_length_of_service_per_region',
])
################################################################################################
def add_promotion_chance(df):
    """Add a ``promotion_chance`` label derived from ``avg_training_score`` to ``df`` in place.

    Score ranges and labels:
      >= 90       -> 'very_high'
      [75, 90)    -> 'high'
      [65, 75)    -> 'medium'
      [53, 65)    -> 'low_medium'
      otherwise   -> 'low' (including missing scores)

    Rows are selected with boolean masks and the column is addressed by name. The previous
    ``df.iloc[<index labels>, -1]`` pattern was only correct with a default 0..n-1 index
    and with the new column in last position.
    """
    score = df['avg_training_score']
    df['promotion_chance'] = 'low'
    df.loc[score >= 90, 'promotion_chance'] = 'very_high'
    df.loc[(score >= 75) & (score < 90), 'promotion_chance'] = 'high'
    df.loc[(score >= 65) & (score < 75), 'promotion_chance'] = 'medium'
    df.loc[(score >= 53) & (score < 65), 'promotion_chance'] = 'low_medium'


# Same labelling for the training and the test frame.
add_promotion_chance(df_raw)
add_promotion_chance(df_test)
###############################################################################################
def map_(regs, age):
    """Return a dict pairing each item of ``regs`` with the item of ``age`` at the same position.

    Pairing stops at the shorter input; when a key occurs more than once the last value wins.
    """
    return dict(zip(regs, age))

def add_region_stats(df):
    """Add per-region aggregates to ``df`` in place.

    Columns are appended in this order:
      * ``reg_count``: number of rows in ``df`` that share the row's region.
      * ``mean_<col>_per_region`` for age, joining_age, previous_year_rating,
        avg_training_score and length_of_service: mean of ``<col>`` over those rows.

    The statistics are computed from ``df`` itself, so the training and the test frame
    each use their own per-region values.

    Only the needed numeric columns are aggregated. The previous
    ``df.groupby('region').mean()`` averaged every column of the frame, which raises on
    the string columns in current pandas.
    """
    df['reg_count'] = df['region'].map(df['region'].value_counts())
    stat_cols = ('age', 'joining_age', 'previous_year_rating',
                 'avg_training_score', 'length_of_service')
    for col in stat_cols:
        df[f'mean_{col}_per_region'] = df.groupby('region')[col].transform('mean')


add_region_stats(df_raw)
add_region_stats(df_test)
####################################################################################


# The helper keeps its temporaries local, so there is nothing left to `del` here;
# the explicit collection is kept as the cell's final expression.
gc.collect()


Out[51]:
4368

In [52]:
# (rows, columns) of both frames after the feature-engineering cell.
df_raw.shape, df_test.shape


Out[52]:
((54808, 29), (23490, 28))

In [54]:
# train_cats / apply_cats are not defined in this notebook; they come from one of the
# star imports at the top.
# NOTE(review): presumably train_cats turns the string columns of df_raw into pandas
# categoricals and apply_cats applies the same categories to df_test -- confirm against
# the installed fastai version.
train_cats(df_raw);
apply_cats(df_test,df_raw)

In [55]:
# Take the label column out of the training frame, then remove the identifier from both frames.
target = df_raw.pop('is_promoted')
for frame in (df_raw, df_test):
    frame.drop(['employee_id'], axis=1, inplace=True)

In [56]:
# Recompute the column groups now that the string columns are categoricals.
cat_cols = df_raw.select_dtypes(include=['category']).columns.tolist()
num_cols = df_raw.select_dtypes(exclude=['category']).columns.tolist()
for cols in (cat_cols, num_cols):
    print(cols)


['department', 'region', 'gender', 'recruitment_channel', 'age_group', 'bin_age', 'bin_avg_training_score', 'promotion_chance']
['education', 'no_of_trainings', 'age', 'previous_year_rating', 'length_of_service', 'KPIs_met_more_than_80_percent', 'awards_won_bool', 'avg_training_score', 'is_age_30', 'is_age_30_39', 'is_age_39_45', 'is_age_45', 'joining_age', 'reg_count', 'mean_age_per_region', 'mean_joining_age_per_region', 'mean_previous_year_rating_per_region', 'mean_avg_training_score_per_region', 'mean_length_of_service_per_region']

In [57]:
# One-hot encode the categorical columns of each frame, dropping the first level of every column.
dummy_train, dummy_test = (
    pd.get_dummies(frame[cat_cols], drop_first=True) for frame in (df_raw, df_test)
)

In [58]:
from sklearn.preprocessing import Normalizer

In [59]:
# Transform the numeric training columns with sklearn's Normalizer and write them back.
# NOTE(review): Normalizer appears to rescale each ROW to unit norm rather than each
# column -- in the output below `reg_count` is ~0.99 on every row and the other features
# are squeezed towards 0. Confirm that per-row normalisation, not a per-feature scaler,
# is what is wanted here.
normalizer = Normalizer(copy=False)
df_raw[num_cols] = normalizer.fit_transform(df_raw[num_cols])
df_raw[num_cols].head()


Out[59]:
education no_of_trainings age previous_year_rating length_of_service KPIs_met_more_than_80_percent awards_won_bool avg_training_score is_age_30 is_age_30_39 is_age_39_45 is_age_45 joining_age reg_count mean_age_per_region mean_joining_age_per_region mean_previous_year_rating_per_region mean_avg_training_score_per_region mean_length_of_service_per_region
0 0.000619 0.000206 0.007225 0.001032 0.001651 0.000206 0.0 0.010115 0.000000 0.000206 0.000000 0.000000 0.005574 0.999771 0.007358 0.006073 0.000666 0.013200 0.001285
1 0.000311 0.000156 0.004666 0.000778 0.000622 0.000000 0.0 0.009333 0.000156 0.000156 0.000000 0.000000 0.004044 0.999862 0.005028 0.004269 0.000466 0.010219 0.000759
2 0.002273 0.001136 0.038638 0.003409 0.007955 0.000000 0.0 0.056821 0.000000 0.001136 0.000000 0.000000 0.030683 0.993236 0.041040 0.033709 0.003702 0.069629 0.007331
3 0.001695 0.001695 0.033058 0.000848 0.008476 0.000000 0.0 0.042382 0.000000 0.000848 0.000848 0.000000 0.024582 0.995979 0.029270 0.024289 0.002745 0.054529 0.004981
4 0.000884 0.000442 0.019882 0.001325 0.000884 0.000000 0.0 0.032252 0.000000 0.000000 0.000442 0.000442 0.018998 0.998495 0.014764 0.012452 0.001317 0.028755 0.002312

In [60]:
# Apply the transformer that was fitted on the training data; a preprocessing step should
# never be re-fitted on the test set.
# NOTE(review): sklearn documents Normalizer as stateless, so the resulting values should
# be unchanged -- transform() simply stays correct if a stateful scaler replaces it later.
df_test[num_cols] = normalizer.transform(df_test[num_cols])
df_test.head()


Out[60]:
department region education gender recruitment_channel no_of_trainings age previous_year_rating length_of_service KPIs_met_more_than_80_percent ... joining_age bin_age bin_avg_training_score promotion_chance reg_count mean_age_per_region mean_joining_age_per_region mean_previous_year_rating_per_region mean_avg_training_score_per_region mean_length_of_service_per_region
0 Technology 26 0.001966 m sourcing 0.000983 0.023587 0.000000 0.000983 0.000983 ... 0.022604 1 5 high 0.993591 0.033154 0.027869 0.002953 0.063747 0.005285
1 HR 4 0.002558 f other 0.001279 0.039655 0.003838 0.006396 0.000000 ... 0.033259 3 2 low 0.991375 0.046475 0.038277 0.004324 0.080408 0.008198
2 Sales & Marketing 13 0.001708 m other 0.000854 0.026467 0.000854 0.003415 0.000000 ... 0.023052 3 2 low 0.996359 0.030435 0.025019 0.002710 0.053111 0.005416
3 Procurement 2 0.000377 f other 0.000566 0.005849 0.000377 0.001698 0.000000 ... 0.004151 3 3 medium 0.999781 0.007312 0.005969 0.000591 0.011929 0.001343
4 Finance 29 0.004678 m sourcing 0.002339 0.070175 0.009357 0.016374 0.000000 ... 0.053801 2 3 low_medium 0.968420 0.073955 0.062926 0.006701 0.154900 0.011029

5 rows × 27 columns


In [61]:
#################### VIF (variance inflation factor) for Continuous Features ##########################

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# .iloc[:, 1:] leaves out the first numeric column (`education` is absent from the output
# below); add_constant contributes the `const` column that appears there.
X = add_constant(df_raw[num_cols].iloc[:,1:])
# One VIF value per column of X, indexed by column name.
pd.Series([variance_inflation_factor(X.values, i) for i in range(X.shape[1])], index=X.columns)


C:\ProgramData\Anaconda3\lib\site-packages\statsmodels\stats\outliers_influence.py:181: RuntimeWarning: divide by zero encountered in double_scalars
  vif = 1. / (1. - r_squared_i)
Out[61]:
const                                   4.353502e+04
no_of_trainings                         4.104262e+00
age                                              inf
previous_year_rating                    3.494831e+00
length_of_service                                inf
KPIs_met_more_than_80_percent           1.355360e+00
awards_won_bool                         1.028833e+00
avg_training_score                      1.469532e+01
is_age_30                               6.019026e+00
is_age_30_39                            4.614607e+00
is_age_39_45                            2.591466e+00
is_age_45                               3.015141e+00
joining_age                                      inf
reg_count                               1.527225e+01
mean_age_per_region                              inf
mean_joining_age_per_region                      inf
mean_previous_year_rating_per_region    2.418972e+02
mean_avg_training_score_per_region      4.220415e+02
mean_length_of_service_per_region                inf
dtype: float64

In [73]:
# Remove the original categorical columns from both frames (in place).
for frame in (df_raw, df_test):
    frame.drop(cat_cols, axis=1, inplace=True)

# Final design matrices: remaining columns side by side with the dummy columns.
train_all = pd.concat((df_raw, dummy_train), axis=1)
test_all = pd.concat((df_test, dummy_test), axis=1)

In [74]:
# (rows, columns) of the assembled train and test matrices.
train_all.shape, test_all.shape


Out[74]:
((54808, 84), (23490, 84))

In [77]:
# Feature importance using Random Forest

from sklearn.ensemble import RandomForestClassifier

# Build a 100-tree forest and fit it on every training row.
clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
clf.fit(train_all, target)

# Print one (feature name, importance * 100) tuple per line.
importances_pct = clf.feature_importances_ * 100
for name, score in zip(train_all.columns, importances_pct):
    print((name, score))


('education', 2.6807222629270711)
('no_of_trainings', 2.7006303484429064)
('age', 6.1499829807096491)
('previous_year_rating', 5.394392762980468)
('length_of_service', 6.1949218991485022)
('KPIs_met_more_than_80_percent', 5.4396756516265707)
('awards_won_bool', 2.8411076407330653)
('avg_training_score', 7.9218787141748788)
('is_age_30', 0.80314493211120952)
('is_age_30_39', 1.3173850817446546)
('is_age_39_45', 0.66173541609123954)
('is_age_45', 0.32295421178422412)
('joining_age', 6.8002603198141554)
('reg_count', 8.1269111662097178)
('mean_age_per_region', 2.6707107147632301)
('mean_joining_age_per_region', 2.598398754985737)
('mean_previous_year_rating_per_region', 1.4140894848999219)
('mean_avg_training_score_per_region', 3.1315744163000452)
('mean_length_of_service_per_region', 1.7221517175781667)
('department_Finance', 0.76025016491243258)
('department_HR', 0.39584257939066331)
('department_Legal', 0.22140642064580474)
('department_Operations', 2.7511116903300152)
('department_Procurement', 1.9530847277556873)
('department_R&D', 0.30695440032463034)
('department_Sales & Marketing', 2.3914798098574774)
('department_Technology', 1.044152574753386)
('region_10', 0.029324450809734833)
('region_11', 0.044024039995606105)
('region_12', 0.01696362991149386)
('region_13', 0.068189296806663607)
('region_14', 0.042432619661090631)
('region_15', 0.053585406378968449)
('region_16', 0.043802177970569608)
('region_17', 0.066667394040104022)
('region_18', 0.00076170627263315522)
('region_19', 0.03505304677552535)
('region_2', 0.047925862120192488)
('region_20', 0.034185694337270742)
('region_21', 0.014669945309561349)
('region_22', 0.083508509150106006)
('region_23', 0.073323077647171067)
('region_24', 0.017471862774138584)
('region_25', 0.059940693759993655)
('region_26', 0.06194857484573954)
('region_27', 0.055039304733388715)
('region_28', 0.066559842892834939)
('region_29', 0.046475912566782192)
('region_3', 0.029976330521906538)
('region_30', 0.042276828765003278)
('region_31', 0.061961380753369849)
('region_32', 0.028286212775545688)
('region_33', 0.0046851269709223091)
('region_34', 0.0075285297280966845)
('region_4', 0.12983056393887471)
('region_5', 0.021782913540163276)
('region_6', 0.024800040476965803)
('region_7', 0.073426308612046187)
('region_8', 0.030528594564657278)
('region_9', 0.0065697528073624277)
('gender_m', 1.7450552471818168)
('recruitment_channel_referred', 0.45489337569108257)
('recruitment_channel_sourcing', 1.8558354365828966)
('age_group_senior_aged', 0.12628156917156128)
('age_group_young', 0.35345242263292354)
('bin_age_2', 0.51402230382992364)
('bin_age_3', 0.60292666733464673)
('bin_age_4', 0.5503923373906815)
('bin_age_5', 0.30129929749629075)
('bin_age_6', 0.18593332047730757)
('bin_age_7', 0.12210517332372907)
('bin_age_8', 0.099733343441113231)
('bin_age_9', 0.0)
('bin_avg_training_score_2', 0.80310061293270651)
('bin_avg_training_score_3', 0.9323665406464885)
('bin_avg_training_score_4', 0.72063018666259482)
('bin_avg_training_score_5', 0.42545261431333703)
('bin_avg_training_score_6', 0.57556359883763619)
('bin_avg_training_score_7', 0.66317662874619066)
('bin_avg_training_score_8', 2.3898678327158827)
('promotion_chance_low', 0.64415463081940461)
('promotion_chance_low_medium', 0.69939111583440994)
('promotion_chance_medium', 0.66130865788351179)
('promotion_chance_very_high', 4.432640608843867)

In [80]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the training rows for validation; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train_all,
    target,
    test_size=0.3,
    random_state=124,
)

In [81]:
# Computing class weights for binary classification
# The formula below is left commented out; the weights are hard-coded instead.
#weights = dict(1 - train_all.is_promoted.value_counts()/len(train_all))
# Class 0 gets weight 0.08 and class 1 gets weight 0.92.
weights = {0.0: 0.08, 1.0: 0.92}

In [104]:
from sklearn.linear_model import LogisticRegression

# L2-penalised logistic regression using the manual class weights.
logit_clf = LogisticRegression(
    penalty='l2',
    C=5,
    tol=1e-4,
    max_iter=500,
    verbose=1,
    n_jobs=-1,
    class_weight=weights,
)
logit_clf.fit(X_train, y_train)
logit_clf.score(X_test, y_test)

from sklearn.metrics import classification_report, confusion_matrix

# Hold-out predictions: predicted class counts, per-class report, confusion matrix.
y = logit_clf.predict(X_test)
summaries = (
    pd.Series(y).value_counts(),
    classification_report(y_test, y),  # target_names=['not_promoted', 'promoted']
    confusion_matrix(y_test, y),
)
for summary in summaries:
    print(summary)


[LibLinear]0    12101
1     4342
dtype: int64
             precision    recall  f1-score   support

          0       0.96      0.77      0.85     15101
          1       0.19      0.61      0.29      1342

avg / total       0.89      0.75      0.80     16443

[[11574  3527]
 [  527   815]]

In [105]:
from sklearn.model_selection import ShuffleSplit, cross_val_score

# Three random 70/30 re-splits of the training portion; no `scoring` argument is passed.
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0)
scores = cross_val_score(estimator=logit_clf, X=X_train, y=y_train, cv=cv)
scores


[LibLinear][LibLinear][LibLinear]
Out[105]:
array([ 0.74309,  0.74978,  0.74292])

In [106]:
# Hard class predictions of the logistic model for the test matrix.
pred = logit_clf.predict(test_all)
pred


Out[106]:
array([0, 0, 0, ..., 0, 0, 1], dtype=int64)

In [107]:
# How many test rows the logistic model assigns to each class (NaN would be counted too).
pd.Series(pred).value_counts(dropna=False)


Out[107]:
0    17092
1     6398
dtype: int64

In [111]:
from lightgbm import LGBMClassifier

# Gradient-boosted trees with a cross-entropy objective; all hyper-parameters spelled out.
clf = LGBMClassifier(
    boosting_type='gbdt',
    num_leaves=80,
    max_depth=-1,
    learning_rate=0.01,
    n_estimators=1000,
    subsample_for_bin=1000,
    objective="xentropy",
    min_split_gain=0.0,
    min_child_weight=0.01,
    min_child_samples=10,
    subsample=0.9,
    subsample_freq=1,
    colsample_bytree=0.7,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=1,
    n_jobs=4,
)

In [120]:
# Fit the classifier on the training portion of the hold-out split.
clf.fit(X_train, y_train)


Out[120]:
LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.7,
        learning_rate=0.01, max_depth=-1, min_child_samples=10,
        min_child_weight=0.01, min_split_gain=0.0, n_estimators=1000,
        n_jobs=4, num_leaves=80, objective='xentropy', random_state=1,
        reg_alpha=0.1, reg_lambda=0.1, silent=True, subsample=0.9,
        subsample_for_bin=1000, subsample_freq=1)

In [122]:
# Second predict_proba column (the probability of class 1) for the hold-out rows and the test rows.
val_preds, p_test = (clf.predict_proba(frame)[:, 1] for frame in (X_test, test_all))

In [124]:
import numpy as np
from sklearn.metrics import matthews_corrcoef

def find_matthews_threshold(p_valid, y_valid, try_all=False, verbose=False):
    """Return the probability threshold that maximises the Matthews correlation coefficient.

    Args:
        p_valid: predicted scores/probabilities for the validation rows.
        y_valid: true labels for the same rows.
        try_all: when truthy, every distinct value of ``p_valid`` is tried as a threshold;
            otherwise the fixed grid 0.00, 0.01, ..., 0.99 is used.
        verbose: when truthy, print the best score and its threshold.

    Returns:
        The first threshold ``t`` whose prediction ``p_valid > t`` reaches the highest
        score, or 0 when there is no threshold to try.
    """
    p_valid, y_valid = np.asarray(p_valid), np.asarray(y_valid)

    best = 0
    best_score = -2  # lower than any correlation, so the first candidate is always taken
    # Plain truthiness instead of `is False` / `is True`: with the identity checks a
    # falsy non-bool such as try_all=0 selected the try-all branch, and verbose=1 never printed.
    candidates = np.unique(p_valid) if try_all else np.arange(0, 1, 0.01)
    # A threshold that puts every row in one class gives a zero denominator inside
    # matthews_corrcoef; silence numpy's floating-point RuntimeWarning for those cases.
    with np.errstate(divide='ignore', invalid='ignore'):
        for t in candidates:
            score = matthews_corrcoef(y_valid, p_valid > t)
            if score > best_score:
                best_score = score
                best = t
    if verbose:
        print('Best score: ', round(best_score, 5), ' @ threshold ', best)

    return best

def best_threshold_submission(p_valid, y_valid, p_test, try_all=False, verbose=False):
    """Threshold ``p_test`` at the cut-off found on the validation predictions.

    The cut-off comes from ``find_matthews_threshold(p_valid, y_valid, try_all, verbose)``;
    the result is a boolean array that is True where ``p_test`` is strictly above it.
    """
    test_scores = np.array(p_test)
    cutoff = find_matthews_threshold(p_valid, y_valid, try_all, verbose)
    is_positive = test_scores > cutoff
    return is_positive

# Search every distinct validation probability for the best cut-off, print it, and apply it to the test probabilities.
submission_values = best_threshold_submission(
    val_preds, y_test, p_test, try_all=True, verbose=True
)


C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\classification.py:538: RuntimeWarning: invalid value encountered in double_scalars
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
Best score:  0.50976  @ threshold  0.511860397358

In [114]:
# Class counts of clf.predict() on the test matrix.
pd.Series(clf.predict(test_all)).value_counts()


Out[114]:
0    22899
1      591
dtype: int64

In [125]:
# Class counts when the class-1 probability is cut at a hard-coded 0.51.
# NOTE(review): the threshold search above reports ~0.5119 and compares with `>`, while
# this line uses 0.51 and `>=` -- confirm which cut-off is intended.
pd.Series(np.where(clf.predict_proba(test_all)[:,1]>=0.51,1,0)).value_counts()


Out[125]:
0    22910
1      580
dtype: int64

In [115]:
# Read sample_submission.csv via a relative path (not built from PATH like the data files).
subs = pd.read_csv('sample_submission.csv')

In [128]:
# Store the boolean decisions as 0/1 integers in the submission frame.
subs['is_promoted'] = submission_values.astype(int)

In [129]:
# Write the submission file without the index column.
subs.to_csv('lgb.csv', index=False)