notebook.community

Edit and run



In [1]:

    
%load_ext autoreload
%autoreload 2

%matplotlib inline



In [2]:

    
import time
import xgboost as xgb
import lightgbm as lgb
# import category_encoders as cat_ed
import gc, mlcrate, glob

# from gplearn.genetic import SymbolicTransformer, SymbolicClassifier
from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, ExtraTreesRegressor
from IPython.display import display

from catboost import CatBoostClassifier
from scipy.cluster import hierarchy as hc
from collections import Counter

from sklearn import metrics
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import  roc_auc_score, log_loss
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA, TruncatedSVD, FastICA, FactorAnalysis
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.cluster import KMeans

from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

# will ignore all warning from sklearn, seaborn etc..
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn

pd.option_context("display.max_rows", 1000);
pd.option_context("display.max_columns", 1000);



In [3]:

    
PATH = os.getcwd()
PATH









    Out[3]:





'D:\\Github\\fastai\\courses\\ml1\\AV_WNS'



In [45]:

    
df_raw = pd.read_csv(f'{PATH}\\train.csv', low_memory=False)
df_test = pd.read_csv(f'{PATH}\\test.csv', low_memory=False)



In [46]:

    
df_raw.shape, df_test.shape









    Out[46]:





((54808, 14), (23490, 13))



In [47]:

    
new_cols = ['employee_id', 'department', 'region', 'education', 'gender',
       'recruitment_channel', 'no_of_trainings', 'age', 'previous_year_rating',
       'length_of_service', 'KPIs_met_more_than_80_percent', 'awards_won_bool',
       'avg_training_score', 'is_promoted']
#re-naming them
df_raw.columns = new_cols
df_test.columns = new_cols[:-1]

cat_cols = list(df_raw.select_dtypes(include=['object']).columns)
num_cols = list(df_raw.select_dtypes(exclude=['object']).columns)



In [48]:

    
# NaN in education could be replaced with mode
mode_education = df_raw.education.value_counts().index.values[0]
df_raw.education = df_raw.education.fillna(mode_education)

# NaN in education could be replaced with mode
mode_education = df_test.education.value_counts().index.values[0]
df_test.education = df_test.education.fillna(mode_education)



In [49]:

    
# converting it to ordinal numbers since education is "ordinal" feature; higher the level, higher is the weight
repl = {"Below Secondary":1, "Bachelor's":2, "Master's & above":3}
df_raw.education = df_raw.education.replace(repl)
df_test.education = df_test.education.replace(repl)



In [50]:

    
df_raw.previous_year_rating = df_raw.previous_year_rating.fillna(0)
df_test.previous_year_rating = df_test.previous_year_rating.fillna(0)



In [51]:

    
###########################kind of binning age at trivial level #####################################

df_raw['is_age_30'] = np.zeros(df_raw.shape[0])
my_query = df_raw.query('age<=30.').index
df_raw.iloc[my_query, -1] = 1
df_raw['is_age_30_39'] = np.zeros(df_raw.shape[0])
my_query = df_raw.query('age>=30 and age<=39.').index
df_raw.iloc[my_query, -1] = 1
df_raw['is_age_39_45'] = np.zeros(df_raw.shape[0])
my_query = df_raw.query('age>=39. & age<=45.').index
df_raw.iloc[my_query, -1] = 1
df_raw['is_age_45'] = np.zeros(df_raw.shape[0])
my_query = df_raw.query('age>=45.').index
df_raw.iloc[my_query, -1] = 1

#######################################################################################################
###################young age (13–30), middle age (31–50) and senior age (51–70)########################
#######################################################################################################

df_raw['age_group'] = np.zeros(df_raw.shape[0])
my_query = df_raw.query('age>=20. & age<=30.').index
df_raw.iloc[my_query, -1] = 'young'
my_query = df_raw.query('age>=31. & age<=50.').index
df_raw.iloc[my_query, -1] = 'middle_aged'
my_query = df_raw.query('age>=51. & age<=60.').index
df_raw.iloc[my_query, -1] = 'senior_aged'
###################################################################################################################
###################################################################################################################
###################################################################################################################

###########################kind of binning age at trivial level #####################################

df_test['is_age_30'] = np.zeros(df_test.shape[0])
my_query = df_test.query('age<=30.').index
df_test.iloc[my_query, -1] = 1
df_test['is_age_30_39'] = np.zeros(df_test.shape[0])
my_query = df_test.query('age>=30 and age<=39.').index
df_test.iloc[my_query, -1] = 1
df_test['is_age_39_45'] = np.zeros(df_test.shape[0])
my_query = df_test.query('age>=39. & age<=45.').index
df_test.iloc[my_query, -1] = 1
df_test['is_age_45'] = np.zeros(df_test.shape[0])
my_query = df_test.query('age>=45.').index
df_test.iloc[my_query, -1] = 1

#######################################################################################################
###################young age (13–30), middle age (31–50) and senior age (51–70)########################
#######################################################################################################

df_test['age_group'] = np.zeros(df_test.shape[0])
my_query = df_test.query('age>=20. & age<=30.').index
df_test.iloc[my_query, -1] = 'young'
my_query = df_test.query('age>=31. & age<=50.').index
df_test.iloc[my_query, -1] = 'middle_aged'
my_query = df_test.query('age>=51. & age<=60.').index
df_test.iloc[my_query, -1] = 'senior_aged';
###############################################################################

feats_added = []

df_raw['joining_age'] = df_raw['age'] - df_raw['length_of_service']
df_test['joining_age'] = df_test['age'] - df_test['length_of_service']
feats_added.append('joining_age')

df_raw['region'].replace('region_', '', True, None, True)
df_test['region'].replace('region_', '', True, None, True)
################################################################################3
bins = [20., 25., 30., 35., 40., 45., 50., 55., 60., 70]
labels = [i+1 for i in range(len(bins) - 1)]
bin_cols = ['age']
for col in bin_cols:    
    df_raw[f'bin_{col}'.format(col)]  = pd.cut(df_raw[col] ,bins,labels = labels)
    df_test[f'bin_{col}'.format(col)] = pd.cut(df_test[col],bins,labels = labels)
feats_added.append('bin_age')

bins = [39., 44., 54., 66., 75., 80., 85., 90., 95.]
labels = [i+1 for i in range(len(bins) - 1)]
bin_cols = ['avg_training_score']
for col in bin_cols:    
    df_raw[f'bin_{col}'.format(col)]  = pd.cut(df_raw[col] ,bins,labels = labels)
    df_test[f'bin_{col}'.format(col)] = pd.cut(df_test[col],bins,labels = labels)

feats_added.append('bin_avg_training_score')
feats_added.append(['age_group', 'is_age_39', 'is_age_39_45', 'is_age_45', 'promotion_chance',\
                   'reg_count','mean_age_per_region','mean_joining_age_per_region','mean_previous_year_rating_per_region',\
                    'mean_avg_training_score_per_region','mean_length_of_service_per_region'])
################################################################################################
df_raw['promotion_chance'] = 'low'
my_query = df_raw.query('avg_training_score>=90').index
df_raw.iloc[my_query, -1] = 'very_high'
my_query = df_raw.query('avg_training_score>=75 and avg_training_score<90').index
df_raw.iloc[my_query, -1] = 'high'
my_query = df_raw.query('avg_training_score>=65 and avg_training_score<75').index
df_raw.iloc[my_query, -1] = 'medium'
my_query = df_raw.query('avg_training_score>=53 and avg_training_score<65').index
df_raw.iloc[my_query, -1] = 'low_medium'

df_test['promotion_chance'] = 'low'
my_query = df_test.query('avg_training_score>=90').index
df_test.iloc[my_query, -1] = 'very_high'
my_query = df_test.query('avg_training_score>=75 and avg_training_score<90').index
df_test.iloc[my_query, -1] = 'high'
my_query = df_test.query('avg_training_score>=65 and avg_training_score<75').index
df_test.iloc[my_query, -1] = 'medium'
my_query = df_test.query('avg_training_score>=53 and avg_training_score<65').index
df_test.iloc[my_query, -1] = 'low_medium'
###############################################################################################
def map_(regs, age):
    d = {}
    for i,j in zip(regs, age):
        d[i] = j
    return d

xyz = df_raw.groupby('region').mean().sort_values(by='region')[['age', 'joining_age', 'previous_year_rating', 'length_of_service', 'avg_training_score']]
count = Counter(df_raw['region'])

regs                   = xyz.reset_index()['region'].values
age                    = xyz.reset_index()['age'].values
joining_age            = xyz.reset_index()['joining_age'].values
previous_year_rating   = xyz.reset_index()['previous_year_rating'].values
length_of_service      = xyz.reset_index()['length_of_service'].values 
avg_training_score     = xyz.reset_index()['avg_training_score'].values

df_raw['reg_count'] = df_raw['region'].map(count)
d = map_(regs, age)
df_raw['mean_age_per_region']   = df_raw['region'].map(d)
d = map_(regs, joining_age)
df_raw['mean_joining_age_per_region']   = df_raw['region'].map(d)
d = map_(regs, previous_year_rating)
df_raw['mean_previous_year_rating_per_region']   = df_raw['region'].map(d)
d = map_(regs, avg_training_score)
df_raw['mean_avg_training_score_per_region']   = df_raw['region'].map(d)
d = map_(regs, length_of_service)
df_raw['mean_length_of_service_per_region']   = df_raw['region'].map(d)

xyz = df_test.groupby('region').mean().sort_values(by='region')[['age', 'joining_age', 'previous_year_rating', 'length_of_service', 'avg_training_score']]
count = Counter(df_test['region'])

regs                   = xyz.reset_index()['region'].values
age                    = xyz.reset_index()['age'].values
joining_age            = xyz.reset_index()['joining_age'].values
previous_year_rating   = xyz.reset_index()['previous_year_rating'].values
length_of_service      = xyz.reset_index()['length_of_service'].values 
avg_training_score     = xyz.reset_index()['avg_training_score'].values

df_test['reg_count'] = df_test['region'].map(count)
d = map_(regs, age)
df_test['mean_age_per_region']   = df_test['region'].map(d)
d = map_(regs, joining_age)
df_test['mean_joining_age_per_region']   = df_test['region'].map(d)
d = map_(regs, previous_year_rating)
df_test['mean_previous_year_rating_per_region']   = df_test['region'].map(d)
d = map_(regs, avg_training_score)
df_test['mean_avg_training_score_per_region']   = df_test['region'].map(d)
d = map_(regs, length_of_service)
df_test['mean_length_of_service_per_region']   = df_test['region'].map(d)
####################################################################################


del d, count, regs, joining_age, previous_year_rating, length_of_service, avg_training_score
gc.collect()









    Out[51]:





4368



In [52]:

    
df_raw.shape, df_test.shape









    Out[52]:





((54808, 29), (23490, 28))



In [54]:

    
train_cats(df_raw);
apply_cats(df_test,df_raw)



In [55]:

    
target = df_raw.is_promoted
df_raw.drop(['is_promoted', 'employee_id'], axis=1, inplace=True)
df_test.drop(['employee_id'], axis=1, inplace=True)



In [56]:

    
cat_cols = list(df_raw.select_dtypes(include=['category']).columns)
num_cols = list(df_raw.select_dtypes(exclude=['category']).columns)
print(cat_cols)
print(num_cols)









    



['department', 'region', 'gender', 'recruitment_channel', 'age_group', 'bin_age', 'bin_avg_training_score', 'promotion_chance']
['education', 'no_of_trainings', 'age', 'previous_year_rating', 'length_of_service', 'KPIs_met_more_than_80_percent', 'awards_won_bool', 'avg_training_score', 'is_age_30', 'is_age_30_39', 'is_age_39_45', 'is_age_45', 'joining_age', 'reg_count', 'mean_age_per_region', 'mean_joining_age_per_region', 'mean_previous_year_rating_per_region', 'mean_avg_training_score_per_region', 'mean_length_of_service_per_region']



In [57]:

    
dummy_train = pd.get_dummies(df_raw[cat_cols], drop_first=True)
dummy_test = pd.get_dummies(df_test[cat_cols], drop_first=True)



In [58]:

    
from sklearn.preprocessing import Normalizer



In [59]:

    
normalizer = Normalizer(copy=False)
df_raw[num_cols] = normalizer.fit_transform(df_raw[num_cols])
df_raw[num_cols].head()









    Out[59]:







  
    
      
      education
      no_of_trainings
      age
      previous_year_rating
      length_of_service
      KPIs_met_more_than_80_percent
      awards_won_bool
      avg_training_score
      is_age_30
      is_age_30_39
      is_age_39_45
      is_age_45
      joining_age
      reg_count
      mean_age_per_region
      mean_joining_age_per_region
      mean_previous_year_rating_per_region
      mean_avg_training_score_per_region
      mean_length_of_service_per_region
    
  
  
    
      0
      0.000619
      0.000206
      0.007225
      0.001032
      0.001651
      0.000206
      0.0
      0.010115
      0.000000
      0.000206
      0.000000
      0.000000
      0.005574
      0.999771
      0.007358
      0.006073
      0.000666
      0.013200
      0.001285
    
    
      1
      0.000311
      0.000156
      0.004666
      0.000778
      0.000622
      0.000000
      0.0
      0.009333
      0.000156
      0.000156
      0.000000
      0.000000
      0.004044
      0.999862
      0.005028
      0.004269
      0.000466
      0.010219
      0.000759
    
    
      2
      0.002273
      0.001136
      0.038638
      0.003409
      0.007955
      0.000000
      0.0
      0.056821
      0.000000
      0.001136
      0.000000
      0.000000
      0.030683
      0.993236
      0.041040
      0.033709
      0.003702
      0.069629
      0.007331
    
    
      3
      0.001695
      0.001695
      0.033058
      0.000848
      0.008476
      0.000000
      0.0
      0.042382
      0.000000
      0.000848
      0.000848
      0.000000
      0.024582
      0.995979
      0.029270
      0.024289
      0.002745
      0.054529
      0.004981
    
    
      4
      0.000884
      0.000442
      0.019882
      0.001325
      0.000884
      0.000000
      0.0
      0.032252
      0.000000
      0.000000
      0.000442
      0.000442
      0.018998
      0.998495
      0.014764
      0.012452
      0.001317
      0.028755
      0.002312



In [60]:

    
df_test[num_cols] = normalizer.fit_transform(df_test[num_cols])
df_test.head()









    Out[60]:







  
    
      
      department
      region
      education
      gender
      recruitment_channel
      no_of_trainings
      age
      previous_year_rating
      length_of_service
      KPIs_met_more_than_80_percent
      ...
      joining_age
      bin_age
      bin_avg_training_score
      promotion_chance
      reg_count
      mean_age_per_region
      mean_joining_age_per_region
      mean_previous_year_rating_per_region
      mean_avg_training_score_per_region
      mean_length_of_service_per_region
    
  
  
    
      0
      Technology
      26
      0.001966
      m
      sourcing
      0.000983
      0.023587
      0.000000
      0.000983
      0.000983
      ...
      0.022604
      1
      5
      high
      0.993591
      0.033154
      0.027869
      0.002953
      0.063747
      0.005285
    
    
      1
      HR
      4
      0.002558
      f
      other
      0.001279
      0.039655
      0.003838
      0.006396
      0.000000
      ...
      0.033259
      3
      2
      low
      0.991375
      0.046475
      0.038277
      0.004324
      0.080408
      0.008198
    
    
      2
      Sales & Marketing
      13
      0.001708
      m
      other
      0.000854
      0.026467
      0.000854
      0.003415
      0.000000
      ...
      0.023052
      3
      2
      low
      0.996359
      0.030435
      0.025019
      0.002710
      0.053111
      0.005416
    
    
      3
      Procurement
      2
      0.000377
      f
      other
      0.000566
      0.005849
      0.000377
      0.001698
      0.000000
      ...
      0.004151
      3
      3
      medium
      0.999781
      0.007312
      0.005969
      0.000591
      0.011929
      0.001343
    
    
      4
      Finance
      29
      0.004678
      m
      sourcing
      0.002339
      0.070175
      0.009357
      0.016374
      0.000000
      ...
      0.053801
      2
      3
      low_medium
      0.968420
      0.073955
      0.062926
      0.006701
      0.154900
      0.011029
    
  

5 rows × 27 columns



In [61]:

    
#################### VIC for Continuous Features ##########################

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

X = add_constant(df_raw[num_cols].iloc[:,1:])
pd.Series([variance_inflation_factor(X.values, i) for i in range(X.shape[1])], index=X.columns)









    



C:\ProgramData\Anaconda3\lib\site-packages\statsmodels\stats\outliers_influence.py:181: RuntimeWarning: divide by zero encountered in double_scalars
  vif = 1. / (1. - r_squared_i)






    Out[61]:





const                                   4.353502e+04
no_of_trainings                         4.104262e+00
age                                              inf
previous_year_rating                    3.494831e+00
length_of_service                                inf
KPIs_met_more_than_80_percent           1.355360e+00
awards_won_bool                         1.028833e+00
avg_training_score                      1.469532e+01
is_age_30                               6.019026e+00
is_age_30_39                            4.614607e+00
is_age_39_45                            2.591466e+00
is_age_45                               3.015141e+00
joining_age                                      inf
reg_count                               1.527225e+01
mean_age_per_region                              inf
mean_joining_age_per_region                      inf
mean_previous_year_rating_per_region    2.418972e+02
mean_avg_training_score_per_region      4.220415e+02
mean_length_of_service_per_region                inf
dtype: float64



In [73]:

    
df_raw.drop(cat_cols, axis=1, inplace=True)
df_test.drop(cat_cols, axis=1, inplace=True)

train_all = pd.concat([df_raw, dummy_train], axis=1)
test_all = pd.concat([df_test, dummy_test], axis=1)



In [74]:

    
train_all.shape, test_all.shape









    Out[74]:





((54808, 84), (23490, 84))



In [77]:

    
# Feature importance using Random Forest

from sklearn.ensemble import RandomForestClassifier

# Create a random forest classifier
clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)

# Train the classifier
clf.fit(train_all, target)

# Print the name and gini importance of each feature
for feature in zip(train_all.columns, clf.feature_importances_*100):
    print(feature)









    



('education', 2.6807222629270711)
('no_of_trainings', 2.7006303484429064)
('age', 6.1499829807096491)
('previous_year_rating', 5.394392762980468)
('length_of_service', 6.1949218991485022)
('KPIs_met_more_than_80_percent', 5.4396756516265707)
('awards_won_bool', 2.8411076407330653)
('avg_training_score', 7.9218787141748788)
('is_age_30', 0.80314493211120952)
('is_age_30_39', 1.3173850817446546)
('is_age_39_45', 0.66173541609123954)
('is_age_45', 0.32295421178422412)
('joining_age', 6.8002603198141554)
('reg_count', 8.1269111662097178)
('mean_age_per_region', 2.6707107147632301)
('mean_joining_age_per_region', 2.598398754985737)
('mean_previous_year_rating_per_region', 1.4140894848999219)
('mean_avg_training_score_per_region', 3.1315744163000452)
('mean_length_of_service_per_region', 1.7221517175781667)
('department_Finance', 0.76025016491243258)
('department_HR', 0.39584257939066331)
('department_Legal', 0.22140642064580474)
('department_Operations', 2.7511116903300152)
('department_Procurement', 1.9530847277556873)
('department_R&D', 0.30695440032463034)
('department_Sales & Marketing', 2.3914798098574774)
('department_Technology', 1.044152574753386)
('region_10', 0.029324450809734833)
('region_11', 0.044024039995606105)
('region_12', 0.01696362991149386)
('region_13', 0.068189296806663607)
('region_14', 0.042432619661090631)
('region_15', 0.053585406378968449)
('region_16', 0.043802177970569608)
('region_17', 0.066667394040104022)
('region_18', 0.00076170627263315522)
('region_19', 0.03505304677552535)
('region_2', 0.047925862120192488)
('region_20', 0.034185694337270742)
('region_21', 0.014669945309561349)
('region_22', 0.083508509150106006)
('region_23', 0.073323077647171067)
('region_24', 0.017471862774138584)
('region_25', 0.059940693759993655)
('region_26', 0.06194857484573954)
('region_27', 0.055039304733388715)
('region_28', 0.066559842892834939)
('region_29', 0.046475912566782192)
('region_3', 0.029976330521906538)
('region_30', 0.042276828765003278)
('region_31', 0.061961380753369849)
('region_32', 0.028286212775545688)
('region_33', 0.0046851269709223091)
('region_34', 0.0075285297280966845)
('region_4', 0.12983056393887471)
('region_5', 0.021782913540163276)
('region_6', 0.024800040476965803)
('region_7', 0.073426308612046187)
('region_8', 0.030528594564657278)
('region_9', 0.0065697528073624277)
('gender_m', 1.7450552471818168)
('recruitment_channel_referred', 0.45489337569108257)
('recruitment_channel_sourcing', 1.8558354365828966)
('age_group_senior_aged', 0.12628156917156128)
('age_group_young', 0.35345242263292354)
('bin_age_2', 0.51402230382992364)
('bin_age_3', 0.60292666733464673)
('bin_age_4', 0.5503923373906815)
('bin_age_5', 0.30129929749629075)
('bin_age_6', 0.18593332047730757)
('bin_age_7', 0.12210517332372907)
('bin_age_8', 0.099733343441113231)
('bin_age_9', 0.0)
('bin_avg_training_score_2', 0.80310061293270651)
('bin_avg_training_score_3', 0.9323665406464885)
('bin_avg_training_score_4', 0.72063018666259482)
('bin_avg_training_score_5', 0.42545261431333703)
('bin_avg_training_score_6', 0.57556359883763619)
('bin_avg_training_score_7', 0.66317662874619066)
('bin_avg_training_score_8', 2.3898678327158827)
('promotion_chance_low', 0.64415463081940461)
('promotion_chance_low_medium', 0.69939111583440994)
('promotion_chance_medium', 0.66130865788351179)
('promotion_chance_very_high', 4.432640608843867)



In [80]:

    
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_all, target, test_size=0.3, random_state=124)



In [81]:

    
# Computing class weights for binary classification
#weights = dict(1 - train_all.is_promoted.value_counts()/len(train_all))
weights = {0.0: 0.08, 1.0: 0.92}



In [104]:

    
from sklearn.linear_model import LogisticRegression

logit_clf = LogisticRegression(penalty='l2', C=5, tol=1e-4, max_iter=500, verbose=1, n_jobs=-1, class_weight=weights)
logit_clf.fit(X_train, y_train)
logit_clf.score(X_test, y_test)

from sklearn.metrics import classification_report, confusion_matrix
y = logit_clf.predict(X_test)
print(pd.Series(y).value_counts())
print(classification_report(y_test, y)) # target_names=['not_promoted', 'promoted']
print(confusion_matrix(y_test, y))









    



[LibLinear]0    12101
1     4342
dtype: int64
             precision    recall  f1-score   support

          0       0.96      0.77      0.85     15101
          1       0.19      0.61      0.29      1342

avg / total       0.89      0.75      0.80     16443

[[11574  3527]
 [  527   815]]



In [105]:

    
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit

cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0)
scores = cross_val_score(logit_clf, X_train, y_train, cv=cv)
scores









    



[LibLinear][LibLinear][LibLinear]





    Out[105]:





array([ 0.74309,  0.74978,  0.74292])



In [106]:

    
pred = logit_clf.predict(test_all)
pred









    Out[106]:





array([0, 0, 0, ..., 0, 0, 1], dtype=int64)



In [107]:

    
pd.Series(pred).value_counts(dropna=False)









    Out[107]:





0    17092
1     6398
dtype: int64



In [111]:

    
from lightgbm import LGBMClassifier
clf = LGBMClassifier(boosting_type='gbdt', num_leaves=80, max_depth=-1, learning_rate=0.01, n_estimators=1000, subsample_for_bin=1000, objective="xentropy", min_split_gain=0.0,\
                        min_child_weight=0.01, min_child_samples=10, subsample=0.9, subsample_freq=1, colsample_bytree=0.7, reg_alpha=0.1, reg_lambda=0.1, random_state=1, n_jobs=4)



In [120]:

    
clf.fit(X_train, y_train)









    Out[120]:





LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.7,
        learning_rate=0.01, max_depth=-1, min_child_samples=10,
        min_child_weight=0.01, min_split_gain=0.0, n_estimators=1000,
        n_jobs=4, num_leaves=80, objective='xentropy', random_state=1,
        reg_alpha=0.1, reg_lambda=0.1, silent=True, subsample=0.9,
        subsample_for_bin=1000, subsample_freq=1)



In [122]:

    
val_preds = clf.predict_proba(X_test)[:,1]
p_test = clf.predict_proba(test_all)[:,1]



In [124]:

    
import numpy as np
from sklearn.metrics import matthews_corrcoef

def find_matthews_threshold(p_valid, y_valid, try_all=False, verbose=False):
    p_valid, y_valid = np.array(p_valid), np.array(y_valid)

    best = 0
    best_score = -2
    totry = np.arange(0,1,0.01) if try_all is False else np.unique(p_valid)
    for t in totry:
        score = matthews_corrcoef(y_valid, p_valid > t)
        if score > best_score:
            best_score = score
            best = t
    if verbose is True: 
        print('Best score: ', round(best_score, 5), ' @ threshold ', best)

    return best

def best_threshold_submission(p_valid, y_valid, p_test, try_all=False, verbose=False):
    p_test = np.array(p_test)
    thresh = find_matthews_threshold(p_valid, y_valid, try_all, verbose)
    return p_test > thresh

submission_values = best_threshold_submission(val_preds, y_test, p_test, True, True)









    



C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\classification.py:538: RuntimeWarning: invalid value encountered in double_scalars
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)






    



Best score:  0.50976  @ threshold  0.511860397358



In [114]:

    
pd.Series(clf.predict(test_all)).value_counts()









    Out[114]:





0    22899
1      591
dtype: int64



In [125]:

    
pd.Series(np.where(clf.predict_proba(test_all)[:,1]>=0.51,1,0)).value_counts()









    Out[125]:





0    22910
1      580
dtype: int64



In [115]:

    
subs = pd.read_csv('sample_submission.csv')



In [128]:

    
subs['is_promoted'] = submission_values*1



In [129]:

    
subs.to_csv('lgb.csv', index=None)

	education	no_of_trainings	age	previous_year_rating	length_of_service	KPIs_met_more_than_80_percent	avg_training_score	is_age_30	is_age_30_39	is_age_39_45	is_age_45	joining_age	reg_count	mean_age_per_region	mean_joining_age_per_region	mean_previous_year_rating_per_region	mean_avg_training_score_per_region	mean_length_of_service_per_region
0	0.000619	0.000206	0.007225	0.001032	0.001651	0.000206	0.010115	0.000000	0.000206	0.000000	0.000000	0.005574	0.999771	0.007358	0.006073	0.000666	0.013200	0.001285
1	0.000311	0.000156	0.004666	0.000778	0.000622	0.000000	0.009333	0.000156	0.000156	0.000000	0.000000	0.004044	0.999862	0.005028	0.004269	0.000466	0.010219	0.000759
2	0.002273	0.001136	0.038638	0.003409	0.007955	0.000000	0.056821	0.000000	0.001136	0.000000	0.000000	0.030683	0.993236	0.041040	0.033709	0.003702	0.069629	0.007331
3	0.001695	0.001695	0.033058	0.000848	0.008476	0.000000	0.042382	0.000000	0.000848	0.000848	0.000000	0.024582	0.995979	0.029270	0.024289	0.002745	0.054529	0.004981
4	0.000884	0.000442	0.019882	0.001325	0.000884	0.000000	0.032252	0.000000	0.000000	0.000442	0.000442	0.018998	0.998495	0.014764	0.012452	0.001317	0.028755	0.002312

	department	region	education	gender	recruitment_channel	no_of_trainings	age	previous_year_rating	length_of_service	KPIs_met_more_than_80_percent	...	joining_age	bin_age	bin_avg_training_score	promotion_chance	reg_count	mean_age_per_region	mean_joining_age_per_region	mean_previous_year_rating_per_region	mean_avg_training_score_per_region	mean_length_of_service_per_region
0	Technology	26	0.001966	m	sourcing	0.000983	0.023587	0.000000	0.000983	0.000983	...	0.022604	1	5	high	0.993591	0.033154	0.027869	0.002953	0.063747	0.005285
1	HR	4	0.002558	f	other	0.001279	0.039655	0.003838	0.006396	0.000000	...	0.033259	3	2	low	0.991375	0.046475	0.038277	0.004324	0.080408	0.008198
2	Sales & Marketing	13	0.001708	m	other	0.000854	0.026467	0.000854	0.003415	0.000000	...	0.023052	3	2	low	0.996359	0.030435	0.025019	0.002710	0.053111	0.005416
3	Procurement	2	0.000377	f	other	0.000566	0.005849	0.000377	0.001698	0.000000	...	0.004151	3	3	medium	0.999781	0.007312	0.005969	0.000591	0.011929	0.001343
4	Finance	29	0.004678	m	sourcing	0.002339	0.070175	0.009357	0.016374	0.000000	...	0.053801	2	3	low_medium	0.968420	0.073955	0.062926	0.006701	0.154900	0.011029