In [1]:
# Load the autoreload extension and enable mode 2 so edited modules are
# re-imported before each cell executes.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
In [2]:
import time
import xgboost as xgb
import lightgbm as lgb
# import category_encoders as cat_ed
import gc, mlcrate, glob
# from gplearn.genetic import SymbolicTransformer, SymbolicClassifier
from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, ExtraTreesRegressor
from IPython.display import display
from catboost import CatBoostClassifier
from scipy.cluster import hierarchy as hc
from collections import Counter
from sklearn import metrics
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA, TruncatedSVD, FastICA, FactorAnalysis
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
# Silence every warning routed through ``warnings.warn`` (sklearn, seaborn, ...)
# by replacing that function with a no-op.
def ignore_warn(*args, **kwargs):
    """Accept any arguments and do nothing; stands in for ``warnings.warn``."""
    pass
warnings.warn = ignore_warn
# ``pd.option_context`` only builds a context manager and changes nothing unless
# it is entered with ``with``; ``pd.set_option`` applies the limits for the
# whole session, which is what the notebook needs.
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 1000)
In [3]:
# Directory the kernel was started in; the train/test CSV files are read from it.
PATH = os.getcwd()
PATH
Out[3]:
In [45]:
# Build the file paths with os.path.join so the notebook also runs where the
# path separator is not "\\" (the hard-coded backslash only resolves on Windows).
df_raw = pd.read_csv(os.path.join(PATH, 'train.csv'), low_memory=False)
df_test = pd.read_csv(os.path.join(PATH, 'test.csv'), low_memory=False)
In [46]:
# (rows, columns) of the training and test frames right after loading.
df_raw.shape, df_test.shape
Out[46]:
In [47]:
# Canonical column names, applied positionally. The final entry is the target
# column, which only the training frame carries.
new_cols = [
    'employee_id',
    'department',
    'region',
    'education',
    'gender',
    'recruitment_channel',
    'no_of_trainings',
    'age',
    'previous_year_rating',
    'length_of_service',
    'KPIs_met_more_than_80_percent',
    'awards_won_bool',
    'avg_training_score',
    'is_promoted',
]
# Rename both frames; the test frame gets every name except the last one.
df_raw.columns = new_cols
df_test.columns = new_cols[:len(new_cols) - 1]
# Split the training columns by dtype: object columns vs. everything else.
cat_cols = df_raw.select_dtypes(include=['object']).columns.tolist()
num_cols = df_raw.select_dtypes(exclude=['object']).columns.tolist()
In [48]:
# Missing education is replaced with the most frequent level of the TRAINING
# frame. The same value fills the test frame so both frames are imputed
# consistently and no statistic is estimated from the test data.
mode_education = df_raw.education.value_counts().index[0]
df_raw.education = df_raw.education.fillna(mode_education)
df_test.education = df_test.education.fillna(mode_education)
In [49]:
# Education is an ordinal feature: the higher the level, the larger the integer
# it is encoded as. Values not listed in the mapping are left unchanged.
repl = dict(zip(["Below Secondary", "Bachelor's", "Master's & above"], (1, 2, 3)))
for frame in (df_raw, df_test):
    frame['education'] = frame['education'].replace(repl)
In [50]:
# A missing previous-year rating is encoded as 0 in both frames.
for frame in (df_raw, df_test):
    frame['previous_year_rating'] = frame['previous_year_rating'].fillna(0)
In [51]:
########################### age indicator flags and coarse age groups ###########################
def add_age_features(frame):
    """Add ``is_age_30``, ``is_age_30_39``, ``is_age_39_45``, ``is_age_45`` and
    ``age_group`` to ``frame`` in place.

    Rows are selected with boolean masks and columns are addressed by name.
    The former ``frame.iloc[query_index, -1] = ...`` pattern fed index *labels*
    to a positional indexer and always wrote to whichever column was last, so
    it was only correct for a default 0..n-1 index and overwrote an unrelated
    column when the cell was executed a second time.
    """
    age = frame['age']
    # 0./1. flags. Every bound is inclusive, so the flags overlap at ages 30, 39
    # and 45 exactly as the original queries did.
    frame['is_age_30'] = (age <= 30).astype(float)
    frame['is_age_30_39'] = age.between(30, 39).astype(float)
    frame['is_age_39_45'] = age.between(39, 45).astype(float)
    frame['is_age_45'] = (age >= 45).astype(float)
    # young (20-30), middle_aged (31-50), senior_aged (51-60). Ages outside
    # 20-60 keep the 0.0 placeholder the column starts with.
    groups = np.full(len(frame), 0.0, dtype=object)
    groups[age.between(20, 30).values] = 'young'
    groups[age.between(31, 50).values] = 'middle_aged'
    groups[age.between(51, 60).values] = 'senior_aged'
    frame['age_group'] = groups

# Same transformation for the training and the test frame.
add_age_features(df_raw)
add_age_features(df_test)
###############################################################################
# Names of the engineered columns, kept as one flat list of strings.
feats_added = []
for frame in (df_raw, df_test):
    # Age at which the employee joined.
    frame['joining_age'] = frame['age'] - frame['length_of_service']
    # Strip the 'region_' prefix and assign the result back explicitly. The
    # former in-place call on the ``frame['region']`` selection passed
    # inplace/limit/regex positionally (not accepted by recent pandas) and is
    # not guaranteed to write through to the frame.
    frame['region'] = frame['region'].replace('region_', '', regex=True)
feats_added.append('joining_age')
################################################################################
# Bin edges per column; labels are 1..(number of bins).
# include_lowest=True keeps rows equal to the first edge (e.g. age == 20) in
# bin 1; with pd.cut's default they match no interval and become NaN.
# NOTE(review): values above the last edge still become NaN - confirm the edges
# cover the observed range of both columns.
bin_spec = [
    ('age', [20., 25., 30., 35., 40., 45., 50., 55., 60., 70]),
    ('avg_training_score', [39., 44., 54., 66., 75., 80., 85., 90., 95.]),
]
for col, bins in bin_spec:
    labels = [i + 1 for i in range(len(bins) - 1)]
    for frame in (df_raw, df_test):
        frame[f'bin_{col}'] = pd.cut(frame[col], bins, labels=labels, include_lowest=True)
    feats_added.append(f'bin_{col}')
# extend() keeps the list flat (append() nested a list inside it). The entry
# 'is_age_39' named no existing column and is replaced by the two flags that
# are actually created: 'is_age_30' and 'is_age_30_39'.
feats_added.extend(['age_group', 'is_age_30', 'is_age_30_39', 'is_age_39_45', 'is_age_45', 'promotion_chance',
                    'reg_count', 'mean_age_per_region', 'mean_joining_age_per_region',
                    'mean_previous_year_rating_per_region', 'mean_avg_training_score_per_region',
                    'mean_length_of_service_per_region'])
################################################################################################
def add_promotion_chance(frame):
    """Add ``promotion_chance`` to ``frame`` in place, derived from ``avg_training_score``.

    score >= 90 -> 'very_high', [75, 90) -> 'high', [65, 75) -> 'medium',
    [53, 65) -> 'low_medium', anything else (including missing) -> 'low'.
    The column is written by name from boolean conditions instead of
    ``frame.iloc[query_index, -1]``, which used index labels as positions and
    depended on the new column being the last one.
    """
    score = frame['avg_training_score']
    conditions = [
        score >= 90,
        (score >= 75) & (score < 90),
        (score >= 65) & (score < 75),
        (score >= 53) & (score < 65),
    ]
    choices = ['very_high', 'high', 'medium', 'low_medium']
    frame['promotion_chance'] = np.select(conditions, choices, default='low')

add_promotion_chance(df_raw)
add_promotion_chance(df_test)
###############################################################################################
def map_(regs, age):
    """Return a dict pairing each element of ``regs`` with the element of ``age`` at the same position."""
    return dict(zip(regs, age))

def add_region_stats(frame):
    """Add ``reg_count`` and the ``mean_<col>_per_region`` columns to ``frame`` in place.

    All statistics are computed from ``frame`` itself, grouped by ``region``.
    """
    # Number of rows sharing each region value.
    frame['reg_count'] = frame['region'].map(Counter(frame['region']))
    # Order fixes the order in which the new columns are appended.
    mean_cols = ['age', 'joining_age', 'previous_year_rating', 'avg_training_score', 'length_of_service']
    # Select the numeric columns BEFORE averaging: mean() over every column of
    # the frame also hits the string columns, which pandas >= 2.0 rejects with
    # a TypeError instead of silently skipping them.
    region_means = frame.groupby('region')[mean_cols].mean()
    regs = region_means.index.values
    for col in mean_cols:
        frame[f'mean_{col}_per_region'] = frame['region'].map(map_(regs, region_means[col].values))

# NOTE(review): each frame gets statistics computed from its own rows, as
# before; confirm whether the test frame should reuse the training statistics.
for frame in (df_raw, df_test):
    add_region_stats(frame)
####################################################################################
gc.collect()
Out[51]:
In [52]:
# (rows, columns) of both frames after the engineered columns were added.
df_raw.shape, df_test.shape
Out[52]:
In [54]:
# NOTE(review): train_cats/apply_cats come from fastai.structured; presumably
# the first converts string columns of df_raw to pandas categoricals and the
# second applies the same categories to df_test - confirm against fastai docs.
train_cats(df_raw);
apply_cats(df_test,df_raw)
In [55]:
# Take the label out of the training frame, then discard the identifier
# column from both frames (all in place).
target = df_raw.pop('is_promoted')
for frame in (df_raw, df_test):
    frame.drop(['employee_id'], axis=1, inplace=True)
In [56]:
# Recompute the column lists now that categoricals exist: 'category' dtype
# columns vs. all remaining columns of the training frame.
cat_cols = df_raw.select_dtypes(include=['category']).columns.tolist()
num_cols = df_raw.select_dtypes(exclude=['category']).columns.tolist()
print(cat_cols, num_cols, sep='\n')
In [57]:
# One-hot encode the categorical columns of each frame, dropping the first
# level of every column.
dummy_train, dummy_test = [
    pd.get_dummies(frame[cat_cols], drop_first=True)
    for frame in (df_raw, df_test)
]
In [58]:
from sklearn.preprocessing import Normalizer
In [59]:
# Fit/apply the sklearn Normalizer on the non-categorical training columns and
# write the result back into the frame, then preview it.
normalizer = Normalizer(copy=False)
train_numeric_scaled = normalizer.fit_transform(df_raw[num_cols])
df_raw[num_cols] = train_numeric_scaled
df_raw[num_cols].head()
Out[59]:
In [60]:
# Apply the normalizer fitted on the training frame with transform();
# preprocessing steps must not be re-fitted on the test frame.
df_test[num_cols] = normalizer.transform(df_test[num_cols])
df_test.head()
Out[60]:
In [61]:
#################### VIF (variance inflation factor) for Continuous Features ##########################
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
# NOTE(review): .iloc[:,1:] drops the FIRST column of num_cols before the
# constant is added - confirm that excluding that column is intended.
X = add_constant(df_raw[num_cols].iloc[:,1:])
# One VIF value per column of X (including the added constant), indexed by column name.
pd.Series([variance_inflation_factor(X.values, i) for i in range(X.shape[1])], index=X.columns)
Out[61]:
In [73]:
# Replace the categorical columns with their one-hot encodings: drop them in
# place, then append the dummy columns side by side.
for frame in (df_raw, df_test):
    frame.drop(cat_cols, axis=1, inplace=True)
train_all = pd.concat([df_raw, dummy_train], axis='columns')
test_all = pd.concat([df_test, dummy_test], axis='columns')
In [74]:
# (rows, columns) of the final model matrices.
train_all.shape, test_all.shape
Out[74]:
In [77]:
# Feature importance from a random forest fitted on every training row.
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
clf.fit(train_all, target)
# Print one (column name, importance * 100) tuple per feature, in column order.
importances_pct = clf.feature_importances_*100
for position, column in enumerate(train_all.columns):
    print((column, importances_pct[position]))
In [80]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the labelled rows; the fixed random_state makes the split repeatable.
split_parts = train_test_split(train_all, target, test_size=0.3, random_state=124)
X_train, X_test, y_train, y_test = split_parts
In [81]:
# Class weights for the binary target, keyed by class label; passed as
# class_weight to LogisticRegression in a later cell.
# The commented-out line is an earlier attempt to derive them from the class
# frequencies (is_promoted is no longer a column of train_all at this point).
#weights = dict(1 - train_all.is_promoted.value_counts()/len(train_all))
weights = {0.0: 0.08, 1.0: 0.92}
In [104]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
# L2-regularised logistic regression using the class weights defined above.
logit_clf = LogisticRegression(penalty='l2', C=5, tol=1e-4, max_iter=500, verbose=1, n_jobs=-1, class_weight=weights)
logit_clf.fit(X_train, y_train)
# A bare expression in the middle of a cell is evaluated and discarded, so the
# hold-out accuracy is printed explicitly to make it visible.
print('hold-out accuracy:', logit_clf.score(X_test, y_test))
# Hard predictions on the hold-out split and the usual diagnostics.
y = logit_clf.predict(X_test)
print(pd.Series(y).value_counts())
print(classification_report(y_test, y)) # target_names=['not_promoted', 'promoted']
print(confusion_matrix(y_test, y))
In [105]:
from sklearn.model_selection import ShuffleSplit, cross_val_score
# Three random 70/30 re-splits of the training portion, scored with the
# estimator's default scorer.
cv = ShuffleSplit(n_splits=3, test_size=0.3, random_state=0)
scores = cross_val_score(estimator=logit_clf, X=X_train, y=y_train, cv=cv)
scores
Out[105]:
In [106]:
# Hard class predictions of the logistic model for the unlabeled test matrix.
pred = logit_clf.predict(test_all)
pred
Out[106]:
In [107]:
# How many test rows fall into each predicted class (NaN counted as well).
pd.Series(pred).value_counts(dropna=False)
Out[107]:
In [111]:
from lightgbm import LGBMClassifier
# Hyper-parameters collected in one mapping so they can be read and tuned in one place.
lgbm_params = dict(
    boosting_type='gbdt',
    num_leaves=80,
    max_depth=-1,
    learning_rate=0.01,
    n_estimators=1000,
    subsample_for_bin=1000,
    objective="xentropy",
    min_split_gain=0.0,
    min_child_weight=0.01,
    min_child_samples=10,
    subsample=0.9,
    subsample_freq=1,
    colsample_bytree=0.7,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=1,
    n_jobs=4,
)
clf = LGBMClassifier(**lgbm_params)
In [120]:
# Fit the LightGBM classifier on the training split only (the hold-out split stays unseen).
clf.fit(X_train, y_train)
Out[120]:
In [122]:
# Second column of predict_proba for the hold-out split and for the test matrix
# (presumably the probability of the positive class - confirm via clf.classes_).
val_preds = clf.predict_proba(X_test)[:,1]
p_test = clf.predict_proba(test_all)[:,1]
In [124]:
import numpy as np
from sklearn.metrics import matthews_corrcoef
def find_matthews_threshold(p_valid, y_valid, try_all=False, verbose=False):
    """Return the cut-off on ``p_valid`` that maximises the Matthews correlation.

    Candidates are 0.00, 0.01, ..., 0.99 when ``try_all is False``; otherwise
    every distinct value of ``p_valid``. A sample counts as positive when its
    score is strictly greater than the candidate. The first candidate reaching
    the best score wins; 0 is returned when there are no candidates.
    When ``verbose is True`` the best score and cut-off are printed.
    """
    scores = np.array(p_valid)
    truth = np.array(y_valid)
    if try_all is False:
        candidates = np.arange(0,1,0.01)
    else:
        candidates = np.unique(scores)
    # -2 lies below the minimum possible coefficient, so the first candidate always replaces it.
    best, best_score = 0, -2
    for cutoff in candidates:
        current = matthews_corrcoef(truth, scores > cutoff)
        if current > best_score:
            best_score, best = current, cutoff
    if verbose is True:
        print('Best score: ', round(best_score, 5), ' @ threshold ', best)
    return best
def best_threshold_submission(p_valid, y_valid, p_test, try_all=False, verbose=False):
    """Threshold ``p_test`` at the cut-off found by ``find_matthews_threshold``.

    The cut-off is searched on the validation scores/labels; the result is a
    boolean array that is True where the test score is strictly greater than it.
    """
    cutoff = find_matthews_threshold(p_valid, y_valid, try_all, verbose)
    test_scores = np.array(p_test)
    return test_scores > cutoff
# Boolean test predictions at the MCC-optimal cut-off, searched over every
# distinct hold-out probability (try_all=True) with the result printed (verbose=True).
submission_values = best_threshold_submission(val_preds, y_test, p_test, True, True)
In [114]:
# Class counts of the classifier's own hard predictions on the test matrix.
pd.Series(clf.predict(test_all)).value_counts()
Out[114]:
In [125]:
# Class counts when the second predict_proba column is cut manually at >= 0.51, for comparison.
pd.Series(np.where(clf.predict_proba(test_all)[:,1]>=0.51,1,0)).value_counts()
Out[125]:
In [115]:
# Submission template, read relative to the current working directory.
subs = pd.read_csv('sample_submission.csv')
In [128]:
# Multiplying the boolean predictions by 1 turns them into 0/1 integers.
subs['is_promoted'] = submission_values*1
In [129]:
# Write the submission file without the row index.
subs.to_csv('lgb.csv', index=False)