In [1]:
# Load the autoreload extension and set it to mode 2, so edited modules are
# re-imported automatically before each cell executes.
%load_ext autoreload
%autoreload 2
# Draw matplotlib figures inline in the notebook output.
%matplotlib inline
In [2]:
import time
import xgboost as xgb
import lightgbm as lgb
# import category_encoders as cat_ed
import gc, mlcrate, glob
# from gplearn.genetic import SymbolicTransformer, SymbolicClassifier
from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, ExtraTreesRegressor
from IPython.display import display
from catboost import CatBoostClassifier
from scipy.cluster import hierarchy as hc
from collections import Counter
from sklearn import metrics
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA, TruncatedSVD, FastICA, FactorAnalysis
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, log_loss
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
# Silence every warning issued through `warnings.warn` (sklearn, seaborn, ...)
# by replacing that function with a no-op.
def ignore_warn(*args, **kwargs):
    """Accept and discard any arguments; installed below as a no-op `warnings.warn`."""
    pass


warnings.warn = ignore_warn

# `pd.option_context(...)` only builds a context manager; unless it is entered
# with `with`, no option is changed. `pd.set_option` applies the display limits
# for the rest of the session.
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 1000)
In [3]:
# Current working directory of the kernel; the CSV paths used by later cells
# are built from it.
PATH = os.getcwd()
PATH
Out[3]:
In [4]:
# Load the train/test CSVs from PATH. os.path.join keeps the paths portable
# (a hand-written '\\' separator only resolves on Windows).
df_raw = pd.read_csv(os.path.join(PATH, 'train.csv'), low_memory=False)
df_test = pd.read_csv(os.path.join(PATH, 'test.csv'), low_memory=False)

for frame in (df_raw, df_test):
    # Missing ratings become 0. The result is assigned back instead of calling
    # an in-place method on a column selection, which is not guaranteed to
    # write through to the frame under pandas copy-on-write.
    frame['previous_year_rating'] = frame['previous_year_rating'].fillna(0)
    # Remove every "'s" occurrence from the education labels.
    frame['education'] = frame['education'].replace('\'s', '', regex=True)
    # The identifier column is dropped before any feature work.
    frame.drop('employee_id', axis=1, inplace=True)
In [5]:
# Per-column cardinality report: print the column name and its number of
# distinct values; for columns with at most 8 distinct values also print the
# sorted (value, count) pairs. Each column is followed by a 120-dash rule.
for column in df_raw.columns:
    n_unique = df_raw[column].nunique()
    print(column)
    if n_unique > 8:
        print(n_unique)
    else:
        print(n_unique, sorted(df_raw[column].value_counts().to_dict().items()))
    print(120 * '-')
In [6]:
# (rows, columns) of the train and test frames after loading and cleaning.
df_raw.shape, df_test.shape
Out[6]:
In [7]:
# Replacement column names. They are assigned positionally, so the list must
# follow the current column order of df_raw.
new_cols = [
    'department', 'region', 'education', 'gender',
    'recruitment_channel', 'no_of_trainings', 'age', 'previous_year_rating',
    'length_of_service', 'KPIs_met_more_than_80_percent', 'awards_won_bool',
    'avg_training_score', 'is_promoted',
]

# Rename both frames; df_test receives every name except the final 'is_promoted'.
df_raw.columns = new_cols
df_test.columns = new_cols[:-1]

# Split df_raw's columns by dtype: object columns vs. everything else.
cat_cols = df_raw.select_dtypes(include=['object']).columns.tolist()
num_cols = df_raw.select_dtypes(exclude=['object']).columns.tolist()
In [8]:
# Independent copies of the object-dtype columns. A later cell adds new columns
# to these frames; doing that on a plain column selection raises
# SettingWithCopyWarning and may not write where intended.
df_raw_cat, df_test_cat = df_raw[cat_cols].copy(), df_test[cat_cols].copy()
In [9]:
# Number of columns per dtype. DataFrame.get_ftype_counts() was deprecated in
# pandas 0.23 and removed in 1.0; dtypes.value_counts() works on old and new versions.
df_raw.dtypes.value_counts()
Out[9]:
In [10]:
def _add_row_features(df):
    """Add the binned, flag and derived columns to `df` in place.

    Columns are appended in this order: bin_length_of_service, bin_age,
    bin_avg_training_score, promotion_chance, new_employee, joining_age,
    main_training_received_for_promo, major_promo_region, trainings_less_2.
    The 'region_' prefix is also removed from the existing `region` column.

    Rows are selected with boolean masks through `.loc`, so the result does not
    depend on the frame having a default RangeIndex or on the new column being
    the last one (both were required by the `.iloc[index_labels, -1]` form).
    """
    # (column, bin edges); each bin gets an integer label 1..k.
    binning = [
        ('length_of_service', [0., 5., 10., 15., 20., 25., 30., 35., 40]),
        ('age', [10, 20., 25., 30., 35., 40., 45., 50., 55., 60., 70]),
        ('avg_training_score', [30, 39., 44., 54., 66., 75., 80., 85., 90., 100]),
    ]
    for col, edges in binning:
        df[f'bin_{col}'] = pd.cut(df[col], edges, labels=list(range(1, len(edges))))

    # Coarse label derived from avg_training_score; scores below 53 stay 'low'.
    score = df['avg_training_score']
    df['promotion_chance'] = 'low'
    df.loc[score >= 90, 'promotion_chance'] = 'very_high'
    df.loc[(score >= 75) & (score < 90), 'promotion_chance'] = 'high'
    df.loc[(score >= 65) & (score < 75), 'promotion_chance'] = 'medium'
    df.loc[(score >= 53) & (score < 65), 'promotion_chance'] = 'low_medium'

    # 1.0 where previous_year_rating is 0 (the value missing ratings were filled with).
    df['new_employee'] = (df['previous_year_rating'] == 0).astype(float)
    df['joining_age'] = df['age'] - df['length_of_service']
    df['main_training_received_for_promo'] = (df['no_of_trainings'] <= 2).astype(float)

    # Keyword arguments: positional inplace/limit/regex are rejected by pandas >= 2.
    df['region'] = df['region'].replace('region_', '', regex=True)
    df['major_promo_region'] = df['region'].isin(['2', '22', '7', '4', '13', '15']).astype(float)

    # NOTE(review): same condition as main_training_received_for_promo, so the two
    # columns are identical; both are kept so downstream column sets are unchanged.
    df['trainings_less_2'] = (df['no_of_trainings'] <= 2).astype(float)


for frame in (df_raw, df_test):
    _add_row_features(frame)
def map_(regs, age):
    """Return a dict pairing each item of `regs` with the item of `age` at the same position.

    Pairing stops at the shorter of the two inputs; if a key occurs more than
    once in `regs`, the value from its last occurrence is kept.
    """
    return dict(zip(regs, age))
def _add_region_stats(df):
    """Add per-region aggregate columns to `df` in place.

    Adds `reg_count` (number of rows sharing the row's region) followed by
    `mean_<col>_per_region` for age, joining_age, previous_year_rating,
    avg_training_score and length_of_service, in that order. All statistics
    are computed from `df` itself.
    """
    stat_cols = ['age', 'joining_age', 'previous_year_rating',
                 'avg_training_score', 'length_of_service']
    # Select the numeric columns before aggregating: a bare groupby().mean()
    # also tries to average string/category columns, which raises on pandas >= 2.
    region_means = df.groupby('region')[stat_cols].mean()

    df['reg_count'] = df['region'].map(df['region'].value_counts())
    for col in stat_cols:
        df[f'mean_{col}_per_region'] = df['region'].map(region_means[col])


# NOTE(review): each frame is encoded with its own statistics, so the same region
# can get different values in train and test — confirm this is intended.
for frame in (df_raw, df_test):
    _add_region_stats(frame)

gc.collect()
Out[10]:
In [11]:
# (rows, columns) of both frames after the feature-engineering cell.
df_raw.shape, df_test.shape
Out[11]:
In [12]:
# Remove the original object-dtype columns from both frames in place.
for frame in (df_raw, df_test):
    frame.drop(columns=cat_cols, inplace=True)
In [13]:
# Two combined text features per categorical frame:
#   dept_rec - department joined with recruitment channel
#   gen_edu  - gender joined with education
for cat_frame in (df_raw_cat, df_test_cat):
    cat_frame['dept_rec'] = 'in ' + cat_frame['department'] + ' via ' + cat_frame['recruitment_channel']
    cat_frame['gen_edu'] = cat_frame['gender'] + ' has ' + cat_frame['education']
In [14]:
# Number of training rows; used later to cut the stacked train+test matrix apart.
idx_split = len(df_raw_cat)
df_raw_cat.head(1)
Out[14]:
In [15]:
# Label series. This only reads the column; 'is_promoted' is still part of df_raw.
target = df_raw['is_promoted']
In [16]:
# Stack the train rows on top of the test rows with a fresh 0..n-1 index.
full_df = pd.concat([df_raw_cat, df_test_cat], ignore_index=True)
In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer

# Every row of the stacked categorical frame becomes one space-joined string.
full_sites = full_df.astype('str')
# .values replaces DataFrame.as_matrix(), which was deprecated in pandas 0.23
# and removed in 1.0.
lst = full_sites.values.tolist()
flat_list = [' '.join(sublist) for sublist in lst]

# Character 1- and 2-grams within word boundaries, capped at 500 features.
vect = TfidfVectorizer(ngram_range=(1,2), max_features=500, analyzer='char_wb')
tfidf_matrix = vect.fit_transform(flat_list)

# The first idx_split rows are the training rows, the remainder the test rows.
X_train_tf = tfidf_matrix[:idx_split]
X_test_tf = tfidf_matrix[idx_split:]
X_train_tf.shape, X_test_tf.shape
Out[17]:
In [18]:
%%time
# Apply SVD, I chose 120 components. 120-200 components are good enough for SVM model.
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=120)
svd.fit(X_train_tf)
X_train_svd = svd.transform(X_train_tf)
X_test_svd = svd.transform(X_test_tf)
# Scale the data obtained from SVD. Renaming variable to reuse without scaling.
scl = preprocessing.StandardScaler()
scl.fit(X_train_svd)
X_train_svd_scl = scl.transform(X_train_svd)
X_test_svd_scl = scl.transform(X_test_svd)
In [36]:
# fastai.structured helpers (imported via `from fastai.structured import *`).
# Presumably train_cats converts df_raw's string columns to pandas categoricals
# and apply_cats gives df_test the same category coding — verify against the
# installed fastai version.
train_cats(df_raw);
apply_cats(df_test, df_raw);
In [23]:
# Recompute the column lists for the current df_raw: category/object columns
# versus all remaining columns.
cat_cols = df_raw.select_dtypes(include=['category', 'object']).columns.tolist()
num_cols = df_raw.select_dtypes(exclude=['object', 'category']).columns.tolist()
In [28]:
# One-hot encode the saved categorical frames together with the category/object
# columns of df_raw / df_test (first level of each column dropped).
train_dummies = pd.get_dummies(pd.concat((df_raw_cat, df_raw[cat_cols]), axis=1), drop_first=True)
test_dummies = pd.get_dummies(pd.concat((df_test_cat, df_test[cat_cols]), axis=1), drop_first=True)

# Encoding the frames separately can produce different column sets or orders
# (a level present in only one of them). Align test to the training columns so
# both matrices describe the same features; levels unseen in test become 0.
test_dummies = test_dummies.reindex(columns=train_dummies.columns, fill_value=0)

df_raw.shape, df_test.shape, train_dummies.shape, test_dummies.shape
Out[28]:
In [29]:
# df_raw.drop(cat_cols, axis=1, inplace=True)
# df_test.drop(cat_cols, axis=1, inplace=True)
In [36]:
# Shapes of the three training-side feature sources (TF-IDF, df_raw, dummies).
X_train_tf.shape, df_raw.shape, train_dummies.shape
Out[36]:
In [40]:
# Shapes of the three test-side feature sources (TF-IDF, df_test, dummies).
X_test_tf.shape, df_test.shape, test_dummies.shape
Out[40]:
In [53]:
# Horizontally stack frame values, dummies and TF-IDF features.
# 'is_promoted' is the label (held in `target`); leaving it in df_raw would put
# the answer into the training features and give train one column more than test.
# errors='ignore' keeps the cell working if the column was already removed.
train_features = df_raw.drop(columns=['is_promoted'], errors='ignore')
# .toarray() yields a plain ndarray; .todense() returns np.matrix, which would
# turn the stacked result into a matrix as well.
train_all = np.hstack([train_features.values, train_dummies.values, X_train_tf.toarray()])
test_all = np.hstack([df_test.values, test_dummies.values, X_test_tf.toarray()])
train_all.shape, test_all.shape
Out[53]:
In [65]:
# Column-wise join of the engineered frames with their dummy columns.
# The label 'is_promoted' is removed from the training features so it cannot
# leak into the model; errors='ignore' tolerates it having been dropped already.
train_all = pd.concat([df_raw.drop(columns=['is_promoted'], errors='ignore'), train_dummies], axis=1)
test_all = pd.concat([df_test, test_dummies], axis=1)
In [102]:
# Persist the matrices. np.save returns None, so its result must not be
# assigned back to the variables (that replaced all three with None).
np.save('train_all_better.npy', train_all)
np.save('target.npy', target)
np.save('test_all_better.npy', test_all)
In [4]:
# Reload the three arrays written by the save cell.
train_all = np.load('train_all_better.npy')
target = np.load('target.npy')
test_all = np.load('test_all_better.npy')
In [5]:
def make_submission(probs):
    """Return the sample-submission frame with its 'is_promoted' column set to `probs`.

    Reads `sample_submission.csv` from the module-level `PATH` on every call.
    `probs` is assigned as-is, so it must be a scalar or match the number of
    rows in the sample file.
    """
    # os.path.join instead of a literal '\\' so the path also resolves off Windows.
    submit = pd.read_csv(os.path.join(PATH, 'sample_submission.csv'))
    submit['is_promoted'] = probs
    return submit
In [6]:
# (number of non-zero labels, number of zero labels). The second value is
# derived from the data rather than from a hard-coded positive count.
n_positive = int(np.count_nonzero(target))
n_positive, target.shape[0] - n_positive
Out[6]:
In [7]:
# Stratified 80/20 hold-out split. random_state pins the shuffle so validation
# scores are comparable between runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_all, target, test_size = .2, stratify = target, random_state = 42)
In [8]:
%%time
from sklearn.metrics import accuracy_score, roc_curve, confusion_matrix, auc, f1_score
clf_lr = LogisticRegression (random_state = 42,n_jobs=-1,max_iter=1000)
clf_lr.fit (X_train, y_train)
preds_lr = clf_lr.predict_proba (X_valid) [:, 1]
print ('Train test split LogisticRegression score:% s ROC AUC'% round (roc_auc_score (y_valid, preds_lr), 4))
print('F1 Score', f1_score(y_valid, (preds_lr>.61)*1))
print('F1 Score', f1_score(y_valid, (preds_lr>.7)*1))
In [22]:
def show_confusion_matrix(y_true, y_pred, title='Confusion matrix'):
    """Draw the 2x2 confusion matrix of `y_true` vs `y_pred` as a matplotlib table.

    The figure is 4x3 inches with hidden axes and `title` as its suptitle.
    Rows are labelled 'True 0' / 'True 1', columns 'Predicted 0' / 'Predicted 1'.
    Nothing is returned.
    """
    counts = confusion_matrix(y_true, y_pred)

    fig, ax = plt.subplots(frameon=False)
    fig.set_size_inches(4, 3)
    fig.suptitle(title, fontsize=20)

    # Only the table should be visible, so the axes are switched off.
    ax.axis('off')
    ax.xaxis.set_visible(False)
    ax.yaxis.set_visible(False)

    row_names = ['True 0', 'True 1']
    col_names = ['Predicted 0', 'Predicted 1']
    rendered = ax.table(
        cellText=counts,
        colWidths=[0.5] * 2,
        rowLabels=row_names,
        colLabels=col_names,
        cellLoc='center',
        rowLoc='center',
        loc="center",
    )
    rendered.set_fontsize(34)
    rendered.scale(1, 4)
In [23]:
# Confusion matrix of the baseline model's class predictions on the hold-out split.
show_confusion_matrix(y_valid, clf_lr.predict( X_valid ))
In [27]:
def get_auc_lr_valid(X, y, C=1.0, seed=17, ratio = 0.8):
    """Fit a class-balanced logistic regression on the leading rows and score the rest.

    Despite its name, the returned score is the F1 of the hard class
    predictions, not ROC AUC.

    Parameters
    ----------
    X : 2-D array supporting `X[a:b, :]` slicing.
    y : labels aligned with the rows of `X`, supporting positional slicing.
    C : inverse regularisation strength passed to LogisticRegression.
    seed : random_state passed to LogisticRegression.
    ratio : fraction of rows (taken from the top, no shuffling) used for fitting.

    Returns
    -------
    F1 score on the trailing `1 - ratio` share of the rows.
    """
    split_at = int(round(X.shape[0] * ratio))
    X_fit, y_fit = X[:split_at, :], y[:split_at]
    X_hold, y_hold = X[split_at:, :], y[split_at:]

    model = LogisticRegression(C=C, random_state=seed, max_iter=8000, n_jobs=-1, class_weight='balanced')
    model.fit(X_fit, y_fit)

    return f1_score(y_hold, model.predict(X_hold))
In [28]:
%%time
from tqdm import tqdm_notebook
# List of possible C-values
Cs = np.logspace(-1, 3, 20)
scores = []
for C in tqdm_notebook(Cs):
scores.append(get_auc_lr_valid(X_train, y_train, C=C))
In [29]:
# Best sweep score and the C value that produced it.
max(scores), Cs[np.argmax(scores)]
Out[29]:
In [30]:
# Refit on the whole training split with the best C from the sweep.
# class_weight='balanced' matches the setting get_auc_lr_valid used while
# scoring each C; without it the selected C was tuned for a different model.
best_C = Cs[np.argmax(scores)]
final_model = LogisticRegression(random_state = 17, max_iter=8000, C = best_C,
                                 n_jobs=-1, class_weight='balanced')
final_model.fit(X_train, y_train)

# Predict once and reuse the result for both the score and the plot.
valid_pred = final_model.predict(X_valid)
print('F1 on the test sample: {} '.format(round(f1_score(y_valid, valid_pred), 4)))
show_confusion_matrix(y_valid, valid_pred)
In [8]:
def runXGB(train_X, train_y, test_X, test_y=None, nrounds=700,
           scale_pos_weight=50140/4668, tree_method='gpu_hist'):
    """Train an XGBoost binary classifier and return its predictions for `test_X`.

    Parameters
    ----------
    train_X, train_y : training features and binary labels.
    test_X : features to predict on.
    test_y : accepted for signature compatibility; not used.
    nrounds : number of boosting rounds (default 700, the former fixed value).
    scale_pos_weight : weight applied to the positive class (default 50140/4668,
        the former fixed value). Pass a ratio computed from `train_y` to adapt
        it to other data.
    tree_method : XGBoost tree construction method. The default 'gpu_hist'
        needs a GPU build; pass e.g. 'hist' to run on CPU.

    Returns
    -------
    Array returned by `Booster.predict` for `test_X` (one value per row under
    the 'binary:logistic' objective).
    """
    params = {
        'booster': 'gbtree',
        'tree_method': tree_method,
        'objective': 'binary:logistic',
        'eval_metric': 'aucpr',
        'eta': 0.05,  # 0.03
        'subsample': .8,
        'silent': 0,
        'verbose': 2,
        'max_depth': 9,
        'seed': 1,
        'max_delta_step': 4,
        'scale_pos_weight': scale_pos_weight,
        'gamma': 0.6,  # .5 #.1 #.2
        'colsample_bytree': 0.75,
    }

    xgtrain = xgb.DMatrix(train_X, label=train_y)
    xgtest = xgb.DMatrix(test_X)

    # xgb.train takes the parameter dict directly; no list conversion needed.
    model = xgb.train(params, xgtrain, nrounds)
    return model.predict(xgtest)
In [81]:
# Train on the training split and predict the hold-out rows (test_y left at its default).
val_preds = runXGB(X_train, y_train, X_valid,)
In [82]:
# Hold-out predictions and their maximum.
val_preds, max(val_preds)
Out[82]:
In [36]:
# XGBoost settings for the k-fold run. Key order is the same as when the dict
# was filled one entry at a time.
params = {
    'booster': 'gbtree',
    # 'tree_method': 'gpu_hist',
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'eta': 0.05,
    'subsample': .7,
    'silent': 0,
    'verbose': 1,
    'max_depth': 9,
    'seed': 1,
    'max_delta_step': 4,
    'scale_pos_weight': 50140/4668,
    'gamma': 1.,  # .5 #.1 #.2
    'colsample_bytree': 0.7,
    'nrounds': 500,  # 3600 #2000 #4000
}
In [37]:
# 5-fold stratified training through mlcrate. The result unpacks into `model`
# (indexed per fold later in the notebook), `p_train` and `p_test`.
# Presumably p_train holds out-of-fold predictions and p_test the fold-averaged
# test predictions — verify against the mlcrate documentation.
model, p_train, p_test = mlcrate.xgb.train_kfold(params, train_all, target, test_all, folds = 5, stratify=target)
In [38]:
# Test predictions from the k-fold run and their maximum.
p_test, max(p_test)
Out[38]:
In [52]:
# Number of rows per label value.
pd.Series(target).value_counts()
Out[52]:
In [57]:
# 4668 out of 54808 as a percentage. Presumably promoted rows vs. all training
# rows (50140 + 4668 = 54808, the counts used for scale_pos_weight) — TODO confirm.
4668/54808*100
Out[57]:
In [53]:
# Shape of the test prediction array.
p_test.shape
Out[53]:
In [60]:
# The same 4668/54808 ratio scaled by 23490. Presumably 23490 is the number of
# test rows, making this the expected count of positives in test — TODO confirm.
4668/54808*23490
Out[60]:
In [66]:
# Predicted class counts when the test probabilities are cut at 0.70.
pd.Series((p_test>.7)*1).value_counts()
Out[66]:
In [46]:
# Predicted class counts when the test probabilities are cut at 0.75.
pd.Series((p_test>.75)*1).value_counts()
Out[46]:
In [50]:
# Predicted class counts when the test probabilities are cut at 0.80.
pd.Series((p_test>.8)*1).value_counts()
Out[50]:
In [51]:
# Predicted class counts when the test probabilities are cut at 0.85.
pd.Series((p_test>.85)*1).value_counts()
Out[51]:
In [63]:
# Hard 0/1 labels: 1 where the test probability exceeds 0.61.
preds = pd.Series((p_test>.61)*1)
In [64]:
# Write the thresholded labels as a submission file without the index column.
# NOTE(review): `preds` is built with a 0.61 cut-off, while the file name says
# ".8" — confirm which threshold is intended.
make_submission(preds).to_csv('clipped_at_.8.csv', index=False)
In [ ]:
In [ ]:
In [107]:
import joblib
# Persist the first element of `model` on its own and then the whole `model` object.
joblib.dump(model[0],'xgb_best_0th')
joblib.dump(model,'xgb_best')
Out[107]:
In [88]:
import numpy as np
from sklearn.metrics import matthews_corrcoef
def find_matthews_threshold(p_valid, y_valid, try_all=False, verbose=False):
    """Return the cut-off that maximises the Matthews correlation coefficient.

    Parameters
    ----------
    p_valid : predicted scores for the validation rows.
    y_valid : true binary labels for the same rows.
    try_all : when exactly False, candidates are 0.30, 0.31, ... below 1;
        otherwise every distinct value of `p_valid` is tried.
    verbose : when exactly True, print the best score and its threshold.

    Returns
    -------
    The first candidate reaching the highest MCC, where a row is predicted
    positive if its score is strictly greater than the candidate. Returns 0
    if there are no candidates.
    """
    p_valid, y_valid = np.array(p_valid), np.array(y_valid)

    if try_all is False:
        candidates = np.arange(0.3, 1, 0.01)
    else:
        candidates = np.unique(p_valid)

    # MCC lies in [-1, 1], so -2 guarantees the first candidate is accepted.
    best, best_score = 0, -2
    for cutoff in candidates:
        current = matthews_corrcoef(y_valid, p_valid > cutoff)
        if current > best_score:
            best, best_score = cutoff, current

    if verbose is True:
        print('Best score: ', round(best_score, 5), ' @ threshold ', best)
    return best
def best_threshold_submission(p_valid, y_valid, p_test, try_all=False, verbose=False):
    """Binarise `p_test` with the MCC-optimal threshold found on the validation data.

    `p_valid`, `y_valid`, `try_all` and `verbose` are forwarded to
    find_matthews_threshold. Returns an integer array with 1 where the test
    score is strictly greater than that threshold and 0 elsewhere.
    """
    test_scores = np.array(p_test)
    cutoff = find_matthews_threshold(p_valid, y_valid, try_all, verbose)
    return (test_scores > cutoff) * 1
# Threshold chosen on the hold-out predictions (trying every distinct score,
# with the result printed) and applied to the k-fold test predictions.
submission_values = best_threshold_submission(val_preds, y_valid, p_test, True, True)
In [89]:
# Display the thresholded 0/1 test labels.
submission_values
Out[89]:
In [90]:
# Number of test rows assigned to each label.
pd.Series(submission_values).value_counts()
Out[90]:
In [96]:
# 0/1 labels when the test probabilities are cut at 0.90.
(p_test>.9) *1
Out[96]:
In [101]:
# Submission file using a fixed 0.75 cut-off on the k-fold test predictions.
make_submission((p_test>.75) *1).to_csv('day_2.csv', index=None)
In [113]:
def get_xgb_imp(xgb, feat_names):
    """Return the model's feature scores as a DataFrame sorted from highest to lowest.

    Parameters
    ----------
    xgb : object exposing `get_fscore()` that returns a {feature: score} mapping
        (for example an XGBoost Booster).
    feat_names : accepted for signature compatibility; not used. Feature labels
        are taken verbatim from the keys returned by `get_fscore()`.

    Returns
    -------
    DataFrame with columns ['feature', 'importance'], sorted by importance in
    descending order, with a fresh 0..n-1 index.
    """
    imp_vals = xgb.get_fscore()
    # Build both columns directly. The earlier construction (two-row frame,
    # transpose, overwrite the first column) wrote strings into an integer column.
    feats_imp = pd.DataFrame({
        'feature': list(imp_vals.keys()),
        'importance': list(imp_vals.values()),
    })
    feats_imp = feats_imp.sort_values('importance', ascending=False).reset_index(drop=True)
    return feats_imp
# Importance table for the element at index 3 of `model`.
# NOTE(review): `train_all.columns` needs train_all to be a DataFrame; after the
# np.load cell it would be an ndarray — confirm the intended execution order.
feature_importance_df = get_xgb_imp(model[3], feat_names= train_all.columns)
In [114]:
# Display the importance table.
feature_importance_df
Out[114]:
In [121]:
# Print an "f<position> <column name>" line for every column of train_all.
for position, column_name in enumerate(train_all.columns):
    print(f'f{position}', column_name)
In [ ]: