In [1]:
import numpy as np  # needed below for thresholding and saving predictions
from pystacknet.pystacknet import StackNetClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, ExtraTreesClassifier, ExtraTreesRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.decomposition import PCA
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
In [2]:
models = [
    # level 1: a diverse set of base learners
    [
        LGBMClassifier(boosting_type='gbdt', num_leaves=80, max_depth=-1, learning_rate=0.01,
                       n_estimators=1000, subsample_for_bin=1000, objective="xentropy",
                       min_split_gain=0.0, min_child_weight=0.01, min_child_samples=10,
                       subsample=0.9, subsample_freq=1, colsample_bytree=0.7,
                       reg_alpha=0.1, reg_lambda=0.1, random_state=1, n_jobs=3),
        LogisticRegression(C=1, random_state=1),
        RandomForestClassifier(n_estimators=300, criterion="entropy", max_depth=7, max_features=0.7, random_state=1),
        RandomForestClassifier(n_estimators=200, criterion="entropy", max_depth=8, max_features=0.7, random_state=1),
        LGBMClassifier(boosting_type='gbdt', num_leaves=40, max_depth=10, learning_rate=0.1,
                       n_estimators=1000, subsample_for_bin=1000, objective="xentropy",
                       min_split_gain=0.0, min_child_weight=0.01, min_child_samples=10,
                       subsample=0.9, subsample_freq=1, colsample_bytree=0.9,
                       reg_alpha=0.2, reg_lambda=0.2, random_state=1, n_jobs=3),
        LogisticRegression(penalty="l1", C=1, random_state=1),
        XGBClassifier(max_depth=8, learning_rate=0.1, n_estimators=300, objective="binary:logistic",
                      n_jobs=3, booster="gbtree", random_state=1, colsample_bytree=0.5),
        XGBClassifier(max_depth=10, learning_rate=0.1, n_estimators=300, objective="rank:pairwise",
                      n_jobs=3, booster="gbtree", random_state=1, colsample_bytree=0.7),
        LGBMClassifier(boosting_type='gbdt', num_leaves=40, max_depth=-1, learning_rate=0.01,
                       n_estimators=1000, subsample_for_bin=1000, objective="xentropy",
                       min_split_gain=0.0, min_child_weight=0.01, min_child_samples=10,
                       subsample=0.9, subsample_freq=1, colsample_bytree=0.5,
                       reg_alpha=0.0, reg_lambda=0.0, random_state=1, n_jobs=3)
    ],
    # level 2: a small meta layer (restacking=True below also feeds the level-1 inputs through)
    [
        XGBClassifier(max_depth=10, learning_rate=0.1, n_estimators=300, objective="rank:pairwise",
                      n_jobs=3, booster="gbtree", random_state=1, colsample_bytree=0.7),
        LGBMClassifier(boosting_type='gbdt', num_leaves=40, max_depth=-1, learning_rate=0.01,
                       n_estimators=1000, subsample_for_bin=1000, objective="xentropy",
                       min_split_gain=0.0, min_child_weight=0.01, min_child_samples=10,
                       subsample=0.9, subsample_freq=1, colsample_bytree=0.5,
                       reg_alpha=0.0, reg_lambda=0.0, random_state=1, n_jobs=3),
        RandomForestClassifier(n_estimators=300, criterion="entropy", max_depth=8, max_features=0.7, random_state=1)
    ]
]
In [3]:
model=StackNetClassifier(models, metric="f1", folds=3, restacking=True,
use_retraining=True, use_proba=False, random_state=12345,
n_jobs=4, verbose=2)
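Before fitting on the full matrix, a quick hold-out check is cheap. A minimal sketch, assuming train_all and target are the arrays saved by the feature-engineering notebook below; which probability column to threshold depends on pystacknet's last-level layout (the submission cell below uses column 2, here the last column is taken as an assumption):

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

X_tr, X_val, y_tr, y_val = train_test_split(train_all, target, test_size=0.2,
                                            stratify=target, random_state=1)
model.fit(X_tr, y_tr)
val_proba = model.predict_proba(X_val)
# sweep cutoffs on the hold-out to pick the F1-optimal threshold
best = max((f1_score(y_val, (val_proba[:, -1] >= t).astype(int)), t)
           for t in np.arange(0.1, 0.9, 0.01))
print(best)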
In [9]:
model.fit(train_all, target)
In [10]:
model
Out[10]:
In [12]:
preds=model.predict_proba(test_all)
In [18]:
sub = np.where(preds[:, 2] >= 0.61, 1, 0)
make_submission(sub).to_csv('py_stacknet.csv', index=False)
In [15]:
np.save('preds_pystacknet.npy', preds)
In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
In [2]:
import time
import xgboost as xgb
import lightgbm as lgb
# import category_encoders as cat_ed
import gc, mlcrate, glob
# from gplearn.genetic import SymbolicTransformer, SymbolicClassifier
from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, ExtraTreesRegressor
from IPython.display import display
from catboost import CatBoostClassifier
from scipy.cluster import hierarchy as hc
from collections import Counter
from sklearn import metrics
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA, TruncatedSVD, FastICA, FactorAnalysis
from sklearn.random_projection import GaussianRandomProjection, SparseRandomProjection
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
# ignore all warnings from sklearn, seaborn, etc.
import warnings
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn

# set_option (not option_context) so the display limits actually persist
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 1000)
In [3]:
PATH = os.getcwd()
PATH
Out[3]:
In [4]:
df_raw = pd.read_csv(f'{PATH}\\train.csv', low_memory=False)
df_test = pd.read_csv(f'{PATH}\\test.csv', low_memory=False)
In [5]:
df_raw.shape, df_test.shape
Out[5]:
In [6]:
new_cols = ['employee_id', 'department', 'region', 'education', 'gender',
            'recruitment_channel', 'no_of_trainings', 'age', 'previous_year_rating',
            'length_of_service', 'KPIs_met_more_than_80_percent', 'awards_won_bool',
            'avg_training_score', 'is_promoted']
# rename to cleaner, more descriptive column names
df_raw.columns = new_cols
df_test.columns = new_cols[:-1]  # the test set has no target column
cat_cols = list(df_raw.select_dtypes(include=['object']).columns)
num_cols = list(df_raw.select_dtypes(exclude=['object']).columns)
In [7]:
## The id column carries no signal, so it's better to drop it up front.
drop_col = ['employee_id']
df_raw.drop(drop_col, axis=1, inplace=True)
df_test.drop(drop_col, axis=1, inplace=True)
In [8]:
# %%time
# Create 2-way interactions between all categorical columns (run this last; tf-idf / count vectors could be added too)
# for f in range(0, len(cat_cols)):
#     for s in range(f+1, len(cat_cols)):
#         # concatenating two categorical columns gives a 2-way interaction feature
#         df_raw[cat_cols[f] + "_" + cat_cols[s]] = df_raw[cat_cols[f]] + "_" + df_raw[cat_cols[s]]
#         df_test[cat_cols[f] + "_" + cat_cols[s]] = df_test[cat_cols[f]] + "_" + df_test[cat_cols[s]]
#         cat_cols.append(cat_cols[f] + "_" + cat_cols[s])
#         print(len(cat_cols), end=' ')
In [9]:
###########################kind of binning age at trivial level #####################################
df_raw['is_age_39'] = np.zeros(df_raw.shape[0])
my_query = df_raw.query('age<=39.').index
df_raw.iloc[my_query, -1] = 1
df_raw['is_age_39_45'] = np.zeros(df_raw.shape[0])
my_query = df_raw.query('age>=39. & age<=45.').index
df_raw.iloc[my_query, -1] = 1
df_raw['is_age_45'] = np.zeros(df_raw.shape[0])
my_query = df_raw.query('age>=45.').index
df_raw.iloc[my_query, -1] = 1
#######################################################################################################
###################young age (13–30), middle age (31–50) and senior age (51–70)########################
#######################################################################################################
df_raw['age_group'] = np.zeros(df_raw.shape[0])
my_query = df_raw.query('age>=20. & age<=30.').index
df_raw.iloc[my_query, -1] = 'young'
my_query = df_raw.query('age>=31. & age<=50.').index
df_raw.iloc[my_query, -1] = 'middle_aged'
my_query = df_raw.query('age>=51. & age<=60.').index
df_raw.iloc[my_query, -1] = 'senior_aged'
###################################################################################################################
###################################################################################################################
###################################################################################################################
###########################kind of binning age at trivial level #####################################
df_test['is_age_39'] = np.zeros(df_test.shape[0])
my_query = df_test.query('age<=39.').index
df_test.iloc[my_query, -1] = 1
df_test['is_age_39_45'] = np.zeros(df_test.shape[0])
my_query = df_test.query('age>=39. & age<=45.').index
df_test.iloc[my_query, -1] = 1
df_test['is_age_45'] = np.zeros(df_test.shape[0])
my_query = df_test.query('age>=45.').index
df_test.iloc[my_query, -1] = 1
#######################################################################################################
###################young age (13–30), middle age (31–50) and senior age (51–70)########################
#######################################################################################################
df_test['age_group'] = np.zeros(df_test.shape[0])
my_query = df_test.query('age>=20. & age<=30.').index
df_test.iloc[my_query, -1] = 'young'
my_query = df_test.query('age>=31. & age<=50.').index
df_test.iloc[my_query, -1] = 'middle_aged'
my_query = df_test.query('age>=51. & age<=60.').index
df_test.iloc[my_query, -1] = 'senior_aged'
###############################################################################
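Since the three age flags and age_group are built identically for train and test, the whole cell can be expressed as one helper applied to both frames. A sketch (helper name assumed; pd.cut leaves out-of-range ages as NaN rather than 0, which train_cats later treats the same way):

def add_age_features(df):
    # flag features mirroring the queries above (boundaries 39 and 45 overlap by design)
    df['is_age_39'] = (df['age'] <= 39).astype(int)
    df['is_age_39_45'] = df['age'].between(39, 45).astype(int)
    df['is_age_45'] = (df['age'] >= 45).astype(int)
    # young (20-30), middle (31-50), senior (51-60); integer ages make the cut edges exact
    df['age_group'] = pd.cut(df['age'], bins=[19, 30, 50, 60],
                             labels=['young', 'middle_aged', 'senior_aged'])

for df in (df_raw, df_test):
    add_age_features(df)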
In [10]:
df_raw['promotion_chance'] = 'low'
my_query = df_raw.query('avg_training_score>=90').index
df_raw.iloc[my_query, -1] = 'very_high'
my_query = df_raw.query('avg_training_score>=75 and avg_training_score<90').index
df_raw.iloc[my_query, -1] = 'high'
my_query = df_raw.query('avg_training_score>=65 and avg_training_score<75').index
df_raw.iloc[my_query, -1] = 'medium'
my_query = df_raw.query('avg_training_score>=53 and avg_training_score<65').index
df_raw.iloc[my_query, -1] = 'low_medium'
df_test['promotion_chance'] = 'low'
my_query = df_test.query('avg_training_score>=90').index
df_test.iloc[my_query, -1] = 'very_high'
my_query = df_test.query('avg_training_score>=75 and avg_training_score<90').index
df_test.iloc[my_query, -1] = 'high'
my_query = df_test.query('avg_training_score>=65 and avg_training_score<75').index
df_test.iloc[my_query, -1] = 'medium'
my_query = df_test.query('avg_training_score>=53 and avg_training_score<65').index
df_test.iloc[my_query, -1] = 'low_medium'
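The same five score bands are applied to both frames; an equivalent de-duplicated form with pd.cut (a sketch, helper name assumed):

def add_promotion_chance(df):
    # (<53)=low, [53,65)=low_medium, [65,75)=medium, [75,90)=high, [90,+)=very_high
    df['promotion_chance'] = pd.cut(df['avg_training_score'],
                                    bins=[-np.inf, 53, 65, 75, 90, np.inf],
                                    labels=['low', 'low_medium', 'medium', 'high', 'very_high'],
                                    right=False)  # .astype(str) would match the original object dtype

for df in (df_raw, df_test):
    add_promotion_chance(df)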
In [10]:
feats_added = []
df_raw['joining_age'] = df_raw['age'] - df_raw['length_of_service']
df_test['joining_age'] = df_test['age'] - df_test['length_of_service']
feats_added.append('joining_age')
# strip the "region_" prefix so only the region code remains
df_raw['region'].replace('region_', '', inplace=True, regex=True)
df_test['region'].replace('region_', '', inplace=True, regex=True)
################################################################################3
bins = [20., 25., 30., 35., 40., 45., 50., 55., 60., 70.]
labels = [i + 1 for i in range(len(bins) - 1)]
bin_cols = ['age']
for col in bin_cols:
    df_raw[f'bin_{col}'] = pd.cut(df_raw[col], bins, labels=labels)
    df_test[f'bin_{col}'] = pd.cut(df_test[col], bins, labels=labels)
feats_added.append('bin_age')
bins = [39., 44., 54., 66., 75., 80., 85., 90., 95.]
labels = [i + 1 for i in range(len(bins) - 1)]
bin_cols = ['avg_training_score']
for col in bin_cols:
    df_raw[f'bin_{col}'] = pd.cut(df_raw[col], bins, labels=labels)
    df_test[f'bin_{col}'] = pd.cut(df_test[col], bins, labels=labels)
feats_added.append('bin_avg_training_score')
# extend (not append) so feats_added stays a flat list of feature names
feats_added.extend(['age_group', 'is_age_39', 'is_age_39_45', 'is_age_45', 'promotion_chance',
                    'reg_count', 'mean_age_per_region', 'mean_joining_age_per_region',
                    'mean_previous_year_rating_per_region', 'mean_avg_training_score_per_region',
                    'mean_length_of_service_per_region'])
###############################################################################################
def map_(regs, vals):
    # build a {region: aggregate value} lookup for Series.map
    return dict(zip(regs, vals))
xyz = (df_raw.groupby('region').mean().sort_values(by='region')
       [['age', 'joining_age', 'previous_year_rating', 'length_of_service', 'avg_training_score']]
       .reset_index())
count = Counter(df_raw['region'])
regs = xyz['region'].values
age = xyz['age'].values
joining_age = xyz['joining_age'].values
previous_year_rating = xyz['previous_year_rating'].values
length_of_service = xyz['length_of_service'].values
avg_training_score = xyz['avg_training_score'].values
df_raw['reg_count'] = df_raw['region'].map(count)
d = map_(regs, age)
df_raw['mean_age_per_region'] = df_raw['region'].map(d)
d = map_(regs, joining_age)
df_raw['mean_joining_age_per_region'] = df_raw['region'].map(d)
d = map_(regs, previous_year_rating)
df_raw['mean_previous_year_rating_per_region'] = df_raw['region'].map(d)
d = map_(regs, avg_training_score)
df_raw['mean_avg_training_score_per_region'] = df_raw['region'].map(d)
d = map_(regs, length_of_service)
df_raw['mean_length_of_service_per_region'] = df_raw['region'].map(d)
xyz = (df_test.groupby('region').mean().sort_values(by='region')
       [['age', 'joining_age', 'previous_year_rating', 'length_of_service', 'avg_training_score']]
       .reset_index())
count = Counter(df_test['region'])
regs = xyz['region'].values
age = xyz['age'].values
joining_age = xyz['joining_age'].values
previous_year_rating = xyz['previous_year_rating'].values
length_of_service = xyz['length_of_service'].values
avg_training_score = xyz['avg_training_score'].values
df_test['reg_count'] = df_test['region'].map(count)
d = map_(regs, age)
df_test['mean_age_per_region'] = df_test['region'].map(d)
d = map_(regs, joining_age)
df_test['mean_joining_age_per_region'] = df_test['region'].map(d)
d = map_(regs, previous_year_rating)
df_test['mean_previous_year_rating_per_region'] = df_test['region'].map(d)
d = map_(regs, avg_training_score)
df_test['mean_avg_training_score_per_region'] = df_test['region'].map(d)
d = map_(regs, length_of_service)
df_test['mean_length_of_service_per_region'] = df_test['region'].map(d)
####################################################################################
del d, count, regs, joining_age, previous_year_rating, length_of_service, avg_training_score
gc.collect()
Out[10]:
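The per-region aggregates above can be written far more compactly with groupby().transform. Note the cell derives the test-side means from the test set itself; mapping train-derived means onto test is the more common choice. A sketch of the compact form, assuming the same column names:

agg_cols = ['age', 'joining_age', 'previous_year_rating',
            'avg_training_score', 'length_of_service']
for df in (df_raw, df_test):
    df['reg_count'] = df.groupby('region')['age'].transform('count')
    for c in agg_cols:
        df[f'mean_{c}_per_region'] = df.groupby('region')[c].transform('mean')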
In [11]:
df_raw['promotion_chance'].head()
Out[11]:
In [12]:
# https://www.kaggle.com/ogrellier/python-target-encoding-for-categorical-features
def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))

def target_encode(trn_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """
    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean and count per category
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Compute smoothing (a sigmoid in the category count)
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Global prior
    prior = target.mean()
    # The bigger the count, the less the prior is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index, so restore it
    ft_trn_series.index = trn_series.index
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index, so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)
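A toy run (values invented purely for illustration) makes the shrinkage behaviour concrete: categories with few rows are pulled toward the global prior, and unseen test categories fall back to it entirely:

trn = pd.Series(['a', 'a', 'a', 'b'], name='dept')
tst = pd.Series(['a', 'b', 'c'], name='dept')
y = pd.Series([1, 0, 1, 1], name='is_promoted')
enc_trn, enc_tst = target_encode(trn, tst, target=y,
                                 min_samples_leaf=2, smoothing=1, noise_level=0)
# 'a' (3 rows) stays close to its own mean 0.67; 'b' (1 row) shrinks from 1.0
# toward the prior 0.75; unseen 'c' gets exactly the prior
print(enc_tst.values)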
In [13]:
train_cats(df_raw);
apply_cats(df_test,df_raw)
In [14]:
cat_cols = list(df_raw.select_dtypes(include=['object','category']).columns)
In [15]:
train_encoded, test_encoded = [], []
for i in range(len(cat_cols)):
    trn, sub = target_encode(df_raw[cat_cols[i]],
                             df_test[cat_cols[i]],
                             target=df_raw.is_promoted,
                             min_samples_leaf=100,
                             smoothing=10,
                             noise_level=0.01)
    train_encoded.append(trn)
    test_encoded.append(sub)
    print(i, end=',')
In [16]:
df_raw_cat = df_raw[cat_cols]
df_test_cat = df_test[cat_cols]
df_raw.drop(cat_cols, axis=1, inplace=True)
df_test.drop(cat_cols, axis=1, inplace=True)
df_raw.shape, df_test.shape
Out[16]:
In [17]:
df_raw_cat.get_ftype_counts()
Out[17]:
In [18]:
df_raw.head()
Out[18]:
In [16]:
target = df_raw.is_promoted
df_raw.drop('is_promoted', axis=1, inplace=True)
In [23]:
categorical_features_indices1 = np.where(df_raw.dtypes == 'category')[0]
categorical_features_indices1
Out[23]:
In [32]:
df_raw['previous_year_rating'].fillna(0,inplace=True)
df_test['previous_year_rating'].fillna(0,inplace=True)
In [36]:
df_raw.fillna(method='bfill',inplace=True)
df_test.fillna(method='bfill',inplace=True)
In [37]:
# note: test_size=0.8 puts 80% of the rows in the validation set, so CatBoost trains on only ~20%
X_train, X_validation, y_train, y_validation = train_test_split(df_raw, target, test_size=0.8,
                                                                random_state=1234, stratify=target)
In [47]:
# build and fit a CatBoost model (class_weights counter the heavy class imbalance)
from catboost import CatBoostClassifier
model = CatBoostClassifier(logging_level='Verbose', class_weights=[0.3, 0.7], iterations=500,
                           depth=10, learning_rate=0.01, loss_function='Logloss')
model.fit(X_train, y_train, cat_features=categorical_features_indices1,
          eval_set=(X_validation, y_validation))
Out[47]:
In [50]:
model.predict_proba(df_test)
Out[50]:
In [60]:
temp = pd.DataFrame()
l = []
# pair each feature name with its importance
for imp, col in zip(model.feature_importances_, df_raw.columns):
    l.append([col, imp])
In [73]:
temp = pd.DataFrame(l, columns=['col', 'imp'])
# pandas .plot creates its own figure, so pass figsize directly
temp.plot('col', 'imp', kind='barh', figsize=(10, 10))
Out[73]:
In [19]:
target = df_raw.is_promoted
df_raw.drop('is_promoted', axis=1, inplace=True)
df_raw['previous_year_rating'].fillna(0,inplace=True)
df_test['previous_year_rating'].fillna(0,inplace=True)
In [20]:
#df_raw[pd.isnull(df_raw['previous_year_rating'])]
In [20]:
train_encoded_T, test_encoded_T = np.asarray(train_encoded).T, np.asarray(test_encoded).T
In [21]:
# ###need to check on this
# df_raw_cat['promotion_chance'] = df_raw['promotion_chance']
# df_test_cat['promotion_chance'] = df_test['promotion_chance']
# df_raw.drop('promotion_chance', axis=1,inplace=True)
# df_test.drop('promotion_chance', axis=1,inplace=True)
In [21]:
df_raw_cat.columns
Out[21]:
In [22]:
train_all = np.hstack((df_raw, train_encoded_T, pd.get_dummies(df_raw_cat)))
test_all = np.hstack((df_test, test_encoded_T, pd.get_dummies(df_test_cat)))
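pd.get_dummies only lines up across the two stacks because apply_cats gave df_test_cat the training category maps; a cheap assertion (a sketch) guards that:

d_tr, d_te = pd.get_dummies(df_raw_cat), pd.get_dummies(df_test_cat)
# same dummy columns in the same order, or the hstacked features won't correspond
assert list(d_tr.columns) == list(d_te.columns)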
In [23]:
train_all.shape, test_all.shape
Out[23]:
In [7]:
def make_submission(probs):
    sample = pd.read_csv(f'{PATH}\\sample_submission.csv')
    submit = sample.copy()
    submit['is_promoted'] = probs
    return submit
In [25]:
# class balance: number of positives vs negatives
np.count_nonzero(target), target.shape[0] - np.count_nonzero(target)
Out[25]:
In [8]:
train_all, test_all, target = np.load('train_all.npy'), np.load('test_all.npy'), np.load('target.npy')
In [26]:
X_train, X_valid, y_train, y_valid = train_test_split(train_all, target, test_size = .2, stratify = target)
In [27]:
def runXGB(train_X, train_y, test_X, test_y=None):
    params = {}
    params['booster'] = 'gbtree'
    params['tree_method'] = 'gpu_hist'
    params["objective"] = "binary:logistic"
    params['eval_metric'] = 'auc'
    params["eta"] = 0.05  # 0.03
    params["subsample"] = .8
    params["silent"] = 0
    params["max_depth"] = 10
    params["seed"] = 1
    params["max_delta_step"] = 4
    params['scale_pos_weight'] = 50140 / 4668  # negatives / positives in the training set
    params["gamma"] = 0.6  # .5 / .1 / .2
    params['colsample_bytree'] = 0.75
    num_rounds = 1000  # 3600 / 2000 / 4000
    plst = list(params.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)
    xgtest = xgb.DMatrix(test_X)
    # num_boost_round must be passed explicitly; an 'nrounds' key in params is ignored by xgb.train
    model = xgb.train(plst, xgtrain, num_boost_round=num_rounds)
    pred_test_y = model.predict(xgtest)
    return pred_test_y
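runXGB accepts test_y but never uses it; a variant with early stopping (a sketch against the standard xgboost train API, function name assumed) would watch the validation fold instead of fixing the round count:

def runXGB_es(train_X, train_y, valid_X, valid_y, params, num_rounds=1000):
    dtrain = xgb.DMatrix(train_X, label=train_y)
    dvalid = xgb.DMatrix(valid_X, label=valid_y)
    # stop when validation AUC hasn't improved for 50 rounds
    model = xgb.train(list(params.items()), dtrain, num_boost_round=num_rounds,
                      evals=[(dtrain, 'train'), (dvalid, 'valid')],
                      early_stopping_rounds=50, verbose_eval=100)
    return model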
In [29]:
val_preds = runXGB(X_train, y_train, X_valid)
In [30]:
val_preds, max(val_preds)
Out[30]:
In [10]:
# test_preds = model_srk.predict(test_all)
In [31]:
params = {}
params['booster'] = 'gbtree'
params['tree_method'] = 'gpu_hist'
params["objective"] = "binary:logistic"
params['eval_metric'] = 'auc'
params["eta"] = 0.05 #0.03
params["subsample"] = .85
params["silent"] = 0
params['verbose'] = 1
params["max_depth"] = 10
params["seed"] = 1
params["max_delta_step"] = 4
params['scale_pos_weight'] = 50140/4668
params["gamma"] = 0.6 #.5 #.1 #.2
params['colsample_bytree'] = 0.75
params['nrounds'] = 500 #3600 #2000 #4000
In [23]:
# X_train, X_valid, y_train, y_valid = train_test_split(train_all, target, test_size = .2, stratify = target)
In [33]:
model, p_train, p_test = mlcrate.xgb.train_kfold(params, train_all, target, test_all, folds = 7, stratify=target)
In [34]:
import numpy as np
from sklearn.metrics import matthews_corrcoef

def find_matthews_threshold(p_valid, y_valid, try_all=False, verbose=False):
    p_valid, y_valid = np.array(p_valid), np.array(y_valid)
    best = 0
    best_score = -2
    # sweep a fixed grid, or every distinct predicted probability if try_all
    totry = np.arange(0, 1, 0.01) if try_all is False else np.unique(p_valid)
    for t in totry:
        score = matthews_corrcoef(y_valid, p_valid > t)
        if score > best_score:
            best_score = score
            best = t
    if verbose is True:
        print('Best score: ', round(best_score, 5), ' @ threshold ', best)
    return best

def best_threshold_submission(p_valid, y_valid, p_test, try_all=False, verbose=False):
    p_test = np.array(p_test)
    thresh = find_matthews_threshold(p_valid, y_valid, try_all, verbose)
    return p_test > thresh

# note: the threshold is tuned on the single-model validation predictions (val_preds)
# but applied to the k-fold test predictions (p_test)
submission_values = best_threshold_submission(val_preds, y_valid, p_test, True, True)
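Since the competition metric is F1 (the StackNet section above tunes for it), the same sweep with f1_score is a natural cross-check (a sketch, function name assumed):

from sklearn.metrics import f1_score

def find_f1_threshold(p_valid, y_valid):
    # sweep cutoffs and keep the F1-optimal one
    totry = np.arange(0.01, 1.0, 0.01)
    scores = [f1_score(np.array(y_valid), np.array(p_valid) > t) for t in totry]
    return totry[int(np.argmax(scores))]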
In [35]:
submission_values*1
Out[35]:
In [36]:
sample = pd.read_csv('sample_submission.csv')
In [48]:
def get_xgb_imp(xgb_model, feat_names):
    # get_fscore() returns {feature_name: split count} for a trained Booster
    imp_vals = xgb_model.get_fscore()
    feats_imp = pd.DataFrame(imp_vals, index=np.arange(2)).T
    feats_imp.iloc[:, 0] = feats_imp.index
    feats_imp.columns = ['feature', 'importance']
    feats_imp.sort_values('importance', inplace=True, ascending=False)
    feats_imp.reset_index(drop=True, inplace=True)
    return feats_imp

feature_importance_df = get_xgb_imp(model[3], feat_names=[])
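xgboost also ships a built-in importance plot, which avoids the DataFrame reshaping above; assuming model[3] is a Booster (mlcrate returns one per fold):

import xgboost as xgb
import matplotlib.pyplot as plt

xgb.plot_importance(model[3], max_num_features=20, height=0.6)
plt.tight_layout()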
In [ ]:
# model, p_train, p_test = mlcrate.xgb.train_kfold(params, train_all, target, test_all, folds = 7, stratify=target)
In [51]:
# cast the boolean mask to 0/1 so the csv holds integers, not True/False
predictions_test = submission_values.astype(int)  # alternative: np.where(p_test >= 0.75, 1, 0)
sample['is_promoted'] = predictions_test
sample.to_csv('preds_2_xgb_with_dummies_optimised_version.csv', index=False)
In [52]:
import joblib
#save model
joblib.dump(model[0], 'xgb_model_1')
Out[52]:
In [250]:
# np.save returns None, so don't assign its result; persist the arrays directly
train_all = np.hstack((train_all, pd.get_dummies(df_raw_cat)))
test_all = np.hstack((test_all, pd.get_dummies(df_test_cat)))
np.save('train_all.npy', train_all)
np.save('target.npy', target)
np.save('test_all.npy', test_all)