In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
from fastai.imports import *
from fastai.structured import *
from mlcrate import *
import time
from gplearn.genetic import SymbolicTransformer
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
import gc
from scipy.cluster import hierarchy as hc
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.model_selection import StratifiedKFold
import warnings
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn  # ignore all warnings from sklearn, seaborn, etc.
# option_context is a context manager and does nothing outside a `with` block;
# set_option applies the settings globally.
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 1000)
In [2]:
PATH = os.getcwd();
PATH
Out[2]:
In [11]:
df_raw = pd.read_csv(f'{PATH}\\AV_Stud\\lgb_train_v1.csv', low_memory=False)
df_test = pd.read_csv(f'{PATH}\\AV_Stud\\lgb_test_v1.csv', low_memory=False)
In [60]:
df_raw.columns[:3]
Out[60]:
In [90]:
# drop the index column written by an earlier to_csv that didn't pass index=False
df_raw.drop('Unnamed: 0', axis=1, inplace=True)
df_test.drop('Unnamed: 0', axis=1, inplace=True)
In [64]:
df_raw.head(2)
Out[64]:
In [5]:
cols_dropped = ['id']
target = df_raw.is_pass
df_raw.drop('is_pass', inplace=True, axis=1)
In [6]:
df_raw['trainee_id'].value_counts().sort_values(ascending=False).plot(kind='hist')
Out[6]:
In [7]:
df_test['trainee_id'].value_counts().sort_values(ascending=False).plot(kind='hist')
Out[7]:
In [8]:
from collections import Counter
train_trainee_id = Counter(df_raw['trainee_id'])
test_trainee_id = Counter(df_test['trainee_id'])
######################### New Cols Added ##########################
# Frequency-encode trainee_id with counts from the training set. Counter.__missing__
# returns 0, so trainee_ids unseen in training map to 0 rather than NaN.
df_raw['trainee_count'] = df_raw['trainee_id'].map(train_trainee_id).astype(np.int32)
df_test['trainee_count'] = df_test['trainee_id'].map(train_trainee_id).astype(np.int32)
########################### Creating Validation Dataset #########################
'''
rows_to_dropped = df_raw.loc[df_raw['trainee_count'] <=3].index
valid_data = df_raw.loc[df_raw['trainee_count'] <=3]
df_raw.drop(rows_to_dropped, inplace=True)
df_raw.shape
########################## Done For Temp ########################################
''';
In [9]:
##################### Dropping ID #######################
df_raw.drop('id', axis=1, inplace=True)
df_test.drop('id', axis=1, inplace=True)
#########################################################
In [10]:
# This way we have randomness and are able to reproduce the behaviour within this cell.
np.random.seed(13)
from sklearn.model_selection import KFold
def impact_coding(data, feature, target='y'):
    '''
    In this implementation we get the values and the dictionary as two different steps.
    This is just because initially we were ignoring the dictionary as a result variable.
    In this implementation the KFolds use shuffling. If you want reproducibility, the cv
    could be moved to a parameter.
    '''
    n_folds = 7
    n_inner_folds = 5
    impact_coded = pd.Series()

    oof_default_mean = data[target].mean()  # Global mean to use by default (you could further tune this)
    kf = KFold(n_splits=n_folds, shuffle=True)
    oof_mean_cv = pd.DataFrame()
    split = 0
    for infold, oof in kf.split(data[feature]):
        impact_coded_cv = pd.Series()
        kf_inner = KFold(n_splits=n_inner_folds, shuffle=True)
        inner_split = 0
        inner_oof_mean_cv = pd.DataFrame()
        oof_default_inner_mean = data.iloc[infold][target].mean()
        for infold_inner, oof_inner in kf_inner.split(data.iloc[infold]):
            # infold_inner holds positions within data.iloc[infold], so index into that
            # subset (indexing data directly would pull in outer-oof rows and leak).
            # The mean to apply to the inner oof split (a 1/n_folds % based on the rest)
            oof_mean = data.iloc[infold].iloc[infold_inner].groupby(by=feature)[target].mean()
            impact_coded_cv = impact_coded_cv.append(data.iloc[infold].apply(
                lambda x: oof_mean[x[feature]]
                          if x[feature] in oof_mean.index
                          else oof_default_inner_mean,
                axis=1))

            # Also populate mapping (this has all group -> mean for all inner CV folds)
            inner_oof_mean_cv = inner_oof_mean_cv.join(pd.DataFrame(oof_mean), rsuffix=inner_split, how='outer')
            inner_oof_mean_cv.fillna(value=oof_default_inner_mean, inplace=True)
            inner_split += 1

        # Also populate mapping
        oof_mean_cv = oof_mean_cv.join(pd.DataFrame(inner_oof_mean_cv), rsuffix=split, how='outer')
        oof_mean_cv.fillna(value=oof_default_mean, inplace=True)
        split += 1

        impact_coded = impact_coded.append(data.iloc[oof].apply(
            lambda x: inner_oof_mean_cv.loc[x[feature]].mean()
                      if x[feature] in inner_oof_mean_cv.index
                      else oof_default_mean,
            axis=1))

    return impact_coded, oof_mean_cv.mean(axis=1), oof_default_mean
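In [ ]:
# A minimal usage sketch for impact_coding on a toy frame (not part of the original
# run; 'cat' and 'y' are hypothetical column names). The returned Series is indexed
# by the original rows, so it can be assigned straight back onto the frame.
toy = pd.DataFrame({'cat': list('aabbbcc') * 10,
                    'y': np.random.binomial(1, 0.5, 70)})
encoded, mapping, default_mean = impact_coding(toy, 'cat', target='y')
toy['impact_encoded_cat'] = encoded
toy.head()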
In [13]:
df_raw['program_type__program_duration'] = df_raw.program_type.str.cat(df_raw.program_duration.astype(str),sep='_')
df_raw['test_id__program_duration'] = df_raw.test_id.astype(str).str.cat(df_raw.program_duration.astype(str),sep='_')
df_raw['test_id__test_type'] = df_raw.test_id.astype(str).str.cat(df_raw.test_type.astype(str),sep='_')
df_raw['test_type__difficulty_level'] = df_raw.test_type.str.cat(df_raw.difficulty_level.astype(str),sep='_')
df_raw['education__gender'] = df_raw.education.str.cat(df_raw.gender.astype(str),sep='_')
df_raw['education__city_tier'] = df_raw.education.str.cat(df_raw.city_tier.astype(str),sep='_')
df_raw['gender__city_tier'] = df_raw.gender.str.cat(df_raw.city_tier.astype(str),sep='_')
df_raw['trainee_engagement_rating'] = df_raw['trainee_engagement_rating'].astype(object)
########################### simple binning of age ###########################
# NaN ages compare False and stay 0; the bin edges overlap at 39 and 45.
df_raw['is_age_39'] = (df_raw['age'] <= 39.).astype(np.float64)
df_raw['is_age_39_45'] = ((df_raw['age'] >= 39.) & (df_raw['age'] <= 45.)).astype(np.float64)
df_raw['is_age_45'] = (df_raw['age'] >= 45.).astype(np.float64)
#######################################################################################################
###################young age (13–30), middle age (31–50) and senior age (51–70)########################
#######################################################################################################
'''df_raw['age_group'] = np.zeros(df_raw.shape[0])
my_query = df_raw.query('age>=13. & age<=30.').index
df_raw.iloc[my_query, -1] = 'young'
my_query = df_raw.query('age>=31. & age<=50.').index
df_raw.iloc[my_query, -1] = 'middle_aged'
my_query = df_raw.query('age>=51. & age<=70.').index
df_raw.iloc[my_query, -1] = 'senior_aged'''
###################################################################################################################
###################################################################################################################
###################################################################################################################
df_test['program_type__program_duration'] = df_test.program_type.str.cat(df_test.program_duration.astype(str),sep='_')
df_test['test_id__program_duration'] = df_test.test_id.astype(str).str.cat(df_test.program_duration.astype(str),sep='_')
df_test['test_id__test_type'] = df_test.test_id.astype(str).str.cat(df_test.test_type.astype(str),sep='_')
df_test['test_type__difficulty_level'] = df_test.test_type.str.cat(df_test.difficulty_level.astype(str),sep='_')
df_test['education__gender'] = df_test.education.str.cat(df_test.gender.astype(str),sep='_')
df_test['education__city_tier'] = df_test.education.str.cat(df_test.city_tier.astype(str),sep='_')
df_test['gender__city_tier'] = df_test.gender.str.cat(df_test.city_tier.astype(str),sep='_')
df_test['trainee_engagement_rating'] = df_test['trainee_engagement_rating'].astype(object)
########################### same binning of age for the test set ###########################
df_test['is_age_39'] = (df_test['age'] <= 39.).astype(np.float64)
df_test['is_age_39_45'] = ((df_test['age'] >= 39.) & (df_test['age'] <= 45.)).astype(np.float64)
df_test['is_age_45'] = (df_test['age'] >= 45.).astype(np.float64)
#######################################################################################################
###################young age (13–30), middle age (31–50) and senior age (51–70)########################
#######################################################################################################
'''
df_test['age_group'] = np.zeros(df_test.shape[0])
my_query = df_test.query('age>=13. & age<=30.').index
df_test.iloc[my_query, -1] = 'young'
my_query = df_test.query('age>=31. & age<=50.').index
df_test.iloc[my_query, -1] = 'middle_aged'
my_query = df_test.query('age>=51. & age<=70.').index
df_test.iloc[my_query, -1] = 'senior_aged''';
###############################################################################
In [14]:
features = df_raw.columns
numeric_features = []
categorical_features = []
for dtype, feature in zip(df_raw.dtypes, df_raw.columns):
    if dtype == object:
        categorical_features.append(feature)
    else:
        numeric_features.append(feature)
categorical_features
Out[14]:
In [15]:
df_raw['is_pass'] = target
############################### remember to drop the target column again after the next block
In [16]:
%%time
# Apply the encoding to training and test data, and preserve the mapping
impact_coding_map = {}
for f in categorical_features:
    print("Impact coding for {}".format(f))
    df_raw["impact_encoded_{}".format(f)], impact_coding_mapping, default_coding = impact_coding(df_raw, f, 'is_pass')
    impact_coding_map[f] = (impact_coding_mapping, default_coding)
    mapping, default_mean = impact_coding_map[f]
    df_test["impact_encoded_{}".format(f)] = df_test.apply(lambda x: mapping[x[f]]
                                                           if x[f] in mapping
                                                           else default_mean,
                                                           axis=1)
df_raw.drop('is_pass', inplace=True, axis=1);
In [17]:
##################### sanity check should be empty #####################
set(df_raw.columns) - set(df_test.columns)
Out[17]:
In [18]:
df_raw['is_pass'] = target
df_raw.to_csv(f'{PATH}\\AV_Stud\\lgb_train_v1.csv')
df_test.to_csv(f'{PATH}\\AV_Stud\\lgb_test_v1.csv')
df_raw.drop('is_pass', inplace=True, axis =1);
In [20]:
X_train, X_test, y = df_raw.copy(), df_test.copy(), target
In [21]:
X_train['is_pass'] = target
In [22]:
rows_to_drop = X_train.loc[X_train['trainee_count'] <= 3].index
X_valid = X_train.loc[X_train['trainee_count'] <= 3].copy()  # X_valid keeps is_pass for now; dropped below
X_train.drop(rows_to_drop, inplace=True)                     # X_train keeps is_pass for now; dropped below
y_train = X_train['is_pass']
y_valid = X_valid['is_pass']
X_train.drop('is_pass', axis=1, inplace=True)
X_valid.drop('is_pass', axis=1, inplace=True)
In [23]:
####################### sanity checks #######################
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape, df_raw.shape, df_test.shape
Out[23]:
In [24]:
df_raw['trainee_engagement_rating'].value_counts()
######## Add this as a feature too: split rating at 4 #########################
Out[24]:
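In [ ]:
# A sketch of the feature the comment above proposes (one possible reading: a binary
# flag for rating >= 4; 'is_rating_ge_4' is a hypothetical name, not in the original).
# The column was cast to object earlier, so compare on a float view; NaNs become 0.
for frame in (df_raw, df_test):
    frame['is_rating_ge_4'] = (frame['trainee_engagement_rating'].astype(float) >= 4).astype(int)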
In [37]:
df_raw['age'].fillna(-1, inplace=True)
df_test['age'].fillna(-1, inplace=True)
df_raw['trainee_engagement_rating'].fillna(method='ffill', inplace=True)
df_test['trainee_engagement_rating'].fillna(method='ffill', inplace=True)
In [65]:
#X_train['age'].fillna(int(df_raw['age'].mean()), inplace=True)
#X_valid['age'].fillna(int(df_raw['age'].mean()), inplace=True)
#X_train['trainee_engagement_rating'].fillna(value=2., inplace=True)
#X_valid['trainee_engagement_rating'].fillna(value=2., inplace=True)
In [30]:
train_cats(df_raw)
apply_cats(df_test, df_raw);
In [19]:
categorical_features_indices = np.where(df_raw.dtypes == 'object')[0];
#df_raw.drop('is_pass',axis=1,inplace=True);
categorical_features_indices
Out[19]:
In [39]:
X_train, X_valid, y_train, y_valid = train_test_split(df_raw, target, train_size=0.8, random_state=1234)
model = CatBoostClassifier(iterations=200, depth=12, learning_rate=0.5, loss_function='Logloss', use_best_model=True,
                           class_weights=[0.3045921227117995, 0.6954078772882005])
model.fit(X_train, y_train, cat_features=categorical_features_indices, eval_set=(X_valid, y_valid));
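In [ ]:
# The hard-coded class_weights above sum to 1, and their ratio (0.3046 / 0.6954 ≈ 0.438)
# matches the scale_pos_weight used for xgboost further down, so they look like the two
# class frequencies of is_pass. A hedged sketch of that derivation (an assumption, not
# something the original notebook shows):
freq = target.value_counts(normalize=True)   # share of rows per class
class_weights = [freq.min(), freq.max()]     # reproduces [0.3046..., 0.6954...] for a ~30/70 split
scale_pos_weight = freq.min() / freq.max()   # ≈ 0.438 under the same assumption
print(class_weights, scale_pos_weight)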
In [40]:
preds_cat = model.predict_proba(df_test)
In [20]:
train_cats(df_raw);
apply_cats(df_test, df_raw);
In [81]:
train_cats(X_train);
apply_cats(X_valid, X_train);
In [21]:
def label_encode_edu(x):
    switcher = {
        "High School Diploma": 1,
        "Matriculation": 2,
        "Bachelors": 3,
        "Masters": 4,
    }
    return switcher.get(x, 0)

def label_encode_diff(x):
    switcher = {
        "easy": 1,
        "intermediate": 2,
        "hard": 3,
        "very hard": 4,
    }
    return switcher.get(x, 0)

df_raw["education"] = df_raw["education"].apply(label_encode_edu)
df_test["education"] = df_test["education"].apply(label_encode_edu)
# difficulty_level needs its own encoder; the education encoder would map every
# difficulty value to the default 0.
df_raw["difficulty_level"] = df_raw["difficulty_level"].apply(label_encode_diff)
df_test["difficulty_level"] = df_test["difficulty_level"].apply(label_encode_diff)
In [24]:
print("LGB starting")
params = {
'use_missing': True,
'learning_rate': 0.05,
'objective': 'binary',
'max_depth': 11,
'num_leaves': 500,
'verbosity': 1,
'metric' : ['auc', 'binary_logloss'],
'data_random_seed': 1,
'bagging_fraction': 0.75,
'feature_fraction': 0.75,
'nthread': 4,
'min_data_in_leaf': 100,
'max_bin': 255,
'is_unbalance': True,
'max_cat_threshold' : 32,
'enable_bundle' : False
}
d_train = lgb.Dataset(df_raw, label= target)
#d_val = lgb.Dataset(X_valid, label=y_valid)
watchlist = [d_train]
model_lgb = lgb.train(params, train_set=d_train, num_boost_round=1500, verbose_eval=100)
# model_lgb = lgb.train(params, train_set=d_train, num_boost_round=1500, valid_sets=watchlist,
#                       verbose_eval=100, early_stopping_rounds=500)
print('Finish LGB Training')
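In [ ]:
# A sketch of the early-stopping variant left commented out above, assuming the
# X_train/X_valid split from In [22] is in scope with the same dtypes as the
# full-data run (names with an _es suffix are hypothetical):
d_train_es = lgb.Dataset(X_train, label=y_train)
d_val_es = lgb.Dataset(X_valid, label=y_valid, reference=d_train_es)
model_lgb_es = lgb.train(params, train_set=d_train_es, num_boost_round=1500,
                         valid_sets=[d_train_es, d_val_es], verbose_eval=100,
                         early_stopping_rounds=500)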
In [36]:
df_raw = pd.get_dummies(df_raw, drop_first=True, prefix='dummy')
df_test = pd.get_dummies(df_test, drop_first=True, prefix='dummy')
In [38]:
df_raw.shape, df_test.shape
Out[38]:
In [25]:
def make_submission(probs):
    sample = pd.read_csv(f'{PATH}\\AV_Stud\\sample_submission_vaSxamm.csv')
    submit = sample.copy()
    submit['is_pass'] = probs
    return submit
In [26]:
preds_lgb = model_lgb.predict(data=df_test)
In [116]:
submit = make_submission(preds_xgb)
In [117]:
submit.head(2)
Out[117]:
In [118]:
submit.to_csv(f'{PATH}\\AV_Stud\\xgb_____.csv', index=False)
In [39]:
model_lgb.save_model(f'{PATH}\\AV_Stud\\model.txt', num_iteration=model_lgb.best_iteration)
In [40]:
json_model = model_lgb.dump_model()
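In [ ]:
# A sketch of reloading the booster saved above in a fresh session (same path assumed);
# lgb.Booster(model_file=...) is the standard LightGBM reload API.
model_reloaded = lgb.Booster(model_file=f'{PATH}\\AV_Stud\\model.txt')
# preds_lgb = model_reloaded.predict(df_test)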
In [88]:
train_cats(df_raw)
apply_cats(df_test,df_raw)
In [92]:
cols_dropped = ['program_type__program_duration','test_id__program_duration','test_id__test_type',\
'test_type__difficulty_level','education__gender','education__city_tier',\
'gender__city_tier']
df_test = df_test.drop(cols_dropped, axis=1)
df_raw = df_raw.drop(cols_dropped, axis=1)
df_raw.drop('is_pass', axis=1, inplace=True)
In [102]:
set(df_raw.columns) - set(df_test.columns)
Out[102]:
In [98]:
df_raw = pd.get_dummies(df_raw, drop_first=True, prefix='dummy',columns=['program_id','program_type','test_type','difficulty_level',\
'gender','education','is_handicapped'])
df_test = pd.get_dummies(df_test, drop_first=True, prefix='dummy',columns=['program_id','program_type','test_type','difficulty_level',\
'gender','education','is_handicapped'])
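In [ ]:
# get_dummies can yield different column sets for train and test when a category is
# missing from one frame; the set-difference check in In [102] catches this. A hedged
# sketch of one common fix: align test onto the training columns, filling any missing
# dummies with 0 (safe here because is_pass has already been dropped from df_raw).
df_raw, df_test = df_raw.align(df_test, join='left', axis=1, fill_value=0)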
In [114]:
def runXGB(train_X, train_y, test_X, test_y=None, seed_val=1, depth=11, model=None):
    params = {}
    params['booster'] = 'gbtree'
    #params['updater'] = 'coord_descent'
    params["objective"] = "binary:logistic"
    params['eval_metric'] = 'auc'
    params["eta"] = 0.05  #0.03
    params["subsample"] = .85
    params["silent"] = 0
    params["max_depth"] = depth
    params["seed"] = seed_val
    params["max_delta_step"] = 4
    params['scale_pos_weight'] = 0.4380049934141978
    params["gamma"] = 0.6  #.5 #.1 #.2
    params['colsample_bytree'] = 0.9
    num_rounds = 2500  #3600 #2000 #4000

    plst = list(params.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)
    if test_y is not None:
        print('1st block\n')
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=50, verbose_eval=True)
    else:
        print('2nd block\n')
        xgtest = xgb.DMatrix(test_X)
        watchlist = [(xgtrain, 'train')]
        print('########################### model ######################\n')
        model = xgb.train(plst, xgtrain, num_rounds)
    # best_ntree_limit only exists after early stopping; 0 means "use all trees"
    best_ntree_limit = getattr(model, 'best_ntree_limit', 0)
    pred_test_y = model.predict(xgtest, ntree_limit=best_ntree_limit)
    return pred_test_y, model, plst, best_ntree_limit
In [111]:
df_raw.drop('dummy_Y', axis=1, inplace=True)
df_test.drop('dummy_Y', axis=1, inplace=True)
In [52]:
cols = np.unique(df_raw.columns)
In [115]:
%%time
preds_xgb, model, params, num_rounds = runXGB(df_raw, target, df_test)
In [123]:
xgb.plot_importance(model,max_num_features=15);
In [125]:
df_raw.to_csv(f'{PATH}\\AV_Stud\\xgb_train_cleaned.csv', index=False)
df_test.to_csv(f'{PATH}\\AV_Stud\\xgb_test_cleaned.csv', index=False)
In [3]:
df_raw = pd.read_csv(f'{PATH}\\AV_Stud\\xgb_train_cleaned.csv')
df_test = pd.read_csv(f'{PATH}\\AV_Stud\\xgb_test_cleaned.csv')
In [5]:
df_raw.drop('trainee_id', axis=1, inplace=True)
df_test.drop('trainee_id', axis=1, inplace=True)
df_raw.drop('test_id', axis=1, inplace=True)
df_test.drop('test_id', axis=1, inplace=True)
In [9]:
# same style of age binning; the bin edges overlap at 30
df_raw['is_age_20_30'] = ((df_raw['age'] >= 20.) & (df_raw['age'] <= 30.)).astype(np.float64)
df_raw['is_age_30_40'] = ((df_raw['age'] >= 30.) & (df_raw['age'] <= 40.)).astype(np.float64)
df_test['is_age_20_30'] = ((df_test['age'] >= 20.) & (df_test['age'] <= 30.)).astype(np.float64)
df_test['is_age_30_40'] = ((df_test['age'] >= 30.) & (df_test['age'] <= 40.)).astype(np.float64)
In [25]:
import mlcrate
In [26]:
params = {}
params['booster'] = 'gbtree'
#params['updater'] = 'coord_descent'
params["objective"] = "binary:logistic"
params['eval_metric'] = 'auc'
params["eta"] = 0.05 #0.03
params["subsample"] = .85
params["silent"] = 0
params["max_depth"] = 11
params["seed"] = 1
params["max_delta_step"] = 4
params['scale_pos_weight'] = 0.4380049934141978
params["gamma"] = 0.6 #.5 #.1 #.2
params['colsample_bytree'] = 0.9
params['nrounds'] = 2500 #3600 #2000 #4000
In [31]:
model, p_train, p_test = mlcrate.xgb.train_kfold(params, df_raw, target, df_test, folds=7, stratify=target)
In [33]:
p_train
Out[33]:
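In [ ]:
# Assuming p_train holds the out-of-fold predictions from train_kfold (one probability
# per training row), a quick sanity check is the OOF AUC against the true labels; this
# is a sketch, not part of the original run.
print('OOF AUC:', roc_auc_score(target, p_train))
# p_test can then feed the submission helper defined earlier:
# submit = make_submission(p_test)
# submit.to_csv(f'{PATH}\\AV_Stud\\<your_filename>.csv', index=False)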