In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
In [3]:
from fastai.imports import *
from fastai.structured import *
import time
from gplearn.genetic import SymbolicTransformer
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
import gc
from scipy.cluster import hierarchy as hc
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.model_selection import StratifiedKFold
import warnings
def ignore_warn(*args, **kwargs):
    pass
warnings.warn = ignore_warn
# silence all warnings from sklearn, seaborn, etc.
In [4]:
PATH = os.getcwd();
PATH
Out[4]:
In [32]:
df_raw = pd.read_csv(f'{PATH}\\AV_Stud\\train_HK6lq50_encoded_v3_250518.csv', low_memory= False)
df_test = pd.read_csv(f'{PATH}\\AV_Stud\\test_2nAIblo_encoded_v3_250518.csv', low_memory=False)
In [5]:
df_raw.head(2)
Out[5]:
In [6]:
# Drop identifier columns and split off the target
df_raw.drop(['id', 'trainee_id'], axis=1, inplace=True)
df_test.drop(['id', 'trainee_id'], axis=1, inplace=True)
target = df_raw.is_pass
df_raw.drop('is_pass', axis=1, inplace=True)
# Impute missing values with dataset-specific constants
df_raw['age'].fillna(value=45., inplace=True)
df_raw['trainee_engagement_rating'].fillna(value=1., inplace=True)
In [34]:
df_raw.head(3)
Out[34]:
In [38]:
#############################################################################################################
########################################## TRAIN SET FE'S ##################################################
#############################################################################################################
# Categorical crosses: concatenate column pairs to capture interactions
df_raw['program_type__program_duration'] = df_raw.program_type.str.cat(df_raw.program_duration.astype(str), sep='_')
df_raw['program_type__city_tier'] = df_raw.program_type.str.cat(df_raw.city_tier.astype(str), sep='_')
df_raw['program_type__test_type'] = df_raw.program_type.str.cat(df_raw.test_type.astype(str), sep='_')
df_raw['program_type__difficulty_level'] = df_raw.program_type.str.cat(df_raw.difficulty_level.astype(str), sep='_')
df_raw['test_id__program_duration'] = df_raw.test_id.astype(str).str.cat(df_raw.program_duration.astype(str), sep='_')
df_raw['test_id__test_type'] = df_raw.test_id.astype(str).str.cat(df_raw.test_type.astype(str), sep='_')
df_raw['test_id_test_type__difficulty_level'] = df_raw.test_id__test_type.str.cat(df_raw.difficulty_level.astype(str), sep='_')
df_raw['test_type__difficulty_level'] = df_raw.test_type.str.cat(df_raw.difficulty_level.astype(str), sep='_')
df_raw['education__gender'] = df_raw.education.str.cat(df_raw.gender.astype(str), sep='_')
df_raw['education__total_programs_enrolled'] = df_raw.education.str.cat(df_raw.total_programs_enrolled.astype(str), sep='_')
df_raw['gender__city_tier'] = df_raw.gender.str.cat(df_raw.city_tier.astype(str), sep='_')
df_raw['gender__is_handicapped'] = df_raw.gender.str.cat(df_raw.is_handicapped.astype(str), sep='_')
df_raw['education__city_tier'] = df_raw.education.str.cat(df_raw.city_tier.astype(str), sep='_')
# Duration rescalings (assuming program_duration is in days: /7 gives weeks, /365 years;
# the /7 feature was originally mislabelled 'months')
df_raw['program_duration_weeks'] = df_raw['program_duration'] / 7.
df_raw['program_duration_years'] = df_raw['program_duration'] / 365.
df_raw['program_duration_avg'] = df_raw['program_duration'] / df_raw['total_programs_enrolled']
# Age threshold flags (boolean masks instead of positional .iloc writes)
df_raw['is_age_39'] = (df_raw['age'] <= 39.).astype(int)
df_raw['is_age_39_45'] = df_raw['age'].between(39., 45.).astype(int)
df_raw['is_age_45'] = (df_raw['age'] >= 45.).astype(int)
# Age buckets: young (13-30), middle (31-50), senior (51-70);
# rows with missing or out-of-range age fall into 'unknown'
df_raw['age_group'] = 'unknown'
df_raw.loc[df_raw['age'].between(13., 30.), 'age_group'] = 'young'
df_raw.loc[df_raw['age'].between(31., 50.), 'age_group'] = 'middle_aged'
df_raw.loc[df_raw['age'].between(51., 70.), 'age_group'] = 'senior_aged'
# Numeric suffix of program_id, kept as a categorical level
df_raw['program_level'] = df_raw['program_id'].str.split(pat='_', expand=True).get(1).astype(object)
#df_raw['is_pass'] = target
#df_raw.drop('is_pass', axis=1, inplace=True)
#############################################################################################################
########################################## TEST SET FE'S ###################################################
#############################################################################################################
df_test['program_type__program_duration'] = df_test.program_type.str.cat(df_test.program_duration.astype(str), sep='_')
df_test['program_type__city_tier'] = df_test.program_type.str.cat(df_test.city_tier.astype(str), sep='_')
df_test['program_type__test_type'] = df_test.program_type.str.cat(df_test.test_type.astype(str), sep='_')
df_test['program_type__difficulty_level'] = df_test.program_type.str.cat(df_test.difficulty_level.astype(str), sep='_')
df_test['test_id__program_duration'] = df_test.test_id.astype(str).str.cat(df_test.program_duration.astype(str), sep='_')
df_test['test_id__test_type'] = df_test.test_id.astype(str).str.cat(df_test.test_type.astype(str), sep='_')
df_test['test_id_test_type__difficulty_level'] = df_test.test_id__test_type.str.cat(df_test.difficulty_level.astype(str), sep='_')
df_test['test_type__difficulty_level'] = df_test.test_type.str.cat(df_test.difficulty_level.astype(str), sep='_')
df_test['education__gender'] = df_test.education.str.cat(df_test.gender.astype(str), sep='_')
df_test['education__total_programs_enrolled'] = df_test.education.str.cat(df_test.total_programs_enrolled.astype(str), sep='_')
df_test['gender__city_tier'] = df_test.gender.str.cat(df_test.city_tier.astype(str), sep='_')
df_test['gender__is_handicapped'] = df_test.gender.str.cat(df_test.is_handicapped.astype(str), sep='_')
df_test['education__city_tier'] = df_test.education.str.cat(df_test.city_tier.astype(str), sep='_')
df_test['program_duration_weeks'] = df_test['program_duration'] / 7.
df_test['program_duration_years'] = df_test['program_duration'] / 365.
df_test['program_duration_avg'] = df_test['program_duration'] / df_test['total_programs_enrolled']
df_test['is_age_39'] = (df_test['age'] <= 39.).astype(int)
df_test['is_age_39_45'] = df_test['age'].between(39., 45.).astype(int)
df_test['is_age_45'] = (df_test['age'] >= 45.).astype(int)
# Age buckets: young (13-30), middle (31-50), senior (51-70)
df_test['age_group'] = 'unknown'
df_test.loc[df_test['age'].between(13., 30.), 'age_group'] = 'young'
df_test.loc[df_test['age'].between(31., 50.), 'age_group'] = 'middle_aged'
df_test.loc[df_test['age'].between(51., 70.), 'age_group'] = 'senior_aged'
df_test['program_level'] = df_test['program_id'].str.split(pat='_', expand=True).get(1).astype(object)
########################################### Dropping a few cols ############################################
df_raw.drop('program_id', axis=1, inplace=True)
df_test.drop('program_id', axis=1, inplace=True)
In [83]:
##################### sanity check: should be an empty set #####################
set(df_raw.columns) - set(df_test.columns)
Out[83]:
In [84]:
# Seed NumPy so the shuffled KFolds below are reproducible within this cell.
np.random.seed(13)
from sklearn.model_selection import KFold
def impact_coding(data, feature, target='y'):
    '''
    Impact (target-mean) coding with nested, shuffled KFolds so each row is
    encoded only from out-of-fold target means. The encoded values and the
    category -> mean mapping are returned as two separate results. The KFolds
    use shuffling; if you want reproducibility, the cv could be moved to a
    parameter.
    '''
    n_folds = 10
    n_inner_folds = 7
    impact_coded = pd.Series()  # note: Series.append used below was removed in pandas 2.0
    oof_default_mean = data[target].mean()  # Global mean to use by default (you could further tune this)
    kf = KFold(n_splits=n_folds, shuffle=True)
    oof_mean_cv = pd.DataFrame()
    split = 0
    for infold, oof in kf.split(data[feature]):
        impact_coded_cv = pd.Series()
        kf_inner = KFold(n_splits=n_inner_folds, shuffle=True)
        inner_split = 0
        inner_oof_mean_cv = pd.DataFrame()
        oof_default_inner_mean = data.iloc[infold][target].mean()
        for infold_inner, oof_inner in kf_inner.split(data.iloc[infold]):
            # The mean to apply to the inner oof split. infold_inner indexes into
            # the in-fold subset, so index that subset rather than the full frame.
            oof_mean = data.iloc[infold].iloc[infold_inner].groupby(by=feature)[target].mean()
            impact_coded_cv = impact_coded_cv.append(data.iloc[infold].apply(
                lambda x: oof_mean[x[feature]]
                          if x[feature] in oof_mean.index
                          else oof_default_inner_mean
                , axis=1))
            # Also populate the mapping (group -> mean for all inner CV folds)
            inner_oof_mean_cv = inner_oof_mean_cv.join(pd.DataFrame(oof_mean), rsuffix=inner_split, how='outer')
            inner_oof_mean_cv.fillna(value=oof_default_inner_mean, inplace=True)
            inner_split += 1
        # Also populate the outer mapping
        oof_mean_cv = oof_mean_cv.join(pd.DataFrame(inner_oof_mean_cv), rsuffix=split, how='outer')
        oof_mean_cv.fillna(value=oof_default_mean, inplace=True)
        split += 1
        impact_coded = impact_coded.append(data.iloc[oof].apply(
            lambda x: inner_oof_mean_cv.loc[x[feature]].mean()
                      if x[feature] in inner_oof_mean_cv.index
                      else oof_default_mean
            , axis=1))
    return impact_coded, oof_mean_cv.mean(axis=1), oof_default_mean
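To make the return values concrete, here is a minimal sketch on a hypothetical toy frame (40 random rows, enough samples for both KFold levels):
rng = np.random.RandomState(0)
toy = pd.DataFrame({'color': rng.choice(list('rgb'), size=40),
                    'y': rng.randint(0, 2, size=40)})
encoded, mapping, default_mean = impact_coding(toy, 'color', target='y')
print(mapping)        # per-category out-of-fold target means, averaged over folds
print(default_mean)   # global mean, used for categories unseen in a fold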
In [85]:
features = df_raw.columns
numeric_features = []
categorical_features = []
# Partition columns by dtype: object columns are treated as categorical
for dtype, feature in zip(df_raw.dtypes, df_raw.columns):
    if dtype == object:
        categorical_features.append(feature)
    else:
        numeric_features.append(feature)
categorical_features
Out[85]:
In [86]:
%%time
# Apply the impact coding to every categorical feature and preserve the mapping,
# so the same encoding can be applied to the test set
df_raw['is_pass'] = target
impact_coding_map = {}
for f in categorical_features:
    print("Impact coding for {}".format(f))
    df_raw["impact_encoded_{}".format(f)], impact_coding_mapping, default_coding = impact_coding(df_raw, f, 'is_pass')
    impact_coding_map[f] = (impact_coding_mapping, default_coding)
    mapping, default_mean = impact_coding_map[f]
    df_test["impact_encoded_{}".format(f)] = df_test.apply(lambda x: mapping[x[f]]
                                                           if x[f] in mapping
                                                           else default_mean
                                                           , axis=1)
df_raw.drop('is_pass', axis=1, inplace=True)
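The preserved mapping can be inspected per feature, for example for gender (one of the categorical columns encoded above):
mapping, default_mean = impact_coding_map['gender']
print(mapping.head())   # category -> mean target rate, averaged across folds
print(default_mean)     # fallback for categories never seen in training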
In [89]:
df_raw['is_pass'] = target
In [90]:
df_raw.to_csv(f'{PATH}\\AV_Stud\\train_HK6lq50_encoded_v3_250518.csv',index=False)
df_test.to_csv(f'{PATH}\\AV_Stud\\test_2nAIblo_encoded_v3_250518.csv',index=False)
In [92]:
categorical_features_indices = np.where(df_raw.dtypes == 'object')[0]
df_raw.drop('is_pass', axis=1, inplace=True)
categorical_features_indices
Out[92]:
In [94]:
X_train, X_validation, y_train, y_validation = train_test_split(
    df_raw, target, train_size=0.8, random_state=1234, shuffle=True)
model = CatBoostClassifier(iterations=1000, depth=12, learning_rate=0.01,
                           loss_function='Logloss', use_best_model=True,
                           class_weights=[0.3045921227117995, 0.6954078772882005])
model.fit(X_train, y_train, cat_features=categorical_features_indices,
          eval_set=(X_validation, y_validation));
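The hard-coded class_weights look like the empirical class frequencies of is_pass. That is an assumption, not confirmed by the source, but numbers of this shape are typically derived along these lines:
freqs = target.value_counts(normalize=True).sort_index()
print([freqs[0], freqs[1]])   # candidate [weight for class 0, weight for class 1]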
In [106]:
prediction_proba = model.predict_proba(df_test)
In [22]:
gc.collect()
Out[22]:
In [7]:
def make_submission(probs):
    sample = pd.read_csv(f'{PATH}\\AV_Stud\\sample_submission_vaSxamm.csv')
    submit = sample.copy()
    submit['is_pass'] = probs
    return submit
In [74]:
#submit = make_submission(prediction_proba[:,1])   # CatBoost probabilities
#submit = make_submission(preds_xgb)               # XGBoost probabilities
submit = make_submission(new_preds)                # Keras probabilities (built in the cells at the end of this notebook)
In [75]:
submit.head(2)
Out[75]:
In [76]:
submit.to_csv(f'{PATH}\\AV_Stud\\nn2.csv', index=False)
In [33]:
target = df_raw['is_pass']
df_raw.drop(['is_pass'], axis=1, inplace=True)
'''Alternative: also drop the raw categorical columns once their encodings exist
df_raw.drop(['is_pass', 'program_id', 'program_type', 'test_type', 'difficulty_level', 'gender',
             'education', 'is_handicapped'], axis=1, inplace=True)
df_test.drop(['program_id', 'program_type', 'test_type', 'difficulty_level', 'gender',
              'education', 'is_handicapped'], axis=1, inplace=True)
'''
cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=1337)
folds = list(cv.split(df_raw, target))
x_trn, x_val, y_trn, y_val = train_test_split(df_raw, target, test_size=0.2, random_state=42, shuffle=True)
# sanity check: should be an empty set
set(df_raw.columns) - set(df_test.columns)
Out[33]:
In [7]:
def cross_val_xgb(params, X, y, folds):
    # X and y must be positionally indexable (numpy arrays; use .values/.iloc for pandas objects)
    n = 1
    num_rounds = 3000
    list_rounds = []
    list_scores = []
    for train_idx, valid_idx in folds:
        print('#################################')
        print('######### Validating for fold:', n)
        xgtrain = xgb.DMatrix(X[train_idx], label=y[train_idx])
        xgtest = xgb.DMatrix(X[valid_idx], label=y[valid_idx])
        watchlist = [(xgtest, 'test')]
        model = xgb.train(params, xgtrain, num_rounds, watchlist, early_stopping_rounds=50, verbose_eval=True)
        rounds = model.best_ntree_limit
        score = model.best_score
        print('\nFold', n, '- best round:', rounds)
        print('Fold', n, '- best score:', score)
        list_rounds.append(rounds)
        list_scores.append(score)
        n += 1
    mean_score = np.mean(list_scores)
    std_score = np.std(list_scores)
    mean_round = np.mean(list_rounds)
    std_round = np.std(list_rounds)
    print('End cross validating', n - 1, 'folds')  # n was incremented once past the final fold
    print("Cross Validation Scores are: ", np.round(list_scores, 3))
    print("Mean CrossVal score is: ", round(mean_score, 3))
    print("Std Dev CrossVal score is: ", round(std_score, 3))
    print("Cross Validation early stopping rounds are: ", np.round(list_rounds, 3))
    print("Mean early stopping round is: ", round(mean_round, 3))
    print("Std Dev early stopping round is: ", round(std_round, 3))
    return mean_round, model  # the original returned an undefined `model_cv`; return the last fold's model
In [8]:
def runXGB(train_X, train_y, test_X, test_y=None, seed_val=1, depth=10):
    params = {}
    params['booster'] = 'gbtree'
    params['objective'] = 'binary:logistic'
    params['eval_metric'] = 'auc'
    params['eta'] = 0.05
    params['subsample'] = .9
    params['silent'] = 0
    params['max_depth'] = depth
    params['seed'] = seed_val
    params['max_delta_step'] = 4
    params['scale_pos_weight'] = 0.4380049934141978  # precomputed class ratio (see the note after this cell)
    params['gamma'] = 0.3
    params['colsample_bytree'] = 0.9
    num_rounds = 2000
    plst = list(params.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)
    if test_y is not None:
        # labelled eval set available: train with early stopping on AUC
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [(xgtrain, 'train'), (xgtest, 'test')]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=50, verbose_eval=True)
    else:
        # unlabelled test set: train for the full num_rounds on all the data
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)
    # best_ntree_limit only exists after early stopping; 0 means "use all trees"
    best_limit = getattr(model, 'best_ntree_limit', 0)
    pred_test_y = model.predict(xgtest, ntree_limit=best_limit)
    return pred_test_y, model, plst, best_limit
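The hard-coded scale_pos_weight above is presumably a precomputed class ratio. The conventional recipe is n_negative / n_positive; a sketch of how such a value might be derived (an assumption about how it was obtained):
n_neg, n_pos = (target == 0).sum(), (target == 1).sum()
print(n_neg / n_pos)   # conventional scale_pos_weight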
In [34]:
cols_not_to_be_encoded = ['program_type__program_duration','program_duration','test_id',
'program_type__city_tier',
'program_type__test_type',
'program_type__difficulty_level',
'test_id__program_duration',
'test_id__test_type',
'test_id_test_type__difficulty_level',
'test_type__difficulty_level',
'education__gender',
'education__total_programs_enrolled',
'gender__city_tier',
'gender__is_handicapped',
'education__city_tier',
'program_level']
In [35]:
encoded_cols = ['program_type', 'test_type',
                'difficulty_level', 'gender', 'education',
                'total_programs_enrolled', 'is_handicapped',
                'trainee_engagement_rating', 'age_group']
# Note: dummy-encoding train and test separately can yield mismatched columns;
# the mismatches are dropped by hand a few cells below
df_raw = pd.get_dummies(df_raw, drop_first=True, prefix='one_hot', columns=encoded_cols);
df_test = pd.get_dummies(df_test, drop_first=True, prefix='one_hot', columns=encoded_cols);
In [36]:
df_raw.drop(cols_not_to_be_encoded, axis=1, inplace=True);
df_test.drop(cols_not_to_be_encoded, axis=1, inplace=True);
df_test.drop('one_hot_middle_aged', axis=1, inplace=True);
In [20]:
%%time
preds_xgb, model, params, num_rounds = runXGB(df_raw, target, df_test)
In [21]:
xgb.plot_importance(model, max_num_features=15, importance_type='weight');
In [37]:
len(df_test.columns), len(np.unique(df_test.columns))
Out[37]:
In [38]:
df_test.columns.value_counts().sort_values(ascending=False)[:2]
Out[38]:
In [39]:
len(df_raw.columns), len(np.unique(df_raw.columns))
Out[39]:
In [40]:
set(df_raw.columns) - set(df_test.columns)
Out[40]:
In [41]:
set(df_test.columns) - set(df_raw.columns)
Out[41]:
In [42]:
df_test.drop(['one_hot_Y'], axis=1, inplace=True)
In [43]:
df_raw.drop(['one_hot_Y', 'one_hot_60.0'], axis=1, inplace=True)
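Dummy-encoding train and test separately is why one_hot_middle_aged, one_hot_Y, and one_hot_60.0 had to be dropped by hand above. A more robust sketch keeps only the columns the two frames share:
# align on the intersection of columns instead of dropping mismatches manually
df_raw, df_test = df_raw.align(df_test, join='inner', axis=1)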
In [8]:
from keras.models import Sequential, model_from_json
from keras.layers import Dense, BatchNormalization, Flatten, LeakyReLU, Dropout
from keras.activations import relu, softmax
from keras import metrics as keras_metrics  # aliased to avoid shadowing sklearn's `metrics`
In [9]:
model = Sequential()
In [10]:
df_raw.shape
Out[10]:
In [64]:
def build_model():
    model = Sequential()
    model.add(Dense(256, input_dim=df_raw.shape[1], activation='relu', kernel_initializer='normal'))
    model.add(Dense(128, activation='relu', kernel_initializer='normal'))
    model.add(BatchNormalization())
    model.add(Dense(64, activation='relu', kernel_initializer='normal'))
    model.add(Dropout(0.13))
    model.add(Dense(32, activation='relu', kernel_initializer='normal'))
    model.add(Dropout(0.1))
    model.add(Dense(16, activation='relu', kernel_initializer='normal'))
    model.add(Dropout(0.1))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
    return model
In [65]:
model = build_model()
In [66]:
model.summary()
In [53]:
df_test.fillna(36., inplace=True)  # scalar fill; the axis argument was redundant
In [55]:
scaler = StandardScaler()
# Fit the scaler on the training matrix only, then reuse the same transform for test
X_train = scaler.fit_transform(df_raw)
X_test = scaler.transform(df_test)
In [68]:
hist = model.fit(X_train, target, epochs=25, batch_size=32, verbose=2, validation_split=.2)
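This fit runs a fixed 25 epochs; unlike the boosted models above there is no early stopping. If that is wanted, Keras provides it as a callback; a sketch (the patience value is an arbitrary assumption):
from keras.callbacks import EarlyStopping
# stop once validation loss has not improved for 3 consecutive epochs
early_stop = EarlyStopping(monitor='val_loss', patience=3, verbose=1)
hist = model.fit(X_train, target, epochs=25, batch_size=32, verbose=2,
                 validation_split=.2, callbacks=[early_stop])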
In [69]:
preds = model.predict_proba(X_test)
In [70]:
# serialize model to JSON
model_json = model.to_json()
with open(f'{PATH}\\AV_Stud\\model.json', "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights(f"{PATH}\\AV_Stud\\model.h5")
print("Saved model to disk")
# load JSON and re-create the model (reading back the same files written above)
json_file = open(f'{PATH}\\AV_Stud\\model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into the new model
loaded_model.load_weights(f"{PATH}\\AV_Stud\\model.h5")
print("Loaded model from disk")
# evaluate the loaded model (X, Y are placeholders: substitute held-out features/labels)
loaded_model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
score = loaded_model.evaluate(X, Y, verbose=0)
print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))
In [71]:
preds[12]
Out[71]:
In [72]:
new_preds = []
In [73]:
for i in range(df_test.shape[0]):
    new_preds.append(preds[i][0])  # unwrap each (1,)-shaped row to a scalar
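The loop above has a one-line vectorized equivalent:
new_preds = preds.ravel().tolist()   # flatten the (n, 1) probability array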
In [ ]: