In [1]:
import pandas as pd
import numpy as np
import time
import machine_learning_helper
import metrics_helper
import sklearn.neighbors, sklearn.linear_model, sklearn.ensemble, sklearn.naive_bayes
from sklearn.model_selection import KFold, train_test_split, ShuffleSplit
from sklearn import model_selection
from sklearn import ensemble
from xgboost.sklearn import XGBClassifier
import scipy as sp
import xgboost as xgb
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import learning_curve
from sklearn import linear_model, datasets
import os
In [2]:
dataFolder = 'cleaned_data'
resultFolder = 'results'
filenameAdress_train_user = 'cleaned_train_user.csv'
filenameAdress_test_user = 'cleaned_test_user.csv'
filenameAdress_time_mean_user_id = 'time_mean_user_id.csv'
filenameAdress_time_total_user_id = 'time_total_user_id.csv'
filenameAdress_total_action_user_id = 'total_action_user_id.csv'
df_train_users = pd.read_csv(os.path.join(dataFolder, filenameAdress_train_user))
df_test_users = pd.read_csv(os.path.join(dataFolder, filenameAdress_test_user))
df_time_mean_user_id = pd.read_csv(os.path.join(dataFolder, filenameAdress_time_mean_user_id))
df_time_total_user_id = pd.read_csv(os.path.join(dataFolder, filenameAdress_time_total_user_id))
df_total_action_user_id = pd.read_csv(os.path.join(dataFolder, filenameAdress_total_action_user_id))
In [3]:
# Rename the action-count column and merge the per-user session features into one dataframe
df_total_action_user_id.columns = ['id','action']
df_sessions = pd.merge(df_time_mean_user_id, df_time_total_user_id, on='id', how='outer')
df_sessions = pd.merge(df_sessions, df_total_action_user_id, on='id', how='outer')
df_sessions.columns = ['id','time_mean_user','time_total_user','action']
The destination countries, currently stored as strings, are label-encoded: each country is assigned an integer.
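For reference, a minimal sketch of what buildTargetMat is assumed to do (the actual implementation lives in machine_learning_helper; the 'country_destination' column name is an assumption taken from the original Kaggle data):
from sklearn.preprocessing import LabelEncoder
# Hypothetical sketch: fit an encoder on the destination strings and keep it for later decoding
label_enc_sketch = LabelEncoder()
y_sketch = label_enc_sketch.fit_transform(df_train_users['country_destination'])
# label_enc_sketch.inverse_transform(...) maps integer predictions back to country codes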
In [4]:
y_labels, label_enc = machine_learning_helper.buildTargetMat(df_train_users)
In [5]:
X_train, X_test = machine_learning_helper.buildFeatsMat(df_train_users, df_test_users, df_sessions)
In [6]:
# Uncomment to run on a small subset for quick debugging
#X_train = X_train[200000:201000]
#y_labels = y_labels[200000:201000]
To save memory, the training matrix is converted to a sparse format.
In [7]:
X_train_sparse = sp.sparse.csr_matrix(X_train.values)
In [8]:
cv = model_selection.KFold(n_splits=5, random_state=None, shuffle=True)
Several models are tried and their hyperparameters are optimized through cross-validation. The cross-validation runs on 12 processes in parallel (n_jobs=12). The evaluation metric is the NDCG. Because of the computational cost, the cross-validation loops over the hyperparameters are not nested: each hyperparameter is tuned one at a time.
Models that were tried:
- Random forest
- XGBoost
- A two-layer stacked model (logistic regression followed by XGBoost)
- A voting classifier combining the models above
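metrics_helper.ndcg_scorer is the project's own scorer and its code is not shown here; as a rough, hedged sketch only, an NDCG@5 scorer for the case where each user has exactly one true destination could look like this (all names below are hypothetical):
import numpy as np
from sklearn.metrics import make_scorer

def ndcg_at_5(y_true, y_prob, k=5):
    # Rank the classes by predicted probability and keep the top k
    top_k = np.argsort(y_prob, axis=1)[:, ::-1][:, :k]
    scores = []
    for true_label, ranked in zip(y_true, top_k):
        hit = np.where(ranked == true_label)[0]
        # With a single relevant item, NDCG = 1 / log2(rank + 2) if it appears in the top k, else 0
        scores.append(1.0 / np.log2(hit[0] + 2) if hit.size else 0.0)
    return np.mean(scores)

ndcg_scorer_sketch = make_scorer(ndcg_at_5, needs_proba=True)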
In [9]:
number_trees = [125, 300, 500, 600]
max_depth = [5, 8, 12, 16, 20]
rf_score_trees = []
rf_score_depth = []
rf_param_trees = []
rf_param_depth = []
# Loop over the hyperparameter number_trees
for number_trees_idx, number_trees_value in enumerate(number_trees):
    print('number_trees_idx: ', number_trees_idx+1, '/', len(number_trees), ', value: ', number_trees_value)
    # Random forest
    rand_forest_model = ensemble.RandomForestClassifier(n_estimators=number_trees_value, max_depth=14)
    # Scores
    scores = model_selection.cross_val_score(rand_forest_model, X_train_sparse, y_labels, cv=cv, verbose=10, n_jobs=12, scoring=metrics_helper.ndcg_scorer)
    rf_score_trees.append(scores.mean())
    rf_param_trees.append(number_trees_value)
    print('Mean NDCG for this number_trees = ', scores.mean())
# Best number of trees from above
print()
print('best NDCG:')
print(np.max(rf_score_trees))
print('best parameter num_trees:')
idx_best = np.argmax(rf_score_trees)
best_num_trees_RF = rf_param_trees[idx_best]
print(best_num_trees_RF)
In [10]:
# Loop over the hyperparameter max_depth
for max_depth_idx, max_depth_value in enumerate(max_depth):
    print('max_depth_idx: ', max_depth_idx+1, '/', len(max_depth), ', value: ', max_depth_value)
    # Random forest
    rand_forest_model = ensemble.RandomForestClassifier(n_estimators=best_num_trees_RF, max_depth=max_depth_value)
    # Scores
    scores = model_selection.cross_val_score(rand_forest_model, X_train_sparse, y_labels, cv=cv, verbose=10, n_jobs=12, scoring=metrics_helper.ndcg_scorer)
    rf_score_depth.append(scores.mean())
    rf_param_depth.append(max_depth_value)
    print('Mean NDCG for this max_depth = ', scores.mean())
# Best max_depth from above
print()
print('best NDCG:')
print(np.max(rf_score_depth))
print('best parameter max_depth:')
idx_best = np.argmax(rf_score_depth)
best_max_depth_RF = rf_param_depth[idx_best]
print(best_max_depth_RF)
Random forest: 600 trees, max depth 16
In [11]:
best_num_trees_RF = 600
best_max_depth_RF = 16
rand_forest_model = ensemble.RandomForestClassifier(n_estimators=best_num_trees_RF, max_depth=best_max_depth_RF)
rand_forest_model.fit(X_train_sparse,y_labels)
y_pred1 = rand_forest_model.predict_proba(X_test)
id_test = df_test_users['id']
cts1,idsubmission1 = machine_learning_helper.get5likelycountries(y_pred1, id_test)
ctsSubmission1 = label_enc.inverse_transform(cts1)
# Save to csv
df_submission1 = pd.DataFrame(np.column_stack((idsubmission1, ctsSubmission1)), columns=['id', 'country'])
df_submission1.to_csv(os.path.join(resultFolder, 'submission_country_dest_RF.csv'),index=False)
In [12]:
learning_rates = [0.001, 0.01, 0.05,0.1, 0.2]
max_depth = [3, 5, 7, 9, 12]
n_estimators = [20,30,50,75,100]
gamma = [0,0.3, 0.5, 0.7, 1]
best_gamma_XCG, best_num_estimators_XCG,best_num_depth_XCG, best_learning_rate_XCG = machine_learning_helper.CrossVal_XGB(X_train_sparse, y_labels, cv,max_depth,n_estimators,learning_rates,gamma)
XGBoost - learning_rate = 0.1, gamma = 1, depth = 7, estimators = 75
XGBoost - learning_rate = 0.1, gamma = 0.7, depth = 5, estimators = 75
In [204]:
best_learning_rate_XCG = 0.1
best_num_depth_XCG = 5
best_gamma_XCG = 0.7
best_num_estimators_XCG = 75
XGB_model = XGBClassifier(max_depth=best_num_depth_XCG, learning_rate=best_learning_rate_XCG, n_estimators=best_num_estimators_XCG,objective='multi:softprob',
subsample=0.5, colsample_bytree=0.5, gamma = best_gamma_XCG)
XGB_model.fit(X_train,y_labels, eval_metric=metrics_helper.ndcg_scorer)
y_pred2 = XGB_model.predict_proba(X_test)
id_test = df_test_users['id']
cts2,idsubmission2 = machine_learning_helper.get5likelycountries(y_pred2, id_test)
ctsSubmission2 = label_enc.inverse_transform(cts2)
df_submission2 = pd.DataFrame(np.column_stack((idsubmission2, ctsSubmission2)), columns=['id', 'country'])
df_submission2.to_csv(os.path.join(resultFolder, 'submission_country_dest_XGB.csv'),index=False)
As seen previously, the classes in this dataset are unbalanced: half of the users didn't book at all. We are going to try to make good use of that information.
This model is composed of 2 layers:
- Layer 1: a binary logistic regression that predicts whether a user booked at all.
- Layer 2: a multi-class XGBoost classifier that takes the layer-1 prediction as an additional 'meta' feature and predicts the destination country.
A small mistake: for the training of the 1st layer, the date_account_created and timestamp_first_active features were not used.
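Before the actual implementation in the cells below, here is a minimal, self-contained sketch of the stacking idea on synthetic data (all variable names here are hypothetical, not the notebook's):
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from xgboost.sklearn import XGBClassifier

rng = np.random.RandomState(0)
X_demo = pd.DataFrame(rng.rand(200, 4), columns=['f1', 'f2', 'f3', 'f4'])
y_multi = rng.randint(0, 3, 200)        # multi-class target (destination country)
y_binary = (y_multi > 0).astype(int)    # binary target: booked at all or not

# Layer 1: binary classifier "did the user book?"
layer1 = LogisticRegression()
layer1.fit(X_demo, y_binary)

# Layer 2 training: as in the cells below, the true binary label is used as the meta feature
X_layer2_train = X_demo.assign(meta_layer_1=y_binary)
layer2 = XGBClassifier(objective='multi:softprob')
layer2.fit(X_layer2_train, y_multi)

# At prediction time, layer 1 supplies the meta feature for unseen users
X_layer2_test = X_demo.assign(meta_layer_1=layer1.predict(X_demo))
proba = layer2.predict_proba(X_layer2_test)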
In [9]:
# Build 1st layer training matrix, test matrix, target vector
y_labels_binary, X_train_layer1, X_test_layer1 = machine_learning_helper.buildFeatsMatBinary(df_train_users, df_test_users, df_sessions)
#y_labels_binary = y_labels_binary[0:1000]
#X_train_layer1 = X_train_layer1[0:1000]
y_labels_binary = y_labels_binary.astype(np.int8)
In [10]:
# Build 1st layer model
# Cross validation with parameter C
C = [0.1, 1.0, 10, 100, 1000]
logistic_score_C = []
logistic_param_C = []
# Loop over the hyperparameter C
for C_idx, C_value in enumerate(C):
    print('C_idx: ', C_idx+1, '/', len(C), ', value: ', C_value)
    # Logistic regression
    model = linear_model.LogisticRegression(C=C_value)
    # Scores
    scores = model_selection.cross_val_score(model, X_train_layer1, y_labels_binary, cv=cv, verbose=10, scoring='f1', n_jobs=12)
    logistic_score_C.append(scores.mean())
    logistic_param_C.append(C_value)
    print('Mean f1 for this C = ', scores.mean())
# best C from above
print()
print('best f1:')
print(np.max(logistic_score_C))
print('best parameter C:')
idx_best = np.argmax(logistic_score_C)
best_C_logistic = logistic_param_C[idx_best]
print(best_C_logistic)
# Build model with best parameter from cross validation
logreg_layer1 = linear_model.LogisticRegression(C = best_C_logistic)
logreg_layer1.fit(X_train_layer1, y_labels_binary)
score_training = logreg_layer1.predict(X_train_layer1)
# 1st layer model prediction
prediction_layer_1 = logreg_layer1.predict(X_test_layer1)
Training accuracy:
In [13]:
from sklearn import metrics
metrics.accuracy_score(y_labels_binary,score_training)
Out[13]:
In [ ]:
# Build 2nd layer training matrix, test matrix, target vector
#df_train_users.reset_index(inplace=True,drop=True)
#y_labels, label_enc = machine_learning_helper.buildTargetMat(df_train_users)
#y_labels = y_labels[0:1000]
#X_train_layer1 = X_train_layer1[0:1000]
# The layer-2 matrices are the layer-1 features plus a 'meta' feature:
# the true binary label for training, the layer-1 prediction for the test set.
X_train_layer2 = X_train_layer1
X_train_layer2['meta_layer_1'] = pd.Series(y_labels_binary).astype(np.int8)
X_test_layer2 = X_test_layer1
X_test_layer2['meta_layer_1'] = pd.Series(prediction_layer_1).astype(np.int8)
learning_rates = [0.001, 0.01, 0.05,0.1, 0.2]
max_depth = [3, 5, 7, 9, 12]
n_estimators = [20,30,50,75,100]
gamma = [0,0.3, 0.5, 0.7, 1]
cv2 = model_selection.KFold(n_splits=5, random_state=None, shuffle=True)
best_gamma_XCG, best_num_estimators_XCG,best_num_depth_XCG, best_learning_rate_XCG = machine_learning_helper.CrossVal_XGB(X_train_layer2, y_labels, cv2,max_depth,n_estimators,learning_rates,gamma)
Two-layer stacked model - learning_rate = 0.1, gamma = 0.7, depth = 5, estimators = 75
In [19]:
best_learning_rate_XCG = 0.1
best_num_depth_XCG = 5
best_gamma_XCG = 0.7
best_num_estimators_XCG = 50
XGB_model = XGBClassifier(max_depth=best_num_depth_XCG, learning_rate=best_learning_rate_XCG, n_estimators=best_num_estimators_XCG,objective='multi:softprob',
subsample=0.5, colsample_bytree=0.5, gamma = best_gamma_XCG)
XGB_model.fit(X_train_layer2,y_labels, eval_metric=metrics_helper.ndcg_scorer)
y_pred2 = XGB_model.predict_proba(X_test_layer2)
id_test = df_test_users['id']
cts2,idsubmission2 = machine_learning_helper.get5likelycountries(y_pred2, id_test)
ctsSubmission2 = label_enc.inverse_transform(cts2)
df_submission2 = pd.DataFrame(np.column_stack((idsubmission2, ctsSubmission2)), columns=['id', 'country'])
df_submission2.to_csv(os.path.join(resultFolder, 'submission_country_dest_stacking.csv'),index=False)
In [17]:
# Create the sub models
estimators = []
model1 = ensemble.RandomForestClassifier(max_depth=best_max_depth_RF, n_estimators= best_num_trees_RF)
estimators.append(('random_forest', model1))
model2 = XGBClassifier(max_depth=best_num_depth_XCG,learning_rate=best_learning_rate_XCG,n_estimators= best_num_estimators_XCG,
objective='multi:softprob',
subsample=0.5, colsample_bytree=0.5, gamma = best_gamma_XCG)
estimators.append(('xgb', model2))
model3 = XGB_model
estimators.append(('2layer', model3))
# Create Voting classifier
finalModel = ensemble.VotingClassifier(estimators,voting='soft')
# Run cross validation score
results = model_selection.cross_val_score(finalModel, X_train, y_labels, cv=cv, scoring = metrics_helper.ndcg_scorer, verbose = 10, n_jobs=12)
print("Voting Classifier Cross Validation Score found:")
print(results.mean())
Voting classifier
In [18]:
finalModel.fit(X_train,y_labels)
y_pred1 = finalModel.predict_proba(X_test)
id_test = df_test_users['id']
cts1,idsubmission1 = machine_learning_helper.get5likelycountries(y_pred1, id_test)
ctsSubmission1 = label_enc.inverse_transform(cts1)
In [19]:
df_submission1 = pd.DataFrame(np.column_stack((idsubmission1, ctsSubmission1)), columns=['id', 'country'])
df_submission1.to_csv(os.path.join(resultFolder, 'submission_country_dest_Voting.csv'),index=False)
In [20]:
model = XGBClassifier(max_depth=5, learning_rate=0.1, n_estimators=75,objective='multi:softprob',
subsample=0.5, colsample_bytree=0.5, gamma=0.7 )
model.fit(X_train,y_labels)
Out[20]:
In [21]:
machine_learning_helper.plotFeaturesImportance(model,X_train)
The figure above shows the 20 most important features of the XGBoost model tuned with the NDCG score. The age feature is by far the most important one.
The figure below shows the most important features ranked by F score (the number of times a feature is used to split the data, as reported by xgb.plot_importance).
In [22]:
fig, ax = plt.subplots(figsize=(15, 10))
xgb.plot_importance(model,height=0.7, ax=ax)
Out[22]:
In [23]:
machine_learning_helper.plotFeaturesImportance(XGB_model,X_train_layer2)
fig, ax = plt.subplots(figsize=(15, 10))
xgb.plot_importance(XGB_model,height=0.7, ax=ax)
Out[23]:
Simpler is better?