In [23]:
import numpy as np
import pandas as pd
import itertools
from sklearn import pipeline, model_selection
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import log_loss
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import pickle
from sklearn.metrics import auc, roc_auc_score
from sklearn.metrics import precision_recall_curve
Set our parameter grid for XGBoost. Because the dataset is imbalanced, we include max_delta_step (a value such as 1 caps each tree's weight update and makes training more conservative, which can help when classes are imbalanced).
In [21]:
params = {}
class_names = ['Dislike','Like']
params['max_depth'] = [4,6,10]
params['max_delta_step'] = (1,3)
params['n_estimators'] = [100, 250, 500]
params['subsample'] = [0.9, 1.0]
params['colsample_bytree']= [0.9, 1.0]
test_size = 0.20
random_seed = 42
label_encoder = LabelEncoder()
In [3]:
dataset = pd.read_csv('../../../data/processed_data.csv')
labels = dataset[['like']]
data = dataset.drop(['Unnamed: 0','player_id','subject_id','like'],axis=1)
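Before tuning, it is worth checking how imbalanced the 'like' target actually is (a quick sketch using the labels loaded above):
In [ ]:
# Fraction of each class in the binary target
labels['like'].value_counts(normalize=True)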
Encoding function for categorical variables
In [4]:
def encode_features(df, encoder):
    # Label-encode every categorical / object column in place
    columnsToEncode = list(df.select_dtypes(include=['category', 'object']))
    for feature in columnsToEncode:
        try:
            df[feature] = encoder.fit_transform(df[feature])
        except Exception:
            print('Error encoding ' + feature)
    return df
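Note that a single shared LabelEncoder is refit for every column, so only the last column's fitted mapping survives on the encoder object. A sketch of a hypothetical variant that keeps one fitted encoder per column (not part of the original pipeline):
In [ ]:
def encode_features_per_column(df):
    # Keep a separate fitted LabelEncoder for each categorical column
    encoders = {}
    for feature in df.select_dtypes(include=['category', 'object']).columns:
        enc = LabelEncoder()
        df[feature] = enc.fit_transform(df[feature].astype(str))
        encoders[feature] = enc
    return df, encoders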
Data Preparation:
In [5]:
def data_prep():
    # Encode categoricals, then hold out a test split
    data_label_encoded = encode_features(data, label_encoder)
    X_train, X_test, y_train, y_test = train_test_split(
        data_label_encoded, labels, test_size=test_size, random_state=random_seed)
    return X_train, X_test, y_train, y_test
In [20]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    # Normalize before plotting so the image and the annotations agree
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
In [6]:
X_train, X_test, y_train, y_test = data_prep()
In [7]:
X_train.shape
Out[7]:
Initialize the XGBClassifier model
In [8]:
xgb_model = XGBClassifier(learning_rate=0.1, min_child_weight=1,
                          objective='binary:logistic', nthread=4,
                          scale_pos_weight=1, seed=random_seed)
Run grid search
In [9]:
grid_search = model_selection.GridSearchCV(estimator=xgb_model, param_grid=params,
                                           scoring='f1_weighted', n_jobs=4, cv=5)
grid_search.fit(X_train, y_train.values.ravel())
Out[9]:
In [11]:
grid_search.best_params_, grid_search.best_score_
Out[11]:
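Optionally, the full cross-validation results can be inspected as a DataFrame (a sketch using the standard cv_results_ attribute of GridSearchCV):
In [ ]:
# One row per parameter combination, ranked by mean test score
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']].sort_values('rank_test_score').head()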
In [14]:
with open('../models/xgboost.pkl', "wb") as fp:
pickle.dump(grid_search, fp)
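The pickled grid search can be reloaded later for inference (a minimal sketch):
In [ ]:
with open('../models/xgboost.pkl', 'rb') as fp:
    loaded_model = pickle.load(fp)
# loaded_model.best_estimator_ is then ready to score new, identically encoded data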
In [18]:
y_scores = grid_search.best_estimator_.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_scores)
print(roc_auc_score(y_test, y_scores))   # ROC AUC on the held-out test set
print(auc(recall, precision))            # area under the precision-recall curve
y_predict = grid_search.best_estimator_.predict(X_test)
confusion_matrix_xgb = confusion_matrix(y_test, y_predict)
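The precision-recall pairs computed above can also be plotted directly (a short sketch):
In [ ]:
plt.figure()
plt.plot(recall, precision)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall curve')
plt.show()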
In [25]:
plt.figure()
plot_confusion_matrix(confusion_matrix_xgb, classes=class_names,
                      title='Confusion matrix, without normalization')
plt.show()
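The same matrix can be shown normalized per true class (a sketch reusing the helper above):
In [ ]:
plt.figure()
plot_confusion_matrix(confusion_matrix_xgb, classes=class_names, normalize=True,
                      title='Normalized confusion matrix')
plt.show()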
In [37]:
feat_imp = pd.Series(grid_search.best_estimator_.get_booster().get_fscore()).sort_values(ascending=False)
In [38]:
feat_imp
Out[38]:
In [40]:
feat_imp.plot(kind='bar', title='Feature Importances')
plt.ylabel('Feature Importance Score')
plt.show()
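xgboost's built-in plotting helper gives a similar view (a sketch; plot_importance accepts the fitted booster):
In [ ]:
xgb.plot_importance(grid_search.best_estimator_.get_booster(), max_num_features=20)
plt.show()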
In [41]:
grid_search.predict_proba(X_test)[:,1]
Out[41]:
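These probabilities can be turned into labels with a custom decision threshold instead of the default 0.5 (a sketch; the 0.4 cutoff is a hypothetical value that would be tuned from the precision-recall curve above):
In [ ]:
custom_threshold = 0.4  # hypothetical cutoff, not tuned here
y_predict_custom = (grid_search.predict_proba(X_test)[:, 1] >= custom_threshold).astype(int)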