In [1]:
from jupyterthemes import get_themes
from jupyterthemes.stylefx import set_nb_theme
themes = get_themes()
set_nb_theme(themes[3])
Out[1]:
In [2]:
# 1. magic for inline plot
# 2. magic to print version
# 3. magic so that the notebook will reload external python modules
# 4. magic to enable retina (high resolution) plots
# https://gist.github.com/minrk/3301035
%matplotlib inline
%load_ext watermark
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'retina'
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from joblib import dump, load
from xgboost import XGBClassifier
from sortedcontainers import SortedSet
from scipy.stats import randint, uniform
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from mlutils.transformers import Preprocessor
from utils import clean, build_xgb, write_output
%watermark -a 'Ethen' -d -t -v -p numpy,scipy,pandas,joblib,xgboost,sklearn,matplotlib,sortedcontainers
Problem description is available at https://www.kaggle.com/c/DontGetKicked
Please download the training and testing datasets provided at the link above and store them under the ../data directory (i.e. there should be a data directory one level above this notebook).
The utils.py module contains utility functions to keep the notebook uncluttered.
In [3]:
# original raw data
data_dir = os.path.join('..', 'data')
path_train = os.path.join(data_dir, 'training.csv')
data = pd.read_csv(path_train)
data.head()
Out[3]:
The next section specifies the categorical, numerical, and datetime columns, the columns that are dropped, and the rationale behind dropping them.
Columns that are dropped:
For categorical variables, use dataframe[colname].value_counts() to check the number of distinct categories; we drop columns with too many distinct categories (the number of categories is listed in parentheses). See the sketch after this list for the checks.
Columns that are dropped due to too many null values (the percentage of nulls is listed in parentheses):
Columns dropped for being redundant:
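For reference, these checks can be run directly on the raw data loaded above; a quick sketch (the column picked and the sorting are only illustrative):
# number of distinct categories for one of the high-cardinality columns
data['Model'].value_counts().size
# fraction of null values per column, from most to least null
data.isnull().mean().sort_values(ascending=False)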
In [4]:
# note that the drop_cols variable listing the dropped columns is not
# actually used downstream; it's kept in the notebook for sanity checking,
# i.e. ensuring the column counts add up to the original number of columns
drop_cols = [
'Make', 'Model', 'Trim', 'SubModel', 'Color',
'WheelTypeID', 'VNST', 'BYRNO', 'VNZIP1',
'PRIMEUNIT', 'AUCGUART', 'VehYear']
cat_cols = [
'Auction', 'Transmission', 'WheelType', 'Nationality',
'Size', 'TopThreeAmericanName', 'IsOnlineSale']
num_cols = [
'VehicleAge', 'VehOdo', 'VehBCost', 'WarrantyCost',
'MMRCurrentAuctionAveragePrice', 'MMRAcquisitionAuctionAveragePrice',
'MMRCurrentAuctionCleanPrice', 'MMRAcquisitionAuctionCleanPrice',
'MMRCurrentRetailAveragePrice', 'MMRAcquisitionRetailAveragePrice',
'MMRCurrentRetailCleanPrice', 'MMRAcquisitonRetailCleanPrice']
date_cols = ['PurchDate']
label_col = 'IsBadBuy'
ids_col = 'RefId'
# current time for computing recency feature
now = '2011-01-01 00:00:00'
The next code block executes some preprocessing steps that are specific to this problem.
In [5]:
data = clean(path_train, now, cat_cols, num_cols, date_cols, ids_col, label_col)
print('dimension:', data.shape)
data.head()
Out[5]:
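The clean function lives in utils.py and is not reproduced here. Judging only from the arguments it receives, a hypothetical sketch of what such a function might do (not the actual implementation):
def clean(filepath, now, cat_cols, num_cols, date_cols, ids_col, label_col=None):
    # hypothetical sketch: read the raw csv keeping only the retained columns,
    # parse the date columns and turn them into a recency feature measured
    # against the reference time `now`
    use_cols = cat_cols + num_cols + date_cols + [ids_col]
    if label_col is not None:
        use_cols.append(label_col)
    data = pd.read_csv(filepath, usecols=use_cols, parse_dates=date_cols)
    for col in date_cols:
        data[col] = (pd.Timestamp(now) - data[col]).dt.days
    return data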
In [6]:
# extract the target variable and perform a quick
# check of its skewness (class balance)
ids = data[ids_col].values
label = data[label_col].values
data = data.drop([ids_col, label_col], axis = 1)
print('labels distribution:', np.bincount(label) / label.size)
In [7]:
# train/validation stratified split
val_size = 0.1
test_size = 0.1
split_random_state = 1234
df_train, df_test, y_train, y_test, ids_train, ids_test = train_test_split(
data, label, ids, test_size = test_size,
random_state = split_random_state, stratify = label)
df_train, df_val, y_train, y_val, ids_train, ids_val = train_test_split(
df_train, y_train, ids_train, test_size = val_size,
random_state = split_random_state, stratify = y_train)
In [8]:
# because some numeric columns were transformed during the cleaning step,
# we re-derive the list of numeric columns after cleaning;
# use sorted set to ensure the consistency of the column order
num_cols_cleaned = list(SortedSet(df_train.columns) - SortedSet(cat_cols))
# final sanity check: visually inspect whether the numeric
# columns look roughly normally distributed
df_train[num_cols_cleaned].hist(bins = 50, figsize = (20, 15))
plt.show()
Convert the data from a pandas DataFrame into a numpy array.
In [9]:
# ideally this preprocessing step would be built into a pipeline
# together with the model, but as of now that is infeasible, see:
# https://github.com/dmlc/xgboost/issues/2039
preprocess = Preprocessor(num_cols_cleaned, cat_cols)
X_train = preprocess.fit_transform(df_train)
X_val = preprocess.transform(df_val)
X_test = preprocess.transform(df_test)
print('colnames', preprocess.colnames_)
X_train
Out[9]:
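Preprocessor comes from the author's mlutils package, which is not shown in this notebook. As a rough, hypothetical stand-in (not the mlutils API), a functionally similar transformer could be assembled from scikit-learn pieces:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

def make_preprocessor(num_cols, cat_cols):
    # hypothetical stand-in: standardize the numeric columns and
    # one-hot encode the categorical ones, producing an array suitable
    # for the model
    return ColumnTransformer([
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)])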
In [10]:
cv = 10
n_iter = 3
model_random_state = 4321
eval_set = [(X_train, y_train), (X_val, y_val)]
xgb_tuned = build_xgb(n_iter, cv, model_random_state, eval_set)
xgb_tuned.fit(X_train, y_train)
pd.DataFrame(xgb_tuned.cv_results_)
Out[10]:
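build_xgb is defined in utils.py and not shown here. Based on the imports at the top of the notebook (XGBClassifier, RandomizedSearchCV, randint, uniform), a plausible sketch of what it could look like (the hyperparameter ranges are purely illustrative):
def build_xgb(n_iter, cv, random_state, eval_set):
    # hypothetical sketch, not the utils.py implementation: randomly search
    # over a handful of xgboost hyperparameters, scoring candidates by AUC
    xgb = XGBClassifier(n_estimators=300, objective='binary:logistic')
    param_distributions = {
        'max_depth': randint(3, 10),
        'learning_rate': uniform(0.01, 0.3),   # uniform over [0.01, 0.31)
        'subsample': uniform(0.5, 0.5),        # uniform over [0.5, 1.0)
        'colsample_bytree': uniform(0.5, 0.5)}
    # the real build_xgb presumably also forwards eval_set together with an
    # early_stopping_rounds value to XGBClassifier.fit (older xgboost versions
    # accept these as fit parameters), which is what makes best_ntree_limit
    # available after the search
    return RandomizedSearchCV(
        xgb, param_distributions, n_iter=n_iter, cv=cv,
        scoring='roc_auc', random_state=random_state, verbose=1)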
In [11]:
# model checkpoint for future scoring
model_dir = os.path.join('..', 'model')
if not os.path.isdir(model_dir):
os.mkdir(model_dir)
checkpoint_preprocess = os.path.join(model_dir, 'preprocess.pkl')
checkpoint_xgb = os.path.join(model_dir, 'xgb.pkl')
In [12]:
dump(preprocess, checkpoint_preprocess)
dump(xgb_tuned, checkpoint_xgb)
Out[12]:
In [13]:
# monitor the train, validation and test AUC score
y_pred = []
xgb_best = xgb_tuned.best_estimator_
zipped = zip(
('train', 'validation', 'test'),
(X_train, X_val, X_test),
(y_train, y_val, y_test))
for name, X, y in zipped:
xgb_pred = xgb_best.predict_proba(
X, ntree_limit = xgb_best.best_ntree_limit)[:, 1]
score = round(roc_auc_score(y, xgb_pred), 2)
print('{} AUC: {}'.format(name, score))
y_pred.append(xgb_pred)
In [14]:
# output the prediction
output_dir = os.path.join('..', 'output')
if not os.path.isdir(output_dir):
os.mkdir(output_dir)
ids = np.hstack((ids_train, ids_val, ids_test))
y_pred = np.hstack(y_pred)
# this prediction table can be written to a .csv file or uploaded back to a database
output = pd.DataFrame({
ids_col: ids,
label_col: y_pred
}, columns = [ids_col, label_col])
output.head()
Out[14]:
In [15]:
# output to .csv file
output_path = os.path.join(output_dir, 'prediction.csv')
write_output(ids, ids_col, y_pred, label_col, output_path)
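write_output is another helper in utils.py that is not shown. A minimal sketch, assuming it does nothing more than assemble the two columns and write them to disk:
def write_output(ids, ids_col, y_pred, label_col, output_path):
    # hypothetical sketch of the utils.py helper: pair each id with its
    # predicted probability and persist the table as a csv file
    output = pd.DataFrame({ids_col: ids, label_col: y_pred},
                          columns=[ids_col, label_col])
    output.to_csv(output_path, index=False)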
In [16]:
path_future = os.path.join(data_dir, 'test.csv')
data = clean(path_future, now, cat_cols, num_cols, date_cols, ids_col)
ids = data[ids_col].values
data = data.drop(ids_col, axis = 1)
preprocess = load(checkpoint_preprocess)
xgb_tuned = load(checkpoint_xgb)
X = preprocess.transform(data)
xgb_best = xgb_tuned.best_estimator_
xgb_pred = xgb_best.predict_proba(
X, ntree_limit = xgb_best.best_ntree_limit)[:, 1]
xgb_pred
Out[16]:
In [17]:
output_path = os.path.join(output_dir, 'prediction_future.csv')
write_output(ids, ids_col, xgb_pred, label_col, output_path)
After understanding the overall workflow, you can use the main.py script and follow the steps below to replicate it:
# assuming you're at the project's root directory
# train the model on the training set and store it
python src/main.py --train --inputfile training.csv --outputfile prediction.csv
# predict on the future dataset and write the prediction
# to a .csv file in an output directory (created one level
# above the script if it doesn't exist yet)
python src/main.py --inputfile test.csv --outputfile prediction_future.csv
As of now, most of the tunable parameters used throughout this notebook are hard-coded as constants at the top of the script rather than exposed as command line arguments.
This script reaches around 0.70 ~ 0.72 AUC on the test set. Some potential ways of improving this score include: