This example is a pandas-ml translation of Kaggle's "Understanding XGBoost Model on Otto Dataset". To run this Jupyter Notebook, you must first download the competition data into the otto_data directory.
https://github.com/dmlc/xgboost/blob/master/demo/kaggle-otto/understandingXGBoostModel.Rmd
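If you have the Kaggle CLI configured, one way to fetch the data is sketched below. The competition slug and the archive name are assumptions based on the Otto Group challenge page, so verify them (e.g. with kaggle competitions download -h) before running:

kaggle competitions download -c otto-group-product-classification-challenge -p otto_data
unzip otto_data/otto-group-product-classification-challenge.zip -d otto_data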
In [1]:
%matplotlib inline
import os
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 6)
In [2]:
import pandas_ml as pdml
In [3]:
train_df = pd.read_csv(os.path.join('otto_data', 'train.csv'), header=0, index_col=0)
test_df = pd.read_csv(os.path.join('otto_data', 'test.csv'), header=0, index_col=0)
# convert to pdml.ModelFrame
train_df = pdml.ModelFrame(train_df, target='target')
test_df = pdml.ModelFrame(test_df)
train_df
Out[3]:
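A ModelFrame behaves like a pandas DataFrame, but it remembers which column is the target, so estimators can later be fitted without splitting X and y by hand. A minimal sketch of the idea (the column names here are made up):

small = pdml.ModelFrame({'feat_1': [1, 2, 3], 'feat_2': [4, 5, 6],
                         'label': ['a', 'b', 'a']}, target='label')
small.data    # feature columns only
small.target  # the designated target series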
In [4]:
train_df['target'], classes = train_df['target'].factorize()
train_df
Out[4]:
In [5]:
classes
Out[5]:
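factorize() replaces the string labels with integer codes and returns the unique labels in the order they were first seen, which is what classes holds above. A small standalone example with plain pandas:

codes, uniques = pd.Series(['Class_2', 'Class_1', 'Class_2']).factorize()
# codes   -> array([0, 1, 0])
# uniques -> Index(['Class_2', 'Class_1'], dtype='object')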
In [6]:
xgc = train_df.xgboost.XGBClassifier(objective="multi:softprob")
xgc
Out[6]:
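The xgboost accessor is a thin proxy around xgboost's scikit-learn wrapper; "multi:softprob" makes the model emit one probability per class, which the multiclass log loss below requires. The direct equivalent is a sketch like the following (default hyperparameters may differ across xgboost versions):

import xgboost as xgb
xgc_plain = xgb.XGBClassifier(objective="multi:softprob")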
In [7]:
train_df.cross_validation.cross_val_score(xgc, cv=3, scoring='log_loss')
Out[7]:
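Because the ModelFrame knows its own target, cross_val_score needs only the estimator. With plain scikit-learn you would pass the features and target explicitly; note that newer scikit-learn versions renamed the 'log_loss' scorer to 'neg_log_loss'. A sketch of the equivalent call:

from sklearn.model_selection import cross_val_score
scores = cross_val_score(xgc, train_df.data, train_df.target,
                         cv=3, scoring='neg_log_loss')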
In [8]:
train_df.fit(xgc, eval_metric='mlogloss')
Out[8]:
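mlogloss is the multiclass negative log-likelihood: the mean of -log p over the probability the model assigned to each sample's true class. A toy computation with scikit-learn's log_loss:

import numpy as np
from sklearn.metrics import log_loss
y_true = [0, 1, 1]
y_prob = np.array([[0.9, 0.1], [0.2, 0.8], [0.4, 0.6]])
log_loss(y_true, y_prob)  # -(log 0.9 + log 0.8 + log 0.6) / 3 ~= 0.280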
In [9]:
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(7, 20))
train_df.xgboost.plot_importance(ax=ax)
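plot_importance draws the fitted booster's feature importances. The same numbers are available programmatically through the scikit-learn wrapper if you prefer a table to a plot (a sketch; train_df.data is the ModelFrame's feature block):

imp = pd.Series(xgc.feature_importances_, index=train_df.data.columns)
imp.sort_values(ascending=False).head(10)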
In [10]:
train_df.xgboost.to_graphviz(num_trees=0)
Out[10]:
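to_graphviz returns a graphviz object for the requested tree; num_trees=0 is the first tree, and for multiclass boosters the trees for the different classes are interleaved. Outside a notebook the graph can be rendered to a file, as in this sketch (assuming the graphviz Python package and binaries are installed):

g = train_df.xgboost.to_graphviz(num_trees=0)
g.render('otto_tree0')  # writes otto_tree0.pdf next to the notebook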