XGBoost => Model Studio: train an open-source XGBoost model on the HMEQ data, score the full table, and upload the results to CAS for use in Model Studio


In [1]:
# Load the HMEQ (home-equity loan) example data set from SAS's public
# documentation server. Later cells use column 0 ('BAD') as the binary
# target — presumably 1 = bad loan; confirm against the SAS HMEQ data
# dictionary. NOTE(review): this read requires network access and the
# URL staying live; consider caching a local copy for reproducibility.
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

df = pd.read_csv('http://support.sas.com/documentation/onlinedoc/viya/exampledatasets/hmeq.csv')
df.head()


Out[1]:
BAD LOAN MORTDUE VALUE REASON JOB YOJ DEROG DELINQ CLAGE NINQ CLNO DEBTINC
0 1 1100 25860.0 39025.0 HomeImp Other 10.5 0.0 0.0 94.366667 1.0 9.0 NaN
1 1 1300 70053.0 68400.0 HomeImp Other 7.0 0.0 2.0 121.833333 0.0 14.0 NaN
2 1 1500 13500.0 16700.0 HomeImp Other 4.0 0.0 0.0 149.466667 1.0 10.0 NaN
3 1 1500 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 0 1700 97800.0 112000.0 HomeImp Office 3.0 0.0 0.0 93.333333 0.0 14.0 NaN

Handle categorical data


In [2]:
# One-hot encode character variables: every object-dtype column is
# replaced by indicator (dummy) columns. get_dummies with `columns=`
# keeps the non-encoded columns in their original order and appends
# the dummy columns at the end — same layout as concat-then-drop.
obj_mask = df.dtypes == 'object'
nominals = df.columns[obj_mask].tolist()
df = pd.get_dummies(df, columns=nominals)
df.head()


Out[2]:
BAD LOAN MORTDUE VALUE YOJ DEROG DELINQ CLAGE NINQ CLNO DEBTINC REASON_DebtCon REASON_HomeImp JOB_Mgr JOB_Office JOB_Other JOB_ProfExe JOB_Sales JOB_Self
0 1 1100 25860.0 39025.0 10.5 0.0 0.0 94.366667 1.0 9.0 NaN 0 1 0 0 1 0 0 0
1 1 1300 70053.0 68400.0 7.0 0.0 2.0 121.833333 0.0 14.0 NaN 0 1 0 0 1 0 0 0
2 1 1500 13500.0 16700.0 4.0 0.0 0.0 149.466667 1.0 10.0 NaN 0 1 0 0 1 0 0 0
3 1 1500 NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 0 0 0 0 0 0 0
4 0 1700 97800.0 112000.0 3.0 0.0 0.0 93.333333 0.0 14.0 NaN 0 1 0 1 0 0 0 0

Split into training and validation


In [3]:
# Create partition indicator: a stratified 70/30 train/validation split
# on the target so both partitions preserve the BAD event rate.
target = 'BAD'
splitter = StratifiedShuffleSplit(n_splits = 1, test_size = 0.3, random_state = 1234)
train_rows, _ = next(splitter.split(df, df[target]))
# NOTE: isin() on row positions is valid here because df still carries
# the default RangeIndex from read_csv (positions == labels).
df['_PartInd_'] = df.index.isin(train_rows).astype(int)

# Inputs for prediction: everything except the target (first column)
# and the partition flag (last column).
X = df.iloc[:, 1:-1]

# Training inputs and labels
train = df[df['_PartInd_'] == 1]
X_train = train.iloc[:, 1:-1]
y_train = train.iloc[:, 0]

Fit and predict XGBoost model


In [4]:
# Fit XGBoost model with default hyperparameters on the training partition.
model = XGBClassifier()
model.fit(X_train, y_train)

# Find Misclassification rate on the held-out validation partition.
# The positional slices mirror the training cell: column 0 is the
# target BAD, and 1:-1 drops both BAD and the trailing _PartInd_ flag.
valid = df[df['_PartInd_'] == 0]
print('Misclassification Rate:', "%.4f" % round(1 - accuracy_score(valid.iloc[:,0], model.predict(valid.iloc[:,1:-1])), 4), '\n')

# Predict and create new columns: score EVERY row (training rows
# included) so the complete scored table can be uploaded to CAS.
tmp = model.predict_proba(X)
df['P_' + target + '1'] = tmp[:,1]
df['P_' + target + '0'] = tmp[:,0]
df.head()


Misclassification Rate: 0.0912 

Out[4]:
BAD LOAN MORTDUE VALUE YOJ DEROG DELINQ CLAGE NINQ CLNO ... REASON_HomeImp JOB_Mgr JOB_Office JOB_Other JOB_ProfExe JOB_Sales JOB_Self _PartInd_ P_BAD1 P_BAD0
0 1 1100 25860.0 39025.0 10.5 0.0 0.0 94.366667 1.0 9.0 ... 1 0 0 1 0 0 0 0 0.854603 0.145397
1 1 1300 70053.0 68400.0 7.0 0.0 2.0 121.833333 0.0 14.0 ... 1 0 0 1 0 0 0 1 0.866406 0.133594
2 1 1500 13500.0 16700.0 4.0 0.0 0.0 149.466667 1.0 10.0 ... 1 0 0 1 0 0 0 0 0.909950 0.090050
3 1 1500 NaN NaN NaN NaN NaN NaN NaN NaN ... 0 0 0 0 0 0 0 1 0.808513 0.191487
4 0 1700 97800.0 112000.0 3.0 0.0 0.0 93.333333 0.0 14.0 ... 1 0 1 0 0 0 0 1 0.359471 0.640529

5 rows × 22 columns

Upload results to CAS


In [5]:
# Upload the scored table to SAS Cloud Analytic Services (promote=True
# makes it globally visible in the 'public' caslib, e.g. for Model
# Studio). NOTE(review): swat.CAS() with no arguments relies on ambient
# configuration (environment variables / config file) for host and
# credentials — presumably set up outside this notebook; confirm.
# NOTE(review): promote fails if HMEQ_XGBOOST already exists in
# 'public'; drop the existing table first when re-running.
import swat
conn = swat.CAS()
conn.upload_frame(df, casout = dict(name = 'hmeq_xgboost', caslib = 'public', promote = True))
conn.endsession()


NOTE: Cloud Analytic Services made the uploaded file available as table HMEQ_XGBOOST in caslib public.
NOTE: The table HMEQ_XGBOOST has been created in caslib public from binary data uploaded to Cloud Analytic Services.
Out[5]:

elapsed 0.0107s · user 0.003s · sys 0.005s · mem 0.796MB