In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
# Load the HMEQ (home equity loan) sample data set from SAS support.
hmeq_url = 'http://support.sas.com/documentation/onlinedoc/viya/exampledatasets/hmeq.csv'
df = pd.read_csv(hmeq_url)
df.head()
Out[1]:
In [2]:
# One-hot encode the character (object-dtype) variables.
# Dummies are appended at the end of the frame and the original
# character columns are dropped.
nominal_cols = df.select_dtypes(include='object').columns.tolist()
dummies = pd.get_dummies(df[nominal_cols])
df = pd.concat([df, dummies], axis=1).drop(nominal_cols, axis=1)
df.head()
Out[2]:
In [3]:
# Create partition indicator (70/30 stratified train/validation split)
target = 'BAD'
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=1234)
train_idx, _ = next(splitter.split(df, df[target]))
# 1 = training row, 0 = validation row. NOTE: split() returns positional
# indices, so this relies on df still having its default RangeIndex.
df['_PartInd_'] = df.index.isin(train_idx).astype(int)

# Select feature columns by NAME (everything except the target and the
# partition indicator) instead of by position (iloc[:, 1:-1]), so this
# cell does not silently break if the column order ever changes.
feature_cols = [c for c in df.columns if c not in (target, '_PartInd_')]
# Inputs for prediction
X = df[feature_cols]
# Training inputs
train = df[df['_PartInd_'] == 1]
X_train = train[feature_cols]
y_train = train[target]
In [4]:
# Fit an XGBoost classifier on the training partition.
model = XGBClassifier()
model.fit(X_train, y_train)

# Evaluate on the validation partition (_PartInd_ == 0), selecting the
# feature columns by name (same columns the model was trained on) rather
# than by position (iloc[:, 1:-1]), which assumed a fixed column order.
valid = df[df['_PartInd_'] == 0]
feature_cols = X_train.columns
# %.4f already rounds to 4 decimals, so no separate round() is needed.
misclassification = 1 - accuracy_score(valid[target], model.predict(valid[feature_cols]))
print('Misclassification Rate:', "%.4f" % misclassification, '\n')

# Score every row and store the class probabilities as new columns:
# column 1 of predict_proba is P(BAD=1), column 0 is P(BAD=0).
probs = model.predict_proba(X)
df['P_' + target + '1'] = probs[:, 1]
df['P_' + target + '0'] = probs[:, 0]
df.head()
Out[4]:
In [5]:
import swat

# Connect to CAS and promote the scored table to the 'public' caslib.
conn = swat.CAS()
try:
    conn.upload_frame(df, casout=dict(name='hmeq_xgboost', caslib='public', promote=True))
finally:
    # Always release the CAS session, even if the upload fails —
    # otherwise the server-side session leaks.
    conn.endsession()
Out[5]: