In [1]:
# Import necessary packages and modules
import swat, collections
import pandas as pd
import matplotlib.pyplot as plt
from pipefitter.pipeline import Pipeline
from pipefitter.transformer import Imputer
from pipefitter.estimator import DecisionTree, DecisionForest, GBTree, NeuralNetwork
%matplotlib inline
# Set the connection by specifying the hostname, port, username, and password
conn = swat.CAS(host, port, username, password)
# Data set shortcut
indata = 'hmeq'
# Create a CAS library called DMLib pointing to the defined server-side directory
DMLib = conn.addCaslib('DMLib', datasource = 'path', path = '/viyafiles')
# Do a server-side data load into CAS memory (the newly added caslib is active by default)
castbl = conn.loadTable(indata + '.sas7bdat', casOut = indata)['casTable']
# Allow later steps (such as the partition below) to overwrite this table in place
castbl.replace = True
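# (Sketch) If hmeq.sas7bdat lived on the client machine rather than the server,
# SWAT's upload_file would be an alternative; the local path below is a placeholder:
# castbl = conn.upload_file('/local/path/hmeq.sas7bdat', casout = dict(name = indata, replace = True))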
In [2]:
# Assign the variable name df to the new CASTable object
df = conn.CASTable(indata)
# Perform the head method to return the first 5 rows
df.head()
Out[2]:
In [3]:
# Create new columns to help with analysis
df['MORTPAID'] = df['VALUE'] - df['MORTDUE']
df.head()
Out[3]:
In [4]:
# Get the variable types
df.dtypes
Out[4]:
In [5]:
# Get summary statistics using the describe method, then switch rows and columns
summary = df.describe(include = 'all').transpose()
summary
Out[5]:
In [6]:
# Get the distribution of all numeric variables
df.hist(figsize = (15, 10));
In [7]:
# Create percent missing column for plotting
summary['pctmiss'] = (len(castbl) - summary['count'])/len(castbl)
# Make a bar graph using pandas/matplotlib functionality
summary.query('pctmiss > 0')['pctmiss'].plot(kind = 'bar', title = 'Pct Missing Values', figsize = (10, 6), color = 'c');
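# (Sketch) SWAT's CASTable also exposes a pandas-style nmiss method, so the same
# missing-value rates could be computed directly on the in-memory table:
# (castbl.nmiss() / len(castbl)).plot(kind = 'bar')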
In [8]:
# Load the sampling actionset
conn.loadactionset('sampling')
# Do a simple random sample with a 70/30 split
df.srs(samppct = 70, partind = True, output = dict(casout = castbl, copyvars = 'all'))
# Verify that the partition worked using the groupby method
castbl.groupby('_PartInd_')['_PartInd_'].count()/len(castbl)
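# (Sketch) To preserve the target's event rate in both partitions, the sampling
# actionset's stratified action is an alternative; grouping by the target
# (assumed to be the first column) is shown here:
# conn.CASTable(indata, groupby = castbl.columns[0]).stratified(
#     samppct = 70, partind = True, output = dict(casout = castbl, copyvars = 'all'))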
Out[8]:
In [9]:
# Impute the median value for numeric variables and the most common value for nominal variables
imp_castbl = Pipeline([Imputer(Imputer.MEDIAN), Imputer(Imputer.MODE)]).transform(castbl)
# I want my imputed dataset name to have the imp_ prefix
imp_castbl.partition(casout = dict(name = 'imp_' + indata, replace = True))
# Remove the unnecessary impute staging tables
[conn.droptable(s) for s in conn.tableinfo()['TableInfo']['Name'] if 'IMPUTE' in s]
# Make sure everything worked properly for the new imputed dataset
conn.fetch('imp_' + indata, to = 5)
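# (Sketch) A quick sanity check that the imputation left no missing values:
# conn.CASTable('imp_' + indata).nmiss()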
Out[9]:
In [10]:
conn.tableinfo()
Out[10]:
In [11]:
# Create CASTable objects for training and validation for models that can handle missing values
train = conn.CASTable(indata, where = '_partind_ = 1')
valid = conn.CASTable(indata, where = '_partind_ = 0')
# Create CASTable objects for training and validation for models that cannot handle missing values
imp_train = conn.CASTable('imp_' + indata, where = '_partind_ = 1')
imp_valid = conn.CASTable('imp_' + indata, where = '_partind_ = 0')
# Keyword argument shortcuts for model building (common inputs)
params = dict(
target = castbl.columns[0],
inputs = castbl.columns[1:-1].tolist(),
nominals = [castbl.columns[0]] + castbl.columninfo()['ColumnInfo'].query('Type == "varchar"')['Column'].tolist()
)
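# (Sketch) With the standard hmeq columns, these shortcuts should resolve to
# roughly target = 'BAD' and nominals = ['BAD', 'REASON', 'JOB'], with inputs
# holding every column between the target and the trailing _PartInd_ indicator.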
In [12]:
# Train decision tree on training dataset
dt = DecisionTree(**params).fit(train)
# Score decision tree on validation dataset
dt_score = dt.score(valid)
dt_score
Out[12]:
In [13]:
# Train random forest on training dataset
rf = DecisionForest(**params).fit(train)
# Score random forest on validation dataset
rf_score = rf.score(valid)
rf_score
Out[13]:
In [14]:
# Train gradient boosting on training dataset
gbt = GBTree(**params).fit(train)
# Score gradient boosting on validation dataset
gbt_score = gbt.score(valid)
gbt_score
Out[14]:
In [15]:
# Train neural network on training dataset
nn = NeuralNetwork(**params).fit(imp_train)
# Score neural network on validation dataset
nn_score = nn.score(imp_valid, event = 0)
nn_score
Out[15]:
In [16]:
# Aggregate the misclassification rates from the previous output
# (the last entry of each score Series is the misclassification rate)
Models = ['Decision Tree', 'Random Forest', 'Gradient Boosting', 'Neural Network']
Misclassification = [dt_score[-1], rf_score[-1], gbt_score[-1], nn_score[-1]]
mcr = pd.DataFrame({'Misclassification Rate': Misclassification}, index = Models).sort_values('Misclassification Rate')
print('\n', mcr)
# Which model is the champion?
print('\n', 'The', mcr.index[0], 'model is the champion!')
In [17]:
# What are the in-memory tables?
conn.tableinfo()
Out[17]:
In [18]:
# Give model tables appropriate shortcuts in a dictionary
saved = {}
tbl_names = conn.tableinfo()['TableInfo']['Name']
saved['dt'] = next((s for s in tbl_names if 'MODELTREE' in s), None)
saved['rf'] = next((s for s in tbl_names if 'MODELFOREST' in s), None)
saved['gbt'] = next((s for s in tbl_names if 'MODELGBT' in s), None)
saved['nn'] = next((s for s in tbl_names if 'NNMODEL' in s), None)
# Models to be assessed
models = collections.OrderedDict()
models['dt'] = 'Decision Tree'
models['rf'] = 'Random Forest'
models['gbt'] = 'Gradient Boosting'
models['nn'] = 'Neural Network'
In [19]:
# Define function that will score the models based on the model prefix
def score_model(model):
    return dict(
        table = valid,
        modelTable = saved[model],
        assessonerow = True,
        copyvars = [castbl.columns[0], castbl.columns[-1]],
        casOut = dict(name = model + '_scored', replace = True)
    )
### Decision Tree
conn.dtreeScore(**score_model('dt'))
### Random Forest
conn.forestScore(**score_model('rf'))
### Gradient Boosting
conn.gbtreeScore(**score_model('gbt'))
### Neural Network
conn.annScore(**score_model('nn'))
# See the available tables now
conn.tableinfo()
Out[19]:
In [20]:
# Model assessment function
def assess_model(model):
    return conn.assess(
        table = dict(name = model + '_scored', where = '_partind_ = 0'),
        # inputs is the predicted-probability column written by the score action
        inputs = '_' + model + '_p_ 1',
        response = castbl.columns[0],
        event = '1'
    )
# Loop through the models and append each ROC table to the roc_df dataframe
roc_df = pd.DataFrame()
for abbr, name in models.items():
    tmp = assess_model(abbr)
    tmp.ROCInfo['Model'] = name
    roc_df = pd.concat([roc_df, tmp.ROCInfo])
In [21]:
# Display stacked confusion matrix
print('\n', 'Confusion Matrix Information'.center(38, ' '))
roc_df[round(roc_df['CutOff'], 2) == 0.5][['Model', 'TP', 'FP', 'FN', 'TN']].reset_index(drop = True)
Out[21]:
In [22]:
# Plot ROC curve
plt.figure(figsize = (7, 6))
for key, grp in roc_df.groupby('Model'):
    plt.plot(grp['FPR'], grp['Sensitivity'], label = key + ' (C = %0.2f)' % grp['C'].mean())
plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc = 'lower right')
plt.title('ROC Curve (using validation data)');
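# (Sketch) The assess output also carries lift tables; assuming its LIFTInfo
# table exposes 'Depth' and 'CumLift' columns, a cumulative lift plot could be
# built the same way as the ROC curve above:
# lift_df = pd.concat([assess_model(m).LIFTInfo.assign(Model = name) for m, name in models.items()])
# for key, grp in lift_df.groupby('Model'):
#     plt.plot(grp['Depth'], grp['CumLift'], label = key)
# plt.legend(); plt.title('Cumulative Lift (using validation data)');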
In [23]:
# End the session
conn.endsession()
Out[23]: