In [1]:
# Import necessary packages and modules
import swat
import pandas as pd
import collections
from matplotlib import pyplot as plt
# Turn off printing of CAS messages in cell output
swat.options.cas.print_messages = False
# Render matplotlib figures inline in the notebook
%matplotlib inline
# Define directory and data file name
indata_dir = '/viyafiles'
indata = 'hmeq'
# Host, port, username, password
# NOTE(review): host, port, username and password are not defined anywhere in the
# visible notebook, so this line raises NameError on a fresh kernel. Define them
# before running (e.g. read them from the environment or prompt for them);
# do not hardcode credentials in the notebook.
s = swat.CAS(host, port, username, password)
# Load actionsets for analysis (for data prep, modeling, assessing)
actionsets = ['cardinality', 'sampling', 'fedSQL', 'decisionTree', 'neuralNet', 'svm', 'astore']
# Plain loop: loadactionset is called only for its side effect, so the results
# are not collected into a throwaway list
for actionset_name in actionsets:
    s.builtins.loadactionset(actionset_name)
# Create a CAS library called DMlib pointing to the defined directory
## Note, need to specify the srctype is path, otherwise it defaults to HDFS
DMLib = s.table.addCaslib('DMlib', datasource = 'path', path = indata_dir)
# Push the relevant table In-Memory
## Note, this is a server side data load, not being loaded from the client
inMem = s.table.loadTable(indata + '.sas7bdat', casOut = indata)
In [2]:
# castbl is the client-side handle to the in-memory CAS table named by `indata`.
# It is created with replace=True; later cells pass this handle as casOut
# (presumably so that those actions may overwrite the table - confirm).
castbl = s.CASTable(indata, replace=True)
# Preview the first 10 rows of the dataset
castbl.head(10)
Out[2]:
In [3]:
# Run the cardinality summary; the result is written to the CAS table 'data_card'
castbl.cardinality.summarize(cardinality=dict(name='data_card', replace=True))
# Bring the summary table to the client as a local data frame
df_data_card = s.CASTable('data_card').to_frame()
# Add the percentage of missing values per variable on the client side
df_data_card['_PCTMISS_'] = df_data_card['_NMISS_'].div(df_data_card['_NOBS_']).mul(100)
print('\n', 'Summary Statistics'.center(90, ' '))
# Show a rounded subset of the summary columns
df_data_card[[
    '_VARNAME_', '_TYPE_', '_PCTMISS_',
    '_MIN_', '_MAX_', '_MEAN_',
    '_STDDEV_', '_SKEWNESS_', '_KURTOSIS_',
]].round(2)
Out[3]:
In [4]:
# Plot the distribution of every variable with the table handle's hist() method;
# the trailing semicolon suppresses the text repr of the returned object
castbl.hist(figsize=(15, 10));
In [5]:
# Keep only the variables that have some missing values
df_data_miss = df_data_card[df_data_card['_PCTMISS_'] > 0]
# Percent missing indexed by variable name, ready for a bar chart
tbl_forplot = pd.Series(
    df_data_miss['_PCTMISS_'].tolist(),
    index=df_data_miss['_VARNAME_'].tolist(),
)
missing_val = tbl_forplot.plot(
    kind='bar',
    title='Percentage of Missing Values',
    color='c',
    figsize=(10, 6),
)
# Label both axes in one call; the semicolon hides the return value
missing_val.set(ylabel='Percent Missing', xlabel='Variable Names');
In [6]:
# Impute missing values: MEDIAN for continuous inputs, MODE for nominal inputs.
# Imputed columns use the 'IMP' prefix, all variables are copied, and the output
# is written to casOut=castbl, i.e. the same table handle the action runs on.
castbl.dataPreprocess.impute(
    outVarsNamePrefix='IMP',
    methodContinuous='MEDIAN',
    methodNominal='MODE',
    # every variable except the first one listed in the cardinality summary
    inputs=df_data_card['_VARNAME_'].tolist()[1:],
    copyAllVars=True,
    casOut=castbl,
)
# Print the first five rows with imputations
castbl.head()
Out[6]:
In [7]:
# Simple random sample: 70% of rows flagged via the partition indicator
# (partind=True), fixed seed, result written back to the same table handle
castbl.sampling.srs(
    samppct=70,
    partind=True,
    seed=1,
    output=dict(casOut=castbl, copyVars='ALL'),
)
# Check the split with SQL: percentage of rows per _PartInd_ value
# (the table name is substituted into the query text)
s.fedSQL.execDirect('''
SELECT
CASE WHEN _PartInd_ = 1 THEN 'Training' ELSE 'Validation' END AS Name,
_PartInd_,
100.0*COUNT(*)/(SELECT COUNT(*) FROM {tbl}) AS Pct
FROM {tbl}
GROUP BY
CASE WHEN _PartInd_ = 1 THEN 'Training' ELSE 'Validation' END,
_PartInd_
ORDER BY _PartInd_ DESC;
'''.format(tbl=indata))
Out[7]:
In [8]:
# Models to be performed: short key -> display label, in insertion order
# (later cells rely on this order when iterating by position)
models = collections.OrderedDict([
    ('dt', 'Decision Tree'),
    ('gbt', 'Gradient Boosting'),
    ('nn', 'Neural Network'),
    ('svm', 'Support Vector Machine'),
])
In [9]:
# Set variables for later use by models
## For models that can handle missing values
# The first variable in the cardinality summary is used as the target
target = df_data_card['_VARNAME_'][0]
# Character ("C") variables are the class inputs
class_inputs = df_data_card.loc[df_data_card['_TYPE_'] == 'C', '_VARNAME_'].tolist()
# Numeric ("N") variables are the interval inputs; the first one is skipped
# (presumably because it is the target - confirm against the data)
interval_inputs = df_data_card.loc[df_data_card['_TYPE_'] == 'N', '_VARNAME_'].tolist()[1:]
class_vars = [target] + class_inputs
all_inputs = interval_inputs + class_inputs
## For models that can't handle missing values
# Same lists, pointing at the 'IMP_'-prefixed columns. The loop variable is not
# called `s`, so it cannot be confused with the CAS session object.
imp_class_inputs = ['IMP_' + name for name in class_inputs]
imp_interval_inputs = ['IMP_' + name for name in interval_inputs]
imp_class_vars = [target] + imp_class_inputs
imp_all_inputs = imp_interval_inputs + imp_class_inputs
In [10]:
# Set key-word argument shortcuts (common model inputs)
# Both dicts restrict the input table to rows where _partind_ = 1
## For models that can handle missing values (decision tree, gradient boosting)
params = {
    'table': {'name': indata, 'where': '_partind_ = 1'},
    'target': target,
    'inputs': all_inputs,
    'nominals': class_vars,
}
## For models that can't have missing values (neural network, support vector machine)
imp_params = {
    'table': {'name': indata, 'where': '_partind_ = 1'},
    'target': target,
    'inputs': imp_all_inputs,
    'nominals': imp_class_vars,
}
In [11]:
# Decision tree on the shared (non-imputed) inputs; the model is stored in
# the CAS table 'dt_model'
s.decisionTree.dtreeTrain(
    **params,
    varImp=True,
    casOut=dict(name='dt_model', replace=True)
)
Out[11]:
In [12]:
# Gradient boosting on the shared (non-imputed) inputs with a fixed seed;
# the model is stored in the CAS table 'gbt_model'
s.decisionTree.gbtreeTrain(
    **params,
    seed=1,
    casOut=dict(name='gbt_model', replace=True)
)
Out[12]:
In [13]:
# Neural network on the imputed inputs with a fixed seed; the model is stored
# in the CAS table 'nn_model'
s.neuralNet.annTrain(
    **imp_params,
    seed=1,
    casOut=dict(name='nn_model', replace=True)
)
Out[13]:
In [14]:
# Support vector machine (polynomial kernel) on the imputed inputs. The target
# and partition indicator are passed as id variables, and the model state is
# saved under the name 'svm_model' for scoring in a later cell.
s.svm.svmTrain(
    **imp_params,
    seed=1,
    kernel='polynomial',
    id=[target, '_partind_'],
    savestate='svm_model'
)
Out[14]:
In [15]:
def score_model(model):
    """Build the keyword arguments shared by the CAS scoring actions.

    Parameters
    ----------
    model : str
        Short model key (e.g. 'dt'); used to derive the model table name
        '<model>_model' and the output table name '_scored_<model>'.

    Returns
    -------
    dict
        Scoring arguments: the input table `indata`, the model table, the
        columns to copy (the target and '_partind_'), and a replaceable
        output table.
    """
    return {
        'table': indata,
        'modelTable': model + '_model',
        'copyVars': [target, '_partind_'],
        'casOut': {'name': '_scored_' + model, 'replace': True},
    }
### Decision Tree
s.decisionTree.dtreeScore(**score_model('dt'))
### Gradient Boosting
s.decisionTree.gbtreeScore(**score_model('gbt'))
### Neural Network
s.neuralNet.annScore(**score_model('nn'))
### Support Vector Machine (scored from the saved state 'svm_model', not a model table)
castbl.astore.score(rstore='svm_model', out={'name': '_scored_svm', 'replace': True})
### Create standardized prediction column
# Every model except the LAST entry of `models` gets a p_<target>1 column derived
# from its _<model>_predname_ / _<model>_predp_ columns. The last entry ('svm') is
# skipped - presumably its scored output already carries that column; confirm.
for model_key in list(models)[:-1]:
    s.dataStep.runCode('''
data _scored_{model};
set _scored_{model};
if _{model}_predname_ = 1
then p_{target}1 = _{model}_predp_;
else p_{target}1 = 1 - _{model}_predp_;
run;
'''.format(model=model_key, target=target))
In [16]:
# Model assessment function
def assess_model(model):
    """Run percentile.assess on the rows of '_scored_<model>' where _partind_ = 0.

    Parameters
    ----------
    model : str
        Short model key (e.g. 'dt') identifying the scored table.

    Returns
    -------
    The result of s.percentile.assess, using the 'p_<target>1' column as the
    input, the target as the response, and '1' as the event level.
    """
    return s.percentile.assess(
        table={'name': '_scored_' + model, 'where': '_partind_ = 0'},
        inputs='p_' + target + '1',
        response=target,
        event='1',
    )
# Assess every model and stack the ROC tables into roc_df.
# The per-model frames are collected in a list and concatenated once, instead of
# growing a DataFrame inside the loop from an empty seed frame (quadratic copying,
# and concatenating empty frames is deprecated behaviour in recent pandas).
roc_frames = []
for model_key, model_label in models.items():
    roc_info = assess_model(model_key).ROCInfo
    roc_info['Model'] = model_label  # tag rows with the display name
    roc_frames.append(roc_info)
roc_df = pd.concat(roc_frames)
# Display stacked confusion matrix using Python (rows at the 0.5 cutoff only)
print('\n', 'Confusion Matrix Information'.center(42, ' '))
roc_df[round(roc_df['CutOff'], 2) == 0.5][['Model', 'TP', 'FP', 'FN', 'TN']].reset_index(drop = True)
Out[16]:
In [17]:
# Misclassification rate is the complement of accuracy
roc_df['Misclassification'] = 1 - roc_df['ACC']
print('\n', 'Misclassification Rate Comparison'.center(37, ' '))
# One row per model: the ROC row whose cutoff rounds to 0.5
miss = roc_df.loc[
    roc_df['CutOff'].round(2) == 0.5,
    ['Model', 'Misclassification'],
].reset_index(drop=True)
# Best (lowest) misclassification first
miss.sort_values('Misclassification')
Out[17]:
In [18]:
# Plot ROC curve: one line per model, labelled with its mean C statistic
plt.figure(figsize = (7, 6))
# Group by the scalar column name, not a one-element list: with a list, pandas >= 2.0
# yields 1-tuple keys and building the label string would raise TypeError.
for model_name, grp in roc_df.groupby('Model'):
    plt.plot(grp['FPR'], grp['Sensitivity'], label = '%s (C = %0.2f)' % (model_name, grp['C'].mean()))
# Diagonal reference line
plt.plot([0,1], [0,1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.title('ROC Curve (using validation data)');
In [19]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# Bring specified columns to the client (note: Python models must be run locally)
local_inputs = [target, '_PartInd_'] + imp_all_inputs
local = castbl[local_inputs].to_frame()
# Create dummy variables for class inputs (note: scikit-learn cannot have character variables)
local = pd.concat([local, pd.get_dummies(local[imp_class_inputs])], axis = 1).drop(imp_class_inputs, axis = 1)
# Split into training and validation
train = local[local['_PartInd_'] == 1]
valid = local[local['_PartInd_'] == 0]
# Split target and inputs and remove unnecessary variables (note: scikit-learn Gradient Boosting can't handle missing values)
# The partition indicator is dropped together with the target: it only encodes
# the train/validation split and must not be offered to the model as a feature.
non_feature_columns = [target, '_PartInd_']
X_train = train.drop(non_feature_columns, axis = 1)
X_valid = valid.drop(non_feature_columns, axis = 1)
y_train = train[target]
y_valid = valid[target]
# Build scikit-learn gradient boosting model using default values
gb = GradientBoostingClassifier()
gb.fit(X_train, y_train)
# Actual target vs. second column of predict_proba, kept in a pandas dataframe
# for upload to CAS in a later cell
df = pd.DataFrame(dict(actual = y_valid, pred = gb.predict_proba(X_valid)[:,1]))
# Predict and assess model
gb_y_score = gb.predict(X_valid)
gb_misclassification = 1 - accuracy_score(y_valid, gb_y_score)
gb_confusion_matrix = confusion_matrix(y_valid, gb_y_score)
print('Confusion Matrix\n', gb_confusion_matrix, '\n') # note: scikit-learn reverses True Positives and True Negatives
print('Misclassification Rate\n ', gb_misclassification)
In [20]:
# Upload the local actual-vs-predicted frame to a CAS table named 'Python'
pytbl = s.upload_frame(
    df,
    casout={'name': 'Python', 'replace': True},
)
# Verify that the Python actuals vs. predicted are in CAS
pytbl.fetch(to=5)
Out[20]:
In [21]:
# Assess the Python model using CAS
python_assess = pytbl.percentile.assess(
    inputs = 'pred',
    response = 'actual',
    event = '1',
)
python_assess.ROCInfo['Model'] = 'Gradient Boosting - Python'
# Pick the CAS gradient boosting rows and label them on a copy. Both the original
# and the already-suffixed label are accepted, so re-running this cell after roc_df
# has been overwritten below still finds the CAS rows (the previous in-place
# "+ ' - CAS'" relabelling produced '... - CAS - CAS' and an empty match on re-run).
cas_gbt_roc = roc_df[roc_df['Model'].isin(['Gradient Boosting', 'Gradient Boosting - CAS'])].copy()
cas_gbt_roc['Model'] = 'Gradient Boosting - CAS'
roc_df = pd.concat([cas_gbt_roc, python_assess.ROCInfo])
roc_df['Misclassification'] = 1 - roc_df['ACC']
print('\n', 'Misclassification Rate Comparison'.center(37, ' '))
miss = roc_df[round(roc_df['CutOff'], 2) == 0.5][['Model', 'Misclassification']].reset_index(drop = True)
miss.sort_values('Misclassification')
Out[21]:
In [22]:
# Save the in-memory 'gbt_model' table under the name 'gbt_model', replacing any
# existing copy, so the trained model can be leveraged by CAS in the future
unload = s.table.save(
    'gbt_model',
    name='gbt_model',
    replace=True,
)
In [23]:
# Analysis is finished: end the CAS session held by `s`
s.session.endsession() # end the session
Out[23]: