This example is provided to demonstrate some of the typical programming activities for working with Python in a SAS Viya environment to run actions in SAS Cloud Analytic Services. The actions that are used in the example require SAS Visual Data Mining and Machine Learning.
For more information, see http://support.sas.com/documentation/prod-p/vdmml/index.html.
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.core.display import display, HTML
%matplotlib inline
In [6]:
import swat

# Open the CAS session. The handle `s` is what later cells use to load
# action sets and to reference in-memory tables.
s = swat.CAS(hostname='cloud.example.com', port=5570)
In [ ]:
# Upload the Titanic CSV from its public URL; the upload result carries the
# resulting in-memory table under the casTable key.
ulresult = s.upload(
    'http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.csv'
)
titanic3 = ulresult['casTable']
# Cell output: the class of the table handle.
type(titanic3)
In [8]:
# Run the table.columnInfo action on the uploaded table; the result is shown
# as the cell output.
titanic3.table.columnInfo()
Out[8]:
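Some of the columns in the data are problematic for modeling. The next cell leaves boat and body out of the model inputs because they are proxies for survived, and it replaces ticket and cabin with a computed deck column.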
In [11]:
# Define a computed column, deck, holding the first character of cabin
# (blank when cabin is blank).
titanic3.computedVars = ['deck']  # 1
titanic3.computedVarsProgram = (
    "if cabin ne '' then deck = ksubstr(cabin,1,1); else deck = '';"
)

# Numeric columns used for the summaries and the model.
numeric = ['pclass', 'survived', 'age', 'sibsp', 'parch', 'fare']

# Character columns. boat and body are left out because they are proxies for
# survived; ticket and cabin are left out in favor of the computed deck column.
char = ['sex', 'deck', 'embarked', 'home.dest']

# NOTE: this name shadows the built-in all(). It is kept because later cells
# pass it as the column list.
all = [*numeric, *char]
In [12]:
# The numeric variable was defined earlier.
results = titanic3[numeric].groupby("survived").simple.summary()
resultColumns = ['Column', 'Min', 'Max', 'N', 'NMiss', 'Mean', 'Sum', 'Std', 'StdErr'];
display(HTML('<h3>Perished</h3>'))
display(results['ByGroup1.Summary'][resultColumns]) # 1
display(HTML('<h3>Survived</h3>'))
display(results['ByGroup2.Summary'][resultColumns])
In [13]:
# Partition the uploaded table with stratified sampling and keep a handle to
# the partitioned copy, titanic3part, for the rest of the example.
s.builtins.loadActionSet('sampling')

# The sampling.stratified action does not accept the vars parameter.
# Instead, copyVars is used to select the columns to copy to the output table.
if 'vars' in titanic3.params:
    del titanic3.vars

# Temporarily set a groupBy parameter: it is assigned inside the with block
# so that it applies to the sampling call made there.
with titanic3:
    titanic3.groupBy = {'survived'}
    titanic3.sampling.stratified(
        partInd=True,  # 1
        samppct=40,  # 2
        seed=1234,
        output={
            'casout': {'name': 'titanic3part', 'replace': True},
            'copyVars': all,
        },
    )

# The source table is not referenced again after this point.
titanic3.table.dropTable()  # 3

# Later cells train on rows where _partind_ is 0 and chart rows where it is 1.
titanic3part = s.CASTable('titanic3part')  # 4
ci = titanic3part.columnInfo()
display(ci)
In [14]:
# Summarize survived within each value of the partition indicator.
survSummary = titanic3part['survived'].groupby('_partind_').simple.summary()
resultColumns = ['Column', 'N', 'NMiss', 'Mean', 'Sum', 'Std', 'StdErr']

# Show the by-group listing first, then the summary of each by-group.
display(survSummary['ByGroupInfo'])
for resultKey in ('ByGroup1.Summary', 'ByGroup2.Summary'):
    display(survSummary[resultKey][resultColumns])
In [15]:
s.builtins.loadActionSet('decisionTree')  # 1

# Train only on the rows whose partition indicator is 0.
training = titanic3part.query('0 = _partind_')  # 2

# Fit a forest for survived and store the model in the in-memory table
# forestModel (replaced if it already exists).
trainingResults = training.forestTrain(
    target='survived',
    inputs=all,
    nominals=[*char, 'pclass', 'survived'],
    casOut=dict(name='forestModel', replace=True),
    seed=1234,
    binOrder=True,
    varImp=True,
)
display(trainingResults)
In this example, both the training data and the validation data are scored. This is done so that we can assess the effectiveness of the model for predicting whether someone survives on the Titanic.
The in-memory table, forestModel, is used as the model. The scoring output is stored in an in-memory table that is named forestScored.
In [16]:
# Score the whole partitioned table (both partitions) with the stored model.
# survived and _partind_ are copied into the output table forestScored, which
# the assessment cell reads.
forestModel = s.CASTable('forestModel')
titanic3part.forestScore(
    modelTable=forestModel,
    copyVars=['survived', '_partind_'],
    casOut=dict(name='forestScored', replace=True),
    vote='prob',
)
Out[16]:
In [17]:
s.builtins.loadActionSet('percentile')

# Assess the scored table separately for each value of the partition indicator.
forestScored = s.CASTable('forestScored')  # 1
forestScored.groupBy = '_PartInd_'  # 2

# P1 and P0 are computed columns: P1 is the predicted probability for level
# '1' and P0 is its complement, whichever level _RF_PredName_ reports.
forestScored.computedVars = ['P1', 'P0']  # 3
forestScored.computedVarsProgram = '''
if '1' eq strip(_RF_PredName_) then do;
P1 = _RF_PredP_;
P0 = 1 - _RF_PredP_;
end;
else do;
P1 = 1 - _RF_PredP_;
P0 = _RF_PredP_;
end;
'''  # 4

# Results go to forestAssess; the plotting cells also read a companion
# forestAssess_ROC table.
forestScored.percentile.assess(
    casOut=dict(name='forestAssess', replace=True),
    nbins=10,
    cutStep=0.01,
    inputs=['P1'],
    response='survived',
    event='1',
    pVar=['P0'],
    pEvent='0',
)
Out[17]:
In [18]:
# Pull the ROC rows for the partition where _partind_ is 1 to the client.
forestAssess_ROC = s.CASTable(
    'forestAssess_ROC', where='1 = _partind_'
)  # 1
out2 = forestAssess_ROC.to_frame()

# Reference diagonal: 0.0, 0.1, ..., 1.0 on both axes.
diagonal = pd.Series(range(11)) / 10

plt.figure(figsize=(8, 8))
plt.plot(out2['_FPR_'], out2['_Sensitivity_'], 'bo-', linewidth=2)
plt.plot(diagonal, diagonal, 'k--', linewidth=1)
plt.xlabel('False Positive Rate')
plt.ylabel('Correct Classification Rate')
plt.grid(True)
plt.title('ROC Curve')
plt.show()
In [19]:
# Pull the assessment rows for the partition where _partind_ is 1 to the
# client and plot lift against depth.
forestAssess = s.CASTable(
    'forestAssess', where='1 = _partind_'
)  # 1
lift = forestAssess.to_frame()

plt.figure(figsize=(8, 8))
plt.plot(lift['_Depth_'], lift['_Lift_'], 'bo-', linewidth=2)
plt.xlabel('Percentile')
plt.ylabel('Lift')
plt.grid(True)
plt.title('Lift Chart')
plt.show()
In [20]:
# Close the CAS connection opened at the start of the example.
s.close()
Copyright SAS Institute
Disclaimer: SAS may reference other websites or content or resources for use at Customer's sole discretion. SAS has no control over any websites or resources that are provided by companies or persons other than SAS. Customer acknowledges and agrees that SAS is not responsible for the availability or use of any such external sites or resources, and does not endorse any advertising, products, or other materials on or available from such websites or resources. Customer acknowledges and agrees that SAS is not liable for any loss or damage that may be incurred by Customer or its end users as a result of the availability or use of those external sites or resources, or as a result of any reliance placed by Customer or its end users on the completeness, accuracy, or existence of any advertising, products, or other materials on, or available from, such websites or resources.