Overview

This is a simple end-to-end example of how you can use SAS Viya for analysis. The example follows these steps:

  1. Start a CAS session on an already running CAS server
  2. Load the HMEQ sample data (read from a URL) into the CAS server
  3. Explore the data
  4. Impute missing values
  5. Partition the data into training, validation, and test partitions
  6. Build a decision tree model
  7. Build a gradient boost model
  8. Build a decision forest model
  9. Build a neural network model
  10. Assess the models
  11. Build ROC and Lift charts

Documentation

Start CAS session


In [ ]:
/* Tell the SAS client which CAS controller to talk to. */
options cashost="localhost" casport=5570;

/* Open the CAS session (named CASAUTO) used by the rest of the notebook. */
cas casauto;

/* Assign librefs for the caslibs visible to this session. */
caslib _all_ assign;

In [ ]:
/* Name of the CAS table that holds the raw input data. */
%let indata = hmeq;

/* Modeling roles: the target, the nominal inputs and the interval inputs. */
/* The im_* names presumably refer to columns written by the imputation    */
/* step later in the notebook -- confirm against that step's output.       */
%let target = bad;
%let class_inputs = reason job;
%let interval_inputs = im_clage clno im_debtinc loan mortdue value im_yoj im_ninq derog im_delinq;

/* Lists derived from the roles above (must follow their definitions). */
%let class_vars = &target &class_inputs;
%let all_inputs = &interval_inputs &class_inputs;

In [ ]:
/* Fileref that reads the HMEQ sample CSV over HTTP. */
filename hmeq url 'http://support.sas.com/documentation/onlinedoc/viya/exampledatasets/hmeq.csv';

/* Libref that uses the CAS engine, so MYCAS.<name> refers to a CAS table. */
libname mycas cas;

/* Import the CSV into the CAS table MYCAS.HMEQ.                          */
/* REPLACE makes the cell re-runnable: without it PROC IMPORT refuses to  */
/* overwrite an output table that already exists from an earlier run.     */
proc import file=hmeq out=mycas.hmeq dbms=csv replace;
run;

Explore the data


In [ ]:
/* Summarize every numeric column of the input table; keep the result in CAS. */
proc mdsummary data=mycas.&indata;
  var _numeric_;
  output out=mycas.summary1;
run;

/* List the summary table that was just written. */
proc print data=mycas.summary1;
run;

In [ ]:
/* Bar chart from the summary table: one bar per _column_ value, */
/* with the bar height taken from _nmiss_.                        */
ods graphics on;
proc sgplot data=mycas.summary1;
  vbar _column_ / response=_nmiss_;
run;

Impute missing values


In [ ]:
/* Impute missing values in the input table and write the result, together */
/* with all original columns, to MYCAS.OUT.                                 */
proc varimpute data=mycas.&indata.;
  /* Statistic-based replacements. */
  input clage       / ctech = mean;
  input delinq      / ctech = median;

  /* Constant replacements. CVALUES= supplies one constant per variable     */
  /* listed on the same INPUT statement, in order: debtinc -> 35.0,         */
  /* yoj -> 7. The original list carried a stray third value (2) with no    */
  /* matching variable; ninq already gets its constant 2 on its own line.   */
  input ninq        / ctech = value cvalues=2;
  input debtinc yoj / ctech = value cvalues=35.0, 7;

  output out=mycas.out copyvars=(_all_);
run;

Partition data into Training, Validation, and Test


In [ ]:
/* Name of the CAS table that receives the partitioned data. */
%let part_data = hmeq_part;

/* Sample the imputed table into partitions and flag each row in _partind_. */
/* The modeling steps read _partind_ as 0 = train, 1 = validation, 2 = test. */
/* NOTE(review): TARGET is used here without any oversampling options; if    */
/* stratification by BAD was intended, a BY statement may be what is needed  */
/* -- confirm against the PROC PARTITION documentation.                      */
/* NOTE(review): no SEED= is given, so the split presumably differs between  */
/* runs -- confirm whether that is intended.                                 */
proc partition data=mycas.out samppct=30 samppct2=10 partind;
  target bad;
  output out=mycas.&part_data. copyvars=(_all_);
run;

Decision Tree


In [ ]:
/* Autotuned decision tree on the partitioned table.                     */
/* The fitted model goes to MYCAS.MODEL_TREESPLIT and the scored rows    */
/* (plus _partind_ and the target) to MYCAS._SCORED_TREESPLIT.           */
/* NOTE(review): the documented role keyword is VALIDATE=; VALID= is     */
/* presumably accepted as an abbreviation -- confirm.                    */
proc treesplit
    data=mycas.&part_data
    outmodel=mycas.model_treesplit;
  autotune;
  partition rolevar=_partind_(train='0' valid='1' test='2');
  target &target / level=nominal;
  input &class_inputs / level=nominal;
  input &interval_inputs / level=interval;
  output out=mycas._scored_treesplit copyvars=(_partind_ &target);
run;

Gradient Boosting


In [ ]:
/* Autotuned gradient boosting model on the partitioned table.          */
/* The fitted model goes to MYCAS.MODEL_GRADBOOST and the scored rows   */
/* (plus _partind_ and the target) to MYCAS._SCORED_GRADBOOST.          */
title "Gradient Boost";
proc gradboost
    data=mycas.&part_data
    outmodel=mycas.model_gradboost
    seed=9878
    maxdepth=8
    minleafsize=5;
  autotune;
  partition rolevar=_partind_(train='0' valid='1' test='2');
  target &target / level=nominal;
  input &class_inputs / level=nominal;
  input &interval_inputs / level=interval;
  output out=mycas._scored_gradboost copyvars=(_partind_ &target);
run;

Forest


In [ ]:
/* Autotuned forest model on the partitioned table.                  */
/* The fitted model goes to MYCAS.MODEL_FOREST and the scored rows   */
/* (plus _partind_ and the target) to MYCAS._SCORED_FOREST.          */
title "Random Forest";
proc forest
    data=mycas.&part_data
    outmodel=mycas.model_forest
    ntrees=50
    minleafsize=5;
  autotune;
  partition rolevar=_partind_(train='0' valid='1' test='2');
  target &target / level=nominal;
  input &class_inputs / level=nominal;
  input &interval_inputs / level=interval;
  output out=mycas._scored_forest copyvars=(_partind_ &target);
run;

Neural Network


In [ ]:
/* Autotuned multilayer-perceptron neural network on the partitioned table. */
/* The fitted model goes to MYCAS.MODEL_NEURAL and the scored rows          */
/* (plus _partind_ and the target) to MYCAS._SCORED_NEURAL.                 */
title "Neural Network";
proc nnet data=mycas.&part_data;
  autotune;
  architecture mlp;
  target &target / level=nominal;
  input &class_inputs / level=nominal;
  input &interval_inputs / level=interval;
  /* One hidden layer with 7 units, optimized with LBFGS for at most 300 iterations. */
  hidden 7;
  optimization algorithm=lbfgs maxiter=300;
  partition rolevar=_partind_(train='0' valid='1' test='2');
  train outmodel=mycas.model_neural seed=12345 numtries=3;
  output out=mycas._scored_neural copyvars=(_partind_ &target);
run;

Assess Models


In [ ]:
/* Assess one scored table with PROC ASSESS, separately per _partind_ value.   */
/*   prefix   = suffix of the scored table MYCAS._SCORED_<prefix>; also the    */
/*              stem of the WORK data sets that receive the results            */
/*   var_evt  = column holding the predicted probability of the event ('1')    */
/*   var_nevt = column holding the predicted probability of the non-event ('0') */
/* Reads the global macro variable TARGET. Writes WORK.<prefix>_FITSTAT,       */
/* WORK.<prefix>_ROCINFO and WORK.<prefix>_LIFTINFO.                           */
%macro assess_model(prefix=, var_evt=, var_nevt=);
  /* Route the assessment tables of the next step into WORK data sets. */
  ods output fitstat=work.&prefix._fitstat
             rocinfo=work.&prefix._rocinfo
             liftinfo=work.&prefix._liftinfo;

  proc assess data=mycas._scored_&prefix nbins=20;
    by _partind_;
    input &var_evt;
    target &target / level=nominal event='1';
    fitstat pvar=&var_nevt / pevent='0';
  run;
%mend assess_model;

/* Run the assessment once per model; each call gets its own title. */
title "Assess Forest";
%assess_model(prefix=forest, var_evt=P_BAD1, var_nevt=P_BAD0);

title "Assess Gradient Boost";
%assess_model(prefix=gradboost, var_evt=P_BAD1, var_nevt=P_BAD0);

title "Assess Decision Tree";
%assess_model(prefix=treesplit, var_evt=P_BAD1, var_nevt=P_BAD0);

title "Assess Neural";
%assess_model(prefix=neural, var_evt=P_BAD1, var_nevt=P_BAD0);


/* Stack the ROC tables of the four models and label each row with its model. */
data work.all_rocinfo;
  /* The IN= flags record which input data set supplied the current row. */
  set work.neural_rocinfo   (keep=sensitivity fpr _partind_ in=from_neural)
      work.forest_rocinfo   (keep=sensitivity fpr _partind_ in=from_forest)
      work.treesplit_rocinfo(keep=sensitivity fpr _partind_ in=from_tree)
      work.gradboost_rocinfo(keep=sensitivity fpr _partind_ in=from_boost);

  /* 16 characters is exactly the length of the longest label. */
  length model $ 16;
  if from_neural then model = 'Neural';
  else if from_forest then model = 'Forest';
  else if from_boost then model = 'GradientBoosting';
  else if from_tree then model = 'TreeSplit';
run;

/* Stack the lift tables of the four models and label each row with its model. */
data work.all_liftinfo;
  /* The IN= flags record which input data set supplied the current row. */
  set work.neural_liftinfo   (keep=depth lift cumlift _partind_ in=from_neural)
      work.forest_liftinfo   (keep=depth lift cumlift _partind_ in=from_forest)
      work.treesplit_liftinfo(keep=depth lift cumlift _partind_ in=from_tree)
      work.gradboost_liftinfo(keep=depth lift cumlift _partind_ in=from_boost);

  /* 16 characters is exactly the length of the longest label. */
  length model $ 16;
  if from_neural then model = 'Neural';
  else if from_forest then model = 'Forest';
  else if from_boost then model = 'GradientBoosting';
  else if from_tree then model = 'TreeSplit';
run;

Build ROC and Lift charts


In [ ]:
/* ROC curves, one line per model, drawn from the rows with _partind_ = 2 */
/* (the value the modeling steps use for the test partition).             */
ods graphics on;

title "ROC Curves for Test Partition (Unbiased)";
proc sgplot aspect=1 data=work.all_rocinfo(where=(_partind_=2));
  series x=fpr y=sensitivity / group=model;
  /* Diagonal reference line through the origin with slope 1. */
  lineparm x=0 y=0 slope=1 / transparency=.7;
  xaxis values=(0 to 1 by 0.25) grid offsetmin=.05 offsetmax=.05;
  yaxis values=(0 to 1 by 0.25) grid offsetmin=.05 offsetmax=.05;
run;

In [ ]:
/* Cumulative lift by depth, one line per model, drawn from the rows with */
/* _partind_ = 2 (the value the modeling steps use for the test partition). */
title "Cumulative Lift Chart for Test Partition (Unbiased)";
proc sgplot data=work.all_liftinfo(where=(_partind_=2));
  xaxis label="Percentile" grid;
  series x=depth y=CumLift / group=model markers markerattrs=(symbol=circlefilled);
run;

In [ ]:
/* Print the fit-statistics data set WORK.<prefix>_FITSTAT.       */
/*   prefix = stem of the data set name, for example forest.      */
%macro print_fitstats(prefix=);
  proc print data=work.&prefix._fitstat;
  run;
%mend print_fitstats;

/* Print the fit statistics of each model under its own title. */
title "Forest Fit Statistics";
%print_fitstats(prefix=forest);

title "Gradient Boosting Fit Statistics";
%print_fitstats(prefix=gradboost);

title "Neural Fit Statistics";
%print_fitstats(prefix=neural);

title "TreeSplit Fit Statistics";
%print_fitstats(prefix=treesplit);

In [ ]: