This notebook demonstrates how to use the pre-built and cached IRIS Classifier models and analysis stored in the Redis cache named "CACHE" to make new predictions.
I built this notebook from the predictor examples:
https://github.com/jay-johnson/sci-pype/tree/master/bins/ml/predictors
In [1]:
    
# Setup the Sci-pype environment
import sys, os
# Only redis is needed for this notebook:
os.environ["ENV_DEPLOYMENT_TYPE"] = "JustRedis"
# Load the Sci-pype PyCore as a named-object called "core" and environment variables
from src.common.load_ipython_env import *
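Because this notebook only works when the IRIS models have already been built and cached, it can be worth confirming the cache entry exists before going further. The sketch below is an optional sanity check outside of Sci-pype and rests on a few assumptions: a local Redis on the default port, the redis-py client being installed, and the key layout implied later in this notebook (the Cache block in In [10] builds its RLoc from the lowercase ds_name, while the caching cell in In [16] uses the upper-cased tracking name, so both casings are checked).

import redis

# Hypothetical sanity check: confirm the cached IRIS models are in Redis.
# Host/port/db are assumptions for a default local Redis deployment.
rc = redis.StrictRedis(host="localhost", port=6379, db=0)

# Check both key casings used in this notebook:
for key in ["_MODELS_iris_classifier_LATEST", "_MODELS_IRIS_CLASSIFIER_LATEST"]:
    print(key + ": " + ("found" if rc.exists(key) else "not found"))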
    
a) What column (Target Column) do you want to analyze?
In [2]:
    
target_column_name = "ResultTargetValue"
    
b) What are the possible values in that column? (Target Column Values)
In [3]:
    
target_column_values = [ "Iris-setosa", "Iris-versicolor", "Iris-virginica" ]
    
c) What columns can the algorithms use for training and learning? (Feature Columns)
In [4]:
    
feature_column_names = [ "SepalLength", "SepalWidth", "PetalLength", "PetalWidth", "ResultTargetValue" ]
    
d) Is there a column that's a non-numeric representation of the values in the Target Column? (Label Column)
In [5]:
    
label_column_name = "ResultLabel"
    
e) Are there any columns you want to ignore during training? (Ignored Features: usually any non-numeric columns should be ignored)
In [6]:
    
ignore_features = [ # Prune non-int/float columns as needed: 
                    label_column_name
                ]
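If you want to confirm programmatically which columns are non-numeric, pandas can list them once the dataset has been loaded (ds_df is created in the validation cell further below). A minimal sketch:

# List the columns pandas does not treat as numeric; these are the usual
# candidates for ignore_features (for this dataset, likely just ResultLabel).
non_numeric_columns = ds_df.select_dtypes(exclude=["number"]).columns.tolist()
print(non_numeric_columns)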
    
f) Select a supported Classifier Algorithm
In [7]:
    
ml_algo_name = "xgb-classifier"
    
g) Use the correct dataset name
In [8]:
    
ds_name = "iris_classifier"
    
h) Assign the downloaded IRIS dataset csv file
In [9]:
    
# This will use <repo>/bins/ml/downloaders/download_iris.py to download this file before running
dataset_filename    = "iris.csv"
ml_csv              = str(os.getenv("ENV_DATA_SRC_DIR", "/opt/work/data/src")) + "/" + dataset_filename
# Check the file exists and download it if not
downloader = "/opt/work/bins/ml/downloaders/download_iris.py"
if not os.path.exists(ml_csv):
    lg("Downloading and preparing(" + str(ml_csv) + ") for analysis")
    os.system(downloader)
    lg("Done Downloading and preparing(" + str(ml_csv) + ")", 5)
# end of downloading if the csv is missing
if not os.path.exists(ml_csv):
    lg("Please use the downloader: " + str(downloader) + " to download + prepare the IRIS csv file", 0)
else:
    lg("Dataset(" + str(ml_csv) + ") is ready", 5)
# end of error checking the csv file was downloaded and built
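
Before building the request it can also help to peek at the CSV and confirm the configured columns are present. A minimal sketch, assuming the downloader produced a file with these column headers and that pandas is available as pd (it is used later in this notebook without an explicit import):

# Peek at the first rows and confirm the configured columns exist.
iris_df = pd.read_csv(ml_csv)
print(iris_df.head())
missing = [c for c in feature_column_names + [label_column_name] if c not in iris_df.columns]
if len(missing) > 0:
    print("Missing expected columns: " + str(missing))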
    
    
i) Build the Machine Learning API Request dictionary
In [10]:
    
ml_type     = "Predict with Filter"
ml_request  = {
                "MLType"    : ml_type,
                "MLAlgo"    : {
                    "Name"      : ml_algo_name,
                    "Version"   : 1,
                    "Meta"      : {
                        "UnitsAhead"    : 0,
                        "DatasetName"   : ds_name,
                        "FilterMask"    : None,
                        "Source"        : {
                            "CSVFile"       : ml_csv,
                            "S3File"        : "",        # <Bucket Name>:<Key>
                            "RedisKey"      : ""         # <App Name>:<Key>
                        },
                    },
                    "Steps"     : {
                        "Train"     :{              # these are specific for building an XGB Classifier
                            "LearningRate"          : 0.1,
                            "NumEstimators"         : 1000,
                            "Objective"             : "reg:linear",
                            "MaxDepth"              : 6,
                            "MaxDeltaStep"          : 0,
                            "MinChildWeight"        : 1,
                            "Gamma"                 : 0,
                            "SubSample"             : 0.8,
                            "ColSampleByTree"       : 0.8,
                            "ColSampleByLevel"      : 1.0,
                            "RegAlpha"              : 0,
                            "RegLambda"             : 1,
                            "BaseScore"             : 0.5,
                            "NumThreads"            : -1, # infinite = -1
                            "ScaledPositionWeight"  : 1,
                            "Seed"                  : 27,
                            "Debug"                 : True
                        }
                    },
                    "Cache" : {     # Use the cached dataset models instead of rebuilding them:
                        "RLoc"      : "CACHE:_MODELS_" + str(ds_name) + "_LATEST",
                        "UseCaches" : True
                    }
                },
                "FeatureColumnNames": feature_column_names,
                "TargetColumnName"  : target_column_name,
                "TargetColumnValues": target_column_values,
                "IgnoreFeatures"    : ignore_features,
                "UnitsAheadSet"     : [],
                "UnitsAheadType"    : "",
                "PredictionType"    : "Predict",
                "MaxFeatures"       : 10,
                "Version"           : 1,
                "TrackingType"      : "UseTargetColAndUnits",
                "TrackingName"      : core.to_upper(ds_name),
                "TrackingID"        : "ML_" + ds_name + "_" + str(core.build_unique_key()),
                "Debug"             : False
            }
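
The keys under Steps/Train appear to mirror the standard XGBoost hyperparameters. For reference only, a rough standalone equivalent with the xgboost scikit-learn API might look like the sketch below; the exact mapping Sci-pype performs internally is an assumption, and newer xgboost releases rename nthread to n_jobs and seed to random_state.

import xgboost as xgb

# Assumed mapping of the Steps/Train block onto xgboost.XGBClassifier.
# The Train block sets Objective to "reg:linear"; a standalone classifier
# would normally use a classification objective such as "multi:softmax".
xgb_clf = xgb.XGBClassifier(
            learning_rate=0.1,
            n_estimators=1000,
            objective="multi:softmax",
            max_depth=6,
            max_delta_step=0,
            min_child_weight=1,
            gamma=0,
            subsample=0.8,
            colsample_bytree=0.8,
            colsample_bylevel=1.0,
            reg_alpha=0,
            reg_lambda=1,
            base_score=0.5,
            nthread=-1,
            scale_pos_weight=1,
            seed=27)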
    
j) Validate the dataset is ready for use
In [11]:
    
# Load the dataset
csv_res             = core.ml_load_csv_dataset(ml_request, core.get_rds(), core.get_dbs(), debug)
if csv_res["Status"] != "SUCCESS":
    lg("ERROR: Failed to Load CSV(" + str(ml_request["MLAlgo"]["Meta"]["Source"]["CSVFile"]) + ")", 0)
    sys.exit(1)
# Assign a local variable to build a sample record mask:
ds_df               = csv_res["Record"]["SourceDF"]
# Build a filter record mask for pruning bad records out before creating the train/test sets
samples_filter_mask =    (ds_df["SepalLength"] > 0.0) \
                       & (ds_df["PetalWidth"]  > 0.0)
# For patching on the fly you can use the encoder method to replace labels with target dictionary values:
#ds_df                = core.ml_encode_target_column(ds_df, "ResultLabel", "Target")
# Add the filter mask to the request for changing the train/test samples in the dataset:
ml_request["MLAlgo"]["Meta"]["SamplesFilterMask"]  = samples_filter_mask
ml_images               = []
show_pair_plot      = True
if show_pair_plot:
    lg("Samples(" + str(len(ds_df.index)) + ") in CSV(" + str(ml_request["MLAlgo"]["Meta"]["Source"]["CSVFile"]) + ")", 6)
    lg("")
    print(ds_df.describe())
    lg("")
    num_per_class   = ds_df.groupby("ResultLabel").size()
    print(num_per_class)
    lg("")
    pair_plot_req   = {
                    "Title"         : "Iris Dataset PairPlot",
                    "SourceDF"      : ds_df[samples_filter_mask],
                    "Style"         : "default",
                    "DiagKind"      : "hist", # "kde" or "hist"
                    "HueColumnName" : ml_request["TargetColumnName"],
                    "XLabel"        : "",
                    "YLabel"        : "",
                    "CompareColumns": ml_request["FeatureColumnNames"],
                    "Size"          : 3.0,
                    "ImgFile"       : str(os.getenv("ENV_DATA_SRC_DIR", "/opt/work/data/src")) + "/" + "validate_jupyter_iris_classification_pairplot.png",
                    "ShowPlot"      : True
                  }
    
    lg("Plotting Validation Pair Plot - Please wait a moment...", 6)
    core.sb_pairplot(pair_plot_req)
    if os.path.exists(pair_plot_req["ImgFile"]):
        lg("Done Plotting Valiation Pair Plot - Saved(" + str(pair_plot_req["ImgFile"]) + ")", 5)
    else:
        lg("Failed to save Validation Pair Plot(" + str(pair_plot_req["ImgFile"]) + "). Please check the ENV_DATA_SRC_DIR is writeable by this user and exposed to the docker container correctly.", 0)
# end of showing a pairplot for validation
    
    
    
    
In [12]:
    
train_results           = core.ml_train_models_for_predictions(ml_request, core.get_rds(), core.get_dbs(), debug)
if train_results["Status"] != "SUCCESS":
    lg("ERROR: Failed to Train Models for Predictions with Error(" + str(train_results["Error"]) + ") StoppedEarly(" + str(train_results["Record"]["StoppedEarly"]) + ")", 0)
else:
    lg("Models prepared for new Predictions by using the ones in the Cache", 5)
    
    
    
    
In [13]:
    
predict_row   = {
                  "SepalLength"       : 5.4,
                  "SepalWidth"        : 3.4,
                  "PetalLength"       : 1.7,
                  "PetalWidth"        : 0.2,
                  "ResultTargetValue" : 0
              }
    
In [14]:
    
algo_nodes      = train_results["Record"]["AlgoNodes"]
predict_row_df  = pd.DataFrame(predict_row, index=[0])
predict_req     = {
                    "AlgoNodes"     : algo_nodes,
                    "PredictionMask": samples_filter_mask,
                    "PredictionRow" : predict_row_df
                }
predict_results = core.ml_compile_predictions_from_models(predict_req, core.get_rds(), core.get_dbs(), debug)
if predict_results["Status"] != "SUCCESS":
    lg("ERROR: Failed to Compile Predictions from Models with Error(" + str(predict_results["Error"]) + ")", 0)
else:
    lg("Done with new Predictions", 5)
    
    
In [15]:
    
al_req           = train_results["Record"]
al_req["DSName"]            = ml_request["TrackingName"]
al_req["Version"]           = 1
al_req["FeatureColumnNames"]= ml_request["FeatureColumnNames"]
al_req["TargetColumnName"]  = ml_request["TargetColumnName"]
al_req["TargetColumnValues"]= ml_request["TargetColumnValues"]
al_req["IgnoreFeatures"]    = ml_request["IgnoreFeatures"]
al_req["PredictionType"]    = ml_request["PredictionType"]
al_req["ConfMatrices"]      = predict_results["Record"]["ConfMatrices"]
al_req["PredictionMarkers"] = predict_results["Record"]["PredictionMarkers"]
analysis_dataset = core.ml_compile_analysis_dataset(al_req, core.get_rds(), core.get_dbs(), debug)
lg("Analyzed Models(" + str(len(analysis_dataset["Models"])) + ")", 5)
    
    
In [16]:
    
lg("Caching Models", 6)
cache_req     = {
                "Name"      : "CACHE",
                "Key"       : "_MODELS_" + str(al_req["Tracking"]["TrackingName"]) + "_LATEST",
                "TrackingID": "_MD_" + str(al_req["Tracking"]["TrackingName"]),
                "Analysis"  : analysis_dataset
              }
cache_results = core.ml_cache_analysis_and_models(cache_req, core.get_rds(), core.get_dbs(), debug)
lg("Done Caching Models", 5)
    
    
a) Set common plot settings
In [17]:
    
# Set this to False if you do not want the plots displayed inline:
analysis_dataset["ShowPlot"] = True
analysis_dataset["SourceDF"] = al_req["SourceDF"]
    
b) Plot Feature Importance
In [18]:
    
lg("Plotting Feature Importance", 6)
for midx,model_node in enumerate(analysis_dataset["Models"]):
    predict_col     = model_node["Target"]
    if predict_col == "ResultTargetValue":
        plot_req    = {
                        "ImgFile"   : analysis_dataset["FeatImpImgFile"],
                        "Model"     : model_node["Model"],
                        "XLabel"    : str(predict_col),
                        "YLabel"    : "Importance Amount",
                        "Title"     : str(predict_col) + " Importance Analysis",
                        "ShowPlot"  : analysis_dataset["ShowPlot"]
                    }
        image_list  = core.sb_model_feature_importance(plot_req, debug)
        for img in image_list:
            ml_images.append(img)
# end of for all models
    
    
    
c) Show Pair Plot
In [19]:
    
lg("Plotting PairPlot", 6)
plot_req  = {
             "DSName"        : str(analysis_dataset["DSName"]),
             "Title"         : str(analysis_dataset["DSName"]) + " - Pair Plot",
             "ImgFile"       : str(analysis_dataset["PairPlotImgFile"]),
             "SourceDF"      : al_req["SourceDF"],
             "HueColumnName" : target_column_name,
             "CompareColumns": feature_column_names,
             "Markers"       : ["o", "s", "D"],
             "Width"         : 15.0,
             "Height"        : 15.0,
             "ShowPlot"      : analysis_dataset["ShowPlot"]
           }
image_list = core.sb_pairplot(plot_req, debug)
for img in image_list:
    ml_images.append(img)
    
    
    
d) Show Confusion Matrix
In [20]:
    
lg("Plotting Confusion Matrices", 6)
plot_req   = {
              "DSName"        : str(analysis_dataset["DSName"]),
              "Title"         : str(analysis_dataset["DSName"]) + " - Confusion Matrix",
              "ImgFile"       : str(analysis_dataset["CMatrixImgFile"]),
              "SourceDF"      : al_req["SourceDF"],
              "ConfMatrices"  : al_req["ConfMatrices"],
              "Width"         : 15.0,
              "Height"        : 15.0,
              "XLabel"        : "Dates",
              "YLabel"        : "Values",
              "ShowPlot"      : analysis_dataset["ShowPlot"]
            }
image_list  = core.sb_confusion_matrix(plot_req, debug)
for img in image_list:
    ml_images.append(img)
    
    
    
    
    
    
    
e) Show Scatter Plots
In [21]:
    
lg("Plotting Scatters", 6)
plot_req = {
            "DSName"            : str(analysis_dataset["DSName"]),
            "Title"             : str(analysis_dataset["DSName"]) + " - Scatter Plot",
            "ImgFile"           : str(analysis_dataset["ScatterImgFile"]),
            "SourceDF"          : analysis_dataset["SourceDF"],
            "UnitsAheadType"    : analysis_dataset["UnitsAheadType"],
            "FeatureColumnNames": analysis_dataset["FeatureColumnNames"],
            "Hue"               : label_column_name,
            "Width"             : 7.0,
            "Height"            : 7.0,
            "XLabel"            : "Dates",
            "YLabel"            : "Values",
            "ShowPlot"          : analysis_dataset["ShowPlot"]
        }
image_list = core.sb_all_scatterplots(plot_req, debug)
for img in image_list:
    ml_images.append(img)
    
    
    
    
    
    
    
f) Show Joint Plots
In [22]:
    
lg("Plotting JointPlots", 6)
plot_req = {
             "DSName"            : str(analysis_dataset["DSName"]),
             "Title"             : str(analysis_dataset["DSName"]) + " - Joint Plot",
             "ImgFile"           : str(analysis_dataset["JointPlotImgFile"]),
             "SourceDF"          : analysis_dataset["SourceDF"],
             "UnitsAheadType"    : analysis_dataset["UnitsAheadType"],
             "FeatureColumnNames": analysis_dataset["FeatureColumnNames"],
             "Hue"               : label_column_name,
             "Width"             : 15.0,
             "Height"            : 15.0,
             "XLabel"            : "Dates",
             "YLabel"            : "Values",
             "ShowPlot"          : analysis_dataset["ShowPlot"]
          }
image_list = core.sb_all_jointplots(plot_req, debug)
for img in image_list:
    ml_images.append(img)
    
lg("", 6)
lg("Analysis Complete Saved Images(" + str(len(ml_images)) + ")", 5)
lg("", 6)
    
    
    
    
    
    
    
    