In [ ]:
# SETUP
import sys
import skytree
import skytree.prediction
from skytree import Dataset, TransformSteps
from skytree.prediction import gbt, rdf, rdfr, gbtr
import pandas
import numpy as np
import matplotlib.pyplot as plt
%pylab inline
pylab.rcParams['figure.figsize'] = 15, 12
import time
import getpass
server = "localhost"
hostname = 'http://{0}:8080/v1'.format(server)
email = "trial@skytree.net"
password = getpass.getpass()
datadir = "/user/skytree/datasets"
# Authenticate
help(skytree.authenticate)
skytree.authenticate(email, password, hostname)
In [2]:
# List all projects
for project in skytree.list_projects():
    print project
In [3]:
# Next we will create a project
help(skytree.create_project)
project = skytree.create_project("My First SDK Project", "Income prediction dataset")
print project
In [4]:
# Next we will create a dataset in the project
help(project.create_dataset)
# Create the dataset from a file in HDFS.
# If you are NOT running on a small machine like a laptop, you may
# replace "income.data.sample" with "income.data" to run on
# the larger dataset.
IncomeData = project.create_dataset(
    url = 'hdfs://{0}/income.data.sample'.format(datadir),
    has_header = True,
    missing_value = "?").ready()
for dataset in project.list_datasets():
    print dataset.summary()
In [5]:
# Get information about the dataset: its columns, statistics, column types, etc.
print pandas.DataFrame(IncomeData.columns)
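In [ ]:
# (Optional sketch) Since IncomeData.columns converts cleanly to a DataFrame,
# plain pandas can slice the metadata further. This assumes each column entry
# carries a 'type' field; adjust the key if your SDK version names it differently.
columns_df = pandas.DataFrame(IncomeData.columns)
print columns_df['type'].value_counts()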
In [6]:
# Next, we shall do some basic data preparation
# Add a unique id column
help(Dataset.add_unique_id_column)
# Clamp a continuous value
help(Dataset.clamp_values)
# Normalize a continuous value
help(Dataset.normalize)
In [7]:
# Preparing datasets
# Example shows adding an id column
IncomeWithId = IncomeData.add_unique_id_column("id").ready()
# Apply multiple transform steps together
ts = TransformSteps()
ts.add_clamp_values('age', min_value = 21, max_value = 60)
ts.add_normalize('Unit', normalization_column = 'fnlwgt')
IncomeTransformed = IncomeWithId.apply_transform_steps(ts).ready()
IncomeTraining, IncomeTest = IncomeTransformed.split([7,3])
# Set the ID properties and let the datasets go to ready state
IncomeTraining = IncomeTraining.ready().set_as_id('id').ready()
IncomeTest = IncomeTest.ready().set_as_id('id').ready()
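In [ ]:
# (Optional sketch) Sanity-check the 7:3 split by comparing the two summaries.
# summary() is the same call used on the datasets above, though its exact
# output format may vary by SDK version.
print IncomeTraining.summary()
print IncomeTest.summary()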
In [8]:
print pandas.DataFrame(IncomeTransformed.columns)
In [9]:
# Now let's do some machine learning
help(skytree.prediction.learn)
In [10]:
# Automodel: Skytree Infinity can automatically find the best model for the given target variable.
# NOTE: This can take a while depending on the size of the dataset, the model configuration,
# and the hardware resources used/allocated. The default configuration below will build 100
# distinct models and pick the best one.
# Arguments: The dataset and what is to be predicted (the target)
model = skytree.prediction.learn(IncomeTraining, objective_column = 'yearly-income')
In [11]:
# Loop and refresh the local model's object state and print status
for i in range(0, 5):
    model = project.get_model(model.id)
    print model
    print model.status
    time.sleep(15)
# You can also print a model summary
print model.summary()
In [12]:
# Finally, wait for the model to be ready
model.ready() # blocking call
print model.summary()
In [14]:
# Use model to make predictions on the test set
results = model.test(IncomeTest)
# Loop and refresh the results object state and print status
for i in range(0, 5):
    results = project.get_result(results.id)
    print results.status
    time.sleep(15)
# Print generalization error to view model accuracy on validation/test dataset
print "\n" + results.ready().summary()
# Download and save predicted probabilities to a local file
with open("probabilities.csv", "w") as probs_file:
    for (id, probability) in results.get_probabilities():
        probs_file.write("%s,%f\n" % (id, probability))
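In [ ]:
# (Optional sketch) Inspect the saved predictions with plain pandas/matplotlib.
# This only assumes the two-column id,probability format written above.
probs = pandas.read_csv("probabilities.csv", header = None, names = ["id", "probability"])
print probs["probability"].describe()
probs["probability"].hist(bins = 50)
plt.xlabel('Predicted probability')
plt.ylabel('Row count')
plt.show()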
In [28]:
# Now let's do some tuning ourselves and build a few different models
# Start with: GBT - Automatic Grid Search
gbt_config = gbt.Config()
gbt_config.num_folds = 5
gbt_config.tree_depth = [2,3,4,5]
gbt_config.learning_rate = [0.10,0.20,0.30]
gbt_config.num_trees = 10
print "Model Configuration:\n" + gbt_config.summary() + "\n"
# Training and Auto-Tuning
model = skytree.prediction.learn(IncomeTraining, 'yearly-income', gbt_config, name = "Gbt Grid Search").ready()
print model.summary()
In [34]:
# Let's look at the tuning results of the grid search,
# printing the six best results by classification accuracy
print pandas.DataFrame(model.tuning_results).sort(['accuracy'], ascending = False).head(6)
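In [ ]:
# (Optional sketch) Pull out the single best grid point. The camelCase column
# names match the keys charted from tuning_results below; verify them against
# pandas.DataFrame(model.tuning_results).columns if they differ in your version.
tuning = pandas.DataFrame(model.tuning_results)
best = tuning.sort(['accuracy'], ascending = False).iloc[0]
print best[['numTrees', 'treeDepth', 'learningRate', 'accuracy']]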
In [20]:
# Let's analyze the tuning results of the grid search.
# Quick matplotlib chart that puts a tuning hyperparameter on the x-axis
# and an accuracy metric on the y-axis, showing the relationship between
# accuracy and hyperparameter.
def create_tuning_results_chart(model, x_axis_feature, list_column_groups, y_axis_metric):
    pdf = pandas.DataFrame(model.tuning_results)
    # One distinctly coloured line per hyperparameter combination
    colormap = plt.cm.gist_ncar
    plt.gca().set_color_cycle([colormap(i) for i in np.linspace(0, 0.9, len(pdf.groupby(list_column_groups)))])
    for group in pdf.groupby(list_column_groups):
        # Build a legend label such as "treeDepth=2.0,learningRate=0.1,"
        label = ""
        if len(list_column_groups) > 1:
            for i in range(0, len(list_column_groups)):
                label = label + list_column_groups[i] + "=" + str(round(group[0][i], 1)) + ","
        else:
            label = label + list_column_groups[0] + "=" + str(round(group[0], 1)) + ","
        plt.plot(group[1][x_axis_feature], group[1][y_axis_metric], label = label)
    plt.legend(loc='lower right', prop={'size': 12})
    plt.ylabel(y_axis_metric)
    plt.xlabel(x_axis_feature)
    plt.title(model.name)
    plt.show()
In [21]:
create_tuning_results_chart(model.ready(), "numTrees", ["treeDepth", "learningRate"], "gini")
In [24]:
# Let's try a random forest
# RDF - Automatic Grid Search
rdf_config = rdf.Config()
rdf_config.holdout_ratio = 0.2
rdf_config.num_dimensions = [1,2,3,4,5,6]
rdf_config.num_trees = 10
print "Model Configuration:\n" + rdf_config.summary() + "\n"
# Training and Auto-Tuning
model = skytree.prediction.learn(IncomeTraining, 'yearly-income', rdf_config, name = "Rdf Grid Search").ready()
print model.summary()
In [25]:
create_tuning_results_chart(model.ready(), "numTrees", ["numDimensions"], "gini")
In [35]:
# Now let's tune a GBT model with smart search
# Smart Search GBT - Classification
gbt_config = gbt.Config()
print gbt_config
gbt_config.smart_search_iterations = 30
gbt_config.smart_search = True
gbt_config.num_trees = None
gbt_config.tree_depth = None
gbt_config.learning_rate = None
gbt_config.holdout_ratio = 0.2
print "Model Configuration:\n" + gbt_config.summary() + "\n"
model = skytree.prediction.learn(IncomeTraining, 'yearly-income', gbt_config, name = "Gbt Smart Search 1").ready()
print model.summary()
In [38]:
# Let's look at the tuning results of the smart search,
# printing the six best results by Gini score
print pandas.DataFrame(model.tuning_results).sort("gini", ascending = False).head(6)
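In [ ]:
# (Optional sketch) Plot how the best Gini score evolved over the smart search.
# Assumes tuning_results rows are in iteration order and carry a 'gini' column,
# as in the sort above.
ginis = pandas.DataFrame(model.tuning_results)['gini']
plt.plot(ginis.cummax().values)
plt.xlabel('Smart search iteration')
plt.ylabel('Best gini so far')
plt.title(model.name)
plt.show()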
In [39]:
# Now let's build some regression models and compare the
# validation error amongst them
# Smart Search GBT - Regression
gbtr_config = gbtr.Config()
gbtr_config.smart_search_iterations = 30
gbtr_config.smart_search = True
gbtr_config.holdout_ratio = 0.3
gbtr_config.num_trees = None
gbtr_config.tree_depth = None
gbtr_config.learning_rate = None
print "Model Configuration:\n" + gbtr_config.summary() + "\n"
model = skytree.prediction.learn(IncomeTraining, 'age', gbtr_config, name = "Gbtr Smart Search Model").ready()
print model.summary()
results = model.test(IncomeTest, name = "Gbtr Smart Search Results").ready()
In [53]:
# Helper function to plot true targets vs. predicted target values in regression problems
def plot_true_vs_predicted_regression_plot(dataset, results, target_index, id_index):
    ids = []
    predictions = []
    for p in results.get_targets():
        ids.append(p[0])
        predictions.append(p[1])
    predicted = pandas.DataFrame(predictions, index = ids, columns = ["target"])
    data = dataset.sample(1000)
    true = pandas.DataFrame({'target': data["data"][target_index]}, index = data["data"][id_index], dtype = np.float32)
    joined = predicted.join(true, how = "inner", lsuffix = "_predicted")
    # Plot predicted versus true targets on a shared scale
    max_a = max(joined["target_predicted"].max(), joined["target"].max())
    min_a = min(joined["target_predicted"].min(), joined["target"].min())
    plt.xlim(min_a, max_a)  # clamp both axes to the observed target range for readability
    plt.ylim(min_a, max_a)
    plt.grid()
    plt.xlabel('True Target')
    plt.ylabel('Predicted Target')
    plt.title(results.name)
    plt.plot([min_a, max_a], [min_a, max_a], 'k-')  # y = x reference line
    plt.plot(joined["target_predicted"], joined["target"], 'go')
    return plt
In [56]:
print results.summary()
plt = plot_true_vs_predicted_regression_plot(IncomeTest, results, 0, 15)
plt.show()
In [42]:
# RDF - Regression
rdfr_config = rdfr.Config()
rdfr_config.num_trees = 100
rdfr_config.num_dimensions = [4,6,8,10]
rdfr_config.holdout_ratio = 0.2
print "Model Configuration:\n" + rdfr_config.summary() + "\n"
model = skytree.prediction.learn(IncomeTraining, 'age', rdfr_config, name = "Rdfr Grid Search").ready()
print model.summary()
results = model.test(IncomeTest, name = "Rdfr Grid Search Results").ready()
In [58]:
print results.summary()
plt = plot_true_vs_predicted_regression_plot(IncomeTest, results, 0, 15)
plt.show()
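In [ ]:
# (Optional sketch) Residuals for the RDF regression model, reusing the join
# logic (and its sample()/column-index assumptions) from
# plot_true_vs_predicted_regression_plot above.
ids = []
predictions = []
for p in results.get_targets():
    ids.append(p[0])
    predictions.append(p[1])
predicted = pandas.DataFrame(predictions, index = ids, columns = ["predicted"])
data = IncomeTest.sample(1000)
true = pandas.DataFrame({'target': data["data"][0]}, index = data["data"][15], dtype = np.float32)
joined = predicted.join(true, how = "inner")
residuals = joined["predicted"] - joined["target"]
print "RMSE on the sampled test rows: %f" % np.sqrt((residuals ** 2).mean())
residuals.hist(bins = 40)
plt.xlabel('Predicted minus true age')
plt.ylabel('Row count')
plt.show()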
In [ ]: