In [ ]:
# SETUP
import sys, skytree, skytree.prediction
from skytree import Dataset
from skytree.prediction import gbt
import pandas, getpass
# Connection settings shared by the rest of the notebook.
server = "localhost"
email = "trial@skytree.net"
# NOTE(review): datadir is not referenced by the cells visible in this
# notebook, which use '/home/skytree/datasets/...' paths instead -- confirm
# which location is intended.
datadir = "/user/skytree/datasets"
# Base URL of the server's v1 endpoint on port 8080.
hostname = 'http://{0}:8080/v1'.format(server)

# Show the SDK help for authenticate, then log in. The password is read
# interactively with getpass so it is never echoed or stored in the notebook.
help(skytree.authenticate)
skytree.authenticate(email, getpass.getpass(), hostname)
In [ ]:
# Show the SDK help for create_project, then create the project used by the
# following cells and display it.
help(skytree.create_project)
project = skytree.create_project(
    "My First SDK Project",
    "Income prediction dataset",
)
print(project)
In [ ]:
# Show the SDK help for project.create_dataset before using it.
help(project.create_dataset)

# Create a dataset in the project from a file on the VM. The file has a
# header row, and "?" is the marker for missing values.
IncomeData = project.create_dataset(
    path='/home/skytree/datasets/income.data.small',
    has_header=True,
    missing_value="?",
)

# Print the summary of every dataset the project currently lists.
for dataset in project.list_datasets():
    print(dataset.summary())
In [ ]:
# Get information about the dataset: its columns, statistics, column types, etc.
# First wait for the dataset object to be ready on the backend, then render
# its column metadata as a pandas DataFrame for display.
IncomeData.ready()
print(pandas.DataFrame(IncomeData.columns))
In [ ]:
# Automodel: Skytree Infinity can automatically search for the best model for
# the given target variable. With the default configuration used here, 100
# distinct models are built and the most accurate one is picked.
model = skytree.prediction.learn(
    IncomeData,
    objective_column='yearly-income',
)
In [ ]:
# Wait for the model to get ready, then display its summary.
model.ready()
print(model.summary())
In [ ]:
# Make predictions on test data.
# Load the test file as a second dataset (same header / missing-value
# settings as the training data) and wait for it to be ready.
IncomeTest = project.create_dataset(
    path='/home/skytree/datasets/income.test',
    has_header=True,
    missing_value="?",
).ready()

# Prediction requires the test dataset to have ids, so add a unique "id"
# column and wait for the derived dataset to be ready.
IncomeTestWithId = IncomeTest.add_unique_id_column("id").ready()

# Use the model to make predictions on the test set, then wait for the
# results to complete on the server.
results = model.test(IncomeTestWithId)
results.ready()
In [ ]:
# Print generalization error to view model accuracy on the validation/test
# dataset.
print("\n" + results.summary())

# Download the predicted probabilities and save them to a local file as
# "id,probability" lines. The relative path means the file is written to
# /home/skytree on the VM.
# The `with` block guarantees the file is flushed and closed even if fetching
# or formatting a row raises part-way through.
with open("probabilities.csv", "w") as probs_file:
    # `row_id` rather than `id`, so the builtin id() is not shadowed for the
    # rest of the notebook session.
    for row_id, probability in results.get_probabilities():
        probs_file.write("%s,%f\n" % (row_id, probability))