Authenticate


In [ ]:
# SETUP
import sys, skytree, skytree.prediction
from skytree import Dataset
from skytree.prediction import gbt

import pandas, getpass

# Connection settings for the Skytree server used by the rest of the notebook.
server = "localhost"
email = "trial@skytree.net"
# NOTE(review): datadir is not referenced by the cells visible here, which load
# files from /home/skytree/datasets instead -- confirm whether it is still needed.
datadir = "/user/skytree/datasets"

# REST endpoint built from the server name (port 8080, API path /v1).
hostname = 'http://{0}:8080/v1'.format(server)

# Show the SDK's documentation for authenticate(), then log in.
# getpass.getpass() prompts for the password interactively.
help(skytree.authenticate)
skytree.authenticate(email, getpass.getpass(), hostname)

Create a project


In [ ]:
# Show the SDK's documentation for create_project() before calling it.
help(skytree.create_project)

# Create the project that the following cells add datasets to.
# The two arguments appear to be the project name and a description -- see the
# help() output above to confirm.
project = skytree.create_project("My First SDK Project", "Income prediction dataset")

# print(...) with a single argument behaves the same on Python 2 and is valid
# on Python 3, unlike the bare print statement.
print(project)

Load/Create a Dataset


In [ ]:
# Next we will create a dataset in the project.
help(project.create_dataset)

# Create a dataset in the project from a file on the VM.
# The file is declared to have a header row, and "?" is passed as the
# missing-value marker.
IncomeData = project.create_dataset(
    path='/home/skytree/datasets/income.data.small',
    has_header=True,
    missing_value="?")

# Print a summary of every dataset currently listed in the project.
# print(...) with a single argument works on both Python 2 and Python 3.
for dataset in project.list_datasets():
    print(dataset.summary())

In [ ]:
# Get information about the dataset: its columns, statistics, column types, etc.
IncomeData.ready()  # Wait for dataset object to be ready on the backend

# Render the column metadata as a pandas DataFrame for a tabular view.
# print(...) with a single argument works on both Python 2 and Python 3.
print(pandas.DataFrame(IncomeData.columns))

Build a classification model


In [ ]:
# Automodel: Skytree Infinity can automatically find the best model for the
# given target variable. Per the original note, the default configuration
# builds 100 distinct models and picks the most accurate one.
# The target (objective) column for this dataset is 'yearly-income'.
model = skytree.prediction.learn(
    IncomeData,
    objective_column='yearly-income')

In [ ]:
# Block until the model has finished building on the server.
model.ready()

# Show the model summary; print(...) with a single argument works on both
# Python 2 and Python 3.
print(model.summary())

Make predictions on test dataset using model


In [ ]:
# Load the test file as a second dataset in the project, using the same
# header and missing-value settings as the training data.
IncomeTest = project.create_dataset(
    path='/home/skytree/datasets/income.test',
    has_header=True,
    missing_value="?")
# Wait for the dataset to be ready; keep whatever ready() returns, as the
# chained form did.
IncomeTest = IncomeTest.ready()

# Prediction requires the test dataset to have ids, so add a unique "id"
# column and wait for the derived dataset as well.
IncomeTestWithId = IncomeTest.add_unique_id_column("id")
IncomeTestWithId = IncomeTestWithId.ready()

# Score the test set with the trained model, then wait for the results to
# complete on the server.
results = model.test(IncomeTestWithId)
results.ready()

Download predictions


In [ ]:
# Print generalization error to view model accuracy on validation/test dataset.
# print(...) with a single argument works on both Python 2 and Python 3.
print("\n" + results.summary())

# Download and save predicted probabilities to a local file.
# The relative path means the file lands in the current working directory
# (/home/skytree on the VM, per the original note).
# The with-block guarantees the file is flushed and closed even if fetching
# or formatting a row raises part-way through.
with open("probabilities.csv", "w") as probs_file:
    # Each item is unpacked as an (id, probability) pair; row_id is used
    # instead of id so the builtin id() is not shadowed.
    for (row_id, probability) in results.get_probabilities():
        probs_file.write("%s,%f\n" % (row_id, probability))