Authenticate



In [ ]:

    
# SETUP
import sys, skytree, skytree.prediction
from skytree import Dataset
from skytree.prediction import gbt

import pandas, getpass

# Connection settings for the Skytree server this notebook talks to.
email = "trial@skytree.net"
server = "localhost"
hostname = "http://{0}:8080/v1".format(server)
datadir = "/user/skytree/datasets"

# Show the SDK help for authenticate(), then log in.
# getpass prompts for the password interactively and the value is passed
# straight into the call, so it is not written into the notebook source.
help(skytree.authenticate)
skytree.authenticate(email, getpass.getpass(), hostname)

Create a project



In [ ]:

    
# Show the SDK help for create_project(), then create the project that the
# datasets in the following cells are created in.
help(skytree.create_project)
project = skytree.create_project("My First SDK Project", "Income prediction dataset")
# A parenthesized single-argument print produces the same output on
# Python 2 and also runs on Python 3.
print(project)

Load/Create a Dataset



In [ ]:

    
# Next we will create a dataset in the project
help(project.create_dataset)

# Create a dataset in the project from a file on the VM.
# The file is declared to have a header row, and "?" is passed as its
# missing-value marker.
IncomeData = project.create_dataset(
    path='/home/skytree/datasets/income.data.small',
    has_header=True,
    missing_value="?")

# Print a summary of every dataset the project currently lists.
# A parenthesized single-argument print works on both Python 2 and Python 3.
for dataset in project.list_datasets():
    print(dataset.summary())



In [ ]:

    
# Get information about the dataset: its columns, statistics, column types, etc.
IncomeData.ready()  # Wait for the dataset object to be ready on the backend
# Wrap the column metadata in a DataFrame so it prints as a table.
# A parenthesized single-argument print works on both Python 2 and Python 3.
print(pandas.DataFrame(IncomeData.columns))

Build a classification model



In [ ]:

    
# Automodel: Skytree Infinity searches for the best model for the chosen
# target column automatically.
# With the default configuration used here it builds 100 distinct models
# and keeps the most accurate one.
model = skytree.prediction.learn(
    IncomeData,
    objective_column='yearly-income')



In [ ]:

    
# Wait for the model to be ready, then show its summary.
model.ready()
# A parenthesized single-argument print works on both Python 2 and Python 3.
print(model.summary())

Make predictions on test dataset using model



In [ ]:

    
# Load the test file into the project and wait until the dataset is ready.
IncomeTest = project.create_dataset(
    path='/home/skytree/datasets/income.test',
    has_header=True,
    missing_value="?"
).ready()

# Prediction requires the test dataset to have an id column, so add a
# unique "id" column and wait for the derived dataset to be ready.
IncomeTestWithId = IncomeTest.add_unique_id_column("id").ready()

# Run the model on the test set, then wait for the results to complete
# on the server.
results = model.test(IncomeTestWithId)
results.ready()

Download predictions



In [ ]:

    
# Print generalization error to view model accuracy on validation/test dataset.
# A parenthesized single-argument print works on both Python 2 and Python 3.
print("\n" + results.summary())

# Download and save predicted probabilities to a local file.
# The path is relative, so the file lands in the current working directory
# (/home/skytree on the VM).
# The with-block closes the file even if the download or a write raises.
with open("probabilities.csv", "w") as probs_file:
    # Named row_id rather than id so the builtin id() is not shadowed.
    for row_id, probability in results.get_probabilities():
        probs_file.write("%s,%f\n" % (row_id, probability))