In [ ]:
import h2o

In [ ]:
# Connect to an H2O cluster. init() is called with no arguments, so whatever
# connection defaults the h2o client ships with are used.
h2o.init()

In [ ]:
# Resolve the on-disk location of the three Chicago sample files.
weather_path = h2o.locate("smalldata/chicago/chicagoAllWeather.csv")
census_path = h2o.locate("smalldata/chicago/chicagoCensus.csv")
crimes_path = h2o.locate("smalldata/chicago/chicagoCrimes10k.csv.zip")

# --- Weather ---------------------------------------------------------------
print("Import and Parse weather data")
weather = h2o.import_frame(path=weather_path)
# NOTE(review): the value returned by drop() is not kept here (compare the
# crimes frame, which is rebound after drop()). The merge cell renames weather
# columns 1..3, which appears to rely on "date" still occupying index 0, so
# do not rebind this without adjusting those indices.
weather.drop("date")
weather.describe()

# --- Census ----------------------------------------------------------------
print("Import and Parse census data")
census = h2o.import_frame(path=census_path)
census.describe()

# --- Crimes ----------------------------------------------------------------
print("Import and Parse crimes data")
crimes = h2o.import_frame(path=crimes_path)
crimes.describe()

In [ ]:
def refine_date_col(data, col, pattern):
    """Expand a raw date-string column into calendar feature columns, in place.

    Parses ``data[col]`` with ``pattern`` and adds the columns ``Day``,
    ``Month``, ``Year``, ``WeekNum``, ``WeekDay``, ``HourOfDay``, ``Weekend``
    and ``Season`` to ``data``. The parsed dates overwrite ``data[col]``.

    :param data: frame to mutate; must contain the column ``col``.
    :param col: name of the column holding the date strings.
    :param pattern: date format string handed to ``as_date``.
    :return: None -- ``data`` is modified in place.
    """
    data[col]         = data[col].as_date(pattern)
    data["Day"]       = data[col].day()
    data["Month"]     = data[col].month() + 1     # Since H2O indexes from 0
    data["Year"]      = data[col].year() + 1900   # Start of epoch is 1900
    data["WeekNum"]   = data[col].week()
    data["WeekDay"]   = data[col].dayOfWeek()
    data["HourOfDay"] = data[col].hour()

    data.describe()  # HACK: Force evaluation before ifelse and cut. See PUBDEV-1425.

    # Weekend flag: 1 for Saturday/Sunday, 0 otherwise.
    # The two comparisons must be combined element-wise with `|`. Python's
    # `or` evaluates the truthiness of the whole left operand and returns one
    # operand unchanged, so only one of the two day tests would be applied.
    is_weekend = (data["WeekDay"] == "Sun") | (data["WeekDay"] == "Sat")
    data["Weekend"] = h2o.ifelse(is_weekend, 1, 0)[0]

    # Season from the 1-based month number.
    # Spring = Mar, Apr, May. Summer = Jun, Jul, Aug. Autumn = Sep, Oct. Winter = Nov, Dec, Jan, Feb.
    # The breaks describe right-closed bins (lo, hi]: (0,2] (2,5] (5,8] (8,10] (10,12].
    # The Summer/Autumn break is 8 so that August lands in Summer as stated above.
    season_breaks = [0, 2, 5, 8, 10, 12]
    season_labels = ["Winter", "Spring", "Summer", "Autumn", "Winter"]
    data["Season"] = data["Month"].cut(season_breaks, season_labels)
    
# Add the calendar feature columns to crimes; refine_date_col mutates the
# frame it is given and returns nothing.
refine_date_col(crimes, "Date", "%m/%d/%Y %I:%M:%S %p")
# The raw timestamp column is no longer needed. The frame returned by drop()
# is rebound to crimes so later cells see the frame without "Date".
crimes = crimes.drop("Date")
crimes.describe()

In [ ]:
# Merge crimes data with weather and census
# Rename the join keys first: census column 0 becomes "Community Area" and
# weather columns 1..3 become "Month", "Day", "Year" (the names that
# refine_date_col added to crimes).
# NOTE(review): the weather indices assume column 0 is still the original date
# column -- confirm against weather.names.
census.setName(0,"Community Area")
weather.setName(1,"Month")
weather.setName(2,"Day")
weather.setName(3,"Year")
# Keep the merged result: previously both return values were discarded and the
# train/test split below read an undefined name `data`.
# allLeft=True keeps every crime row even when there is no census/weather match.
data = crimes.merge(census, allLeft=True, allRite=False)
data = data.merge(weather, allLeft=True, allRite=False)

In [ ]:
# Create test/train split
# The split result is indexed from 0 in Python: element 0 is the partition for
# the first ratio (0.8, training) and element 1 the partition for the second
# ratio (0.2, test). The previous [1]/[2] indices were 1-based and skipped the
# training partition entirely.
data_split = h2o.split_frame(data, ratios = [0.8,0.2])
train = data_split[0]
test  = data_split[1]

# Simple GBM - Predict Arrest
# Predictors are every column except "Arrest"; the response is "Arrest" itself.
data_gbm = h2o.gbm(x              =train.drop("Arrest"),
                   y              =train["Arrest"],
                   validation_x   =test.drop("Arrest"),
                   validation_y   =test["Arrest"],
                   ntrees         =10,
                   max_depth      =6,
                   distribution   ="bernoulli")

# Simple Deep Learning - same predictors/response as the GBM above
data_dl = h2o.deeplearning(x                   =train.drop("Arrest"),
                           y                   =train["Arrest"],
                           validation_x        =test.drop("Arrest"),
                           validation_y        =test["Arrest"],
                           variable_importances=True,
                           loss                ="Automatic")

In [ ]:
# AUC of the GBM on the training and the held-out partition
gbm_train_perf = data_gbm.model_performance(train)
train_auc_gbm = gbm_train_perf.auc()
gbm_test_perf = data_gbm.model_performance(test)
test_auc_gbm = gbm_test_perf.auc()

# AUC of the Deep Learning model on the same two partitions
dl_train_perf = data_dl.model_performance(train)
train_auc_dl = dl_train_perf.auc()
dl_test_perf = data_dl.model_performance(test)
test_auc_dl = dl_test_perf.auc()

# Render the comparison as an HTML table: one row per model
header = ["Model", "AUC Train", "AUC Test"]
gbm_row = ["GBM", train_auc_gbm, test_auc_gbm]
dl_row = ["DL ", train_auc_dl, test_auc_dl]
table = [gbm_row, dl_row]
h2o.H2ODisplay(table, header)

In [ ]:
# Create new H2OFrame of crime observations
# Column names use spaces, not R-style dots. The census join key is renamed to
# "Community Area" before merging, so the key here must be spelled the same way
# or the merge below has no shared column.
# NOTE(review): the other multi-word names are changed by the same reasoning --
# confirm they match crimes.names.
examples = {
            "Date":                 ["02/08/2015 11:43:58 PM", "02/08/2015 11:00:39 PM"],
            "IUCR":                 [1811, 1150],
            "Primary Type":         ["NARCOTICS", "DECEPTIVE PRACTICE"],
            "Location Description": ["STREET", "RESIDENCE"],
            "Domestic":             ["false", "false"],
            "Beat":                 [422, 923],
            "District":             [4, 9],
            "Ward":                 [7, 14],
            "Community Area":       [46, 63],
            "FBI Code":             [18, 11]
            }

crime_examples = h2o.H2OFrame(python_obj = examples)

# Refine date column and merge with census data
refine_date_col(crime_examples, "Date", "%m/%d/%Y %I:%M:%S %p")
# drop() and merge() hand back the resulting frame; rebind it (as is done for
# the crimes frame) so the prediction cell scores the refined, merged rows.
crime_examples = crime_examples.drop("Date")
crime_examples = crime_examples.merge(census, allLeft=True, allRite=False)

In [ ]:
# Score the hand-written observations with both trained models
gbm_pred = data_gbm.predict(crime_examples)
dl_pred = data_dl.predict(crime_examples)

# TODO: Replace with a pretty HTML table
# Summarise the GBM predictions first, then the Deep Learning ones.
for prediction in (gbm_pred, dl_pred):
    prediction.describe()