In [ ]:
import h2o
In [ ]:
# Connect to a cluster
h2o.init()
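In [ ]:
# h2o.init() with no arguments starts or attaches to a local cluster.
# To attach to a cluster running elsewhere, pass its address instead;
# ip/port are standard h2o.init() keywords (values below are illustrative):
# h2o.init(ip="127.0.0.1", port=54321)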
In [ ]:
weather_path = h2o.locate("smalldata/chicago/chicagoAllWeather.csv")
census_path = h2o.locate("smalldata/chicago/chicagoCensus.csv")
crimes_path = h2o.locate("smalldata/chicago/chicagoCrimes10k.csv.zip")
print "Import and Parse weather data"
weather = h2o.import_frame(path=weather_path)
weather.drop("date")
weather.describe()
print "Import and Parse census data"
census = h2o.import_frame(path=census_path)
census.describe()
print "Import and Parse crimes data"
crimes = h2o.import_frame(path=crimes_path)
crimes.describe()
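In [ ]:
# Note: h2o.locate() only resolves paths inside an H2O source checkout, so the
# cell above assumes you are running from the h2o repository. The same data can
# be imported from any local path or URL; the sketch below assumes the public
# H2O test-data bucket (URL illustrative), and in later h2o-py releases
# import_frame() was renamed import_file():
# weather = h2o.import_frame("https://h2o-public-test-data.s3.amazonaws.com/smalldata/chicago/chicagoAllWeather.csv")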
In [ ]:
def refine_date_col(data, col, pattern):
    data[col] = data[col].as_date(pattern)
    data["Day"] = data[col].day()
    data["Month"] = data[col].month() + 1    # Since H2O indexes months from 0
    data["Year"] = data[col].year() + 1900   # Start of epoch is 1900
    data["WeekNum"] = data[col].week()
    data["WeekDay"] = data[col].dayOfWeek()
    data["HourOfDay"] = data[col].hour()
    data.describe()  # HACK: Force evaluation before ifelse and cut. See PUBDEV-1425.

    # Create weekend and season cols
    # Spring = Mar, Apr, May. Summer = Jun, Jul, Aug. Autumn = Sep, Oct. Winter = Nov, Dec, Jan, Feb.
    # data["Weekend"] = [1 if x in ("Sun", "Sat") else 0 for x in data["WeekDay"]]
    data["Weekend"] = h2o.ifelse((data["WeekDay"] == "Sun") | (data["WeekDay"] == "Sat"), 1, 0)[0]
    data["Season"] = data["Month"].cut([0, 2, 5, 8, 10, 12], ["Winter", "Spring", "Summer", "Autumn", "Winter"])

refine_date_col(crimes, "Date", "%m/%d/%Y %I:%M:%S %p")
crimes = crimes.drop("Date")
crimes.describe()
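In [ ]:
# Optional sanity check on the derived columns: per-level counts for the new
# categoricals should match the season/weekday scheme described in
# refine_date_col. table() is assumed to be available on H2OFrame in this
# h2o-py release.
crimes["Season"].table()
crimes["WeekDay"].table()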
In [ ]:
# Merge crimes data with weather and census
census.setName(0, "Community Area")
weather.setName(1, "Month")
weather.setName(2, "Day")
weather.setName(3, "Year")
crimes = crimes.merge(census, allLeft=True, allRite=False)   # merge() returns a new frame; reassign
crimes = crimes.merge(weather, allLeft=True, allRite=False)
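In [ ]:
# For reference: later h2o-py releases renamed the merge flags, so the two
# left joins above would be spelled roughly as follows (all_x/all_y names
# assumed from the newer H2OFrame.merge signature):
# crimes = crimes.merge(census, all_x=True, all_y=False)
# crimes = crimes.merge(weather, all_x=True, all_y=False)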
In [ ]:
# Create test/train split
data_split = h2o.split_frame(crimes, ratios=[0.8])  # 80% train, 20% test
train = data_split[0]  # Python indexes from 0
test = data_split[1]

# Simple GBM - Predict Arrest
data_gbm = h2o.gbm(x=train.drop("Arrest"),
                   y=train["Arrest"],
                   validation_x=test.drop("Arrest"),
                   validation_y=test["Arrest"],
                   ntrees=10,
                   max_depth=6,
                   distribution="bernoulli")

# Simple Deep Learning
data_dl = h2o.deeplearning(x=train.drop("Arrest"),
                           y=train["Arrest"],
                           validation_x=test.drop("Arrest"),
                           validation_y=test["Arrest"],
                           variable_importances=True,
                           loss="Automatic")
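In [ ]:
# The x/y-frame style of h2o.gbm() and h2o.deeplearning() used above was later
# superseded by estimator classes. A minimal sketch of the same GBM under that
# newer API, reusing the split from above (not runnable against this release):
# from h2o.estimators.gbm import H2OGradientBoostingEstimator
# gbm = H2OGradientBoostingEstimator(ntrees=10, max_depth=6, distribution="bernoulli")
# predictors = [c for c in train.names if c != "Arrest"]
# gbm.train(x=predictors, y="Arrest", training_frame=train, validation_frame=test)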
In [ ]:
# GBM performance on train/test data
train_auc_gbm = data_gbm.model_performance(train).auc()
test_auc_gbm = data_gbm.model_performance(test).auc()

# Deep Learning performance on train/test data
train_auc_dl = data_dl.model_performance(train).auc()
test_auc_dl = data_dl.model_performance(test).auc()

# Make a pretty HTML table printout of the results
header = ["Model", "AUC Train", "AUC Test"]
table = [
    ["GBM", train_auc_gbm, test_auc_gbm],
    ["DL",  train_auc_dl,  test_auc_dl]
]
h2o.H2ODisplay(table, header)
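In [ ]:
# Since the deep learning model was built with variable_importances=True, the
# learned importances can also be inspected (varimp() is assumed to be
# available on the model object in this release):
data_dl.varimp()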
In [ ]:
# Create new H2OFrame of crime observations
examples = {
    "Date": ["02/08/2015 11:43:58 PM", "02/08/2015 11:00:39 PM"],
    "IUCR": [1811, 1150],
    "Primary.Type": ["NARCOTICS", "DECEPTIVE PRACTICE"],
    "Location.Description": ["STREET", "RESIDENCE"],
    "Domestic": ["false", "false"],
    "Beat": [422, 923],
    "District": [4, 9],
    "Ward": [7, 14],
    "Community.Area": [46, 63],
    "FBI.Code": [18, 11]
}
crime_examples = h2o.H2OFrame(python_obj=examples)
# Refine date column and merge with census data
refine_date_col(crime_examples, "Date", "%m/%d/%Y %I:%M:%S %p")
crime_examples = crime_examples.drop("Date")  # drop() and merge() return new frames; reassign
crime_examples = crime_examples.merge(census, allLeft=True, allRite=False)
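In [ ]:
# Optional check before scoring: after the drop and merge, the example frame
# should expose the same predictor columns the models were trained on; any
# mismatch shows up here rather than inside predict().
crime_examples.describe()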
In [ ]:
# Predict probability of arrest from new observations
gbm_pred = data_gbm.predict(crime_examples)
dl_pred = data_dl.predict(crime_examples)

# TODO: Replace with a pretty HTML table
gbm_pred.describe()
dl_pred.describe()
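In [ ]:
# The prediction frames hold a "predict" label column plus one probability
# column per class; with Arrest parsed as a true/false categorical, the arrest
# probability would live in a column named "true" (column names assumed, so
# adjust if the parsed levels differ):
# gbm_pred["true"].show()
# dl_pred["true"].show()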