In [7]:
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator

In [8]:
# Connect to a cluster
h2o.init()


H2O cluster uptime: 4 minutes 13 seconds 228 milliseconds
H2O cluster version: 3.5.0.99999
H2O cluster name: ece
H2O cluster total nodes: 1
H2O cluster total memory: 3.56 GB
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster healthy: True
H2O Connection ip: 127.0.0.1
H2O Connection port: 54321

In [9]:
from h2o.h2o import _locate # private function. used to find files within h2o git project directory.
weather_path = _locate("smalldata/chicago/chicagoAllWeather.csv")
census_path = _locate("smalldata/chicago/chicagoCensus.csv")
crimes_path = _locate("smalldata/chicago/chicagoCrimes10k.csv.zip")

print "Import and Parse weather data"
weather = h2o.import_file(path=weather_path, col_types = ["time"] + ["numeric"]*6)
weather.drop("date")
weather.describe()

print "Import and Parse census data"
census = h2o.import_file(path=census_path, col_types = ["numeric", "enum"] + ["numeric"]*7)
census.describe()

print "Import and Parse crimes data"
crimes = h2o.import_file(path=crimes_path)
crimes.describe()


Import and Parse weather data

Parse Progress: [##################################################] 100%
Imported /Users/ece/0xdata/h2o-dev/smalldata/chicago/chicagoAllWeather.csv. Parsed 5,162 rows and 7 cols
Rows: 5,162 Cols: 7

Chunk compression summary:
chunk_type chunk_name count count_percentage size size_percentage
C0D Constant Reals 1 14.285715 80 B 0.2537749
C1N 1-Byte Integers (w/o NAs) 2 28.57143 10.2 KB 33.18107
C1S 1-Byte Fractions 4 57.14286 20.5 KB 66.565155
Frame distribution summary:
size number_of_rows number_of_chunks_per_column number_of_chunks
10.0.0.24:54321 30.8 KB 5162.0 1.0 7.0
mean 30.8 KB 5162.0 1.0 7.0
min 30.8 KB 5162.0 1.0 7.0
max 30.8 KB 5162.0 1.0 7.0
stddev 0 B 0.0 0.0 0.0
total 30.8 KB 5162.0 1.0 7.0
Column-by-Column Summary:

date month day year maxTemp meanTemp minTemp
type time int int int int int int
mins NaN 1.0 1.0 2001.0 -2.0 -9.0 -18.0
maxs NaN 12.0 31.0 2015.0 103.0 93.0 82.0
mean 0.0 6.47442851608 15.7082526153 2007.57148392 58.871042921 50.3103515246 41.4812584968
sigma -0.0 3.46905171694 8.79895173997 4.0773409057 21.4829777237 19.9302399266 19.0207297123
zero_count -5162 0 0 0 0 2 16
missing_count 5162 0 0 0 13 13 13
Import and Parse census data

Parse Progress: [##################################################] 100%
Imported /Users/ece/0xdata/h2o-dev/smalldata/chicago/chicagoCensus.csv. Parsed 79 rows and 9 cols
Rows: 79 Cols: 9

Chunk compression summary:
chunk_type chunk_name count count_percentage size size_percentage
C1 1-Byte Integers 3 33.333336 441 B 22.546013
C1S 1-Byte Fractions 1 11.111112 163 B 8.333334
C2S 2-Byte Fractions 4 44.444447 968 B 49.488754
C4 4-Byte Integers 1 11.111112 384 B 19.6319
Frame distribution summary:
size number_of_rows number_of_chunks_per_column number_of_chunks
10.0.0.24:54321 1.9 KB 79.0 1.0 9.0
mean 1.9 KB 79.0 1.0 9.0
min 1.9 KB 79.0 1.0 9.0
max 1.9 KB 79.0 1.0 9.0
stddev 0 B 0.0 0.0 0.0
total 1.9 KB 79.0 1.0 9.0
Column-by-Column Summary:

Community Area Number COMMUNITY AREA NAME PERCENT OF HOUSING CROWDED PERCENT HOUSEHOLDS BELOW POVERTY PERCENT AGED 16 UNEMPLOYED PERCENT AGED 25 WITHOUT HIGH SCHOOL DIPLOMA PERCENT AGED UNDER 18 OR OVER 64 PER CAPITA INCOME HARDSHIP INDEX
type int enum real real real real real int int
mins 1.0 0.0 0.3 3.3 4.7 2.5 13.5 8201.0 1.0
maxs 77.0 78.0 15.8 56.5 35.9 54.8 51.5 88669.0 98.0
mean 39.0 NaN 4.92051282051 21.7397435897 15.341025641 20.3307692308 35.7179487179 25597.0 49.5064935065
sigma 22.3718573212 NaN 3.65898144135 11.457230913 7.49949670861 11.7465143511 7.28442108494 15196.4055413 28.6905556516
zero_count 0 1 0 0 0 0 0 0 0
missing_count 2 0 1 1 1 1 1 1 2
Import and Parse crimes data

Parse Progress: [##################################################] 100%
Imported /Users/ece/0xdata/h2o-dev/smalldata/chicago/chicagoCrimes10k.csv.zip. Parsed 9,999 rows and 22 cols
Rows: 9,999 Cols: 22

Chunk compression summary:
chunk_type chunk_name count count_percentage size size_percentage
C0L Constant Integers 4 4.5454545 320 B 0.036952097
C1 1-Byte Integers 32 36.363636 80.2 KB 9.488375
C1N 1-Byte Integers (w/o NAs) 8 9.090909 20.1 KB 2.3720937
C2 2-Byte Integers 16 18.181818 79.2 KB 9.362738
C4 4-Byte Integers 12 13.636364 118.0 KB 13.949879
CStr String 8 9.090909 391.2 KB 46.25294
C8D 64-bit Reals 8 9.090909 156.8 KB 18.53702
Frame distribution summary:
size number_of_rows number_of_chunks_per_column number_of_chunks
10.0.0.24:54321 845.7 KB 9999.0 4.0 88.0
mean 845.7 KB 9999.0 4.0 88.0
min 845.7 KB 9999.0 4.0 88.0
max 845.7 KB 9999.0 4.0 88.0
stddev 0 B 0.0 0.0 0.0
total 845.7 KB 9999.0 4.0 88.0
Column-by-Column Summary:

ID Case Number Date Block IUCR Primary Type Description Location Description Arrest Domestic Beat District Ward Community Area FBI Code X Coordinate Y Coordinate Year Updated On Latitude Longitude Location
type int string string enum int enum enum enum enum enum int int int int int int int int enum real real enum
mins 21735.0 NaN NaN 0.0 110.0 0.0 0.0 0.0 0.0 0.0 111.0 1.0 1.0 1.0 2.0 1100317.0 1814255.0 2015.0 0.0 41.64507243 -87.906463888 0.0
maxs 9962898.0 NaN NaN 6517.0 5131.0 26.0 198.0 90.0 1.0 1.0 2535.0 25.0 50.0 77.0 26.0 1205069.0 1951533.0 2015.0 32.0 42.022646183 -87.524773286 8603.0
mean 9931318.73737 NaN NaN NaN 1189.67651357 NaN NaN NaN 0.292829282928 0.152315231523 1159.61806181 11.3489885128 22.9540954095 37.4476447645 12.7401236227 1163880.59815 1885916.14984 2015.0 NaN 41.8425652247 -87.6741405221 NaN
sigma 396787.564221 NaN NaN NaN 927.751435583 NaN NaN NaN 0.455083515588 0.35934414686 695.76029875 6.94547493301 13.6495661144 21.2748762223 7.57423857911 16496.4493681 31274.0163199 0.0 NaN 0.0860186579358 0.0600357970653 NaN
zero_count 0 0 0 3 0 11 933 19 7071 8476 0 0 0 0 0 0 0 0 603 0 0 1
missing_count 0 0 0 0 419 0 0 6 0 0 0 162 0 0 2557 162 162 0 0 162 162 162

In [10]:
def refine_date_col(data, col, pattern):
    data[col]         = data[col].as_date(pattern)
    data["Day"]       = data[col].day()
    data["Month"]     = data[col].month() + 1     # Since H2O indexes from 0
    data["Year"]      = data[col].year() + 1900   # Start of epoch is 1900
    data["WeekNum"]   = data[col].week()
    data["WeekDay"]   = data[col].dayOfWeek()
    data["HourOfDay"] = data[col].hour()
    
    data.describe()  # HACK: Force evaluation before ifelse and cut. See PUBDEV-1425.
    
    # Create weekend and season cols
    # Spring = Mar, Apr, May. Summer = Jun, Jul, Aug. Autumn = Sep, Oct. Winter = Nov, Dec, Jan, Feb.
    # data["Weekend"]   = [1 if x in ("Sun", "Sat") else 0 for x in data["WeekDay"]]
    data["Weekend"] = h2o.ifelse(data["WeekDay"] == "Sun" or data["WeekDay"] == "Sat", 1, 0)[0]
    data["Season"] = data["Month"].cut([0, 2, 5, 7, 10, 12], ["Winter", "Spring", "Summer", "Autumn", "Winter"])
    
refine_date_col(crimes, "Date", "%m/%d/%Y %I:%M:%S %p")
crimes = crimes.drop("Date")
crimes.describe()


Rows: 9,999 Cols: 27

Chunk compression summary:
chunk_type chunk_name count count_percentage size size_percentage
C0L Constant Integers 9 8.333334 720 B 0.10067409
C1 1-Byte Integers 32 29.62963 80.2 KB 11.489151
C1N 1-Byte Integers (w/o NAs) 23 21.296297 57.9 KB 8.296664
C2 2-Byte Integers 16 14.814815 79.2 KB 11.337022
C4 4-Byte Integers 12 11.111112 118.0 KB 16.891436
C8 64-bit Integers 4 3.7037036 78.4 KB 11.222924
CStr String 4 3.7037036 127.2 KB 18.21628
C8D 64-bit Reals 8 7.4074073 156.8 KB 22.445848
Frame distribution summary:
size number_of_rows number_of_chunks_per_column number_of_chunks
10.0.0.24:54321 698.4 KB 9999.0 4.0 108.0
mean 698.4 KB 9999.0 4.0 108.0
min 698.4 KB 9999.0 4.0 108.0
max 698.4 KB 9999.0 4.0 108.0
stddev 0 B 0.0 0.0 0.0
total 698.4 KB 9999.0 4.0 108.0
Column-by-Column Summary:

ID Case Number Date Block IUCR Primary Type Description Location Description Arrest Domestic Beat District Ward Community Area FBI Code X Coordinate Y Coordinate Year Updated On Latitude Longitude Location Day Month WeekNum WeekDay HourOfDay
type int string int enum int enum enum enum enum enum int int int int int int int int enum real real enum int int int enum int
mins 21735.0 NaN 1.42203063e+12 0.0 110.0 0.0 0.0 0.0 0.0 0.0 111.0 1.0 1.0 1.0 2.0 1100317.0 1814255.0 3915.0 0.0 41.64507243 -87.906463888 0.0 1.0 2.0 4.0 0.0 0.0
maxs 9962898.0 NaN 1.42346782e+12 6517.0 5131.0 26.0 198.0 90.0 1.0 1.0 2535.0 25.0 50.0 77.0 26.0 1205069.0 1951533.0 3915.0 32.0 42.022646183 -87.524773286 8603.0 31.0 3.0 6.0 6.0 23.0
mean 9931318.73737 NaN 1.42271445081e+12 NaN 1189.67651357 NaN NaN NaN 0.292829282928 0.152315231523 1159.61806181 11.3489885128 22.9540954095 37.4476447645 12.7401236227 1163880.59815 1885916.14984 3915.0 NaN 41.8425652247 -87.6741405221 NaN 17.6839683968 2.41944194419 5.18081808181 NaN 13.6319631963
sigma 396787.564221 NaN 433879245.187 NaN 927.751435583 NaN NaN NaN 0.455083515588 0.35934414686 695.76029875 6.94547493301 13.6495661144 21.2748762223 7.57423857911 16496.4493681 31274.0163199 0.0 NaN 0.0860186579358 0.0600357970653 NaN 11.1801043358 0.493492406787 0.738929830409 NaN 6.47321735807
zero_count 0 0 0 3 0 11 933 19 7071 8476 0 0 0 0 0 0 0 0 603 0 0 1 0 0 0 1038 374
missing_count 0 0 0 0 419 0 0 6 0 0 0 162 0 0 2557 162 162 0 0 162 162 162 0 0 0 0 0
Rows: 9,999 Cols: 28

Chunk compression summary:
chunk_type chunk_name count count_percentage size size_percentage
C0L Constant Integers 13 11.607142 1.0 KB 0.16332531
CBS Bits 4 3.5714288 1.5 KB 0.24043368
C1 1-Byte Integers 32 28.57143 80.2 KB 12.903955
C1N 1-Byte Integers (w/o NAs) 23 20.535715 57.9 KB 9.3183365
C2 2-Byte Integers 16 14.285715 79.2 KB 12.733091
C4 4-Byte Integers 12 10.714286 118.0 KB 18.97149
CStr String 4 3.5714288 127.2 KB 20.459478
C8D 64-bit Reals 8 7.1428576 156.8 KB 25.209888
Frame distribution summary:
size number_of_rows number_of_chunks_per_column number_of_chunks
10.0.0.24:54321 621.8 KB 9999.0 4.0 112.0
mean 621.8 KB 9999.0 4.0 112.0
min 621.8 KB 9999.0 4.0 112.0
max 621.8 KB 9999.0 4.0 112.0
stddev 0 B 0.0 0.0 0.0
total 621.8 KB 9999.0 4.0 112.0
Column-by-Column Summary:

ID Case Number Block IUCR Primary Type Description Location Description Arrest Domestic Beat District Ward Community Area FBI Code X Coordinate Y Coordinate Year Updated On Latitude Longitude Location Day Month WeekNum WeekDay HourOfDay Weekend Season
type int string enum int enum enum enum enum enum int int int int int int int int enum real real enum int int int enum int int enum
mins 21735.0 NaN 0.0 110.0 0.0 0.0 0.0 0.0 0.0 111.0 1.0 1.0 1.0 2.0 1100317.0 1814255.0 3915.0 0.0 41.64507243 -87.906463888 0.0 1.0 2.0 4.0 0.0 0.0 0.0 0.0
maxs 9962898.0 NaN 6517.0 5131.0 26.0 198.0 90.0 1.0 1.0 2535.0 25.0 50.0 77.0 26.0 1205069.0 1951533.0 3915.0 32.0 42.022646183 -87.524773286 8603.0 31.0 3.0 6.0 6.0 23.0 1.0 1.0
mean 9931318.73737 NaN NaN 1189.67651357 NaN NaN NaN 0.292829282928 0.152315231523 1159.61806181 11.3489885128 22.9540954095 37.4476447645 12.7401236227 1163880.59815 1885916.14984 3915.0 NaN 41.8425652247 -87.6741405221 NaN 17.6839683968 2.41944194419 5.18081808181 NaN 13.6319631963 0.159115911591 NaN
sigma 396787.564221 NaN NaN 927.751435583 NaN NaN NaN 0.455083515588 0.35934414686 695.76029875 6.94547493301 13.6495661144 21.2748762223 7.57423857911 16496.4493681 31274.0163199 0.0 NaN 0.0860186579358 0.0600357970653 NaN 11.1801043358 0.493492406787 0.738929830409 NaN 6.47321735807 0.365802434041 NaN
zero_count 0 0 3 0 11 933 19 7071 8476 0 0 0 0 0 0 0 0 603 0 0 1 0 0 0 1038 374 8408 5805
missing_count 0 0 0 419 0 0 6 0 0 0 162 0 0 2557 162 162 0 0 162 162 162 0 0 0 0 0 0 0

In [11]:
# Merge crimes data with weather and census
census.set_name(0,"Community Area")
weather.set_name(1,"Month")
weather.set_name(2,"Day")
weather.set_name(3,"Year")
crimes.merge(census, allLeft=True, allRite=False)
crimes.merge(weather, allLeft=True, allRite=False)
crimes.describe()


Rows: 9,999 Cols: 28

Chunk compression summary:
chunk_type chunk_name count count_percentage size size_percentage
C0L Constant Integers 13 11.607142 1.0 KB 0.16332531
CBS Bits 4 3.5714288 1.5 KB 0.24043368
C1 1-Byte Integers 32 28.57143 80.2 KB 12.903955
C1N 1-Byte Integers (w/o NAs) 23 20.535715 57.9 KB 9.3183365
C2 2-Byte Integers 16 14.285715 79.2 KB 12.733091
C4 4-Byte Integers 12 10.714286 118.0 KB 18.97149
CStr String 4 3.5714288 127.2 KB 20.459478
C8D 64-bit Reals 8 7.1428576 156.8 KB 25.209888
Frame distribution summary:
size number_of_rows number_of_chunks_per_column number_of_chunks
10.0.0.24:54321 621.8 KB 9999.0 4.0 112.0
mean 621.8 KB 9999.0 4.0 112.0
min 621.8 KB 9999.0 4.0 112.0
max 621.8 KB 9999.0 4.0 112.0
stddev 0 B 0.0 0.0 0.0
total 621.8 KB 9999.0 4.0 112.0
Column-by-Column Summary:

ID Case Number Block IUCR Primary Type Description Location Description Arrest Domestic Beat District Ward Community Area FBI Code X Coordinate Y Coordinate Year Updated On Latitude Longitude Location Day Month WeekNum WeekDay HourOfDay Weekend Season
type int int int int enum enum enum enum enum int int int int int int int int enum real real enum string enum int enum int int enum
mins 3915.0 1.0 2.0 110.0 0.0 0.0 0.0 0.0 0.0 111.0 1.0 1.0 21735.0 2.0 1100317.0 1814255.0 1.0 0.0 41.64507243 -87.906463888 0.0 NaN 0.0 4.0 0.0 0.0 0.0 0.0
maxs 3915.0 31.0 3.0 5131.0 26.0 198.0 90.0 1.0 1.0 2535.0 25.0 50.0 9962898.0 26.0 1205069.0 1951533.0 77.0 32.0 42.022646183 -87.524773286 8603.0 NaN 6517.0 6.0 6.0 23.0 1.0 1.0
mean 3915.0 17.6839683968 2.41944194419 1189.67651357 NaN NaN NaN 0.292829282928 0.152315231523 1159.61806181 11.3489885128 22.9540954095 9931318.73737 12.7401236227 1163880.59815 1885916.14984 37.4476447645 NaN 41.8425652247 -87.6741405221 NaN NaN NaN 5.18081808181 NaN 13.6319631963 0.159115911591 NaN
sigma 0.0 11.1801043358 0.493492406787 927.751435583 NaN NaN NaN 0.455083515588 0.35934414686 695.76029875 6.94547493301 13.6495661144 396787.564221 7.57423857911 16496.4493681 31274.0163199 21.2748762223 NaN 0.0860186579358 0.0600357970653 NaN NaN NaN 0.738929830409 NaN 6.47321735807 0.365802434041 NaN
zero_count 0 0 0 0 11 933 19 7071 8476 0 0 0 0 0 0 0 0 603 0 0 1 0 3 0 1038 374 8408 5805
missing_count 0 0 0 419 0 0 6 0 0 0 162 0 0 2557 162 162 0 0 162 162 162 0 0 0 0 0 0 0

In [12]:
# Create test/train split
r = crimes["Arrest"].runif()
train = crimes[r < 0.8]
test = crimes[r >= 0.8]

# Simple GBM - Predict Arrest
crimes_names_x = crimes.names
crimes_names_x.remove("Arrest")
data_gbm = H2OGradientBoostingEstimator(ntrees         =10,
                                        max_depth      =6,
                                        distribution   ="bernoulli")

data_gbm.train(x               =crimes_names_x,
               y               ="Arrest",
               training_frame  =train,
               validation_frame=test)

# Simple Deep Learning - Predict Arrest
data_dl = H2ODeepLearningEstimator(variable_importances=True,
                                   loss                ="Automatic")

data_dl.train(x                =crimes_names_x,
              y                ="Arrest",
              training_frame  =train,
              validation_frame=test)


gbm Model Build Progress: [##################################################] 100%

deeplearning Model Build Progress: [##################################################] 100%

In [13]:
# GBM performance on train/test data
train_auc_gbm = data_gbm.model_performance(train).auc()
test_auc_gbm  = data_gbm.model_performance(test) .auc()

# Deep Learning performance on train/test data
train_auc_dl = data_dl.model_performance(train).auc()
test_auc_dl  = data_dl.model_performance(test) .auc()

# Make a pretty HTML table printout of the results
header = ["Model", "AUC Train", "AUC Test"]
table  = [
           ["GBM", train_auc_gbm, test_auc_gbm],
           ["DL ", train_auc_dl,  test_auc_dl]
         ]
h2o.H2ODisplay(table, header)


Model AUC Train AUC Test
GBM 0.957115520619 0.914403005062
DL 0.98411333776 0.910417417547
Out[13]:


In [14]:
# Create new H2OFrame of crime observations
examples = {
            "Date":                 ["02/08/2015 11:43:58 PM", "02/08/2015 11:00:39 PM"],
            "IUCR":                 [1811, 1150],
            "Primary.Type":         ["NARCOTICS", "DECEPTIVE PRACTICE"],
            "Location.Description": ["STREET", "RESIDENCE"],
            "Domestic":             ["false", "false"],
            "Beat":                 [422, 923],
            "District":             [4, 9],
            "Ward":                 [7, 14],
            "Community.Area":       [46, 63],
            "FBI.Code":             [18, 11]
            }

crime_examples = h2o.H2OFrame(python_obj = examples)

# Refine date column and merge with census data
refine_date_col(crime_examples, "Date", "%m/%d/%Y %I:%M:%S %p")
crime_examples.drop("Date")
census.set_name(0,"Community.Area")
crime_examples.merge(census, allLeft=True, allRite=False)
crime_examples.describe()


Parse Progress: [##################################################] 100%
Uploaded pycd62ccbb-7411-40be-abce-e89cf22744b4 into cluster with 2 rows and 10 cols
Rows: 2 Cols: 16

Chunk compression summary:
chunk_type chunk_name count count_percentage size size_percentage
C0L Constant Integers 7 43.75 560 B 43.75
C1N 1-Byte Integers (w/o NAs) 4 25.0 280 B 21.875
C2 2-Byte Integers 2 12.5 144 B 11.25
C2S 2-Byte Fractions 1 6.25 88 B 6.875
CStr String 2 12.5 208 B 16.25
Frame distribution summary:
size number_of_rows number_of_chunks_per_column number_of_chunks
10.0.0.24:54321 1.3 KB 2.0 1.0 16.0
mean 1.3 KB 2.0 1.0 16.0
min 1.3 KB 2.0 1.0 16.0
max 1.3 KB 2.0 1.0 16.0
stddev 0 B 0.0 0.0 0.0
total 1.3 KB 2.0 1.0 16.0
Column-by-Column Summary:

Location.Description FBI.Code Beat Primary.Type Community.Area District Date Ward Domestic IUCR Day Month Year WeekNum WeekDay HourOfDay
type string int int string int int int int enum int int int int int enum int
mins NaN 11.0 422.0 NaN 46.0 4.0 1.423465239e+12 7.0 0.0 1150.0 8.0 3.0 3915.0 6.0 6.0 23.0
maxs NaN 18.0 923.0 NaN 63.0 9.0 1.423467838e+12 14.0 0.0 1811.0 8.0 3.0 3915.0 6.0 6.0 23.0
mean NaN 14.5 672.5 NaN 54.5 6.5 1.4234665385e+12 10.5 0.0 1480.5 8.0 3.0 3915.0 6.0 NaN 23.0
sigma NaN 4.94974746831 354.260497374 NaN 12.0208152802 3.53553390593 1837770.5243 4.94974746831 0.0 467.397582364 0.0 0.0 0.0 0.0 NaN 0.0
zero_count 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0
missing_count 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Rows: 2 Cols: 18

Chunk compression summary:
chunk_type chunk_name count count_percentage size size_percentage
C0L Constant Integers 9 50.0 720 B 50.0
C1N 1-Byte Integers (w/o NAs) 4 22.222223 280 B 19.444445
C2 2-Byte Integers 2 11.111112 144 B 10.0
C2S 2-Byte Fractions 1 5.555556 88 B 6.111111
CStr String 2 11.111112 208 B 14.444445
Frame distribution summary:
size number_of_rows number_of_chunks_per_column number_of_chunks
10.0.0.24:54321 1.4 KB 2.0 1.0 18.0
mean 1.4 KB 2.0 1.0 18.0
min 1.4 KB 2.0 1.0 18.0
max 1.4 KB 2.0 1.0 18.0
stddev 0 B 0.0 0.0 0.0
total 1.4 KB 2.0 1.0 18.0
Column-by-Column Summary:

Location.Description FBI.Code Beat Primary.Type Community.Area District Date Ward Domestic IUCR Day Month Year WeekNum WeekDay HourOfDay Weekend Season
type int int int string string int int int enum int int int int int enum int int enum
mins 46.0 11.0 422.0 NaN NaN 4.0 1.423465239e+12 7.0 0.0 1150.0 8.0 3.0 3915.0 6.0 6.0 23.0 1.0 1.0
maxs 63.0 18.0 923.0 NaN NaN 9.0 1.423467838e+12 14.0 0.0 1811.0 8.0 3.0 3915.0 6.0 6.0 23.0 1.0 1.0
mean 54.5 14.5 672.5 NaN NaN 6.5 1.4234665385e+12 10.5 0.0 1480.5 8.0 3.0 3915.0 6.0 NaN 23.0 1.0 NaN
sigma 12.0208152802 4.94974746831 354.260497374 NaN NaN 3.53553390593 1837770.5243 4.94974746831 0.0 467.397582364 0.0 0.0 0.0 0.0 NaN 0.0 0.0 NaN
zero_count 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0
missing_count 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [15]:
# Predict probability of arrest from new observations
gbm_pred = data_gbm.predict(crime_examples)
dl_pred  = data_dl .predict(crime_examples)

# Make a pretty HTML table printout of the results
header = ["FBI Code", "GBM Arrest Prob", "DL Arrest Prob"]
table  = [
           [examples["FBI.Code"][0], gbm_pred[0,"true"], dl_pred[0,"true"]],
           [examples["FBI.Code"][1], gbm_pred[1,"true"], dl_pred[1,"true"]]
         ]
h2o.H2ODisplay(table, header)


FBI Code GBM Arrest Prob DL Arrest Prob
18 0.113645062475 0.00228440147384
11 0.113645062475 0.00461659407729
Out[15]: