In [1]:
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator

In [2]:
# Connect to a cluster
h2o.init()


H2O cluster uptime: 1 hours 13 minutes 22 seconds 521 milliseconds
H2O cluster version: 3.7.0.99999
H2O cluster name: ludirehak
H2O cluster total nodes: 1
H2O cluster total free memory: 3.24 GB
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster healthy: True
H2O Connection ip: 127.0.0.1
H2O Connection port: 54321
H2O Connection proxy: None
Python Version: 3.5.1

In [3]:
from h2o.utils.shared_utils import _locate # private function. used to find files within h2o git project directory.
weather_path = _locate("smalldata/chicago/chicagoAllWeather.csv")
census_path = _locate("smalldata/chicago/chicagoCensus.csv")
crimes_path = _locate("smalldata/chicago/chicagoCrimes10k.csv.zip")

print("Import and Parse weather data")
weather = h2o.import_file(path=weather_path, col_types = ["time"] + ["numeric"]*6)
weather.drop("date")
weather.describe()

print("Import and Parse census data")
census = h2o.import_file(path=census_path, col_types = ["numeric", "enum"] + ["numeric"]*7)
census.describe()

print("Import and Parse crimes data")
crimes = h2o.import_file(path=crimes_path)
crimes.describe()


Import and Parse weather data

Parse Progress: [##################################################] 100%
Rows:5,162 Cols:7

Chunk compression summary: 
chunk_type chunk_name count count_percentage size size_percentage
C0L Constant Integers 19 8.225108 1.5 KB 3.3811588
C0D Constant Reals 33 14.285715 2.6 KB 5.872539
C1 1-Byte Integers 8 3.4632034 1.7 KB 3.9528418
C1N 1-Byte Integers (w/o NAs) 135 58.441555 29.6 KB 67.35625
C1S 1-Byte Fractions 36 15.584415 8.5 KB 19.437214
Frame distribution summary: 
size number_of_rows number_of_chunks_per_column number_of_chunks
172.16.2.43:54321 43.9 KB 5162.0 33.0 231.0
mean 43.9 KB 5162.0 33.0 231.0
min 43.9 KB 5162.0 33.0 231.0
max 43.9 KB 5162.0 33.0 231.0
stddev 0 B 0.0 0.0 0.0
total 43.9 KB 5162.0 33.0 231.0

date month day year maxTemp meanTemp minTemp
type time int int int int int int
mins NaN 1.0 1.0 2001.0 -2.0 -9.0 -18.0
mean 0.0 6.47442851607904 15.7082526152654012007.571483920960958.87104292095552450.31035152456788 41.4812584967955
maxs NaN 12.0 31.0 2015.0 103.0 93.0 82.0
sigma -0.0 3.4690517169376858.798951739966594 4.077340905700527 21.48297772368538719.93023992660888419.020729712312264
zeros -5162 0 0 0 0 2 16
missing5162 0 0 0 13 13 13
0 nan 1.0 1.0 2001.0 23.0 14.0 6.0
1 nan 1.0 2.0 2001.0 18.0 12.0 6.0
2 nan 1.0 3.0 2001.0 28.0 18.0 8.0
3 nan 1.0 4.0 2001.0 30.0 24.0 19.0
4 nan 1.0 5.0 2001.0 36.0 30.0 21.0
5 nan 1.0 6.0 2001.0 33.0 26.0 19.0
6 nan 1.0 7.0 2001.0 34.0 28.0 21.0
7 nan 1.0 8.0 2001.0 26.0 20.0 14.0
8 nan 1.0 9.0 2001.0 23.0 16.0 10.0
9 nan 1.0 10.0 2001.0 34.0 26.0 19.0
Import and Parse census data

Parse Progress: [##################################################] 100%
Rows:79 Cols:9

Chunk compression summary: 
chunk_type chunk_name count count_percentage size size_percentage
C1 1-Byte Integers 3 33.333336 441 B 22.546013
C1S 1-Byte Fractions 1 11.111112 163 B 8.333334
C2S 2-Byte Fractions 4 44.444447 968 B 49.488754
C4 4-Byte Integers 1 11.111112 384 B 19.6319
Frame distribution summary: 
size number_of_rows number_of_chunks_per_column number_of_chunks
172.16.2.43:54321 1.9 KB 79.0 1.0 9.0
mean 1.9 KB 79.0 1.0 9.0
min 1.9 KB 79.0 1.0 9.0
max 1.9 KB 79.0 1.0 9.0
stddev 0 B 0.0 0.0 0.0
total 1.9 KB 79.0 1.0 9.0

Community Area Number COMMUNITY AREA NAME PERCENT OF HOUSING CROWDED PERCENT HOUSEHOLDS BELOW POVERTY PERCENT AGED 16 UNEMPLOYED PERCENT AGED 25 WITHOUT HIGH SCHOOL DIPLOMA PERCENT AGED UNDER 18 OR OVER 64 PER CAPITA INCOME HARDSHIP INDEX
type int enum real real real real real int int
mins 1.0 0.0 0.30000000000000004 3.3000000000000003 4.7 2.5 13.5 8201.0 1.0
mean 39.0 NaN 4.920512820512822 21.73974358974359 15.341025641025642 20.33076923076924 35.71794871794871 25597.000000000004 49.506493506493506
maxs 77.0 78.0 15.8 56.5 35.9 54.800000000000004 51.5 88669.0 98.0
sigma 22.371857321197094 NaN 3.6589814413502006 11.457230912971083 7.49949670860991 11.746514351100048 7.284421084944952 15196.405541331917 28.69055565156158
zeros 0 1 0 0 0 0 0 0 0
missing2 0 1 1 1 1 1 1 2
0 nan COMMUNITY AREA NAME nan nan nan nan nan nan nan
1 1.0 Rogers Park 7.7 23.6 8.700000000000001 18.2 27.5 23939.0 39.0
2 2.0 West Ridge 7.800000000000001 17.2 8.8 20.8 38.5 23040.0 46.0
3 3.0 Uptown 3.8000000000000003 24.0 8.9 11.8 22.200000000000003 35787.0 20.0
4 4.0 Lincoln Square 3.4000000000000004 10.9 8.200000000000001 13.4 25.5 37524.0 17.0
5 5.0 North Center 0.30000000000000004 7.5 5.2 4.5 26.200000000000003 57123.0 6.0
6 6.0 Lake View 1.1 11.4 4.7 2.6 17.0 60058.0 5.0
7 7.0 Lincoln Park 0.8 12.3 5.1000000000000005 3.6 21.5 71551.0 2.0
8 8.0 Near North Side 1.9000000000000001 12.9 7.0 2.5 22.6 88669.0 1.0
9 9.0 Edison Park 1.1 3.3000000000000003 6.5 7.4 35.300000000000004 40959.0 8.0
Import and Parse crimes data

Parse Progress: [##################################################] 100%
Rows:9,999 Cols:22

Chunk compression summary: 
chunk_type chunk_name count count_percentage size size_percentage
C0L Constant Integers 1 4.5454545 80 B 0.0092869
C1 1-Byte Integers 8 36.363636 78.6 KB 9.349084
C1N 1-Byte Integers (w/o NAs) 2 9.090909 19.7 KB 2.337271
C2 2-Byte Integers 4 18.181818 78.4 KB 9.317509
C4 4-Byte Integers 3 13.636364 117.4 KB 13.952581
CStr String 2 9.090909 390.7 KB 46.446617
C8D 64-bit Reals 2 9.090909 156.4 KB 18.587654
Frame distribution summary: 
size number_of_rows number_of_chunks_per_column number_of_chunks
172.16.2.43:54321 841.2 KB 9999.0 1.0 22.0
mean 841.2 KB 9999.0 1.0 22.0
min 841.2 KB 9999.0 1.0 22.0
max 841.2 KB 9999.0 1.0 22.0
stddev 0 B 0.0 0.0 0.0
total 841.2 KB 9999.0 1.0 22.0

ID Case Number Date Block IUCR Primary Type Description Location Description Arrest Domestic Beat District Ward Community Area FBI Code X Coordinate Y Coordinate Year Updated On Latitude Longitude Location
type int string string enum int enum enum enum enum enum int int int int int int int int enum real real enum
mins 21735.0 NaN NaN 0.0 110.0 0.0 0.0 0.0 0.0 0.0 111.0 1.0 1.0 1.0 2.0 1100317.0 1814255.0 2015.00.0 41.64507243 -87.906463888 0.0
mean 9931318.737373699NaN NaN NaN 1189.676513569939NaN NaN NaN 0.292829282928292740.152315231523152351159.618061806176511.34898851275791822.95409540954100837.44764476447653612.7401236226821141163880.59814984071885916.14984243082015.0NaN 41.842565224673535 -87.67414052209607 NaN
maxs 9962898.0 NaN NaN 6517.0 5131.0 26.0 198.0 90.0 1.0 1.0 2535.0 25.0 50.0 77.0 26.0 1205069.0 1951533.0 2015.032.0 42.022646183 -87.524773286 8603.0
sigma 396787.5642214295NaN NaN NaN 927.7514355826443NaN NaN NaN 0.4550835155878833 0.3593441468595258 695.7602987498396 6.945474933012859 13.64956611436129621.2748762223208567.574238579108433 16496.44936814723831274.01631985589 0.0 NaN 0.086018657935848240.06003579706529789NaN
zeros 0 0 0 3 0 11 933 19 7071 8476 0 0 0 0 0 0 0 0 603 0 0 1
missing0 0 0 0 419 0 0 6 0 0 0 162 0 0 2557 162 162 0 0 162 162 162
0 9955810.0 HY144797 02/08/2015 11:43:40 PM081XX S COLES AVE 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESSSTREET true false 422.0 4.0 7.0 46.0 18.0 1198273.0 1851626.0 2015.002/15/2015 12:43:39 PM41.747693646 -87.54903538900001 (41.747693646, -87.549035389)
1 9955861.0 HY144838 02/08/2015 11:41:42 PM118XX S STATE ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT true true 522.0 5.0 34.0 53.0 nan 1178335.0 1826581.0 2015.002/15/2015 12:43:39 PM41.679442289 -87.622850758 (41.679442289, -87.622850758)
2 9955801.0 HY144779 02/08/2015 11:30:22 PM002XX S LARAMIE AVE 2026.0 NARCOTICS POSS: PCP SIDEWALK true false 1522.0 15.0 29.0 25.0 18.0 1141717.0 1898581.0 2015.002/15/2015 12:43:39 PM41.877773330000004 -87.755117993 (41.87777333, -87.755117993)
3 9956197.0 HY144787 02/08/2015 11:30:23 PM006XX E 67TH ST 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESSSTREET true false 321.0 nan 6.0 42.0 18.0 nan nan 2015.002/15/2015 12:43:39 PMnan nan
4 9955846.0 HY144829 02/08/2015 11:30:58 PM0000X S MAYFIELD AVE610.0 BURGLARY FORCIBLE ENTRY APARTMENT false false 1513.0 15.0 29.0 25.0 5.0 1137239.0 1899372.0 2015.002/15/2015 12:43:39 PM41.880025548000006 -87.77154132400001 (41.880025548, -87.771541324)
5 9955835.0 HY144778 02/08/2015 11:30:21 PM010XX W 48TH ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT false true 933.0 9.0 3.0 61.0 nan 1169986.0 1873019.0 2015.002/15/2015 12:43:39 PM41.807059405000004 -87.65206589 (41.807059405, -87.65206589)
6 9955872.0 HY144822 02/08/2015 11:27:24 PM015XX W ARTHUR AVE 1320.0 CRIMINAL DAMAGETO VEHICLE STREET false false 2432.0 24.0 40.0 1.0 14.0 1164732.0 1943222.0 2015.002/15/2015 12:43:39 PM41.999814056000005 -87.669342967 (41.999814056, -87.669342967)
7 21752.0 HY144738 02/08/2015 11:26:12 PM060XX W GRAND AVE 110.0 HOMICIDE FIRST DEGREE MURDER STREET true false 2512.0 25.0 37.0 19.0 nan 1135910.0 1914206.0 2015.002/15/2015 12:43:39 PM41.920755683 -87.776067514 (41.920755683, -87.776067514)
8 9955808.0 HY144775 02/08/2015 11:20:33 PM001XX W WACKER DR 460.0 BATTERY SIMPLE OTHER false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 2015.002/15/2015 12:43:39 PM41.886707818000005 -87.63139635600001 (41.886707818, -87.631396356)
9 9958275.0 HY146732 02/08/2015 11:15:36 PM001XX W WACKER DR 460.0 BATTERY SIMPLE HOTEL/MOTEL false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 2015.002/15/2015 12:43:39 PM41.886707818000005 -87.63139635600001 (41.886707818, -87.631396356)

In [4]:
def refine_date_col(data, col, pattern):
    data[col]         = data[col].as_date(pattern)
    data["Day"]       = data[col].day()
    data["Month"]     = data[col].month() + 1     # Since H2O indexes from 0
    data["Year"]      = data[col].year() + 1900   # Start of epoch is 1900
    data["WeekNum"]   = data[col].week()
    data["WeekDay"]   = data[col].dayOfWeek()
    data["HourOfDay"] = data[col].hour()
    
    data.describe()  # HACK: Force evaluation before ifelse and cut. See PUBDEV-1425.
    
    # Create weekend and season cols
    # Spring = Mar, Apr, May. Summer = Jun, Jul, Aug. Autumn = Sep, Oct. Winter = Nov, Dec, Jan, Feb.
    # data["Weekend"]   = [1 if x in ("Sun", "Sat") else 0 for x in data["WeekDay"]]
    data["Weekend"] = ((data["WeekDay"] == "Sun") | (data["WeekDay"] == "Sat"))
    data["Season"] = data["Month"].cut([0, 2, 5, 7, 10, 12], ["Winter", "Spring", "Summer", "Autumn", "Winter"])
    
refine_date_col(crimes, "Date", "%m/%d/%Y %I:%M:%S %p")
crimes = crimes.drop("Date")
crimes.describe()


Rows:9,999 Cols:27

Chunk compression summary: 
chunk_type chunk_name count count_percentage size size_percentage
C0L Constant Integers 1 3.7037036 80 B 0.0110837
C1 1-Byte Integers 8 29.62963 78.6 KB 11.157955
C1N 1-Byte Integers (w/o NAs) 7 25.925926 68.8 KB 9.763211
C2 2-Byte Integers 4 14.814815 78.4 KB 11.12027
C4 4-Byte Integers 3 11.111112 117.4 KB 16.652143
C8 64-bit Integers 1 3.7037036 78.2 KB 11.092008
CStr String 1 3.7037036 127.0 KB 18.019316
C8D 64-bit Reals 2 7.4074073 156.4 KB 22.184013
Frame distribution summary: 
size number_of_rows number_of_chunks_per_column number_of_chunks
172.16.2.43:54321 704.9 KB 9999.0 1.0 27.0
mean 704.9 KB 9999.0 1.0 27.0
min 704.9 KB 9999.0 1.0 27.0
max 704.9 KB 9999.0 1.0 27.0
stddev 0 B 0.0 0.0 0.0
total 704.9 KB 9999.0 1.0 27.0

ID Case Number Date Block IUCR Primary Type Description Location Description Arrest Domestic Beat District Ward Community Area FBI Code X Coordinate Y Coordinate Year Updated On Latitude Longitude Location Day Month WeekNum WeekDay HourOfDay
type int string int enum int enum enum enum enum enum int int int int int int int int enum real real enum int int int enum int
mins 21735.0 NaN 1422030630000.0 0.0 110.0 0.0 0.0 0.0 0.0 0.0 111.0 1.0 1.0 1.0 2.0 1100317.0 1814255.0 3915.00.0 41.64507243 -87.906463888 0.0 1.0 2.0 4.0 0.0 0.0
mean 9931318.737373699NaN 1422714450809.2847NaN 1189.676513569939NaN NaN NaN 0.292829282928292740.152315231523152351159.618061806176511.34898851275791822.95409540954100837.44764476447653612.7401236226821141163880.59814984071885916.14984243083915.0NaN 41.842565224673535 -87.67414052209607 NaN 17.6839683968396632.419441944194423 5.1808180818082 NaN 13.631963196319662
maxs 9962898.0 NaN 1423467820000.0 6517.0 5131.0 26.0 198.0 90.0 1.0 1.0 2535.0 25.0 50.0 77.0 26.0 1205069.0 1951533.0 3915.032.0 42.022646183 -87.524773286 8603.0 31.0 3.0 6.0 6.0 23.0
sigma 396787.5642214295NaN 433879245.1905283 NaN 927.7514355826443NaN NaN NaN 0.4550835155878833 0.3593441468595258 695.7602987498396 6.945474933012859 13.64956611436129621.2748762223208567.574238579108433 16496.44936814723831274.01631985589 0.0 NaN 0.086018657935848240.06003579706529789NaN 11.1801043358277020.49349240678653860.7389298304087689NaN 6.4732173580715475
zeros 0 0 0 3 0 11 933 19 7071 8476 0 0 0 0 0 0 0 0 603 0 0 1 0 0 0 1038 374
missing0 0 0 0 419 0 0 6 0 0 0 162 0 0 2557 162 162 0 0 162 162 162 0 0 0 0 0
0 9955810.0 HY144797 1423467820000.0 081XX S COLES AVE 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESSSTREET true false 422.0 4.0 7.0 46.0 18.0 1198273.0 1851626.0 3915.002/15/2015 12:43:39 PM41.747693646 -87.54903538900001 (41.747693646, -87.549035389)8.0 3.0 6.0 Sun 23.0
1 9955861.0 HY144838 1423467702000.0 118XX S STATE ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT true true 522.0 5.0 34.0 53.0 nan 1178335.0 1826581.0 3915.002/15/2015 12:43:39 PM41.679442289 -87.622850758 (41.679442289, -87.622850758)8.0 3.0 6.0 Sun 23.0
2 9955801.0 HY144779 1423467022000.0 002XX S LARAMIE AVE 2026.0 NARCOTICS POSS: PCP SIDEWALK true false 1522.0 15.0 29.0 25.0 18.0 1141717.0 1898581.0 3915.002/15/2015 12:43:39 PM41.877773330000004 -87.755117993 (41.87777333, -87.755117993) 8.0 3.0 6.0 Sun 23.0
3 9956197.0 HY144787 1423467023000.0 006XX E 67TH ST 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESSSTREET true false 321.0 nan 6.0 42.0 18.0 nan nan 3915.002/15/2015 12:43:39 PMnan nan 8.0 3.0 6.0 Sun 23.0
4 9955846.0 HY144829 1423467058000.0 0000X S MAYFIELD AVE610.0 BURGLARY FORCIBLE ENTRY APARTMENT false false 1513.0 15.0 29.0 25.0 5.0 1137239.0 1899372.0 3915.002/15/2015 12:43:39 PM41.880025548000006 -87.77154132400001 (41.880025548, -87.771541324)8.0 3.0 6.0 Sun 23.0
5 9955835.0 HY144778 1423467021000.0 010XX W 48TH ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT false true 933.0 9.0 3.0 61.0 nan 1169986.0 1873019.0 3915.002/15/2015 12:43:39 PM41.807059405000004 -87.65206589 (41.807059405, -87.65206589) 8.0 3.0 6.0 Sun 23.0
6 9955872.0 HY144822 1423466844000.0 015XX W ARTHUR AVE 1320.0 CRIMINAL DAMAGETO VEHICLE STREET false false 2432.0 24.0 40.0 1.0 14.0 1164732.0 1943222.0 3915.002/15/2015 12:43:39 PM41.999814056000005 -87.669342967 (41.999814056, -87.669342967)8.0 3.0 6.0 Sun 23.0
7 21752.0 HY144738 1423466772000.0 060XX W GRAND AVE 110.0 HOMICIDE FIRST DEGREE MURDER STREET true false 2512.0 25.0 37.0 19.0 nan 1135910.0 1914206.0 3915.002/15/2015 12:43:39 PM41.920755683 -87.776067514 (41.920755683, -87.776067514)8.0 3.0 6.0 Sun 23.0
8 9955808.0 HY144775 1423466433000.0 001XX W WACKER DR 460.0 BATTERY SIMPLE OTHER false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 3915.002/15/2015 12:43:39 PM41.886707818000005 -87.63139635600001 (41.886707818, -87.631396356)8.0 3.0 6.0 Sun 23.0
9 9958275.0 HY146732 1423466136000.0 001XX W WACKER DR 460.0 BATTERY SIMPLE HOTEL/MOTEL false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 3915.002/15/2015 12:43:39 PM41.886707818000005 -87.63139635600001 (41.886707818, -87.631396356)8.0 3.0 6.0 Sun 23.0
Rows:9,999 Cols:28

Chunk compression summary: 
chunk_type chunk_name count count_percentage size size_percentage
C0L Constant Integers 1 3.5714288 80 B 0.0124154
CBS Bits 2 7.1428576 2.6 KB 0.4097082
C1 1-Byte Integers 8 28.57143 78.6 KB 12.498584
C1N 1-Byte Integers (w/o NAs) 7 25.0 68.8 KB 10.936261
C2 2-Byte Integers 4 14.285715 78.4 KB 12.456371
C4 4-Byte Integers 3 10.714286 117.4 KB 18.652899
CStr String 1 3.5714288 127.0 KB 20.184338
C8D 64-bit Reals 2 7.1428576 156.4 KB 24.849424
Frame distribution summary: 
size number_of_rows number_of_chunks_per_column number_of_chunks
172.16.2.43:54321 629.3 KB 9999.0 1.0 28.0
mean 629.3 KB 9999.0 1.0 28.0
min 629.3 KB 9999.0 1.0 28.0
max 629.3 KB 9999.0 1.0 28.0
stddev 0 B 0.0 0.0 0.0
total 629.3 KB 9999.0 1.0 28.0

ID Case Number Block IUCR Primary Type Description Location Description Arrest Domestic Beat District Ward Community Area FBI Code X Coordinate Y Coordinate Year Updated On Latitude Longitude Location Day Month WeekNum WeekDay HourOfDay Weekend Season
type int string enum int enum enum enum enum enum int int int int int int int int enum real real enum int int int enum int int enum
mins 21735.0 NaN 0.0 110.0 0.0 0.0 0.0 0.0 0.0 111.0 1.0 1.0 1.0 2.0 1100317.0 1814255.0 3915.00.0 41.64507243 -87.906463888 0.0 1.0 2.0 4.0 0.0 0.0 0.0 0.0
mean 9931318.737373699NaN NaN 1189.676513569939NaN NaN NaN 0.292829282928292740.152315231523152351159.618061806176511.34898851275791822.95409540954100837.44764476447653612.7401236226821141163880.59814984071885916.14984243083915.0NaN 41.842565224673535 -87.67414052209607 NaN 17.6839683968396632.419441944194423 5.1808180818082 NaN 13.6319631963196620.35753575357535755NaN
maxs 9962898.0 NaN 6517.0 5131.0 26.0 198.0 90.0 1.0 1.0 2535.0 25.0 50.0 77.0 26.0 1205069.0 1951533.0 3915.032.0 42.022646183 -87.524773286 8603.0 31.0 3.0 6.0 6.0 23.0 1.0 1.0
sigma 396787.5642214295NaN NaN 927.7514355826443NaN NaN NaN 0.4550835155878833 0.3593441468595258 695.7602987498396 6.945474933012859 13.64956611436129621.2748762223208567.574238579108433 16496.44936814723831274.01631985589 0.0 NaN 0.086018657935848240.06003579706529789NaN 11.1801043358277020.49349240678653860.7389298304087689NaN 6.47321735807154750.47929835538994453NaN
zeros 0 0 3 0 11 933 19 7071 8476 0 0 0 0 0 0 0 0 603 0 0 1 0 0 0 1038 374 6424 5805
missing0 0 0 419 0 0 6 0 0 0 162 0 0 2557 162 162 0 0 162 162 162 0 0 0 0 0 0 0
0 9955810.0 HY144797 081XX S COLES AVE 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESSSTREET true false 422.0 4.0 7.0 46.0 18.0 1198273.0 1851626.0 3915.002/15/2015 12:43:39 PM41.747693646 -87.54903538900001 (41.747693646, -87.549035389)8.0 3.0 6.0 Sun 23.0 1.0 Spring
1 9955861.0 HY144838 118XX S STATE ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT true true 522.0 5.0 34.0 53.0 nan 1178335.0 1826581.0 3915.002/15/2015 12:43:39 PM41.679442289 -87.622850758 (41.679442289, -87.622850758)8.0 3.0 6.0 Sun 23.0 1.0 Spring
2 9955801.0 HY144779 002XX S LARAMIE AVE 2026.0 NARCOTICS POSS: PCP SIDEWALK true false 1522.0 15.0 29.0 25.0 18.0 1141717.0 1898581.0 3915.002/15/2015 12:43:39 PM41.877773330000004 -87.755117993 (41.87777333, -87.755117993) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
3 9956197.0 HY144787 006XX E 67TH ST 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESSSTREET true false 321.0 nan 6.0 42.0 18.0 nan nan 3915.002/15/2015 12:43:39 PMnan nan 8.0 3.0 6.0 Sun 23.0 1.0 Spring
4 9955846.0 HY144829 0000X S MAYFIELD AVE610.0 BURGLARY FORCIBLE ENTRY APARTMENT false false 1513.0 15.0 29.0 25.0 5.0 1137239.0 1899372.0 3915.002/15/2015 12:43:39 PM41.880025548000006 -87.77154132400001 (41.880025548, -87.771541324)8.0 3.0 6.0 Sun 23.0 1.0 Spring
5 9955835.0 HY144778 010XX W 48TH ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT false true 933.0 9.0 3.0 61.0 nan 1169986.0 1873019.0 3915.002/15/2015 12:43:39 PM41.807059405000004 -87.65206589 (41.807059405, -87.65206589) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
6 9955872.0 HY144822 015XX W ARTHUR AVE 1320.0 CRIMINAL DAMAGETO VEHICLE STREET false false 2432.0 24.0 40.0 1.0 14.0 1164732.0 1943222.0 3915.002/15/2015 12:43:39 PM41.999814056000005 -87.669342967 (41.999814056, -87.669342967)8.0 3.0 6.0 Sun 23.0 1.0 Spring
7 21752.0 HY144738 060XX W GRAND AVE 110.0 HOMICIDE FIRST DEGREE MURDER STREET true false 2512.0 25.0 37.0 19.0 nan 1135910.0 1914206.0 3915.002/15/2015 12:43:39 PM41.920755683 -87.776067514 (41.920755683, -87.776067514)8.0 3.0 6.0 Sun 23.0 1.0 Spring
8 9955808.0 HY144775 001XX W WACKER DR 460.0 BATTERY SIMPLE OTHER false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 3915.002/15/2015 12:43:39 PM41.886707818000005 -87.63139635600001 (41.886707818, -87.631396356)8.0 3.0 6.0 Sun 23.0 1.0 Spring
9 9958275.0 HY146732 001XX W WACKER DR 460.0 BATTERY SIMPLE HOTEL/MOTEL false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 3915.002/15/2015 12:43:39 PM41.886707818000005 -87.63139635600001 (41.886707818, -87.631396356)8.0 3.0 6.0 Sun 23.0 1.0 Spring

In [5]:
# Merge crimes data with weather and census
census.set_name(0,"Community Area")
weather.set_name(1,"Month")
weather.set_name(2,"Day")
weather.set_name(3,"Year")
crimes.merge(census, all_x=True, all_y=False)
crimes.merge(weather, all_x=True, all_y=False)
crimes.describe()


Rows:9,999 Cols:28

Chunk compression summary: 
chunk_type chunk_name count count_percentage size size_percentage
C0L Constant Integers 1 3.5714288 80 B 0.0124154
CBS Bits 2 7.1428576 2.6 KB 0.4097082
C1 1-Byte Integers 8 28.57143 78.6 KB 12.498584
C1N 1-Byte Integers (w/o NAs) 7 25.0 68.8 KB 10.936261
C2 2-Byte Integers 4 14.285715 78.4 KB 12.456371
C4 4-Byte Integers 3 10.714286 117.4 KB 18.652899
CStr String 1 3.5714288 127.0 KB 20.184338
C8D 64-bit Reals 2 7.1428576 156.4 KB 24.849424
Frame distribution summary: 
size number_of_rows number_of_chunks_per_column number_of_chunks
172.16.2.43:54321 629.3 KB 9999.0 1.0 28.0
mean 629.3 KB 9999.0 1.0 28.0
min 629.3 KB 9999.0 1.0 28.0
max 629.3 KB 9999.0 1.0 28.0
stddev 0 B 0.0 0.0 0.0
total 629.3 KB 9999.0 1.0 28.0

ID Case Number Block IUCR Primary Type Description Location Description Arrest Domestic Beat District Ward Community Area FBI Code X Coordinate Y Coordinate Year Updated On Latitude Longitude Location Day Month WeekNum WeekDay HourOfDay Weekend Season
type int string enum int enum enum enum enum enum int int int int int int int int enum real real enum int int int enum int int enum
mins 21735.0 NaN 0.0 110.0 0.0 0.0 0.0 0.0 0.0 111.0 1.0 1.0 1.0 2.0 1100317.0 1814255.0 3915.00.0 41.64507243 -87.906463888 0.0 1.0 2.0 4.0 0.0 0.0 0.0 0.0
mean 9931318.737373699NaN NaN 1189.676513569939NaN NaN NaN 0.292829282928292740.152315231523152351159.618061806176511.34898851275791822.95409540954100837.44764476447653612.7401236226821141163880.59814984071885916.14984243083915.0NaN 41.842565224673535 -87.67414052209607 NaN 17.6839683968396632.419441944194423 5.1808180818082 NaN 13.6319631963196620.35753575357535755NaN
maxs 9962898.0 NaN 6517.0 5131.0 26.0 198.0 90.0 1.0 1.0 2535.0 25.0 50.0 77.0 26.0 1205069.0 1951533.0 3915.032.0 42.022646183 -87.524773286 8603.0 31.0 3.0 6.0 6.0 23.0 1.0 1.0
sigma 396787.5642214295NaN NaN 927.7514355826443NaN NaN NaN 0.4550835155878833 0.3593441468595258 695.7602987498396 6.945474933012859 13.64956611436129621.2748762223208567.574238579108433 16496.44936814723831274.01631985589 0.0 NaN 0.086018657935848240.06003579706529789NaN 11.1801043358277020.49349240678653860.7389298304087689NaN 6.47321735807154750.47929835538994453NaN
zeros 0 0 3 0 11 933 19 7071 8476 0 0 0 0 0 0 0 0 603 0 0 1 0 0 0 1038 374 6424 5805
missing0 0 0 419 0 0 6 0 0 0 162 0 0 2557 162 162 0 0 162 162 162 0 0 0 0 0 0 0
0 9955810.0 HY144797 081XX S COLES AVE 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESSSTREET true false 422.0 4.0 7.0 46.0 18.0 1198273.0 1851626.0 3915.002/15/2015 12:43:39 PM41.747693646 -87.54903538900001 (41.747693646, -87.549035389)8.0 3.0 6.0 Sun 23.0 1.0 Spring
1 9955861.0 HY144838 118XX S STATE ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT true true 522.0 5.0 34.0 53.0 nan 1178335.0 1826581.0 3915.002/15/2015 12:43:39 PM41.679442289 -87.622850758 (41.679442289, -87.622850758)8.0 3.0 6.0 Sun 23.0 1.0 Spring
2 9955801.0 HY144779 002XX S LARAMIE AVE 2026.0 NARCOTICS POSS: PCP SIDEWALK true false 1522.0 15.0 29.0 25.0 18.0 1141717.0 1898581.0 3915.002/15/2015 12:43:39 PM41.877773330000004 -87.755117993 (41.87777333, -87.755117993) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
3 9956197.0 HY144787 006XX E 67TH ST 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESSSTREET true false 321.0 nan 6.0 42.0 18.0 nan nan 3915.002/15/2015 12:43:39 PMnan nan 8.0 3.0 6.0 Sun 23.0 1.0 Spring
4 9955846.0 HY144829 0000X S MAYFIELD AVE610.0 BURGLARY FORCIBLE ENTRY APARTMENT false false 1513.0 15.0 29.0 25.0 5.0 1137239.0 1899372.0 3915.002/15/2015 12:43:39 PM41.880025548000006 -87.77154132400001 (41.880025548, -87.771541324)8.0 3.0 6.0 Sun 23.0 1.0 Spring
5 9955835.0 HY144778 010XX W 48TH ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT false true 933.0 9.0 3.0 61.0 nan 1169986.0 1873019.0 3915.002/15/2015 12:43:39 PM41.807059405000004 -87.65206589 (41.807059405, -87.65206589) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
6 9955872.0 HY144822 015XX W ARTHUR AVE 1320.0 CRIMINAL DAMAGETO VEHICLE STREET false false 2432.0 24.0 40.0 1.0 14.0 1164732.0 1943222.0 3915.002/15/2015 12:43:39 PM41.999814056000005 -87.669342967 (41.999814056, -87.669342967)8.0 3.0 6.0 Sun 23.0 1.0 Spring
7 21752.0 HY144738 060XX W GRAND AVE 110.0 HOMICIDE FIRST DEGREE MURDER STREET true false 2512.0 25.0 37.0 19.0 nan 1135910.0 1914206.0 3915.002/15/2015 12:43:39 PM41.920755683 -87.776067514 (41.920755683, -87.776067514)8.0 3.0 6.0 Sun 23.0 1.0 Spring
8 9955808.0 HY144775 001XX W WACKER DR 460.0 BATTERY SIMPLE OTHER false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 3915.002/15/2015 12:43:39 PM41.886707818000005 -87.63139635600001 (41.886707818, -87.631396356)8.0 3.0 6.0 Sun 23.0 1.0 Spring
9 9958275.0 HY146732 001XX W WACKER DR 460.0 BATTERY SIMPLE HOTEL/MOTEL false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 3915.002/15/2015 12:43:39 PM41.886707818000005 -87.63139635600001 (41.886707818, -87.631396356)8.0 3.0 6.0 Sun 23.0 1.0 Spring

In [6]:
# Create test/train split
r = crimes["Arrest"].runif(1234)
train = crimes[r < 0.8]
test = crimes[r >= 0.8]

# Simple GBM - Predict Arrest
crimes_names_x = crimes.names[:]
crimes_names_x.remove("Arrest")
data_gbm = H2OGradientBoostingEstimator(ntrees         =10,
                                        max_depth      =6,
                                        distribution   ="bernoulli")

data_gbm.train(x               =crimes_names_x,
               y               ="Arrest",
               training_frame  =train,
               validation_frame=test)

# Simple Deep Learning - Predict Arrest
data_dl = H2ODeepLearningEstimator(variable_importances=True,
                                   loss                ="Automatic")

data_dl.train(x                =crimes_names_x,
              y                ="Arrest",
              training_frame  =train,
              validation_frame=test)


gbm Model Build Progress: [##################################################] 100%

deeplearning Model Build Progress: [##################################################] 100%

In [7]:
# GBM performance on train/test data
train_auc_gbm = data_gbm.model_performance(train).auc()
test_auc_gbm  = data_gbm.model_performance(test) .auc()

# Deep Learning performance on train/test data
train_auc_dl = data_dl.model_performance(train).auc()
test_auc_dl  = data_dl.model_performance(test) .auc()

# Make a pretty HTML table printout of the results
header = ["Model", "AUC Train", "AUC Test"]
table  = [
           ["GBM", train_auc_gbm, test_auc_gbm],
           ["DL ", train_auc_dl, test_auc_dl]
         ]
h2o.display.H2ODisplay(table, header)


Model AUC Train AUC Test
GBM 0.9568221 0.9307979
DL 0.8956055 0.8841564
Out[7]:
Model AUC Train AUC Test
GBM 0.9568221 0.9307979
DL 0.8956055 0.8841564

In [8]:
# Create new H2OFrame of crime observations
examples = {
            "Date":                 ["02/08/2015 11:43:58 PM", "02/08/2015 11:00:39 PM"],
            "IUCR":                 [1811, 1150],
            "Primary.Type":         ["NARCOTICS", "DECEPTIVE PRACTICE"],
            "Location.Description": ["STREET", "RESIDENCE"],
            "Domestic":             ["false", "false"],
            "Beat":                 [422, 923],
            "District":             [4, 9],
            "Ward":                 [7, 14],
            "Community.Area":       [46, 63],
            "FBI.Code":             [18, 11]
            }

crime_examples = h2o.H2OFrame(examples)

# Refine date column and merge with census data
refine_date_col(crime_examples, "Date", "%m/%d/%Y %I:%M:%S %p")
crime_examples.drop("Date")
census.set_name(0,"Community.Area")
crime_examples.merge(census, all_x=True, all_y=False)
crime_examples.describe()


Parse Progress: [##################################################] 100%
Rows:2 Cols:16

Chunk compression summary: 
chunk_type chunk_name count count_percentage size size_percentage
C0L Constant Integers 7 43.75 560 B 43.75
C1N 1-Byte Integers (w/o NAs) 4 25.0 280 B 21.875
C2 2-Byte Integers 2 12.5 144 B 11.25
C2S 2-Byte Fractions 1 6.25 88 B 6.875
CStr String 2 12.5 208 B 16.25
Frame distribution summary: 
size number_of_rows number_of_chunks_per_column number_of_chunks
172.16.2.43:54321 1.3 KB 2.0 1.0 16.0
mean 1.3 KB 2.0 1.0 16.0
min 1.3 KB 2.0 1.0 16.0
max 1.3 KB 2.0 1.0 16.0
stddev 0 B 0.0 0.0 0.0
total 1.3 KB 2.0 1.0 16.0

Primary.Type Domestic FBI.Code Ward District Community.Area Location.Description Date IUCR Beat Day Month Year WeekNum WeekDay HourOfDay
type string enum int int int int string int int int int int int int enum int
mins NaN 0.0 11.0 7.0 4.0 46.0 NaN 1423465239000.0 1150.0 422.0 8.0 3.0 3915.06.0 6.0 23.0
mean NaN 0.0 14.5 10.5 6.5 54.5 NaN 1423466538500.0 1480.5 672.5 8.0 3.0 3915.06.0 NaN 23.0
maxs NaN 0.0 18.0 14.0 9.0 63.0 NaN 1423467838000.0 1811.0 923.0 8.0 3.0 3915.06.0 6.0 23.0
sigma NaN 0.0 4.9497474683058334.9497474683058333.535533905932737812.020815280171307NaN 1837770.524303837467.39758236430794354.260497374460330.0 0.0 0.0 0.0 NaN 0.0
zeros 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0
missing0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 NARCOTICS false 18.0 7.0 4.0 46.0 STREET 1423467838000.0 1811.0 422.0 8.0 3.0 3915.06.0 Sun 23.0
1 DECEPTIVE PRACTICEfalse 11.0 14.0 9.0 63.0 RESIDENCE 1423465239000.0 1150.0 923.0 8.0 3.0 3915.06.0 Sun 23.0
Rows:2 Cols:18

Chunk compression summary: 
chunk_type chunk_name count count_percentage size size_percentage
C0L Constant Integers 9 50.0 720 B 50.0
C1N 1-Byte Integers (w/o NAs) 4 22.222223 280 B 19.444445
C2 2-Byte Integers 2 11.111112 144 B 10.0
C2S 2-Byte Fractions 1 5.555556 88 B 6.111111
CStr String 2 11.111112 208 B 14.444445
Frame distribution summary: 
size number_of_rows number_of_chunks_per_column number_of_chunks
172.16.2.43:54321 1.4 KB 2.0 1.0 18.0
mean 1.4 KB 2.0 1.0 18.0
min 1.4 KB 2.0 1.0 18.0
max 1.4 KB 2.0 1.0 18.0
stddev 0 B 0.0 0.0 0.0
total 1.4 KB 2.0 1.0 18.0

Primary.Type Domestic FBI.Code Ward District Community.Area Location.Description Date IUCR Beat Day Month Year WeekNum WeekDay HourOfDay Weekend Season
type string enum int int int int string int int int int int int int enum int int enum
mins NaN 0.0 11.0 7.0 4.0 46.0 NaN 1423465239000.0 1150.0 422.0 8.0 3.0 3915.06.0 6.0 23.0 1.0 1.0
mean NaN 0.0 14.5 10.5 6.5 54.5 NaN 1423466538500.0 1480.5 672.5 8.0 3.0 3915.06.0 NaN 23.0 1.0 NaN
maxs NaN 0.0 18.0 14.0 9.0 63.0 NaN 1423467838000.0 1811.0 923.0 8.0 3.0 3915.06.0 6.0 23.0 1.0 1.0
sigma NaN 0.0 4.9497474683058334.9497474683058333.535533905932737812.020815280171307NaN 1837770.524303837467.39758236430794354.260497374460330.0 0.0 0.0 0.0 NaN 0.0 0.0 NaN
zeros 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
missing0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 NARCOTICS false 18.0 7.0 4.0 46.0 STREET 1423467838000.0 1811.0 422.0 8.0 3.0 3915.06.0 Sun 23.0 1.0 Spring
1 DECEPTIVE PRACTICEfalse 11.0 14.0 9.0 63.0 RESIDENCE 1423465239000.0 1150.0 923.0 8.0 3.0 3915.06.0 Sun 23.0 1.0 Spring

In [9]:
# Predict probability of arrest from new observations
gbm_pred = data_gbm.predict(crime_examples)
dl_pred  = data_dl .predict(crime_examples)

# Make a pretty HTML table printout of the results
header = ["FBI Code", "GBM Arrest Prob", "DL Arrest Prob"]
table  = [
           [examples["FBI.Code"][0], gbm_pred[0,"true"], dl_pred[0,"true"]],
           [examples["FBI.Code"][1], gbm_pred[1,"true"], dl_pred[1,"true"]]
         ]
h2o.display.H2ODisplay(table, header)


FBI Code GBM Arrest Prob DL Arrest Prob
18 0.1199714 0.3047381
11 0.1199714 0.2496035
Out[9]:
FBI Code GBM Arrest Prob DL Arrest Prob
18 0.1199714 0.3047381
11 0.1199714 0.2496035