In [1]:
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
In [2]:
# Connect to a cluster
h2o.init()
H2O cluster uptime:
1 hours 13 minutes 22 seconds 521 milliseconds
H2O cluster version:
3.7.0.99999
H2O cluster name:
ludirehak
H2O cluster total nodes:
1
H2O cluster total free memory:
3.24 GB
H2O cluster total cores:
8
H2O cluster allowed cores:
8
H2O cluster healthy:
True
H2O Connection ip:
127.0.0.1
H2O Connection port:
54321
H2O Connection proxy:
None
Python Version:
3.5.1
In [3]:
from h2o.utils.shared_utils import _locate # private function. used to find files within h2o git project directory.
weather_path = _locate("smalldata/chicago/chicagoAllWeather.csv")
census_path = _locate("smalldata/chicago/chicagoCensus.csv")
crimes_path = _locate("smalldata/chicago/chicagoCrimes10k.csv.zip")
print("Import and Parse weather data")
weather = h2o.import_file(path=weather_path, col_types = ["time"] + ["numeric"]*6)
weather.drop("date")
weather.describe()
print("Import and Parse census data")
census = h2o.import_file(path=census_path, col_types = ["numeric", "enum"] + ["numeric"]*7)
census.describe()
print("Import and Parse crimes data")
crimes = h2o.import_file(path=crimes_path)
crimes.describe()
Import and Parse weather data
Parse Progress: [##################################################] 100%
Rows:5,162 Cols:7
Chunk compression summary:
chunk_type
chunk_name
count
count_percentage
size
size_percentage
C0L
Constant Integers
19
8.225108
1.5 KB
3.3811588
C0D
Constant Reals
33
14.285715
2.6 KB
5.872539
C1
1-Byte Integers
8
3.4632034
1.7 KB
3.9528418
C1N
1-Byte Integers (w/o NAs)
135
58.441555
29.6 KB
67.35625
C1S
1-Byte Fractions
36
15.584415
8.5 KB
19.437214
Frame distribution summary:
size
number_of_rows
number_of_chunks_per_column
number_of_chunks
172.16.2.43:54321
43.9 KB
5162.0
33.0
231.0
mean
43.9 KB
5162.0
33.0
231.0
min
43.9 KB
5162.0
33.0
231.0
max
43.9 KB
5162.0
33.0
231.0
stddev
0 B
0.0
0.0
0.0
total
43.9 KB
5162.0
33.0
231.0
date month day year maxTemp meanTemp minTemp
type time int int int int int int
mins NaN 1.0 1.0 2001.0 -2.0 -9.0 -18.0
mean 0.0 6.47442851607904 15.708252615265401 2007.5714839209609 58.871042920955524 50.31035152456788 41.4812584967955
maxs NaN 12.0 31.0 2015.0 103.0 93.0 82.0
sigma -0.0 3.469051716937685 8.798951739966594 4.077340905700527 21.482977723685387 19.930239926608884 19.020729712312264
zeros -5162 0 0 0 0 2 16
missing 5162 0 0 0 13 13 13
0 nan 1.0 1.0 2001.0 23.0 14.0 6.0
1 nan 1.0 2.0 2001.0 18.0 12.0 6.0
2 nan 1.0 3.0 2001.0 28.0 18.0 8.0
3 nan 1.0 4.0 2001.0 30.0 24.0 19.0
4 nan 1.0 5.0 2001.0 36.0 30.0 21.0
5 nan 1.0 6.0 2001.0 33.0 26.0 19.0
6 nan 1.0 7.0 2001.0 34.0 28.0 21.0
7 nan 1.0 8.0 2001.0 26.0 20.0 14.0
8 nan 1.0 9.0 2001.0 23.0 16.0 10.0
9 nan 1.0 10.0 2001.0 34.0 26.0 19.0
Import and Parse census data
Parse Progress: [##################################################] 100%
Rows:79 Cols:9
Chunk compression summary:
chunk_type
chunk_name
count
count_percentage
size
size_percentage
C1
1-Byte Integers
3
33.333336
441 B
22.546013
C1S
1-Byte Fractions
1
11.111112
163 B
8.333334
C2S
2-Byte Fractions
4
44.444447
968 B
49.488754
C4
4-Byte Integers
1
11.111112
384 B
19.6319
Frame distribution summary:
size
number_of_rows
number_of_chunks_per_column
number_of_chunks
172.16.2.43:54321
1.9 KB
79.0
1.0
9.0
mean
1.9 KB
79.0
1.0
9.0
min
1.9 KB
79.0
1.0
9.0
max
1.9 KB
79.0
1.0
9.0
stddev
0 B
0.0
0.0
0.0
total
1.9 KB
79.0
1.0
9.0
Community Area Number COMMUNITY AREA NAME PERCENT OF HOUSING CROWDED PERCENT HOUSEHOLDS BELOW POVERTY PERCENT AGED 16 UNEMPLOYED PERCENT AGED 25 WITHOUT HIGH SCHOOL DIPLOMA PERCENT AGED UNDER 18 OR OVER 64 PER CAPITA INCOME HARDSHIP INDEX
type int enum real real real real real int int
mins 1.0 0.0 0.30000000000000004 3.3000000000000003 4.7 2.5 13.5 8201.0 1.0
mean 39.0 NaN 4.920512820512822 21.73974358974359 15.341025641025642 20.33076923076924 35.71794871794871 25597.000000000004 49.506493506493506
maxs 77.0 78.0 15.8 56.5 35.9 54.800000000000004 51.5 88669.0 98.0
sigma 22.371857321197094 NaN 3.6589814413502006 11.457230912971083 7.49949670860991 11.746514351100048 7.284421084944952 15196.405541331917 28.69055565156158
zeros 0 1 0 0 0 0 0 0 0
missing 2 0 1 1 1 1 1 1 2
0 nan COMMUNITY AREA NAME nan nan nan nan nan nan nan
1 1.0 Rogers Park 7.7 23.6 8.700000000000001 18.2 27.5 23939.0 39.0
2 2.0 West Ridge 7.800000000000001 17.2 8.8 20.8 38.5 23040.0 46.0
3 3.0 Uptown 3.8000000000000003 24.0 8.9 11.8 22.200000000000003 35787.0 20.0
4 4.0 Lincoln Square 3.4000000000000004 10.9 8.200000000000001 13.4 25.5 37524.0 17.0
5 5.0 North Center 0.30000000000000004 7.5 5.2 4.5 26.200000000000003 57123.0 6.0
6 6.0 Lake View 1.1 11.4 4.7 2.6 17.0 60058.0 5.0
7 7.0 Lincoln Park 0.8 12.3 5.1000000000000005 3.6 21.5 71551.0 2.0
8 8.0 Near North Side 1.9000000000000001 12.9 7.0 2.5 22.6 88669.0 1.0
9 9.0 Edison Park 1.1 3.3000000000000003 6.5 7.4 35.300000000000004 40959.0 8.0
Import and Parse crimes data
Parse Progress: [##################################################] 100%
Rows:9,999 Cols:22
Chunk compression summary:
chunk_type
chunk_name
count
count_percentage
size
size_percentage
C0L
Constant Integers
1
4.5454545
80 B
0.0092869
C1
1-Byte Integers
8
36.363636
78.6 KB
9.349084
C1N
1-Byte Integers (w/o NAs)
2
9.090909
19.7 KB
2.337271
C2
2-Byte Integers
4
18.181818
78.4 KB
9.317509
C4
4-Byte Integers
3
13.636364
117.4 KB
13.952581
CStr
String
2
9.090909
390.7 KB
46.446617
C8D
64-bit Reals
2
9.090909
156.4 KB
18.587654
Frame distribution summary:
size
number_of_rows
number_of_chunks_per_column
number_of_chunks
172.16.2.43:54321
841.2 KB
9999.0
1.0
22.0
mean
841.2 KB
9999.0
1.0
22.0
min
841.2 KB
9999.0
1.0
22.0
max
841.2 KB
9999.0
1.0
22.0
stddev
0 B
0.0
0.0
0.0
total
841.2 KB
9999.0
1.0
22.0
ID Case Number Date Block IUCR Primary Type Description Location Description Arrest Domestic Beat District Ward Community Area FBI Code X Coordinate Y Coordinate Year Updated On Latitude Longitude Location
type int string string enum int enum enum enum enum enum int int int int int int int int enum real real enum
mins 21735.0 NaN NaN 0.0 110.0 0.0 0.0 0.0 0.0 0.0 111.0 1.0 1.0 1.0 2.0 1100317.0 1814255.0 2015.0 0.0 41.64507243 -87.906463888 0.0
mean 9931318.737373699 NaN NaN NaN 1189.676513569939 NaN NaN NaN 0.29282928292829274 0.15231523152315235 1159.6180618061765 11.348988512757918 22.954095409541008 37.447644764476536 12.740123622682114 1163880.5981498407 1885916.1498424308 2015.0 NaN 41.842565224673535 -87.67414052209607 NaN
maxs 9962898.0 NaN NaN 6517.0 5131.0 26.0 198.0 90.0 1.0 1.0 2535.0 25.0 50.0 77.0 26.0 1205069.0 1951533.0 2015.0 32.0 42.022646183 -87.524773286 8603.0
sigma 396787.5642214295 NaN NaN NaN 927.7514355826443 NaN NaN NaN 0.4550835155878833 0.3593441468595258 695.7602987498396 6.945474933012859 13.649566114361296 21.274876222320856 7.574238579108433 16496.449368147238 31274.01631985589 0.0 NaN 0.08601865793584824 0.06003579706529789 NaN
zeros 0 0 0 3 0 11 933 19 7071 8476 0 0 0 0 0 0 0 0 603 0 0 1
missing 0 0 0 0 419 0 0 6 0 0 0 162 0 0 2557 162 162 0 0 162 162 162
0 9955810.0 HY144797 02/08/2015 11:43:40 PM 081XX S COLES AVE 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESS STREET true false 422.0 4.0 7.0 46.0 18.0 1198273.0 1851626.0 2015.0 02/15/2015 12:43:39 PM 41.747693646 -87.54903538900001 (41.747693646, -87.549035389)
1 9955861.0 HY144838 02/08/2015 11:41:42 PM 118XX S STATE ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT true true 522.0 5.0 34.0 53.0 nan 1178335.0 1826581.0 2015.0 02/15/2015 12:43:39 PM 41.679442289 -87.622850758 (41.679442289, -87.622850758)
2 9955801.0 HY144779 02/08/2015 11:30:22 PM 002XX S LARAMIE AVE 2026.0 NARCOTICS POSS: PCP SIDEWALK true false 1522.0 15.0 29.0 25.0 18.0 1141717.0 1898581.0 2015.0 02/15/2015 12:43:39 PM 41.877773330000004 -87.755117993 (41.87777333, -87.755117993)
3 9956197.0 HY144787 02/08/2015 11:30:23 PM 006XX E 67TH ST 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESS STREET true false 321.0 nan 6.0 42.0 18.0 nan nan 2015.0 02/15/2015 12:43:39 PM nan nan
4 9955846.0 HY144829 02/08/2015 11:30:58 PM 0000X S MAYFIELD AVE 610.0 BURGLARY FORCIBLE ENTRY APARTMENT false false 1513.0 15.0 29.0 25.0 5.0 1137239.0 1899372.0 2015.0 02/15/2015 12:43:39 PM 41.880025548000006 -87.77154132400001 (41.880025548, -87.771541324)
5 9955835.0 HY144778 02/08/2015 11:30:21 PM 010XX W 48TH ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT false true 933.0 9.0 3.0 61.0 nan 1169986.0 1873019.0 2015.0 02/15/2015 12:43:39 PM 41.807059405000004 -87.65206589 (41.807059405, -87.65206589)
6 9955872.0 HY144822 02/08/2015 11:27:24 PM 015XX W ARTHUR AVE 1320.0 CRIMINAL DAMAGE TO VEHICLE STREET false false 2432.0 24.0 40.0 1.0 14.0 1164732.0 1943222.0 2015.0 02/15/2015 12:43:39 PM 41.999814056000005 -87.669342967 (41.999814056, -87.669342967)
7 21752.0 HY144738 02/08/2015 11:26:12 PM 060XX W GRAND AVE 110.0 HOMICIDE FIRST DEGREE MURDER STREET true false 2512.0 25.0 37.0 19.0 nan 1135910.0 1914206.0 2015.0 02/15/2015 12:43:39 PM 41.920755683 -87.776067514 (41.920755683, -87.776067514)
8 9955808.0 HY144775 02/08/2015 11:20:33 PM 001XX W WACKER DR 460.0 BATTERY SIMPLE OTHER false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 2015.0 02/15/2015 12:43:39 PM 41.886707818000005 -87.63139635600001 (41.886707818, -87.631396356)
9 9958275.0 HY146732 02/08/2015 11:15:36 PM 001XX W WACKER DR 460.0 BATTERY SIMPLE HOTEL/MOTEL false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 2015.0 02/15/2015 12:43:39 PM 41.886707818000005 -87.63139635600001 (41.886707818, -87.631396356)
In [4]:
def refine_date_col(data, col, pattern):
data[col] = data[col].as_date(pattern)
data["Day"] = data[col].day()
data["Month"] = data[col].month() + 1 # Since H2O indexes from 0
data["Year"] = data[col].year() + 1900 # Start of epoch is 1900
data["WeekNum"] = data[col].week()
data["WeekDay"] = data[col].dayOfWeek()
data["HourOfDay"] = data[col].hour()
data.describe() # HACK: Force evaluation before ifelse and cut. See PUBDEV-1425.
# Create weekend and season cols
# Spring = Mar, Apr, May. Summer = Jun, Jul, Aug. Autumn = Sep, Oct. Winter = Nov, Dec, Jan, Feb.
# data["Weekend"] = [1 if x in ("Sun", "Sat") else 0 for x in data["WeekDay"]]
data["Weekend"] = ((data["WeekDay"] == "Sun") | (data["WeekDay"] == "Sat"))
data["Season"] = data["Month"].cut([0, 2, 5, 7, 10, 12], ["Winter", "Spring", "Summer", "Autumn", "Winter"])
refine_date_col(crimes, "Date", "%m/%d/%Y %I:%M:%S %p")
crimes = crimes.drop("Date")
crimes.describe()
Rows:9,999 Cols:27
Chunk compression summary:
chunk_type
chunk_name
count
count_percentage
size
size_percentage
C0L
Constant Integers
1
3.7037036
80 B
0.0110837
C1
1-Byte Integers
8
29.62963
78.6 KB
11.157955
C1N
1-Byte Integers (w/o NAs)
7
25.925926
68.8 KB
9.763211
C2
2-Byte Integers
4
14.814815
78.4 KB
11.12027
C4
4-Byte Integers
3
11.111112
117.4 KB
16.652143
C8
64-bit Integers
1
3.7037036
78.2 KB
11.092008
CStr
String
1
3.7037036
127.0 KB
18.019316
C8D
64-bit Reals
2
7.4074073
156.4 KB
22.184013
Frame distribution summary:
size
number_of_rows
number_of_chunks_per_column
number_of_chunks
172.16.2.43:54321
704.9 KB
9999.0
1.0
27.0
mean
704.9 KB
9999.0
1.0
27.0
min
704.9 KB
9999.0
1.0
27.0
max
704.9 KB
9999.0
1.0
27.0
stddev
0 B
0.0
0.0
0.0
total
704.9 KB
9999.0
1.0
27.0
ID Case Number Date Block IUCR Primary Type Description Location Description Arrest Domestic Beat District Ward Community Area FBI Code X Coordinate Y Coordinate Year Updated On Latitude Longitude Location Day Month WeekNum WeekDay HourOfDay
type int string int enum int enum enum enum enum enum int int int int int int int int enum real real enum int int int enum int
mins 21735.0 NaN 1422030630000.0 0.0 110.0 0.0 0.0 0.0 0.0 0.0 111.0 1.0 1.0 1.0 2.0 1100317.0 1814255.0 3915.0 0.0 41.64507243 -87.906463888 0.0 1.0 2.0 4.0 0.0 0.0
mean 9931318.737373699 NaN 1422714450809.2847 NaN 1189.676513569939 NaN NaN NaN 0.29282928292829274 0.15231523152315235 1159.6180618061765 11.348988512757918 22.954095409541008 37.447644764476536 12.740123622682114 1163880.5981498407 1885916.1498424308 3915.0 NaN 41.842565224673535 -87.67414052209607 NaN 17.683968396839663 2.419441944194423 5.1808180818082 NaN 13.631963196319662
maxs 9962898.0 NaN 1423467820000.0 6517.0 5131.0 26.0 198.0 90.0 1.0 1.0 2535.0 25.0 50.0 77.0 26.0 1205069.0 1951533.0 3915.0 32.0 42.022646183 -87.524773286 8603.0 31.0 3.0 6.0 6.0 23.0
sigma 396787.5642214295 NaN 433879245.1905283 NaN 927.7514355826443 NaN NaN NaN 0.4550835155878833 0.3593441468595258 695.7602987498396 6.945474933012859 13.649566114361296 21.274876222320856 7.574238579108433 16496.449368147238 31274.01631985589 0.0 NaN 0.08601865793584824 0.06003579706529789 NaN 11.180104335827702 0.4934924067865386 0.7389298304087689 NaN 6.4732173580715475
zeros 0 0 0 3 0 11 933 19 7071 8476 0 0 0 0 0 0 0 0 603 0 0 1 0 0 0 1038 374
missing 0 0 0 0 419 0 0 6 0 0 0 162 0 0 2557 162 162 0 0 162 162 162 0 0 0 0 0
0 9955810.0 HY144797 1423467820000.0 081XX S COLES AVE 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESS STREET true false 422.0 4.0 7.0 46.0 18.0 1198273.0 1851626.0 3915.0 02/15/2015 12:43:39 PM 41.747693646 -87.54903538900001 (41.747693646, -87.549035389) 8.0 3.0 6.0 Sun 23.0
1 9955861.0 HY144838 1423467702000.0 118XX S STATE ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT true true 522.0 5.0 34.0 53.0 nan 1178335.0 1826581.0 3915.0 02/15/2015 12:43:39 PM 41.679442289 -87.622850758 (41.679442289, -87.622850758) 8.0 3.0 6.0 Sun 23.0
2 9955801.0 HY144779 1423467022000.0 002XX S LARAMIE AVE 2026.0 NARCOTICS POSS: PCP SIDEWALK true false 1522.0 15.0 29.0 25.0 18.0 1141717.0 1898581.0 3915.0 02/15/2015 12:43:39 PM 41.877773330000004 -87.755117993 (41.87777333, -87.755117993) 8.0 3.0 6.0 Sun 23.0
3 9956197.0 HY144787 1423467023000.0 006XX E 67TH ST 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESS STREET true false 321.0 nan 6.0 42.0 18.0 nan nan 3915.0 02/15/2015 12:43:39 PM nan nan 8.0 3.0 6.0 Sun 23.0
4 9955846.0 HY144829 1423467058000.0 0000X S MAYFIELD AVE 610.0 BURGLARY FORCIBLE ENTRY APARTMENT false false 1513.0 15.0 29.0 25.0 5.0 1137239.0 1899372.0 3915.0 02/15/2015 12:43:39 PM 41.880025548000006 -87.77154132400001 (41.880025548, -87.771541324) 8.0 3.0 6.0 Sun 23.0
5 9955835.0 HY144778 1423467021000.0 010XX W 48TH ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT false true 933.0 9.0 3.0 61.0 nan 1169986.0 1873019.0 3915.0 02/15/2015 12:43:39 PM 41.807059405000004 -87.65206589 (41.807059405, -87.65206589) 8.0 3.0 6.0 Sun 23.0
6 9955872.0 HY144822 1423466844000.0 015XX W ARTHUR AVE 1320.0 CRIMINAL DAMAGE TO VEHICLE STREET false false 2432.0 24.0 40.0 1.0 14.0 1164732.0 1943222.0 3915.0 02/15/2015 12:43:39 PM 41.999814056000005 -87.669342967 (41.999814056, -87.669342967) 8.0 3.0 6.0 Sun 23.0
7 21752.0 HY144738 1423466772000.0 060XX W GRAND AVE 110.0 HOMICIDE FIRST DEGREE MURDER STREET true false 2512.0 25.0 37.0 19.0 nan 1135910.0 1914206.0 3915.0 02/15/2015 12:43:39 PM 41.920755683 -87.776067514 (41.920755683, -87.776067514) 8.0 3.0 6.0 Sun 23.0
8 9955808.0 HY144775 1423466433000.0 001XX W WACKER DR 460.0 BATTERY SIMPLE OTHER false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 3915.0 02/15/2015 12:43:39 PM 41.886707818000005 -87.63139635600001 (41.886707818, -87.631396356) 8.0 3.0 6.0 Sun 23.0
9 9958275.0 HY146732 1423466136000.0 001XX W WACKER DR 460.0 BATTERY SIMPLE HOTEL/MOTEL false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 3915.0 02/15/2015 12:43:39 PM 41.886707818000005 -87.63139635600001 (41.886707818, -87.631396356) 8.0 3.0 6.0 Sun 23.0
Rows:9,999 Cols:28
Chunk compression summary:
chunk_type
chunk_name
count
count_percentage
size
size_percentage
C0L
Constant Integers
1
3.5714288
80 B
0.0124154
CBS
Bits
2
7.1428576
2.6 KB
0.4097082
C1
1-Byte Integers
8
28.57143
78.6 KB
12.498584
C1N
1-Byte Integers (w/o NAs)
7
25.0
68.8 KB
10.936261
C2
2-Byte Integers
4
14.285715
78.4 KB
12.456371
C4
4-Byte Integers
3
10.714286
117.4 KB
18.652899
CStr
String
1
3.5714288
127.0 KB
20.184338
C8D
64-bit Reals
2
7.1428576
156.4 KB
24.849424
Frame distribution summary:
size
number_of_rows
number_of_chunks_per_column
number_of_chunks
172.16.2.43:54321
629.3 KB
9999.0
1.0
28.0
mean
629.3 KB
9999.0
1.0
28.0
min
629.3 KB
9999.0
1.0
28.0
max
629.3 KB
9999.0
1.0
28.0
stddev
0 B
0.0
0.0
0.0
total
629.3 KB
9999.0
1.0
28.0
ID Case Number Block IUCR Primary Type Description Location Description Arrest Domestic Beat District Ward Community Area FBI Code X Coordinate Y Coordinate Year Updated On Latitude Longitude Location Day Month WeekNum WeekDay HourOfDay Weekend Season
type int string enum int enum enum enum enum enum int int int int int int int int enum real real enum int int int enum int int enum
mins 21735.0 NaN 0.0 110.0 0.0 0.0 0.0 0.0 0.0 111.0 1.0 1.0 1.0 2.0 1100317.0 1814255.0 3915.0 0.0 41.64507243 -87.906463888 0.0 1.0 2.0 4.0 0.0 0.0 0.0 0.0
mean 9931318.737373699 NaN NaN 1189.676513569939 NaN NaN NaN 0.29282928292829274 0.15231523152315235 1159.6180618061765 11.348988512757918 22.954095409541008 37.447644764476536 12.740123622682114 1163880.5981498407 1885916.1498424308 3915.0 NaN 41.842565224673535 -87.67414052209607 NaN 17.683968396839663 2.419441944194423 5.1808180818082 NaN 13.631963196319662 0.35753575357535755 NaN
maxs 9962898.0 NaN 6517.0 5131.0 26.0 198.0 90.0 1.0 1.0 2535.0 25.0 50.0 77.0 26.0 1205069.0 1951533.0 3915.0 32.0 42.022646183 -87.524773286 8603.0 31.0 3.0 6.0 6.0 23.0 1.0 1.0
sigma 396787.5642214295 NaN NaN 927.7514355826443 NaN NaN NaN 0.4550835155878833 0.3593441468595258 695.7602987498396 6.945474933012859 13.649566114361296 21.274876222320856 7.574238579108433 16496.449368147238 31274.01631985589 0.0 NaN 0.08601865793584824 0.06003579706529789 NaN 11.180104335827702 0.4934924067865386 0.7389298304087689 NaN 6.4732173580715475 0.47929835538994453 NaN
zeros 0 0 3 0 11 933 19 7071 8476 0 0 0 0 0 0 0 0 603 0 0 1 0 0 0 1038 374 6424 5805
missing 0 0 0 419 0 0 6 0 0 0 162 0 0 2557 162 162 0 0 162 162 162 0 0 0 0 0 0 0
0 9955810.0 HY144797 081XX S COLES AVE 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESS STREET true false 422.0 4.0 7.0 46.0 18.0 1198273.0 1851626.0 3915.0 02/15/2015 12:43:39 PM 41.747693646 -87.54903538900001 (41.747693646, -87.549035389) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
1 9955861.0 HY144838 118XX S STATE ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT true true 522.0 5.0 34.0 53.0 nan 1178335.0 1826581.0 3915.0 02/15/2015 12:43:39 PM 41.679442289 -87.622850758 (41.679442289, -87.622850758) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
2 9955801.0 HY144779 002XX S LARAMIE AVE 2026.0 NARCOTICS POSS: PCP SIDEWALK true false 1522.0 15.0 29.0 25.0 18.0 1141717.0 1898581.0 3915.0 02/15/2015 12:43:39 PM 41.877773330000004 -87.755117993 (41.87777333, -87.755117993) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
3 9956197.0 HY144787 006XX E 67TH ST 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESS STREET true false 321.0 nan 6.0 42.0 18.0 nan nan 3915.0 02/15/2015 12:43:39 PM nan nan 8.0 3.0 6.0 Sun 23.0 1.0 Spring
4 9955846.0 HY144829 0000X S MAYFIELD AVE 610.0 BURGLARY FORCIBLE ENTRY APARTMENT false false 1513.0 15.0 29.0 25.0 5.0 1137239.0 1899372.0 3915.0 02/15/2015 12:43:39 PM 41.880025548000006 -87.77154132400001 (41.880025548, -87.771541324) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
5 9955835.0 HY144778 010XX W 48TH ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT false true 933.0 9.0 3.0 61.0 nan 1169986.0 1873019.0 3915.0 02/15/2015 12:43:39 PM 41.807059405000004 -87.65206589 (41.807059405, -87.65206589) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
6 9955872.0 HY144822 015XX W ARTHUR AVE 1320.0 CRIMINAL DAMAGE TO VEHICLE STREET false false 2432.0 24.0 40.0 1.0 14.0 1164732.0 1943222.0 3915.0 02/15/2015 12:43:39 PM 41.999814056000005 -87.669342967 (41.999814056, -87.669342967) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
7 21752.0 HY144738 060XX W GRAND AVE 110.0 HOMICIDE FIRST DEGREE MURDER STREET true false 2512.0 25.0 37.0 19.0 nan 1135910.0 1914206.0 3915.0 02/15/2015 12:43:39 PM 41.920755683 -87.776067514 (41.920755683, -87.776067514) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
8 9955808.0 HY144775 001XX W WACKER DR 460.0 BATTERY SIMPLE OTHER false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 3915.0 02/15/2015 12:43:39 PM 41.886707818000005 -87.63139635600001 (41.886707818, -87.631396356) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
9 9958275.0 HY146732 001XX W WACKER DR 460.0 BATTERY SIMPLE HOTEL/MOTEL false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 3915.0 02/15/2015 12:43:39 PM 41.886707818000005 -87.63139635600001 (41.886707818, -87.631396356) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
In [5]:
# Merge crimes data with weather and census
census.set_name(0,"Community Area")
weather.set_name(1,"Month")
weather.set_name(2,"Day")
weather.set_name(3,"Year")
crimes.merge(census, all_x=True, all_y=False)
crimes.merge(weather, all_x=True, all_y=False)
crimes.describe()
Rows:9,999 Cols:28
Chunk compression summary:
chunk_type
chunk_name
count
count_percentage
size
size_percentage
C0L
Constant Integers
1
3.5714288
80 B
0.0124154
CBS
Bits
2
7.1428576
2.6 KB
0.4097082
C1
1-Byte Integers
8
28.57143
78.6 KB
12.498584
C1N
1-Byte Integers (w/o NAs)
7
25.0
68.8 KB
10.936261
C2
2-Byte Integers
4
14.285715
78.4 KB
12.456371
C4
4-Byte Integers
3
10.714286
117.4 KB
18.652899
CStr
String
1
3.5714288
127.0 KB
20.184338
C8D
64-bit Reals
2
7.1428576
156.4 KB
24.849424
Frame distribution summary:
size
number_of_rows
number_of_chunks_per_column
number_of_chunks
172.16.2.43:54321
629.3 KB
9999.0
1.0
28.0
mean
629.3 KB
9999.0
1.0
28.0
min
629.3 KB
9999.0
1.0
28.0
max
629.3 KB
9999.0
1.0
28.0
stddev
0 B
0.0
0.0
0.0
total
629.3 KB
9999.0
1.0
28.0
ID Case Number Block IUCR Primary Type Description Location Description Arrest Domestic Beat District Ward Community Area FBI Code X Coordinate Y Coordinate Year Updated On Latitude Longitude Location Day Month WeekNum WeekDay HourOfDay Weekend Season
type int string enum int enum enum enum enum enum int int int int int int int int enum real real enum int int int enum int int enum
mins 21735.0 NaN 0.0 110.0 0.0 0.0 0.0 0.0 0.0 111.0 1.0 1.0 1.0 2.0 1100317.0 1814255.0 3915.0 0.0 41.64507243 -87.906463888 0.0 1.0 2.0 4.0 0.0 0.0 0.0 0.0
mean 9931318.737373699 NaN NaN 1189.676513569939 NaN NaN NaN 0.29282928292829274 0.15231523152315235 1159.6180618061765 11.348988512757918 22.954095409541008 37.447644764476536 12.740123622682114 1163880.5981498407 1885916.1498424308 3915.0 NaN 41.842565224673535 -87.67414052209607 NaN 17.683968396839663 2.419441944194423 5.1808180818082 NaN 13.631963196319662 0.35753575357535755 NaN
maxs 9962898.0 NaN 6517.0 5131.0 26.0 198.0 90.0 1.0 1.0 2535.0 25.0 50.0 77.0 26.0 1205069.0 1951533.0 3915.0 32.0 42.022646183 -87.524773286 8603.0 31.0 3.0 6.0 6.0 23.0 1.0 1.0
sigma 396787.5642214295 NaN NaN 927.7514355826443 NaN NaN NaN 0.4550835155878833 0.3593441468595258 695.7602987498396 6.945474933012859 13.649566114361296 21.274876222320856 7.574238579108433 16496.449368147238 31274.01631985589 0.0 NaN 0.08601865793584824 0.06003579706529789 NaN 11.180104335827702 0.4934924067865386 0.7389298304087689 NaN 6.4732173580715475 0.47929835538994453 NaN
zeros 0 0 3 0 11 933 19 7071 8476 0 0 0 0 0 0 0 0 603 0 0 1 0 0 0 1038 374 6424 5805
missing 0 0 0 419 0 0 6 0 0 0 162 0 0 2557 162 162 0 0 162 162 162 0 0 0 0 0 0 0
0 9955810.0 HY144797 081XX S COLES AVE 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESS STREET true false 422.0 4.0 7.0 46.0 18.0 1198273.0 1851626.0 3915.0 02/15/2015 12:43:39 PM 41.747693646 -87.54903538900001 (41.747693646, -87.549035389) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
1 9955861.0 HY144838 118XX S STATE ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT true true 522.0 5.0 34.0 53.0 nan 1178335.0 1826581.0 3915.0 02/15/2015 12:43:39 PM 41.679442289 -87.622850758 (41.679442289, -87.622850758) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
2 9955801.0 HY144779 002XX S LARAMIE AVE 2026.0 NARCOTICS POSS: PCP SIDEWALK true false 1522.0 15.0 29.0 25.0 18.0 1141717.0 1898581.0 3915.0 02/15/2015 12:43:39 PM 41.877773330000004 -87.755117993 (41.87777333, -87.755117993) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
3 9956197.0 HY144787 006XX E 67TH ST 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESS STREET true false 321.0 nan 6.0 42.0 18.0 nan nan 3915.0 02/15/2015 12:43:39 PM nan nan 8.0 3.0 6.0 Sun 23.0 1.0 Spring
4 9955846.0 HY144829 0000X S MAYFIELD AVE 610.0 BURGLARY FORCIBLE ENTRY APARTMENT false false 1513.0 15.0 29.0 25.0 5.0 1137239.0 1899372.0 3915.0 02/15/2015 12:43:39 PM 41.880025548000006 -87.77154132400001 (41.880025548, -87.771541324) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
5 9955835.0 HY144778 010XX W 48TH ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT false true 933.0 9.0 3.0 61.0 nan 1169986.0 1873019.0 3915.0 02/15/2015 12:43:39 PM 41.807059405000004 -87.65206589 (41.807059405, -87.65206589) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
6 9955872.0 HY144822 015XX W ARTHUR AVE 1320.0 CRIMINAL DAMAGE TO VEHICLE STREET false false 2432.0 24.0 40.0 1.0 14.0 1164732.0 1943222.0 3915.0 02/15/2015 12:43:39 PM 41.999814056000005 -87.669342967 (41.999814056, -87.669342967) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
7 21752.0 HY144738 060XX W GRAND AVE 110.0 HOMICIDE FIRST DEGREE MURDER STREET true false 2512.0 25.0 37.0 19.0 nan 1135910.0 1914206.0 3915.0 02/15/2015 12:43:39 PM 41.920755683 -87.776067514 (41.920755683, -87.776067514) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
8 9955808.0 HY144775 001XX W WACKER DR 460.0 BATTERY SIMPLE OTHER false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 3915.0 02/15/2015 12:43:39 PM 41.886707818000005 -87.63139635600001 (41.886707818, -87.631396356) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
9 9958275.0 HY146732 001XX W WACKER DR 460.0 BATTERY SIMPLE HOTEL/MOTEL false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 3915.0 02/15/2015 12:43:39 PM 41.886707818000005 -87.63139635600001 (41.886707818, -87.631396356) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
In [6]:
# Create test/train split
r = crimes["Arrest"].runif(1234)
train = crimes[r < 0.8]
test = crimes[r >= 0.8]
# Simple GBM - Predict Arrest
crimes_names_x = crimes.names[:]
crimes_names_x.remove("Arrest")
data_gbm = H2OGradientBoostingEstimator(ntrees =10,
max_depth =6,
distribution ="bernoulli")
data_gbm.train(x =crimes_names_x,
y ="Arrest",
training_frame =train,
validation_frame=test)
# Simple Deep Learning - Predict Arrest
data_dl = H2ODeepLearningEstimator(variable_importances=True,
loss ="Automatic")
data_dl.train(x =crimes_names_x,
y ="Arrest",
training_frame =train,
validation_frame=test)
gbm Model Build Progress: [##################################################] 100%
deeplearning Model Build Progress: [##################################################] 100%
In [7]:
# GBM performance on train/test data
train_auc_gbm = data_gbm.model_performance(train).auc()
test_auc_gbm = data_gbm.model_performance(test) .auc()
# Deep Learning performance on train/test data
train_auc_dl = data_dl.model_performance(train).auc()
test_auc_dl = data_dl.model_performance(test) .auc()
# Make a pretty HTML table printout of the results
header = ["Model", "AUC Train", "AUC Test"]
table = [
["GBM", train_auc_gbm, test_auc_gbm],
["DL ", train_auc_dl, test_auc_dl]
]
h2o.display.H2ODisplay(table, header)
Model
AUC Train
AUC Test
GBM
0.9568221
0.9307979
DL
0.8956055
0.8841564
Out[7]:
Model
AUC Train
AUC Test
GBM
0.9568221
0.9307979
DL
0.8956055
0.8841564
In [8]:
# Create new H2OFrame of crime observations
examples = {
"Date": ["02/08/2015 11:43:58 PM", "02/08/2015 11:00:39 PM"],
"IUCR": [1811, 1150],
"Primary.Type": ["NARCOTICS", "DECEPTIVE PRACTICE"],
"Location.Description": ["STREET", "RESIDENCE"],
"Domestic": ["false", "false"],
"Beat": [422, 923],
"District": [4, 9],
"Ward": [7, 14],
"Community.Area": [46, 63],
"FBI.Code": [18, 11]
}
crime_examples = h2o.H2OFrame(examples)
# Refine date column and merge with census data
refine_date_col(crime_examples, "Date", "%m/%d/%Y %I:%M:%S %p")
crime_examples.drop("Date")
census.set_name(0,"Community.Area")
crime_examples.merge(census, all_x=True, all_y=False)
crime_examples.describe()
Parse Progress: [##################################################] 100%
Rows:2 Cols:16
Chunk compression summary:
chunk_type
chunk_name
count
count_percentage
size
size_percentage
C0L
Constant Integers
7
43.75
560 B
43.75
C1N
1-Byte Integers (w/o NAs)
4
25.0
280 B
21.875
C2
2-Byte Integers
2
12.5
144 B
11.25
C2S
2-Byte Fractions
1
6.25
88 B
6.875
CStr
String
2
12.5
208 B
16.25
Frame distribution summary:
size
number_of_rows
number_of_chunks_per_column
number_of_chunks
172.16.2.43:54321
1.3 KB
2.0
1.0
16.0
mean
1.3 KB
2.0
1.0
16.0
min
1.3 KB
2.0
1.0
16.0
max
1.3 KB
2.0
1.0
16.0
stddev
0 B
0.0
0.0
0.0
total
1.3 KB
2.0
1.0
16.0
Primary.Type Domestic FBI.Code Ward District Community.Area Location.Description Date IUCR Beat Day Month Year WeekNum WeekDay HourOfDay
type string enum int int int int string int int int int int int int enum int
mins NaN 0.0 11.0 7.0 4.0 46.0 NaN 1423465239000.0 1150.0 422.0 8.0 3.0 3915.0 6.0 6.0 23.0
mean NaN 0.0 14.5 10.5 6.5 54.5 NaN 1423466538500.0 1480.5 672.5 8.0 3.0 3915.0 6.0 NaN 23.0
maxs NaN 0.0 18.0 14.0 9.0 63.0 NaN 1423467838000.0 1811.0 923.0 8.0 3.0 3915.0 6.0 6.0 23.0
sigma NaN 0.0 4.949747468305833 4.949747468305833 3.5355339059327378 12.020815280171307 NaN 1837770.524303837 467.39758236430794 354.26049737446033 0.0 0.0 0.0 0.0 NaN 0.0
zeros 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0
missing 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 NARCOTICS false 18.0 7.0 4.0 46.0 STREET 1423467838000.0 1811.0 422.0 8.0 3.0 3915.0 6.0 Sun 23.0
1 DECEPTIVE PRACTICE false 11.0 14.0 9.0 63.0 RESIDENCE 1423465239000.0 1150.0 923.0 8.0 3.0 3915.0 6.0 Sun 23.0
Rows:2 Cols:18
Chunk compression summary:
chunk_type
chunk_name
count
count_percentage
size
size_percentage
C0L
Constant Integers
9
50.0
720 B
50.0
C1N
1-Byte Integers (w/o NAs)
4
22.222223
280 B
19.444445
C2
2-Byte Integers
2
11.111112
144 B
10.0
C2S
2-Byte Fractions
1
5.555556
88 B
6.111111
CStr
String
2
11.111112
208 B
14.444445
Frame distribution summary:
size
number_of_rows
number_of_chunks_per_column
number_of_chunks
172.16.2.43:54321
1.4 KB
2.0
1.0
18.0
mean
1.4 KB
2.0
1.0
18.0
min
1.4 KB
2.0
1.0
18.0
max
1.4 KB
2.0
1.0
18.0
stddev
0 B
0.0
0.0
0.0
total
1.4 KB
2.0
1.0
18.0
Primary.Type Domestic FBI.Code Ward District Community.Area Location.Description Date IUCR Beat Day Month Year WeekNum WeekDay HourOfDay Weekend Season
type string enum int int int int string int int int int int int int enum int int enum
mins NaN 0.0 11.0 7.0 4.0 46.0 NaN 1423465239000.0 1150.0 422.0 8.0 3.0 3915.0 6.0 6.0 23.0 1.0 1.0
mean NaN 0.0 14.5 10.5 6.5 54.5 NaN 1423466538500.0 1480.5 672.5 8.0 3.0 3915.0 6.0 NaN 23.0 1.0 NaN
maxs NaN 0.0 18.0 14.0 9.0 63.0 NaN 1423467838000.0 1811.0 923.0 8.0 3.0 3915.0 6.0 6.0 23.0 1.0 1.0
sigma NaN 0.0 4.949747468305833 4.949747468305833 3.5355339059327378 12.020815280171307 NaN 1837770.524303837 467.39758236430794 354.26049737446033 0.0 0.0 0.0 0.0 NaN 0.0 0.0 NaN
zeros 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
missing 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 NARCOTICS false 18.0 7.0 4.0 46.0 STREET 1423467838000.0 1811.0 422.0 8.0 3.0 3915.0 6.0 Sun 23.0 1.0 Spring
1 DECEPTIVE PRACTICE false 11.0 14.0 9.0 63.0 RESIDENCE 1423465239000.0 1150.0 923.0 8.0 3.0 3915.0 6.0 Sun 23.0 1.0 Spring
In [9]:
# Predict probability of arrest from new observations
gbm_pred = data_gbm.predict(crime_examples)
dl_pred = data_dl .predict(crime_examples)
# Make a pretty HTML table printout of the results
header = ["FBI Code", "GBM Arrest Prob", "DL Arrest Prob"]
table = [
[examples["FBI.Code"][0], gbm_pred[0,"true"], dl_pred[0,"true"]],
[examples["FBI.Code"][1], gbm_pred[1,"true"], dl_pred[1,"true"]]
]
h2o.display.H2ODisplay(table, header)
FBI Code
GBM Arrest Prob
DL Arrest Prob
18
0.1199714
0.3047381
11
0.1199714
0.2496035
Out[9]:
FBI Code
GBM Arrest Prob
DL Arrest Prob
18
0.1199714
0.3047381
11
0.1199714
0.2496035
Content source: nilbody/h2o-3
Similar notebooks: