In [9]:
import h2o
In [10]:
# Connect to a cluster
h2o.init()
H2O cluster uptime:
8 minutes 14 seconds 218 milliseconds
H2O cluster version:
3.7.0.99999
H2O cluster name:
ece
H2O cluster total nodes:
1
H2O cluster total memory:
7.11 GB
H2O cluster total cores:
8
H2O cluster allowed cores:
8
H2O cluster healthy:
True
H2O Connection ip:
127.0.0.1
H2O Connection port:
54321
H2O Connection proxy:
None
In [11]:
from h2o.utils.shared_utils import _locate # private function. used to find files within h2o git project directory.
weather_path = _locate("smalldata/chicago/chicagoAllWeather.csv")
census_path = _locate("smalldata/chicago/chicagoCensus.csv")
crimes_path = _locate("smalldata/chicago/chicagoCrimes10k.csv.zip")
print("Import and Parse weather data")
weather = h2o.import_file(path=weather_path, col_types = ["time"] + ["numeric"]*6)
weather.drop("date")
weather.describe()
print("Import and Parse census data")
census = h2o.import_file(path=census_path, col_types = ["numeric", "enum"] + ["numeric"]*7)
census.describe()
print("Import and Parse crimes data")
crimes = h2o.import_file(path=crimes_path)
crimes.describe()
Import and Parse weather data
Parse Progress: [##################################################] 100%
Rows:5,162 Cols:7
Chunk compression summary:
chunk_type
chunk_name
count
count_percentage
size
size_percentage
C0D
Constant Reals
1
14.285715
80 B
0.2537749
C1N
1-Byte Integers (w/o NAs)
2
28.57143
10.2 KB
33.18107
C1S
1-Byte Fractions
4
57.14286
20.5 KB
66.565155
Frame distribution summary:
size
number_of_rows
number_of_chunks_per_column
number_of_chunks
172.16.2.61:54321
30.8 KB
5162.0
1.0
7.0
mean
30.8 KB
5162.0
1.0
7.0
min
30.8 KB
5162.0
1.0
7.0
max
30.8 KB
5162.0
1.0
7.0
stddev
0 B
0.0
0.0
0.0
total
30.8 KB
5162.0
1.0
7.0
date month day year maxTemp meanTemp minTemp
type time int int int int int int
mins NaN 1.0 1.0 2001.0 -2.0 -9.0 -18.0
mean 0.0 6.47442851608 15.7082526153 2007.57148392 58.871042921 50.3103515246 41.4812584968
maxs NaN 12.0 31.0 2015.0 103.0 93.0 82.0
sigma -0.0 3.46905171694 8.79895173997 4.0773409057 21.4829777237 19.9302399266 19.0207297123
zeros -5162 0 0 0 0 2 16
missing 5162 0 0 0 13 13 13
0 nan 1.0 1.0 2001.0 23.0 14.0 6.0
1 nan 1.0 2.0 2001.0 18.0 12.0 6.0
2 nan 1.0 3.0 2001.0 28.0 18.0 8.0
3 nan 1.0 4.0 2001.0 30.0 24.0 19.0
4 nan 1.0 5.0 2001.0 36.0 30.0 21.0
5 nan 1.0 6.0 2001.0 33.0 26.0 19.0
6 nan 1.0 7.0 2001.0 34.0 28.0 21.0
7 nan 1.0 8.0 2001.0 26.0 20.0 14.0
8 nan 1.0 9.0 2001.0 23.0 16.0 10.0
9 nan 1.0 10.0 2001.0 34.0 26.0 19.0
Import and Parse census data
Parse Progress: [##################################################] 100%
Rows:79 Cols:9
Chunk compression summary:
chunk_type
chunk_name
count
count_percentage
size
size_percentage
C1
1-Byte Integers
3
33.333336
441 B
22.546013
C1S
1-Byte Fractions
1
11.111112
163 B
8.333334
C2S
2-Byte Fractions
4
44.444447
968 B
49.488754
C4
4-Byte Integers
1
11.111112
384 B
19.6319
Frame distribution summary:
size
number_of_rows
number_of_chunks_per_column
number_of_chunks
172.16.2.61:54321
1.9 KB
79.0
1.0
9.0
mean
1.9 KB
79.0
1.0
9.0
min
1.9 KB
79.0
1.0
9.0
max
1.9 KB
79.0
1.0
9.0
stddev
0 B
0.0
0.0
0.0
total
1.9 KB
79.0
1.0
9.0
Community Area Number COMMUNITY AREA NAME PERCENT OF HOUSING CROWDED PERCENT HOUSEHOLDS BELOW POVERTY PERCENT AGED 16 UNEMPLOYED PERCENT AGED 25 WITHOUT HIGH SCHOOL DIPLOMA PERCENT AGED UNDER 18 OR OVER 64 PER CAPITA INCOME HARDSHIP INDEX
type int enum real real real real real int int
mins 1.0 0.0 0.3 3.3 4.7 2.5 13.5 8201.0 1.0
mean 39.0 NaN 4.92051282051 21.7397435897 15.341025641 20.3307692308 35.7179487179 25597.0 49.5064935065
maxs 77.0 78.0 15.8 56.5 35.9 54.8 51.5 88669.0 98.0
sigma 22.3718573212 NaN 3.65898144135 11.457230913 7.49949670861 11.7465143511 7.28442108494 15196.4055413 28.6905556516
zeros 0 1 0 0 0 0 0 0 0
missing 2 0 1 1 1 1 1 1 2
0 nan COMMUNITY AREA NAME nan nan nan nan nan nan nan
1 1.0 Rogers Park 7.7 23.6 8.7 18.2 27.5 23939.0 39.0
2 2.0 West Ridge 7.8 17.2 8.8 20.8 38.5 23040.0 46.0
3 3.0 Uptown 3.8 24.0 8.9 11.8 22.2 35787.0 20.0
4 4.0 Lincoln Square 3.4 10.9 8.2 13.4 25.5 37524.0 17.0
5 5.0 North Center 0.3 7.5 5.2 4.5 26.2 57123.0 6.0
6 6.0 Lake View 1.1 11.4 4.7 2.6 17.0 60058.0 5.0
7 7.0 Lincoln Park 0.8 12.3 5.1 3.6 21.5 71551.0 2.0
8 8.0 Near North Side 1.9 12.9 7.0 2.5 22.6 88669.0 1.0
9 9.0 Edison Park 1.1 3.3 6.5 7.4 35.3 40959.0 8.0
Import and Parse crimes data
Parse Progress: [##################################################] 100%
Rows:9,999 Cols:22
Chunk compression summary:
chunk_type
chunk_name
count
count_percentage
size
size_percentage
C0L
Constant Integers
4
4.5454545
320 B
0.0369521
C1
1-Byte Integers
32
36.363636
80.2 KB
9.488375
C1N
1-Byte Integers (w/o NAs)
8
9.090909
20.1 KB
2.3720937
C2
2-Byte Integers
16
18.181818
79.2 KB
9.362738
C4
4-Byte Integers
12
13.636364
118.0 KB
13.949879
CStr
String
8
9.090909
391.2 KB
46.25294
C8D
64-bit Reals
8
9.090909
156.8 KB
18.53702
Frame distribution summary:
size
number_of_rows
number_of_chunks_per_column
number_of_chunks
172.16.2.61:54321
845.7 KB
9999.0
4.0
88.0
mean
845.7 KB
9999.0
4.0
88.0
min
845.7 KB
9999.0
4.0
88.0
max
845.7 KB
9999.0
4.0
88.0
stddev
0 B
0.0
0.0
0.0
total
845.7 KB
9999.0
4.0
88.0
ID Case Number Date Block IUCR Primary Type Description Location Description Arrest Domestic Beat District Ward Community Area FBI Code X Coordinate Y Coordinate Year Updated On Latitude Longitude Location
type int string string enum int enum enum enum enum enum int int int int int int int int enum real real enum
mins 21735.0 NaN NaN 0.0 110.0 0.0 0.0 0.0 0.0 0.0 111.0 1.0 1.0 1.0 2.0 1100317.0 1814255.0 2015.0 0.0 41.64507243 -87.906463888 0.0
mean 9931318.73737 NaN NaN NaN 1189.67651357 NaN NaN NaN 0.292829282928 0.152315231523 1159.61806181 11.3489885128 22.9540954095 37.4476447645 12.7401236227 1163880.59815 1885916.14984 2015.0 NaN 41.8425652247 -87.6741405221 NaN
maxs 9962898.0 NaN NaN 6517.0 5131.0 26.0 198.0 90.0 1.0 1.0 2535.0 25.0 50.0 77.0 26.0 1205069.0 1951533.0 2015.0 32.0 42.022646183 -87.524773286 8603.0
sigma 396787.564221 NaN NaN NaN 927.751435583 NaN NaN NaN 0.455083515588 0.35934414686 695.76029875 6.94547493301 13.6495661144 21.2748762223 7.57423857911 16496.4493681 31274.0163199 0.0 NaN 0.0860186579358 0.0600357970653 NaN
zeros 0 0 0 3 0 11 933 19 7071 8476 0 0 0 0 0 0 0 0 603 0 0 1
missing 0 0 0 0 419 0 0 6 0 0 0 162 0 0 2557 162 162 0 0 162 162 162
0 9955810.0 HY144797 02/08/2015 11:43:40 PM 081XX S COLES AVE 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESS STREET true false 422.0 4.0 7.0 46.0 18.0 1198273.0 1851626.0 2015.0 02/15/2015 12:43:39 PM 41.747693646 -87.549035389 (41.747693646, -87.549035389)
1 9955861.0 HY144838 02/08/2015 11:41:42 PM 118XX S STATE ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT true true 522.0 5.0 34.0 53.0 nan 1178335.0 1826581.0 2015.0 02/15/2015 12:43:39 PM 41.679442289 -87.622850758 (41.679442289, -87.622850758)
2 9955801.0 HY144779 02/08/2015 11:30:22 PM 002XX S LARAMIE AVE 2026.0 NARCOTICS POSS: PCP SIDEWALK true false 1522.0 15.0 29.0 25.0 18.0 1141717.0 1898581.0 2015.0 02/15/2015 12:43:39 PM 41.87777333 -87.755117993 (41.87777333, -87.755117993)
3 9956197.0 HY144787 02/08/2015 11:30:23 PM 006XX E 67TH ST 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESS STREET true false 321.0 nan 6.0 42.0 18.0 nan nan 2015.0 02/15/2015 12:43:39 PM nan nan
4 9955846.0 HY144829 02/08/2015 11:30:58 PM 0000X S MAYFIELD AVE 610.0 BURGLARY FORCIBLE ENTRY APARTMENT false false 1513.0 15.0 29.0 25.0 5.0 1137239.0 1899372.0 2015.0 02/15/2015 12:43:39 PM 41.880025548 -87.771541324 (41.880025548, -87.771541324)
5 9955835.0 HY144778 02/08/2015 11:30:21 PM 010XX W 48TH ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT false true 933.0 9.0 3.0 61.0 nan 1169986.0 1873019.0 2015.0 02/15/2015 12:43:39 PM 41.807059405 -87.65206589 (41.807059405, -87.65206589)
6 9955872.0 HY144822 02/08/2015 11:27:24 PM 015XX W ARTHUR AVE 1320.0 CRIMINAL DAMAGE TO VEHICLE STREET false false 2432.0 24.0 40.0 1.0 14.0 1164732.0 1943222.0 2015.0 02/15/2015 12:43:39 PM 41.999814056 -87.669342967 (41.999814056, -87.669342967)
7 21752.0 HY144738 02/08/2015 11:26:12 PM 060XX W GRAND AVE 110.0 HOMICIDE FIRST DEGREE MURDER STREET true false 2512.0 25.0 37.0 19.0 nan 1135910.0 1914206.0 2015.0 02/15/2015 12:43:39 PM 41.920755683 -87.776067514 (41.920755683, -87.776067514)
8 9955808.0 HY144775 02/08/2015 11:20:33 PM 001XX W WACKER DR 460.0 BATTERY SIMPLE OTHER false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 2015.0 02/15/2015 12:43:39 PM 41.886707818 -87.631396356 (41.886707818, -87.631396356)
9 9958275.0 HY146732 02/08/2015 11:15:36 PM 001XX W WACKER DR 460.0 BATTERY SIMPLE HOTEL/MOTEL false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 2015.0 02/15/2015 12:43:39 PM 41.886707818 -87.631396356 (41.886707818, -87.631396356)
In [12]:
def refine_date_col(data, col, pattern):
data[col] = data[col].as_date(pattern)
data["Day"] = data[col].day()
data["Month"] = data[col].month() + 1 # Since H2O indexes from 0
data["Year"] = data[col].year() + 1900 # Start of epoch is 1900
data["WeekNum"] = data[col].week()
data["WeekDay"] = data[col].dayOfWeek()
data["HourOfDay"] = data[col].hour()
data.describe() # HACK: Force evaluation before ifelse and cut. See PUBDEV-1425.
# Create weekend and season cols
# Spring = Mar, Apr, May. Summer = Jun, Jul, Aug. Autumn = Sep, Oct. Winter = Nov, Dec, Jan, Feb.
# data["Weekend"] = [1 if x in ("Sun", "Sat") else 0 for x in data["WeekDay"]]
data["Weekend"] = ((data["WeekDay"] == "Sun") | (data["WeekDay"] == "Sat"))
data["Season"] = data["Month"].cut([0, 2, 5, 7, 10, 12], ["Winter", "Spring", "Summer", "Autumn", "Winter"])
refine_date_col(crimes, "Date", "%m/%d/%Y %I:%M:%S %p")
crimes = crimes.drop("Date")
crimes.describe()
Rows:9,999 Cols:27
Chunk compression summary:
chunk_type
chunk_name
count
count_percentage
size
size_percentage
C0L
Constant Integers
9
8.333334
720 B
0.1006741
C1
1-Byte Integers
32
29.62963
80.2 KB
11.489151
C1N
1-Byte Integers (w/o NAs)
23
21.296297
57.9 KB
8.296664
C2
2-Byte Integers
16
14.814815
79.2 KB
11.337022
C4
4-Byte Integers
12
11.111112
118.0 KB
16.891436
C8
64-bit Integers
4
3.7037036
78.4 KB
11.222924
CStr
String
4
3.7037036
127.2 KB
18.21628
C8D
64-bit Reals
8
7.4074073
156.8 KB
22.445848
Frame distribution summary:
size
number_of_rows
number_of_chunks_per_column
number_of_chunks
172.16.2.61:54321
698.4 KB
9999.0
4.0
108.0
mean
698.4 KB
9999.0
4.0
108.0
min
698.4 KB
9999.0
4.0
108.0
max
698.4 KB
9999.0
4.0
108.0
stddev
0 B
0.0
0.0
0.0
total
698.4 KB
9999.0
4.0
108.0
ID Case Number Date Block IUCR Primary Type Description Location Description Arrest Domestic Beat District Ward Community Area FBI Code X Coordinate Y Coordinate Year Updated On Latitude Longitude Location Day Month WeekNum WeekDay HourOfDay
type int string int enum int enum enum enum enum enum int int int int int int int int enum real real enum int int int enum int
mins 21735.0 NaN 1.42203063e+12 0.0 110.0 0.0 0.0 0.0 0.0 0.0 111.0 1.0 1.0 1.0 2.0 1100317.0 1814255.0 3915.0 0.0 41.64507243 -87.906463888 0.0 1.0 2.0 4.0 0.0 0.0
mean 9931318.73737 NaN 1.42271445081e+12 NaN 1189.67651357 NaN NaN NaN 0.292829282928 0.152315231523 1159.61806181 11.3489885128 22.9540954095 37.4476447645 12.7401236227 1163880.59815 1885916.14984 3915.0 NaN 41.8425652247 -87.6741405221 NaN 17.6839683968 2.41944194419 5.18081808181 NaN 13.6319631963
maxs 9962898.0 NaN 1.42346782e+12 6517.0 5131.0 26.0 198.0 90.0 1.0 1.0 2535.0 25.0 50.0 77.0 26.0 1205069.0 1951533.0 3915.0 32.0 42.022646183 -87.524773286 8603.0 31.0 3.0 6.0 6.0 23.0
sigma 396787.564221 NaN 433879245.187 NaN 927.751435583 NaN NaN NaN 0.455083515588 0.35934414686 695.76029875 6.94547493301 13.6495661144 21.2748762223 7.57423857911 16496.4493681 31274.0163199 0.0 NaN 0.0860186579358 0.0600357970653 NaN 11.1801043358 0.493492406787 0.738929830409 NaN 6.47321735807
zeros 0 0 0 3 0 11 933 19 7071 8476 0 0 0 0 0 0 0 0 603 0 0 1 0 0 0 1038 374
missing 0 0 0 0 419 0 0 6 0 0 0 162 0 0 2557 162 162 0 0 162 162 162 0 0 0 0 0
0 9955810.0 HY144797 1.42346782e+12 081XX S COLES AVE 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESS STREET true false 422.0 4.0 7.0 46.0 18.0 1198273.0 1851626.0 3915.0 02/15/2015 12:43:39 PM 41.747693646 -87.549035389 (41.747693646, -87.549035389) 8.0 3.0 6.0 Sun 23.0
1 9955861.0 HY144838 1.423467702e+12 118XX S STATE ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT true true 522.0 5.0 34.0 53.0 nan 1178335.0 1826581.0 3915.0 02/15/2015 12:43:39 PM 41.679442289 -87.622850758 (41.679442289, -87.622850758) 8.0 3.0 6.0 Sun 23.0
2 9955801.0 HY144779 1.423467022e+12 002XX S LARAMIE AVE 2026.0 NARCOTICS POSS: PCP SIDEWALK true false 1522.0 15.0 29.0 25.0 18.0 1141717.0 1898581.0 3915.0 02/15/2015 12:43:39 PM 41.87777333 -87.755117993 (41.87777333, -87.755117993) 8.0 3.0 6.0 Sun 23.0
3 9956197.0 HY144787 1.423467023e+12 006XX E 67TH ST 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESS STREET true false 321.0 nan 6.0 42.0 18.0 nan nan 3915.0 02/15/2015 12:43:39 PM nan nan 8.0 3.0 6.0 Sun 23.0
4 9955846.0 HY144829 1.423467058e+12 0000X S MAYFIELD AVE 610.0 BURGLARY FORCIBLE ENTRY APARTMENT false false 1513.0 15.0 29.0 25.0 5.0 1137239.0 1899372.0 3915.0 02/15/2015 12:43:39 PM 41.880025548 -87.771541324 (41.880025548, -87.771541324) 8.0 3.0 6.0 Sun 23.0
5 9955835.0 HY144778 1.423467021e+12 010XX W 48TH ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT false true 933.0 9.0 3.0 61.0 nan 1169986.0 1873019.0 3915.0 02/15/2015 12:43:39 PM 41.807059405 -87.65206589 (41.807059405, -87.65206589) 8.0 3.0 6.0 Sun 23.0
6 9955872.0 HY144822 1.423466844e+12 015XX W ARTHUR AVE 1320.0 CRIMINAL DAMAGE TO VEHICLE STREET false false 2432.0 24.0 40.0 1.0 14.0 1164732.0 1943222.0 3915.0 02/15/2015 12:43:39 PM 41.999814056 -87.669342967 (41.999814056, -87.669342967) 8.0 3.0 6.0 Sun 23.0
7 21752.0 HY144738 1.423466772e+12 060XX W GRAND AVE 110.0 HOMICIDE FIRST DEGREE MURDER STREET true false 2512.0 25.0 37.0 19.0 nan 1135910.0 1914206.0 3915.0 02/15/2015 12:43:39 PM 41.920755683 -87.776067514 (41.920755683, -87.776067514) 8.0 3.0 6.0 Sun 23.0
8 9955808.0 HY144775 1.423466433e+12 001XX W WACKER DR 460.0 BATTERY SIMPLE OTHER false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 3915.0 02/15/2015 12:43:39 PM 41.886707818 -87.631396356 (41.886707818, -87.631396356) 8.0 3.0 6.0 Sun 23.0
9 9958275.0 HY146732 1.423466136e+12 001XX W WACKER DR 460.0 BATTERY SIMPLE HOTEL/MOTEL false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 3915.0 02/15/2015 12:43:39 PM 41.886707818 -87.631396356 (41.886707818, -87.631396356) 8.0 3.0 6.0 Sun 23.0
Rows:9,999 Cols:28
Chunk compression summary:
chunk_type
chunk_name
count
count_percentage
size
size_percentage
C0L
Constant Integers
12
10.714286
960 B
0.1506986
CBS
Bits
5
4.464286
1.8 KB
0.2948042
C1
1-Byte Integers
32
28.57143
80.2 KB
12.898546
C1N
1-Byte Integers (w/o NAs)
23
20.535715
57.9 KB
9.314431
C2
2-Byte Integers
16
14.285715
79.2 KB
12.727756
C4
4-Byte Integers
12
10.714286
118.0 KB
18.963537
CStr
String
4
3.5714288
127.2 KB
20.450903
C8D
64-bit Reals
8
7.1428576
156.8 KB
25.199324
Frame distribution summary:
size
number_of_rows
number_of_chunks_per_column
number_of_chunks
172.16.2.61:54321
622.1 KB
9999.0
4.0
112.0
mean
622.1 KB
9999.0
4.0
112.0
min
622.1 KB
9999.0
4.0
112.0
max
622.1 KB
9999.0
4.0
112.0
stddev
0 B
0.0
0.0
0.0
total
622.1 KB
9999.0
4.0
112.0
ID Case Number Block IUCR Primary Type Description Location Description Arrest Domestic Beat District Ward Community Area FBI Code X Coordinate Y Coordinate Year Updated On Latitude Longitude Location Day Month WeekNum WeekDay HourOfDay Weekend Season
type int string enum int enum enum enum enum enum int int int int int int int int enum real real enum int int int enum int int enum
mins 21735.0 NaN 0.0 110.0 0.0 0.0 0.0 0.0 0.0 111.0 1.0 1.0 1.0 2.0 1100317.0 1814255.0 3915.0 0.0 41.64507243 -87.906463888 0.0 1.0 2.0 4.0 0.0 0.0 0.0 0.0
mean 9931318.73737 NaN NaN 1189.67651357 NaN NaN NaN 0.292829282928 0.152315231523 1159.61806181 11.3489885128 22.9540954095 37.4476447645 12.7401236227 1163880.59815 1885916.14984 3915.0 NaN 41.8425652247 -87.6741405221 NaN 17.6839683968 2.41944194419 5.18081808181 NaN 13.6319631963 0.357535753575 NaN
maxs 9962898.0 NaN 6517.0 5131.0 26.0 198.0 90.0 1.0 1.0 2535.0 25.0 50.0 77.0 26.0 1205069.0 1951533.0 3915.0 32.0 42.022646183 -87.524773286 8603.0 31.0 3.0 6.0 6.0 23.0 1.0 1.0
sigma 396787.564221 NaN NaN 927.751435583 NaN NaN NaN 0.455083515588 0.35934414686 695.76029875 6.94547493301 13.6495661144 21.2748762223 7.57423857911 16496.4493681 31274.0163199 0.0 NaN 0.0860186579358 0.0600357970653 NaN 11.1801043358 0.493492406787 0.738929830409 NaN 6.47321735807 0.47929835539 NaN
zeros 0 0 3 0 11 933 19 7071 8476 0 0 0 0 0 0 0 0 603 0 0 1 0 0 0 1038 374 6424 5805
missing 0 0 0 419 0 0 6 0 0 0 162 0 0 2557 162 162 0 0 162 162 162 0 0 0 0 0 0 0
0 9955810.0 HY144797 081XX S COLES AVE 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESS STREET true false 422.0 4.0 7.0 46.0 18.0 1198273.0 1851626.0 3915.0 02/15/2015 12:43:39 PM 41.747693646 -87.549035389 (41.747693646, -87.549035389) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
1 9955861.0 HY144838 118XX S STATE ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT true true 522.0 5.0 34.0 53.0 nan 1178335.0 1826581.0 3915.0 02/15/2015 12:43:39 PM 41.679442289 -87.622850758 (41.679442289, -87.622850758) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
2 9955801.0 HY144779 002XX S LARAMIE AVE 2026.0 NARCOTICS POSS: PCP SIDEWALK true false 1522.0 15.0 29.0 25.0 18.0 1141717.0 1898581.0 3915.0 02/15/2015 12:43:39 PM 41.87777333 -87.755117993 (41.87777333, -87.755117993) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
3 9956197.0 HY144787 006XX E 67TH ST 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESS STREET true false 321.0 nan 6.0 42.0 18.0 nan nan 3915.0 02/15/2015 12:43:39 PM nan nan 8.0 3.0 6.0 Sun 23.0 1.0 Spring
4 9955846.0 HY144829 0000X S MAYFIELD AVE 610.0 BURGLARY FORCIBLE ENTRY APARTMENT false false 1513.0 15.0 29.0 25.0 5.0 1137239.0 1899372.0 3915.0 02/15/2015 12:43:39 PM 41.880025548 -87.771541324 (41.880025548, -87.771541324) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
5 9955835.0 HY144778 010XX W 48TH ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT false true 933.0 9.0 3.0 61.0 nan 1169986.0 1873019.0 3915.0 02/15/2015 12:43:39 PM 41.807059405 -87.65206589 (41.807059405, -87.65206589) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
6 9955872.0 HY144822 015XX W ARTHUR AVE 1320.0 CRIMINAL DAMAGE TO VEHICLE STREET false false 2432.0 24.0 40.0 1.0 14.0 1164732.0 1943222.0 3915.0 02/15/2015 12:43:39 PM 41.999814056 -87.669342967 (41.999814056, -87.669342967) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
7 21752.0 HY144738 060XX W GRAND AVE 110.0 HOMICIDE FIRST DEGREE MURDER STREET true false 2512.0 25.0 37.0 19.0 nan 1135910.0 1914206.0 3915.0 02/15/2015 12:43:39 PM 41.920755683 -87.776067514 (41.920755683, -87.776067514) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
8 9955808.0 HY144775 001XX W WACKER DR 460.0 BATTERY SIMPLE OTHER false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 3915.0 02/15/2015 12:43:39 PM 41.886707818 -87.631396356 (41.886707818, -87.631396356) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
9 9958275.0 HY146732 001XX W WACKER DR 460.0 BATTERY SIMPLE HOTEL/MOTEL false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 3915.0 02/15/2015 12:43:39 PM 41.886707818 -87.631396356 (41.886707818, -87.631396356) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
In [13]:
# Merge crimes data with weather and census
census.set_name(0,"Community Area")
weather.set_name(1,"Month")
weather.set_name(2,"Day")
weather.set_name(3,"Year")
crimes.merge(census, all_x=True, all_y=False)
crimes.merge(weather, all_x=True, all_y=False)
crimes.describe()
Rows:9,999 Cols:28
Chunk compression summary:
chunk_type
chunk_name
count
count_percentage
size
size_percentage
C0L
Constant Integers
12
10.714286
960 B
0.1506986
CBS
Bits
5
4.464286
1.8 KB
0.2948042
C1
1-Byte Integers
32
28.57143
80.2 KB
12.898546
C1N
1-Byte Integers (w/o NAs)
23
20.535715
57.9 KB
9.314431
C2
2-Byte Integers
16
14.285715
79.2 KB
12.727756
C4
4-Byte Integers
12
10.714286
118.0 KB
18.963537
CStr
String
4
3.5714288
127.2 KB
20.450903
C8D
64-bit Reals
8
7.1428576
156.8 KB
25.199324
Frame distribution summary:
size
number_of_rows
number_of_chunks_per_column
number_of_chunks
172.16.2.61:54321
622.1 KB
9999.0
4.0
112.0
mean
622.1 KB
9999.0
4.0
112.0
min
622.1 KB
9999.0
4.0
112.0
max
622.1 KB
9999.0
4.0
112.0
stddev
0 B
0.0
0.0
0.0
total
622.1 KB
9999.0
4.0
112.0
ID Case Number Block IUCR Primary Type Description Location Description Arrest Domestic Beat District Ward Community Area FBI Code X Coordinate Y Coordinate Year Updated On Latitude Longitude Location Day Month WeekNum WeekDay HourOfDay Weekend Season
type int string enum int enum enum enum enum enum int int int int int int int int enum real real enum int int int enum int int enum
mins 21735.0 NaN 0.0 110.0 0.0 0.0 0.0 0.0 0.0 111.0 1.0 1.0 1.0 2.0 1100317.0 1814255.0 3915.0 0.0 41.64507243 -87.906463888 0.0 1.0 2.0 4.0 0.0 0.0 0.0 0.0
mean 9931318.73737 NaN NaN 1189.67651357 NaN NaN NaN 0.292829282928 0.152315231523 1159.61806181 11.3489885128 22.9540954095 37.4476447645 12.7401236227 1163880.59815 1885916.14984 3915.0 NaN 41.8425652247 -87.6741405221 NaN 17.6839683968 2.41944194419 5.18081808181 NaN 13.6319631963 0.357535753575 NaN
maxs 9962898.0 NaN 6517.0 5131.0 26.0 198.0 90.0 1.0 1.0 2535.0 25.0 50.0 77.0 26.0 1205069.0 1951533.0 3915.0 32.0 42.022646183 -87.524773286 8603.0 31.0 3.0 6.0 6.0 23.0 1.0 1.0
sigma 396787.564221 NaN NaN 927.751435583 NaN NaN NaN 0.455083515588 0.35934414686 695.76029875 6.94547493301 13.6495661144 21.2748762223 7.57423857911 16496.4493681 31274.0163199 0.0 NaN 0.0860186579358 0.0600357970653 NaN 11.1801043358 0.493492406787 0.738929830409 NaN 6.47321735807 0.47929835539 NaN
zeros 0 0 3 0 11 933 19 7071 8476 0 0 0 0 0 0 0 0 603 0 0 1 0 0 0 1038 374 6424 5805
missing 0 0 0 419 0 0 6 0 0 0 162 0 0 2557 162 162 0 0 162 162 162 0 0 0 0 0 0 0
0 9955810.0 HY144797 081XX S COLES AVE 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESS STREET true false 422.0 4.0 7.0 46.0 18.0 1198273.0 1851626.0 3915.0 02/15/2015 12:43:39 PM 41.747693646 -87.549035389 (41.747693646, -87.549035389) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
1 9955861.0 HY144838 118XX S STATE ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT true true 522.0 5.0 34.0 53.0 nan 1178335.0 1826581.0 3915.0 02/15/2015 12:43:39 PM 41.679442289 -87.622850758 (41.679442289, -87.622850758) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
2 9955801.0 HY144779 002XX S LARAMIE AVE 2026.0 NARCOTICS POSS: PCP SIDEWALK true false 1522.0 15.0 29.0 25.0 18.0 1141717.0 1898581.0 3915.0 02/15/2015 12:43:39 PM 41.87777333 -87.755117993 (41.87777333, -87.755117993) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
3 9956197.0 HY144787 006XX E 67TH ST 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESS STREET true false 321.0 nan 6.0 42.0 18.0 nan nan 3915.0 02/15/2015 12:43:39 PM nan nan 8.0 3.0 6.0 Sun 23.0 1.0 Spring
4 9955846.0 HY144829 0000X S MAYFIELD AVE 610.0 BURGLARY FORCIBLE ENTRY APARTMENT false false 1513.0 15.0 29.0 25.0 5.0 1137239.0 1899372.0 3915.0 02/15/2015 12:43:39 PM 41.880025548 -87.771541324 (41.880025548, -87.771541324) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
5 9955835.0 HY144778 010XX W 48TH ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT false true 933.0 9.0 3.0 61.0 nan 1169986.0 1873019.0 3915.0 02/15/2015 12:43:39 PM 41.807059405 -87.65206589 (41.807059405, -87.65206589) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
6 9955872.0 HY144822 015XX W ARTHUR AVE 1320.0 CRIMINAL DAMAGE TO VEHICLE STREET false false 2432.0 24.0 40.0 1.0 14.0 1164732.0 1943222.0 3915.0 02/15/2015 12:43:39 PM 41.999814056 -87.669342967 (41.999814056, -87.669342967) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
7 21752.0 HY144738 060XX W GRAND AVE 110.0 HOMICIDE FIRST DEGREE MURDER STREET true false 2512.0 25.0 37.0 19.0 nan 1135910.0 1914206.0 3915.0 02/15/2015 12:43:39 PM 41.920755683 -87.776067514 (41.920755683, -87.776067514) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
8 9955808.0 HY144775 001XX W WACKER DR 460.0 BATTERY SIMPLE OTHER false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 3915.0 02/15/2015 12:43:39 PM 41.886707818 -87.631396356 (41.886707818, -87.631396356) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
9 9958275.0 HY146732 001XX W WACKER DR 460.0 BATTERY SIMPLE HOTEL/MOTEL false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 3915.0 02/15/2015 12:43:39 PM 41.886707818 -87.631396356 (41.886707818, -87.631396356) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
In [14]:
# Create test/train split
r = crimes["Arrest"].runif(1234)
train = crimes[r < 0.8]
test = crimes[r >= 0.8]
# Simple GBM - Predict Arrest
data_gbm = h2o.gbm(x =train.drop("Arrest"),
y =train ["Arrest"],
validation_x =test .drop("Arrest"),
validation_y =test ["Arrest"],
ntrees =10,
max_depth =6,
distribution ="bernoulli")
# Simple Deep Learning - Predict Arrest
data_dl = h2o.deeplearning(x =train.drop("Arrest"),
y =train ["Arrest"],
validation_x =test .drop("Arrest"),
validation_y =test ["Arrest"],
variable_importances=True,
loss ="Automatic")
gbm Model Build Progress: [##################################################] 100%
-c:13: DeprecationWarning: `h2o.gbm` is deprecated. Use the estimators sub module to build an H2OGradientBoostedEstimator.
-c:21: DeprecationWarning: `h2o.deeplearning` is deprecated. Use the estimators sub module to build an H2ODeepLearningEstimator.
deeplearning Model Build Progress: [##################################################] 100%
In [15]:
# GBM performance on train/test data
train_auc_gbm = data_gbm.model_performance(train).auc()
test_auc_gbm = data_gbm.model_performance(test) .auc()
# Deep Learning performance on train/test data
train_auc_dl = data_dl.model_performance(train).auc()
test_auc_dl = data_dl.model_performance(test) .auc()
# Make a pretty HTML table printout of the results
header = ["Model", "AUC Train", "AUC Test"]
table = [
["GBM", train_auc_gbm, test_auc_gbm],
["DL ", train_auc_dl, test_auc_dl]
]
h2o.display.H2ODisplay(table, header)
Model
AUC Train
AUC Test
GBM
0.9562092
0.9325577
DL
0.9844351
0.9229118
Out[15]:
In [16]:
# Create new H2OFrame of crime observations
examples = {
"Date": ["02/08/2015 11:43:58 PM", "02/08/2015 11:00:39 PM"],
"IUCR": [1811, 1150],
"Primary.Type": ["NARCOTICS", "DECEPTIVE PRACTICE"],
"Location.Description": ["STREET", "RESIDENCE"],
"Domestic": ["false", "false"],
"Beat": [422, 923],
"District": [4, 9],
"Ward": [7, 14],
"Community.Area": [46, 63],
"FBI.Code": [18, 11]
}
crime_examples = h2o.H2OFrame(examples)
# Refine date column and merge with census data
refine_date_col(crime_examples, "Date", "%m/%d/%Y %I:%M:%S %p")
crime_examples.drop("Date")
census.set_name(0,"Community.Area")
crime_examples.merge(census, all_x=True, all_y=False)
crime_examples.describe()
Parse Progress: [##################################################] 100%
Rows:2 Cols:16
Chunk compression summary:
chunk_type
chunk_name
count
count_percentage
size
size_percentage
C0L
Constant Integers
7
43.75
560 B
43.75
C1N
1-Byte Integers (w/o NAs)
4
25.0
280 B
21.875
C2
2-Byte Integers
2
12.5
144 B
11.25
C2S
2-Byte Fractions
1
6.25
88 B
6.875
CStr
String
2
12.5
208 B
16.25
Frame distribution summary:
size
number_of_rows
number_of_chunks_per_column
number_of_chunks
172.16.2.61:54321
1.3 KB
2.0
1.0
16.0
mean
1.3 KB
2.0
1.0
16.0
min
1.3 KB
2.0
1.0
16.0
max
1.3 KB
2.0
1.0
16.0
stddev
0 B
0.0
0.0
0.0
total
1.3 KB
2.0
1.0
16.0
Location.Description FBI.Code Beat Primary.Type Community.Area District Date Ward Domestic IUCR Day Month Year WeekNum WeekDay HourOfDay
type string int int string int int int int enum int int int int int enum int
mins NaN 11.0 422.0 NaN 46.0 4.0 1.423465239e+12 7.0 0.0 1150.0 8.0 3.0 3915.0 6.0 6.0 23.0
mean NaN 14.5 672.5 NaN 54.5 6.5 1.4234665385e+12 10.5 0.0 1480.5 8.0 3.0 3915.0 6.0 NaN 23.0
maxs NaN 18.0 923.0 NaN 63.0 9.0 1.423467838e+12 14.0 0.0 1811.0 8.0 3.0 3915.0 6.0 6.0 23.0
sigma NaN 4.94974746831 354.260497374 NaN 12.0208152802 3.53553390593 1837770.5243 4.94974746831 0.0 467.397582364 0.0 0.0 0.0 0.0 NaN 0.0
zeros 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0
missing 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 STREET 18.0 422.0 NARCOTICS 46.0 4.0 1.423467838e+12 7.0 false 1811.0 8.0 3.0 3915.0 6.0 Sun 23.0
1 RESIDENCE 11.0 923.0 DECEPTIVE PRACTICE 63.0 9.0 1.423465239e+12 14.0 false 1150.0 8.0 3.0 3915.0 6.0 Sun 23.0
Rows:2 Cols:18
Chunk compression summary:
chunk_type
chunk_name
count
count_percentage
size
size_percentage
C0L
Constant Integers
9
50.0
720 B
50.0
C1N
1-Byte Integers (w/o NAs)
4
22.222223
280 B
19.444445
C2
2-Byte Integers
2
11.111112
144 B
10.0
C2S
2-Byte Fractions
1
5.555556
88 B
6.111111
CStr
String
2
11.111112
208 B
14.444445
Frame distribution summary:
size
number_of_rows
number_of_chunks_per_column
number_of_chunks
172.16.2.61:54321
1.4 KB
2.0
1.0
18.0
mean
1.4 KB
2.0
1.0
18.0
min
1.4 KB
2.0
1.0
18.0
max
1.4 KB
2.0
1.0
18.0
stddev
0 B
0.0
0.0
0.0
total
1.4 KB
2.0
1.0
18.0
Location.Description FBI.Code Beat Primary.Type Community.Area District Date Ward Domestic IUCR Day Month Year WeekNum WeekDay HourOfDay Weekend Season
type string int int string int int int int enum int int int int int enum int int enum
mins NaN 11.0 422.0 NaN 46.0 4.0 1.423465239e+12 7.0 0.0 1150.0 8.0 3.0 3915.0 6.0 6.0 23.0 1.0 1.0
mean NaN 14.5 672.5 NaN 54.5 6.5 1.4234665385e+12 10.5 0.0 1480.5 8.0 3.0 3915.0 6.0 NaN 23.0 1.0 NaN
maxs NaN 18.0 923.0 NaN 63.0 9.0 1.423467838e+12 14.0 0.0 1811.0 8.0 3.0 3915.0 6.0 6.0 23.0 1.0 1.0
sigma NaN 4.94974746831 354.260497374 NaN 12.0208152802 3.53553390593 1837770.5243 4.94974746831 0.0 467.397582364 0.0 0.0 0.0 0.0 NaN 0.0 0.0 NaN
zeros 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0
missing 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 STREET 18.0 422.0 NARCOTICS 46.0 4.0 1.423467838e+12 7.0 false 1811.0 8.0 3.0 3915.0 6.0 Sun 23.0 1.0 Spring
1 RESIDENCE 11.0 923.0 DECEPTIVE PRACTICE 63.0 9.0 1.423465239e+12 14.0 false 1150.0 8.0 3.0 3915.0 6.0 Sun 23.0 1.0 Spring
In [17]:
# Predict probability of arrest from new observations
gbm_pred = data_gbm.predict(crime_examples)
dl_pred = data_dl .predict(crime_examples)
# Make a pretty HTML table printout of the results
header = ["FBI Code", "GBM Arrest Prob", "DL Arrest Prob"]
table = [
[examples["FBI.Code"][0], gbm_pred[0,"true"], dl_pred[0,"true"]],
[examples["FBI.Code"][1], gbm_pred[1,"true"], dl_pred[1,"true"]]
]
h2o.display.H2ODisplay(table, header)
FBI Code
GBM Arrest Prob
DL Arrest Prob
18
0.1136120
0.0044016
11
0.1136120
0.0113582
Out[17]:
Content source: nilbody/h2o-3
Similar notebooks: