In [9]:
import h2o

In [10]:
# Connect to a cluster
h2o.init()


H2O cluster uptime: 8 minutes 14 seconds 218 milliseconds
H2O cluster version: 3.7.0.99999
H2O cluster name: ece
H2O cluster total nodes: 1
H2O cluster total memory: 7.11 GB
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster healthy: True
H2O Connection ip: 127.0.0.1
H2O Connection port: 54321
H2O Connection proxy: None

In [11]:
from h2o.utils.shared_utils import _locate # private function. used to find files within h2o git project directory.
weather_path = _locate("smalldata/chicago/chicagoAllWeather.csv")
census_path = _locate("smalldata/chicago/chicagoCensus.csv")
crimes_path = _locate("smalldata/chicago/chicagoCrimes10k.csv.zip")

print("Import and Parse weather data")
weather = h2o.import_file(path=weather_path, col_types = ["time"] + ["numeric"]*6)
weather.drop("date")
weather.describe()

print("Import and Parse census data")
census = h2o.import_file(path=census_path, col_types = ["numeric", "enum"] + ["numeric"]*7)
census.describe()

print("Import and Parse crimes data")
crimes = h2o.import_file(path=crimes_path)
crimes.describe()


Import and Parse weather data

Parse Progress: [##################################################] 100%
Rows:5,162 Cols:7

Chunk compression summary:
chunk_type chunk_name count count_percentage size size_percentage
C0D Constant Reals 1 14.285715 80 B 0.2537749
C1N 1-Byte Integers (w/o NAs) 2 28.57143 10.2 KB 33.18107
C1S 1-Byte Fractions 4 57.14286 20.5 KB 66.565155
Frame distribution summary:
size number_of_rows number_of_chunks_per_column number_of_chunks
172.16.2.61:54321 30.8 KB 5162.0 1.0 7.0
mean 30.8 KB 5162.0 1.0 7.0
min 30.8 KB 5162.0 1.0 7.0
max 30.8 KB 5162.0 1.0 7.0
stddev 0 B 0.0 0.0 0.0
total 30.8 KB 5162.0 1.0 7.0

date month day year maxTemp meanTemp minTemp
type time int int int int int int
mins NaN 1.0 1.0 2001.0 -2.0 -9.0 -18.0
mean 0.0 6.4744285160815.70825261532007.5714839258.871042921 50.310351524641.4812584968
maxs NaN 12.0 31.0 2015.0 103.0 93.0 82.0
sigma -0.0 3.469051716948.798951739974.0773409057 21.482977723719.930239926619.0207297123
zeros -5162 0 0 0 0 2 16
missing5162 0 0 0 13 13 13
0 nan 1.0 1.0 2001.0 23.0 14.0 6.0
1 nan 1.0 2.0 2001.0 18.0 12.0 6.0
2 nan 1.0 3.0 2001.0 28.0 18.0 8.0
3 nan 1.0 4.0 2001.0 30.0 24.0 19.0
4 nan 1.0 5.0 2001.0 36.0 30.0 21.0
5 nan 1.0 6.0 2001.0 33.0 26.0 19.0
6 nan 1.0 7.0 2001.0 34.0 28.0 21.0
7 nan 1.0 8.0 2001.0 26.0 20.0 14.0
8 nan 1.0 9.0 2001.0 23.0 16.0 10.0
9 nan 1.0 10.0 2001.0 34.0 26.0 19.0
Import and Parse census data

Parse Progress: [##################################################] 100%
Rows:79 Cols:9

Chunk compression summary:
chunk_type chunk_name count count_percentage size size_percentage
C1 1-Byte Integers 3 33.333336 441 B 22.546013
C1S 1-Byte Fractions 1 11.111112 163 B 8.333334
C2S 2-Byte Fractions 4 44.444447 968 B 49.488754
C4 4-Byte Integers 1 11.111112 384 B 19.6319
Frame distribution summary:
size number_of_rows number_of_chunks_per_column number_of_chunks
172.16.2.61:54321 1.9 KB 79.0 1.0 9.0
mean 1.9 KB 79.0 1.0 9.0
min 1.9 KB 79.0 1.0 9.0
max 1.9 KB 79.0 1.0 9.0
stddev 0 B 0.0 0.0 0.0
total 1.9 KB 79.0 1.0 9.0

Community Area Number COMMUNITY AREA NAME PERCENT OF HOUSING CROWDED PERCENT HOUSEHOLDS BELOW POVERTY PERCENT AGED 16 UNEMPLOYED PERCENT AGED 25 WITHOUT HIGH SCHOOL DIPLOMA PERCENT AGED UNDER 18 OR OVER 64 PER CAPITA INCOME HARDSHIP INDEX
type int enum real real real real real int int
mins 1.0 0.0 0.3 3.3 4.7 2.5 13.5 8201.0 1.0
mean 39.0 NaN 4.92051282051 21.7397435897 15.341025641 20.3307692308 35.7179487179 25597.0 49.5064935065
maxs 77.0 78.0 15.8 56.5 35.9 54.8 51.5 88669.0 98.0
sigma 22.3718573212 NaN 3.65898144135 11.457230913 7.49949670861 11.7465143511 7.28442108494 15196.4055413 28.6905556516
zeros 0 1 0 0 0 0 0 0 0
missing2 0 1 1 1 1 1 1 2
0 nan COMMUNITY AREA NAME nan nan nan nan nan nan nan
1 1.0 Rogers Park 7.7 23.6 8.7 18.2 27.5 23939.0 39.0
2 2.0 West Ridge 7.8 17.2 8.8 20.8 38.5 23040.0 46.0
3 3.0 Uptown 3.8 24.0 8.9 11.8 22.2 35787.0 20.0
4 4.0 Lincoln Square 3.4 10.9 8.2 13.4 25.5 37524.0 17.0
5 5.0 North Center 0.3 7.5 5.2 4.5 26.2 57123.0 6.0
6 6.0 Lake View 1.1 11.4 4.7 2.6 17.0 60058.0 5.0
7 7.0 Lincoln Park 0.8 12.3 5.1 3.6 21.5 71551.0 2.0
8 8.0 Near North Side 1.9 12.9 7.0 2.5 22.6 88669.0 1.0
9 9.0 Edison Park 1.1 3.3 6.5 7.4 35.3 40959.0 8.0
Import and Parse crimes data

Parse Progress: [##################################################] 100%
Rows:9,999 Cols:22

Chunk compression summary:
chunk_type chunk_name count count_percentage size size_percentage
C0L Constant Integers 4 4.5454545 320 B 0.0369521
C1 1-Byte Integers 32 36.363636 80.2 KB 9.488375
C1N 1-Byte Integers (w/o NAs) 8 9.090909 20.1 KB 2.3720937
C2 2-Byte Integers 16 18.181818 79.2 KB 9.362738
C4 4-Byte Integers 12 13.636364 118.0 KB 13.949879
CStr String 8 9.090909 391.2 KB 46.25294
C8D 64-bit Reals 8 9.090909 156.8 KB 18.53702
Frame distribution summary:
size number_of_rows number_of_chunks_per_column number_of_chunks
172.16.2.61:54321 845.7 KB 9999.0 4.0 88.0
mean 845.7 KB 9999.0 4.0 88.0
min 845.7 KB 9999.0 4.0 88.0
max 845.7 KB 9999.0 4.0 88.0
stddev 0 B 0.0 0.0 0.0
total 845.7 KB 9999.0 4.0 88.0

ID Case Number Date Block IUCR Primary Type Description Location Description Arrest Domestic Beat District Ward Community Area FBI Code X Coordinate Y Coordinate Year Updated On Latitude Longitude Location
type int string string enum int enum enum enum enum enum int int int int int int int int enum real real enum
mins 21735.0 NaN NaN 0.0 110.0 0.0 0.0 0.0 0.0 0.0 111.0 1.0 1.0 1.0 2.0 1100317.0 1814255.0 2015.00.0 41.64507243 -87.906463888 0.0
mean 9931318.73737NaN NaN NaN 1189.67651357NaN NaN NaN 0.2928292829280.1523152315231159.6180618111.348988512822.954095409537.4476447645 12.74012362271163880.59815 1885916.14984 2015.0NaN 41.8425652247 -87.6741405221 NaN
maxs 9962898.0 NaN NaN 6517.0 5131.0 26.0 198.0 90.0 1.0 1.0 2535.0 25.0 50.0 77.0 26.0 1205069.0 1951533.0 2015.032.0 42.022646183 -87.524773286 8603.0
sigma 396787.564221NaN NaN NaN 927.751435583NaN NaN NaN 0.4550835155880.35934414686 695.76029875 6.9454749330113.649566114421.2748762223 7.5742385791116496.4493681 31274.0163199 0.0 NaN 0.08601865793580.0600357970653NaN
zeros 0 0 0 3 0 11 933 19 7071 8476 0 0 0 0 0 0 0 0 603 0 0 1
missing0 0 0 0 419 0 0 6 0 0 0 162 0 0 2557 162 162 0 0 162 162 162
0 9955810.0 HY144797 02/08/2015 11:43:40 PM081XX S COLES AVE 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESSSTREET true false 422.0 4.0 7.0 46.0 18.0 1198273.0 1851626.0 2015.002/15/2015 12:43:39 PM41.747693646 -87.549035389 (41.747693646, -87.549035389)
1 9955861.0 HY144838 02/08/2015 11:41:42 PM118XX S STATE ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT true true 522.0 5.0 34.0 53.0 nan 1178335.0 1826581.0 2015.002/15/2015 12:43:39 PM41.679442289 -87.622850758 (41.679442289, -87.622850758)
2 9955801.0 HY144779 02/08/2015 11:30:22 PM002XX S LARAMIE AVE 2026.0 NARCOTICS POSS: PCP SIDEWALK true false 1522.0 15.0 29.0 25.0 18.0 1141717.0 1898581.0 2015.002/15/2015 12:43:39 PM41.87777333 -87.755117993 (41.87777333, -87.755117993)
3 9956197.0 HY144787 02/08/2015 11:30:23 PM006XX E 67TH ST 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESSSTREET true false 321.0 nan 6.0 42.0 18.0 nan nan 2015.002/15/2015 12:43:39 PMnan nan
4 9955846.0 HY144829 02/08/2015 11:30:58 PM0000X S MAYFIELD AVE610.0 BURGLARY FORCIBLE ENTRY APARTMENT false false 1513.0 15.0 29.0 25.0 5.0 1137239.0 1899372.0 2015.002/15/2015 12:43:39 PM41.880025548 -87.771541324 (41.880025548, -87.771541324)
5 9955835.0 HY144778 02/08/2015 11:30:21 PM010XX W 48TH ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT false true 933.0 9.0 3.0 61.0 nan 1169986.0 1873019.0 2015.002/15/2015 12:43:39 PM41.807059405 -87.65206589 (41.807059405, -87.65206589)
6 9955872.0 HY144822 02/08/2015 11:27:24 PM015XX W ARTHUR AVE 1320.0 CRIMINAL DAMAGETO VEHICLE STREET false false 2432.0 24.0 40.0 1.0 14.0 1164732.0 1943222.0 2015.002/15/2015 12:43:39 PM41.999814056 -87.669342967 (41.999814056, -87.669342967)
7 21752.0 HY144738 02/08/2015 11:26:12 PM060XX W GRAND AVE 110.0 HOMICIDE FIRST DEGREE MURDER STREET true false 2512.0 25.0 37.0 19.0 nan 1135910.0 1914206.0 2015.002/15/2015 12:43:39 PM41.920755683 -87.776067514 (41.920755683, -87.776067514)
8 9955808.0 HY144775 02/08/2015 11:20:33 PM001XX W WACKER DR 460.0 BATTERY SIMPLE OTHER false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 2015.002/15/2015 12:43:39 PM41.886707818 -87.631396356 (41.886707818, -87.631396356)
9 9958275.0 HY146732 02/08/2015 11:15:36 PM001XX W WACKER DR 460.0 BATTERY SIMPLE HOTEL/MOTEL false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 2015.002/15/2015 12:43:39 PM41.886707818 -87.631396356 (41.886707818, -87.631396356)

In [12]:
def refine_date_col(data, col, pattern):
    data[col]         = data[col].as_date(pattern)
    data["Day"]       = data[col].day()
    data["Month"]     = data[col].month() + 1     # Since H2O indexes from 0
    data["Year"]      = data[col].year() + 1900   # Start of epoch is 1900
    data["WeekNum"]   = data[col].week()
    data["WeekDay"]   = data[col].dayOfWeek()
    data["HourOfDay"] = data[col].hour()
    
    data.describe()  # HACK: Force evaluation before ifelse and cut. See PUBDEV-1425.
    
    # Create weekend and season cols
    # Spring = Mar, Apr, May. Summer = Jun, Jul, Aug. Autumn = Sep, Oct. Winter = Nov, Dec, Jan, Feb.
    # data["Weekend"]   = [1 if x in ("Sun", "Sat") else 0 for x in data["WeekDay"]]
    data["Weekend"] = ((data["WeekDay"] == "Sun") | (data["WeekDay"] == "Sat"))
    data["Season"] = data["Month"].cut([0, 2, 5, 7, 10, 12], ["Winter", "Spring", "Summer", "Autumn", "Winter"])
    
refine_date_col(crimes, "Date", "%m/%d/%Y %I:%M:%S %p")
crimes = crimes.drop("Date")
crimes.describe()


Rows:9,999 Cols:27

Chunk compression summary:
chunk_type chunk_name count count_percentage size size_percentage
C0L Constant Integers 9 8.333334 720 B 0.1006741
C1 1-Byte Integers 32 29.62963 80.2 KB 11.489151
C1N 1-Byte Integers (w/o NAs) 23 21.296297 57.9 KB 8.296664
C2 2-Byte Integers 16 14.814815 79.2 KB 11.337022
C4 4-Byte Integers 12 11.111112 118.0 KB 16.891436
C8 64-bit Integers 4 3.7037036 78.4 KB 11.222924
CStr String 4 3.7037036 127.2 KB 18.21628
C8D 64-bit Reals 8 7.4074073 156.8 KB 22.445848
Frame distribution summary:
size number_of_rows number_of_chunks_per_column number_of_chunks
172.16.2.61:54321 698.4 KB 9999.0 4.0 108.0
mean 698.4 KB 9999.0 4.0 108.0
min 698.4 KB 9999.0 4.0 108.0
max 698.4 KB 9999.0 4.0 108.0
stddev 0 B 0.0 0.0 0.0
total 698.4 KB 9999.0 4.0 108.0

ID Case Number Date Block IUCR Primary Type Description Location Description Arrest Domestic Beat District Ward Community Area FBI Code X Coordinate Y Coordinate Year Updated On Latitude Longitude Location Day Month WeekNum WeekDay HourOfDay
type int string int enum int enum enum enum enum enum int int int int int int int int enum real real enum int int int enum int
mins 21735.0 NaN 1.42203063e+12 0.0 110.0 0.0 0.0 0.0 0.0 0.0 111.0 1.0 1.0 1.0 2.0 1100317.0 1814255.0 3915.00.0 41.64507243 -87.906463888 0.0 1.0 2.0 4.0 0.0 0.0
mean 9931318.73737NaN 1.42271445081e+12NaN 1189.67651357NaN NaN NaN 0.2928292829280.1523152315231159.6180618111.348988512822.954095409537.4476447645 12.74012362271163880.59815 1885916.14984 3915.0NaN 41.8425652247 -87.6741405221 NaN 17.68396839682.41944194419 5.18081808181 NaN 13.6319631963
maxs 9962898.0 NaN 1.42346782e+12 6517.0 5131.0 26.0 198.0 90.0 1.0 1.0 2535.0 25.0 50.0 77.0 26.0 1205069.0 1951533.0 3915.032.0 42.022646183 -87.524773286 8603.0 31.0 3.0 6.0 6.0 23.0
sigma 396787.564221NaN 433879245.187 NaN 927.751435583NaN NaN NaN 0.4550835155880.35934414686 695.76029875 6.9454749330113.649566114421.2748762223 7.5742385791116496.4493681 31274.0163199 0.0 NaN 0.08601865793580.0600357970653NaN 11.18010433580.4934924067870.738929830409NaN 6.47321735807
zeros 0 0 0 3 0 11 933 19 7071 8476 0 0 0 0 0 0 0 0 603 0 0 1 0 0 0 1038 374
missing0 0 0 0 419 0 0 6 0 0 0 162 0 0 2557 162 162 0 0 162 162 162 0 0 0 0 0
0 9955810.0 HY144797 1.42346782e+12 081XX S COLES AVE 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESSSTREET true false 422.0 4.0 7.0 46.0 18.0 1198273.0 1851626.0 3915.002/15/2015 12:43:39 PM41.747693646 -87.549035389 (41.747693646, -87.549035389)8.0 3.0 6.0 Sun 23.0
1 9955861.0 HY144838 1.423467702e+12 118XX S STATE ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT true true 522.0 5.0 34.0 53.0 nan 1178335.0 1826581.0 3915.002/15/2015 12:43:39 PM41.679442289 -87.622850758 (41.679442289, -87.622850758)8.0 3.0 6.0 Sun 23.0
2 9955801.0 HY144779 1.423467022e+12 002XX S LARAMIE AVE 2026.0 NARCOTICS POSS: PCP SIDEWALK true false 1522.0 15.0 29.0 25.0 18.0 1141717.0 1898581.0 3915.002/15/2015 12:43:39 PM41.87777333 -87.755117993 (41.87777333, -87.755117993) 8.0 3.0 6.0 Sun 23.0
3 9956197.0 HY144787 1.423467023e+12 006XX E 67TH ST 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESSSTREET true false 321.0 nan 6.0 42.0 18.0 nan nan 3915.002/15/2015 12:43:39 PMnan nan 8.0 3.0 6.0 Sun 23.0
4 9955846.0 HY144829 1.423467058e+12 0000X S MAYFIELD AVE610.0 BURGLARY FORCIBLE ENTRY APARTMENT false false 1513.0 15.0 29.0 25.0 5.0 1137239.0 1899372.0 3915.002/15/2015 12:43:39 PM41.880025548 -87.771541324 (41.880025548, -87.771541324)8.0 3.0 6.0 Sun 23.0
5 9955835.0 HY144778 1.423467021e+12 010XX W 48TH ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT false true 933.0 9.0 3.0 61.0 nan 1169986.0 1873019.0 3915.002/15/2015 12:43:39 PM41.807059405 -87.65206589 (41.807059405, -87.65206589) 8.0 3.0 6.0 Sun 23.0
6 9955872.0 HY144822 1.423466844e+12 015XX W ARTHUR AVE 1320.0 CRIMINAL DAMAGETO VEHICLE STREET false false 2432.0 24.0 40.0 1.0 14.0 1164732.0 1943222.0 3915.002/15/2015 12:43:39 PM41.999814056 -87.669342967 (41.999814056, -87.669342967)8.0 3.0 6.0 Sun 23.0
7 21752.0 HY144738 1.423466772e+12 060XX W GRAND AVE 110.0 HOMICIDE FIRST DEGREE MURDER STREET true false 2512.0 25.0 37.0 19.0 nan 1135910.0 1914206.0 3915.002/15/2015 12:43:39 PM41.920755683 -87.776067514 (41.920755683, -87.776067514)8.0 3.0 6.0 Sun 23.0
8 9955808.0 HY144775 1.423466433e+12 001XX W WACKER DR 460.0 BATTERY SIMPLE OTHER false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 3915.002/15/2015 12:43:39 PM41.886707818 -87.631396356 (41.886707818, -87.631396356)8.0 3.0 6.0 Sun 23.0
9 9958275.0 HY146732 1.423466136e+12 001XX W WACKER DR 460.0 BATTERY SIMPLE HOTEL/MOTEL false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 3915.002/15/2015 12:43:39 PM41.886707818 -87.631396356 (41.886707818, -87.631396356)8.0 3.0 6.0 Sun 23.0
Rows:9,999 Cols:28

Chunk compression summary:
chunk_type chunk_name count count_percentage size size_percentage
C0L Constant Integers 12 10.714286 960 B 0.1506986
CBS Bits 5 4.464286 1.8 KB 0.2948042
C1 1-Byte Integers 32 28.57143 80.2 KB 12.898546
C1N 1-Byte Integers (w/o NAs) 23 20.535715 57.9 KB 9.314431
C2 2-Byte Integers 16 14.285715 79.2 KB 12.727756
C4 4-Byte Integers 12 10.714286 118.0 KB 18.963537
CStr String 4 3.5714288 127.2 KB 20.450903
C8D 64-bit Reals 8 7.1428576 156.8 KB 25.199324
Frame distribution summary:
size number_of_rows number_of_chunks_per_column number_of_chunks
172.16.2.61:54321 622.1 KB 9999.0 4.0 112.0
mean 622.1 KB 9999.0 4.0 112.0
min 622.1 KB 9999.0 4.0 112.0
max 622.1 KB 9999.0 4.0 112.0
stddev 0 B 0.0 0.0 0.0
total 622.1 KB 9999.0 4.0 112.0

ID Case Number Block IUCR Primary Type Description Location Description Arrest Domestic Beat District Ward Community Area FBI Code X Coordinate Y Coordinate Year Updated On Latitude Longitude Location Day Month WeekNum WeekDay HourOfDay Weekend Season
type int string enum int enum enum enum enum enum int int int int int int int int enum real real enum int int int enum int int enum
mins 21735.0 NaN 0.0 110.0 0.0 0.0 0.0 0.0 0.0 111.0 1.0 1.0 1.0 2.0 1100317.0 1814255.0 3915.00.0 41.64507243 -87.906463888 0.0 1.0 2.0 4.0 0.0 0.0 0.0 0.0
mean 9931318.73737NaN NaN 1189.67651357NaN NaN NaN 0.2928292829280.1523152315231159.6180618111.348988512822.954095409537.4476447645 12.74012362271163880.59815 1885916.14984 3915.0NaN 41.8425652247 -87.6741405221 NaN 17.68396839682.41944194419 5.18081808181 NaN 13.63196319630.357535753575NaN
maxs 9962898.0 NaN 6517.0 5131.0 26.0 198.0 90.0 1.0 1.0 2535.0 25.0 50.0 77.0 26.0 1205069.0 1951533.0 3915.032.0 42.022646183 -87.524773286 8603.0 31.0 3.0 6.0 6.0 23.0 1.0 1.0
sigma 396787.564221NaN NaN 927.751435583NaN NaN NaN 0.4550835155880.35934414686 695.76029875 6.9454749330113.649566114421.2748762223 7.5742385791116496.4493681 31274.0163199 0.0 NaN 0.08601865793580.0600357970653NaN 11.18010433580.4934924067870.738929830409NaN 6.473217358070.47929835539 NaN
zeros 0 0 3 0 11 933 19 7071 8476 0 0 0 0 0 0 0 0 603 0 0 1 0 0 0 1038 374 6424 5805
missing0 0 0 419 0 0 6 0 0 0 162 0 0 2557 162 162 0 0 162 162 162 0 0 0 0 0 0 0
0 9955810.0 HY144797 081XX S COLES AVE 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESSSTREET true false 422.0 4.0 7.0 46.0 18.0 1198273.0 1851626.0 3915.002/15/2015 12:43:39 PM41.747693646 -87.549035389 (41.747693646, -87.549035389)8.0 3.0 6.0 Sun 23.0 1.0 Spring
1 9955861.0 HY144838 118XX S STATE ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT true true 522.0 5.0 34.0 53.0 nan 1178335.0 1826581.0 3915.002/15/2015 12:43:39 PM41.679442289 -87.622850758 (41.679442289, -87.622850758)8.0 3.0 6.0 Sun 23.0 1.0 Spring
2 9955801.0 HY144779 002XX S LARAMIE AVE 2026.0 NARCOTICS POSS: PCP SIDEWALK true false 1522.0 15.0 29.0 25.0 18.0 1141717.0 1898581.0 3915.002/15/2015 12:43:39 PM41.87777333 -87.755117993 (41.87777333, -87.755117993) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
3 9956197.0 HY144787 006XX E 67TH ST 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESSSTREET true false 321.0 nan 6.0 42.0 18.0 nan nan 3915.002/15/2015 12:43:39 PMnan nan 8.0 3.0 6.0 Sun 23.0 1.0 Spring
4 9955846.0 HY144829 0000X S MAYFIELD AVE610.0 BURGLARY FORCIBLE ENTRY APARTMENT false false 1513.0 15.0 29.0 25.0 5.0 1137239.0 1899372.0 3915.002/15/2015 12:43:39 PM41.880025548 -87.771541324 (41.880025548, -87.771541324)8.0 3.0 6.0 Sun 23.0 1.0 Spring
5 9955835.0 HY144778 010XX W 48TH ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT false true 933.0 9.0 3.0 61.0 nan 1169986.0 1873019.0 3915.002/15/2015 12:43:39 PM41.807059405 -87.65206589 (41.807059405, -87.65206589) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
6 9955872.0 HY144822 015XX W ARTHUR AVE 1320.0 CRIMINAL DAMAGETO VEHICLE STREET false false 2432.0 24.0 40.0 1.0 14.0 1164732.0 1943222.0 3915.002/15/2015 12:43:39 PM41.999814056 -87.669342967 (41.999814056, -87.669342967)8.0 3.0 6.0 Sun 23.0 1.0 Spring
7 21752.0 HY144738 060XX W GRAND AVE 110.0 HOMICIDE FIRST DEGREE MURDER STREET true false 2512.0 25.0 37.0 19.0 nan 1135910.0 1914206.0 3915.002/15/2015 12:43:39 PM41.920755683 -87.776067514 (41.920755683, -87.776067514)8.0 3.0 6.0 Sun 23.0 1.0 Spring
8 9955808.0 HY144775 001XX W WACKER DR 460.0 BATTERY SIMPLE OTHER false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 3915.002/15/2015 12:43:39 PM41.886707818 -87.631396356 (41.886707818, -87.631396356)8.0 3.0 6.0 Sun 23.0 1.0 Spring
9 9958275.0 HY146732 001XX W WACKER DR 460.0 BATTERY SIMPLE HOTEL/MOTEL false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 3915.002/15/2015 12:43:39 PM41.886707818 -87.631396356 (41.886707818, -87.631396356)8.0 3.0 6.0 Sun 23.0 1.0 Spring

In [13]:
# Merge crimes data with weather and census
census.set_name(0,"Community Area")
weather.set_name(1,"Month")
weather.set_name(2,"Day")
weather.set_name(3,"Year")
crimes.merge(census, all_x=True, all_y=False)
crimes.merge(weather, all_x=True, all_y=False)
crimes.describe()


Rows:9,999 Cols:28

Chunk compression summary:
chunk_type chunk_name count count_percentage size size_percentage
C0L Constant Integers 12 10.714286 960 B 0.1506986
CBS Bits 5 4.464286 1.8 KB 0.2948042
C1 1-Byte Integers 32 28.57143 80.2 KB 12.898546
C1N 1-Byte Integers (w/o NAs) 23 20.535715 57.9 KB 9.314431
C2 2-Byte Integers 16 14.285715 79.2 KB 12.727756
C4 4-Byte Integers 12 10.714286 118.0 KB 18.963537
CStr String 4 3.5714288 127.2 KB 20.450903
C8D 64-bit Reals 8 7.1428576 156.8 KB 25.199324
Frame distribution summary:
size number_of_rows number_of_chunks_per_column number_of_chunks
172.16.2.61:54321 622.1 KB 9999.0 4.0 112.0
mean 622.1 KB 9999.0 4.0 112.0
min 622.1 KB 9999.0 4.0 112.0
max 622.1 KB 9999.0 4.0 112.0
stddev 0 B 0.0 0.0 0.0
total 622.1 KB 9999.0 4.0 112.0

ID Case Number Block IUCR Primary Type Description Location Description Arrest Domestic Beat District Ward Community Area FBI Code X Coordinate Y Coordinate Year Updated On Latitude Longitude Location Day Month WeekNum WeekDay HourOfDay Weekend Season
type int string enum int enum enum enum enum enum int int int int int int int int enum real real enum int int int enum int int enum
mins 21735.0 NaN 0.0 110.0 0.0 0.0 0.0 0.0 0.0 111.0 1.0 1.0 1.0 2.0 1100317.0 1814255.0 3915.00.0 41.64507243 -87.906463888 0.0 1.0 2.0 4.0 0.0 0.0 0.0 0.0
mean 9931318.73737NaN NaN 1189.67651357NaN NaN NaN 0.2928292829280.1523152315231159.6180618111.348988512822.954095409537.4476447645 12.74012362271163880.59815 1885916.14984 3915.0NaN 41.8425652247 -87.6741405221 NaN 17.68396839682.41944194419 5.18081808181 NaN 13.63196319630.357535753575NaN
maxs 9962898.0 NaN 6517.0 5131.0 26.0 198.0 90.0 1.0 1.0 2535.0 25.0 50.0 77.0 26.0 1205069.0 1951533.0 3915.032.0 42.022646183 -87.524773286 8603.0 31.0 3.0 6.0 6.0 23.0 1.0 1.0
sigma 396787.564221NaN NaN 927.751435583NaN NaN NaN 0.4550835155880.35934414686 695.76029875 6.9454749330113.649566114421.2748762223 7.5742385791116496.4493681 31274.0163199 0.0 NaN 0.08601865793580.0600357970653NaN 11.18010433580.4934924067870.738929830409NaN 6.473217358070.47929835539 NaN
zeros 0 0 3 0 11 933 19 7071 8476 0 0 0 0 0 0 0 0 603 0 0 1 0 0 0 1038 374 6424 5805
missing0 0 0 419 0 0 6 0 0 0 162 0 0 2557 162 162 0 0 162 162 162 0 0 0 0 0 0 0
0 9955810.0 HY144797 081XX S COLES AVE 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESSSTREET true false 422.0 4.0 7.0 46.0 18.0 1198273.0 1851626.0 3915.002/15/2015 12:43:39 PM41.747693646 -87.549035389 (41.747693646, -87.549035389)8.0 3.0 6.0 Sun 23.0 1.0 Spring
1 9955861.0 HY144838 118XX S STATE ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT true true 522.0 5.0 34.0 53.0 nan 1178335.0 1826581.0 3915.002/15/2015 12:43:39 PM41.679442289 -87.622850758 (41.679442289, -87.622850758)8.0 3.0 6.0 Sun 23.0 1.0 Spring
2 9955801.0 HY144779 002XX S LARAMIE AVE 2026.0 NARCOTICS POSS: PCP SIDEWALK true false 1522.0 15.0 29.0 25.0 18.0 1141717.0 1898581.0 3915.002/15/2015 12:43:39 PM41.87777333 -87.755117993 (41.87777333, -87.755117993) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
3 9956197.0 HY144787 006XX E 67TH ST 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESSSTREET true false 321.0 nan 6.0 42.0 18.0 nan nan 3915.002/15/2015 12:43:39 PMnan nan 8.0 3.0 6.0 Sun 23.0 1.0 Spring
4 9955846.0 HY144829 0000X S MAYFIELD AVE610.0 BURGLARY FORCIBLE ENTRY APARTMENT false false 1513.0 15.0 29.0 25.0 5.0 1137239.0 1899372.0 3915.002/15/2015 12:43:39 PM41.880025548 -87.771541324 (41.880025548, -87.771541324)8.0 3.0 6.0 Sun 23.0 1.0 Spring
5 9955835.0 HY144778 010XX W 48TH ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT false true 933.0 9.0 3.0 61.0 nan 1169986.0 1873019.0 3915.002/15/2015 12:43:39 PM41.807059405 -87.65206589 (41.807059405, -87.65206589) 8.0 3.0 6.0 Sun 23.0 1.0 Spring
6 9955872.0 HY144822 015XX W ARTHUR AVE 1320.0 CRIMINAL DAMAGETO VEHICLE STREET false false 2432.0 24.0 40.0 1.0 14.0 1164732.0 1943222.0 3915.002/15/2015 12:43:39 PM41.999814056 -87.669342967 (41.999814056, -87.669342967)8.0 3.0 6.0 Sun 23.0 1.0 Spring
7 21752.0 HY144738 060XX W GRAND AVE 110.0 HOMICIDE FIRST DEGREE MURDER STREET true false 2512.0 25.0 37.0 19.0 nan 1135910.0 1914206.0 3915.002/15/2015 12:43:39 PM41.920755683 -87.776067514 (41.920755683, -87.776067514)8.0 3.0 6.0 Sun 23.0 1.0 Spring
8 9955808.0 HY144775 001XX W WACKER DR 460.0 BATTERY SIMPLE OTHER false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 3915.002/15/2015 12:43:39 PM41.886707818 -87.631396356 (41.886707818, -87.631396356)8.0 3.0 6.0 Sun 23.0 1.0 Spring
9 9958275.0 HY146732 001XX W WACKER DR 460.0 BATTERY SIMPLE HOTEL/MOTEL false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 3915.002/15/2015 12:43:39 PM41.886707818 -87.631396356 (41.886707818, -87.631396356)8.0 3.0 6.0 Sun 23.0 1.0 Spring

In [14]:
# Create test/train split
r = crimes["Arrest"].runif(1234)
train = crimes[r < 0.8]
test = crimes[r >= 0.8]

# Simple GBM - Predict Arrest
data_gbm = h2o.gbm(x              =train.drop("Arrest"),
                   y              =train     ["Arrest"],
                   validation_x   =test .drop("Arrest"),
                   validation_y   =test      ["Arrest"],
                   ntrees         =10,
                   max_depth      =6,
                   distribution   ="bernoulli")

# Simple Deep Learning - Predict Arrest
data_dl = h2o.deeplearning(x                   =train.drop("Arrest"),
                           y                   =train     ["Arrest"],
                           validation_x        =test .drop("Arrest"),
                           validation_y        =test      ["Arrest"],
                           variable_importances=True,
                           loss                ="Automatic")


gbm Model Build Progress: [##################################################] 100%
-c:13: DeprecationWarning: `h2o.gbm` is deprecated. Use the estimators sub module to build an H2OGradientBoostedEstimator.
-c:21: DeprecationWarning: `h2o.deeplearning` is deprecated. Use the estimators sub module to build an H2ODeepLearningEstimator.

deeplearning Model Build Progress: [##################################################] 100%

In [15]:
# GBM performance on train/test data
train_auc_gbm = data_gbm.model_performance(train).auc()
test_auc_gbm  = data_gbm.model_performance(test) .auc()

# Deep Learning performance on train/test data
train_auc_dl = data_dl.model_performance(train).auc()
test_auc_dl  = data_dl.model_performance(test) .auc()

# Make a pretty HTML table printout of the results
header = ["Model", "AUC Train", "AUC Test"]
table  = [
           ["GBM", train_auc_gbm, test_auc_gbm],
           ["DL ", train_auc_dl,  test_auc_dl]
         ]
h2o.display.H2ODisplay(table, header)


Model AUC Train AUC Test
GBM 0.9562092 0.9325577
DL 0.9844351 0.9229118
Out[15]:


In [16]:
# Create new H2OFrame of crime observations
examples = {
            "Date":                 ["02/08/2015 11:43:58 PM", "02/08/2015 11:00:39 PM"],
            "IUCR":                 [1811, 1150],
            "Primary.Type":         ["NARCOTICS", "DECEPTIVE PRACTICE"],
            "Location.Description": ["STREET", "RESIDENCE"],
            "Domestic":             ["false", "false"],
            "Beat":                 [422, 923],
            "District":             [4, 9],
            "Ward":                 [7, 14],
            "Community.Area":       [46, 63],
            "FBI.Code":             [18, 11]
            }

crime_examples = h2o.H2OFrame(examples)

# Refine date column and merge with census data
refine_date_col(crime_examples, "Date", "%m/%d/%Y %I:%M:%S %p")
crime_examples.drop("Date")
census.set_name(0,"Community.Area")
crime_examples.merge(census, all_x=True, all_y=False)
crime_examples.describe()


Parse Progress: [##################################################] 100%
Rows:2 Cols:16

Chunk compression summary:
chunk_type chunk_name count count_percentage size size_percentage
C0L Constant Integers 7 43.75 560 B 43.75
C1N 1-Byte Integers (w/o NAs) 4 25.0 280 B 21.875
C2 2-Byte Integers 2 12.5 144 B 11.25
C2S 2-Byte Fractions 1 6.25 88 B 6.875
CStr String 2 12.5 208 B 16.25
Frame distribution summary:
size number_of_rows number_of_chunks_per_column number_of_chunks
172.16.2.61:54321 1.3 KB 2.0 1.0 16.0
mean 1.3 KB 2.0 1.0 16.0
min 1.3 KB 2.0 1.0 16.0
max 1.3 KB 2.0 1.0 16.0
stddev 0 B 0.0 0.0 0.0
total 1.3 KB 2.0 1.0 16.0

Location.Description FBI.Code Beat Primary.Type Community.Area District Date Ward Domestic IUCR Day Month Year WeekNum WeekDay HourOfDay
type string int int string int int int int enum int int int int int enum int
mins NaN 11.0 422.0 NaN 46.0 4.0 1.423465239e+12 7.0 0.0 1150.0 8.0 3.0 3915.06.0 6.0 23.0
mean NaN 14.5 672.5 NaN 54.5 6.5 1.4234665385e+1210.5 0.0 1480.5 8.0 3.0 3915.06.0 NaN 23.0
maxs NaN 18.0 923.0 NaN 63.0 9.0 1.423467838e+12 14.0 0.0 1811.0 8.0 3.0 3915.06.0 6.0 23.0
sigma NaN 4.94974746831354.260497374NaN 12.0208152802 3.535533905931837770.5243 4.949747468310.0 467.3975823640.0 0.0 0.0 0.0 NaN 0.0
zeros 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0
missing0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 STREET 18.0 422.0 NARCOTICS 46.0 4.0 1.423467838e+12 7.0 false 1811.0 8.0 3.0 3915.06.0 Sun 23.0
1 RESIDENCE 11.0 923.0 DECEPTIVE PRACTICE63.0 9.0 1.423465239e+12 14.0 false 1150.0 8.0 3.0 3915.06.0 Sun 23.0
Rows:2 Cols:18

Chunk compression summary:
chunk_type chunk_name count count_percentage size size_percentage
C0L Constant Integers 9 50.0 720 B 50.0
C1N 1-Byte Integers (w/o NAs) 4 22.222223 280 B 19.444445
C2 2-Byte Integers 2 11.111112 144 B 10.0
C2S 2-Byte Fractions 1 5.555556 88 B 6.111111
CStr String 2 11.111112 208 B 14.444445
Frame distribution summary:
size number_of_rows number_of_chunks_per_column number_of_chunks
172.16.2.61:54321 1.4 KB 2.0 1.0 18.0
mean 1.4 KB 2.0 1.0 18.0
min 1.4 KB 2.0 1.0 18.0
max 1.4 KB 2.0 1.0 18.0
stddev 0 B 0.0 0.0 0.0
total 1.4 KB 2.0 1.0 18.0

Location.Description FBI.Code Beat Primary.Type Community.Area District Date Ward Domestic IUCR Day Month Year WeekNum WeekDay HourOfDay Weekend Season
type string int int string int int int int enum int int int int int enum int int enum
mins NaN 11.0 422.0 NaN 46.0 4.0 1.423465239e+12 7.0 0.0 1150.0 8.0 3.0 3915.06.0 6.0 23.0 1.0 1.0
mean NaN 14.5 672.5 NaN 54.5 6.5 1.4234665385e+1210.5 0.0 1480.5 8.0 3.0 3915.06.0 NaN 23.0 1.0 NaN
maxs NaN 18.0 923.0 NaN 63.0 9.0 1.423467838e+12 14.0 0.0 1811.0 8.0 3.0 3915.06.0 6.0 23.0 1.0 1.0
sigma NaN 4.94974746831354.260497374NaN 12.0208152802 3.535533905931837770.5243 4.949747468310.0 467.3975823640.0 0.0 0.0 0.0 NaN 0.0 0.0 NaN
zeros 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0
missing0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 STREET 18.0 422.0 NARCOTICS 46.0 4.0 1.423467838e+12 7.0 false 1811.0 8.0 3.0 3915.06.0 Sun 23.0 1.0 Spring
1 RESIDENCE 11.0 923.0 DECEPTIVE PRACTICE63.0 9.0 1.423465239e+12 14.0 false 1150.0 8.0 3.0 3915.06.0 Sun 23.0 1.0 Spring

In [17]:
# Predict probability of arrest from new observations
gbm_pred = data_gbm.predict(crime_examples)
dl_pred  = data_dl .predict(crime_examples)

# Make a pretty HTML table printout of the results
header = ["FBI Code", "GBM Arrest Prob", "DL Arrest Prob"]
table  = [
           [examples["FBI.Code"][0], gbm_pred[0,"true"], dl_pred[0,"true"]],
           [examples["FBI.Code"][1], gbm_pred[1,"true"], dl_pred[1,"true"]]
         ]
h2o.display.H2ODisplay(table, header)


FBI Code GBM Arrest Prob DL Arrest Prob
18 0.1136120 0.0044016
11 0.1136120 0.0113582
Out[17]: