In [1]:
from pysparkling import *
hc = H2OContext(sc).start()


Warning: Version mismatch. H2O is version 3.7.0.3247, but the python package is version UNKNOWN.
H2O cluster uptime: 996 milliseconds
H2O cluster version: 3.7.0.3247
H2O cluster name: sparkling-water-michal
H2O cluster total nodes: 1
H2O cluster total memory: 982.0 MB
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster healthy: True
H2O Connection ip: 172.16.2.223
H2O Connection port: 54321

In [2]:
# Import cluster local file - we need to add files via spark import files  -FIXME
import h2o
#from h2o.h2o import _locate # private function. used to find files within h2o project directory.
def _locate(s): 
    return "../../../" + s 
   

f_weather = h2o.import_file(_locate("examples/smalldata/chicagoAllWeather.csv"))
f_census = h2o.import_file(_locate("examples/smalldata/chicagoCensus.csv"))
f_crimes = h2o.import_file(_locate("examples/smalldata/chicagoCrimes10k.csv"))


Parse Progress: [##################################################] 100%

Parse Progress: [##################################################] 100%

Parse Progress: [##################################################] 100%

In [3]:
f_weather.show()


date month day year maxTemp meanTemp minTemp ------- ------- ----- ------ --------- ---------- --------- 1/1/01 1 1 2001 23 14 6 1/2/01 1 2 2001 18 12 6 1/3/01 1 3 2001 28 18 8 1/4/01 1 4 2001 30 24 19 1/5/01 1 5 2001 36 30 21 1/6/01 1 6 2001 33 26 19 1/7/01 1 7 2001 34 28 21 1/8/01 1 8 2001 26 20 14 1/9/01 1 9 2001 23 16 10 1/10/01 1 10 2001 34 26 19

In [4]:
# Transform weather table
## Remove 1st column (date)
f_weather = f_weather[1:]

In [5]:
# Transform census table
## Remove all spaces from column names (causing problems in Spark SQL)
col_names = map(lambda s: s.strip().replace(' ', '_'), f_census.col_names)

## Update column names in the table
#f_weather.names = col_names
f_census.names = col_names

In [6]:
# Transform crimes table

## Drop useless columns
f_crimes = f_crimes[2:]

## Replace ' ' by '_' in column names
col_names = map(lambda s: s.replace(' ', '_'), f_crimes.col_names)
f_crimes.names = col_names

## Refine date column
def refine_date_col(data, col, pattern):
    data[col]         = data[col].as_date(pattern)
    data["Day"]       = data[col].day()
    data["Month"]     = data[col].month()
    data["Year"]      = data[col].year()
    data["WeekNum"]   = data[col].week()
    data["WeekDay"]   = data[col].dayOfWeek()
    data["HourOfDay"] = data[col].hour()
    
    data.describe()  # HACK: Force evaluation before ifelse and cut. See PUBDEV-1425.
    
    # Create weekend and season cols
    # Spring = Mar, Apr, May. Summer = Jun, Jul, Aug. Autumn = Sep, Oct. Winter = Nov, Dec, Jan, Feb.
    # data["Weekend"]   = [1 if x in ("Sun", "Sat") else 0 for x in data["WeekDay"]]
    data["Weekend"] = (data["WeekDay"] == "Sun" or data["WeekDay"] == "Sat").ifelse(1, 0)[0]
    data["Season"] = data["Month"].cut([0, 2, 5, 7, 10, 12], ["Winter", "Spring", "Summer", "Autumn", "Winter"])
    
refine_date_col(f_crimes, "Date", "%m/%d/%Y %I:%M:%S %p")
f_crimes = f_crimes.drop("Date")
f_crimes.describe()


Rows:9,999 Cols:25

Chunk compression summary:
chunk_type chunk_name count count_percentage size size_percentage
C0L Constant Integers 147 16.333332 11.5 KB 2.2497318
CBS Bits 2 0.2 210 B 0.0
C1 1-Byte Integers 283 31.444445 95.6 KB 18.719261
C1N 1-Byte Integers (w/o NAs) 144 16.0 48.6 KB 9.521951
C2 2-Byte Integers 144 16.0 87.7 KB 17.176012
C2S 2-Byte Fractions 35 3.888889 21.9 KB 4.2817597
C4 4-Byte Integers 72 8.0 82.9 KB 16.23939
C4S 4-Byte Fractions 1 0.1 1.2 KB 0.2
C8 64-bit Integers 1 0.1 2.2 KB 0.4384681
C8D 64-bit Reals 71 7.8888884 158.8 KB 31.10522
Frame distribution summary:
size number_of_rows number_of_chunks_per_column number_of_chunks
172.16.2.223:54321 510.5 KB 9999.0 36.0 900.0
mean 510.5 KB 9999.0 36.0 900.0
min 510.5 KB 9999.0 36.0 900.0
max 510.5 KB 9999.0 36.0 900.0
stddev 0 B 0.0 0.0 0.0
total 510.5 KB 9999.0 36.0 900.0

Date Block IUCR Primary_Type Description Location_Description Arrest Domestic Beat District Ward Community_Area FBI_Code X_Coordinate Y_Coordinate Year Updated_On Latitude Longitude Location Day Month WeekNum WeekDay HourOfDay ------- ----------------- -------------------- ------------- --------------- ---------------------------- ---------------------- -------------- -------------- ------------- ------------- ------------- ---------------- ------------- -------------- -------------- ------ ---------------------- --------------- --------------- ----------------------------- ------------- -------------- -------------- --------- ------------- type int enum int enum enum enum enum enum int int int int int int int int enum real real enum int int int enum int mins 1.42203063e+12 0.0 110.0 0.0 0.0 0.0 0.0 0.0 111.0 1.0 1.0 1.0 2.0 1100317.0 1814255.0 2015.0 0.0 41.64507243 -87.906463888 0.0 1.0 1.0 4.0 0.0 0.0 mean 1.42271445081e+12 NaN 1189.67651357 NaN NaN NaN 0.292829282928 0.152315231523 1159.61806181 11.3489885128 22.9540954095 37.4476447645 12.7401236227 1163880.59815 1885916.14984 2015.0 NaN 41.8425652247 -87.6741405221 NaN 17.6839683968 1.41944194419 5.18081808181 NaN 13.6319631963 maxs 1.42346782e+12 6517.0 5131.0 26.0 198.0 90.0 1.0 1.0 2535.0 25.0 50.0 77.0 26.0 1205069.0 1951533.0 2015.0 32.0 42.022646183 -87.524773286 8603.0 31.0 2.0 6.0 6.0 23.0 sigma 433879245.188 NaN 927.751435583 NaN NaN NaN 0.455083515588 0.35934414686 695.76029875 6.94547493301 13.6495661144 21.2748762223 7.57423857911 16496.4493681 31274.0163199 0.0 NaN 0.0860186579359 0.0600357970653 NaN 11.1801043358 0.493492406787 0.738929830409 NaN 6.47321735807 zeros 0 3 0 11 933 19 7071 8476 0 0 0 0 0 0 0 0 603 0 0 1 0 0 0 1038 374 missing 0 0 419 0 0 6 0 0 0 162 0 0 2557 162 162 0 0 162 162 162 0 0 0 0 0 0 1.42346782e+12 081XX S COLES AVE 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESS STREET true false 422.0 4.0 7.0 46.0 18.0 1198273.0 1851626.0 2015.0 02/15/2015 12:43:39 PM 41.747693646 -87.549035389 (41.747693646, -87.549035389) 8.0 2.0 6.0 Sun 23.0 1 1.423467702e+12 118XX S STATE ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT true true 522.0 5.0 34.0 53.0 nan 1178335.0 1826581.0 2015.0 02/15/2015 12:43:39 PM 41.679442289 -87.622850758 (41.679442289, -87.622850758) 8.0 2.0 6.0 Sun 23.0 2 1.423467022e+12 002XX S LARAMIE AVE 2026.0 NARCOTICS POSS: PCP SIDEWALK true false 1522.0 15.0 29.0 25.0 18.0 1141717.0 1898581.0 2015.0 02/15/2015 12:43:39 PM 41.87777333 -87.755117993 (41.87777333, -87.755117993) 8.0 2.0 6.0 Sun 23.0 3 1.423467023e+12 006XX E 67TH ST 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESS STREET true false 321.0 nan 6.0 42.0 18.0 nan nan 2015.0 02/15/2015 12:43:39 PM nan nan 8.0 2.0 6.0 Sun 23.0 4 1.423467058e+12 0000X S MAYFIELD AVE 610.0 BURGLARY FORCIBLE ENTRY APARTMENT false false 1513.0 15.0 29.0 25.0 5.0 1137239.0 1899372.0 2015.0 02/15/2015 12:43:39 PM 41.880025548 -87.771541324 (41.880025548, -87.771541324) 8.0 2.0 6.0 Sun 23.0 5 1.423467021e+12 010XX W 48TH ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT false true 933.0 9.0 3.0 61.0 nan 1169986.0 1873019.0 2015.0 02/15/2015 12:43:39 PM 41.807059405 -87.65206589 (41.807059405, -87.65206589) 8.0 2.0 6.0 Sun 23.0 6 1.423466844e+12 015XX W ARTHUR AVE 1320.0 CRIMINAL DAMAGE TO VEHICLE STREET false false 2432.0 24.0 40.0 1.0 14.0 1164732.0 1943222.0 2015.0 02/15/2015 12:43:39 PM 41.999814056 -87.669342967 (41.999814056, -87.669342967) 8.0 2.0 6.0 Sun 23.0 7 1.423466772e+12 060XX W GRAND AVE 110.0 HOMICIDE FIRST DEGREE MURDER STREET true false 2512.0 25.0 37.0 19.0 nan 1135910.0 1914206.0 2015.0 02/15/2015 12:43:39 PM 41.920755683 -87.776067514 (41.920755683, -87.776067514) 8.0 2.0 6.0 Sun 23.0 8 1.423466433e+12 001XX W WACKER DR 460.0 BATTERY SIMPLE OTHER false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 2015.0 02/15/2015 12:43:39 PM 41.886707818 -87.631396356 (41.886707818, -87.631396356) 8.0 2.0 6.0 Sun 23.0 9 1.423466136e+12 001XX W WACKER DR 460.0 BATTERY SIMPLE HOTEL/MOTEL false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 2015.0 02/15/2015 12:43:39 PM 41.886707818 -87.631396356 (41.886707818, -87.631396356) 8.0 2.0 6.0 Sun 23.0
Rows:9,999 Cols:26

Chunk compression summary:
chunk_type chunk_name count count_percentage size size_percentage
C0L Constant Integers 214 22.863249 16.7 KB 3.3972297
CBS Bits 7 0.7 735 B 0.1
C1 1-Byte Integers 283 30.235043 95.6 KB 19.417192
C1N 1-Byte Integers (w/o NAs) 144 15.384616 48.6 KB 9.876969
C2 2-Byte Integers 144 15.384616 87.7 KB 17.816406
C4 4-Byte Integers 72 7.692308 82.9 KB 16.844862
C4S 4-Byte Fractions 1 0.1 1.2 KB 0.2365361
C8D 64-bit Reals 71 7.58547 158.8 KB 32.26495
Frame distribution summary:
size number_of_rows number_of_chunks_per_column number_of_chunks
172.16.2.223:54321 492.1 KB 9999.0 36.0 936.0
mean 492.1 KB 9999.0 36.0 936.0
min 492.1 KB 9999.0 36.0 936.0
max 492.1 KB 9999.0 36.0 936.0
stddev 0 B 0.0 0.0 0.0
total 492.1 KB 9999.0 36.0 936.0

Block IUCR Primary_Type Description Location_Description Arrest Domestic Beat District Ward Community_Area FBI_Code X_Coordinate Y_Coordinate Year Updated_On Latitude Longitude Location Day Month WeekNum WeekDay HourOfDay Weekend Season ------- -------------------- ------------- --------------- ---------------------------- ---------------------- -------------- -------------- ------------- ------------- ------------- ---------------- ------------- -------------- -------------- ------ ---------------------- --------------- --------------- ----------------------------- ------------- -------------- -------------- --------- ------------- -------------- -------- type enum int enum enum enum enum enum int int int int int int int int enum real real enum int int int enum int int enum mins 0.0 110.0 0.0 0.0 0.0 0.0 0.0 111.0 1.0 1.0 1.0 2.0 1100317.0 1814255.0 2015.0 0.0 41.64507243 -87.906463888 0.0 1.0 1.0 4.0 0.0 0.0 0.0 0.0 mean NaN 1189.67651357 NaN NaN NaN 0.292829282928 0.152315231523 1159.61806181 11.3489885128 22.9540954095 37.4476447645 12.7401236227 1163880.59815 1885916.14984 2015.0 NaN 41.8425652247 -87.6741405221 NaN 17.6839683968 1.41944194419 5.18081808181 NaN 13.6319631963 0.159115911591 NaN maxs 6517.0 5131.0 26.0 198.0 90.0 1.0 1.0 2535.0 25.0 50.0 77.0 26.0 1205069.0 1951533.0 2015.0 32.0 42.022646183 -87.524773286 8603.0 31.0 2.0 6.0 6.0 23.0 1.0 0.0 sigma NaN 927.751435583 NaN NaN NaN 0.455083515588 0.35934414686 695.76029875 6.94547493301 13.6495661144 21.2748762223 7.57423857911 16496.4493681 31274.0163199 0.0 NaN 0.0860186579359 0.0600357970653 NaN 11.1801043358 0.493492406787 0.738929830409 NaN 6.47321735807 0.365802434041 NaN zeros 3 0 11 933 19 7071 8476 0 0 0 0 0 0 0 0 603 0 0 1 0 0 0 1038 374 8408 9999 missing 0 419 0 0 6 0 0 0 162 0 0 2557 162 162 0 0 162 162 162 0 0 0 0 0 0 0 0 081XX S COLES AVE 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESS STREET true false 422.0 4.0 7.0 46.0 18.0 1198273.0 1851626.0 2015.0 02/15/2015 12:43:39 PM 41.747693646 -87.549035389 (41.747693646, -87.549035389) 8.0 2.0 6.0 Sun 23.0 1.0 Winter 1 118XX S STATE ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT true true 522.0 5.0 34.0 53.0 nan 1178335.0 1826581.0 2015.0 02/15/2015 12:43:39 PM 41.679442289 -87.622850758 (41.679442289, -87.622850758) 8.0 2.0 6.0 Sun 23.0 1.0 Winter 2 002XX S LARAMIE AVE 2026.0 NARCOTICS POSS: PCP SIDEWALK true false 1522.0 15.0 29.0 25.0 18.0 1141717.0 1898581.0 2015.0 02/15/2015 12:43:39 PM 41.87777333 -87.755117993 (41.87777333, -87.755117993) 8.0 2.0 6.0 Sun 23.0 1.0 Winter 3 006XX E 67TH ST 1811.0 NARCOTICS POSS: CANNABIS 30GMS OR LESS STREET true false 321.0 nan 6.0 42.0 18.0 nan nan 2015.0 02/15/2015 12:43:39 PM nan nan 8.0 2.0 6.0 Sun 23.0 1.0 Winter 4 0000X S MAYFIELD AVE 610.0 BURGLARY FORCIBLE ENTRY APARTMENT false false 1513.0 15.0 29.0 25.0 5.0 1137239.0 1899372.0 2015.0 02/15/2015 12:43:39 PM 41.880025548 -87.771541324 (41.880025548, -87.771541324) 8.0 2.0 6.0 Sun 23.0 1.0 Winter 5 010XX W 48TH ST 486.0 BATTERY DOMESTIC BATTERY SIMPLE APARTMENT false true 933.0 9.0 3.0 61.0 nan 1169986.0 1873019.0 2015.0 02/15/2015 12:43:39 PM 41.807059405 -87.65206589 (41.807059405, -87.65206589) 8.0 2.0 6.0 Sun 23.0 1.0 Winter 6 015XX W ARTHUR AVE 1320.0 CRIMINAL DAMAGE TO VEHICLE STREET false false 2432.0 24.0 40.0 1.0 14.0 1164732.0 1943222.0 2015.0 02/15/2015 12:43:39 PM 41.999814056 -87.669342967 (41.999814056, -87.669342967) 8.0 2.0 6.0 Sun 23.0 1.0 Winter 7 060XX W GRAND AVE 110.0 HOMICIDE FIRST DEGREE MURDER STREET true false 2512.0 25.0 37.0 19.0 nan 1135910.0 1914206.0 2015.0 02/15/2015 12:43:39 PM 41.920755683 -87.776067514 (41.920755683, -87.776067514) 8.0 2.0 6.0 Sun 23.0 1.0 Winter 8 001XX W WACKER DR 460.0 BATTERY SIMPLE OTHER false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 2015.0 02/15/2015 12:43:39 PM 41.886707818 -87.631396356 (41.886707818, -87.631396356) 8.0 2.0 6.0 Sun 23.0 1.0 Winter 9 001XX W WACKER DR 460.0 BATTERY SIMPLE HOTEL/MOTEL false false 122.0 1.0 42.0 32.0 nan 1175384.0 1902088.0 2015.0 02/15/2015 12:43:39 PM 41.886707818 -87.631396356 (41.886707818, -87.631396356) 8.0 2.0 6.0 Sun 23.0 1.0 Winter

In [7]:
# Expose H2O frames as Spark DataFrame

df_weather = hc.as_spark_frame(f_weather)
df_census = hc.as_spark_frame(f_census)
df_crimes = hc.as_spark_frame(f_crimes)

In [8]:
df_weather.show()


+-----+---+----+-------+--------+-------+
|month|day|year|maxTemp|meanTemp|minTemp|
+-----+---+----+-------+--------+-------+
|    1|  1|2001|     23|      14|      6|
|    1|  2|2001|     18|      12|      6|
|    1|  3|2001|     28|      18|      8|
|    1|  4|2001|     30|      24|     19|
|    1|  5|2001|     36|      30|     21|
|    1|  6|2001|     33|      26|     19|
|    1|  7|2001|     34|      28|     21|
|    1|  8|2001|     26|      20|     14|
|    1|  9|2001|     23|      16|     10|
|    1| 10|2001|     34|      26|     19|
|    1| 11|2001|     39|      28|     18|
|    1| 12|2001|     37|      31|     25|
|    1| 13|2001|     35|      34|     33|
|    1| 14|2001|     36|      34|     32|
|    1| 15|2001|     35|      32|     30|
|    1| 16|2001|     30|      28|     26|
|    1| 17|2001|     26|      22|     19|
|    1| 18|2001|     30|      24|     19|
|    1| 19|2001|     27|      22|     17|
|    1| 20|2001|     24|      18|     10|
+-----+---+----+-------+--------+-------+
only showing top 20 rows


In [9]:
# Use Spark SQL to join datasets

## Register DataFrames as tables in SQL context
sqlContext.registerDataFrameAsTable(df_weather, "chicagoWeather")
sqlContext.registerDataFrameAsTable(df_census, "chicagoCensus")
sqlContext.registerDataFrameAsTable(df_crimes, "chicagoCrime")


crimeWithWeather = sqlContext.sql("""SELECT
a.Year, a.Month, a.Day, a.WeekNum, a.HourOfDay, a.Weekend, a.Season, a.WeekDay,
a.IUCR, a.Primary_Type, a.Location_Description, a.Community_Area, a.District,
a.Arrest, a.Domestic, a.Beat, a.Ward, a.FBI_Code,
b.minTemp, b.maxTemp, b.meanTemp,
c.PERCENT_AGED_UNDER_18_OR_OVER_64, c.PER_CAPITA_INCOME, c.HARDSHIP_INDEX,
c.PERCENT_OF_HOUSING_CROWDED, c.PERCENT_HOUSEHOLDS_BELOW_POVERTY,
c.PERCENT_AGED_16__UNEMPLOYED, c.PERCENT_AGED_25__WITHOUT_HIGH_SCHOOL_DIPLOMA
FROM chicagoCrime a
JOIN chicagoWeather b
ON a.Year = b.year AND a.Month = b.month AND a.Day = b.day
JOIN chicagoCensus c
ON a.Community_Area = c.Community_Area_Number""")

In [10]:
crimeWithWeather.show()


+----+-----+---+-------+---------+-------+------+-------+----+--------------------+--------------------+--------------+--------+------+--------+----+----+--------+-------+-------+--------+--------------------------------+-----------------+--------------+--------------------------+--------------------------------+---------------------------+--------------------------------------------+
|Year|Month|Day|WeekNum|HourOfDay|Weekend|Season|WeekDay|IUCR|        Primary_Type|Location_Description|Community_Area|District|Arrest|Domestic|Beat|Ward|FBI_Code|minTemp|maxTemp|meanTemp|PERCENT_AGED_UNDER_18_OR_OVER_64|PER_CAPITA_INCOME|HARDSHIP_INDEX|PERCENT_OF_HOUSING_CROWDED|PERCENT_HOUSEHOLDS_BELOW_POVERTY|PERCENT_AGED_16__UNEMPLOYED|PERCENT_AGED_25__WITHOUT_HIGH_SCHOOL_DIPLOMA|
+----+-----+---+-------+---------+-------+------+-------+----+--------------------+--------------------+--------------+--------+------+--------+----+----+--------+-------+-------+--------+--------------------------------+-----------------+--------------+--------------------------+--------------------------------+---------------------------+--------------------------------------------+
|2015|    1| 23|      4|       22|      0|Winter|    Fri|null|   WEAPONS VIOLATION|               ALLEY|            31|      12|  true|   false|1234|  25|      15|     29|     31|      30|                            32.6|            16444|            76|         9.600000000000001|                            25.8|                       15.8|                                        40.7|
|2015|    1| 23|      4|       19|      0|Winter|    Fri|4625|       OTHER OFFENSE|            SIDEWALK|            31|      10|  true|   false|1034|  25|      26|     29|     31|      30|                            32.6|            16444|            76|         9.600000000000001|                            25.8|                       15.8|                                        40.7|
|2015|    1| 23|      4|       19|      0|Winter|    Fri| 320|             ROBBERY|  SMALL RETAIL STORE|            31|      10| false|   false|1034|  25|       3|     29|     31|      30|                            32.6|            16444|            76|         9.600000000000001|                            25.8|                       15.8|                                        40.7|
|2015|    1| 23|      4|       18|      0|Winter|    Fri|1310|     CRIMINAL DAMAGE|          RESTAURANT|            31|      12| false|   false|1235|  25|      14|     29|     31|      30|                            32.6|            16444|            76|         9.600000000000001|                            25.8|                       15.8|                                        40.7|
|2015|    1| 23|      4|       18|      0|Winter|    Fri| 610|            BURGLARY|           RESIDENCE|            31|      12| false|   false|1234|  25|       5|     29|     31|      30|                            32.6|            16444|            76|         9.600000000000001|                            25.8|                       15.8|                                        40.7|
|2015|    1| 23|      4|       16|      0|Winter|    Fri|2210|LIQUOR LAW VIOLATION| TAVERN/LIQUOR STORE|            31|      12|  true|   false|1233|  25|      22|     29|     31|      30|                            32.6|            16444|            76|         9.600000000000001|                            25.8|                       15.8|                                        40.7|
|2015|    1| 23|      4|       16|      0|Winter|    Fri| 470|PUBLIC PEACE VIOL...|             CTA BUS|            31|      10|  true|   false|1034|  25|      24|     29|     31|      30|                            32.6|            16444|            76|         9.600000000000001|                            25.8|                       15.8|                                        40.7|
|2015|    1| 23|      4|       15|      0|Winter|    Fri|1305|     CRIMINAL DAMAGE|           RESIDENCE|            31|      12| false|   false|1235|  25|      14|     29|     31|      30|                            32.6|            16444|            76|         9.600000000000001|                            25.8|                       15.8|                                        40.7|
|2015|    1| 23|      4|       13|      0|Winter|    Fri| 486|             BATTERY|           RESIDENCE|            31|      10| false|    true|1034|  25|    null|     29|     31|      30|                            32.6|            16444|            76|         9.600000000000001|                            25.8|                       15.8|                                        40.7|
|2015|    1| 23|      4|       13|      0|Winter|    Fri| 820|               THEFT|              STREET|            31|      12| false|   false|1234|  25|       6|     29|     31|      30|                            32.6|            16444|            76|         9.600000000000001|                            25.8|                       15.8|                                        40.7|
|2015|    1| 23|      4|       12|      0|Winter|    Fri| 915| MOTOR VEHICLE THEFT|  VEHICLE-COMMERCIAL|            31|       9| false|   false| 912|  11|       7|     29|     31|      30|                            32.6|            16444|            76|         9.600000000000001|                            25.8|                       15.8|                                        40.7|
|2015|    1| 23|      4|       12|      0|Winter|    Fri| 870|               THEFT|           CTA TRAIN|            31|      12| false|   false|1234|  25|       6|     29|     31|      30|                            32.6|            16444|            76|         9.600000000000001|                            25.8|                       15.8|                                        40.7|
|2015|    1| 23|      4|       10|      0|Winter|    Fri| 486|             BATTERY|           APARTMENT|            31|      12|  true|   false|1234|  25|    null|     29|     31|      30|                            32.6|            16444|            76|         9.600000000000001|                            25.8|                       15.8|                                        40.7|
|2015|    1| 24|      4|       22|      0|Winter|    Sat|2820|       OTHER OFFENSE|           APARTMENT|            31|      10| false|   false|1034|  25|      26|     29|     43|      36|                            32.6|            16444|            76|         9.600000000000001|                            25.8|                       15.8|                                        40.7|
|2015|    1| 24|      4|       21|      0|Winter|    Sat|1310|     CRIMINAL DAMAGE|          RESTAURANT|            31|      12|  true|   false|1233|  25|      14|     29|     43|      36|                            32.6|            16444|            76|         9.600000000000001|                            25.8|                       15.8|                                        40.7|
|2015|    1| 24|      4|       18|      0|Winter|    Sat|1750|OFFENSE INVOLVING...|           RESIDENCE|            31|      12| false|    true|1235|  25|      20|     29|     43|      36|                            32.6|            16444|            76|         9.600000000000001|                            25.8|                       15.8|                                        40.7|
|2015|    1| 24|      4|       18|      0|Winter|    Sat| 460|             BATTERY|               OTHER|            31|      10| false|   false|1023|  25|    null|     29|     43|      36|                            32.6|            16444|            76|         9.600000000000001|                            25.8|                       15.8|                                        40.7|
|2015|    1| 24|      4|       13|      0|Winter|    Sat| 890|               THEFT|   CURRENCY EXCHANGE|            31|      10| false|   false|1023|  25|       6|     29|     43|      36|                            32.6|            16444|            76|         9.600000000000001|                            25.8|                       15.8|                                        40.7|
|2015|    1| 24|      4|        9|      0|Winter|    Sat| 560|             ASSAULT|               OTHER|            31|      12| false|   false|1234|  25|    null|     29|     43|      36|                            32.6|            16444|            76|         9.600000000000001|                            25.8|                       15.8|                                        40.7|
|2015|    1| 24|      4|        8|      0|Winter|    Sat| 486|             BATTERY|           RESIDENCE|            31|      12|  true|    true|1235|  25|    null|     29|     43|      36|                            32.6|            16444|            76|         9.600000000000001|                            25.8|                       15.8|                                        40.7|
+----+-----+---+-------+---------+-------+------+-------+----+--------------------+--------------------+--------------+--------+------+--------+----+----+--------+-------+-------+--------+--------------------------------+-----------------+--------------+--------------------------+--------------------------------+---------------------------+--------------------------------------------+
only showing top 20 rows


In [11]:
# Publish Spark DataFrame as H2OFrame with given name
crimeWithWeatherHF = hc.as_h2o_frame(crimeWithWeather, "crimeWithWeatherTable")

In [12]:
# Transform selected String columns to categoricals
crimeWithWeatherHF["Arrest"] = crimeWithWeatherHF["Arrest"].asfactor()
crimeWithWeatherHF["Season"] = crimeWithWeatherHF["Season"].asfactor()
crimeWithWeatherHF["WeekDay"] = crimeWithWeatherHF["WeekDay"].asfactor()
crimeWithWeatherHF["Primary_Type"] = crimeWithWeatherHF["Primary_Type"].asfactor()
crimeWithWeatherHF["Location_Description"] = crimeWithWeatherHF["Location_Description"].asfactor()
crimeWithWeatherHF["Domestic"] = crimeWithWeatherHF["Domestic"].asfactor()

In [13]:
# Split final data table
ratios = [0.8]
frs = crimeWithWeatherHF.split_frame(ratios)
train = frs[0]
test = frs[1]

In [14]:
h2o.gbm?

In [15]:
gbm_model = h2o.gbm(x      = train.drop("Arrest"),
              y            = train["Arrest"],
              validation_x = test.drop("Arrest"),
              validation_y = test["Arrest"],
              ntrees       = 50,
              max_depth    = 3,
              learn_rate   = 0.1,
              distribution = "bernoulli")


gbm Model Build Progress: [##################################################] 100%
/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:8: DeprecationWarning: `h2o.gbm` is deprecated. Use the estimators sub module to build an H2OGradientBoostedEstimator.

In [16]:
gbm_model.model_performance(test)


ModelMetricsBinomial: gbm
** Reported on test data. **

MSE: 0.0886840850472
R^2: 0.580772634503
LogLoss: 0.297152918244
AUC: 0.915804744837
Gini: 0.831609489674

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.373316650621:
false true Error Rate
false 1347.0 48.0 0.0344 (48.0/1395.0)
true 177.0 432.0 0.2906 (177.0/609.0)
Total 1524.0 480.0 0.1123 (225.0/2004.0)
Maximum Metrics: Maximum metrics at their respective thresholds

metric threshold value idx
max f1 0.4 0.8 145.0
max f2 0.2 0.8 265.0
max f0point5 0.7 0.9 99.0
max accuracy 0.6 0.9 110.0
max precision 1.0 1.0 0.0
max absolute_MCC 0.6 0.7 110.0
max min_per_class_accuracy 0.2 0.8 221.0
Out[16]:


In [17]:
dl_model = h2o.deeplearning(
              x            = train.drop("Arrest"),
              y            = train["Arrest"],
              validation_x = test.drop("Arrest"),
              validation_y = test["Arrest"])


deeplearning Model Build Progress: [##################################################] 100%
/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:5: DeprecationWarning: `h2o.deeplearning` is deprecated. Use the estimators sub module to build an H2ODeepLearningEstimator.

In [18]:
dl_model.model_performance(test)


ModelMetricsBinomial: deeplearning
** Reported on test data. **

MSE: 0.104848808803
R^2: 0.504358759898
LogLoss: 0.345785338505
AUC: 0.903655443144
Gini: 0.807310886288

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.664046009238:
false true Error Rate
false 1329.0 66.0 0.0473 (66.0/1395.0)
true 179.0 430.0 0.2939 (179.0/609.0)
Total 1508.0 496.0 0.1223 (245.0/2004.0)
Maximum Metrics: Maximum metrics at their respective thresholds

metric threshold value idx
max f1 0.7 0.8 94.0
max f2 0.2 0.8 251.0
max f0point5 0.9 0.9 35.0
max accuracy 0.7 0.9 82.0
max precision 1.0 1.0 0.0
max absolute_MCC 0.8 0.7 56.0
max min_per_class_accuracy 0.3 0.8 195.0
Out[18]:


In [19]:
# create Crime class
from datetime import datetime
from pytz import timezone

def get_season(dt):
    if (dt >= 3 and dt <= 5):
        return "Spring"
    elif (dt >= 6 and dt <= 8):
        return "Summer"
    elif (dt >= 9 and dt <= 10):
        return "Autumn"
    else:       
        return "Winter"

def crime(date,
        iucr,
        primaryType,
        locationDescr,
        domestic,
        beat,
        district,
        ward,
        communityArea,
        fbiCode,
        minTemp = 77777,
        maxTemp = 77777,
        meanTemp = 77777,
        datePattern = "%d/%m/%Y %I:%M:%S %p",
        dateTimeZone = "Etc/UTC"):

    dt = datetime.strptime("02/08/2015 11:43:58 PM",'%d/%m/%Y %I:%M:%S %p')
    dt.replace(tzinfo=timezone("Etc/UTC"))

    crime = {}
    crime["Year"] = dt.year
    crime["Month"] = dt.month
    crime["Day"] = dt.day
    crime["WeekNum"] = dt.isocalendar()[1]
    crime["HourOfDay"] = dt.hour
    crime["Weekend"] = 1 if dt.weekday() == 5 or dt.weekday() == 6 else 0
    crime["Season"] = get_season(dt.month)
    crime["WeekDay"] = dt.strftime('%a')  #gets the day of week in short format - Mon, Tue ...
    crime["IUCR"] = iucr
    crime["Primary_Type"] = primaryType
    crime["Location_Description"] = locationDescr
    crime["Domestic"] = True if domestic else False
    crime["Beat"] = beat
    crime["District"] = district
    crime["Ward"] = ward
    crime["Community_Area"] = communityArea
    crime["FBI_Code"] = fbiCode
    crime["minTemp"] = minTemp
    crime["maxTemp"] = maxTemp
    crime["meanTemp"] = meanTemp
    return crime

In [20]:
# Create crimes examples
crime_examples = [
  crime("02/08/2015 11:43:58 PM", 1811, "NARCOTICS", "STREET",False, 422, 4, 7, 46, 18),
  crime("02/08/2015 11:00:39 PM", 1150, "DECEPTIVE PRACTICE", "RESIDENCE",False, 923, 9, 14, 63, 11)]

In [22]:
#For given crime and model returns probability of crime.
from h2o.connection import H2OConnection
def score_event(crime, model, censusTable):
    srdd = sqlContext.createDataFrame([crime])
    # Join table with census data
    df_row = censusTable.join(srdd).where("Community_Area = Community_Area_Number")  
    row = hc.as_h2o_frame(df_row)
    row["Season"] = row["Season"].asfactor()
    row["WeekDay"] = row["WeekDay"].asfactor()
    row["Primary_Type"] = row["Primary_Type"].asfactor()
    row["Location_Description"] = row["Location_Description"].asfactor()
    row["Domestic"] = row["Domestic"].asfactor()

    predictTable = model.predict(row)
    #FIXME: for glmModel table predictTable does not contain column "true"
    probOfArrest = predictTable["true"][0,0]
    return probOfArrest

for crime in crime_examples:
    arrestProbGLM = 100*score_event(crime, gbm_model, df_census)
    arrestProbGBM = 100*score_event(crime, dl_model, df_census)

    print("""
       |Crime: """+str(crime)+"""
       |  Probability of arrest best on DeepLearning: """+str(arrestProbGLM)+"""
       |  Probability of arrest best on GBM: """+str(arrestProbGBM)+"""
        """)


       |Crime: {'meanTemp': 77777, 'maxTemp': 77777, 'Primary_Type': 'NARCOTICS', 'District': 4, 'minTemp': 77777, 'Beat': 422, 'Season': 'Summer', 'Community_Area': 46, 'Domestic': False, 'IUCR': 1811, 'Month': 8, 'Location_Description': 'STREET', 'WeekNum': 31, 'WeekDay': 'Sun', 'Year': 2015, 'HourOfDay': 23, 'Ward': 7, 'FBI_Code': 18, 'Weekend': 1, 'Day': 2}
       |  Probability of arrest best on DeepLearning: 96.2809743835
       |  Probability of arrest best on GBM: 4.59228476779e-169
        

       |Crime: {'meanTemp': 77777, 'maxTemp': 77777, 'Primary_Type': 'DECEPTIVE PRACTICE', 'District': 9, 'minTemp': 77777, 'Beat': 923, 'Season': 'Summer', 'Community_Area': 63, 'Domestic': False, 'IUCR': 1150, 'Month': 8, 'Location_Description': 'RESIDENCE', 'WeekNum': 31, 'WeekDay': 'Sun', 'Year': 2015, 'HourOfDay': 23, 'Ward': 14, 'FBI_Code': 11, 'Weekend': 1, 'Day': 2}
       |  Probability of arrest best on DeepLearning: 6.33938940776
       |  Probability of arrest best on GBM: 1.45404780481e-169
        

In [ ]: