In [1]:
from pysparkling import *
hc = H2OContext(sc).start()
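# Optional sanity check (not in the original flow): printing the context
# typically reports the underlying H2O cluster endpoint.
print(hc)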
In [2]:
# Import cluster-local files (FIXME: we need to add files via Spark's file-distribution mechanism)
import h2o
#from h2o.h2o import _locate # private function. used to find files within h2o project directory.
def _locate(s):
    return "../../../" + s
f_weather = h2o.import_file(_locate("examples/smalldata/chicagoAllWeather.csv"))
f_census = h2o.import_file(_locate("examples/smalldata/chicagoCensus.csv"))
f_crimes = h2o.import_file(_locate("examples/smalldata/chicagoCrimes10k.csv"))
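# Optional sanity check of the loaded frames; a sketch assuming an h2o-py
# version where nrow/ncol are frame properties (older releases expose them
# as methods).
print(f_weather.nrow, f_weather.ncol)
print(f_census.nrow, f_census.ncol)
print(f_crimes.nrow, f_crimes.ncol)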
In [3]:
f_weather.show()
In [4]:
# Transform weather table
## Remove 1st column (date)
f_weather = f_weather[1:]
In [5]:
# Transform census table
## Remove all spaces from column names (causing problems in Spark SQL)
col_names = list(map(lambda s: s.strip().replace(' ', '_'), f_census.col_names))
## Update column names in the table
f_census.names = col_names
In [6]:
# Transform crimes table
## Drop useless columns
f_crimes = f_crimes[2:]
## Replace ' ' by '_' in column names
col_names = list(map(lambda s: s.replace(' ', '_'), f_crimes.col_names))
f_crimes.names = col_names
## Refine date column
def refine_date_col(data, col, pattern):
    data[col] = data[col].as_date(pattern)
    data["Day"] = data[col].day()
    data["Month"] = data[col].month()
    data["Year"] = data[col].year()
    data["WeekNum"] = data[col].week()
    data["WeekDay"] = data[col].dayOfWeek()
    data["HourOfDay"] = data[col].hour()
    data.describe()  # HACK: force evaluation before ifelse and cut. See PUBDEV-1425.
    # Create weekend and season columns
    # Spring = Mar-May, Summer = Jun-Aug, Autumn = Sep-Oct, Winter = Nov-Feb.
    # data["Weekend"] = [1 if x in ("Sun", "Sat") else 0 for x in data["WeekDay"]]
    # Use elementwise | here; Python's `or` does not broadcast over H2O frames.
    data["Weekend"] = ((data["WeekDay"] == "Sun") | (data["WeekDay"] == "Sat")).ifelse(1, 0)[0]
    # Month bins are right-closed: (0,2]=Winter, (2,5]=Spring, (5,8]=Summer, (8,10]=Autumn, (10,12]=Winter
    data["Season"] = data["Month"].cut([0, 2, 5, 8, 10, 12], ["Winter", "Spring", "Summer", "Autumn", "Winter"])
refine_date_col(f_crimes, "Date", "%m/%d/%Y %I:%M:%S %p")
f_crimes = f_crimes.drop("Date")
f_crimes.describe()
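# Plain-Python illustration (independent of H2O) of how cut() bins the month
# numbers above: a month m in the right-closed interval (breaks[i], breaks[i+1]]
# gets labels[i].
import bisect
breaks = [0, 2, 5, 8, 10, 12]
labels = ["Winter", "Spring", "Summer", "Autumn", "Winter"]
for m in (1, 4, 7, 9, 11):
    print(m, labels[bisect.bisect_left(breaks, m) - 1])  # 1 Winter, 4 Spring, ...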
In [7]:
# Expose H2O frames as Spark DataFrame
df_weather = hc.as_spark_frame(f_weather)
df_census = hc.as_spark_frame(f_census)
df_crimes = hc.as_spark_frame(f_crimes)
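# printSchema() is a cheap way to confirm the conversion preserved the column
# names and inferred sensible Spark types.
df_crimes.printSchema()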
In [8]:
df_weather.show()
In [9]:
# Use Spark SQL to join datasets
## Register DataFrames as tables in SQL context
sqlContext.registerDataFrameAsTable(df_weather, "chicagoWeather")
sqlContext.registerDataFrameAsTable(df_census, "chicagoCensus")
sqlContext.registerDataFrameAsTable(df_crimes, "chicagoCrime")
crimeWithWeather = sqlContext.sql("""SELECT
a.Year, a.Month, a.Day, a.WeekNum, a.HourOfDay, a.Weekend, a.Season, a.WeekDay,
a.IUCR, a.Primary_Type, a.Location_Description, a.Community_Area, a.District,
a.Arrest, a.Domestic, a.Beat, a.Ward, a.FBI_Code,
b.minTemp, b.maxTemp, b.meanTemp,
c.PERCENT_AGED_UNDER_18_OR_OVER_64, c.PER_CAPITA_INCOME, c.HARDSHIP_INDEX,
c.PERCENT_OF_HOUSING_CROWDED, c.PERCENT_HOUSEHOLDS_BELOW_POVERTY,
c.PERCENT_AGED_16__UNEMPLOYED, c.PERCENT_AGED_25__WITHOUT_HIGH_SCHOOL_DIPLOMA
FROM chicagoCrime a
JOIN chicagoWeather b
ON a.Year = b.year AND a.Month = b.month AND a.Day = b.day
JOIN chicagoCensus c
ON a.Community_Area = c.Community_Area_Number""")
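# The same join can be written with the DataFrame API instead of SQL; a minimal
# sketch (hypothetical name, column projection omitted for brevity):
crimeWithWeatherDF = (df_crimes
    .join(df_weather, (df_crimes.Year == df_weather.year) &
                      (df_crimes.Month == df_weather.month) &
                      (df_crimes.Day == df_weather.day))
    .join(df_census, df_crimes.Community_Area == df_census.Community_Area_Number))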
In [10]:
crimeWithWeather.show()
In [11]:
# Publish Spark DataFrame as H2OFrame with given name
crimeWithWeatherHF = hc.as_h2o_frame(crimeWithWeather, "crimeWithWeatherTable")
In [12]:
# Transform selected String columns to categoricals
crimeWithWeatherHF["Arrest"] = crimeWithWeatherHF["Arrest"].asfactor()
crimeWithWeatherHF["Season"] = crimeWithWeatherHF["Season"].asfactor()
crimeWithWeatherHF["WeekDay"] = crimeWithWeatherHF["WeekDay"].asfactor()
crimeWithWeatherHF["Primary_Type"] = crimeWithWeatherHF["Primary_Type"].asfactor()
crimeWithWeatherHF["Location_Description"] = crimeWithWeatherHF["Location_Description"].asfactor()
crimeWithWeatherHF["Domestic"] = crimeWithWeatherHF["Domestic"].asfactor()
In [13]:
# Split final data table
ratios = [0.8]
frs = crimeWithWeatherHF.split_frame(ratios)
train = frs[0]
test = frs[1]
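# split_frame samples rows at random, so train/test differ between runs; newer
# h2o-py versions accept a seed for reproducibility (a version assumption):
# frs = crimeWithWeatherHF.split_frame(ratios, seed=42)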
In [14]:
h2o.gbm?
In [15]:
gbm_model = h2o.gbm(x = train.drop("Arrest"),
y = train["Arrest"],
validation_x = test.drop("Arrest"),
validation_y = test["Arrest"],
ntrees = 50,
max_depth = 3,
learn_rate = 0.1,
distribution = "bernoulli")
In [16]:
gbm_model.model_performance(test)
Out[16]:
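# Individual metrics can be pulled from the performance object, e.g. AUC for
# this binomial model (assumes the h2o-py binomial metrics API):
perf = gbm_model.model_performance(test)
print(perf.auc())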
In [17]:
dl_model = h2o.deeplearning(
x = train.drop("Arrest"),
y = train["Arrest"],
validation_x = test.drop("Arrest"),
validation_y = test["Arrest"])
In [18]:
dl_model.model_performance(test)
Out[18]:
In [19]:
# create Crime class
from datetime import datetime
from pytz import timezone
def get_season(month):
    if 3 <= month <= 5:
        return "Spring"
    elif 6 <= month <= 8:
        return "Summer"
    elif 9 <= month <= 10:
        return "Autumn"
    else:
        return "Winter"
def crime(date,
          iucr,
          primaryType,
          locationDescr,
          domestic,
          beat,
          district,
          ward,
          communityArea,
          fbiCode,
          minTemp = 77777,
          maxTemp = 77777,
          meanTemp = 77777,
          datePattern = "%d/%m/%Y %I:%M:%S %p",
          dateTimeZone = "Etc/UTC"):
    dt = datetime.strptime(date, datePattern)
    dt = dt.replace(tzinfo=timezone(dateTimeZone))
    crime = {}
    crime["Year"] = dt.year
    crime["Month"] = dt.month
    crime["Day"] = dt.day
    crime["WeekNum"] = dt.isocalendar()[1]
    crime["HourOfDay"] = dt.hour
    crime["Weekend"] = 1 if dt.weekday() in (5, 6) else 0
    crime["Season"] = get_season(dt.month)
    crime["WeekDay"] = dt.strftime('%a')  # day of week in short form: Mon, Tue, ...
    crime["IUCR"] = iucr
    crime["Primary_Type"] = primaryType
    crime["Location_Description"] = locationDescr
    crime["Domestic"] = bool(domestic)
    crime["Beat"] = beat
    crime["District"] = district
    crime["Ward"] = ward
    crime["Community_Area"] = communityArea
    crime["FBI_Code"] = fbiCode
    crime["minTemp"] = minTemp
    crime["maxTemp"] = maxTemp
    crime["meanTemp"] = meanTemp
    return crime
In [20]:
# Create crimes examples
crime_examples = [
crime("02/08/2015 11:43:58 PM", 1811, "NARCOTICS", "STREET",False, 422, 4, 7, 46, 18),
crime("02/08/2015 11:00:39 PM", 1150, "DECEPTIVE PRACTICE", "RESIDENCE",False, 923, 9, 14, 63, 11)]
In [22]:
# For a given crime and model, return the predicted probability of arrest.
def score_event(crime, model, censusTable):
    srdd = sqlContext.createDataFrame([crime])
    # Join the single-event row with census data for its community area
    df_row = censusTable.join(srdd).where("Community_Area = Community_Area_Number")
    row = hc.as_h2o_frame(df_row)
    for col in ["Season", "WeekDay", "Primary_Type", "Location_Description", "Domestic"]:
        row[col] = row[col].asfactor()
    predictTable = model.predict(row)
    # FIXME: for glmModel the predictTable does not contain column "true"
    probOfArrest = predictTable["true"][0, 0]
    return probOfArrest
for crime_ex in crime_examples:
    arrestProbGBM = 100 * score_event(crime_ex, gbm_model, df_census)
    arrestProbDL = 100 * score_event(crime_ex, dl_model, df_census)
    print("""
    |Crime: """ + str(crime_ex) + """
    | Probability of arrest based on GBM: """ + str(arrestProbGBM) + """
    | Probability of arrest based on DeepLearning: """ + str(arrestProbDL) + """
    """)
In [ ]: