In [1]:
import h2o

In [2]:
# Connect to a cluster
h2o.init()


H2O cluster uptime: 17 seconds 548 milliseconds
H2O cluster version: 3.1.0.99999
H2O cluster name: anqi_fu
H2O cluster total nodes: 1
H2O cluster total memory: 1.78 GB
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster healthy: True
H2O Connection ip: 127.0.0.1
H2O Connection port: 54321
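
The bare h2o.init() call above attaches to the cluster at the default address. If the cluster runs on a different host or port, the same call accepts them explicitly; a minimal sketch, using the connection details reported in the summary above:

import h2o
# Attach to a cluster at an explicit address; these values simply mirror
# the H2O Connection ip/port printed in the summary above.
h2o.init(ip="127.0.0.1", port=54321)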

In [3]:
weather_path = h2o.locate("smalldata/chicago/chicagoAllWeather.csv")
census_path = h2o.locate("smalldata/chicago/chicagoCensus.csv")
crimes_path = h2o.locate("smalldata/chicago/chicagoCrimes10k.csv.zip")

print "Import and Parse weather data"
weather = h2o.import_frame(path=weather_path)
weather.drop("date")
weather.describe()

print "Import and Parse census data"
census = h2o.import_frame(path=census_path)
census.describe()

print "Import and Parse crimes data"
crimes = h2o.import_frame(path=crimes_path)
crimes.describe()


Import and Parse weather data

Parse Progress: [##################################################] 100%
Imported  /Users/anqi_fu/Documents/workspace/h2o-3/smalldata/chicago/chicagoAllWeather.csv . Parsed 5,162 rows and 7 cols
Rows: 5,162 Cols: 7

Chunk compression summary:

chunk_type  chunk_name                 count  count_percentage  size      size_percentage
C1N         1-Byte Integers (w/o NAs)  2      28.57143          10.2 KB   11.221008
C1S         1-Byte Fractions           4      57.14286          20.5 KB   22.510675
CStr        String                     1      14.285715         60.3 KB   66.26832

Frame distribution summary:

                   size      number_of_rows  number_of_chunks_per_column  number_of_chunks
172.16.2.17:54321  91.0 KB   5162.0          1.0                          7.0
mean               91.0 KB   5162.0          1.0                          7.0
min                91.0 KB   5162.0          1.0                          7.0
max                91.0 KB   5162.0          1.0                          7.0
stddev             0 B       0.0             0.0                          0.0
total              91.0 KB   5162.0          1.0                          7.0
Column-by-Column Summary:

date month day year maxTemp meanTemp minTemp
type string int int int int int int
mins NaN 1.0 1.0 2001.0 -2.0 -9.0 -18.0
maxs NaN 12.0 31.0 2015.0 103.0 93.0 82.0
sigma NaN 3.46905171694 8.79895173997 4.0773409057 21.4829777237 19.9302399266 19.0207297123
zero_count 0 0 0 0 0 2 16
missing_count 0 0 0 0 13 13 13
Import and Parse census data

Parse Progress: [##################################################] 100%
Imported  /Users/anqi_fu/Documents/workspace/h2o-3/smalldata/chicago/chicagoCensus.csv . Parsed 79 rows and 9 cols
Rows: 79 Cols: 9

Chunk compression summary:

chunk_type  chunk_name                 count  count_percentage  size      size_percentage
C1          1-Byte Integers            2      22.222223         294 B     9.312638
C1S         1-Byte Fractions           1      11.111112         163 B     5.1631293
C2S         2-Byte Fractions           4      44.444447         968 B     30.662022
C4          4-Byte Integers            1      11.111112         384 B     12.163446
CStr        String                     1      11.111112         1.3 KB    42.698765

Frame distribution summary:

                   size      number_of_rows  number_of_chunks_per_column  number_of_chunks
172.16.2.17:54321  3.1 KB    79.0            1.0                          9.0
mean               3.1 KB    79.0            1.0                          9.0
min                3.1 KB    79.0            1.0                          9.0
max                3.1 KB    79.0            1.0                          9.0
stddev             0 B       0.0             0.0                          0.0
total              3.1 KB    79.0            1.0                          9.0
Column-by-Column Summary:

Community Area Number COMMUNITY AREA NAME PERCENT OF HOUSING CROWDED PERCENT HOUSEHOLDS BELOW POVERTY PERCENT AGED 16 UNEMPLOYED PERCENT AGED 25 WITHOUT HIGH SCHOOL DIPLOMA PERCENT AGED UNDER 18 OR OVER 64 PER CAPITA INCOME HARDSHIP INDEX
type int string real real real real real int int
mins 1.0 NaN 0.3 3.3 4.7 2.5 13.5 8201.0 1.0
maxs 77.0 NaN 15.8 56.5 35.9 54.8 51.5 88669.0 98.0
sigma 22.3718573212 NaN 3.65898144135 11.457230913 7.49949670861 11.7465143511 7.28442108494 15196.4055413 28.6905556516
zero_count 0 0 0 0 0 0 0 0 0
missing_count 2 0 1 1 1 1 1 1 2
Import and Parse crimes data

Parse Progress: [##################################################] 100%
Imported  /Users/anqi_fu/Documents/workspace/h2o-3/smalldata/chicago/chicagoCrimes10k.csv.zip . Parsed 9,999 rows and 22 cols
Rows: 9,999 Cols: 22

Chunk compression summary:

chunk_type  chunk_name                 count  count_percentage  size      size_percentage
C0L         Constant Integers          4      4.5454545         320 B     0.03695244
C1          1-Byte Integers            32     36.363636         80.2 KB   9.488462
C1N         1-Byte Integers (w/o NAs)  8      9.090909          20.1 KB   2.3721156
C2          2-Byte Integers            16     18.181818         79.2 KB   9.362824
C4          4-Byte Integers            12     13.636364         118.0 KB  13.950008
CStr        String                     8      9.090909          391.1 KB  46.252445
C8D         64-bit Reals               8      9.090909          156.8 KB  18.537191

Frame distribution summary:

                   size      number_of_rows  number_of_chunks_per_column  number_of_chunks
172.16.2.17:54321  845.7 KB  9999.0          4.0                          88.0
mean               845.7 KB  9999.0          4.0                          88.0
min                845.7 KB  9999.0          4.0                          88.0
max                845.7 KB  9999.0          4.0                          88.0
stddev             0 B       0.0             0.0                          0.0
total              845.7 KB  9999.0          4.0                          88.0
Column-by-Column Summary:

ID Case Number Date Block IUCR Primary Type Description Location Description Arrest Domestic Beat District Ward Community Area FBI Code X Coordinate Y Coordinate Year Updated On Latitude Longitude Location
type int string string enum int enum enum enum enum enum int int int int int int int int enum real real enum
mins 21735.0 NaN NaN 0.0 110.0 0.0 0.0 0.0 0.0 0.0 111.0 1.0 1.0 1.0 2.0 1100317.0 1814255.0 2015.0 0.0 41.64507243 -87.906463888 0.0
maxs 9962898.0 NaN NaN 6517.0 5131.0 26.0 198.0 90.0 1.0 1.0 2535.0 25.0 50.0 77.0 26.0 1205069.0 1951533.0 2015.0 32.0 42.022646183 -87.524773286 8603.0
sigma 396787.564221 NaN NaN 1915.88517194 927.751435583 9.16241735944 60.1059382029 25.5963972463 0.455083515588 0.35934414686 695.76029875 6.94547493301 13.6495661144 21.2748762223 7.57423857911 16496.4493681 31274.0163199 0.0 10.0824464345 0.0860186579359 0.0600357970653 2469.64729385
zero_count 0 0 0 3 0 11 933 19 7071 8476 0 0 0 0 0 0 0 0 603 0 0 1
missing_count 0 0 0 0 419 0 0 6 0 0 0 162 0 0 2557 162 162 0 0 162 162 162
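
One detail worth flagging in the output above: the date column still appears in the weather summary, because drop() returns a new H2OFrame rather than removing the column in place, so the bare weather.drop("date") call is a no-op. The reassignment pattern used for crimes in the next cell works here too; a small sketch:

# drop() returns a new frame; reassign to actually remove the column
weather = weather.drop("date")
weather.describe()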

In [4]:
def refine_date_col(data, col, pattern):
    data[col]         = data[col].as_date(pattern)
    data["Day"]       = data[col].day()
    data["Month"]     = data[col].month() + 1     # Since H2O indexes from 0
    data["Year"]      = data[col].year() + 1900   # Start of epoch is 1900
    data["WeekNum"]   = data[col].week()
    data["WeekDay"]   = data[col].dayOfWeek()
    data["HourOfDay"] = data[col].hour()
    
    data.describe()  # HACK: Force evaluation before ifelse and cut. See PUBDEV-1425.
    
    # Create weekend and season cols
    # Spring = Mar-May, Summer = Jun-Aug, Autumn = Sep-Oct, Winter = Nov-Feb (note: the breaks below actually put Aug in Autumn)
    # data["Weekend"]   = [1 if x in ("Sun", "Sat") else 0 for x in data["WeekDay"]]
    data["Weekend"] = h2o.ifelse(data["WeekDay"] == "Sun" or data["WeekDay"] == "Sat", 1, 0)[0]
    data["Season"] = data["Month"].cut([0, 2, 5, 7, 10, 12], ["Winter", "Spring", "Summer", "Autumn", "Winter"])
    
refine_date_col(crimes, "Date", "%m/%d/%Y %I:%M:%S %p")
crimes = crimes.drop("Date")
crimes.describe()


Rows: 9,999 Cols: 27

Chunk compression summary:

chunk_type  chunk_name                 count  count_percentage  size      size_percentage
C0L         Constant Integers          9      8.333334          720 B     0.10067465
C1          1-Byte Integers            32     29.62963          80.2 KB   11.489216
C1N         1-Byte Integers (w/o NAs)  23     21.296297         57.9 KB   8.29671
C2          2-Byte Integers            16     14.814815         79.2 KB   11.337085
C4          4-Byte Integers            12     11.111112         118.0 KB  16.891531
C8          64-bit Integers            4      3.7037036         78.4 KB   11.222987
CStr        String                     4      3.7037036         127.2 KB  18.215822
C8D         64-bit Reals               8      7.4074073         156.8 KB  22.445974

Frame distribution summary:

                   size      number_of_rows  number_of_chunks_per_column  number_of_chunks
172.16.2.17:54321  698.4 KB  9999.0          4.0                          108.0
mean               698.4 KB  9999.0          4.0                          108.0
min                698.4 KB  9999.0          4.0                          108.0
max                698.4 KB  9999.0          4.0                          108.0
stddev             0 B       0.0             0.0                          0.0
total              698.4 KB  9999.0          4.0                          108.0
Column-by-Column Summary:

ID Case Number Date Block IUCR Primary Type Description Location Description Arrest Domestic Beat District Ward Community Area FBI Code X Coordinate Y Coordinate Year Updated On Latitude Longitude Location Day Month WeekNum WeekDay HourOfDay
type int string int enum int enum enum enum enum enum int int int int int int int int enum real real enum int int int enum int
mins 21735.0 NaN 1.42203063e+12 0.0 110.0 0.0 0.0 0.0 0.0 0.0 111.0 1.0 1.0 1.0 2.0 1100317.0 1814255.0 3915.0 0.0 41.64507243 -87.906463888 0.0 1.0 2.0 4.0 0.0 0.0
maxs 9962898.0 NaN 1.42346782e+12 6517.0 5131.0 26.0 198.0 90.0 1.0 1.0 2535.0 25.0 50.0 77.0 26.0 1205069.0 1951533.0 3915.0 32.0 42.022646183 -87.524773286 8603.0 31.0 3.0 6.0 6.0 23.0
sigma 396787.564221 NaN 433879245.188 1915.88517194 927.751435583 9.16241735944 60.1059382029 25.5963972463 0.455083515588 0.35934414686 695.76029875 6.94547493301 13.6495661144 21.2748762223 7.57423857911 16496.4493681 31274.0163199 0.0 10.0824464345 0.0860186579359 0.0600357970653 2469.64729385 11.1801043358 0.493492406787 0.738929830409 1.93284056432 6.47321735807
zero_count 0 0 0 3 0 11 933 19 7071 8476 0 0 0 0 0 0 0 0 603 0 0 1 0 0 0 1038 374
missing_count 0 0 0 0 419 0 0 6 0 0 0 162 0 0 2557 162 162 0 0 162 162 162 0 0 0 0 0
Rows: 9,999 Cols: 28

Chunk compression summary:

chunk_type  chunk_name                 count  count_percentage  size      size_percentage
C0L         Constant Integers          13     11.607142         1.0 KB    0.16332634
CBS         Bits                       4      3.5714288         1.5 KB    0.2404352
C1          1-Byte Integers            32     28.57143          80.2 KB   12.9040365
C1N         1-Byte Integers (w/o NAs)  23     20.535715         57.9 KB   9.318395
C2          2-Byte Integers            16     14.285715         79.2 KB   12.733171
C4          4-Byte Integers            12     10.714286         118.0 KB  18.97161
CStr        String                     4      3.5714288         127.2 KB  20.458979
C8D         64-bit Reals               8      7.1428576         156.8 KB  25.210047

Frame distribution summary:

                   size      number_of_rows  number_of_chunks_per_column  number_of_chunks
172.16.2.17:54321  621.8 KB  9999.0          4.0                          112.0
mean               621.8 KB  9999.0          4.0                          112.0
min                621.8 KB  9999.0          4.0                          112.0
max                621.8 KB  9999.0          4.0                          112.0
stddev             0 B       0.0             0.0                          0.0
total              621.8 KB  9999.0          4.0                          112.0
Column-by-Column Summary:

ID Case Number Block IUCR Primary Type Description Location Description Arrest Domestic Beat District Ward Community Area FBI Code X Coordinate Y Coordinate Year Updated On Latitude Longitude Location Day Month WeekNum WeekDay HourOfDay Weekend Season
type int string enum int enum enum enum enum enum int int int int int int int int enum real real enum int int int enum int int enum
mins 21735.0 NaN 0.0 110.0 0.0 0.0 0.0 0.0 0.0 111.0 1.0 1.0 1.0 2.0 1100317.0 1814255.0 3915.0 0.0 41.64507243 -87.906463888 0.0 1.0 2.0 4.0 0.0 0.0 0.0 0.0
maxs 9962898.0 NaN 6517.0 5131.0 26.0 198.0 90.0 1.0 1.0 2535.0 25.0 50.0 77.0 26.0 1205069.0 1951533.0 3915.0 32.0 42.022646183 -87.524773286 8603.0 31.0 3.0 6.0 6.0 23.0 1.0 1.0
sigma 396787.564221 NaN 1915.88517194 927.751435583 9.16241735944 60.1059382029 25.5963972463 0.455083515588 0.35934414686 695.76029875 6.94547493301 13.6495661144 21.2748762223 7.57423857911 16496.4493681 31274.0163199 0.0 10.0824464345 0.0860186579359 0.0600357970653 2469.64729385 11.1801043358 0.493492406787 0.738929830409 1.93284056432 6.47321735807 0.365802434041 0.493492406787
zero_count 0 0 3 0 11 933 19 7071 8476 0 0 0 0 0 0 0 0 603 0 0 1 0 0 0 1038 374 8408 5805
missing_count 0 0 0 419 0 0 6 0 0 0 162 0 0 2557 162 162 0 0 162 162 162 0 0 0 0 0 0 0

In [5]:
# Merge crimes data with weather and census
census["Community Area Number"]._name = "Community Area"
weather["month"]._name = "Month"
weather["day"]  ._name = "Day"
weather["year"] ._name = "Year"
crimes.merge(census, allLeft=True, allRite=False)
crimes.merge(weather, allLeft=True, allRite=False)


---------------------------------------------------------------------------
EnvironmentError                          Traceback (most recent call last)
<ipython-input-5-e946a6af6204> in <module>()
      4 weather["day"]  ._name = "Day"
      5 weather["year"] ._name = "Year"
----> 6 crimes.merge(census, allLeft=True, allRite=False)
      7 crimes.merge(weather, allLeft=True, allRite=False)

/Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/frame.pyc in merge(self, other, allLeft, allRite)
   1022     expr2 = "(, "+expr+" (del %"+lkey+" #0) (del %"+rkey+" #0) )"
   1023 
-> 1024     h2o.rapids(expr2)       # merge in h2o
   1025     # Make backing H2OVecs for the remote h2o vecs
   1026     j = h2o.frame(tmp_key)  # Fetch the frame as JSON

/Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/h2o.pyc in rapids(expr)
    487   :return: The JSON response of the Rapids execution
    488   """
--> 489   result = H2OConnection.post_json("Rapids", ast=urllib.quote(expr), _rest_version=99)
    490   if result['error'] is not None:
    491     raise EnvironmentError("rapids expression not evaluated: {0}".format(str(result['error'])))

/Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/connection.pyc in post_json(url_suffix, file_upload_info, **kwargs)
    360     if __H2OCONN__ is None:
    361       raise ValueError("No h2o connection. Did you run `h2o.init()` ?")
--> 362     return __H2OCONN__._rest_json(url_suffix, "POST", file_upload_info, **kwargs)
    363 
    364   def _rest_json(self, url_suffix, method, file_upload_info, **kwargs):

/Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/connection.pyc in _rest_json(self, url_suffix, method, file_upload_info, **kwargs)
    363 
    364   def _rest_json(self, url_suffix, method, file_upload_info, **kwargs):
--> 365     raw_txt = self._do_raw_rest(url_suffix, method, file_upload_info, **kwargs)
    366     return self._process_tables(raw_txt.json())
    367 

/Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/connection.pyc in _do_raw_rest(self, url_suffix, method, file_upload_info, **kwargs)
    429       raise EnvironmentError(("h2o-py got an unexpected HTTP status code:\n {} {} (method = {}; url = {}). \n"+ \
    430                               "detailed error messages: {}")
--> 431                              .format(http_result.status_code,http_result.reason,method,url,detailed_error_msgs))
    432 
    433     # TODO: is.logging? -> write to logs

EnvironmentError: h2o-py got an unexpected HTTP status code:
 412 Precondition Failed (method = POST; url = http://localhost:54321/99/Rapids). 
detailed error messages: water.DException$DistributedException: from /172.16.2.17:54321; by class water.rapids.ASTMerge$MergeSet$MakeHash; class water.exceptions.H2OIllegalArgumentException: unimplemented
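
The 412 from Rapids means this merge hits a code path that is not implemented in this build (the DistributedException above comes from ASTMerge), so the join never happens. Note also that merge(), like drop(), returns a new frame, and the modeling cells below refer to a frame named data, so the intent appears to be to keep the merge results and carry the joined frame forward. A sketch of that intent, assuming a build where the merge succeeds:

# Intended flow (sketch; requires a build where this merge is implemented):
# keep the results of merge(), which returns a new frame, and pass the fully
# joined frame to the modeling cells below under the name `data`.
crimes = crimes.merge(census,  allLeft=True, allRite=False)
crimes = crimes.merge(weather, allLeft=True, allRite=False)
data   = crimes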

In [12]:
# Create test/train split
data_split = h2o.split_frame(data, ratios = [0.8,0.2])
train = data_split[1]
test  = data_split[2]

# Simple GBM - Predict Arrest
data_gbm = h2o.gbm(x              =train.drop("Arrest"),
                   y              =train     ["Arrest"],
                   validation_x   =test .drop("Arrest"),
                   validation_y   =test      ["Arrest"],
                   ntrees         =10,
                   max_depth      =6,
                   distribution   ="bernoulli")

# Simple Deep Learning
data_dl = h2o.deeplearning(x                   =train.drop("Arrest"),
                           y                   =train     ["Arrest"],
                           validation_x        =test .drop("Arrest"),
                           validation_y        =test      ["Arrest"],
                           variable_importances=True,
                           loss                ="Automatic")


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-12-347776b381b3> in <module>()
      1 # Create test/train split
----> 2 data_split = h2o.split_frame(data, ratios = [0.8,0.2])
      3 train = data_split[1]
      4 test  = data_split[2]
      5 

NameError: name 'data' is not defined
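
This NameError is downstream of the failed merge in In [5]: data was never created. Separately, the splits come back as a Python list, so indexing starts at 0; the [1]/[2] indexing above assumes a particular return shape, and the exact behaviour of split_frame varies across H2O versions. A hedged sketch, assuming data holds the merged frame and that split_frame returns one frame per listed ratio:

# Sketch (assumes `data` is the merged crimes/census/weather frame and that
# h2o.split_frame returns one H2OFrame per listed ratio, indexed from 0).
data_split = h2o.split_frame(data, ratios=[0.8, 0.2])
train = data_split[0]   # 80% split
test  = data_split[1]   # 20% split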

In [2]:
# GBM performance on train/test data
train_auc_gbm = data_gbm.model_performance(train).auc()
test_auc_gbm  = data_gbm.model_performance(test) .auc()

# Deep Learning performance on train/test data
train_auc_dl = data_dl.model_performance(train).auc()
test_auc_dl  = data_dl.model_performance(test) .auc()

# Make a pretty HTML table printout of the results
header = ["Model", "AUC Train", "AUC Test"]
table  = [
           ["GBM", train_auc_gbm, test_auc_gbm],
           ["DL ", train_auc_dl,  test_auc_dl]
         ]
h2o.H2ODisplay(table, header)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-f7c2ab3a3e26> in <module>()
      1 # GBM performance on train/test data
----> 2 train_auc_gbm = data_gbm.model_performance(train).auc()
      3 test_auc_gbm  = data_gbm.model_performance(test) .auc()
      4 
      5 # Deep Learning performance on train/test data

NameError: name 'data_gbm' is not defined

In [6]:
# Create new H2OFrame of crime observations
examples = {
            "Date":                 ["02/08/2015 11:43:58 PM", "02/08/2015 11:00:39 PM"],
            "IUCR":                 [1811, 1150],
            "Primary.Type":         ["NARCOTICS", "DECEPTIVE PRACTICE"],
            "Location.Description": ["STREET", "RESIDENCE"],
            "Domestic":             ["false", "false"],
            "Beat":                 [422, 923],
            "District":             [4, 9],
            "Ward":                 [7, 14],
            "Community.Area":       [46, 63],
            "FBI.Code":             [18, 11]
            }

crime_examples = h2o.H2OFrame(python_obj = examples)

# Refine date column and merge with census data
refine_date_col(crime_examples, "Date", "%m/%d/%Y %I:%M:%S %p")
crime_examples.drop("Date")
crime_examples.merge(census, allLeft=True, allRite=False)


Parse Progress: [##################################################] 100%
Uploaded py634b18a9-7e84-40ca-b265-b2fe43e064aa into cluster with 2 rows and 10 cols
Rows: 2 Cols: 16

Chunk compression summary:

chunk_type  chunk_name                 count  count_percentage  size      size_percentage
C0L         Constant Integers          7      43.75             560 B     43.818466
C1N         1-Byte Integers (w/o NAs)  4      25.0              280 B     21.909233
C2          2-Byte Integers            2      12.5              144 B     11.267606
C2S         2-Byte Fractions           1      6.25              88 B      6.885759
CStr        String                     2      12.5              206 B     16.118937

Frame distribution summary:

                   size      number_of_rows  number_of_chunks_per_column  number_of_chunks
172.16.2.17:54321  1.2 KB    2.0             1.0                          16.0
mean               1.2 KB    2.0             1.0                          16.0
min                1.2 KB    2.0             1.0                          16.0
max                1.2 KB    2.0             1.0                          16.0
stddev             0 B       0.0             0.0                          0.0
total              1.2 KB    2.0             1.0                          16.0
Column-by-Column Summary:

Location.Description FBI.Code Primary.Type Community.Area District Beat Domestic IUCR Date Ward Day Month Year WeekNum WeekDay HourOfDay
type string int string int int int enum int int int int int int int enum int
mins NaN 11.0 NaN 46.0 4.0 422.0 0.0 1150.0 1.423465239e+12 7.0 8.0 3.0 3915.0 6.0 6.0 23.0
maxs NaN 18.0 NaN 63.0 9.0 923.0 0.0 1811.0 1.423467838e+12 14.0 8.0 3.0 3915.0 6.0 6.0 23.0
sigma NaN 4.94974746831 NaN 12.0208152802 3.53553390593 354.260497374 0.0 467.397582364 1837770.5243 4.94974746831 0.0 0.0 0.0 0.0 0.0 0.0
zero_count 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0
missing_count 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
---------------------------------------------------------------------------
EnvironmentError                          Traceback (most recent call last)
<ipython-input-6-85bb7c75c897> in <module>()
     16 
     17 # Refine date column and merge with census data
---> 18 refine_date_col(crime_examples, "Date", "%m/%d/%Y %I:%M:%S %p")
     19 crime_examples.drop("Date")
     20 crime_examples.merge(census, allLeft=True, allRite=False)

<ipython-input-4-c2702228f9f1> in refine_date_col(data, col, pattern)
     15     # data["Weekend"] = h2o.ifelse(data["WeekDay"] in ("Sun", "Sat"), 1, 0)[0]
     16     data["Weekend"] = h2o.ifelse(data["WeekDay"] == "Sun" or data["WeekDay"] == "Sat", 1, 0)[0]
---> 17     data["Season"] = data["Month"].cut([0, 2, 5, 7, 10, 12], ["Winter", "Spring", "Summer", "Autumn", "Winter"])
     18 
     19 refine_date_col(crimes, "Date", "%m/%d/%Y %I:%M:%S %p")

/Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/frame.pyc in cut(self, breaks, labels, include_lowest, right, dig_lab)
   1256 
   1257     expr = "(cut '{}' {} {} {} {} #{}".format(self.key(), breaks_list, labels_list, "%TRUE" if include_lowest else "%FALSE", "%TRUE" if right else "%FALSE", dig_lab)
-> 1258     res = h2o.rapids(expr)
   1259     return H2OVec(self._name, Expr(op=res["vec_ids"][0]["name"], length=res["num_rows"]))
   1260 

/Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/h2o.pyc in rapids(expr)
    487   :return: The JSON response of the Rapids execution
    488   """
--> 489   result = H2OConnection.post_json("Rapids", ast=urllib.quote(expr), _rest_version=99)
    490   if result['error'] is not None:
    491     raise EnvironmentError("rapids expression not evaluated: {0}".format(str(result['error'])))

/Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/connection.pyc in post_json(url_suffix, file_upload_info, **kwargs)
    360     if __H2OCONN__ is None:
    361       raise ValueError("No h2o connection. Did you run `h2o.init()` ?")
--> 362     return __H2OCONN__._rest_json(url_suffix, "POST", file_upload_info, **kwargs)
    363 
    364   def _rest_json(self, url_suffix, method, file_upload_info, **kwargs):

/Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/connection.pyc in _rest_json(self, url_suffix, method, file_upload_info, **kwargs)
    363 
    364   def _rest_json(self, url_suffix, method, file_upload_info, **kwargs):
--> 365     raw_txt = self._do_raw_rest(url_suffix, method, file_upload_info, **kwargs)
    366     return self._process_tables(raw_txt.json())
    367 

/Users/anqi_fu/Documents/workspace/h2o-3/h2o-py/h2o/connection.pyc in _do_raw_rest(self, url_suffix, method, file_upload_info, **kwargs)
    429       raise EnvironmentError(("h2o-py got an unexpected HTTP status code:\n {} {} (method = {}; url = {}). \n"+ \
    430                               "detailed error messages: {}")
--> 431                              .format(http_result.status_code,http_result.reason,method,url,detailed_error_msgs))
    432 
    433     # TODO: is.logging? -> write to logs

EnvironmentError: h2o-py got an unexpected HTTP status code:
 412 Precondition Failed (method = POST; url = http://localhost:54321/99/Rapids). 
detailed error messages: Data vector is constant!
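
This failure is unrelated to the earlier merge error: cut() rejects the Month column because it is constant for these two examples (both fall in the same month; the summary above shows Month mins and maxs of 3.0). One possible workaround, sketched under the assumption that elementwise comparisons and arithmetic on H2O columns behave like the month() + 1 expression in refine_date_col, is to bucket months with indicator sums instead of cut(), which has no problem with a constant column. The result is a numeric season code rather than a labelled enum:

# Hedged sketch: numeric season coding (0 = Winter, 1 = Spring, 2 = Summer, 3 = Autumn)
# built from indicator comparisons, so a constant Month column is fine.
# The bins mirror the cut() breaks in refine_date_col.
def season_from_month(data):
    m = data["Month"]
    data["Season"] = (m > 2) + (m > 5) + (m > 7) - 3 * (m > 10)

season_from_month(crime_examples)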

In [ ]:
# Predict probability of arrest from new observations
gbm_pred = data_gbm.predict(crime_examples)
dl_pred  = data_dl .predict(crime_examples)

# TODO: Replace with a pretty HTML table
gbm_pred.describe()
dl_pred.describe()