notebook.community

Edit and run



In [40]:

    
import h2o
import time
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator



In [41]:

    
# Explore a typical Data Science workflow with H2O and Python
#
# Goal: assist the manager of CitiBike of NYC to load-balance the bicycles
# across the CitiBike network of stations, by predicting the number of bike
# trips taken from the station every day.  Use 10 million rows of historical
# data, and eventually add weather data.


# Connect to a cluster.  In the run recorded below this reports a single-node
# cluster reachable at 127.0.0.1:54321.
h2o.init()









    



Warning: Version mismatch. H2O is version 3.5.0.99999, but the python package is version UNKNOWN.






    




H2O cluster uptime: 
9 minutes 59 seconds 442 milliseconds 
H2O cluster version: 
3.5.0.99999
H2O cluster name: 
ludirehak
H2O cluster total nodes: 
1
H2O cluster total memory: 
4.44 GB
H2O cluster total cores: 
8
H2O cluster allowed cores: 
8
H2O cluster healthy: 
True
H2O Connection ip: 
127.0.0.1
H2O Connection port: 
54321



In [42]:

    
from h2o.h2o import _locate # private function. used to find files within h2o git project directory.

# Flip to True to read the data straight from the public S3 bucket instead of
# the local h2o checkout (handy when the cluster itself runs in EC2).
data_source_is_s3 = False

def mylocate(s):
    """Turn a repository-relative data path into a path H2O can import.

    With ``data_source_is_s3`` unset the path is resolved through ``_locate``;
    otherwise it is prefixed with the public test-data S3 bucket URL.
    """
    if not data_source_is_s3:
        return _locate(s)
    return "s3n://h2o-public-test-data/" + s



In [43]:

    
# Pick either the big or the small demo by passing the matching list to
# h2o.import_file below.  Big data is 10M rows (14 monthly files); the small
# list is a single month.  Note: small_test is defined for convenience but is
# not used unless you swap it in by hand.
small_test = [mylocate("bigdata/laptop/citibike-nyc/2013-10.csv")]
big_test =   [mylocate("bigdata/laptop/citibike-nyc/2013-07.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2013-08.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2013-09.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2013-10.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2013-11.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2013-12.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-01.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-02.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-03.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-04.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-05.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-06.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-07.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-08.csv")]

# ----------

# 1- Load data - 1 row per bicycle trip.  Has columns showing the start and end
# station, trip duration and trip start time and day.  The larger dataset
# totals about 10 million rows
# A single parenthesised argument prints identically on Python 2 and Python 3.
print("Import and Parse bike data")
data = h2o.import_file(path=big_test)









    



Import and Parse bike data

Parse Progress: [##################################################] 100%

Parsed 10,407,546 rows and 15 cols:







    




File1
/Users/ludirehak/h2o-3/bigdata/laptop/citibike-nyc/2013-07.csv
File2
/Users/ludirehak/h2o-3/bigdata/laptop/citibike-nyc/2013-08.csv
File3
/Users/ludirehak/h2o-3/bigdata/laptop/citibike-nyc/2013-09.csv
File4
/Users/ludirehak/h2o-3/bigdata/laptop/citibike-nyc/2013-10.csv
File5
/Users/ludirehak/h2o-3/bigdata/laptop/citibike-nyc/2013-11.csv
File6
/Users/ludirehak/h2o-3/bigdata/laptop/citibike-nyc/2013-12.csv
File7
/Users/ludirehak/h2o-3/bigdata/laptop/citibike-nyc/2014-01.csv
File8
/Users/ludirehak/h2o-3/bigdata/laptop/citibike-nyc/2014-02.csv
File9
/Users/ludirehak/h2o-3/bigdata/laptop/citibike-nyc/2014-03.csv
File10
/Users/ludirehak/h2o-3/bigdata/laptop/citibike-nyc/2014-04.csv
File11
/Users/ludirehak/h2o-3/bigdata/laptop/citibike-nyc/2014-05.csv
File12
/Users/ludirehak/h2o-3/bigdata/laptop/citibike-nyc/2014-06.csv
File13
/Users/ludirehak/h2o-3/bigdata/laptop/citibike-nyc/2014-07.csv
File14
/Users/ludirehak/h2o-3/bigdata/laptop/citibike-nyc/2014-08.csv



In [44]:

    
# ----------

# 2- light data munging: group the bike starts per-day, converting the 10M rows
# of trips to about 140,000 station&day combos - predicting the number of trip
# starts per-station-per-day.

# Derive "Days since the Epoch" from the trip start time.
# NOTE: despite its name, secsPerDay holds the number of MILLISECONDS in a day
# (1000*60*60*24); the describe() output below shows starttime values on the
# order of 1.37e12, i.e. epoch milliseconds.  The name is kept because later
# cells reference it.
secsPerDay = 24 * 60 * 60 * 1000
startime = data["starttime"]
data["Days"] = (startime / secsPerDay).floor()
data.describe()









    



Rows: 10,407,546 Cols: 16

Chunk compression summary:






    




chunk_type
chunk_name
count
count_percentage
size
size_percentage
C0L
Constant Integers
117
1.5298117
    9.1 KB
0.0
C1
1-Byte Integers
478
6.25
   10.0 MB
1.7289143
C1N
1-Byte Integers (w/o NAs)
478
6.25
   10.0 MB
1.7289143
C1S
1-Byte Fractions
839
10.970188
   17.5 MB
3.042758
C2
2-Byte Integers
2616
34.20502
  108.8 MB
18.8909
C2S
2-Byte Fractions
314
4.1056485
   12.9 MB
2.2460942
C4
4-Byte Integers
214
2.7981172
   17.9 MB
3.1005228
C4S
4-Byte Fractions
389
5.086297
   32.4 MB
5.625424
C8
64-bit Integers
680
8.891213
  113.5 MB
19.704786
C8D
64-bit Reals
1523
19.913704
  253.0 MB
43.930134






    



Frame distribution summary:






    





size
number_of_rows
number_of_chunks_per_column
number_of_chunks
172.16.2.37:54321
  575.9 MB
10407546.0
478.0
7648.0
mean
  575.9 MB
10407546.0
478.0
7648.0
min
  575.9 MB
10407546.0
478.0
7648.0
max
  575.9 MB
10407546.0
478.0
7648.0
stddev
      0  B
0.0
0.0
0.0
total
  575.9 MB
10407546.0
478.0
7648.0






    



Column-by-Column Summary:







    





tripduration
starttime
stoptime
start station id
start station name
start station latitude
start station longitude
end station id
end station name
end station latitude
end station longitude
bikeid
usertype
birth year
gender
Days
type
int
time
time
int
enum
real
real
int
enum
real
real
int
enum
int
int
int
mins
60.0
1372662000000.0
1372662242000.0
72.0
0.0
40.7
-74.0
72.0
0.0
40.7
-74.0
14529.0
0.0
1899.0
0.0
15887.0
maxs
6250750.0
1409554787000.0
1409563605000.0
3002.0
339.0
40.771522
-74.0
3002.0
339.0
40.771522
-74.0
21689.0
1.0
1998.0
2.0
16314.0
mean
869.0
1390999858230.0
1391000727180.0
444.9
NaN
40.7
-74.0
445.3
NaN
40.7
-74.0
17895.7
0.9
1975.8
1.1
16099.0
sigma
2985.1
11806578171.7
11806555707.8
355.8
NaN
0.0
0.0
360.1
NaN
0.0
0.0
1938.8
0.3
11.1
0.6
136.6
zero_count
0
0
0
0
56836
0
0
0
55167
0
0
0
1247534
0
1248517
0
missing_count
0
0
0
0
0
0
0
0
0
0
0
0
0
1247644
0
0



In [45]:

    
# Now do a monster Group-By.  Count bike starts per-station per-day.  Ends up
# with about 340 stations times 400 days (140,000 rows).  This is what we want
# to predict.
# One output row per (day, start station) pair.
group_keys = ["Days", "start station name"]
grouped = data.group_by(group_keys)
bpd = grouped.count().get_frame()  # bikes-per-day: row count of each group
# The count lands in the third column (index 2); give it a meaningful name.
bpd.set_name(2, "bikes")
bpd.show()
bpd.describe()
bpd.dim









    



H2OFrame with 139261 rows and 3 columns: 






    






  
    
      
      Days
      start station name
      bikes
    
  
  
    
      0
      16313
      Greenwich St & N Moore St
      74
    
    
      1
      15993
      Henry St & Atlantic Ave
      56
    
    
      2
      16057
      Harrison St & Hudson St
      13
    
    
      3
      16249
      Greenwich St & Warren St
      197
    
    
      4
      16121
      Hanover Pl & Livingston St
      2
    
    
      5
      16185
      Hancock St & Bedford Ave
      14
    
    
      6
      15966
      Perry St & Bleecker St
      101
    
    
      7
      16222
      Park Pl & Church St
      53
    
    
      8
      16158
      Pearl St & Anchorage Pl
      15
    
    
      9
      16286
      Park Ave & St Edwards St
      5
    
  








    



Rows: 139,261 Cols: 3

Chunk compression summary:






    




chunk_type
chunk_name
count
count_percentage
size
size_percentage
C2
2-Byte Integers
96
100.0
  822.4 KB
100.0






    



Frame distribution summary:






    





size
number_of_rows
number_of_chunks_per_column
number_of_chunks
172.16.2.37:54321
  822.4 KB
139261.0
32.0
96.0
mean
  822.4 KB
139261.0
32.0
96.0
min
  822.4 KB
139261.0
32.0
96.0
max
  822.4 KB
139261.0
32.0
96.0
stddev
      0  B
0.0
0.0
0.0
total
  822.4 KB
139261.0
32.0
96.0






    



Column-by-Column Summary:







    





Days
start station name
bikes
type
int
enum
int
mins
15887.0
0.0
1.0
maxs
16314.0
339.0
680.0
mean
16100.0
NaN
74.7
sigma
123.6
NaN
64.1
zero_count
0
428
0
missing_count
0
0
0






    Out[45]:





[139261, 3]



In [46]:

    
# Quantiles: the data is fairly unbalanced; some station/day combos are wildly
# more popular than others.
# A single parenthesised argument prints identically on Python 2 and Python 3.
print("Quantiles of bikes-per-day")
# Show the quantiles of the per-station-per-day trip counts.
bpd["bikes"].quantile().show()









    



Quantiles of bikes-per-day
H2OFrame with 9 rows and 2 columns: 






    






  
    
      
      Probs
      bikesQuantiles
    
  
  
    
      0
      0.010
      2
    
    
      1
      0.100
      11
    
    
      2
      0.250
      26
    
    
      3
      0.333
      35
    
    
      4
      0.500
      58
    
    
      5
      0.667
      89
    
    
      6
      0.750
      107
    
    
      7
      0.900
      157
    
    
      8
      0.990
      291



In [47]:

    
# A little feature engineering
# Add in month-of-year (seasonality; fewer bike rides in winter than summer)
# Back from epoch days to an epoch timestamp.  secsPerDay is 1000*60*60*24, so
# despite the variable names this value is in milliseconds.
secs = bpd["Days"]*secsPerDay
bpd["Month"]     = secs.month().asfactor()
# Add in day-of-week (work-week; more bike rides on Sunday than Monday)
bpd["DayOfWeek"] = secs.dayOfWeek()
# A single parenthesised argument prints identically on Python 2 and Python 3.
print("Bikes-Per-Day")
bpd.describe()









    



Bikes-Per-Day
Rows: 139,261 Cols: 5

Chunk compression summary:






    




chunk_type
chunk_name
count
count_percentage
size
size_percentage
C1N
1-Byte Integers (w/o NAs)
64
40.0
  276.2 KB
25.145071
C2
2-Byte Integers
96
60.000004
  822.4 KB
74.85493






    



Frame distribution summary:






    





size
number_of_rows
number_of_chunks_per_column
number_of_chunks
172.16.2.37:54321
    1.1 MB
139261.0
32.0
160.0
mean
    1.1 MB
139261.0
32.0
160.0
min
    1.1 MB
139261.0
32.0
160.0
max
    1.1 MB
139261.0
32.0
160.0
stddev
      0  B
0.0
0.0
0.0
total
    1.1 MB
139261.0
32.0
160.0






    



Column-by-Column Summary:







    





Days
start station name
bikes
Month
DayOfWeek
type
int
enum
int
enum
enum
mins
15887.0
0.0
1.0
0.0
0.0
maxs
16314.0
339.0
680.0
11.0
6.0
mean
16100.0
NaN
74.7
NaN
NaN
sigma
123.6
NaN
64.1
NaN
NaN
zero_count
0
428
0
9949
19880
missing_count
0
0
0
0
0



In [48]:

    
# ----------
# 3- Fit a model on train; using test as validation

# Function for doing classic test/train/holdout split
def _train_on_bikes(model, predictors, train, test):
  """Train `model` to predict the "bikes" column, validating on `test`."""
  model.train(x               =predictors,
              y               ="bikes",
              training_frame  =train,
              validation_frame=test)


def split_fit_predict(data, seed=None):
  """Split `data` 60/30/10, fit four models on "bikes" and tabulate their R2.

  The frame is split at random into train (60%), test (30%, used as the
  validation frame) and holdout (10%).  GBM, DRF, GLM and Deep Learning models
  are trained in that order with every column except "bikes" as predictors,
  and the R2 on each split plus the wall-clock training time are collected
  into a table handed to h2o.H2ODisplay.

  The fitted models are published through the module-level names gbm0, drf0,
  glm0 and dl0 so they can be inspected from later cells.

  :param data: H2O frame containing a "Days" column and the "bikes" response.
  :param seed: optional seed forwarded to runif() so the split is
      reproducible; with the default None the split differs on every call,
      exactly as before.
  :return: None.
  """
  global gbm0,drf0,glm0,dl0

  # Classic Test/Train split: one uniform random number per row.
  days = data['Days']
  r = days.runif() if seed is None else days.runif(seed=seed)
  train = data[  r  < 0.6]
  test  = data[(0.6 <= r) & (r < 0.9)]
  hold  = data[ 0.9 <= r ]
  # One pre-formatted string prints the same text on Python 2 and Python 3.
  print("Training data has %s columns and %s rows, test has %s rows, holdout has %s"
        % (train.ncol, train.nrow, test.nrow, hold.nrow))

  # Work on a copy so the removals below can never alter the frame's own
  # list of column names.
  bike_names_x = list(data.names)
  bike_names_x.remove("bikes")

  # Run GBM (timing covers construction + training)
  s = time.time()
  gbm0 = H2OGradientBoostingEstimator(ntrees=500, # 500 works well
                                      max_depth=6,
                                      learn_rate=0.1)
  _train_on_bikes(gbm0, bike_names_x, train, test)
  gbm_elapsed = time.time() - s

  # Run DRF
  s = time.time()
  drf0 = H2ORandomForestEstimator(ntrees=250, max_depth=30)
  _train_on_bikes(drf0, bike_names_x, train, test)
  drf_elapsed = time.time() - s

  # Run GLM.  "WC1" is dropped from the predictor list first; because the list
  # is shared, the Deep Learning model below is trained without it as well.
  if "WC1" in bike_names_x: bike_names_x.remove("WC1")
  s = time.time()
  glm0 = H2OGeneralizedLinearEstimator(Lambda=[1e-5], family="poisson")
  _train_on_bikes(glm0, bike_names_x, train, test)
  glm_elapsed = time.time() - s

  # Run DL
  s = time.time()
  dl0 = H2ODeepLearningEstimator(hidden=[50,50,50,50], epochs=50)
  _train_on_bikes(dl0, bike_names_x, train, test)
  dl_elapsed = time.time() - s

  # ----------
  # 4- Score on train, test and holdout sets & report
  fitted = [("GBM", gbm0, gbm_elapsed),
            ("DRF", drf0, drf_elapsed),
            ("GLM", glm0, glm_elapsed),
            ("DL ", dl0,  dl_elapsed)]

  # make a pretty HTML table printout of the results
  header = ["Model", "R2 TRAIN", "R2 TEST", "R2 HOLDOUT", "Model Training Time (s)"]
  table  = []
  for label, model, elapsed in fitted:
    r2_scores = [model.model_performance(frame).r2() for frame in (train, test, hold)]
    table.append([label] + r2_scores + [round(elapsed,3)])
  h2o.H2ODisplay(table,header)



In [49]:

    
# Split the data (into test & train), fit some models and predict on the holdout data
split_fit_predict(bpd)
# In the run shown below the holdout r^2 is about 0.9 for GBM and about 0.8
# for GLM (the split is random, so exact values change from run to run).  This
# means that given just the station, the month and the day-of-week (plus the
# raw Days column, which is also left in the predictor list) we can predict
# roughly 90% of the variance of the bike-trip-starts.









    



Training data has 5 columns and 83800 rows, test has 41722 rows, holdout has 13739

gbm Model Build Progress: [##################################################] 100%

drf Model Build Progress: [##################################################] 100%

glm Model Build Progress: [##################################################] 100%

deeplearning Model Build Progress: [##################################################] 100%






    




Model
R2 TRAIN
R2 TEST
R2 HOLDOUT
Model Training Time (s)
GBM
1.0
0.9
0.9
19.065
DRF
0.9
0.8
0.8
23.345
GLM
0.8
0.8
0.8
0.36
DL 
0.9
0.9
0.9
67.712



In [50]:

    
# ----------
# 5- Now lets add some weather
# Load weather data
# Two hourly weather files, one per calendar year, parsed into a single frame.
weather_2013 = mylocate("bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2013.csv")
weather_2014 = mylocate("bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2014.csv")
wthr1 = h2o.import_file(path=[weather_2013, weather_2014])
# Peek at the data
wthr1.describe()









    



Parse Progress: [##################################################] 100%

Parsed 17,520 rows and 50 cols:







    




File1
/Users/ludirehak/h2o-3/bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2013.csv
File2
/Users/ludirehak/h2o-3/bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2014.csv






    



Rows: 17,520 Cols: 50

Chunk compression summary:






    




chunk_type
chunk_name
count
count_percentage
size
size_percentage
C0L
Constant Integers
107
6.294118
    8.4 KB
0.7889721
C0D
Constant Reals
436
25.647058
   34.1 KB
3.2148771
CXI
Sparse Integers
17
1.0
    1.5 KB
0.1
C1
1-Byte Integers
346
20.352942
  197.4 KB
18.634672
C1N
1-Byte Integers (w/o NAs)
214
12.588236
  122.3 KB
11.544063
C1S
1-Byte Fractions
214
12.588236
  125.3 KB
11.822968
C2S
2-Byte Fractions
196
11.529412
  214.5 KB
20.242111
C4S
4-Byte Fractions
170
10.0
  356.1 KB
33.612423






    



Frame distribution summary:






    





size
number_of_rows
number_of_chunks_per_column
number_of_chunks
172.16.2.37:54321
    1.0 MB
17520.0
34.0
1700.0
mean
    1.0 MB
17520.0
34.0
1700.0
min
    1.0 MB
17520.0
34.0
1700.0
max
    1.0 MB
17520.0
34.0
1700.0
stddev
      0  B
0.0
0.0
0.0
total
    1.0 MB
17520.0
34.0
1700.0






    



Column-by-Column Summary:







    





Year Local
Month Local
Day Local
Hour Local
Year UTC
Month UTC
Day UTC
Hour UTC
Cavok Reported
Cloud Ceiling (m)
Cloud Cover Fraction
Cloud Cover Fraction 1
Cloud Cover Fraction 2
Cloud Cover Fraction 3
Cloud Cover Fraction 4
Cloud Cover Fraction 5
Cloud Cover Fraction 6
Cloud Height (m) 1
Cloud Height (m) 2
Cloud Height (m) 3
Cloud Height (m) 4
Cloud Height (m) 5
Cloud Height (m) 6
Dew Point (C)
Humidity Fraction
Precipitation One Hour (mm)
Pressure Altimeter (mbar)
Pressure Sea Level (mbar)
Pressure Station (mbar)
Snow Depth (cm)
Temperature (C)
Visibility (km)
Weather Code 1
Weather Code 1/ Description
Weather Code 2
Weather Code 2/ Description
Weather Code 3
Weather Code 3/ Description
Weather Code 4
Weather Code 4/ Description
Weather Code 5
Weather Code 5/ Description
Weather Code 6
Weather Code 6/ Description
Weather Code Most Severe / Icon Code
Weather Code Most Severe
Weather Code Most Severe / Description
Wind Direction (degrees)
Wind Gust (m/s)
Wind Speed (m/s)
type
int
int
int
int
int
int
int
int
int
real
real
real
real
real
int
int
int
real
real
real
int
int
int
real
real
real
real
int
int
int
real
real
int
enum
int
enum
int
enum
int
enum
int
enum
int
enum
int
int
enum
int
real
real
mins
2013.0
1.0
1.0
0.0
2013.0
1.0
1.0
0.0
0.0
61.0
0.0
0.0
0.25
0.5
NaN
NaN
NaN
60.96
213.36
365.76
NaN
NaN
NaN
-26.7
0.1251
0.0
983.2949
NaN
NaN
NaN
-15.6
0.001
1.0
0.0
1.0
0.0
1.0
0.0
1.0
0.0
1.0
0.0
3.0
0.0
0.0
1.0
0.0
10.0
7.2
0.0
maxs
2014.0
12.0
31.0
23.0
2015.0
12.0
31.0
23.0
0.0
3657.6
1.0
1.0
1.0
1.0
NaN
NaN
NaN
3657.5999
3657.5999
3657.5999
NaN
NaN
NaN
24.4
1.0
26.924
1042.2113
NaN
NaN
NaN
36.1
16.0934
60.0
11.0
60.0
10.0
36.0
7.0
27.0
4.0
27.0
2.0
3.0
0.0
16.0
60.0
11.0
360.0
20.58
10.8
mean
2013.5
6.5
15.7
11.5
2013.5
6.5
15.7
11.5
0.0
1306.3
0.4
0.4
0.9
1.0
0.0
0.0
0.0
1294.0
1643.7
2084.9
0.0
0.0
0.0
4.3
0.6
1.4
1017.8
0.0
0.0
0.0
12.6
14.4
4.8
NaN
3.7
NaN
2.8
NaN
2.0
NaN
4.125
NaN
3.0
0.0
1.4
4.8
NaN
194.7
9.4
2.4
sigma
0.5
3.4
8.8
6.9
0.5
3.4
8.8
6.9
0.0
995.3
0.5
0.4
0.2
0.1
-0.0
-0.0
-0.0
962.7
916.7
887.2
-0.0
-0.0
-0.0
11.0
0.2
2.6
7.5
-0.0
-0.0
-0.0
10.0
3.7
5.7
NaN
6.1
NaN
5.8
NaN
3.1
NaN
6.2
NaN
0.0
0.0
4.1
5.7
NaN
106.4
1.8
1.6
zero_count
0
0
0
730
0
0
0
730
17455
0
8758
8758
0
0
-17520
-17520
-17520
0
0
0
-17520
-17520
-17520
268
0
501
0
-17520
-17520
-17520
269
0
0
17
0
30
0
13
-5044
-5024
-11241
-11229
-17030
-17028
14980
0
17
0
0
2768
missing_count
0
0
0
0
0
0
0
0
65
10780
375
375
14682
16535
17520
17520
17520
9103
14683
16535
17520
17520
17520
67
67
15660
360
17520
17520
17520
67
412
14980
14980
16477
16477
17181
17181
17433
17433
17504
17504
17518
17518
0
14980
14980
9382
14381
1283



In [51]:

    
# Lots of columns in there!  Lets plan on converting to time-since-epoch to do
# a 'join' with the bike data, plus gather weather info that might affect
# cyclists - rain, snow, temperature.  Alas, drop the "snow" column since it's
# all NA's.  Also add in dew point and humidity just in case.  Slice out just
# the columns of interest and drop the rest.
# Columns worth keeping: the local timestamp parts plus the weather readings.
weather_cols = [
    "Year Local",
    "Month Local",
    "Day Local",
    "Hour Local",
    "Dew Point (C)",
    "Humidity Fraction",
    "Precipitation One Hour (mm)",
    "Temperature (C)",
    "Weather Code 1/ Description",
]
wthr2 = wthr1[weather_cols]

# Shorten the two unwieldy column names; each index is looked up by name.
rain_idx = wthr2.names.index("Precipitation One Hour (mm)")
wthr2.set_name(rain_idx, "Rain (mm)")
wc1_idx = wthr2.names.index("Weather Code 1/ Description")
wthr2.set_name(wc1_idx, "WC1")
wthr2.describe()
# Much better!









    



Rows: 17,520 Cols: 9

Chunk compression summary:






    




chunk_type
chunk_name
count
count_percentage
size
size_percentage
C0L
Constant Integers
46
15.0
    3.6 KB
1.780005
C1
1-Byte Integers
34
11.111112
   19.4 KB
9.592678
C1N
1-Byte Integers (w/o NAs)
90
29.411766
   51.5 KB
25.494701
C1S
1-Byte Fractions
42
13.725491
   24.0 KB
11.894592
C2S
2-Byte Fractions
94
30.718956
  103.4 KB
51.238026






    



Frame distribution summary:






    





size
number_of_rows
number_of_chunks_per_column
number_of_chunks
172.16.2.37:54321
  201.9 KB
17520.0
34.0
306.0
mean
  201.9 KB
17520.0
34.0
306.0
min
  201.9 KB
17520.0
34.0
306.0
max
  201.9 KB
17520.0
34.0
306.0
stddev
      0  B
0.0
0.0
0.0
total
  201.9 KB
17520.0
34.0
306.0






    



Column-by-Column Summary:







    





Year Local
Month Local
Day Local
Hour Local
Dew Point (C)
Humidity Fraction
Rain (mm)
Temperature (C)
WC1
type
int
int
int
int
real
real
real
real
enum
mins
2013.0
1.0
1.0
0.0
-26.7
0.1251
0.0
-15.6
0.0
maxs
2014.0
12.0
31.0
23.0
24.4
1.0
26.924
36.1
11.0
mean
2013.5
6.5
15.7
11.5
4.3
0.6
1.4
12.6
NaN
sigma
0.5
3.4
8.8
6.9
11.0
0.2
2.6
10.0
NaN
zero_count
0
0
0
730
268
0
501
269
17
missing_count
0
0
0
0
67
67
15660
67
14980



In [52]:

    
# Keep only the noon observation of each day (local hour 12).
is_noon = wthr2["Hour Local"] == 12
wthr3 = wthr2[is_noon]



In [53]:

    
# Lets now get Days since the epoch... we'll convert year/month/day into Epoch
# time, and then back to Epoch days.  Need zero-based month and days, but have
# 1-based.
# mktime is given zero-based month and day, hence the "-1" on both.
noon_msec = h2o.H2OFrame.mktime(
    year=wthr3["Year Local"],
    month=wthr3["Month Local"] - 1,
    day=wthr3["Day Local"] - 1,
    hour=wthr3["Hour Local"],
)
wthr3["msec"] = noon_msec
# Despite the name this is the number of milliseconds in one day.
secsPerDay = 24 * 60 * 60 * 1000
wthr3["Days"] = (wthr3["msec"] / secsPerDay).floor()
wthr3.describe()
# msec looks sane (numbers like 1.3e12 are in the correct range for msec since
# 1970).  Epoch Days matches closely with the epoch day numbers from the
# CitiBike dataset.









    



Rows: 730 Cols: 11

Chunk compression summary:






    




chunk_type
chunk_name
count
count_percentage
size
size_percentage
C0L
Constant Integers
80
21.390373
    6.3 KB
12.498779
C0D
Constant Reals
13
3.4759357
    1.0 KB
2.0310516
C1
1-Byte Integers
30
8.021391
    2.6 KB
5.2455816
C1N
1-Byte Integers (w/o NAs)
56
14.973262
    4.9 KB
9.801778
C1S
1-Byte Fractions
34
9.090909
    3.5 KB
7.0032225
C2S
2-Byte Fractions
34
9.090909
    4.2 KB
8.4288645
CUD
Unique Reals
25
6.6844916
    3.6 KB
7.2297626
C8D
64-bit Reals
102
27.272728
   23.9 KB
47.76096






    



Frame distribution summary:






    





size
number_of_rows
number_of_chunks_per_column
number_of_chunks
172.16.2.37:54321
   50.0 KB
730.0
34.0
374.0
mean
   50.0 KB
730.0
34.0
374.0
min
   50.0 KB
730.0
34.0
374.0
max
   50.0 KB
730.0
34.0
374.0
stddev
      0  B
0.0
0.0
0.0
total
   50.0 KB
730.0
34.0
374.0






    



Column-by-Column Summary:







    





Year Local
Month Local
Day Local
Hour Local
Dew Point (C)
Humidity Fraction
Rain (mm)
Temperature (C)
WC1
msec
Days
type
int
int
int
int
real
real
real
real
enum
int
int
mins
2013.0
1.0
1.0
12.0
-26.7
0.1723
0.0
-13.9
0.0
1357070400000.0
15706.0
maxs
2014.0
12.0
31.0
12.0
23.3
1.0
12.446
34.4
10.0
1420056000000.0
16435.0
mean
2013.5
6.5
15.7
12.0
4.2
0.5
1.5
14.1
NaN
1388560852600.0
16070.5
sigma
0.5
3.5
8.8
0.0
11.1
0.2
2.4
10.4
NaN
18219740080.4
210.9
zero_count
0
0
0
0
14
0
-174
7
-83
0
0
missing_count
0
0
0
0
3
3
660
3
620
0
0



In [54]:

    
# Strip the helper time columns one after another, leaving an easy-to-handle
# dataset keyed by "Days".
wthr4 = wthr3
for time_col in ["Year Local", "Month Local", "Day Local", "Hour Local", "msec"]:
    wthr4 = wthr4.drop(time_col)



In [55]:

    
# Also, most rain numbers are missing - lets assume those are zero rain days
# (the wthr3 summary above reports 660 missing values out of 730 rows).
rain = wthr4["Rain (mm)"]
# Overwrite every missing entry with 0 ...
rain[ rain.isna() ] = 0
# ... and store the filled column back into the weather frame.
wthr4["Rain (mm)"] = rain



In [56]:

    
# ----------
# 6 - Join the weather data-per-day to the bike-starts-per-day
# A single parenthesised argument prints identically on Python 2 and Python 3.
print("Merge Daily Weather with Bikes-Per-Day")
# Keep every bikes-per-day row (allLeft=True); weather rows with no matching
# bike row are not added (allRite=False).
bpd_with_weather = bpd.merge(wthr4,allLeft=True,allRite=False)
bpd_with_weather.describe()
bpd_with_weather.show()









    



Merge Daily Weather with Bikes-Per-Day
Rows: 139,261 Cols: 10

Chunk compression summary:






    




chunk_type
chunk_name
count
count_percentage
size
size_percentage
C1
1-Byte Integers
32
10.0
  138.1 KB
4.3211317
C1N
1-Byte Integers (w/o NAs)
64
20.0
  276.2 KB
8.642263
C2
2-Byte Integers
96
30.000002
  822.4 KB
25.72735
CUD
Unique Reals
96
30.000002
  869.6 KB
27.205559
C8D
64-bit Reals
32
10.0
    1.1 MB
34.10369






    



Frame distribution summary:






    





size
number_of_rows
number_of_chunks_per_column
number_of_chunks
172.16.2.37:54321
    3.1 MB
139261.0
32.0
320.0
mean
    3.1 MB
139261.0
32.0
320.0
min
    3.1 MB
139261.0
32.0
320.0
max
    3.1 MB
139261.0
32.0
320.0
stddev
      0  B
0.0
0.0
0.0
total
    3.1 MB
139261.0
32.0
320.0






    



Column-by-Column Summary:







    





Days
start station name
bikes
Month
DayOfWeek
Humidity Fraction
Rain (mm)
Temperature (C)
WC1
Dew Point (C)
type
int
enum
int
enum
enum
real
real
real
enum
real
mins
15887.0
0.0
1.0
0.0
0.0
0.1723
0.0
-13.9
0.0
-26.7
maxs
16314.0
339.0
680.0
11.0
6.0
1.0
8.382
34.4
10.0
23.3
mean
16100.0
NaN
74.7
NaN
NaN
0.5
0.1
15.6
NaN
5.5
sigma
123.6
NaN
64.1
NaN
NaN
0.2
0.6
10.9
NaN
11.7
zero_count
0
428
0
9949
19880
0
131155
1598
324
1954
missing_count
0
0
0
0
0
981
0
981
119130
981






    



H2OFrame with 139261 rows and 10 columns: 






    






  
    
      
      Days
      start station name
      bikes
      Month
      DayOfWeek
      Humidity Fraction
      Rain (mm)
      Temperature (C)
      WC1
      Dew Point (C)
    
  
  
    
      0
      16313
      Greenwich St & N Moore St
      74
      8
      Sat
      0.6287
      0.000
      28.9
      NaN
      21.1
    
    
      1
      15993
      Henry St & Atlantic Ave
      56
      10
      Mon
      0.6082
      0.000
      18.3
      NaN
      10.6
    
    
      2
      16057
      Harrison St & Hudson St
      13
      12
      Tue
      0.5596
      0.000
      1.1
      NaN
      -6.7
    
    
      3
      16249
      Greenwich St & Warren St
      197
      6
      Fri
      0.3848
      0.000
      28.3
      NaN
      12.8
    
    
      4
      16121
      Hanover Pl & Livingston St
      2
      2
      Wed
      0.4331
      0.000
      7.8
      NaN
      -3.9
    
    
      5
      16185
      Hancock St & Bedford Ave
      14
      4
      Thu
      0.2092
      0.000
      15.0
      NaN
      -7.2
    
    
      6
      15966
      Perry St & Bleecker St
      101
      9
      Tue
      0.3836
      0.000
      18.9
      NaN
      4.4
    
    
      7
      16222
      Park Pl & Church St
      53
      5
      Sat
      0.2586
      0.000
      22.2
      NaN
      1.7
    
    
      8
      16158
      Pearl St & Anchorage Pl
      15
      3
      Fri
      0.8309
      0.254
      8.3
      light rain
      5.6
    
    
      9
      16286
      Park Ave & St Edwards St
      5
      8
      Sun
      0.5444
      0.000
      27.2
      NaN
      17.2



In [57]:

    
# 7 - Test/Train split again, model build again, this time with weather.
# The predictor list is every column except "bikes", so it now also contains
# the weather columns joined in above.
split_fit_predict(bpd_with_weather)









    



Training data has 10 columns and 83867 rows, test has 41559 rows, holdout has 13835

gbm Model Build Progress: [##################################################] 100%

drf Model Build Progress: [##################################################] 100%

glm Model Build Progress: [##################################################] 100%

deeplearning Model Build Progress: [##################################################] 100%






    




Model
R2 TRAIN
R2 TEST
R2 HOLDOUT
Model Training Time (s)
GBM
1.0
0.9
0.9
25.557
DRF
0.9
0.9
0.8
136.197
GLM
0.8
0.8
0.8
0.367
DL 
0.9
0.9
0.9
74.342



In [ ]:

H2O cluster uptime:	9 minutes 59 seconds 442 milliseconds
H2O cluster version:	3.5.0.99999
H2O cluster name:	ludirehak
H2O cluster total nodes:	1
H2O cluster total memory:	4.44 GB
H2O cluster total cores:	8
H2O cluster allowed cores:	8
H2O cluster healthy:	True
H2O Connection ip:	127.0.0.1
H2O Connection port:	54321

File1	/Users/ludirehak/h2o-3/bigdata/laptop/citibike-nyc/2013-07.csv
File2	/Users/ludirehak/h2o-3/bigdata/laptop/citibike-nyc/2013-08.csv
File3	/Users/ludirehak/h2o-3/bigdata/laptop/citibike-nyc/2013-09.csv
File4	/Users/ludirehak/h2o-3/bigdata/laptop/citibike-nyc/2013-10.csv
File5	/Users/ludirehak/h2o-3/bigdata/laptop/citibike-nyc/2013-11.csv
File6	/Users/ludirehak/h2o-3/bigdata/laptop/citibike-nyc/2013-12.csv
File7	/Users/ludirehak/h2o-3/bigdata/laptop/citibike-nyc/2014-01.csv
File8	/Users/ludirehak/h2o-3/bigdata/laptop/citibike-nyc/2014-02.csv
File9	/Users/ludirehak/h2o-3/bigdata/laptop/citibike-nyc/2014-03.csv
File10	/Users/ludirehak/h2o-3/bigdata/laptop/citibike-nyc/2014-04.csv
File11	/Users/ludirehak/h2o-3/bigdata/laptop/citibike-nyc/2014-05.csv
File12	/Users/ludirehak/h2o-3/bigdata/laptop/citibike-nyc/2014-06.csv
File13	/Users/ludirehak/h2o-3/bigdata/laptop/citibike-nyc/2014-07.csv
File14	/Users/ludirehak/h2o-3/bigdata/laptop/citibike-nyc/2014-08.csv

chunk_type	chunk_name	count	count_percentage	size	size_percentage
C0L	Constant Integers	117	1.5298117	9.1 KB	0.0
C1	1-Byte Integers	478	6.25	10.0 MB	1.7289143
C1N	1-Byte Integers (w/o NAs)	478	6.25	10.0 MB	1.7289143
C1S	1-Byte Fractions	839	10.970188	17.5 MB	3.042758
C2	2-Byte Integers	2616	34.20502	108.8 MB	18.8909
C2S	2-Byte Fractions	314	4.1056485	12.9 MB	2.2460942
C4	4-Byte Integers	214	2.7981172	17.9 MB	3.1005228
C4S	4-Byte Fractions	389	5.086297	32.4 MB	5.625424
C8	64-bit Integers	680	8.891213	113.5 MB	19.704786
C8D	64-bit Reals	1523	19.913704	253.0 MB	43.930134

	size	number_of_rows	number_of_chunks_per_column	number_of_chunks
172.16.2.37:54321	575.9 MB	10407546.0	478.0	7648.0
mean	575.9 MB	10407546.0	478.0	7648.0
min	575.9 MB	10407546.0	478.0	7648.0
max	575.9 MB	10407546.0	478.0	7648.0
stddev	0 B	0.0	0.0	0.0
total	575.9 MB	10407546.0	478.0	7648.0

	tripduration	starttime	stoptime	start station id	start station name	start station latitude	start station longitude	end station id	end station name	end station latitude	end station longitude	bikeid	usertype	birth year	gender	Days
type	int	time	time	int	enum	real	real	int	enum	real	real	int	enum	int	int	int
mins	60.0	1372662000000.0	1372662242000.0	72.0	0.0	40.7	-74.0	72.0	0.0	40.7	-74.0	14529.0	0.0	1899.0	0.0	15887.0
maxs	6250750.0	1409554787000.0	1409563605000.0	3002.0	339.0	40.771522	-74.0	3002.0	339.0	40.771522	-74.0	21689.0	1.0	1998.0	2.0	16314.0
mean	869.0	1390999858230.0	1391000727180.0	444.9	NaN	40.7	-74.0	445.3	NaN	40.7	-74.0	17895.7	0.9	1975.8	1.1	16099.0
sigma	2985.1	11806578171.7	11806555707.8	355.8	NaN	0.0	0.0	360.1	NaN	0.0	0.0	1938.8	0.3	11.1	0.6	136.6
zero_count	0	0	0	0	56836	0	0	0	55167	0	0	0	1247534	0	1248517	0
missing_count	0	0	0	0	0	0	0	0	0	0	0	0	0	1247644	0	0

	Days	start station name	bikes
0	16313	Greenwich St & N Moore St	74
1	15993	Henry St & Atlantic Ave	56
2	16057	Harrison St & Hudson St	13
3	16249	Greenwich St & Warren St	197
4	16121	Hanover Pl & Livingston St	2
5	16185	Hancock St & Bedford Ave	14
6	15966	Perry St & Bleecker St	101
7	16222	Park Pl & Church St	53
8	16158	Pearl St & Anchorage Pl	15
9	16286	Park Ave & St Edwards St	5

	Days	start station name	bikes
type	int	enum	int
mins	15887.0	0.0	1.0
maxs	16314.0	339.0	680.0
mean	16100.0	NaN	74.7
sigma	123.6	NaN	64.1
zero_count	0	428	0
missing_count	0	0	0

	Probs	bikesQuantiles
0	0.010	2
1	0.100	11
2	0.250	26
3	0.333	35
4	0.500	58
5	0.667	89
6	0.750	107
7	0.900	157
8	0.990	291

Model	R2 TRAIN	R2 TEST	R2 HOLDOUT	Model Training Time (s)
GBM	1.0	0.9	0.9	19.065
DRF	0.9	0.8	0.8	23.345
GLM	0.8	0.8	0.8	0.36
DL	0.9	0.9	0.9	67.712

	Year Local	Month Local	Day Local	Hour Local	Year UTC	Month UTC	Day UTC	Hour UTC	Cavok Reported	Cloud Ceiling (m)	Cloud Cover Fraction	Cloud Cover Fraction 1	Cloud Cover Fraction 2	Cloud Cover Fraction 3	Cloud Cover Fraction 4	Cloud Cover Fraction 5	Cloud Cover Fraction 6	Cloud Height (m) 1	Cloud Height (m) 2	Cloud Height (m) 3	Cloud Height (m) 4	Cloud Height (m) 5	Cloud Height (m) 6	Dew Point (C)	Humidity Fraction	Precipitation One Hour (mm)	Pressure Altimeter (mbar)	Pressure Sea Level (mbar)	Pressure Station (mbar)	Snow Depth (cm)	Temperature (C)	Visibility (km)	Weather Code 1	Weather Code 1/ Description	Weather Code 2	Weather Code 2/ Description	Weather Code 3	Weather Code 3/ Description	Weather Code 4	Weather Code 4/ Description	Weather Code 5	Weather Code 5/ Description	Weather Code 6	Weather Code 6/ Description	Weather Code Most Severe / Icon Code	Weather Code Most Severe	Weather Code Most Severe / Description	Wind Direction (degrees)	Wind Gust (m/s)	Wind Speed (m/s)
type	int	int	int	int	int	int	int	int	int	real	real	real	real	real	int	int	int	real	real	real	int	int	int	real	real	real	real	int	int	int	real	real	int	enum	int	enum	int	enum	int	enum	int	enum	int	enum	int	int	enum	int	real	real
mins	2013.0	1.0	1.0	0.0	2013.0	1.0	1.0	0.0	0.0	61.0	0.0	0.0	0.25	0.5	NaN	NaN	NaN	60.96	213.36	365.76	NaN	NaN	NaN	-26.7	0.1251	0.0	983.2949	NaN	NaN	NaN	-15.6	0.001	1.0	0.0	1.0	0.0	1.0	0.0	1.0	0.0	1.0	0.0	3.0	0.0	0.0	1.0	0.0	10.0	7.2	0.0
maxs	2014.0	12.0	31.0	23.0	2015.0	12.0	31.0	23.0	0.0	3657.6	1.0	1.0	1.0	1.0	NaN	NaN	NaN	3657.5999	3657.5999	3657.5999	NaN	NaN	NaN	24.4	1.0	26.924	1042.2113	NaN	NaN	NaN	36.1	16.0934	60.0	11.0	60.0	10.0	36.0	7.0	27.0	4.0	27.0	2.0	3.0	0.0	16.0	60.0	11.0	360.0	20.58	10.8
mean	2013.5	6.5	15.7	11.5	2013.5	6.5	15.7	11.5	0.0	1306.3	0.4	0.4	0.9	1.0	0.0	0.0	0.0	1294.0	1643.7	2084.9	0.0	0.0	0.0	4.3	0.6	1.4	1017.8	0.0	0.0	0.0	12.6	14.4	4.8	NaN	3.7	NaN	2.8	NaN	2.0	NaN	4.125	NaN	3.0	0.0	1.4	4.8	NaN	194.7	9.4	2.4
sigma	0.5	3.4	8.8	6.9	0.5	3.4	8.8	6.9	0.0	995.3	0.5	0.4	0.2	0.1	-0.0	-0.0	-0.0	962.7	916.7	887.2	-0.0	-0.0	-0.0	11.0	0.2	2.6	7.5	-0.0	-0.0	-0.0	10.0	3.7	5.7	NaN	6.1	NaN	5.8	NaN	3.1	NaN	6.2	NaN	0.0	0.0	4.1	5.7	NaN	106.4	1.8	1.6
zero_count	0	0	0	730	0	0	0	730	17455	0	8758	8758	0	0	-17520	-17520	-17520	0	0	0	-17520	-17520	-17520	268	0	501	0	-17520	-17520	-17520	269	0	0	17	0	30	0	13	-5044	-5024	-11241	-11229	-17030	-17028	14980	0	17	0	0	2768
missing_count	0	0	0	0	0	0	0	0	65	10780	375	375	14682	16535	17520	17520	17520	9103	14683	16535	17520	17520	17520	67	67	15660	360	17520	17520	17520	67	412	14980	14980	16477	16477	17181	17181	17433	17433	17504	17504	17518	17518	0	14980	14980	9382	14381	1283

	Days	start station name	bikes	Month	DayOfWeek	Humidity Fraction	Rain (mm)	Temperature (C)	WC1	Dew Point (C)
0	16313	Greenwich St & N Moore St	74	8	Sat	0.6287	0.000	28.9	NaN	21.1
1	15993	Henry St & Atlantic Ave	56	10	Mon	0.6082	0.000	18.3	NaN	10.6
2	16057	Harrison St & Hudson St	13	12	Tue	0.5596	0.000	1.1	NaN	-6.7
3	16249	Greenwich St & Warren St	197	6	Fri	0.3848	0.000	28.3	NaN	12.8
4	16121	Hanover Pl & Livingston St	2	2	Wed	0.4331	0.000	7.8	NaN	-3.9
5	16185	Hancock St & Bedford Ave	14	4	Thu	0.2092	0.000	15.0	NaN	-7.2
6	15966	Perry St & Bleecker St	101	9	Tue	0.3836	0.000	18.9	NaN	4.4
7	16222	Park Pl & Church St	53	5	Sat	0.2586	0.000	22.2	NaN	1.7
8	16158	Pearl St & Anchorage Pl	15	3	Fri	0.8309	0.254	8.3	light rain	5.6
9	16286	Park Ave & St Edwards St	5	8	Sun	0.5444	0.000	27.2	NaN	17.2