In [1]:
import h2o
import time
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator

In [2]:
# Explore a typical Data Science workflow with H2O and Python
#
# Goal: assist the manager of CitiBike of NYC to load-balance the bicycles
# across the CitiBike network of stations, by predicting the number of bike
# trips taken from the station every day.  Use 10 million rows of historical
# data, and eventually add weather data.


# Connect to a cluster
h2o.init()


H2O cluster uptime: 1 minutes 6 seconds 272 milliseconds
H2O cluster version: 3.7.0.99999
H2O cluster name: ece
H2O cluster total nodes: 1
H2O cluster total memory: 8.89 GB
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster healthy: True
H2O Connection ip: 127.0.0.1
H2O Connection port: 54321
H2O Connection proxy: None

In [3]:
from h2o.utils.shared_utils import _locate # private function. used to find files within h2o git project directory.

# Set this to True if you want to fetch the data directly from S3.
# This is useful if your cluster is running in EC2.
data_source_is_s3 = False

def mylocate(s):
    if data_source_is_s3:
        return "s3n://h2o-public-test-data/" + s
    else:
        return _locate(s)

In [4]:
# Pick either the big or the small demo.
# Big data is 10M rows
small_test = [mylocate("bigdata/laptop/citibike-nyc/2013-10.csv")]
big_test =   [mylocate("bigdata/laptop/citibike-nyc/2013-07.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2013-08.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2013-09.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2013-10.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2013-11.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2013-12.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-01.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-02.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-03.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-04.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-05.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-06.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-07.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-08.csv")]

# ----------

# 1- Load data - 1 row per bicycle trip.  Has columns showing the start and end
# station, trip duration and trip start time and day.  The larger dataset
# totals about 10 million rows
print("Import and Parse bike data")
data = h2o.import_file(path=big_test)


Import and Parse bike data

Parse Progress: [##################################################] 100%

In [5]:
# ----------

# 2- light data munging: group the bike starts per-day, converting the 10M rows
# of trips to about 140,000 station&day combos - predicting the number of trip
# starts per-station-per-day.

# Convert start time to: Day since the Epoch
startime = data["starttime"]
secsPerDay=1000*60*60*24
data["Days"] = (startime/secsPerDay).floor()
data.describe()


Rows:10,407,546 Cols:16

Chunk compression summary:
chunk_type chunk_name count count_percentage size size_percentage
C0L Constant Integers 117 1.5298117 9.1 KB 0.0015501
C1 1-Byte Integers 478 6.25 10.0 MB 1.7289143
C1N 1-Byte Integers (w/o NAs) 478 6.25 10.0 MB 1.7289143
C1S 1-Byte Fractions 839 10.970188 17.5 MB 3.042758
C2 2-Byte Integers 2616 34.20502 108.8 MB 18.8909
C2S 2-Byte Fractions 314 4.1056485 12.9 MB 2.2460942
C4 4-Byte Integers 214 2.7981172 17.9 MB 3.1005228
C4S 4-Byte Fractions 389 5.086297 32.4 MB 5.625424
C8 64-bit Integers 680 8.891213 113.5 MB 19.704786
C8D 64-bit Reals 1523 19.913704 253.0 MB 43.930134
Frame distribution summary:
size number_of_rows number_of_chunks_per_column number_of_chunks
172.16.2.61:54321 575.9 MB 10407546.0000000 478.0 7648.0
mean 575.9 MB 10407546.0000000 478.0 7648.0
min 575.9 MB 10407546.0000000 478.0 7648.0
max 575.9 MB 10407546.0000000 478.0 7648.0
stddev 0 B 0.0 0.0 0.0
total 575.9 MB 10407546.0000000 478.0 7648.0

tripduration starttime stoptime start station id start station name start station latitude start station longitude end station id end station name end station latitude end station longitude bikeid usertype birth year gender Days
type int time time int enum real real int enum real real int enum int int int
mins 60.0 1.372662e+12 1.372662242e+12 72.0 0.0 40.680342423 -74.01713445 72.0 0.0 40.680342423 -74.01713445 14529.0 0.0 1899.0 0.0 15887.0
mean 868.968726057 1.39099985823e+121.39100072718e+12444.858835407 NaN 40.7343819823 -73.9910570182 445.259785544 NaN 40.7340868895 -73.9911707799 17895.66183580.8801317813061975.798923941.08414654137 16099.0245177
maxs 6250750.0 1.409554787e+12 1.409563605e+12 3002.0 339.0 40.771522 -73.9500479759 3002.0 339.0 40.771522 -73.9500479759 21689.0 1.0 1998.0 2.0 16314.0
sigma 2985.10540532 11806578171.7 11806555707.8 355.755989765 NaN 0.0197100508736 0.0123453320185 360.070380844 NaN 0.0197309578633 0.0124311861598 1938.805178840.32480738750611.132784905 0.563019777794136.647269305
zeros 0 0 0 0 56836 0 0 0 55167 0 0 0 1247534 0 1248517 0
missing0 0 0 0 0 0 0 0 0 0 0 0 0 1247644 0 0
0 634.0 1.372662e+12 1.372662634e+12 164.0 E 47 St & 2 Ave 40.75323098 -73.97032517 504.0 1 Ave & E 15 St 40.73221853 -73.98165557 16950.0 Customer nan 0.0 15887.0
1 1547.0 1.372662002e+12 1.372663549e+12 388.0 W 26 St & 10 Ave 40.749717753 -74.002950346 459.0 W 20 St & 11 Ave 40.746745 -74.007756 19816.0 Customer nan 0.0 15887.0
2 178.0 1.372662064e+12 1.372662242e+12 293.0 Lafayette St & E 8 St 40.73028666 -73.9907647 237.0 E 11 St & 2 Ave 40.73047309 -73.98672378 14548.0 Subscriber 1980.0 2.0 15887.0
3 1580.0 1.372662066e+12 1.372663646e+12 531.0 Forsyth St & Broome St 40.71893904 -73.99266288 499.0 Broadway & W 60 St 40.76915505 -73.98191841 16063.0 Customer nan 0.0 15887.0
4 757.0 1.37266207e+12 1.372662827e+12 382.0 University Pl & E 14 St40.73492695 -73.99200509 410.0 Suffolk St & Stanton St 40.72066442 -73.98517977 19213.0 Subscriber 1986.0 1.0 15887.0
5 861.0 1.372662083e+12 1.372662944e+12 511.0 E 14 St & Avenue B 40.72938685 -73.97772429 454.0 E 51 St & 1 Ave 40.75455731 -73.96592976 16223.0 Subscriber 1988.0 1.0 15887.0
6 550.0 1.372662119e+12 1.372662669e+12 293.0 Lafayette St & E 8 St 40.73028666 -73.9907647 394.0 E 9 St & Avenue C 40.72521311 -73.97768752 16746.0 Customer nan 0.0 15887.0
7 288.0 1.372662136e+12 1.372662424e+12 224.0 Spruce St & Nassau St 40.71146364 -74.00552427 376.0 John St & William St 40.70862144 -74.00722156 16062.0 Subscriber 1985.0 2.0 15887.0
8 766.0 1.372662136e+12 1.372662902e+12 432.0 E 7 St & Avenue A 40.72621788 -73.98379855 336.0 Sullivan St & Washington Sq40.73047747 -73.99906065 17963.0 Subscriber 1980.0 2.0 15887.0
9 773.0 1.372662143e+12 1.372662916e+12 173.0 Broadway & W 49 St 40.76064679 -73.98442659 479.0 9 Ave & W 45 St 40.76019252 -73.9912551 19365.0 Subscriber 1989.0 1.0 15887.0

In [6]:
# Now do a monster Group-By.  Count bike starts per-station per-day.  Ends up
# with about 340 stations times 400 days (140,000 rows).  This is what we want
# to predict.
grouped = data.group_by(["Days","start station name"])
bpd = grouped.count().get_frame() # Compute bikes-per-day
bpd.set_name(2,"bikes")
bpd.show()
bpd.describe()
bpd.dim


Daysstart station name bikes
158871 Ave & E 15 St 47
158871 Ave & E 18 St 40
158871 Ave & E 30 St 42
158871 Ave & E 44 St 12
1588710 Ave & W 28 St 28
1588711 Ave & W 27 St 30
1588711 Ave & W 41 St 20
1588712 Ave & W 40 St 13
158872 Ave & E 31 St 33
158872 Ave & E 58 St 54
Rows:139,261 Cols:3

Chunk compression summary:
chunk_type chunk_name count count_percentage size size_percentage
C1N 1-Byte Integers (w/o NAs) 2 2.0833335 8.6 KB 1.2723082
C1S 1-Byte Fractions 32 33.333336 138.6 KB 20.434813
C2 2-Byte Integers 62 64.58333 531.1 KB 78.29288
Frame distribution summary:
size number_of_rows number_of_chunks_per_column number_of_chunks
172.16.2.61:54321 678.4 KB 139261.0 32.0 96.0
mean 678.4 KB 139261.0 32.0 96.0
min 678.4 KB 139261.0 32.0 96.0
max 678.4 KB 139261.0 32.0 96.0
stddev 0 B 0.0 0.0 0.0
total 678.4 KB 139261.0 32.0 96.0

Days start station name bikes
type int enum int
mins 15887.0 0.0 1.0
mean 16099.9758008NaN 74.7329833909
maxs 16314.0 339.0 680.0
sigma 123.635133897NaN 64.1241591283
zeros 0 428 0
missing0 0 0
0 15887.0 1 Ave & E 15 St 47.0
1 15887.0 1 Ave & E 18 St 40.0
2 15887.0 1 Ave & E 30 St 42.0
3 15887.0 1 Ave & E 44 St 12.0
4 15887.0 10 Ave & W 28 St 28.0
5 15887.0 11 Ave & W 27 St 30.0
6 15887.0 11 Ave & W 41 St 20.0
7 15887.0 12 Ave & W 40 St 13.0
8 15887.0 2 Ave & E 31 St 33.0
9 15887.0 2 Ave & E 58 St 54.0
Out[6]:
[139261, 3]

In [7]:
# Quantiles: the data is fairly unbalanced; some station/day combos are wildly
# more popular than others.
print("Quantiles of bikes-per-day")
bpd["bikes"].quantile().show()


Quantiles of bikes-per-day
Probs bikesQuantiles
0.01 2
0.1 11
0.25 26
0.333 35
0.5 58
0.667 89
0.75 107
0.9 157
0.99 291

In [8]:
# A little feature engineering
# Add in month-of-year (seasonality; fewer bike rides in winter than summer)
secs = bpd["Days"]*secsPerDay
bpd["Month"]     = secs.month().asfactor()
# Add in day-of-week (work-week; more bike rides on Sunday than Monday)
bpd["DayOfWeek"] = secs.dayOfWeek()
print("Bikes-Per-Day")
bpd.describe()


Bikes-Per-Day
Rows:139,261 Cols:5

Chunk compression summary:
chunk_type chunk_name count count_percentage size size_percentage
C0L Constant Integers 18 11.25 1.4 KB 0.1607895
CBS Bits 1 0.625 614 B 0.0685588
C1N 1-Byte Integers (w/o NAs) 47 29.374998 202.9 KB 23.194105
C1S 1-Byte Fractions 32 20.0 138.6 KB 15.849935
C2 2-Byte Integers 62 38.75 531.1 KB 60.726612
Frame distribution summary:
size number_of_rows number_of_chunks_per_column number_of_chunks
172.16.2.61:54321 874.6 KB 139261.0 32.0 160.0
mean 874.6 KB 139261.0 32.0 160.0
min 874.6 KB 139261.0 32.0 160.0
max 874.6 KB 139261.0 32.0 160.0
stddev 0 B 0.0 0.0 0.0
total 874.6 KB 139261.0 32.0 160.0

Days start station name bikes Month DayOfWeek
type int enum int enum enum
mins 15887.0 0.0 1.0 0.0 0.0
mean 16099.9758008NaN 74.7329833909NaN NaN
maxs 16314.0 339.0 680.0 11.0 6.0
sigma 123.635133897NaN 64.1241591283NaN NaN
zeros 0 428 0 9949 19880
missing0 0 0 0 0
0 15887.0 1 Ave & E 15 St 47.0 6 Sun
1 15887.0 1 Ave & E 18 St 40.0 6 Sun
2 15887.0 1 Ave & E 30 St 42.0 6 Sun
3 15887.0 1 Ave & E 44 St 12.0 6 Sun
4 15887.0 10 Ave & W 28 St 28.0 6 Sun
5 15887.0 11 Ave & W 27 St 30.0 6 Sun
6 15887.0 11 Ave & W 41 St 20.0 6 Sun
7 15887.0 12 Ave & W 40 St 13.0 6 Sun
8 15887.0 2 Ave & E 31 St 33.0 6 Sun
9 15887.0 2 Ave & E 58 St 54.0 6 Sun

In [9]:
# ----------
# 3- Fit a model on train; using test as validation

# Function for doing class test/train/holdout split
def split_fit_predict(data):
  global gbm0,drf0,glm0,dl0
  # Classic Test/Train split
  r = data['Days'].runif()   # Random UNIForm numbers, one per row
  train = data[  r  < 0.6]
  test  = data[(0.6 <= r) & (r < 0.9)]
  hold  = data[ 0.9 <= r ]
  print("Training data has",train.ncol,"columns and",train.nrow,"rows, test has",test.nrow,"rows, holdout has",hold.nrow)
  bike_names_x = data.names
  bike_names_x.remove("bikes")
  
  # Run GBM
  s = time.time()
  
  gbm0 = H2OGradientBoostingEstimator(ntrees=500, # 500 works well
                                      max_depth=6,
                                      learn_rate=0.1)
    

  gbm0.train(x               =bike_names_x,
             y               ="bikes",
             training_frame  =train,
             validation_frame=test)

  gbm_elapsed = time.time() - s

  # Run DRF
  s = time.time()
    
  drf0 = H2ORandomForestEstimator(ntrees=250, max_depth=30)

  drf0.train(x               =bike_names_x,
             y               ="bikes",
             training_frame  =train,
             validation_frame=test)
    
  drf_elapsed = time.time() - s 
    
    
  # Run GLM
  if "WC1" in bike_names_x: bike_names_x.remove("WC1")
  s = time.time()

  glm0 = H2OGeneralizedLinearEstimator(Lambda=[1e-5], family="poisson")
    
  glm0.train(x               =bike_names_x,
             y               ="bikes",
             training_frame  =train,
             validation_frame=test)

  glm_elapsed = time.time() - s
  
  # Run DL
  s = time.time()

  dl0 = H2ODeepLearningEstimator(hidden=[50,50,50,50], epochs=50)
    
  dl0.train(x               =bike_names_x,
            y               ="bikes",
            training_frame  =train,
            validation_frame=test)
    
  dl_elapsed = time.time() - s
  
  # ----------
  # 4- Score on holdout set & report
  train_r2_gbm = gbm0.model_performance(train).r2()
  test_r2_gbm  = gbm0.model_performance(test ).r2()
  hold_r2_gbm  = gbm0.model_performance(hold ).r2()
#   print "GBM R2 TRAIN=",train_r2_gbm,", R2 TEST=",test_r2_gbm,", R2 HOLDOUT=",hold_r2_gbm
  
  train_r2_drf = drf0.model_performance(train).r2()
  test_r2_drf  = drf0.model_performance(test ).r2()
  hold_r2_drf  = drf0.model_performance(hold ).r2()
#   print "DRF R2 TRAIN=",train_r2_drf,", R2 TEST=",test_r2_drf,", R2 HOLDOUT=",hold_r2_drf
  
  train_r2_glm = glm0.model_performance(train).r2()
  test_r2_glm  = glm0.model_performance(test ).r2()
  hold_r2_glm  = glm0.model_performance(hold ).r2()
#   print "GLM R2 TRAIN=",train_r2_glm,", R2 TEST=",test_r2_glm,", R2 HOLDOUT=",hold_r2_glm
    
  train_r2_dl = dl0.model_performance(train).r2()
  test_r2_dl  = dl0.model_performance(test ).r2()
  hold_r2_dl  = dl0.model_performance(hold ).r2()
#   print " DL R2 TRAIN=",train_r2_dl,", R2 TEST=",test_r2_dl,", R2 HOLDOUT=",hold_r2_dl
    
  # make a pretty HTML table printout of the results

  header = ["Model", "R2 TRAIN", "R2 TEST", "R2 HOLDOUT", "Model Training Time (s)"]
  table  = [
            ["GBM", train_r2_gbm, test_r2_gbm, hold_r2_gbm, round(gbm_elapsed,3)],
            ["DRF", train_r2_drf, test_r2_drf, hold_r2_drf, round(drf_elapsed,3)],
            ["GLM", train_r2_glm, test_r2_glm, hold_r2_glm, round(glm_elapsed,3)],
            ["DL ", train_r2_dl,  test_r2_dl,  hold_r2_dl , round(dl_elapsed,3) ],
           ]
  h2o.display.H2ODisplay(table,header)
  # --------------

In [10]:
# Split the data (into test & train), fit some models and predict on the holdout data
split_fit_predict(bpd)
# Here we see an r^2 of 0.91 for GBM, and 0.71 for GLM.  This means given just
# the station, the month, and the day-of-week we can predict 90% of the
# variance of the bike-trip-starts.


Training data has 5 columns and 83820 rows, test has 41603 rows, holdout has 13838

gbm Model Build Progress: [##################################################] 100%

drf Model Build Progress: [##################################################] 100%

glm Model Build Progress: [##################################################] 100%

deeplearning Model Build Progress: [##################################################] 100%
Model R2 TRAIN R2 TEST R2 HOLDOUT Model Training Time (s)
GBM 0.9687211 0.9200211 0.9226079 19.092
DRF 0.8570970 0.8269861 0.8256906 20.128
GLM 0.7869223 0.7810472 0.7824196 0.699
DL 0.8783443 0.8570735 0.8638957 65.437

In [11]:
# ----------
# 5- Now lets add some weather
# Load weather data
wthr1 = h2o.import_file(path=[mylocate("bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2013.csv"),
                               mylocate("bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2014.csv")])
# Peek at the data
wthr1.describe()


Parse Progress: [##################################################] 100%
Rows:17,520 Cols:50

Chunk compression summary:
chunk_type chunk_name count count_percentage size size_percentage
C0L Constant Integers 107 6.294118 8.4 KB 0.7889721
C0D Constant Reals 436 25.647058 34.1 KB 3.2148771
CXI Sparse Integers 17 1.0 1.5 KB 0.1399135
C1 1-Byte Integers 346 20.352942 197.4 KB 18.634672
C1N 1-Byte Integers (w/o NAs) 214 12.588236 122.3 KB 11.544063
C1S 1-Byte Fractions 214 12.588236 125.3 KB 11.822968
C2S 2-Byte Fractions 196 11.529412 214.5 KB 20.242111
C4S 4-Byte Fractions 170 10.0 356.1 KB 33.612423
Frame distribution summary:
size number_of_rows number_of_chunks_per_column number_of_chunks
172.16.2.61:54321 1.0 MB 17520.0 34.0 1700.0
mean 1.0 MB 17520.0 34.0 1700.0
min 1.0 MB 17520.0 34.0 1700.0
max 1.0 MB 17520.0 34.0 1700.0
stddev 0 B 0.0 0.0 0.0
total 1.0 MB 17520.0 34.0 1700.0

Year Local Month Local Day Local Hour Local Year UTC Month UTC Day UTC Hour UTC Cavok Reported Cloud Ceiling (m) Cloud Cover Fraction Cloud Cover Fraction 1 Cloud Cover Fraction 2 Cloud Cover Fraction 3 Cloud Cover Fraction 4 Cloud Cover Fraction 5 Cloud Cover Fraction 6 Cloud Height (m) 1 Cloud Height (m) 2 Cloud Height (m) 3 Cloud Height (m) 4 Cloud Height (m) 5 Cloud Height (m) 6 Dew Point (C) Humidity Fraction Precipitation One Hour (mm) Pressure Altimeter (mbar) Pressure Sea Level (mbar) Pressure Station (mbar) Snow Depth (cm) Temperature (C) Visibility (km) Weather Code 1 Weather Code 1/ Description Weather Code 2 Weather Code 2/ Description Weather Code 3 Weather Code 3/ Description Weather Code 4 Weather Code 4/ Description Weather Code 5 Weather Code 5/ Description Weather Code 6 Weather Code 6/ Description Weather Code Most Severe / Icon Code Weather Code Most Severe Weather Code Most Severe / Description Wind Direction (degrees) Wind Gust (m/s) Wind Speed (m/s)
type int int int int int int int int int real real real real real int int int real real real int int int real real real real int int int real real int enum int enum int enum int enum int enum int enum int int enum int real real
mins 2013.0 1.0 1.0 0.0 2013.0 1.0 1.0 0.0 0.0 61.0 0.0 0.0 0.25 0.5 NaN NaN NaN 60.96 213.36 365.76 NaN NaN NaN -26.7 0.1251 0.0 983.2949 NaN NaN NaN -15.6 0.001 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 3.0 0.0 0.0 1.0 0.0 10.0 7.2 0.0
mean 2013.5 6.5260273972615.720547945211.5 2013.50057078 6.5251141552515.721347032 11.50011415530.0 1306.31195846 0.416742490522 0.361207349081 0.872445384073 0.963045685279 0.0 0.0 0.0 1293.9822682 1643.73900166 2084.89386376 0.0 0.0 0.0 4.31304646766 0.596736389159 1.37993010753 1017.82581441 0.0 0.0 0.0 12.5789090701 14.3914429682 4.84251968504 NaN 3.65867689358 NaN 2.84660766962 NaN 2.01149425287 NaN 4.125 NaN 3.0 0.0 1.37848173516 4.84251968504 NaN 194.69525682 9.42216948073 2.41032887849
maxs 2014.0 12.0 31.0 23.0 2015.0 12.0 31.0 23.0 0.0 3657.6 1.0 1.0 1.0 1.0 NaN NaN NaN 3657.5999 3657.5999 3657.5999 NaN NaN NaN 24.4 1.0 26.924 1042.2113 NaN NaN NaN 36.1 16.0934 60.0 11.0 60.0 10.0 36.0 7.0 27.0 4.0 27.0 2.0 3.0 0.0 16.0 60.0 11.0 360.0 20.58 10.8
sigma 0.5000142700173.447949723858.796498048526.922384111880.5005844117163.447824054588.795614888686.922301652030.0 995.339856966 0.462720830993 0.42770569708 0.197155690367 0.0861015598104 -0.0 -0.0 -0.0 962.743095854 916.73861349 887.215847511 -0.0 -0.0 -0.0 10.9731282097 0.185792011866 2.56215129179 7.46451697179 -0.0 -0.0 -0.0 10.0396739531 3.69893623033 5.70486576983 NaN 6.13386253912 NaN 5.80553286364 NaN 3.12340844261 NaN 6.15223536611 NaN 0.0 0.0 4.07386062702 5.70486576983 NaN 106.350000031 1.81511871115 1.61469790524
zeros 0 0 0 730 0 0 0 730 17455 0 8758 8758 0 0 -17520 -17520 -17520 0 0 0 -17520 -17520 -17520 268 0 501 0 -17520 -17520 -17520 269 0 0 17 0 30 0 13 -5044 -5024 -11241 -11229 -17030 -17028 14980 0 17 0 0 2768
missing0 0 0 0 0 0 0 0 65 10780 375 375 14682 16535 17520 17520 17520 9103 14683 16535 17520 17520 17520 67 67 15660 360 17520 17520 17520 67 412 14980 14980 16477 16477 17181 17181 17433 17433 17504 17504 17518 17518 0 14980 14980 9382 14381 1283
0 2013.0 1.0 1.0 0.0 2013.0 1.0 1.0 5.0 0.0 2895.6 1.0 0.9 1.0 nan nan nan nan 2895.5999 3352.8 nan nan nan nan -5.0 0.5447 nan 1013.0917 nan nan nan 3.3 16.0934 nan nan nan nan nan nan 0.0 nan nan nan 2.57
1 2013.0 1.0 1.0 1.0 2013.0 1.0 1.0 6.0 0.0 3048.0 1.0 1.0 nan nan nan nan nan 3048.0 nan nan nan nan nan -4.4 0.5463 nan 1012.0759 nan nan nan 3.9 16.0934 nan nan nan nan nan nan 0.0 nan 260.0 9.77 4.63
2 2013.0 1.0 1.0 2.0 2013.0 1.0 1.0 7.0 0.0 1828.8 1.0 1.0 nan nan nan nan nan 1828.7999 nan nan nan nan nan -3.3 0.619 nan 1012.4145 nan nan nan 3.3 16.0934 nan nan nan nan nan nan 0.0 nan nan 7.72 1.54
3 2013.0 1.0 1.0 3.0 2013.0 1.0 1.0 8.0 0.0 1463.0 1.0 1.0 nan nan nan nan nan 1463.04 nan nan nan nan nan -2.8 0.6159 nan 1012.4145 nan nan nan 3.9 16.0934 nan nan nan nan nan nan 0.0 nan nan nan 3.09
4 2013.0 1.0 1.0 4.0 2013.0 1.0 1.0 9.0 0.0 1402.1 1.0 1.0 nan nan nan nan nan 1402.08 nan nan nan nan nan -2.8 0.6159 nan 1012.7531 nan nan nan 3.9 16.0934 nan nan nan nan nan nan 0.0 nan 260.0 nan 4.12
5 2013.0 1.0 1.0 5.0 2013.0 1.0 1.0 10.0 0.0 1524.0 1.0 1.0 nan nan nan nan nan 1524.0 nan nan nan nan nan -2.8 0.6159 nan 1012.4145 nan nan nan 3.9 16.0934 nan nan nan nan nan nan 0.0 nan nan nan 3.09
6 2013.0 1.0 1.0 6.0 2013.0 1.0 1.0 11.0 0.0 1524.0 1.0 1.0 nan nan nan nan nan 1524.0 nan nan nan nan nan -3.3 0.5934 nan 1012.0759 nan nan nan 3.9 16.0934 nan nan nan nan nan nan 0.0 nan nan 9.26 3.09
7 2013.0 1.0 1.0 7.0 2013.0 1.0 1.0 12.0 0.0 1524.0 1.0 1.0 nan nan nan nan nan 1524.0 nan nan nan nan nan -3.3 0.5934 nan 1012.4145 nan nan nan 3.9 16.0934 nan nan nan nan nan nan 0.0 nan 260.0 9.26 4.63
8 2013.0 1.0 1.0 8.0 2013.0 1.0 1.0 13.0 0.0 1524.0 1.0 1.0 nan nan nan nan nan 1524.0 nan nan nan nan nan -2.8 0.6425 nan 1012.4145 nan nan nan 3.3 16.0934 nan nan nan nan nan nan 0.0 nan 260.0 nan 3.09
9 2013.0 1.0 1.0 9.0 2013.0 1.0 1.0 14.0 0.0 1524.0 1.0 0.9 1.0 nan nan nan nan 1524.0 3657.5999 nan nan nan nan -2.8 0.6159 nan 1012.4145 nan nan nan 3.9 16.0934 nan nan nan nan nan nan 0.0 nan nan 9.26 3.09

In [12]:
# Lots of columns in there!  Lets plan on converting to time-since-epoch to do
# a 'join' with the bike data, plus gather weather info that might affect
# cyclists - rain, snow, temperature.  Alas, drop the "snow" column since it's
# all NA's.  Also add in dew point and humidity just in case.  Slice out just
# the columns of interest and drop the rest.
wthr2 = wthr1[["Year Local","Month Local","Day Local","Hour Local","Dew Point (C)","Humidity Fraction","Precipitation One Hour (mm)","Temperature (C)","Weather Code 1/ Description"]]

wthr2.set_name(wthr2.names.index("Precipitation One Hour (mm)"), "Rain (mm)")
wthr2.set_name(wthr2.names.index("Weather Code 1/ Description"), "WC1")
wthr2.describe()
# Much better!


Rows:17,520 Cols:9

Chunk compression summary:
chunk_type chunk_name count count_percentage size size_percentage
C0L Constant Integers 46 15.0326805 3.6 KB 1.780005
C1 1-Byte Integers 34 11.111112 19.4 KB 9.592678
C1N 1-Byte Integers (w/o NAs) 90 29.411766 51.5 KB 25.494701
C1S 1-Byte Fractions 42 13.725491 24.0 KB 11.894592
C2S 2-Byte Fractions 94 30.718956 103.4 KB 51.238026
Frame distribution summary:
size number_of_rows number_of_chunks_per_column number_of_chunks
172.16.2.61:54321 201.9 KB 17520.0 34.0 306.0
mean 201.9 KB 17520.0 34.0 306.0
min 201.9 KB 17520.0 34.0 306.0
max 201.9 KB 17520.0 34.0 306.0
stddev 0 B 0.0 0.0 0.0
total 201.9 KB 17520.0 34.0 306.0

Year Local Month Local Day Local Hour Local Dew Point (C) Humidity Fraction Rain (mm) Temperature (C) WC1
type int int int int real real real real enum
mins 2013.0 1.0 1.0 0.0 -26.7 0.1251 0.0 -15.6 0.0
mean 2013.5 6.5260273972615.720547945211.5 4.31304646766 0.596736389159 1.3799301075312.5789090701 NaN
maxs 2014.0 12.0 31.0 23.0 24.4 1.0 26.924 36.1 11.0
sigma 0.5000142700173.447949723858.796498048526.9223841118810.9731282097 0.185792011866 2.5621512917910.0396739531 NaN
zeros 0 0 0 730 268 0 501 269 17
missing0 0 0 0 67 67 15660 67 14980
0 2013.0 1.0 1.0 0.0 -5.0 0.5447 nan 3.3
1 2013.0 1.0 1.0 1.0 -4.4 0.5463 nan 3.9
2 2013.0 1.0 1.0 2.0 -3.3 0.619 nan 3.3
3 2013.0 1.0 1.0 3.0 -2.8 0.6159 nan 3.9
4 2013.0 1.0 1.0 4.0 -2.8 0.6159 nan 3.9
5 2013.0 1.0 1.0 5.0 -2.8 0.6159 nan 3.9
6 2013.0 1.0 1.0 6.0 -3.3 0.5934 nan 3.9
7 2013.0 1.0 1.0 7.0 -3.3 0.5934 nan 3.9
8 2013.0 1.0 1.0 8.0 -2.8 0.6425 nan 3.3
9 2013.0 1.0 1.0 9.0 -2.8 0.6159 nan 3.9

In [13]:
# Filter down to the weather at Noon
wthr3 = wthr2[ wthr2["Hour Local"]==12 ]

In [14]:
# Lets now get Days since the epoch... we'll convert year/month/day into Epoch
# time, and then back to Epoch days.  Need zero-based month and days, but have
# 1-based.
wthr3["msec"] = h2o.H2OFrame.mktime(year=wthr3["Year Local"], month=wthr3["Month Local"]-1, day=wthr3["Day Local"]-1, hour=wthr3["Hour Local"])
secsPerDay=1000*60*60*24
wthr3["Days"] = (wthr3["msec"]/secsPerDay).floor()
wthr3.describe()
# msec looks sane (numbers like 1.3e12 are in the correct range for msec since
# 1970).  Epoch Days matches closely with the epoch day numbers from the
# CitiBike dataset.


Rows:730 Cols:11

Chunk compression summary:
chunk_type chunk_name count count_percentage size size_percentage
C0L Constant Integers 80 21.390373 6.3 KB 12.498779
C0D Constant Reals 13 3.4759357 1.0 KB 2.0310516
C1 1-Byte Integers 30 8.021391 2.6 KB 5.2455816
C1N 1-Byte Integers (w/o NAs) 56 14.973262 4.9 KB 9.801778
C1S 1-Byte Fractions 34 9.090909 3.5 KB 7.0032225
C2S 2-Byte Fractions 34 9.090909 4.2 KB 8.4288645
CUD Unique Reals 25 6.6844916 3.6 KB 7.2297626
C8D 64-bit Reals 102 27.272728 23.9 KB 47.76096
Frame distribution summary:
size number_of_rows number_of_chunks_per_column number_of_chunks
172.16.2.61:54321 50.0 KB 730.0 34.0 374.0
mean 50.0 KB 730.0 34.0 374.0
min 50.0 KB 730.0 34.0 374.0
max 50.0 KB 730.0 34.0 374.0
stddev 0 B 0.0 0.0 0.0
total 50.0 KB 730.0 34.0 374.0

Year Local Month Local Day Local Hour Local Dew Point (C) Humidity Fraction Rain (mm) Temperature (C) WC1 msec Days
type int int int int real real real real enum int int
mins 2013.0 1.0 1.0 12.0 -26.7 0.1723 0.0 -13.9 0.0 1.3570704e+12 15706.0
mean 2013.5 6.5260273972615.720547945212.0 4.23012379642 0.539728198074 1.5312571428614.0687757909 NaN 1.3885608526e+1216070.5
maxs 2014.0 12.0 31.0 12.0 23.3 1.0 12.446 34.4 10.0 1.420056e+12 16435.0
sigma 0.5003428180043.450215293078.802278027010.0 11.1062964725 0.179945027923 2.3606424861510.3989855149 NaN 18219740080.4 210.877136425
zeros 0 0 0 0 14 0 -174 7 -83 0 0
missing0 0 0 0 3 3 660 3 620 0 0
0 2013.0 1.0 1.0 12.0 -3.3 0.5934 nan 3.9 1.3570704e+12 15706.0
1 2013.0 1.0 2.0 12.0 -11.7 0.4806 nan -2.2 1.3571568e+12 15707.0
2 2013.0 1.0 3.0 12.0 -10.6 0.5248 nan -2.2 1.3572432e+12 15708.0
3 2013.0 1.0 4.0 12.0 -7.2 0.4976 nan 2.2 1.3573296e+12 15709.0
4 2013.0 1.0 5.0 12.0 -7.2 0.426 nan 4.4 1.357416e+12 15710.0
5 2013.0 1.0 6.0 12.0 -1.7 0.6451 nan 4.4 haze 1.3575024e+12 15711.0
6 2013.0 1.0 7.0 12.0 -6.1 0.4119 nan 6.1 1.3575888e+12 15712.0
7 2013.0 1.0 8.0 12.0 -1.7 0.5314 nan 7.2 1.3576752e+12 15713.0
8 2013.0 1.0 9.0 12.0 0.6 0.56 nan 8.9 haze 1.3577616e+12 15714.0
9 2013.0 1.0 10.0 12.0 -6.1 0.3952 nan 6.7 1.357848e+12 15715.0

In [15]:
# Lets drop off the extra time columns to make a easy-to-handle dataset.
wthr4 = wthr3.drop("Year Local").drop("Month Local").drop("Day Local").drop("Hour Local").drop("msec")

In [16]:
# Also, most rain numbers are missing - lets assume those are zero rain days
rain = wthr4["Rain (mm)"]
rain[ rain.isna() ] = 0
wthr4["Rain (mm)"] = rain

In [17]:
# ----------
# 6 - Join the weather data-per-day to the bike-starts-per-day
print("Merge Daily Weather with Bikes-Per-Day")
bpd_with_weather = bpd.merge(wthr4,all_x=True,all_y=False)
bpd_with_weather.describe()
bpd_with_weather.show()


Merge Daily Weather with Bikes-Per-Day
Rows:139,261 Cols:10

Chunk compression summary:
chunk_type chunk_name count count_percentage size size_percentage
C0L Constant Integers 50 15.625 3.9 KB 0.2143667
C0D Constant Reals 6 1.8750001 480 B 0.0257240
CBS Bits 1 0.3125 614 B 0.0329053
C1 1-Byte Integers 26 8.125 112.2 KB 6.1573634
C1N 1-Byte Integers (w/o NAs) 47 14.687499 202.9 KB 11.132173
C1S 1-Byte Fractions 32 10.0 138.6 KB 7.6072865
C2 2-Byte Integers 62 19.375 531.1 KB 29.146162
CUD Unique Reals 96 30.000002 832.5 KB 45.684017
Frame distribution summary:
size number_of_rows number_of_chunks_per_column number_of_chunks
172.16.2.61:54321 1.8 MB 139261.0 32.0 320.0
mean 1.8 MB 139261.0 32.0 320.0
min 1.8 MB 139261.0 32.0 320.0
max 1.8 MB 139261.0 32.0 320.0
stddev 0 B 0.0 0.0 0.0
total 1.8 MB 139261.0 32.0 320.0

Days start station name bikes Month DayOfWeek Humidity Fraction Rain (mm) Temperature (C) WC1 Dew Point (C)
type int enum int enum enum real int real enum real
mins 15887.0 0.0 1.0 0.0 0.0 0.1723 0.0 -13.9 0.0 -26.7
mean 16099.9758008NaN 74.7329833909NaN NaN 0.532494425803 0.0 15.6334205959 NaN 5.47825137402
maxs 16314.0 339.0 680.0 11.0 6.0 1.0 0.0 34.4 10.0 23.3
sigma 123.635133897NaN 64.1241591283NaN NaN 0.178408938664 0.0 10.9454511961 NaN 11.7308194576
zeros 0 428 0 9949 19880 0 139261 1598 -258111954
missing0 0 0 0 0 981 0 981 119130981
0 15887.0 1 Ave & E 15 St 47.0 6 Sun 0.9354 0.0 22.8 rain 21.7
1 15887.0 1 Ave & E 18 St 40.0 6 Sun 0.9354 0.0 22.8 rain 21.7
2 15887.0 1 Ave & E 30 St 42.0 6 Sun 0.9354 0.0 22.8 rain 21.7
3 15887.0 1 Ave & E 44 St 12.0 6 Sun 0.9354 0.0 22.8 rain 21.7
4 15887.0 10 Ave & W 28 St 28.0 6 Sun 0.9354 0.0 22.8 rain 21.7
5 15887.0 11 Ave & W 27 St 30.0 6 Sun 0.9354 0.0 22.8 rain 21.7
6 15887.0 11 Ave & W 41 St 20.0 6 Sun 0.9354 0.0 22.8 rain 21.7
7 15887.0 12 Ave & W 40 St 13.0 6 Sun 0.9354 0.0 22.8 rain 21.7
8 15887.0 2 Ave & E 31 St 33.0 6 Sun 0.9354 0.0 22.8 rain 21.7
9 15887.0 2 Ave & E 58 St 54.0 6 Sun 0.9354 0.0 22.8 rain 21.7
Daysstart station name bikes MonthDayOfWeek Humidity Fraction Rain (mm) Temperature (C)WC1 Dew Point (C)
158871 Ave & E 15 St 47 6Sun 0.9354 0 22.8rain 21.7
158871 Ave & E 18 St 40 6Sun 0.9354 0 22.8rain 21.7
158871 Ave & E 30 St 42 6Sun 0.9354 0 22.8rain 21.7
158871 Ave & E 44 St 12 6Sun 0.9354 0 22.8rain 21.7
1588710 Ave & W 28 St 28 6Sun 0.9354 0 22.8rain 21.7
1588711 Ave & W 27 St 30 6Sun 0.9354 0 22.8rain 21.7
1588711 Ave & W 41 St 20 6Sun 0.9354 0 22.8rain 21.7
1588712 Ave & W 40 St 13 6Sun 0.9354 0 22.8rain 21.7
158872 Ave & E 31 St 33 6Sun 0.9354 0 22.8rain 21.7
158872 Ave & E 58 St 54 6Sun 0.9354 0 22.8rain 21.7

In [18]:
# 7 - Test/Train split again, model build again, this time with weather
split_fit_predict(bpd_with_weather)


Training data has 10 columns and 83723 rows, test has 41517 rows, holdout has 14021

gbm Model Build Progress: [##################################################] 100%

drf Model Build Progress: [##################################################] 100%

glm Model Build Progress: [##################################################] 100%

deeplearning Model Build Progress: [##################################################] 100%
Model R2 TRAIN R2 TEST R2 HOLDOUT Model Training Time (s)
GBM 0.9673728 0.9259863 0.9275258 22.512
DRF 0.9004752 0.8439568 0.8420079 46.674
GLM 0.8013288 0.7958744 0.8013114 0.343
DL 0.9301786 0.9067311 0.9075087 71.67