In [1]:
import h2o

In [2]:
# Explore a typical Data Science workflow with H2O and Python
#
# Goal: assist the manager of CitiBike of NYC to load-balance the bicycles
# across the CitiBike network of stations, by predicting the number of bike
# trips taken from the station every day.  Use 10 million rows of historical
# data, and eventually add weather data.


# Connect to a cluster
h2o.init()


--------------------------  ---------------------------
H2O cluster uptime:         25 seconds 458 milliseconds
H2O cluster version:        0.3.0.99999
H2O cluster name:           cliffc
H2O cluster total nodes:    1
H2O cluster total memory:   7.67 GB
H2O cluster total cores:    16
H2O cluster allowed cores:  16
H2O cluster healthy:        True
--------------------------  ---------------------------


In [3]:
# Pick either the big or the small demo.
# Big data is 10M rows
small_test = ["bigdata/laptop/citibike-nyc/2013-10.csv"]
big_test =   ["bigdata/laptop/citibike-nyc/2013-07.csv",
              "bigdata/laptop/citibike-nyc/2013-08.csv",
              "bigdata/laptop/citibike-nyc/2013-09.csv",
              "bigdata/laptop/citibike-nyc/2013-10.csv",
              "bigdata/laptop/citibike-nyc/2013-11.csv",
              "bigdata/laptop/citibike-nyc/2013-12.csv",
              "bigdata/laptop/citibike-nyc/2014-01.csv",
              "bigdata/laptop/citibike-nyc/2014-02.csv",
              "bigdata/laptop/citibike-nyc/2014-03.csv",
              "bigdata/laptop/citibike-nyc/2014-04.csv",
              "bigdata/laptop/citibike-nyc/2014-05.csv",
              "bigdata/laptop/citibike-nyc/2014-06.csv",
              "bigdata/laptop/citibike-nyc/2014-07.csv",
              "bigdata/laptop/citibike-nyc/2014-08.csv"]

# ----------

# 1- Load data - 1 row per bicycle trip.  Has columns showing the start and end
# station, trip duration and trip start time and day.  The larger dataset
# totals about 10 million rows
print "Import and Parse bike data"
data = h2o.import_frame(path=big_test)


Import and Parse bike data

Parse Progress: [##################################################] 100%
Veckeys [{u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff01000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\2013-07.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff02000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\2013-07.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff03000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\2013-07.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff04000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\2013-07.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff05000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\2013-07.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff06000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\2013-07.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff07000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\2013-07.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff08000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\2013-07.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff09000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\2013-07.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff0a000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\2013-07.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff0b000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\2013-07.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff0c000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\2013-07.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff0d000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\2013-07.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff0e000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\2013-07.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff0f000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\2013-07.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}]
Imported ['bigdata/laptop/citibike-nyc/2013-07.csv', 'bigdata/laptop/citibike-nyc/2013-08.csv', 'bigdata/laptop/citibike-nyc/2013-09.csv', 'bigdata/laptop/citibike-nyc/2013-10.csv', 'bigdata/laptop/citibike-nyc/2013-11.csv', 'bigdata/laptop/citibike-nyc/2013-12.csv', 'bigdata/laptop/citibike-nyc/2014-01.csv', 'bigdata/laptop/citibike-nyc/2014-02.csv', 'bigdata/laptop/citibike-nyc/2014-03.csv', 'bigdata/laptop/citibike-nyc/2014-04.csv', 'bigdata/laptop/citibike-nyc/2014-05.csv', 'bigdata/laptop/citibike-nyc/2014-06.csv', 'bigdata/laptop/citibike-nyc/2014-07.csv', 'bigdata/laptop/citibike-nyc/2014-08.csv'] into cluster with 10407546 rows and 15 cols

In [4]:
# ----------

# 2- light data munging: group the bike starts per-day, converting the 10M rows
# of trips to about 140,000 station&day combos - predicting the number of trip
# starts per-station-per-day.

# Convert start time to: Day since the Epoch
startime = data["starttime"]
secsPerDay=1000*60*60*24
data["Days"] = (startime/secsPerDay).floor()
data.describe()


Rows: 10407546 Cols: 16
               tripduration    starttime          stoptime           start station id    start station name    start station latitude    start station longitude    end station id    end station name    end station latitude    end station longitude    bikeid         usertype        birth year     gender          Days
-------------  --------------  -----------------  -----------------  ------------------  --------------------  ------------------------  -------------------------  ----------------  ------------------  ----------------------  -----------------------  -------------  --------------  -------------  --------------  -------------
type           int             time               time               int                 enum                  real                      real                       int               enum                real                    real                     int            enum            int            int             int
mins           60.0            1.372662e+12       1.372662242e+12    72.0                0.0                   40.680342423              -74.01713445               72.0              0.0                 40.680342423            -74.01713445             14529.0        0.0             1899.0         0.0             15887.0
mean           868.968726057   1.39099985823e+12  1.39100072718e+12  444.858835407       168.221531954         40.7343819823             -73.9910570182             445.259785544     169.27576741        40.7340868895           -73.9911707799           17895.6618358  0.880131781306  1975.79892394  1.08414654137   16099.0245177
maxs           6250750.0       1.409554787e+12    1.409563605e+12    3002.0              339.0                 40.771522                 -73.9500479759             3002.0            339.0               40.771522               -73.9500479759           21689.0        1.0             1998.0         2.0             16314.0
sigma          2985.10540532   11806578171.7      11806555707.8      355.755989765       103.210304227         0.0197100508736           0.0123453320185            360.070380844     103.205091206       0.0197309578633         0.0124311861598          1938.80517884  0.324807387506  11.132784905   0.563019777794  136.647269305
zero_count     0               0                  0                  0                   56836                 0                         0                          0                 55167               0                       0                        0              1247534         0              1248517         0
missing_count  0               0                  0                  0                   0                     0                         0                          0                 0                   0                       0                        0              0               1247644        0               0


Internal FluidVec compression/distribution summary:

    Chunk Type    Count    Count Percentage    Size      Size Percentage
--  ------------  -------  ------------------  --------  -----------------
    C0L           117      1.52981             9.1 KB    0.00148886
    CBS           478      6.25                1.3 MB    0.212291
    C1N           478      6.25                10.0 MB   1.66066
    C1S           839      10.9702             17.5 MB   2.92209
    C2            2616     34.205              108.8 MB  18.1451
    C2S           314      4.10565             12.9 MB   2.15722
    C4            214      2.79812             17.9 MB   2.97811
    C8            680      8.89121             113.5 MB  18.9268
    C8D           1912     25                  317.7 MB  52.9962




In [5]:
# Now do a monster Group-By.  Count bike starts per-station per-day.  Ends up
# with about 340 stations times 400 days (140,000 rows).  This is what we want
# to predict.
ddplycols=["Days","start station name"]
bpd = h2o.ddply(data[ddplycols],ddplycols,"(%nrow)")  # Compute bikes-per-day
bpd["C1"]._name = "bikes" # Rename column from generic name

In [6]:
# Quantiles: the data is fairly unbalanced; some station/day combos are wildly
# more popular than others.
print "Quantiles of bikes-per-day"
bpd["bikes"].quantile().show()


Quantiles of bikes-per-day
  Row ID    bikes (first 9 row(s))
--------  ------------------------
       1                         2
       2                        11
       3                        26
       4                        35
       5                        58
       6                        89
       7                       107
       8                       157
       9                       291


In [7]:
# A little feature engineering
# Add in month-of-year (seasonality; fewer bike rides in winter than summer)
secs = bpd["Days"]*secsPerDay
bpd["Month"]     = secs.month()
# Add in day-of-week (work-week; more bike rides on Sunday than Monday)
bpd["DayOfWeek"] = secs.dayOfWeek()
print "Bikes-Per-Day"
bpd.describe()


Bikes-Per-Day
Rows: 139261 Cols: 5
               Days           start station name    bikes          Month          DayOfWeek
-------------  -------------  --------------------  -------------  -------------  -------------
type           int            enum                  int            enum           enum
mins           15887.0        0.0                   1.0            0.0            0.0
mean           16099.9758008  169.640760873         74.7341035897  5.68177738204  3.0059384896
maxs           16314.0        339.0                 680.0          11.0           6.0
sigma          123.635133897  98.50295732           64.1243887565  3.20373100216  2.00302100015
zero_count     0              428                   0              9949           19880
missing_count  0              0                     0              0              0


Internal FluidVec compression/distribution summary:

    Chunk Type    Count    Count Percentage    Size      Size Percentage
--  ------------  -------  ------------------  --------  -----------------
    C1N           128      40                  280.5 KB  25.2874
    C2            192      60                  828.7 KB  74.7126




In [8]:
# ----------
# 3- Fit a model on train; using test as validation

# Function for doing class test/train/holdout split
def split_fit_predict(data):
  # Classic Test/Train split
  r = data['Days'].runif()   # Random UNIForm numbers, one per row
  train = data[  r  < 0.6]
  test  = data[(0.6 <= r) & (r < 0.9)]
  hold  = data[ 0.9 <= r ]
  print "Training data has",train.ncol(),"columns and",train.nrow(),"rows, test has",test.nrow(),"rows, holdout has",hold.nrow()
  
  # Run GBM
  gbm = h2o.gbm(x           =train.drop("bikes"),
                y           =train     ["bikes"],
                validation_x=test .drop("bikes"),
                validation_y=test      ["bikes"],
                ntrees=500, # 500 works well
                max_depth=6,
                learn_rate=0.1)

  # Run DRF
  drf = h2o.random_forest(x =train.drop("bikes"),
                y           =train     ["bikes"],
                validation_x=test .drop("bikes"),
                validation_y=test      ["bikes"],
                ntrees=500, # 500 works well
                max_depth=50)

  # Run GLM
  glm = h2o.glm(x           =train.drop("bikes"),
                y           =train     ["bikes"],
                validation_x=test .drop("bikes"),
                validation_y=test      ["bikes"],
                dropNA20Cols=True)
  #glm.show()
  
  
  # ----------
  # 4- Score on holdout set & report
  train_r2_gbm = gbm.model_performance(train).r2()
  test_r2_gbm  = gbm.model_performance(test ).r2()
  hold_r2_gbm  = gbm.model_performance(hold ).r2()
  print "GBM R2 TRAIN=",train_r2_gbm,", R2 TEST=",test_r2_gbm,", R2 HOLDOUT=",hold_r2_gbm
  
  train_r2_drf = drf.model_performance(train).r2()
  test_r2_drf  = drf.model_performance(test ).r2()
  hold_r2_drf  = drf.model_performance(hold ).r2()
  print "DRF R2 TRAIN=",train_r2_drf,", R2 TEST=",test_r2_drf,", R2 HOLDOUT=",hold_r2_drf
  
  train_r2_glm = glm.model_performance(train).r2()
  test_r2_glm  = glm.model_performance(test ).r2()
  hold_r2_glm  = glm.model_performance(hold ).r2()
  print "GLM R2 TRAIN=",train_r2_glm,", R2 TEST=",test_r2_glm,", R2 HOLDOUT=",hold_r2_glm
  # --------------

In [9]:
# Split the data (into test & train), fit some models and predict on the holdout data
split_fit_predict(bpd)
# Here we see an r^2 of 0.91 for GBM, and 0.71 for GLM.  This means given just
# the station, the month, and the day-of-week we can predict 90% of the
# variance of the bike-trip-starts.


Training data has 5 columns and 83795 rows, test has 41643 rows, holdout has 13823

gbm Model Build Progress: [##################################################] 100%

drf Model Build Progress: [##################################################] 100%

glm Model Build Progress: [##################################################] 100%
GBM R2 TRAIN= 0.96601042827 , R2 TEST= 0.922711534678 , R2 HOLDOUT= 0.918842405638
DRF R2 TRAIN= 0.801796515639 , R2 TEST= 0.7919404403 , R2 HOLDOUT= 0.788198629144
GLM R2 TRAIN= 0.711006232492 , R2 TEST= 0.706796249967 , R2 HOLDOUT= 0.700993160367

In [10]:
# ----------
# 5- Now lets add some weather
# Load weather data
wthr1 = h2o.import_frame(path=["bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2013.csv",
                               "bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2014.csv"])
# Peek at the data
wthr1.describe()


Parse Progress: [##################################################] 100%
Veckeys [{u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff01000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff02000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff03000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff04000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff05000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff06000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff07000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff08000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff09000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff0a000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff0b000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff0c000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff0d000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff0e000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff0f000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff10000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff11000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff12000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff13000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff14000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff15000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff16000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff17000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff18000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff19000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff1a000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff1b000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff1c000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff1d000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff1e000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff1f000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff20000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff21000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff22000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff23000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff24000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff25000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff26000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff27000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff28000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff29000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff2a000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff2b000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff2c000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff2d000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff2e000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff2f000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff30000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff31000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}, {u'URL': None, u'type': u'Key<Vec>', u'name': u'$04ff32000000ffffffff$nfs:\\C:\\Users\\cliffc\\Desktop\\h2o-dev\\bigdata\\laptop\\citibike-nyc\\31081_New_York_City__Hourly_2013.csv', u'__meta': {u'schema_name': u'VecKeyV1', u'schema_version': 1, u'schema_type': u'Key<Vec>'}}]
Imported ['bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2013.csv', 'bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2014.csv'] into cluster with 17520 rows and 50 cols
Rows: 17520 Cols: 50
               Year Local      Month Local    Day Local      Hour Local     Year UTC        Month UTC      Day UTC        Hour UTC       Cavok Reported    Cloud Ceiling (m)    Cloud Cover Fraction    Cloud Cover Fraction 1    Cloud Cover Fraction 2    Cloud Cover Fraction 3    Cloud Cover Fraction 4    Cloud Cover Fraction 5    Cloud Cover Fraction 6    Cloud Height (m) 1    Cloud Height (m) 2    Cloud Height (m) 3    Cloud Height (m) 4    Cloud Height (m) 5    Cloud Height (m) 6    Dew Point (C)    Humidity Fraction    Precipitation One Hour (mm)    Pressure Altimeter (mbar)    Pressure Sea Level (mbar)    Pressure Station (mbar)    Snow Depth (cm)      Temperature (C)    Visibility (km)    Weather Code 1    Weather Code 1/ Description    Weather Code 2    Weather Code 2/ Description    Weather Code 3    Weather Code 3/ Description    Weather Code 4    Weather Code 4/ Description    Weather Code 5    Weather Code 5/ Description    Weather Code 6    Weather Code 6/ Description    Weather Code Most Severe / Icon Code    Weather Code Most Severe    Weather Code Most Severe / Description    Wind Direction (degrees)    Wind Gust (m/s)    Wind Speed (m/s)
-------------  --------------  -------------  -------------  -------------  --------------  -------------  -------------  -------------  ----------------  -------------------  ----------------------  ------------------------  ------------------------  ------------------------  ------------------------  ------------------------  ------------------------  --------------------  --------------------  --------------------  --------------------  --------------------  --------------------  ---------------  -------------------  -----------------------------  ---------------------------  ---------------------------  -------------------------  -------------------  -----------------  -----------------  ----------------  -----------------------------  ----------------  -----------------------------  ----------------  -----------------------------  ----------------  -----------------------------  ----------------  -----------------------------  ----------------  -----------------------------  --------------------------------------  --------------------------  ----------------------------------------  --------------------------  -----------------  ------------------
type           int             int            int            int            int             int            int            int            int               real                 real                    real                      real                      real                      int                       int                       int                       real                  real                  real                  int                   int                   int                   real             real                 real                           real                         int                          int                        int                  real               real               int               enum                           int               enum                           int               enum                           int               enum                           int               enum                           int               int                            int                                     int                         enum                                      int                         real               real
mins           2013.0          1.0            1.0            0.0            2013.0          1.0            1.0            0.0            0.0               61.0                 0.0                     0.0                       0.25                      0.5                       1.79769313486e+308        1.79769313486e+308        1.79769313486e+308        60.96                 213.36                365.76                1.79769313486e+308    1.79769313486e+308    1.79769313486e+308    -26.7            0.1251               0.0                            983.2949                     1.79769313486e+308           1.79769313486e+308         1.79769313486e+308   -15.6              0.001              1.0               0.0                            1.0               0.0                            1.0               0.0                            1.0               0.0                            1.0               0.0                            3.0               1.79769313486e+308             0.0                                     1.0                         0.0                                       10.0                        7.2                0.0
mean           2013.5          6.52602739726  15.7205479452  11.5           2013.50057078   6.52511415525  15.721347032   11.5001141553  0.0               1306.31195846        0.416742490522          0.361207349081            0.872445384073            0.963045685279            0.0                       0.0                       0.0                       1293.9822682          1643.73900166         2084.89386376         0.0                   0.0                   0.0                   4.31304646766    0.596736389159       1.37993010753                  1017.82581441                0.0                          0.0                        0.0                  12.5789090701      14.3914429682      4.84251968504     6.16417322835                  3.65867689358     6.43911792905                  2.84660766962     4.58702064897                  2.01149425287     2.27586206897                  4.125             0.3125                         3.0               0.0                            1.37848173516                           4.84251968504               6.16417322835                             194.69525682                9.42216948073      2.41032887849
maxs           2014.0          12.0           31.0           23.0           2015.0          12.0           31.0           23.0           0.0               3657.6               1.0                     1.0                       1.0                       1.0                       -1.79769313486e+308       -1.79769313486e+308       -1.79769313486e+308       3657.5999             3657.5999             3657.5999             -1.79769313486e+308   -1.79769313486e+308   -1.79769313486e+308   24.4             1.0                  26.924                         1042.2113                    -1.79769313486e+308          -1.79769313486e+308        -1.79769313486e+308  36.1               16.0934            60.0              11.0                           60.0              10.0                           36.0              7.0                            27.0              4.0                            27.0              2.0                            3.0               -1.79769313486e+308            16.0                                    60.0                        11.0                                      360.0                       20.58              10.8
sigma          0.500014270017  3.44794972385  8.79649804852  6.92238411188  0.500584411716  3.44782405458  8.79561488868  6.92230165203  0.0               995.339856966        0.462720830993          0.42770569708             0.197155690367            0.0861015598104           -0.0                      -0.0                      -0.0                      962.743095854         916.73861349          887.215847511         -0.0                  -0.0                  -0.0                  10.9731282097    0.185792011866       2.56215129179                  7.46451697179                -0.0                         -0.0                       -0.0                 10.0396739531      3.69893623033      5.70486576983     2.47814708663                  6.13386253912     1.83976235335                  5.80553286364     1.28967553698                  3.12340844261     1.282164369                    6.15223536611     0.60207972894                  0.0               -0.0                           4.07386062702                           5.70486576983               2.47814708663                             106.350000031               1.81511871115      1.61469790524
zero_count     0               0              0              730            0               0              0              730            17455             0                    8758                    8758                      0                         0                         0                         0                         0                         0                     0                     0                     0                     0                     0                     268              0                    501                            0                            0                            0                          0                    269                0                  0                 17                             0                 30                             0                 13                             0                 20                             0                 12                             0                 0                              14980                                   0                           17                                        0                           0                  2768
missing_count  0               0              0              0              0               0              0              0              65                10780                375                     375                       14682                     16535                     17520                     17520                     17520                     9103                  14683                 16535                 17520                 17520                 17520                 67               67                   15660                          360                          17520                        17520                      17520                67                 412                14980             14980                          16477             16477                          17181             17181                          17433             17433                          17504             17504                          17518             17520                          0                                       14980                       14980                                     9382                        14381              1283


Internal FluidVec compression/distribution summary:

    Chunk Type    Count    Count Percentage    Size      Size Percentage
--  ------------  -------  ------------------  --------  -----------------
    C0L           2        2                   160  B    0.00610671
    C0D           21       21                  1.6 KB    0.0641205
    CBS           1        1                   2.2 KB    0.0862573
    CXI           2        2                   408  B    0.0155721
    C1            24       24                  206.9 KB  8.08651
    C1N           14       14                  120.7 KB  4.71713
    C1S           4        4                   34.5 KB   1.34958
    C8D           32       32                  2.1 MB    85.6747




In [11]:
# Lots of columns in there!  Lets plan on converting to time-since-epoch to do
# a 'join' with the bike data, plus gather weather info that might affect
# cyclists - rain, snow, temperature.  Alas, drop the "snow" column since it's
# all NA's.  Also add in dew point and humidity just in case.  Slice out just
# the columns of interest and drop the rest.
wthr2 = wthr1["Year Local","Month Local","Day Local","Hour Local","Dew Point (C)","Humidity Fraction","Precipitation One Hour (mm)","Temperature (C)","Weather Code 1/ Description"]
wthr2["Precipitation One Hour (mm)"]._name = "Rain (mm)" # Shorter column name
wthr2["Weather Code 1/ Description"]._name = "WC1" # Shorter column name
wthr2.describe()
# Much better!


Rows: 17520 Cols: 9
               Year Local      Month Local    Day Local      Hour Local     Dew Point (C)    Humidity Fraction    Rain (mm)      Temperature (C)    WC1
-------------  --------------  -------------  -------------  -------------  ---------------  -------------------  -------------  -----------------  -------------
type           int             int            int            int            real             real                 real           real               enum
mins           2013.0          1.0            1.0            0.0            -26.7            0.1251               0.0            -15.6              0.0
mean           2013.5          6.52602739726  15.7205479452  11.5           4.31304646766    0.596736389159       1.37993010753  12.5789090701      6.16417322835
maxs           2014.0          12.0           31.0           23.0           24.4             1.0                  26.924         36.1               11.0
sigma          0.500014270017  3.44794972385  8.79649804852  6.92238411188  10.9731282097    0.185792011866       2.56215129179  10.0396739531      2.47814708663
zero_count     0               0              0              730            268              0                    501            269                17
missing_count  0               0              0              0              67               67                   15660          67                 14980


Internal FluidVec compression/distribution summary:

    Chunk Type    Count    Count Percentage    Size      Size Percentage
--  ------------  -------  ------------------  --------  -----------------
    C0L           2        11.1111             160  B    0.0253177
    C1            2        11.1111             17.2 KB   2.79381
    C1N           6        33.3333             51.7 KB   8.38144
    C8D           8        44.4444             548.0 KB  88.7994




In [12]:
# Filter down to the weather at Noon
wthr3 = wthr2[ wthr2["Hour Local"]==12 ]

In [13]:
# Lets now get Days since the epoch... we'll convert year/month/day into Epoch
# time, and then back to Epoch days.  Need zero-based month and days, but have
# 1-based.
wthr3["msec"] = h2o.H2OVec.mktime(year=wthr3["Year Local"], month=wthr3["Month Local"]-1, day=wthr3["Day Local"]-1, hour=wthr3["Hour Local"])
secsPerDay=1000*60*60*24
wthr3["Days"] = (wthr3["msec"]/secsPerDay).floor()
wthr3.describe()
# msec looks sane (numbers like 1.3e12 are in the correct range for msec since
# 1970).  Epoch Days matches closely with the epoch day numbers from the
# CitiBike dataset.


Rows: 730 Cols: 11
               Year Local      Month Local    Day Local      Hour Local    Dew Point (C)    Humidity Fraction    Rain (mm)      Temperature (C)    WC1            msec              Days
-------------  --------------  -------------  -------------  ------------  ---------------  -------------------  -------------  -----------------  -------------  ----------------  -------------
type           int             int            int            int           real             real                 real           real               enum           int               int
mins           2013.0          1.0            1.0            12.0          -26.7            0.1723               0.0            -13.9              0.0            1.3570704e+12     15706.0
mean           2013.5          6.52602739726  15.7205479452  12.0          4.23012379642    0.539728198074       1.53125714286  14.0687757909      5.18181818182  1.3885608526e+12  16070.5
maxs           2014.0          12.0           31.0           12.0          23.3             1.0                  12.446         34.4               10.0           1.420056e+12      16435.0
sigma          0.500342818004  3.45021529307  8.80227802701  0.0           11.1062964725    0.179945027923       2.36064248615  10.3989855149      2.74674726123  18219740080.4     210.877136425
zero_count     0               0              0              0             14               0                    15             7                  1              0                 0
missing_count  0               0              0              0             3                3                    660            3                  620            0                 0


Internal FluidVec compression/distribution summary:

    Chunk Type    Count    Count Percentage    Size     Size Percentage
--  ------------  -------  ------------------  -------  -----------------
    C0L           4        18.1818             320  B   0.930395
    C1            2        9.09091             866  B   2.51788
    C1N           4        18.1818             1.7 KB   5.03576
    C2            2        9.09091             1.6 KB   4.64034
    C8            2        9.09091             5.8 KB   17.3751
    C8D           8        36.3636             23.3 KB  69.5005




In [14]:
# Lets drop off the extra time columns to make a easy-to-handle dataset.
wthr4 = wthr3.drop("Year Local").drop("Month Local").drop("Day Local").drop("Hour Local").drop("msec")

In [15]:
# Also, most rain numbers are missing - lets assume those are zero rain days
rain = wthr4["Rain (mm)"]
rain[rain == None ] = 0

In [16]:
# ----------
# 6 - Join the weather data-per-day to the bike-starts-per-day
print "Merge Daily Weather with Bikes-Per-Day"
bpd_with_weather = bpd.merge(wthr4,allLeft=True,allRite=False)
bpd_with_weather.describe()
bpd_with_weather.show()


Merge Daily Weather with Bikes-Per-Day
Rows: 139261 Cols: 10
               Days           start station name    bikes          Month          DayOfWeek      Humidity Fraction    Rain (mm)        Temperature (C)    WC1            Dew Point (C)
-------------  -------------  --------------------  -------------  -------------  -------------  -------------------  ---------------  -----------------  -------------  ---------------
type           int            enum                  int            enum           enum           real                 real             real               enum           real
mins           15887.0        0.0                   1.0            0.0            0.0            0.1723               0.0              -13.9              0.0            -26.7
mean           16099.9758008  169.640760873         74.7341035897  5.68177738204  3.0059384896   0.532494425803       0.0860139306769  15.6334205959      5.09840544434  5.47825137402
maxs           16314.0        339.0                 680.0          11.0           6.0            1.0                  8.382            34.4               10.0           23.3
sigma          123.635133897  98.50295732           64.1243887565  3.20373100216  2.00302100015  0.178408938664       0.577304430765   10.9454511961      2.962709609    11.7308194576
zero_count     0              428                   0              9949           19880          0                    131155           1598               324            1954
missing_count  0              0                     0              0              0              981                  0                981                119130         981


Internal FluidVec compression/distribution summary:

    Chunk Type    Count    Count Percentage    Size      Size Percentage
--  ------------  -------  ------------------  --------  -----------------
    C1            64       10                  140.2 KB  2.49622
    C1N           128      20                  280.5 KB  4.99244
    C2            192      30                  828.7 KB  14.7504
    C8D           256      40                  4.3 MB    77.761



Displaying 10 row(s):
  Row ID    Days    start station name    bikes    Month    DayOfWeek    Humidity Fraction    Rain (mm)    Temperature (C)    WC1    Dew Point (C)
--------  ------  --------------------  -------  -------  -----------  -------------------  -----------  -----------------  -----  ---------------
       1   16106                    88        6        1            1               0.9228        0                    0        8             -1.1
       2   16234                    87       15        5            3               0.934         0.508               20        8             18.9
       3   15978                    89       36        8            6               0.4688        0                   20      nan              8.3
       4   16088                    27       55        0            4               1             0                    1      nan              1
       5   15945                    28      140        7            1               0.5681        0                   28.3      6             18.9
       6   16123                    83        6        1            4               0.3141        0                    9.4    nan             -6.7
       7   16251                    82       79        5            6               0.5275        0                   27.2    nan             16.7
       8   15995                    84       22        9            2               0.6765        0                   20.6    nan             14.4
       9   16121                   183        2        1            2               0.4331        0                    7.8    nan             -3.9
      10   16185                   182       14        3            3               0.2092        0                   15      nan             -7.2


In [17]:
# 7 - Test/Train split again, model build again, this time with weather
split_fit_predict(bpd_with_weather)


Training data has 10 columns and 83325 rows, test has 41930 rows, holdout has 14006

gbm Model Build Progress: [##################################################] 100%

drf Model Build Progress: [##################################################] 100%

glm Model Build Progress: [##################################################] 100%
GBM R2 TRAIN= 0.965496639937 , R2 TEST= 0.927997436911 , R2 HOLDOUT= 0.929164174219
DRF R2 TRAIN= 0.854580917016 , R2 TEST= 0.844209490168 , R2 HOLDOUT= 0.848629904163
GLM R2 TRAIN= 0.723581688957 , R2 TEST= 0.720152442756 , R2 HOLDOUT= 0.730959520174