In [1]:
import h2o
import time

In [2]:
# Explore a typical Data Science workflow with H2O and Python
#
# Goal: help the manager of CitiBike NYC load-balance the bicycles across the
# CitiBike network of stations, by predicting the number of bike trips taken
# from each station every day.  Use 10 million rows of historical data, and
# eventually add weather data.


# Connect to a cluster
h2o.init()


H2O cluster uptime: 4 seconds 579 milliseconds
H2O cluster version: 3.3.0.99999
H2O cluster name: ece
H2O cluster total nodes: 1
H2O cluster total memory: 8.89 GB
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster healthy: True
H2O Connection ip: 127.0.0.1
H2O Connection port: 54321

In [3]:
# Set this to True if you want to fetch the data directly from S3.
# This is useful if your cluster is running in EC2.
data_source_is_s3 = False

def mylocate(s):
    if data_source_is_s3:
        return "s3n://h2o-public-test-data/" + s
    else:
        return h2o.locate(s)
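
For example (an illustration added for this writeup, not part of the recorded run), flipping the flag changes how the same relative path is resolved:

# With the flag set, paths resolve to the public S3 bucket instead of the local checkout.
data_source_is_s3 = True
print(mylocate("bigdata/laptop/citibike-nyc/2013-10.csv"))
# -> s3n://h2o-public-test-data/bigdata/laptop/citibike-nyc/2013-10.csv
data_source_is_s3 = False   # restore the default used in the rest of the notebook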

In [4]:
# Pick either the big or the small demo.
# The big dataset is about 10 million rows.
small_test = [mylocate("bigdata/laptop/citibike-nyc/2013-10.csv")]
big_test =   [mylocate("bigdata/laptop/citibike-nyc/2013-07.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2013-08.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2013-09.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2013-10.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2013-11.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2013-12.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-01.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-02.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-03.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-04.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-05.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-06.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-07.csv"),
              mylocate("bigdata/laptop/citibike-nyc/2014-08.csv")]

# ----------

# 1- Load data - 1 row per bicycle trip.  Columns include the start and end
# stations, the trip duration, and the trip start time and day.  The larger
# dataset totals about 10 million rows.
print "Import and Parse bike data"
data = h2o.import_file(path=big_test)


Import and Parse bike data

Parse Progress: [##################################################] 100%

Parsed 10,407,546 rows and 15 cols:

File1 /Users/ece/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2013-07.csv
File2 /Users/ece/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2013-08.csv
File3 /Users/ece/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2013-09.csv
File4 /Users/ece/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2013-10.csv
File5 /Users/ece/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2013-11.csv
File6 /Users/ece/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2013-12.csv
File7 /Users/ece/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2014-01.csv
File8 /Users/ece/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2014-02.csv
File9 /Users/ece/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2014-03.csv
File10 /Users/ece/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2014-04.csv
File11 /Users/ece/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2014-05.csv
File12 /Users/ece/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2014-06.csv
File13 /Users/ece/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2014-07.csv
File14 /Users/ece/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/2014-08.csv

In [5]:
# ----------

# 2- Light data munging: group the bike starts per day, converting the 10M rows
# of trips into about 140,000 station & day combinations.  We will predict the
# number of trip starts per station per day.

# Convert start time to: days since the epoch
starttime = data["starttime"]
secsPerDay=1000*60*60*24
data["Days"] = (starttime/secsPerDay).floor()
data.describe()


Rows: 10,407,546 Cols: 16
  chunk_type                 chunk_name  count  count_percentage        size  \
0        C0L          Constant Integers    117          1.529812      9.1 KB   
1         C1            1-Byte Integers    478          6.250000     10.0 MB   
2        C1N  1-Byte Integers (w/o NAs)    478          6.250000     10.0 MB   
3        C1S           1-Byte Fractions    839         10.970188     17.5 MB   
4         C2            2-Byte Integers   2616         34.205020    108.8 MB   
5        C2S           2-Byte Fractions    314          4.105649     12.9 MB   
6         C4            4-Byte Integers    214          2.798117     17.9 MB   
7        C4S           4-Byte Fractions    389          5.086297     32.4 MB   
8         C8            64-bit Integers    680          8.891213    113.5 MB   
9        C8D               64-bit Reals   1523         19.913704    253.0 MB   

   size_percentage  
0         0.001550  
1         1.728914  
2         1.728914  
3         3.042758  
4        18.890900  
5         2.246094  
6         3.100523  
7         5.625424  
8        19.704786  
9        43.930134  
                            size  number_of_rows  number_of_chunks_per_column  \
0  172.16.2.59:54321    575.9 MB        10407546                          478   
1               mean    575.9 MB        10407546                          478   
2                min    575.9 MB        10407546                          478   
3                max    575.9 MB        10407546                          478   
4             stddev        0  B               0                            0   
5              total    575.9 MB        10407546                          478   

   number_of_chunks  
0              7648  
1              7648  
2              7648  
3              7648  
4                 0  
5              7648  

Column-by-Column Summary:

tripduration starttime stoptime start station id start station name start station latitude start station longitude end station id end station name end station latitude end station longitude bikeid usertype birth year gender Days
type int time time int enum real real int enum real real int enum int int int
mins 60.0 1.372662e+12 1.372662242e+12 72.0 0.0 40.680342423 -74.01713445 72.0 0.0 40.680342423 -74.01713445 14529.0 0.0 1899.0 0.0 15887.0
maxs 6250750.0 1.409554787e+12 1.409563605e+12 3002.0 339.0 40.771522 -73.9500479759 3002.0 339.0 40.771522 -73.9500479759 21689.0 1.0 1998.0 2.0 16314.0
sigma 2985.10540532 11806578171.7 11806555707.8 355.755989765 NaN 0.0197100508736 0.0123453320185 360.070380844 NaN 0.0197309578633 0.0124311861598 1938.80517884 0.324807387506 11.132784905 0.563019777794 136.647269305
zero_count 0 0 0 0 56836 0 0 0 55167 0 0 0 1247534 0 1248517 0
missing_count 0 0 0 0 0 0 0 0 0 0 0 0 0 1247644 0 0
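
As a quick sanity check on the Days arithmetic (plain Python, independent of H2O, and not part of the recorded run), the minimum day number 15887 in the summary above should land on the first day covered by the trip files:

import datetime
# Day 0 is 1970-01-01; day 15887 should be the first day in 2013-07.csv.
print(datetime.date(1970, 1, 1) + datetime.timedelta(days=15887))   # 2013-07-01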

In [6]:
# Now do a monster group-by.  Count bike starts per station per day.  This ends
# up with about 340 stations times 400 days (~140,000 rows).  This is what we
# want to predict.
grouped = data.group_by(["Days","start station name"])
bpd = grouped.count().get_frame() # Compute bikes-per-day
bpd.set_name(2,"bikes")
bpd.show()
bpd.describe()
bpd.dim


H2OFrame with 139261 rows and 3 columns: 
    Days start station name  bikes
0  15887    1 Ave & E 15 St     47
1  15887    1 Ave & E 18 St     40
2  15887    1 Ave & E 30 St     42
3  15887    1 Ave & E 44 St     12
4  15887   10 Ave & W 28 St     28
5  15887   11 Ave & W 27 St     30
6  15887   11 Ave & W 41 St     20
7  15887   12 Ave & W 40 St     13
8  15887    2 Ave & E 31 St     33
9  15887    2 Ave & E 58 St     54
Rows: 139,261 Cols: 3
  chunk_type                 chunk_name  count  count_percentage        size  \
0        C1N  1-Byte Integers (w/o NAs)      2          2.083334      8.6 KB   
1        C1S           1-Byte Fractions     32         33.333336    138.6 KB   
2         C2            2-Byte Integers     62         64.583330    531.1 KB   

   size_percentage  
0         1.272600  
1        20.434874  
2        78.292534  
                            size  number_of_rows  number_of_chunks_per_column  \
0  172.16.2.59:54321    678.4 KB          139261                           32   
1               mean    678.4 KB          139261                           32   
2                min    678.4 KB          139261                           32   
3                max    678.4 KB          139261                           32   
4             stddev        0  B               0                            0   
5              total    678.4 KB          139261                           32   

   number_of_chunks  
0                96  
1                96  
2                96  
3                96  
4                 0  
5                96  

Column-by-Column Summary:

Days start station name bikes
type int enum int
mins 15887.0 0.0 1.0
maxs 16314.0 339.0 680.0
sigma 123.635133897 NaN 64.1243887565
zero_count 0 428 0
missing_count 0 0 0
Out[6]:
[139261, 3]

In [7]:
# Quantiles: the data is fairly unbalanced; some station/day combos are wildly
# more popular than others.
print "Quantiles of bikes-per-day"
bpd["bikes"].quantile().show()


Quantiles of bikes-per-day
H2OFrame with 9 rows and 2 columns: 
   Probs  bikesQuantiles
0  0.010               2
1  0.100              11
2  0.250              26
3  0.333              35
4  0.500              58
5  0.667              89
6  0.750             107
7  0.900             157
8  0.990             291

In [8]:
# A little feature engineering
# Add in month-of-year (seasonality; fewer bike rides in winter than summer)
secs = bpd["Days"]*secsPerDay
bpd["Month"]     = secs.month().asfactor()
# Add in day-of-week (work-week; more bike rides on Sunday than Monday)
bpd["DayOfWeek"] = secs.dayOfWeek()
print "Bikes-Per-Day"
bpd.describe()


Bikes-Per-Day
Rows: 139,261 Cols: 5
  chunk_type                 chunk_name  count  count_percentage        size  \
0        C0L          Constant Integers     18         11.250000      1.4 KB   
1        CBS                       Bits      1          0.625000      614  B   
2        C1N  1-Byte Integers (w/o NAs)     47         29.374998    202.9 KB   
3        C1S           1-Byte Fractions     32         20.000000    138.6 KB   
4         C2            2-Byte Integers     62         38.750000    531.1 KB   

   size_percentage  
0         0.160788  
1         0.068558  
2        23.195410  
3        15.849757  
4        60.725487  
                            size  number_of_rows  number_of_chunks_per_column  \
0  172.16.2.59:54321    874.6 KB          139261                           32   
1               mean    874.6 KB          139261                           32   
2                min    874.6 KB          139261                           32   
3                max    874.6 KB          139261                           32   
4             stddev        0  B               0                            0   
5              total    874.6 KB          139261                           32   

   number_of_chunks  
0               160  
1               160  
2               160  
3               160  
4                 0  
5               160  

Column-by-Column Summary:

Days start station name bikes Month DayOfWeek
type int enum int enum enum
mins 15887.0 0.0 1.0 0.0 0.0
maxs 16314.0 339.0 680.0 11.0 6.0
sigma 123.635133897 NaN 64.1243887565 NaN NaN
zero_count 0 428 0 9949 19880
missing_count 0 0 0 0 0

In [9]:
# ----------
# 3- Fit models on the training data, using the test data for validation

# Function for doing a classic train/test/holdout split
def split_fit_predict(data):
  global gbm0,drf0,glm0,dl0
  # Classic Test/Train split
  r = data['Days'].runif()   # Random UNIForm numbers, one per row
  train = data[  r  < 0.6]
  test  = data[(0.6 <= r) & (r < 0.9)]
  hold  = data[ 0.9 <= r ]
  print "Training data has",train.ncol,"columns and",train.nrow,"rows, test has",test.nrow,"rows, holdout has",hold.nrow
  
  # Run GBM
  s = time.time()
  gbm0 = h2o.gbm(x           =train.drop("bikes"),
                 y           =train     ["bikes"],
                 validation_x=test .drop("bikes"),
                 validation_y=test      ["bikes"],
                 ntrees=500, # 500 works well
                 max_depth=6,
                 learn_rate=0.1)
  gbm_elapsed = time.time() - s

  # Run DRF
  s = time.time()
  drf0 = h2o.random_forest(x =train.drop("bikes"),
                y           =train     ["bikes"],
                validation_x=test .drop("bikes"),
                validation_y=test      ["bikes"],
                ntrees=250,
                max_depth=30)
  drf_elapsed = time.time() - s 
    
    
  # Run GLM
  s = time.time()
  glm0 = h2o.glm(x           =train.drop("bikes"),
                 y           =train     ["bikes"],
                 validation_x=test .drop("bikes"),
                 validation_y=test      ["bikes"],
                 Lambda=[1e-5],
                 family="poisson")
  glm_elapsed = time.time() - s
  
  # Run DL
  s = time.time()
  dl0 = h2o.deeplearning(x  =train.drop("bikes"),
                y           =train     ["bikes"],
                validation_x=test .drop("bikes"),
                validation_y=test      ["bikes"],
                hidden=[50,50,50,50],
                epochs=50)
  dl_elapsed = time.time() - s
  
  # ----------
  # 4- Score on holdout set & report
  train_r2_gbm = gbm0.model_performance(train).r2()
  test_r2_gbm  = gbm0.model_performance(test ).r2()
  hold_r2_gbm  = gbm0.model_performance(hold ).r2()
#   print "GBM R2 TRAIN=",train_r2_gbm,", R2 TEST=",test_r2_gbm,", R2 HOLDOUT=",hold_r2_gbm
  
  train_r2_drf = drf0.model_performance(train).r2()
  test_r2_drf  = drf0.model_performance(test ).r2()
  hold_r2_drf  = drf0.model_performance(hold ).r2()
#   print "DRF R2 TRAIN=",train_r2_drf,", R2 TEST=",test_r2_drf,", R2 HOLDOUT=",hold_r2_drf
  
  train_r2_glm = glm0.model_performance(train).r2()
  test_r2_glm  = glm0.model_performance(test ).r2()
  hold_r2_glm  = glm0.model_performance(hold ).r2()
#   print "GLM R2 TRAIN=",train_r2_glm,", R2 TEST=",test_r2_glm,", R2 HOLDOUT=",hold_r2_glm
    
  train_r2_dl = dl0.model_performance(train).r2()
  test_r2_dl  = dl0.model_performance(test ).r2()
  hold_r2_dl  = dl0.model_performance(hold ).r2()
#   print " DL R2 TRAIN=",train_r2_dl,", R2 TEST=",test_r2_dl,", R2 HOLDOUT=",hold_r2_dl
    
  # make a pretty HTML table printout of the results

  header = ["Model", "R2 TRAIN", "R2 TEST", "R2 HOLDOUT", "Model Training Time (s)"]
  table  = [
            ["GBM", train_r2_gbm, test_r2_gbm, hold_r2_gbm, round(gbm_elapsed,3)],
            ["DRF", train_r2_drf, test_r2_drf, hold_r2_drf, round(drf_elapsed,3)],
            ["GLM", train_r2_glm, test_r2_glm, hold_r2_glm, round(glm_elapsed,3)],
            ["DL ", train_r2_dl,  test_r2_dl,  hold_r2_dl , round(dl_elapsed,3) ],
           ]
  h2o.H2ODisplay(table,header)
  # --------------
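
The split relies on thresholding a column of uniform random numbers; a minimal plain-Python sketch of the same idea (hypothetical, using the random module instead of H2O's runif) shows the expected roughly 60/30/10 proportions:

import random
random.seed(0)
r = [random.random() for _ in range(100000)]   # stand-in for data['Days'].runif()
print(sum(1 for x in r if x < 0.6))            # train   ~ 60,000
print(sum(1 for x in r if 0.6 <= x < 0.9))     # test    ~ 30,000
print(sum(1 for x in r if x >= 0.9))           # holdout ~ 10,000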

In [10]:
# Split the data (into test & train), fit some models and predict on the holdout data
split_fit_predict(bpd)
# Here we see an R^2 of about 0.92 for GBM and 0.78 for GLM on the test data.
# This means that given just the station, the month, and the day-of-week we can
# predict over 90% of the variance of the bike-trip starts.


Training data has 5 columns and 83581 rows, test has 41837 rows, holdout has 13843 rows

gbm Model Build Progress: [##################################################] 100%

drf Model Build Progress: [##################################################] 100%

glm Model Build Progress: [##################################################] 100%

deeplearning Model Build Progress: [##################################################] 100%
Model  R2 TRAIN        R2 TEST         R2 HOLDOUT      Model Training Time (s)
GBM    0.968502742969  0.921316256547  0.918298756395  19.496
DRF    0.85304131064   0.824335646322  0.823687930871  21.247
GLM    0.785458821665  0.782426105079  0.788803070693  0.784
DL     0.870873002226  0.851042287799  0.84859682418   70.038

In [11]:
# ----------
# 5- Now let's add some weather
# Load weather data
wthr1 = h2o.import_file(path=[mylocate("bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2013.csv"),
                               mylocate("bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2014.csv")])
# Peek at the data
wthr1.describe()


Parse Progress: [##################################################] 100%

Parsed 17,520 rows and 50 cols:

File1 /Users/ece/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2013.csv
File2 /Users/ece/0xdata/h2o-dev/bigdata/laptop/citibike-nyc/31081_New_York_City__Hourly_2014.csv
Rows: 17,520 Cols: 50
  chunk_type                 chunk_name  count  count_percentage        size  \
0        C0L          Constant Integers    107          6.294118      8.4 KB   
1        C0D             Constant Reals    436         25.647058     34.1 KB   
2        CXI            Sparse Integers     17          1.000000      1.5 KB   
3         C1            1-Byte Integers    346         20.352942    197.4 KB   
4        C1N  1-Byte Integers (w/o NAs)    214         12.588236    122.3 KB   
5        C1S           1-Byte Fractions    214         12.588236    125.3 KB   
6        C2S           2-Byte Fractions    196         11.529412    214.5 KB   
7        C4S           4-Byte Fractions    170         10.000000    356.1 KB   

   size_percentage  
0         0.788972  
1         3.214877  
2         0.139914  
3        18.634672  
4        11.544063  
5        11.822968  
6        20.242111  
7        33.612423  
                            size  number_of_rows  number_of_chunks_per_column  \
0  172.16.2.59:54321      1.0 MB           17520                           34   
1               mean      1.0 MB           17520                           34   
2                min      1.0 MB           17520                           34   
3                max      1.0 MB           17520                           34   
4             stddev        0  B               0                            0   
5              total      1.0 MB           17520                           34   

   number_of_chunks  
0              1700  
1              1700  
2              1700  
3              1700  
4                 0  
5              1700  

Column-by-Column Summary:

Year Local Month Local Day Local Hour Local Year UTC Month UTC Day UTC Hour UTC Cavok Reported Cloud Ceiling (m) Cloud Cover Fraction Cloud Cover Fraction 1 Cloud Cover Fraction 2 Cloud Cover Fraction 3 Cloud Cover Fraction 4 Cloud Cover Fraction 5 Cloud Cover Fraction 6 Cloud Height (m) 1 Cloud Height (m) 2 Cloud Height (m) 3 Cloud Height (m) 4 Cloud Height (m) 5 Cloud Height (m) 6 Dew Point (C) Humidity Fraction Precipitation One Hour (mm) Pressure Altimeter (mbar) Pressure Sea Level (mbar) Pressure Station (mbar) Snow Depth (cm) Temperature (C) Visibility (km) Weather Code 1 Weather Code 1/ Description Weather Code 2 Weather Code 2/ Description Weather Code 3 Weather Code 3/ Description Weather Code 4 Weather Code 4/ Description Weather Code 5 Weather Code 5/ Description Weather Code 6 Weather Code 6/ Description Weather Code Most Severe / Icon Code Weather Code Most Severe Weather Code Most Severe / Description Wind Direction (degrees) Wind Gust (m/s) Wind Speed (m/s)
type int int int int int int int int int real real real real real int int int real real real int int int real real real real int int int real real int enum int enum int enum int enum int enum int enum int int enum int real real
mins 2013.0 1.0 1.0 0.0 2013.0 1.0 1.0 0.0 0.0 61.0 0.0 0.0 0.25 0.5 NaN NaN NaN 60.96 213.36 365.76 NaN NaN NaN -26.7 0.1251 0.0 983.2949 NaN NaN NaN -15.6 0.001 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 1.0 0.0 3.0 0.0 0.0 1.0 0.0 10.0 7.2 0.0
maxs 2014.0 12.0 31.0 23.0 2015.0 12.0 31.0 23.0 0.0 3657.6 1.0 1.0 1.0 1.0 NaN NaN NaN 3657.5999 3657.5999 3657.5999 NaN NaN NaN 24.4 1.0 26.924 1042.2113 NaN NaN NaN 36.1 16.0934 60.0 11.0 60.0 10.0 36.0 7.0 27.0 4.0 27.0 2.0 3.0 0.0 16.0 60.0 11.0 360.0 20.58 10.8
sigma 0.500014270017 3.44794972385 8.79649804852 6.92238411188 0.500584411716 3.44782405458 8.79561488868 6.92230165203 0.0 995.339856966 0.462720830993 0.42770569708 0.197155690367 0.0861015598104 NaN NaN NaN 962.743095854 916.73861349 887.215847511 NaN NaN NaN 10.9731282097 0.185792011866 2.56215129179 7.46451697179 NaN NaN NaN 10.0396739531 3.69893623033 5.70486576983 NaN 6.13386253912 NaN 5.80553286364 NaN 3.12340844261 NaN 6.15223536611 NaN 0.0 0.0 4.07386062702 5.70486576983 NaN 106.350000031 1.81511871115 1.61469790524
zero_count 0 0 0 730 0 0 0 730 17455 0 8758 8758 0 0 0 0 0 0 0 0 0 0 0 268 0 501 0 0 0 0 269 0 0 17 0 30 0 13 0 20 0 12 0 2 14980 0 17 0 0 2768
missing_count 0 0 0 0 0 0 0 0 65 10780 375 375 14682 16535 17520 17520 17520 9103 14683 16535 17520 17520 17520 67 67 15660 360 17520 17520 17520 67 412 14980 14980 16477 16477 17181 17181 17433 17433 17504 17504 17518 17518 0 14980 14980 9382 14381 1283

In [12]:
# Lots of columns in there!  Let's plan on converting to time-since-epoch to do
# a 'join' with the bike data, plus gather weather info that might affect
# cyclists - rain, snow, temperature.  Alas, drop the "snow" column since it's
# all NAs.  Also add in dew point and humidity just in case.  Slice out just
# the columns of interest and drop the rest.
wthr2 = wthr1[["Year Local","Month Local","Day Local","Hour Local","Dew Point (C)","Humidity Fraction","Precipitation One Hour (mm)","Temperature (C)","Weather Code 1/ Description"]]

wthr2.set_name(wthr2.index("Precipitation One Hour (mm)"), "Rain (mm)")
wthr2.set_name(wthr2.index("Weather Code 1/ Description"), "WC1")
wthr2.describe()
# Much better!


Rows: 17,520 Cols: 9
  chunk_type                 chunk_name  count  count_percentage        size  \
0        C0L          Constant Integers     46         15.032680      3.6 KB   
1         C1            1-Byte Integers     34         11.111112     19.4 KB   
2        C1N  1-Byte Integers (w/o NAs)     90         29.411766     51.5 KB   
3        C1S           1-Byte Fractions     42         13.725491     24.0 KB   
4        C2S           2-Byte Fractions     94         30.718956    103.4 KB   

   size_percentage  
0         1.780005  
1         9.592678  
2        25.494701  
3        11.894592  
4        51.238026  
                            size  number_of_rows  number_of_chunks_per_column  \
0  172.16.2.59:54321    201.9 KB           17520                           34   
1               mean    201.9 KB           17520                           34   
2                min    201.9 KB           17520                           34   
3                max    201.9 KB           17520                           34   
4             stddev        0  B               0                            0   
5              total    201.9 KB           17520                           34   

   number_of_chunks  
0               306  
1               306  
2               306  
3               306  
4                 0  
5               306  

Column-by-Column Summary:

Year Local Month Local Day Local Hour Local Dew Point (C) Humidity Fraction Rain (mm) Temperature (C) WC1
type int int int int real real real real enum
mins 2013.0 1.0 1.0 0.0 -26.7 0.1251 0.0 -15.6 0.0
maxs 2014.0 12.0 31.0 23.0 24.4 1.0 26.924 36.1 11.0
sigma 0.500014270017 3.44794972385 8.79649804852 6.92238411188 10.9731282097 0.185792011866 2.56215129179 10.0396739531 NaN
zero_count 0 0 0 730 268 0 501 269 17
missing_count 0 0 0 0 67 67 15660 67 14980

In [13]:
# Filter down to the weather at Noon
wthr3 = wthr2[ wthr2["Hour Local"]==12 ]

In [14]:
# Let's now get days since the epoch: we'll convert year/month/day into epoch
# time, and then back to epoch days.  mktime needs zero-based months and days,
# but the data is 1-based.
wthr3["msec"] = h2o.H2OFrame.mktime(year=wthr3["Year Local"], month=wthr3["Month Local"]-1, day=wthr3["Day Local"]-1, hour=wthr3["Hour Local"])
secsPerDay=1000*60*60*24
wthr3["Days"] = (wthr3["msec"]/secsPerDay).floor()
wthr3.describe()
# msec looks sane (numbers like 1.3e12 are in the correct range for msec since
# 1970).  Epoch Days matches closely with the epoch day numbers from the
# CitiBike dataset.


Rows: 730 Cols: 11
  chunk_type                 chunk_name  count  count_percentage        size  \
0        C0L          Constant Integers     80         21.390373      6.3 KB   
1        C0D             Constant Reals     13          3.475936      1.0 KB   
2         C1            1-Byte Integers     30          8.021391      2.6 KB   
3        C1N  1-Byte Integers (w/o NAs)     56         14.973262      4.9 KB   
4        C1S           1-Byte Fractions     34          9.090909      3.5 KB   
5        C2S           2-Byte Fractions     34          9.090909      4.2 KB   
6        CUD               Unique Reals     25          6.684492      3.6 KB   
7        C8D               64-bit Reals    102         27.272728     23.9 KB   

   size_percentage  
0        12.498779  
1         2.031052  
2         5.245582  
3         9.801778  
4         7.003222  
5         8.428864  
6         7.229763  
7        47.760960  
                            size  number_of_rows  number_of_chunks_per_column  \
0  172.16.2.59:54321     50.0 KB             730                           34   
1               mean     50.0 KB             730                           34   
2                min     50.0 KB             730                           34   
3                max     50.0 KB             730                           34   
4             stddev        0  B               0                            0   
5              total     50.0 KB             730                           34   

   number_of_chunks  
0               374  
1               374  
2               374  
3               374  
4                 0  
5               374  

Column-by-Column Summary:

Year Local Month Local Day Local Hour Local Dew Point (C) Humidity Fraction Rain (mm) Temperature (C) WC1 msec Days
type int int int int real real real real enum int int
mins 2013.0 1.0 1.0 12.0 -26.7 0.1723 0.0 -13.9 0.0 1.3570704e+12 15706.0
maxs 2014.0 12.0 31.0 12.0 23.3 1.0 12.446 34.4 10.0 1.420056e+12 16435.0
sigma 0.500342818004 3.45021529307 8.80227802701 0.0 11.1062964725 0.179945027923 2.36064248615 10.3989855149 NaN 18219740080.4 210.877136425
zero_count 0 0 0 0 14 0 15 7 1 0 0
missing_count 0 0 0 0 3 3 660 3 620 0 0
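
As a cross-check on the mktime arithmetic (plain Python, independent of H2O, and not part of the recorded run), the minimum Days value of 15706 in the summary above should correspond to the first weather record on January 1, 2013:

import datetime
# 15706 days after the Unix epoch:
print(datetime.date(1970, 1, 1) + datetime.timedelta(days=15706))   # 2013-01-01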

In [15]:
# Let's drop the extra time columns to make an easy-to-handle dataset.
wthr4 = wthr3.drop("Year Local").drop("Month Local").drop("Day Local").drop("Hour Local").drop("msec")

In [16]:
# Also, most rain numbers are missing - let's assume those are zero-rain days
rain = wthr4["Rain (mm)"]
rain[ rain.isna() ] = 0

In [17]:
# ----------
# 6 - Join the weather data-per-day to the bike-starts-per-day
print "Merge Daily Weather with Bikes-Per-Day"
bpd_with_weather = bpd.merge(wthr4,allLeft=True,allRite=False)
bpd_with_weather.describe()
bpd_with_weather.show()


Merge Daily Weather with Bikes-Per-Day
Rows: 139,261 Cols: 10
  chunk_type                 chunk_name  count  count_percentage        size  \
0        C0L          Constant Integers     18          5.625000      1.4 KB   
1        C0D             Constant Reals     17          5.312500      1.3 KB   
2        CBS                       Bits      1          0.312500      614  B   
3         C1            1-Byte Integers     29          9.062500    125.2 KB   
4        C1N  1-Byte Integers (w/o NAs)     47         14.687499    202.9 KB   
5        C1S           1-Byte Fractions     32         10.000000    138.6 KB   
6         C2            2-Byte Integers     62         19.375000    531.1 KB   
7        CUD               Unique Reals    114         35.625000    987.2 KB   

   size_percentage  
0         0.070727  
1         0.066798  
2         0.030157  
3         6.295629  
4        10.203143  
5         6.971954  
6        26.711788  
7        49.649803  
                            size  number_of_rows  number_of_chunks_per_column  \
0  172.16.2.59:54321      1.9 MB          139261                           32   
1               mean      1.9 MB          139261                           32   
2                min      1.9 MB          139261                           32   
3                max      1.9 MB          139261                           32   
4             stddev        0  B               0                            0   
5              total      1.9 MB          139261                           32   

   number_of_chunks  
0               320  
1               320  
2               320  
3               320  
4                 0  
5               320  

Column-by-Column Summary:

Days start station name bikes Month DayOfWeek Humidity Fraction Rain (mm) Temperature (C) WC1 Dew Point (C)
type int enum int enum enum real real real enum real
mins 15887.0 0.0 1.0 0.0 0.0 0.1723 0.0 -13.9 0.0 -26.7
maxs 16314.0 339.0 680.0 11.0 6.0 1.0 8.382 34.4 10.0 23.3
sigma 123.635133897 NaN 64.1243887565 NaN NaN 0.178408938664 1.76771960813 10.9454511961 NaN 11.7308194576
zero_count 0 428 0 9949 19880 0 2909 1598 324 1954
missing_count 0 0 0 0 0 981 128246 981 119130 981
H2OFrame with 139261 rows and 10 columns: 
    Days start station name  bikes  Month DayOfWeek  Humidity Fraction  \
0  15887    1 Ave & E 15 St     47      6       Sun             0.9354   
1  15887    1 Ave & E 18 St     40      6       Sun             0.9354   
2  15887    1 Ave & E 30 St     42      6       Sun             0.9354   
3  15887    1 Ave & E 44 St     12      6       Sun             0.9354   
4  15887   10 Ave & W 28 St     28      6       Sun             0.9354   
5  15887   11 Ave & W 27 St     30      6       Sun             0.9354   
6  15887   11 Ave & W 41 St     20      6       Sun             0.9354   
7  15887   12 Ave & W 40 St     13      6       Sun             0.9354   
8  15887    2 Ave & E 31 St     33      6       Sun             0.9354   
9  15887    2 Ave & E 58 St     54      6       Sun             0.9354   

   Rain (mm)  Temperature (C)   WC1  Dew Point (C)  
0      4.572             22.8  rain           21.7  
1      4.572             22.8  rain           21.7  
2      4.572             22.8  rain           21.7  
3      4.572             22.8  rain           21.7  
4      4.572             22.8  rain           21.7  
5      4.572             22.8  rain           21.7  
6      4.572             22.8  rain           21.7  
7      4.572             22.8  rain           21.7  
8      4.572             22.8  rain           21.7  
9      4.572             22.8  rain           21.7  
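
The merge joins on the column the two frames have in common ("Days").  As a minimal analogy (a hypothetical toy example using pandas, not H2O), the same left join looks like this:

import pandas as pd
bikes   = pd.DataFrame({"Days": [15887, 15887, 15888], "bikes": [47, 40, 51]})
weather = pd.DataFrame({"Days": [15887], "Rain (mm)": [4.572]})
# Every row of bikes is kept; days with no weather record get NaN
# (analogous to allLeft=True, allRite=False).
print(bikes.merge(weather, on="Days", how="left"))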

In [18]:
# 7 - Test/Train split again, model build again, this time with weather
split_fit_predict(bpd_with_weather)


Training data has 10 columns and 83786 rows, test has 41574 rows, holdout has 13901 rows

gbm Model Build Progress: [##################################################] 100%

drf Model Build Progress: [##################################################] 100%

glm Model Build Progress: [##################################################] 100%

deeplearning Model Build Progress: [##################################################] 100%
Model  R2 TRAIN        R2 TEST         R2 HOLDOUT      Model Training Time (s)
GBM    0.966124549992  0.928048318776  0.928039182008  23.424
DRF    0.894341905389  0.846568473857  0.847443079065  104.214
GLM    0.899743454668  0.887344288315  0.875356174098  0.226
DL     0.934139722026  0.915249165282  0.915457484406  74.397