In [1]:
import h2o
In [3]:
h2o.connect(ip="35.196.153.55")
Out[3]:
In [5]:
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
In [7]:
air = h2o.import_file("https://raw.github.com/0xdata/h2o/master/smalldata/airlines/allyears2k_headers.zip")
In [8]:
# Set this to True if interactive (matplotlib) plots are desired
interactive = False
import matplotlib
if not interactive: matplotlib.use('Agg')
import matplotlib.pyplot as plt
In [9]:
air_path = "https://raw.github.com/0xdata/h2o/master/smalldata/airlines/allyears2k_headers.zip"
# ----------
# 1- Load data - 1 row per flight. Has columns showing the origin,
# destination, departure and arrival time, carrier information, and
# whether the flight was delayed.
print("Import and Parse airlines data")
data = h2o.import_file(path=air_path)
data.describe()
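# Optional sanity check of the columns this demo relies on. The names below
# are the ones referenced throughout the notebook; adjust the list if your
# copy of the dataset differs.
cols_of_interest = ["Year", "Month", "Origin", "Dest", "UniqueCarrier",
                    "Distance", "ArrDelay", "IsDepDelayed", "Cancelled"]
data[cols_of_interest].head()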
In [10]:
# ----------
# 2- Data exploration and munging. Generate scatter plots
# of various columns and plot fitted GLM model.
# Function to fit a GLM model and plot the fitted (x,y) values
def scatter_plot(data, x, y, max_points = 1000, fit = True):
    if fit:
        lr = H2OGeneralizedLinearEstimator(family = "gaussian")
        lr.train(x=x, y=y, training_frame=data)
        coeff = lr.coef()
    df = data[[x, y]]
    runif = df[y].runif()
    df_subset = df[runif < float(max_points)/data.nrow]
    df_py = h2o.as_list(df_subset)
    if fit: h2o.remove(lr._id)

    # If x variable is string, generate box-and-whisker plot
    if df_py[x].dtype == "object":
        if interactive: df_py.boxplot(column = y, by = x)
    # Otherwise, generate a scatter plot
    else:
        if interactive: df_py.plot(x = x, y = y, kind = "scatter")

    if fit:
        x_min = min(df_py[x])
        x_max = max(df_py[x])
        y_min = coeff["Intercept"] + coeff[x]*x_min
        y_max = coeff["Intercept"] + coeff[x]*x_max
        plt.plot([x_min, x_max], [y_min, y_max], "k-")
    if interactive: plt.show()
scatter_plot(data, "Distance", "AirTime", fit = True)
scatter_plot(data, "UniqueCarrier", "ArrDelay", max_points = 5000, fit = False)
In [11]:
# Group flights by month
grouped = data.group_by("Month")
bpd = grouped.count().sum("Cancelled").frame
bpd.show()
bpd.describe()
bpd.dim
# Convert columns to factors
data["Year"] = data["Year"] .asfactor()
data["Month"] = data["Month"] .asfactor()
data["DayOfWeek"] = data["DayOfWeek"].asfactor()
data["Cancelled"] = data["Cancelled"].asfactor()
In [12]:
# Calculate and plot travel time
hour1 = data["CRSArrTime"] / 100
mins1 = data["CRSArrTime"] % 100
arrTime = hour1*60 + mins1
hour2 = data["CRSDepTime"] / 100
mins2 = data["CRSDepTime"] % 100
depTime = hour2*60 + mins2
# TODO: Replace this once list comprehension is supported. See PUBDEV-1286.
# data["TravelTime"] = [x if x > 0 else None for x in (arrTime - depTime)]
data["TravelTime"] = (arrTime-depTime > 0).ifelse((arrTime-depTime), h2o.H2OFrame([[None]] * data.nrow))
scatter_plot(data, "Distance", "TravelTime")
In [13]:
# Impute missing Distance values (grouped by Origin and Dest) and re-plot travel time against distance
data.impute(column = "Distance", by = ["Origin", "Dest"])
scatter_plot(data, "Distance", "TravelTime")
In [14]:
# ----------
# 3- Fit models on the training set, using the test set for validation
# Create test/train split
s = data["Year"].runif()
train = data[s <= 0.75]
test = data[s > 0.75]
# Set predictor and response variables
myY = "IsDepDelayed"
myX = ["Origin", "Dest", "Year", "UniqueCarrier", "DayOfWeek", "Month", "Distance", "FlightNum"]
# Simple GLM - Predict Delays
data_glm = H2OGeneralizedLinearEstimator(family="binomial", standardize=True)
data_glm.train(x=myX, y=myY, training_frame=train, validation_frame=test)

# Simple GBM
data_gbm = H2OGradientBoostingEstimator(balance_classes=True,
                                        ntrees=3,
                                        max_depth=1,
                                        distribution="bernoulli",
                                        learn_rate=0.1,
                                        min_rows=2)
data_gbm.train(x=myX, y=myY, training_frame=train, validation_frame=test)

# Complex GBM
data_gbm2 = H2OGradientBoostingEstimator(balance_classes=True,
                                         ntrees=50,
                                         max_depth=5,
                                         distribution="bernoulli",
                                         learn_rate=0.1,
                                         min_rows=2)
data_gbm2.train(x=myX, y=myY, training_frame=train, validation_frame=test)

# Simple Random Forest
data_rf = H2ORandomForestEstimator(ntrees=5, max_depth=2, balance_classes=True)
data_rf.train(x=myX, y=myY, training_frame=train, validation_frame=test)

# Complex Random Forest
data_rf2 = H2ORandomForestEstimator(ntrees=10, max_depth=5, balance_classes=True)
data_rf2.train(x=myX, y=myY, training_frame=train, validation_frame=test)

# Deep Learning with 5 epochs
data_dl = H2ODeepLearningEstimator(hidden=[10, 10],
                                   epochs=5,
                                   variable_importances=True,
                                   balance_classes=True,
                                   loss="Automatic")
data_dl.train(x=myX, y=myY, training_frame=train, validation_frame=test)
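# Quick comparison of the six fitted models on the validation split.
# auc(valid=True) is available because IsDepDelayed makes these binomial
# models; model_id is the key H2O assigned to each model.
for model in [data_glm, data_gbm, data_gbm2, data_rf, data_rf2, data_dl]:
    print(model.model_id, model.auc(valid=True))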
In [15]:
# Variable importances from each algorithm
# Calculate magnitude of normalized GLM coefficients
import operator
from six import iteritems
from tabulate import tabulate

glm_varimp = data_glm.coef_norm()
for k, v in iteritems(glm_varimp):
    glm_varimp[k] = abs(glm_varimp[k])

# Sort in descending order by magnitude
glm_sorted = sorted(glm_varimp.items(), key=operator.itemgetter(1), reverse=True)
table = tabulate(glm_sorted, headers=["Predictor", "Normalized Coefficient"], tablefmt="orgtbl")
print("Variable Importances:\n\n" + table)

data_gbm.varimp()
data_rf.varimp()
Out[15]:
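# The tree-based importances above print as tables; passing use_pandas=True
# returns them as pandas DataFrames instead, which is handier for sorting or
# plotting (sketch; requires pandas to be installed).
gbm_vi = data_gbm.varimp(use_pandas=True)
rf_vi  = data_rf.varimp(use_pandas=True)
print(gbm_vi.head())
print(rf_vi.head())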
In [16]:
# Model performance of the complex GBM model (data_gbm2) on test data
data_gbm2.model_performance(test)
Out[16]:
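# The performance object above bundles several metrics; individual ones can
# be pulled out directly (sketch):
perf = data_gbm2.model_performance(test)
print("AUC:     ", perf.auc())
print("LogLoss: ", perf.logloss())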