In [1]:
import h2o
import pandas
import pprint
import operator
import matplotlib
from tabulate import tabulate
In [2]:
# Connect to a cluster
# h2o.init() with no arguments attaches to an H2O instance at the default
# local address (starting one if none is running, per the h2o docs).
h2o.init()
In [3]:
# set this to True if interactive (matplotlib) plots are desired
interactive = False
# Select the non-interactive Agg backend BEFORE pyplot is imported so that
# plots render off-screen (e.g. in headless/batch runs).
# NOTE: matplotlib.use()'s `warn` keyword was deprecated in matplotlib 3.0
# and removed in 3.1, so it must not be passed.
if not interactive: matplotlib.use('Agg')
import matplotlib.pyplot as plt
In [4]:
from h2o.utils.shared_utils import _locate # private function. used to find files within h2o git project directory.
# Alternative (larger) datasets, kept for reference:
# air_path = [_locate("bigdata/laptop/airlines_all.05p.csv")]
# air_path = [_locate("bigdata/laptop/flights-nyc/flights14.csv.zip")]
air_path = [_locate("smalldata/airlines/allyears2k_headers.zip")]
# ----------
# 1- Load data - 1 row per flight. Has columns showing the origin,
# destination, departure and arrival time, carrier information, and
# whether the flight was delayed.
print("Import and Parse airlines data")
# Import the zipped CSV into the H2O cluster and parse it into an H2OFrame.
data = h2o.import_file(path=air_path)
# Show per-column summary statistics of the parsed frame.
data.describe()
In [5]:
# ----------
# 2- Data exploration and munging. Generate scatter plots
# of various columns and plot fitted GLM model.
# Function to fit a GLM model and plot the fitted (x,y) values
def scatter_plot(data, x, y, max_points = 1000, fit = True):
    """Plot a random ~max_points-row sample of (x, y) from an H2OFrame.

    If `x` is a string column, a box-and-whisker plot of y grouped by x is
    drawn; otherwise a scatter plot.  When fit=True a one-predictor gaussian
    GLM (y ~ x) is fitted on the full frame and its regression line is
    overlaid.  Plots are only shown when the module-level `interactive`
    flag is True.

    :param data: H2OFrame containing columns `x` and `y`.
    :param x: name of the predictor column.
    :param y: name of the response column.
    :param max_points: approximate number of rows to sample for plotting.
    :param fit: fit and draw a gaussian GLM line (numeric x only).
    """
    if(fit):
        # Fit a simple gaussian GLM and keep its coefficient table.
        lr = h2o.glm(x = data[[x]], y = data[y], family = "gaussian")
        coeff = lr.coef()
    df = data[[x,y]]
    # Uniform random column; rows with runif below max_points/nrow form
    # an (approximately) max_points-sized sample.
    runif = df[y].runif()
    df_subset = df[runif < float(max_points)/data.nrow]
    # Pull the sample into client memory for local plotting.
    # NOTE(review): assumes h2o.as_list returns a pandas DataFrame (true for
    # the h2o version this demo targets) — confirm, since later versions may
    # return a list of lists, which would break the .dtype access below.
    df_py = h2o.as_list(df_subset)
    if(fit): h2o.remove(lr._id)  # free the temporary GLM on the cluster
    # If x variable is string, generate box-and-whisker plot
    if(df_py[x].dtype == "object"):
        if interactive: df_py.boxplot(column = y, by = x)
    # Otherwise, generate a scatter plot
    else:
        if interactive: df_py.plot(x = x, y = y, kind = "scatter")
    if(fit):
        # Draw the fitted line between the sampled x extremes:
        # y = Intercept + coef(x) * x.
        x_min = min(df_py[x])
        x_max = max(df_py[x])
        y_min = coeff["Intercept"] + coeff[x]*x_min
        y_max = coeff["Intercept"] + coeff[x]*x_max
        plt.plot([x_min, x_max], [y_min, y_max], "k-")
    if interactive: plt.show()
scatter_plot(data, "Distance", "AirTime", fit = True)
scatter_plot(data, "UniqueCarrier", "ArrDelay", max_points = 5000, fit = False)
In [6]:
# Group flights by month
grouped = data.group_by("Month")
# Per-month row count plus the sum of the Cancelled indicator column.
bpd = grouped.count().sum("Cancelled").frame
bpd.show()
bpd.describe()
bpd.dim  # NOTE(review): bare expression — its value is discarded when run as a script (only echoed if last in a notebook cell)
# Convert columns to factors (categoricals) so the models below treat them
# as discrete levels rather than numbers.
data["Year"] = data["Year"] .asfactor()
data["Month"] = data["Month"] .asfactor()
data["DayOfWeek"] = data["DayOfWeek"].asfactor()
data["Cancelled"] = data["Cancelled"].asfactor()
In [7]:
# Calculate and plot travel time
# Scheduled times are encoded as hhmm integers (e.g. 1430 == 2:30 pm);
# split into hours/minutes and convert to minutes past midnight.
# NOTE(review): if `/ 100` performs true (float) division on the H2O backend,
# hour1/hour2 keep a fractional part and the computed times are skewed —
# confirm whether floor division (hhmm // 100) was intended.
hour1 = data["CRSArrTime"] / 100
mins1 = data["CRSArrTime"] % 100
arrTime = hour1*60 + mins1
hour2 = data["CRSDepTime"] / 100
mins2 = data["CRSDepTime"] % 100
depTime = hour2*60 + mins2
# TODO: Replace this once list comprehension is supported. See PUBDEV-1286.
# data["TravelTime"] = [x if x > 0 else None for x in (arrTime - depTime)]
# Keep only positive durations; non-positive differences (e.g. overnight
# flights or bad data) become missing values.
data["TravelTime"] = (arrTime-depTime > 0).ifelse((arrTime-depTime), h2o.H2OFrame([[None]]*data.nrow))
scatter_plot(data, "Distance", "TravelTime")
In [8]:
# Impute missing travel times and re-plot
# NOTE(review): the comment above says "travel times" but the code imputes the
# "Distance" column (grouped by Origin/Dest) — confirm which column was meant;
# imputing "TravelTime" would match the narrative.
data.impute(column = "Distance", by = ["Origin", "Dest"])
scatter_plot(data, "Distance", "TravelTime")
In [9]:
# ----------
# 3- Fit a model on train; using test as validation
# NOTE(review): h2o.glm / h2o.gbm / h2o.random_forest / h2o.deeplearning with
# x=<frame>, y=<frame>, validation_x/validation_y is the legacy h2o Python
# wrapper API; newer h2o releases use H2O...Estimator classes with .train().
# Confirm the pinned h2o version still provides these wrappers.
# Create test/train split
s = data["Year"].runif()  # one uniform [0,1) draw per row
train = data[s <= 0.75]   # ~75% of rows
test = data[s > 0.75]     # remaining ~25%
# Set predictor and response variables
myY = "IsDepDelayed"
myX = ["Origin", "Dest", "Year", "UniqueCarrier", "DayOfWeek", "Month", "Distance", "FlightNum"]
# Simple GLM - Predict Delays (binomial = logistic regression,
# with predictors standardized before fitting)
data_glm = h2o.glm(x =train[myX],
y =train[myY],
validation_x=test [myX],
validation_y=test [myY],
family ="binomial",
standardize =True)
# Simple GBM (intentionally tiny: 3 trees, depth 1 — a weak baseline)
data_gbm = h2o.gbm(x =train[myX],
y =train[myY],
validation_x =test [myX],
validation_y =test [myY],
balance_classes=True,
ntrees =3,
max_depth =1,
distribution ="bernoulli",
learn_rate =0.1,
min_rows =2)
# Complex GBM (same settings but 50 trees, depth 5)
data_gbm2 = h2o.gbm(x =train[myX],
y =train[myY],
validation_x =test [myX],
validation_y =test [myY],
balance_classes=True,
ntrees =50,
max_depth =5,
distribution ="bernoulli",
learn_rate =0.1,
min_rows =2)
# Simple Random Forest (5 trees, depth 2)
data_rf = h2o.random_forest(x =train[myX],
y =train[myY],
validation_x =test [myX],
validation_y =test [myY],
ntrees =5,
max_depth =2,
balance_classes=True)
# Complex Random Forest (10 trees, depth 5)
data_rf2 = h2o.random_forest(x =train[myX],
y =train[myY],
validation_x =test [myX],
validation_y =test [myY],
ntrees =10,
max_depth =5,
balance_classes=True)
# Deep Learning with 5 epochs (two hidden layers of 10 units;
# variable_importances=True so importances can be queried later)
data_dl = h2o.deeplearning(x =train[myX],
y =train[myY],
validation_x =test [myX],
validation_y =test [myY],
hidden =[10,10],
epochs =5,
variable_importances=True,
balance_classes =True,
loss ="Automatic")
In [11]:
# Variable importances from each algorithm
# Calculate magnitude of normalized GLM coefficients: for a standardized GLM,
# |coefficient| is a rough proxy for variable importance.
glm_varimp = data_glm.coef_norm()
# Dict comprehension replaces the old `from six import iteritems` loop — it
# behaves identically on Python 2.7+ and 3.x, drops the third-party `six`
# dependency, and avoids mutating the dict while iterating over it.
glm_varimp = {k: abs(v) for k, v in glm_varimp.items()}
# Sort in descending order by magnitude
glm_sorted = sorted(glm_varimp.items(), key = operator.itemgetter(1), reverse = True)
table = tabulate(glm_sorted, headers = ["Predictor", "Normalized Coefficient"], tablefmt = "orgtbl")
print("Variable Importances:\n\n" + table)
# Tree-based models expose their importances directly.
data_gbm.varimp()
data_rf.varimp()
Out[11]:
In [12]:
# Model performance of GBM model on test data
# Scores the complex GBM (data_gbm2: 50 trees, depth 5) against the held-out
# test frame; the returned metrics object is echoed as the cell output.
data_gbm2.model_performance(test)
Out[12]: