In [2]:
import h2o
import pandas
import pprint
import operator
import matplotlib
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from tabulate import tabulate
In [3]:
# Connect to an H2O cluster (starts a local single-node cluster if none is
# already running on the default host/port).
h2o.init()
In [4]:
# set this to True if interactive (matplotlib) plots are desired
interactive = False
if not interactive:
    # Select the non-interactive Agg backend so figures render without a
    # display. NOTE: the `warn` kwarg was deprecated in matplotlib 2.2 and
    # removed in 3.1, so it must no longer be passed.
    matplotlib.use('Agg')
# pyplot must be imported *after* the backend is chosen for use() to take effect.
import matplotlib.pyplot as plt
In [5]:
# NOTE(review): _locate is a private h2o helper that resolves paths inside the
# h2o git checkout; it may move or disappear between h2o versions — confirm
# against the installed h2o release.
from h2o.h2o import _locate # private function. used to find files within h2o git project directory.
# Alternative (larger) datasets, kept for easy switching:
# air_path = [_locate("bigdata/laptop/airlines_all.05p.csv")]
# air_path = [_locate("bigdata/laptop/flights-nyc/flights14.csv.zip")]
air_path = [_locate("smalldata/airlines/allyears2k_headers.zip")]
# ----------
# 1- Load data - 1 row per flight. Has columns showing the origin,
# destination, departure and arrival time, carrier information, and
# whether the flight was delayed.
# Use print() so the script runs under both Python 2 and Python 3.
print("Import and Parse airlines data")
data = h2o.import_file(path=air_path)
data.describe()
In [6]:
# ----------
# 2- Data exploration and munging. Generate scatter plots
# of various columns and plot fitted GLM model.
# NOTE(review): the notebook export stripped all indentation from this
# function; the structure below is reconstructed from the original syntax.
def scatter_plot(data, x, y, max_points=1000, fit=True):
    """Plot column ``y`` against column ``x`` of the H2OFrame ``data``.

    A random subset of at most roughly ``max_points`` rows is pulled into
    pandas for plotting.  When ``fit`` is True, a univariate gaussian GLM is
    trained on the full frame and its fitted line is overlaid on the scatter
    plot.  If ``x`` is a string column a box-and-whisker plot is drawn
    instead.  Plots are only shown when the module-level ``interactive``
    flag is True.
    """
    if fit:
        lr = H2OGeneralizedLinearEstimator(family="gaussian")
        lr.train(x=x, y=y, training_frame=data)
        coeff = lr.coef()
    df = data[[x, y]]
    # Down-sample so at most ~max_points rows are transferred to the client.
    runif = df[y].runif()
    df_subset = df[runif < float(max_points) / data.nrow]
    df_py = h2o.as_list(df_subset)
    if fit:
        h2o.remove(lr._id)  # free the temporary model on the cluster
    # If x variable is string, generate box-and-whisker plot
    if df_py[x].dtype == "object":
        if interactive: df_py.boxplot(column=y, by=x)
    # Otherwise, generate a scatter plot
    else:
        if interactive: df_py.plot(x=x, y=y, kind="scatter")
    if fit:
        # Overlay the fitted regression line between the observed x extremes.
        x_min = min(df_py[x])
        x_max = max(df_py[x])
        y_min = coeff["Intercept"] + coeff[x] * x_min
        y_max = coeff["Intercept"] + coeff[x] * x_max
        plt.plot([x_min, x_max], [y_min, y_max], "k-")
    if interactive: plt.show()

scatter_plot(data, "Distance", "AirTime", fit=True)
scatter_plot(data, "UniqueCarrier", "ArrDelay", max_points=5000, fit=False)
In [7]:
# Group flights by month and count flights / sum cancellations per month.
grouped = data.group_by("Month")
bpd = grouped.count().sum("Cancelled").frame
bpd.show()
bpd.describe()
# A bare `bpd.dim` in the middle of a cell displays nothing (only the last
# expression of a cell is echoed); print it so the dimensions are reported.
print(bpd.dim)
# Convert columns to factors (categoricals) so the models treat them as
# nominal rather than numeric predictors.
data["Year"] = data["Year"].asfactor()
data["Month"] = data["Month"].asfactor()
data["DayOfWeek"] = data["DayOfWeek"].asfactor()
data["Cancelled"] = data["Cancelled"].asfactor()
In [8]:
# Calculate and plot travel time.
# CRSArrTime / CRSDepTime are scheduled times encoded as hhmm integers, so
# dividing by 100 extracts the hour and the remainder gives the minutes;
# both are converted to minutes-since-midnight below.
hour1 = data["CRSArrTime"] / 100
mins1 = data["CRSArrTime"] % 100
arrTime = hour1*60 + mins1
hour2 = data["CRSDepTime"] / 100
mins2 = data["CRSDepTime"] % 100
depTime = hour2*60 + mins2
# TODO: Replace this once list comprehension is supported. See PUBDEV-1286.
# data["TravelTime"] = [x if x > 0 else None for x in (arrTime - depTime)]
# Non-positive differences (presumably flights crossing midnight) are mapped
# to missing values via an H2OFrame of Nones — TODO confirm this handling.
data["TravelTime"] = h2o.ifelse((arrTime-depTime) > 0, (arrTime-depTime), h2o.H2OFrame(python_obj=[[None] * data.nrow]))
scatter_plot(data, "Distance", "TravelTime")
In [9]:
# Impute missing values of "Distance" (grouped by Origin/Dest) and re-plot.
# NOTE(review): the original comment said "Impute missing travel times", but
# the code imputes the "Distance" column — confirm which column was intended.
data.impute(column = "Distance", by = ["Origin", "Dest"])
scatter_plot(data, "Distance", "TravelTime")
In [10]:
# ----------
# 3- Fit a model on train; using test as validation
# Create test/train split (~75% train / ~25% test via a uniform random column).
s = data["Year"].runif()
train = data[s <= 0.75]
test = data[s > 0.75]
# Set predictor and response variables
myY = "IsDepDelayed"
myX = ["Origin", "Dest", "Year", "UniqueCarrier", "DayOfWeek", "Month", "Distance", "FlightNum"]

def _fit(model):
    """Train `model` on the shared predictors/response and train/test split, and return it."""
    model.train(x=myX, y=myY, training_frame=train, validation_frame=test)
    return model

# Simple GLM - Predict Delays
data_glm = _fit(H2OGeneralizedLinearEstimator(family="binomial", standardize=True))
# Simple GBM (shallow, few trees)
data_gbm = _fit(H2OGradientBoostingEstimator(balance_classes=True, ntrees=3, max_depth=1,
                                             distribution="bernoulli", learn_rate=0.1, min_rows=2))
# Complex GBM (deeper, more trees)
data_gbm2 = _fit(H2OGradientBoostingEstimator(balance_classes=True, ntrees=50, max_depth=5,
                                              distribution="bernoulli", learn_rate=0.1, min_rows=2))
# Simple Random Forest
data_rf = _fit(H2ORandomForestEstimator(ntrees=5, max_depth=2, balance_classes=True))
# Complex Random Forest
data_rf2 = _fit(H2ORandomForestEstimator(ntrees=10, max_depth=5, balance_classes=True))
# Deep Learning with 5 epochs
data_dl = _fit(H2ODeepLearningEstimator(hidden=[10, 10], epochs=5, variable_importances=True,
                                        balance_classes=True, loss="Automatic"))
In [11]:
# Variable importances from each algorithm
# Calculate magnitude of normalized GLM coefficients.
# Use .items() instead of the Python-2-only .iteritems() so this runs on
# both Python 2 and Python 3.
glm_varimp = {k: abs(v) for k, v in data_glm.coef_norm().items()}
# Sort in descending order by magnitude
glm_sorted = sorted(glm_varimp.items(), key=operator.itemgetter(1), reverse=True)
table = tabulate(glm_sorted, headers=["Predictor", "Normalized Coefficient"], tablefmt="orgtbl")
# print() works on both Python 2 (single argument) and Python 3.
print("Variable Importances:\n\n" + table)
data_gbm.varimp()
data_rf.varimp()
In [12]:
# Model performance of the complex GBM model on the held-out test data.
# As the last expression in the cell, its metrics are displayed in the notebook.
data_gbm2.model_performance(test)
Out[12]: