In [1]:
import h2o
import pandas
import pprint
import operator
import matplotlib.pyplot as plt
from tabulate import tabulate
In [2]:
# Connect to an H2O cluster (starts a local one if none is already running)
h2o.init()
In [3]:
# Alternative, larger datasets (kept for provenance; require the bigdata checkout):
# air_path = [h2o.locate("bigdata/laptop/airlines_all.05p.csv")]
# air_path = [h2o.locate("bigdata/laptop/flights-nyc/flights14.csv.zip")]
air_path = [h2o.locate("smalldata/airlines/allyears2k_headers.zip")]
# ----------
# 1- Load data - 1 row per flight. Has columns showing the origin,
# destination, departure and arrival time, carrier information, and
# whether the flight was delayed.
# print(...) with a single string argument behaves identically under
# Python 2 and 3, unlike the Python-2-only `print "..."` statement.
print("Import and Parse airlines data")
data = h2o.import_frame(path=air_path)
data.describe()
In [4]:
# ----------
# 2- Data exploration and munging. Generate scatter plots
# of various columns and plot fitted GLM model.
def scatter_plot(data, x, y, max_points = 1000, fit = True):
    """Plot column y against column x on a random subsample of `data`.

    Parameters:
      data       -- H2O frame containing columns `x` and `y`
      x, y       -- column names (strings)
      max_points -- approximate number of rows to sample for plotting
      fit        -- when True, fit a one-predictor gaussian GLM of y on x
                    and overlay the fitted line on the scatter plot

    Side effects: renders a matplotlib figure via plt.show(); when
    fit=True, builds (and afterwards removes) a temporary GLM model on
    the cluster.
    """
    if fit:
        # One-predictor gaussian GLM gives the least-squares line y ~ x
        lr = h2o.glm(x = data[[x]], y = data[y], family = "gaussian")
        coeff = lr.coef()
    df = data[[x, y]]
    # Uniform random column used to down-sample to roughly max_points rows
    runif = df[y].runif()
    df_subset = df[runif < float(max_points)/data.nrow()]
    # Pull only the small subset into a local pandas frame for plotting
    df_py = h2o.as_list(df_subset)
    if fit: h2o.remove(lr._key)  # coefficients already copied; free cluster memory
    # If x variable is string, generate box-and-whisker plot
    if df_py[x].dtype == "object":
        df_py.boxplot(column = y, by = x)
    # Otherwise, generate a scatter plot
    else:
        df_py.plot(x = x, y = y, kind = "scatter")
    if fit:
        # Overlay the fitted regression line across the sampled x-range
        x_min = min(df_py[x])
        x_max = max(df_py[x])
        y_min = coeff["Intercept"] + coeff[x]*x_min
        y_max = coeff["Intercept"] + coeff[x]*x_max
        plt.plot([x_min, x_max], [y_min, y_max], "k-")
    plt.show()
# Numeric x ("Distance") yields a scatter plot with a fitted GLM line;
# string-typed x ("UniqueCarrier") yields a box-and-whisker plot instead.
scatter_plot(data, "Distance", "AirTime", fit = True)
scatter_plot(data, "UniqueCarrier", "ArrDelay", max_points = 5000, fit = False)
In [5]:
# Group flights by month: count rows and sum cancellations per month.
# Each aggregates entry is [aggregation op, column index, rows to include].
aggregates = {"Month": ["nrow", 0, "all"], "Cancelled": ["sum", 1, "all"]}
bpd = h2o.group_by(data, cols=["Month"], aggregates=aggregates)
bpd.show()
bpd.describe()
bpd.dim()
# Convert columns to factors so downstream models treat these numeric
# codes as discrete categorical levels rather than continuous values.
data["Year"] = data["Year"] .asfactor()
data["Month"] = data["Month"] .asfactor()
data["DayOfWeek"] = data["DayOfWeek"].asfactor()
data["Cancelled"] = data["Cancelled"].asfactor()
In [6]:
# Calculate and plot travel time.
# CRSArrTime / CRSDepTime are scheduled times encoded as HHMM integers
# (e.g. 1430 = 2:30pm); split them into hours and minutes and convert
# to minutes-past-midnight.
# NOTE(review): assumes `/ 100` on these H2O columns floors to whole
# hours (integer division) -- confirm; otherwise hour1 keeps a fraction.
hour1 = data["CRSArrTime"] / 100
mins1 = data["CRSArrTime"] % 100
arrTime = hour1*60 + mins1
hour2 = data["CRSDepTime"] / 100
mins2 = data["CRSDepTime"] % 100
depTime = hour2*60 + mins2
# Keep only positive durations; non-positive (e.g. overnight arrivals)
# become missing values.
# TODO: Replace this once list comprehension is supported. See PUBDEV-1286.
# data["TravelTime"] = [x if x > 0 else None for x in (arrTime - depTime)]
data["TravelTime"] = h2o.ifelse((arrTime-depTime) > 0, (arrTime-depTime), None)[0]
scatter_plot(data, "Distance", "TravelTime")
In [7]:
# Impute missing values and re-plot.
# NOTE(review): the code imputes "Distance" (grouped by Origin/Dest route
# means), not "TravelTime" as the original comment claimed -- confirm
# which column was intended.
h2o.impute(data = data, column = "Distance", by = ["Origin", "Dest"])
scatter_plot(data, "Distance", "TravelTime")
In [8]:
# ----------
# 3- Fit a model on train; using test as validation

# Random ~75/25 train/test split driven by a uniform column
s = data["Year"].runif()
train = data[s <= 0.75]
test = data[s > 0.75]

# Response and predictor columns shared by all models below
myY = "IsDepDelayed"
myX = ["Origin", "Dest", "Year", "UniqueCarrier", "DayOfWeek", "Month", "Distance", "FlightNum"]

# Simple GLM - Predict Delays (binomial logistic regression,
# standardized predictors, test split used as validation)
data_glm = h2o.glm(x=train[myX], y=train[myY],
                   validation_x=test[myX], validation_y=test[myY],
                   family="binomial", standardize=True)
# Simple GBM: a quick baseline with 3 stumps (depth-1 trees)
data_gbm = h2o.gbm(x=train[myX], y=train[myY],
                   validation_x=test[myX], validation_y=test[myY],
                   balance_classes=True, distribution="bernoulli",
                   ntrees=3, max_depth=1,
                   learn_rate=0.1, min_rows=2)

# Complex GBM: 50 deeper trees, otherwise the same settings
data_gbm2 = h2o.gbm(x=train[myX], y=train[myY],
                    validation_x=test[myX], validation_y=test[myY],
                    balance_classes=True, distribution="bernoulli",
                    ntrees=50, max_depth=5,
                    learn_rate=0.1, min_rows=2)
# Simple Random Forest: 5 shallow trees as a baseline
data_rf = h2o.random_forest(x=train[myX], y=train[myY],
                            validation_x=test[myX], validation_y=test[myY],
                            ntrees=5, max_depth=2,
                            balance_classes=True)

# Complex Random Forest: more and deeper trees
data_rf2 = h2o.random_forest(x=train[myX], y=train[myY],
                             validation_x=test[myX], validation_y=test[myY],
                             ntrees=10, max_depth=5,
                             balance_classes=True)
# Deep Learning: two hidden layers of 10 units, 5 epochs,
# with per-variable importances computed for the comparison below
data_dl = h2o.deeplearning(x=train[myX], y=train[myY],
                           validation_x=test[myX], validation_y=test[myY],
                           hidden=[10, 10], epochs=5,
                           variable_importances=True,
                           balance_classes=True,
                           loss="Automatic")
In [19]:
# Variable importances from each algorithm.
# For GLM, use the magnitude of the normalized coefficients as a proxy
# for predictor importance.
glm_varimp = data_glm.coef_norm()
# Dict comprehension replaces the Python-2-only iteritems() loop and is
# equivalent under both Python 2.7 and 3.
glm_varimp = {k: abs(v) for k, v in glm_varimp.items()}
# Sort in descending order by magnitude
glm_sorted = sorted(glm_varimp.items(), key = operator.itemgetter(1), reverse = True)
table = tabulate(glm_sorted, headers = ["Predictor", "Normalized Coefficient"], tablefmt = "orgtbl")
print("Variable Importances:\n\n" + table)
# Tree-based models report their own importance tables
data_gbm.varimp()
data_rf.varimp()
In [20]:
# Performance metrics of the complex GBM on the held-out test split
data_gbm2.model_performance(test)
Out[20]:
In [ ]: