In [1]:
import h2o
import pandas
import pprint
import operator
import matplotlib
from tabulate import tabulate


/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/pandas/__init__.py:7: DeprecationWarning: bad escape \s
  from pandas import hashtable, tslib, lib

In [2]:
# Connect to a cluster. Called with no arguments, so the H2O client's default
# connection settings are used; every later cell talks to this cluster.
h2o.init()


H2O cluster uptime: 5 minutes 25 seconds 443 milliseconds
H2O cluster version: 3.7.0.99999
H2O cluster name: spIdea
H2O cluster total nodes: 1
H2O cluster total free memory: 12.12 GB
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster healthy: True
H2O Connection ip: 127.0.0.1
H2O Connection port: 54321
H2O Connection proxy: None
Python Version: 3.5.0

In [3]:
# Set this to True if interactive (matplotlib) plots are desired.
interactive = False
if not interactive:
    # Headless run: pick the Agg backend *before* pyplot is imported below.
    matplotlib.use('Agg', warn=False)
import matplotlib.pyplot as plt

In [4]:
from h2o.utils.shared_utils import _locate # private function. used to find files within h2o git project directory.
# Alternative datasets: keep exactly one `air_path` assignment active.
# air_path = [_locate("bigdata/laptop/airlines_all.05p.csv")]
# air_path = [_locate("bigdata/laptop/flights-nyc/flights14.csv.zip")]
air_path = [_locate("smalldata/airlines/allyears2k_headers.zip")]

# ----------

# 1- Load data - 1 row per flight.  Has columns showing the origin,
# destination, departure and arrival time, carrier information, and
# whether the flight was delayed.
print("Import and Parse airlines data")
# `air_path` is a one-element list of paths; the parsed frame is bound to
# `data`, which every later cell reads (and some modify).
data = h2o.import_file(path=air_path)
# Print a per-column summary of the parsed frame.
data.describe()


Import and Parse airlines data

Parse Progress: [##################################################] 100%
Rows:43,978 Cols:31

Chunk compression summary: 
chunk_type chunk_name count count_percentage size size_percentage
C0L Constant Integers 10 5.376344 800 B 0.0504024
C0D Constant Reals 23 12.365591 1.8 KB 0.1159254
CBS Bits 2 1.0752689 2.0 KB 0.1272030
CX0 Sparse Bits 10 5.376344 1.9 KB 0.1247459
C1 1-Byte Integers 40 21.505377 287.8 KB 18.564957
C1N 1-Byte Integers (w/o NAs) 19 10.215054 133.1 KB 8.58617
C1S 1-Byte Fractions 6 3.2258065 43.4 KB 2.8024976
C2 2-Byte Integers 76 40.860214 1.1 MB 69.628105
Frame distribution summary: 
size number_of_rows number_of_chunks_per_column number_of_chunks
172.16.2.84:54321 1.5 MB 43978.0 6.0 186.0
mean 1.5 MB 43978.0 6.0 186.0
min 1.5 MB 43978.0 6.0 186.0
max 1.5 MB 43978.0 6.0 186.0
stddev 0 B 0.0 0.0 0.0
total 1.5 MB 43978.0 6.0 186.0

Year Month DayofMonth DayOfWeek DepTime CRSDepTime ArrTime CRSArrTime UniqueCarrier FlightNum TailNum ActualElapsedTime CRSElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted CarrierDelay WeatherDelay NASDelay SecurityDelay LateAircraftDelay IsArrDelayed IsDepDelayed
type int int int int int int int int enum int enum int int int int int enum enum int int int int enum int int int int int int enum enum
mins 1987.0 1.0 1.0 1.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0 16.0 17.0 14.0 -63.0 -16.0 0.0 0.0 11.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
mean 1997.5 1.40909090909090914.6010732639046793.820614852880991 1345.84666138207631313.22286143071641504.63413037888841485.289167310927 NaN 818.8429896766577NaN 124.8145291354043 125.02156260661899114.316111090782779.317111936984313 10.0073906556001 NaN NaN 730.18219056505015.38136805953062814.1686341847320560.024694165264450407NaN 0.00247851198326435934.0478002910556270.28937646927124174.855031904175534 0.0170155602821000967.620060450016789 0.555755150302424 0.5250579835372226
maxs 2008.0 10.0 31.0 7.0 2400.0 2359.0 2400.0 2359.0 9.0 3949.0 3500.0 475.0 437.0 402.0 475.0 473.0 131.0 133.0 3365.0 128.0 254.0 1.0 3.0 1.0 369.0 201.0 323.0 14.0 373.0 1.0 1.0
sigma 6.3443609017111771.8747113713439639.175790425861443 1.9050131191328936465.340899124234 476.25113999259946484.34748790351614492.75043412270094NaN 777.4043691636349NaN 73.97444166059017 73.4015946300093 69.63632951506109 29.84022196241484826.438809042916454NaN NaN 578.438008230424 4.2019799398648289.905085747204327 0.15519314135784237 NaN 0.049723487218862286 16.205729904484234.416779898734124 18.6197762214756820.40394018210151184 23.487565874106213 0.49688728834288370.49937738031758017
zeros 0 0 0 0 0 569 0 569 724 0 2 0 0 -8878 1514 6393 59 172 0 -8255 -8321 42892 81 43869 -23296 -21800 -23252 -21726 -23500 19537 20887
missing0 0 0 0 1086 0 1195 0 0 0 32 1195 13 16649 1195 1086 0 0 35 16026 16024 0 9774 0 35045 35045 35045 35045 35045 0 0
0 1987.0 10.0 14.0 3.0 741.0 730.0 912.0 849.0 PS 1451.0 NA 91.0 79.0 nan 23.0 11.0 SAN SFO 447.0 nan nan 0.0 NA 0.0 nan nan nan nan nan YES YES
1 1987.0 10.0 15.0 4.0 729.0 730.0 903.0 849.0 PS 1451.0 NA 94.0 79.0 nan 14.0 -1.0 SAN SFO 447.0 nan nan 0.0 NA 0.0 nan nan nan nan nan YES NO
2 1987.0 10.0 17.0 6.0 741.0 730.0 918.0 849.0 PS 1451.0 NA 97.0 79.0 nan 29.0 11.0 SAN SFO 447.0 nan nan 0.0 NA 0.0 nan nan nan nan nan YES YES
3 1987.0 10.0 18.0 7.0 729.0 730.0 847.0 849.0 PS 1451.0 NA 78.0 79.0 nan -2.0 -1.0 SAN SFO 447.0 nan nan 0.0 NA 0.0 nan nan nan nan nan NO NO
4 1987.0 10.0 19.0 1.0 749.0 730.0 922.0 849.0 PS 1451.0 NA 93.0 79.0 nan 33.0 19.0 SAN SFO 447.0 nan nan 0.0 NA 0.0 nan nan nan nan nan YES YES
5 1987.0 10.0 21.0 3.0 728.0 730.0 848.0 849.0 PS 1451.0 NA 80.0 79.0 nan -1.0 -2.0 SAN SFO 447.0 nan nan 0.0 NA 0.0 nan nan nan nan nan NO NO
6 1987.0 10.0 22.0 4.0 728.0 730.0 852.0 849.0 PS 1451.0 NA 84.0 79.0 nan 3.0 -2.0 SAN SFO 447.0 nan nan 0.0 NA 0.0 nan nan nan nan nan YES NO
7 1987.0 10.0 23.0 5.0 731.0 730.0 902.0 849.0 PS 1451.0 NA 91.0 79.0 nan 13.0 1.0 SAN SFO 447.0 nan nan 0.0 NA 0.0 nan nan nan nan nan YES YES
8 1987.0 10.0 24.0 6.0 744.0 730.0 908.0 849.0 PS 1451.0 NA 84.0 79.0 nan 19.0 14.0 SAN SFO 447.0 nan nan 0.0 NA 0.0 nan nan nan nan nan YES YES
9 1987.0 10.0 25.0 7.0 729.0 730.0 851.0 849.0 PS 1451.0 NA 82.0 79.0 nan 2.0 -1.0 SAN SFO 447.0 nan nan 0.0 NA 0.0 nan nan nan nan nan YES NO

In [5]:
# ----------

# 2- Data exploration and munging. Generate scatter plots 
# of various columns and plot fitted GLM model.

# Function to fit a GLM model and plot the fitted (x,y) values
def scatter_plot(data, x, y, max_points = 1000, fit = True):
    """Plot a random sample of (x, y) rows and, optionally, a fitted GLM line.

    Parameters
    ----------
    data : H2OFrame
        Frame containing both columns.
    x : str
        Name of the predictor column (horizontal axis).
    y : str
        Name of the response column (vertical axis).
    max_points : int
        Target sample size. Rows are kept when a uniform draw falls below
        ``max_points / data.nrow``, so the actual count is only approximate.
    fit : bool
        When True, fit a gaussian GLM of ``y`` on ``x`` and overlay the fitted
        line. The line is only drawn for a numeric ``x``.

    Notes
    -----
    Reads the notebook-level ``interactive`` flag: the sample is rendered and
    shown only when it is True. The figure is always closed before returning,
    so repeated calls never draw onto a shared, implicit figure.
    """
    coeff = None
    if fit:
        lr = h2o.glm(x = data[[x]], y = data[y], family = "gaussian")
        coeff = lr.coef()
        # Only the coefficients are needed; drop the model from the cluster.
        h2o.remove(lr._id)

    # Down-sample on the cluster, then pull the small subset into pandas.
    df = data[[x, y]]
    runif = df[y].runif()
    df_subset = df[runif < float(max_points)/data.nrow]
    df_py = h2o.as_list(df_subset)

    x_is_categorical = df_py[x].dtype == "object"

    # Explicit figure/axes so everything lands on the same, fresh axes.
    fig, ax = plt.subplots()
    try:
        if x_is_categorical:
            # String x: box-and-whisker plot of y per level. A straight fitted
            # line has no meaning here, so none is drawn even if fit=True.
            if interactive: df_py.boxplot(column = y, by = x, ax = ax)
        else:
            if interactive: df_py.plot(x = x, y = y, kind = "scatter", ax = ax)
            if fit:
                # Series.min()/max() skip NaN; the builtin min()/max() can
                # return NaN when the sample contains missing values, which
                # would make the fitted line silently vanish.
                x_min = df_py[x].min()
                x_max = df_py[x].max()
                y_min = coeff["Intercept"] + coeff[x]*x_min
                y_max = coeff["Intercept"] + coeff[x]*x_max
                ax.plot([x_min, x_max], [y_min, y_max], "k-")
        if interactive: plt.show()
    finally:
        # Release the figure so repeated calls do not accumulate state.
        plt.close(fig)

# Numeric predictor: sample of Distance vs AirTime with the fitted GLM line.
scatter_plot(data, "Distance", "AirTime", fit = True)
# String predictor: ArrDelay per UniqueCarrier on a larger sample, without a fit.
scatter_plot(data, "UniqueCarrier", "ArrDelay", max_points = 5000, fit = False)


glm Model Build Progress: [##################################################] 100%
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/ipykernel/__main__.py:9: DeprecationWarning: `h2o.glm` is deprecated. Use the estimators sub module to build an H2OGeneralizedLinearEstimator.

In [6]:
# Group flights by month: row count and number of cancellations per month
grouped = data.group_by("Month")
bpd = grouped.count().sum("Cancelled").frame
bpd.show()
bpd.describe()
# Only a cell's last expression is echoed, so a bare `bpd.dim` here would be
# evaluated and thrown away; print it explicitly.
print(bpd.dim)

# Convert columns to factors so the models below treat them as categorical
data["Year"]      = data["Year"]     .asfactor()
data["Month"]     = data["Month"]    .asfactor()
data["DayOfWeek"] = data["DayOfWeek"].asfactor()
data["Cancelled"] = data["Cancelled"].asfactor()


Month nrow_Year sum_Cancelled
1 41979 1067
10 1999 19
Rows:2 Cols:3

Chunk compression summary: 
chunk_type chunk_name count count_percentage size size_percentage
C1N 1-Byte Integers (w/o NAs) 1 33.333336 70 B 30.434782
C2 2-Byte Integers 1 33.333336 72 B 31.304348
C2S 2-Byte Fractions 1 33.333336 88 B 38.260868
Frame distribution summary: 
size number_of_rows number_of_chunks_per_column number_of_chunks
172.16.2.84:54321 230 B 2.0 1.0 3.0
mean 230 B 2.0 1.0 3.0
min 230 B 2.0 1.0 3.0
max 230 B 2.0 1.0 3.0
stddev 0 B 0.0 0.0 0.0
total 230 B 2.0 1.0 3.0

Month nrow_Year sum_Cancelled
type int int int
mins 1.0 1999.0 19.0
mean 5.5 21989.0 543.0
maxs 10.0 41979.0 1067.0
sigma 6.36396103067892828270.12911183817741.0479066835018
zeros 0 0 0
missing0 0 0
0 1.0 41979.0 1067.0
1 10.0 1999.0 19.0

In [7]:
# Calculate and plot travel time
# CRSArrTime / CRSDepTime are HHMM-encoded integers (e.g. 849 == 08:49), so the
# hour is the *integer* quotient by 100.  True division would give 8.49 "hours"
# and inflate the result (8.49*60 + 49 = 558.4 instead of 8*60 + 49 = 529).
hour1 = data["CRSArrTime"] // 100
mins1 = data["CRSArrTime"] % 100
arrTime = hour1*60 + mins1

hour2 = data["CRSDepTime"] // 100
mins2 = data["CRSDepTime"] % 100
depTime = hour2*60 + mins2

# Scheduled minutes between departure and arrival; build the expression once.
travel = arrTime - depTime

# TODO: Replace this once list comprehension is supported. See PUBDEV-1286.
# data["TravelTime"] = [x if x > 0 else None for x in (arrTime - depTime)]
# Non-positive differences are replaced with a missing value.
data["TravelTime"] = (travel > 0).ifelse(travel, h2o.H2OFrame([[None]]*data.nrow))
scatter_plot(data, "Distance", "TravelTime")


Parse Progress: [##################################################] 100%

glm Model Build Progress: [##################################################] 100%
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/ipykernel/__main__.py:9: DeprecationWarning: `h2o.glm` is deprecated. Use the estimators sub module to build an H2OGeneralizedLinearEstimator.

In [8]:
# Impute missing Distance values, grouped by (Origin, Dest), and re-plot.
# NOTE(review): the original comment said "travel times", but the column passed
# to impute() is "Distance"; TravelTime itself is not imputed here. Confirm
# which column was intended.
data.impute(column = "Distance", by = ["Origin", "Dest"])
scatter_plot(data, "Distance", "TravelTime")


glm Model Build Progress: [##################################################] 100%
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/ipykernel/__main__.py:9: DeprecationWarning: `h2o.glm` is deprecated. Use the estimators sub module to build an H2OGeneralizedLinearEstimator.

In [9]:
# ----------
# 3- Fit a model on train; using test as validation

# Create test/train split (roughly 75% / 25%).
# The uniform draw is seeded so the same rows land in train and test on every
# Restart & Run All; without a seed each run produced a different split and
# therefore different metrics.  (The models below are not seeded, so their
# own internal randomness is unchanged.)
SPLIT_SEED = 1234
s = data["Year"].runif(seed=SPLIT_SEED)
train = data[s <= 0.75]
test  = data[s > 0.75]

# Set predictor and response variables
myY = "IsDepDelayed"
myX = ["Origin", "Dest", "Year", "UniqueCarrier", "DayOfWeek", "Month", "Distance", "FlightNum"]

# Simple GLM - Predict Delays
data_glm = h2o.glm(x           =train[myX],
                   y           =train[myY],
                   validation_x=test [myX],
                   validation_y=test [myY],
                   family      ="binomial",
                   standardize =True)

# Simple GBM
data_gbm = h2o.gbm(x              =train[myX],
                   y              =train[myY],
                   validation_x   =test [myX],
                   validation_y   =test [myY],
                   balance_classes=True,
                   ntrees         =3,
                   max_depth      =1,
                   distribution   ="bernoulli",
                   learn_rate     =0.1,
                   min_rows       =2)

# Complex GBM
data_gbm2 = h2o.gbm(x              =train[myX],
                    y              =train[myY],
                    validation_x   =test [myX],
                    validation_y   =test [myY],
                    balance_classes=True,
                    ntrees         =50,
                    max_depth      =5,
                    distribution   ="bernoulli",
                    learn_rate     =0.1,
                    min_rows       =2)

# Simple Random Forest
data_rf = h2o.random_forest(x              =train[myX],
                            y              =train[myY],
                            validation_x   =test [myX],
                            validation_y   =test [myY],
                            ntrees         =5,
                            max_depth      =2,
                            balance_classes=True)

# Complex Random Forest
data_rf2 = h2o.random_forest(x              =train[myX],
                             y              =train[myY],
                             validation_x   =test [myX],
                             validation_y   =test [myY],
                             ntrees         =10,
                             max_depth      =5,
                             balance_classes=True)

# Deep Learning with 5 epochs
data_dl = h2o.deeplearning(x                   =train[myX],
                           y                   =train[myY],
                           validation_x        =test [myX],
                           validation_y        =test [myY],
                           hidden              =[10,10],
                           epochs              =5,
                           variable_importances=True,
                           balance_classes     =True,
                           loss                ="Automatic")


glm Model Build Progress: [##################################################] 100%
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/ipykernel/__main__.py:19: DeprecationWarning: `h2o.glm` is deprecated. Use the estimators sub module to build an H2OGeneralizedLinearEstimator.
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/ipykernel/__main__.py:31: DeprecationWarning: `h2o.gbm` is deprecated. Use the estimators sub module to build an H2OGradientBoostedEstimator.

gbm Model Build Progress: [##################################################] 100%

gbm Model Build Progress: [##################################################] 100%
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/ipykernel/__main__.py:43: DeprecationWarning: `h2o.gbm` is deprecated. Use the estimators sub module to build an H2OGradientBoostedEstimator.
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/ipykernel/__main__.py:52: DeprecationWarning: `h2o.random_forest` is deprecated. Use the estimators sub module to build an H2ORandomForestEstimator.

drf Model Build Progress: [##################################################] 100%

drf Model Build Progress: [##################################################] 100%
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/ipykernel/__main__.py:61: DeprecationWarning: `h2o.random_forest` is deprecated. Use the estimators sub module to build an H2ORandomForestEstimator.
/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/ipykernel/__main__.py:72: DeprecationWarning: `h2o.deeplearning` is deprecated. Use the estimators sub module to build an H2ODeepLearningEstimator.

deeplearning Model Build Progress: [##################################################] 100%

In [11]:
# Variable importances from each algorithm
# Calculate magnitude of normalized GLM coefficients
from six import iteritems
# Build a new mapping rather than rewriting the dict while iterating over it.
glm_varimp = {name: abs(coef) for name, coef in iteritems(data_glm.coef_norm())}

# Sort in descending order by magnitude
glm_sorted = sorted(glm_varimp.items(), key = operator.itemgetter(1), reverse = True)
table = tabulate(glm_sorted, headers = ["Predictor", "Normalized Coefficient"], tablefmt = "orgtbl")
print("Variable Importances:\n\n" + table)

# Only the last expression of a cell is displayed, so the GBM importances have
# to be printed explicitly; previously their value was computed and discarded.
pprint.pprint(data_gbm.varimp())
data_rf.varimp()


Variable Importances:

| Predictor        |   Normalized Coefficient |
|------------------+--------------------------|
| Year.2008        |               2.1663     |
| Dest.HTS         |               1.59911    |
| Year.2003        |               1.59565    |
| Origin.MDW       |               1.58362    |
| Year.2007        |               1.37479    |
| Origin.HPN       |               1.34354    |
| Origin.LIH       |               1.32598    |
| Dest.LYH         |               1.29275    |
| Origin.LBB       |               1.21984    |
| Origin.LEX       |               1.21291    |
| Origin.ERI       |               1.20959    |
| Origin.TLH       |               1.17343    |
| Origin.CAE       |               1.15044    |
| UniqueCarrier.HP |               1.12944    |
| Origin.PSP       |               1.11685    |
| Origin.HNL       |               1.11194    |
| Origin.TRI       |               1.02187    |
| UniqueCarrier.TW |               1.0169     |
| Year.2001        |               0.979973   |
| Year.2002        |               0.944374   |
| Origin.SDF       |               0.939753   |
| Origin.ATL       |               0.935832   |
| Origin.GRR       |               0.884671   |
| Origin.PBI       |               0.882257   |
| Origin.CHO       |               0.878584   |
| Origin.OGG       |               0.864754   |
| Origin.SRQ       |               0.856535   |
| Year.2004        |               0.846669   |
| Origin.MYR       |               0.835173   |
| Origin.ACY       |               0.804102   |
| Origin.ORD       |               0.787865   |
| Year.1994        |               0.781128   |
| Origin.MAF       |               0.766548   |
| Origin.TUL       |               0.765077   |
| Origin.MRY       |               0.759124   |
| Year.2006        |               0.749834   |
| Origin.STL       |               0.737706   |
| Origin.LYH       |               0.728328   |
| Dest.CHO         |               0.728328   |
| Origin.CMH       |               0.703809   |
| Dest.GSO         |               0.694797   |
| Origin.BTV       |               0.678703   |
| Origin.ROA       |               0.672739   |
| Dest.ISP         |               0.666122   |
| Dest.LIH         |               0.647256   |
| Origin.AUS       |               0.646233   |
| Origin.IAH       |               0.637049   |
| Dest.FLL         |               0.624057   |
| Origin.MLB       |               0.611271   |
| Dest.PBI         |               0.609092   |
| Origin.PIT       |               0.604604   |
| Origin.PWM       |               0.603332   |
| Dest.ICT         |               0.601697   |
| Year.1996        |               0.601507   |
| Origin.TYS       |               0.590041   |
| Origin.MSY       |               0.587653   |
| Year.1990        |               0.564752   |
| Dest.DAY         |               0.564026   |
| Origin.SYR       |               0.560879   |
| Dest.IAH         |               0.553572   |
| Dest.EUG         |               0.54793    |
| Origin.JAX       |               0.542031   |
| Origin.BOI       |               0.541044   |
| Dest.TOL         |               0.528751   |
| Dest.TPA         |               0.51248    |
| Dest.BUF         |               0.512192   |
| Dest.PSP         |               0.508527   |
| Origin.ALB       |               0.506946   |
| Origin.SAV       |               0.50483    |
| Origin.CRW       |               0.504431   |
| Dest.PNS         |               0.503218   |
| UniqueCarrier.CO |               0.499991   |
| Dest.SFO         |               0.499403   |
| Origin.PHL       |               0.498516   |
| Year.1997        |               0.492557   |
| Origin.OKC       |               0.491762   |
| Origin.LGA       |               0.488253   |
| Origin.MIA       |               0.480325   |
| Origin.OMA       |               0.477082   |
| Dest.CHS         |               0.475901   |
| Dest.CAK         |               0.473522   |
| Origin.FLL       |               0.469294   |
| Origin.ICT       |               0.464117   |
| Dest.GEG         |               0.461246   |
| Origin.EGE       |               0.461207   |
| Dest.ABQ         |               0.461191   |
| Dest.EYW         |               0.452089   |
| Year.2005        |               0.45045    |
| Dest.IND         |               0.449927   |
| UniqueCarrier.WN |               0.446792   |
| Origin.IND       |               0.446311   |
| Origin.GSO       |               0.442529   |
| Origin.MCO       |               0.434966   |
| Origin.LAX       |               0.433672   |
| Origin.BDL       |               0.418545   |
| Dest.CAE         |               0.414453   |
| Dest.SMF         |               0.409427   |
| Origin.CRP       |               0.403216   |
| Origin.DFW       |               0.399445   |
| Dest.BDL         |               0.395146   |
| Dest.CVG         |               0.391672   |
| Dest.UCA         |               0.39075    |
| Origin.DSM       |               0.387103   |
| Origin.MEM       |               0.383554   |
| Origin.EYW       |               0.375727   |
| Dest.CLE         |               0.372843   |
| Dest.FAT         |               0.369287   |
| UniqueCarrier.PI |               0.366404   |
| Origin.SLC       |               0.354344   |
| Origin.JFK       |               0.34159    |
| Origin.BWI       |               0.339737   |
| Dest.MIA         |               0.338326   |
| Origin.ROC       |               0.328992   |
| Origin.OAK       |               0.327167   |
| Dest.BGM         |               0.323214   |
| Origin.IAD       |               0.320497   |
| Dest.JAX         |               0.319508   |
| Dest.MKE         |               0.31828    |
| Year.1992        |               0.31714    |
| Dest.MCO         |               0.315641   |
| Dest.FAY         |               0.315447   |
| Dest.COS         |               0.314929   |
| Origin.RNO       |               0.314859   |
| Origin.MCI       |               0.313843   |
| Dest.SAT         |               0.305571   |
| Year.1995        |               0.29602    |
| Origin.SAN       |               0.292782   |
| Dest.OGG         |               0.281564   |
| Year.1991        |               0.274708   |
| Dest.BUR         |               0.270584   |
| Dest.ALB         |               0.268558   |
| Dest.TUL         |               0.26762    |
| Origin.DAY       |               0.264843   |
| Origin.BUR       |               0.264689   |
| Origin.CLT       |               0.256984   |
| Origin.ONT       |               0.256321   |
| Origin.MKE       |               0.254529   |
| Origin.HRL       |               0.253809   |
| DayOfWeek.5      |               0.244342   |
| UniqueCarrier.US |               0.239344   |
| Dest.BTV         |               0.23824    |
| Origin.ABE       |               0.234584   |
| Origin.TPA       |               0.22891    |
| Dest.STT         |               0.225113   |
| Origin.STX       |               0.223986   |
| Dest.GSP         |               0.221914   |
| Origin.BHM       |               0.219408   |
| Dest.IAD         |               0.219399   |
| Origin.BOS       |               0.21936    |
| Origin.MDT       |               0.217089   |
| Dest.PVD         |               0.21636    |
| Dest.RSW         |               0.208373   |
| Origin.ELP       |               0.207048   |
| Origin.DEN       |               0.205402   |
| Dest.LIT         |               0.204071   |
| Month.10         |               0.203185   |
| Year.1987        |               0.203185   |
| Dest.BWI         |               0.202309   |
| Origin.MSP       |               0.201702   |
| Dest.PDX         |               0.201547   |
| Dest.ROC         |               0.199012   |
| Origin.TUS       |               0.197624   |
| Dest.KOA         |               0.197388   |
| Dest.CLT         |               0.191233   |
| Dest.OAJ         |               0.188976   |
| Year.1999        |               0.186221   |
| Origin.SJC       |               0.182876   |
| Dest.DAL         |               0.179589   |
| Origin.BUF       |               0.178246   |
| DayOfWeek.2      |               0.17761    |
| Origin.DAL       |               0.175027   |
| Origin.CLE       |               0.173502   |
| Dest.GRR         |               0.169856   |
| Dest.PWM         |               0.16768    |
| UniqueCarrier.AA |               0.167342   |
| Year.1993        |               0.166087   |
| Dest.RNO         |               0.165744   |
| Distance         |               0.163211   |
| Dest.LBB         |               0.157175   |
| Dest.HRL         |               0.156284   |
| Dest.ABE         |               0.155532   |
| Dest.CMH         |               0.154857   |
| Dest.CRP         |               0.151555   |
| Dest.SNA         |               0.151435   |
| Origin.SFO       |               0.150441   |
| Dest.SEA         |               0.149936   |
| Dest.ROA         |               0.148303   |
| Year.2000        |               0.146046   |
| Dest.ORF         |               0.134053   |
| Dest.SAN         |               0.133593   |
| DayOfWeek.6      |               0.132748   |
| Dest.MSP         |               0.132271   |
| Origin.COS       |               0.128671   |
| Dest.HOU         |               0.127342   |
| Dest.TUS         |               0.120346   |
| DayOfWeek.4      |               0.119748   |
| Dest.DSM         |               0.116603   |
| Dest.LAX         |               0.11609    |
| Dest.SLC         |               0.114966   |
| Dest.AVP         |               0.112227   |
| Dest.STL         |               0.110793   |
| Origin.ORF       |               0.108536   |
| Dest.BHM         |               0.108348   |
| UniqueCarrier.UA |               0.107298   |
| Origin.DTW       |               0.105773   |
| Dest.MDW         |               0.10405    |
| Dest.DFW         |               0.0989164  |
| Origin.CVG       |               0.0967693  |
| Origin.SMF       |               0.0959796  |
| Origin.RSW       |               0.0934595  |
| Origin.SWF       |               0.0927228  |
| Month.1          |               0.092347   |
| Dest.PHL         |               0.0848795  |
| Dest.PHX         |               0.0848389  |
| Origin.RDU       |               0.0839633  |
| Origin.DCA       |               0.0832363  |
| Dest.OAK         |               0.0818515  |
| Dest.MCI         |               0.0815358  |
| Dest.EWR         |               0.0785491  |
| Dest.DEN         |               0.0783454  |
| Dest.DTW         |               0.0774459  |
| Year.1989        |               0.0762646  |
| Dest.LAS         |               0.0743316  |
| Dest.MDT         |               0.0731147  |
| Dest.RIC         |               0.0723303  |
| Dest.OMA         |               0.0661859  |
| UniqueCarrier.PS |               0.0645156  |
| Year.1998        |               0.05845    |
| Dest.MHT         |               0.0576363  |
| Origin.BNA       |               0.0553462  |
| Origin.PHX       |               0.0522407  |
| Origin.GNV       |               0.0504304  |
| Dest.MSY         |               0.0501866  |
| Origin.PVD       |               0.0490418  |
| Origin.MFR       |               0.0437977  |
| Origin.SNA       |               0.0421396  |
| FlightNum        |               0.0376186  |
| Origin.SEA       |               0.0372322  |
| Dest.BNA         |               0.0347007  |
| Origin.PHF       |               0.029703   |
| Dest.LGA         |               0.0291171  |
| Intercept        |               0.026855   |
| Dest.ORD         |               0.0244753  |
| DayOfWeek.7      |               0.0234737  |
| Dest.SJC         |               0.0177833  |
| Dest.AVL         |               0.0172911  |
| Dest.BOS         |               0.0162872  |
| DayOfWeek.1      |               0.0153713  |
| Origin.PDX       |               0.0112833  |
| Origin.RIC       |               0.011192   |
| Origin.SAT       |               0.0110852  |
| Year.1988        |               0.00996483 |
| Origin.BGM       |               0.00952641 |
| Dest.PIT         |               0.00935131 |
| Dest.ATL         |               0.00882664 |
| Origin.CHS       |               0.00818887 |
| Origin.ABQ       |               0.00803383 |
| Dest.ILM         |               0.00255637 |
| UniqueCarrier.DL |               0.00110988 |
| Origin.SBN       |               0          |
| Dest.SRQ         |               0          |
| Origin.EWR       |               0          |
| Dest.LEX         |               0          |
| Dest.HPN         |               0          |
| Dest.MRY         |               0          |
| Origin.SCK       |               0          |
| Dest.ONT         |               0          |
| Origin.AMA       |               0          |
| Dest.JAN         |               0          |
| Dest.CHA         |               0          |
| DayOfWeek.3      |               0          |
| Origin.BIL       |               0          |
| Dest.OKC         |               0          |
| Dest.ORH         |               0          |
| Origin.LAN       |               0          |
| Dest.RDU         |               0          |
| Dest.MAF         |               0          |
| Dest.MYR         |               0          |
| Origin.AVP       |               0          |
| Dest.ANC         |               0          |
| Origin.ISP       |               0          |
| Dest.PHF         |               0          |
| Dest.SBN         |               0          |
| Origin.MHT       |               0          |
| Origin.LIT       |               0          |
| Dest.FNT         |               0          |
| Dest.ACY         |               0          |
| Origin.KOA       |               0          |
| Dest.SYR         |               0          |
| Dest.SJU         |               0          |
| Origin.HOU       |               0          |
| Dest.HNL         |               0          |
| Origin.GEG       |               0          |
| Dest.AMA         |               0          |
| Origin.LAS       |               0          |
| Dest.JFK         |               0          |
| Dest.AUS         |               0          |
| Dest.ELP         |               0          |
| Dest.ERI         |               0          |
| Dest.DCA         |               0          |
| Origin.SJU       |               0          |
| Dest.SWF         |               0          |
| Origin.STT       |               0          |
| Origin.JAN       |               0          |
| Origin.UCA       |               0          |
| Dest.SDF         |               0          |
| Dest.SCK         |               0          |
| Dest.ELM         |               0          |
| Dest.BOI         |               0          |
| Origin.ANC       |               0          |
Out[11]:
[('Origin', 954.8896484375, 1.0, 0.5008653471985322),
 ('Year', 679.748779296875, 0.7118610830153599, 0.3565465485016113),
 ('UniqueCarrier',
  132.99842834472656,
  0.13928146415908252,
  0.06976125890435877),
 ('FlightNum', 60.533199310302734, 0.06339287414975553, 0.03175129392093015),
 ('Distance', 60.19770812988281, 0.06304153388654407, 0.03157531975801193),
 ('DayOfWeek', 18.11199951171875, 0.01896763625132567, 0.009500231716555696),
 ('Dest', 0.0, 0.0, 0.0),
 ('Month', 0.0, 0.0, 0.0)]

In [12]:
# Model performance of the complex GBM model (data_gbm2: 50 trees, depth 5)
# on the held-out test frame. As the cell's last expression, the returned
# metrics object is what gets displayed.
data_gbm2.model_performance(test)


ModelMetricsBinomial: gbm
** Reported on test data. **

MSE: 0.20453710890468096
R^2: 0.17931766817429018
LogLoss: 0.5955368499629994
AUC: 0.7453701269377677
Gini: 0.49074025387553544

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.3472357730956941: 
NO YES Error Rate
NO 1865.0 3302.0 0.6391 (3302.0/5167.0)
YES 615.0 5160.0 0.1065 (615.0/5775.0)
Total 2480.0 8462.0 0.358 (3917.0/10942.0)
Maximum Metrics: Maximum metrics at their respective thresholds

metric threshold value idx
max f1 0.3472358 0.7248718 298.0
max f2 0.1737435 0.8500030 377.0
max f0point5 0.5352998 0.7082464 197.0
max accuracy 0.5016028 0.6858892 214.0
max precision 0.9626287 1.0 0.0
max absolute_MCC 0.5352998 0.3733331 197.0
max min_per_class_accuracy 0.5096470 0.6832900 210.0
Gains/Lift Table: Avg response rate: 52.78 %

group lower_threshold cumulative_data_fraction response_rate cumulative_response_rate capture_rate cumulative_capture_rate lift cumulative_lift gain cumulative_gain
1 0.8546116 0.0500823 0.8905109 0.8905109 0.0845022 0.0845022 1.6872677 1.6872677 68.7267672 68.7267672
2 0.7960067 0.1000731 0.8555759 0.8730594 0.0810390 0.1655411 1.6210755 1.6542018 62.1075524 65.4201823
3 0.7463254 0.1501554 0.7737226 0.8399270 0.0734199 0.2389610 1.4659867 1.5914253 46.5986665 59.1425252
4 0.7118147 0.2001462 0.7952468 0.8287671 0.0753247 0.3142857 1.5067689 1.5702805 50.6768917 57.0280496
5 0.6824229 0.2503199 0.7158470 0.8061336 0.0680519 0.3823377 1.3563286 1.5273964 35.6328626 52.7396386
6 0.6543955 0.3003107 0.7020110 0.7888010 0.0664935 0.4488312 1.3301133 1.4945559 33.0113251 49.4555888
7 0.6209159 0.3503930 0.6514599 0.7691706 0.0618182 0.5106494 1.2343331 1.4573618 23.4333112 45.7361814
8 0.5858773 0.4006580 0.6109091 0.7493157 0.0581818 0.5688312 1.1575008 1.4197424 15.7500826 41.9742393
9 0.5504401 0.4501005 0.5878004 0.7315736 0.0550649 0.6238961 1.1137163 1.3861261 11.3716302 38.6126126
10 0.5141673 0.5000914 0.5411335 0.7125365 0.0512554 0.6751515 1.0252956 1.3500563 2.5295631 35.0056264
11 0.4810761 0.5499909 0.4890110 0.6922566 0.0462338 0.7213853 0.9265382 1.3116314 -7.3461776 31.1631397
12 0.4443224 0.5999817 0.4570384 0.6726580 0.0432900 0.7646753 0.8659591 1.2744977 -13.4040853 27.4497700
13 0.4130338 0.6507037 0.4162162 0.6526685 0.04 0.8046753 0.7886126 1.2366232 -21.1387387 23.6623231
14 0.3856764 0.6999634 0.3617811 0.6321974 0.0337662 0.8384416 0.6854733 1.1978362 -31.4526661 19.7836210
15 0.3578235 0.7501371 0.3916211 0.6161062 0.0372294 0.8756710 0.7420118 1.1673480 -25.7988156 16.7347957
16 0.3315650 0.7999452 0.3596330 0.6001371 0.0339394 0.9096104 0.6814034 1.1370909 -31.8596608 13.7090927
17 0.3057750 0.8500274 0.3010949 0.5825180 0.0285714 0.9381818 0.5704901 1.1037077 -42.9509906 10.3707715
18 0.2734386 0.9000183 0.2815356 0.5658002 0.0266667 0.9648485 0.5334308 1.0720321 -46.6569165 7.2032100
19 0.2290241 0.9500091 0.2120658 0.5471861 0.0200866 0.9849351 0.4018050 1.0367638 -59.8194956 3.6763779
20 0.0661176 1.0 0.1590494 0.5277829 0.0150649 1.0 0.3013538 1.0 -69.8646217 0.0

Out[12]: