In [1]:
import h2o
import pandas
import pprint
import operator
import matplotlib.pyplot as plt
from tabulate import tabulate

In [2]:
# Connect to a cluster
h2o.init()


H2O cluster uptime: 9 minutes 40 seconds 391 milliseconds
H2O cluster version: 3.1.0.99999
H2O cluster name: Anqi
H2O cluster total nodes: 1
H2O cluster total memory: 1.78 GB
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster healthy: True
H2O Connection ip: 127.0.0.1
H2O Connection port: 54321

In [3]:
# air_path = [h2o.locate("bigdata/laptop/airlines_all.05p.csv")]
# air_path = [h2o.locate("bigdata/laptop/flights-nyc/flights14.csv.zip")]
air_path = [h2o.locate("smalldata/airlines/allyears2k_headers.zip")]

# ----------

# 1- Load data - 1 row per flight.  Has columns showing the origin,
# destination, departure and arrival time, carrier information, and
# whether the flight was delayed.
print "Import and Parse airlines data"
data = h2o.import_frame(path=air_path)
data.describe()


Import and Parse airlines data

Parse Progress: [##################################################] 100%

Parsed 43,978 rows and 31 cols:

File1 C:\Users\Anqi\Documents\Work\h2o-3\smalldata/airlines/allyears2k_headers.zip
Rows: 43,978 Cols: 31

Chunk compression summary:

chunk_type chunk_name count count_percentage size size_percentage
C0L Constant Integers 10 5.376344 800 B 0.05040237
C0D Constant Reals 23 12.365591 1.8 KB 0.115925446
CBS Bits 2 1.0752689 2.0 KB 0.12720299
CX0 Sparse Bits 10 5.376344 1.9 KB 0.12474586
C1 1-Byte Integers 40 21.505377 287.8 KB 18.564957
C1N 1-Byte Integers (w/o NAs) 19 10.215054 133.1 KB 8.58617
C1S 1-Byte Fractions 6 3.2258065 43.4 KB 2.8024976
C2 2-Byte Integers 76 40.860214 1.1 MB 69.628105
Frame distribution summary:

size number_of_rows number_of_chunks_per_column number_of_chunks
10.0.0.14:54321 1.5 MB 43978.0 6.0 186.0
mean 1.5 MB 43978.0 6.0 186.0
min 1.5 MB 43978.0 6.0 186.0
max 1.5 MB 43978.0 6.0 186.0
stddev 0 B 0.0 0.0 0.0
total 1.5 MB 43978.0 6.0 186.0
Column-by-Column Summary:

Year Month DayofMonth DayOfWeek DepTime CRSDepTime ArrTime CRSArrTime UniqueCarrier FlightNum TailNum ActualElapsedTime CRSElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted CarrierDelay WeatherDelay NASDelay SecurityDelay LateAircraftDelay IsArrDelayed IsDepDelayed
type int int int int int int int int enum int enum int int int int int enum enum int int int int enum int int int int int int enum enum
mins 1987.0 1.0 1.0 1.0 1.0 0.0 1.0 0.0 0.0 1.0 0.0 16.0 17.0 14.0 -63.0 -16.0 0.0 0.0 11.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
maxs 2008.0 10.0 31.0 7.0 2400.0 2359.0 2400.0 2359.0 9.0 3949.0 3500.0 475.0 437.0 402.0 475.0 473.0 131.0 133.0 3365.0 128.0 254.0 1.0 3.0 1.0 369.0 201.0 323.0 14.0 373.0 1.0 1.0
sigma 6.34436090171 1.87471137134 9.17579042586 1.90501311913 465.340899124 476.251139993 484.347487904 492.750434123 2.05121227084 777.404369164 1168.75931155 73.9744416606 73.40159463 69.6363295151 29.8402219624 26.4388090429 37.64411521 33.930070329 578.43800823 4.20197993986 9.9050857472 0.155193141358 0.182676305421 0.0497234872189 16.2057299045 4.41677989873 18.6197762215 0.403940182102 23.4875658741 0.496887288343 0.499377380318
zero_count 0 0 0 0 0 569 0 569 724 0 2 0 0 0 1514 6393 59 172 0 623 557 42892 81 43869 7344 8840 7388 8914 7140 19537 20887
missing_count 0 0 0 0 1086 0 1195 0 0 0 32 1195 13 16649 1195 1086 0 0 35 16026 16024 0 9774 0 35045 35045 35045 35045 35045 0 0

In [4]:
# ----------

# 2- Data exploration and munging. Generate scatter plots 
# of various columns and plot fitted GLM model.

# Function to fit a GLM model and plot the fitted (x,y) values
def scatter_plot(data, x, y, max_points = 1000, fit = True):
    if(fit):
        lr = h2o.glm(x = data[[x]], y = data[y], family = "gaussian")
        coeff = lr.coef()
    df = data[[x,y]]
    runif = df[y].runif()
    df_subset = df[runif < float(max_points)/data.nrow()]
    df_py = h2o.as_list(df_subset)
    
    if(fit): h2o.remove(lr._key)

    # If x variable is string, generate box-and-whisker plot
    if(df_py[x].dtype == "object"):
        df_py.boxplot(column = y, by = x)
    # Otherwise, generate a scatter plot
    else:
        df_py.plot(x = x, y = y, kind = "scatter")
    
    if(fit):
        x_min = min(df_py[x])
        x_max = max(df_py[x])
        y_min = coeff["Intercept"] + coeff[x]*x_min
        y_max = coeff["Intercept"] + coeff[x]*x_max
        plt.plot([x_min, x_max], [y_min, y_max], "k-")
    plt.show()

scatter_plot(data, "Distance", "AirTime", fit = True)
scatter_plot(data, "UniqueCarrier", "ArrDelay", max_points = 5000, fit = False)


glm Model Build Progress: [##################################################] 100%

In [5]:
# Group flights by month
aggregates = {"Month": ["nrow", 0, "all"], "Cancelled": ["sum", 1, "all"]}
bpd = h2o.group_by(data, cols=["Month"], aggregates=aggregates)
bpd.show()
bpd.describe()
bpd.dim()

# Convert columns to factors
data["Year"]      = data["Year"]     .asfactor()
data["Month"]     = data["Month"]    .asfactor()
data["DayOfWeek"] = data["DayOfWeek"].asfactor()
data["Cancelled"] = data["Cancelled"].asfactor()


Displaying 2 row(s):
Row ID Month Cancelled Month
1 [10.0] [19990.0] [1999.0]
2 [1.0] [41979.0] [41979.0]
Rows: 2 Cols: 3

Chunk compression summary:

chunk_type chunk_name count count_percentage size size_percentage
C1N 1-Byte Integers (w/o NAs) 1 33.333336 70 B 28.455284
C2S 2-Byte Fractions 2 66.66667 176 B 71.544716
Frame distribution summary:

size number_of_rows number_of_chunks_per_column number_of_chunks
10.0.0.14:54321 246 B 2.0 1.0 3.0
mean 246 B 2.0 1.0 3.0
min 246 B 2.0 1.0 3.0
max 246 B 2.0 1.0 3.0
stddev 0 B 0.0 0.0 0.0
total 246 B 2.0 1.0 3.0
Column-by-Column Summary:

Month Cancelled Month
type int int int
mins 1.0 19990.0 1999.0
maxs 10.0 41979.0 41979.0
sigma 6.36396103068 15548.5710115 28270.1291118
zero_count 0 0 0
missing_count 0 0 0

In [6]:
# Calculate and plot travel time
hour1 = data["CRSArrTime"] / 100
mins1 = data["CRSArrTime"] % 100
arrTime = hour1*60 + mins1

hour2 = data["CRSDepTime"] / 100
mins2 = data["CRSDepTime"] % 100
depTime = hour2*60 + mins2

# TODO: Replace this once list comprehension is supported. See PUBDEV-1286.
# data["TravelTime"] = [x if x > 0 else None for x in (arrTime - depTime)]
data["TravelTime"] = h2o.ifelse((arrTime-depTime) > 0, (arrTime-depTime), None)[0]
scatter_plot(data, "Distance", "TravelTime")


glm Model Build Progress: [##################################################] 100%

In [7]:
# Impute missing travel times and re-plot
h2o.impute(data = data, column = "Distance", by = ["Origin", "Dest"])
scatter_plot(data, "Distance", "TravelTime")


glm Model Build Progress: [##################################################] 100%

In [8]:
# ----------
# 3- Fit a model on train; using test as validation

# Create test/train split
s = data["Year"].runif()
train = data[s <= 0.75]
test  = data[s > 0.75]

# Set predictor and response variables
myY = "IsDepDelayed"
myX = ["Origin", "Dest", "Year", "UniqueCarrier", "DayOfWeek", "Month", "Distance", "FlightNum"]

# Simple GLM - Predict Delays
data_glm = h2o.glm(x           =train[myX],
                   y           =train[myY],
                   validation_x=test [myX],
                   validation_y=test [myY],
                   family      ="binomial",
                   standardize =True)

# Simple GBM
data_gbm = h2o.gbm(x              =train[myX],
                   y              =train[myY],
                   validation_x   =test [myX],
                   validation_y   =test [myY],
                   balance_classes=True,
                   ntrees         =3,
                   max_depth      =1,
                   distribution   ="bernoulli",
                   learn_rate     =0.1,
                   min_rows       =2)

# Complex GBM
data_gbm2 = h2o.gbm(x              =train[myX],
                    y              =train[myY],
                    validation_x   =test [myX],
                    validation_y   =test [myY],
                    balance_classes=True,
                    ntrees         =50,
                    max_depth      =5,
                    distribution   ="bernoulli",
                    learn_rate     =0.1,
                    min_rows       =2)

# Simple Random Forest
data_rf = h2o.random_forest(x              =train[myX],
                            y              =train[myY],
                            validation_x   =test [myX],
                            validation_y   =test [myY],
                            ntrees         =5,
                            max_depth      =2,
                            balance_classes=True)

# Complex Random Forest
data_rf2 = h2o.random_forest(x              =train[myX], 
                             y              =train[myY],
                             validation_x   =test [myX],
                             validation_y   =test [myY],
                             ntrees         =10,
                             max_depth      =5,
                             balance_classes=True)

# Deep Learning with 5 epochs
data_dl = h2o.deeplearning(x                   =train[myX],
                           y                   =train[myY],
                           validation_x        =test [myX],
                           validation_y        =test [myY],
                           hidden              =[10,10],
                           epochs              =5,
                           variable_importances=True,
                           balance_classes     =True,
                           loss                ="Automatic")


glm Model Build Progress: [##################################################] 100%

gbm Model Build Progress: [##################################################] 100%

gbm Model Build Progress: [##################################################] 100%

drf Model Build Progress: [##################################################] 100%

drf Model Build Progress: [##################################################] 100%

deeplearning Model Build Progress: [##################################################] 100%

In [19]:
# Variable importances from each algorithm
# Calculate magnitude of normalized GLM coefficients
glm_varimp = data_glm.coef_norm()
for k,v in glm_varimp.iteritems():
    glm_varimp[k] = abs(glm_varimp[k])
    
# Sort in descending order by magnitude
glm_sorted = sorted(glm_varimp.items(), key = operator.itemgetter(1), reverse = True)
table = tabulate(glm_sorted, headers = ["Predictor", "Normalized Coefficient"], tablefmt = "orgtbl")
print "Variable Importances:\n\n" + table

data_gbm.varimp()
data_rf.varimp()


Variable Importances:

| Predictor        |   Normalized Coefficient |
|------------------+--------------------------|
| Year.2008        |              2.20645     |
| Dest.LYH         |              1.66734     |
| Year.2003        |              1.527       |
| Year.2007        |              1.45986     |
| Origin.HPN       |              1.38768     |
| Origin.MDW       |              1.34977     |
| Origin.LIH       |              1.33007     |
| Origin.LEX       |              1.31243     |
| Origin.CAE       |              1.17961     |
| UniqueCarrier.HP |              1.16061     |
| Origin.TLH       |              1.11197     |
| Origin.HNL       |              1.10426     |
| Year.2001        |              1.04418     |
| Origin.ERI       |              1.04216     |
| Origin.MLB       |              1.01101     |
| Origin.OGG       |              0.98171     |
| Origin.GRR       |              0.968939    |
| Year.2002        |              0.942916    |
| Origin.MYR       |              0.938213    |
| UniqueCarrier.TW |              0.913381    |
| Origin.PSP       |              0.873085    |
| Origin.TRI       |              0.861922    |
| Origin.TUL       |              0.853857    |
| Origin.ATL       |              0.847938    |
| Origin.CRP       |              0.844794    |
| Year.2006        |              0.836969    |
| Year.2004        |              0.823102    |
| Origin.CHO       |              0.811761    |
| Origin.SAV       |              0.802768    |
| Origin.SDF       |              0.798546    |
| Origin.ORD       |              0.776268    |
| Dest.PSP         |              0.761213    |
| Origin.LYH       |              0.754704    |
| Dest.CHO         |              0.754704    |
| Dest.DAY         |              0.747404    |
| Origin.LBB       |              0.746334    |
| Origin.ACY       |              0.74528     |
| Year.1994        |              0.735115    |
| Origin.PBI       |              0.720532    |
| Origin.STL       |              0.716061    |
| Dest.ISP         |              0.703318    |
| Origin.IAH       |              0.696018    |
| Dest.CAK         |              0.691879    |
| Origin.BOI       |              0.672926    |
| Origin.OKC       |              0.672778    |
| Dest.TOL         |              0.665686    |
| Dest.FLL         |              0.641893    |
| Origin.ALB       |              0.640183    |
| Dest.ICT         |              0.637126    |
| Dest.HTS         |              0.630531    |
| Dest.GEG         |              0.628694    |
| Origin.MRY       |              0.598009    |
| Origin.CMH       |              0.597743    |
| Origin.OMA       |              0.597156    |
| Dest.PBI         |              0.588023    |
| Dest.KOA         |              0.586975    |
| Origin.BTV       |              0.582203    |
| Origin.MSY       |              0.581737    |
| Year.1996        |              0.580467    |
| Origin.ROA       |              0.579396    |
| Origin.AUS       |              0.576124    |
| Dest.LIH         |              0.57356     |
| Origin.PIT       |              0.572273    |
| Dest.FAY         |              0.56828     |
| Origin.CRW       |              0.552122    |
| Origin.MAF       |              0.551273    |
| Origin.JAX       |              0.547234    |
| Year.1990        |              0.546692    |
| Dest.PNS         |              0.527311    |
| Dest.IND         |              0.526206    |
| Dest.JAX         |              0.524667    |
| Dest.CAE         |              0.521757    |
| Year.2005        |              0.521459    |
| Origin.PHL       |              0.521111    |
| Dest.OGG         |              0.51657     |
| UniqueCarrier.CO |              0.516328    |
| Origin.PWM       |              0.516188    |
| Year.1997        |              0.508542    |
| Origin.SRQ       |              0.501791    |
| Dest.UCA         |              0.50147     |
| Dest.TPA         |              0.49956     |
| Origin.ABE       |              0.490904    |
| Origin.LAX       |              0.489046    |
| Dest.AVL         |              0.487257    |
| Dest.LBB         |              0.485847    |
| Origin.TUS       |              0.482996    |
| Dest.SFO         |              0.479318    |
| Origin.FLL       |              0.478682    |
| Origin.DAY       |              0.473651    |
| Dest.COS         |              0.471269    |
| UniqueCarrier.WN |              0.464096    |
| Origin.GSO       |              0.456694    |
| Origin.MIA       |              0.449807    |
| Dest.CHS         |              0.447067    |
| Dest.GSO         |              0.431911    |
| Origin.LGA       |              0.425385    |
| Origin.BDL       |              0.423546    |
| Dest.MKE         |              0.414258    |
| Dest.ABQ         |              0.413438    |
| Year.1995        |              0.406966    |
| Origin.SYR       |              0.406785    |
| Origin.EYW       |              0.406293    |
| Dest.CLE         |              0.403485    |
| Origin.MCI       |              0.389425    |
| UniqueCarrier.PI |              0.384624    |
| Origin.IND       |              0.381196    |
| Dest.SLC         |              0.379741    |
| Origin.SLC       |              0.367927    |
| Dest.BGM         |              0.366001    |
| Origin.MCO       |              0.363054    |
| Origin.DFW       |              0.359103    |
| Dest.CMH         |              0.352386    |
| Dest.PWM         |              0.3502      |
| Origin.BOS       |              0.34326     |
| Dest.EYW         |              0.334537    |
| Origin.RNO       |              0.33134     |
| Origin.DSM       |              0.329586    |
| Origin.HRL       |              0.327308    |
| Origin.ICT       |              0.322426    |
| Origin.JFK       |              0.31971     |
| Dest.BUF         |              0.318101    |
| Origin.MDT       |              0.317615    |
| Dest.OAJ         |              0.315752    |
| Origin.BUR       |              0.311579    |
| Origin.OAK       |              0.307521    |
| Origin.LIT       |              0.303218    |
| Dest.MCO         |              0.302983    |
| Year.1991        |              0.302938    |
| UniqueCarrier.US |              0.300675    |
| Year.1992        |              0.297015    |
| Origin.ROC       |              0.28823     |
| Origin.EGE       |              0.288116    |
| Dest.BDL         |              0.28648     |
| Origin.SAN       |              0.283992    |
| Origin.MSP       |              0.28085     |
| Origin.PHF       |              0.280173    |
| Origin.JAN       |              0.280049    |
| Dest.MYR         |              0.27796     |
| Dest.SNA         |              0.265734    |
| Dest.CVG         |              0.264375    |
| Dest.IAD         |              0.263366    |
| Dest.SAT         |              0.262028    |
| Origin.BWI       |              0.26197     |
| Dest.BHM         |              0.260658    |
| Dest.AUS         |              0.257046    |
| Origin.CLT       |              0.252817    |
| Origin.BGM       |              0.249072    |
| Origin.IAD       |              0.247584    |
| Origin.DEN       |              0.239967    |
| Dest.IAH         |              0.236912    |
| Month.10         |              0.236816    |
| Year.1987        |              0.236816    |
| Origin.TPA       |              0.234566    |
| Origin.BUF       |              0.234454    |
| Dest.BUR         |              0.23317     |
| Dest.FAT         |              0.230435    |
| DayOfWeek.5      |              0.229992    |
| Dest.ALB         |              0.229499    |
| Origin.ORF       |              0.22929     |
| UniqueCarrier.AA |              0.224943    |
| Dest.RNO         |              0.222055    |
| Dest.SWF         |              0.221337    |
| Origin.BHM       |              0.212308    |
| Origin.CLE       |              0.210425    |
| Dest.PVD         |              0.208197    |
| Year.1999        |              0.206588    |
| Origin.STX       |              0.20182     |
| Dest.PDX         |              0.200751    |
| Dest.GRR         |              0.200415    |
| Dest.EWR         |              0.198965    |
| DayOfWeek.2      |              0.197407    |
| Dest.SAN         |              0.197022    |
| Dest.RDU         |              0.196143    |
| Dest.RSW         |              0.195439    |
| Origin.PHX       |              0.190512    |
| Dest.LAX         |              0.189674    |
| Origin.ELP       |              0.187637    |
| Dest.MCI         |              0.18748     |
| Origin.SWF       |              0.185943    |
| Dest.CLT         |              0.182888    |
| Dest.SDF         |              0.179141    |
| DayOfWeek.6      |              0.175119    |
| Origin.GEG       |              0.168538    |
| Dest.CRP         |              0.166112    |
| Dest.BWI         |              0.165897    |
| Year.1993        |              0.165274    |
| Origin.PDX       |              0.163935    |
| Dest.MSP         |              0.16179     |
| Origin.SNA       |              0.161758    |
| Dest.BTV         |              0.159138    |
| Origin.EWR       |              0.158962    |
| Dest.SEA         |              0.158931    |
| Distance         |              0.158559    |
| Dest.PHL         |              0.155809    |
| Dest.PHX         |              0.153835    |
| Dest.OMA         |              0.148225    |
| Dest.SYR         |              0.148075    |
| Dest.STL         |              0.145674    |
| Dest.SJU         |              0.145017    |
| Dest.MIA         |              0.13949     |
| Origin.RSW       |              0.13946     |
| Dest.ILM         |              0.138816    |
| Origin.ISP       |              0.137236    |
| Dest.LGA         |              0.124268    |
| DayOfWeek.4      |              0.120262    |
| Dest.RIC         |              0.118344    |
| Dest.EUG         |              0.118323    |
| Origin.RDU       |              0.118281    |
| Dest.OAK         |              0.114895    |
| Origin.TYS       |              0.11424     |
| Dest.LAS         |              0.114106    |
| Dest.MHT         |              0.11257     |
| Month.1          |              0.112187    |
| Origin.ABQ       |              0.110686    |
| Dest.ORD         |              0.106508    |
| Origin.MEM       |              0.105434    |
| Year.1989        |              0.105135    |
| Dest.SMF         |              0.104787    |
| Origin.SFO       |              0.102576    |
| Dest.SJC         |              0.101178    |
| Origin.MHT       |              0.0984085   |
| Origin.SJU       |              0.0979404   |
| Origin.BNA       |              0.0954815   |
| Origin.COS       |              0.093112    |
| Year.2000        |              0.084138    |
| Origin.UCA       |              0.0831781   |
| Dest.HPN         |              0.079277    |
| Dest.ORF         |              0.0783008   |
| Dest.DEN         |              0.0781418   |
| Dest.SBN         |              0.077641    |
| Dest.DFW         |              0.0771464   |
| Dest.BOS         |              0.074874    |
| UniqueCarrier.UA |              0.0680672   |
| Dest.BNA         |              0.0609925   |
| Year.1998        |              0.0580267   |
| Dest.JFK         |              0.0543564   |
| DayOfWeek.7      |              0.0530451   |
| Dest.ROA         |              0.0523803   |
| Origin.SEA       |              0.051804    |
| Dest.MDW         |              0.048072    |
| Dest.DTW         |              0.0454117   |
| Dest.TUS         |              0.0429144   |
| Dest.HOU         |              0.0424006   |
| Origin.PVD       |              0.042323    |
| Dest.ONT         |              0.0398159   |
| Dest.TUL         |              0.0381556   |
| Origin.DTW       |              0.0369941   |
| Dest.MDT         |              0.0350554   |
| DayOfWeek.3      |              0.0348907   |
| Origin.RIC       |              0.0313877   |
| Origin.HOU       |              0.0295949   |
| Dest.ELP         |              0.0293808   |
| Dest.ATL         |              0.0286942   |
| FlightNum        |              0.0273415   |
| Dest.ROC         |              0.02682     |
| Year.1988        |              0.0266856   |
| Dest.BOI         |              0.0257301   |
| Dest.DAL         |              0.0252462   |
| Origin.SJC       |              0.0241171   |
| Dest.ABE         |              0.0238435   |
| Dest.MSY         |              0.0232965   |
| Origin.MKE       |              0.0206479   |
| Origin.CHS       |              0.0191289   |
| Origin.DCA       |              0.0160487   |
| Dest.DCA         |              0.0120622   |
| Origin.CVG       |              0.00886461  |
| Origin.SCK       |              0.00671922  |
| Origin.LAS       |              0.00537378  |
| Intercept        |              0.000536486 |
| Dest.LIT         |              0           |
| Dest.OKC         |              0           |
| Dest.ORH         |              0           |
| Origin.ANC       |              0           |
| Dest.SRQ         |              0           |
| Dest.MRY         |              0           |
| UniqueCarrier.DL |              0           |
| Dest.PIT         |              0           |
| Origin.AVP       |              0           |
| Origin.SAT       |              0           |
| Dest.DSM         |              0           |
| Dest.HNL         |              0           |
| Origin.LAN       |              0           |
| Origin.KOA       |              0           |
| Origin.SMF       |              0           |
| Origin.AMA       |              0           |
| Dest.ERI         |              0           |
| Dest.ACY         |              0           |
| Dest.FNT         |              0           |
| Dest.GSP         |              0           |
| Origin.MFR       |              0           |
| Origin.ONT       |              0           |
| Dest.JAN         |              0           |
| Origin.STT       |              0           |
| Dest.ELM         |              0           |
| Dest.CHA         |              0           |
| Origin.SBN       |              0           |
| Origin.GNV       |              0           |
| Dest.AMA         |              0           |
| Dest.LEX         |              0           |
| UniqueCarrier.PS |              0           |
| Dest.MAF         |              0           |
| Dest.PHF         |              0           |
| Dest.ANC         |              0           |
| Dest.HRL         |              0           |
| Dest.AVP         |              0           |
| DayOfWeek.1      |              0           |
| Dest.STT         |              0           |
| Dest.SCK         |              0           |
| Origin.BIL       |              0           |
| Origin.DAL       |              0           |

Variable Importances:

variable relative_importance scaled_importance percentage
Year 1234.6340332 1.0 1.0
Origin 0.0 0.0 0.0
Dest 0.0 0.0 0.0
UniqueCarrier 0.0 0.0 0.0
DayOfWeek 0.0 0.0 0.0
Month 0.0 0.0 0.0
Distance 0.0 0.0 0.0
FlightNum 0.0 0.0 0.0
Variable Importances:

variable relative_importance scaled_importance percentage
Year 1895.45153809 1.0 0.437243955524
Origin 1551.8223877 0.81870855388 0.35797536652
Dest 465.785095215 0.245738329815 0.107447599352
FlightNum 260.802764893 0.137594003145 0.0601621461915
DayOfWeek 55.2715072632 0.0291600740787 0.0127500661336
UniqueCarrier 53.0120201111 0.0279680166155 0.0122288462131
Distance 52.8523788452 0.0278837932721 0.0121920200653
Month 0.0 0.0 0.0

In [20]:
# Model performance of GBM model on test data
data_gbm2.model_performance(test)


ModelMetricsBinomial: gbm
** Reported on test data. **

MSE: 0.206072998286
R^2: 0.173530324799
LogLoss: 0.598709245809
AUC: 0.740860130858
Gini: 0.481720261716

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.390111773837:

NO YES Error Rate
NO 2418.0 2820.0 0.5384 (2820.0/5238.0)
YES 886.0 4929.0 0.1524 (886.0/5815.0)
Total 3304.0 7749.0 0.6908 (0.6908/11053.0)
Maximum Metrics:

metric threshold value idx
max f1 0.390111773837 0.726776762017 271.0
max f2 0.203821606737 0.848781779661 366.0
max f0point5 0.584877735232 0.701223357296 170.0
max accuracy 0.485219911311 0.678277390754 220.0
max precision 0.953543170817 1.0 0.0
max absolute_MCC 0.584877735232 0.35864900149 170.0
max min_per_class_accuracy 0.507749761479 0.675448644521 209.0
Out[20]:


In [ ]: