In [1]:
import h2o


[WARNING] H2O requires requests module of version 2.10 or newer. You have version 2.4.3.
You can upgrade to the newest version of the module running from the command line
    $ pip2 install --upgrade requests

In [3]:
h2o.connect(ip="35.196.153.55")


Connecting to H2O server at http://35.196.153.55:54321... successful.
H2O cluster uptime: 7 mins 14 secs
H2O cluster version: 3.14.0.7
H2O cluster version age: 10 days
H2O cluster name: sparkling-water-olmsteadbrett_local-1509393572563
H2O cluster total nodes: 1
H2O cluster free memory: 2.322 Gb
H2O cluster total cores: 4
H2O cluster allowed cores: 4
H2O cluster status: locked, healthy
H2O connection url: http://35.196.153.55:54321
H2O connection proxy: None
H2O internal security: False
Python version: 2.7.9 final
Out[3]:
<H2OConnection to http://35.196.153.55:54321, session _sid_bae2>

In [5]:
from h2o.estimators.gbm import H2OGradientBoostingEstimator

In [7]:
air = h2o.import_file("https://raw.github.com/0xdata/h2o/master/smalldata/airlines/allyears2k_headers.zip")


Parse progress: |█████████████████████████████████████████████████████████| 100%

In [8]:
# set this to True if interactive (matplotlib) plots are desired
interactive = False
if not interactive: matplotlib.use('Agg', warn=False)
import matplotlib.pyplot as plt

In [9]:
air_path = "https://raw.github.com/0xdata/h2o/master/smalldata/airlines/allyears2k_headers.zip"

# ----------

# 1- Load data - 1 row per flight.  Has columns showing the origin,
# destination, departure and arrival time, carrier information, and
# whether the flight was delayed.
print("Import and Parse airlines data")
data = h2o.import_file(path=air_path)
data.describe()


Import and Parse airlines data
Parse progress: |█████████████████████████████████████████████████████████| 100%
Rows:43978
Cols:31


Year Month DayofMonth DayOfWeek DepTime CRSDepTime ArrTime CRSArrTime UniqueCarrier FlightNum TailNum ActualElapsedTime CRSElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted CarrierDelay WeatherDelay NASDelay SecurityDelay LateAircraftDelay IsArrDelayed IsDepDelayed
type int int int int int int int int enum int enum int int int int int enum enum int int int int enum int int int int int int enum enum
mins 1987.0 1.0 1.0 1.0 1.0 0.0 1.0 0.0 1.0 16.0 17.0 14.0 -63.0 -16.0 11.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
mean 1997.5 1.4090909090914.60107326393.820614852881345.846661381313.222861431504.634130381485.28916731 818.842989677 124.814529135 125.021562607 114.3161110919.3171119369810.0073906556 730.1821905655.3813680595314.16863418470.0246941652645 0.002478511983264.04780029106 0.2893764692714.855031904180.01701556028217.62006045002
maxs 2008.0 10.0 31.0 7.0 2400.0 2359.0 2400.0 2359.0 3949.0 475.0 437.0 402.0 475.0 473.0 3365.0 128.0 254.0 1.0 1.0 369.0 201.0 323.0 14.0 373.0
sigma 6.344360901711.874711371349.175790425861.90501311913465.340899124476.251139993484.347487904492.750434123 777.404369164 73.9744416606 73.40159463 69.636329515129.840221962426.4388090429 578.43800823 4.201979939869.9050857472 0.155193141358 0.0497234872189 16.2057299045 4.41677989873 18.61977622150.403940182102 23.4875658741
zeros 0 0 0 0 0 569 0 569 0 0 0 0 1514 6393 0 623 557 42892 43869 7344 8840 7388 8914 7140
missing0 0 0 0 1086 0 1195 0 0 0 32 1195 13 16649 1195 1086 0 0 35 16026 16024 0 9774 0 35045 35045 35045 35045 35045 0 0
0 1987.0 10.0 14.0 3.0 741.0 730.0 912.0 849.0 PS 1451.0 NA 91.0 79.0 nan 23.0 11.0 SAN SFO 447.0 nan nan 0.0 NA 0.0 nan nan nan nan nan YES YES
1 1987.0 10.0 15.0 4.0 729.0 730.0 903.0 849.0 PS 1451.0 NA 94.0 79.0 nan 14.0 -1.0 SAN SFO 447.0 nan nan 0.0 NA 0.0 nan nan nan nan nan YES NO
2 1987.0 10.0 17.0 6.0 741.0 730.0 918.0 849.0 PS 1451.0 NA 97.0 79.0 nan 29.0 11.0 SAN SFO 447.0 nan nan 0.0 NA 0.0 nan nan nan nan nan YES YES
3 1987.0 10.0 18.0 7.0 729.0 730.0 847.0 849.0 PS 1451.0 NA 78.0 79.0 nan -2.0 -1.0 SAN SFO 447.0 nan nan 0.0 NA 0.0 nan nan nan nan nan NO NO
4 1987.0 10.0 19.0 1.0 749.0 730.0 922.0 849.0 PS 1451.0 NA 93.0 79.0 nan 33.0 19.0 SAN SFO 447.0 nan nan 0.0 NA 0.0 nan nan nan nan nan YES YES
5 1987.0 10.0 21.0 3.0 728.0 730.0 848.0 849.0 PS 1451.0 NA 80.0 79.0 nan -1.0 -2.0 SAN SFO 447.0 nan nan 0.0 NA 0.0 nan nan nan nan nan NO NO
6 1987.0 10.0 22.0 4.0 728.0 730.0 852.0 849.0 PS 1451.0 NA 84.0 79.0 nan 3.0 -2.0 SAN SFO 447.0 nan nan 0.0 NA 0.0 nan nan nan nan nan YES NO
7 1987.0 10.0 23.0 5.0 731.0 730.0 902.0 849.0 PS 1451.0 NA 91.0 79.0 nan 13.0 1.0 SAN SFO 447.0 nan nan 0.0 NA 0.0 nan nan nan nan nan YES YES
8 1987.0 10.0 24.0 6.0 744.0 730.0 908.0 849.0 PS 1451.0 NA 84.0 79.0 nan 19.0 14.0 SAN SFO 447.0 nan nan 0.0 NA 0.0 nan nan nan nan nan YES YES
9 1987.0 10.0 25.0 7.0 729.0 730.0 851.0 849.0 PS 1451.0 NA 82.0 79.0 nan 2.0 -1.0 SAN SFO 447.0 nan nan 0.0 NA 0.0 nan nan nan nan nan YES NO

In [10]:
# ----------

# 2- Data exploration and munging. Generate scatter plots 
# of various columns and plot fitted GLM model.

# Function to fit a GLM model and plot the fitted (x,y) values
def scatter_plot(data, x, y, max_points = 1000, fit = True):
    if(fit):
        lr = H2OGeneralizedLinearEstimator(family = "gaussian")
        lr.train(x=x, y=y, training_frame=data)
        coeff = lr.coef()
    df = data[[x,y]]
    runif = df[y].runif()
    df_subset = df[runif < float(max_points)/data.nrow]
    df_py = h2o.as_list(df_subset)
    
    if(fit): h2o.remove(lr._id)

    # If x variable is string, generate box-and-whisker plot
    if(df_py[x].dtype == "object"):
        if interactive: df_py.boxplot(column = y, by = x)
    # Otherwise, generate a scatter plot
    else:
        if interactive: df_py.plot(x = x, y = y, kind = "scatter")
    
    if(fit):
        x_min = min(df_py[x])
        x_max = max(df_py[x])
        y_min = coeff["Intercept"] + coeff[x]*x_min
        y_max = coeff["Intercept"] + coeff[x]*x_max
        plt.plot([x_min, x_max], [y_min, y_max], "k-")
    if interactive: plt.show()

scatter_plot(data, "Distance", "AirTime", fit = True)
scatter_plot(data, "UniqueCarrier", "ArrDelay", max_points = 5000, fit = False)


glm Model Build progress: |███████████████████████████████████████████████| 100%

In [11]:
# Group flights by month
grouped = data.group_by("Month")
bpd = grouped.count().sum("Cancelled").frame
bpd.show()
bpd.describe()
bpd.dim

# Convert columns to factors
data["Year"]      = data["Year"]     .asfactor()
data["Month"]     = data["Month"]    .asfactor()
data["DayOfWeek"] = data["DayOfWeek"].asfactor()
data["Cancelled"] = data["Cancelled"].asfactor()


Month sum_Cancelled nrow
1 1067 41979
10 19 1999
Rows:2
Cols:3


Month sum_Cancelled nrow
type int int int
mins 1.0 19.0 1999.0
mean 5.5 543.0 21989.0
maxs 10.0 1067.0 41979.0
sigma 6.36396103068741.047906684 28270.1291118
zeros 0 0 0
missing0 0 0
0 1.0 1067.0 41979.0
1 10.0 19.0 1999.0

In [12]:
# Calculate and plot travel time
hour1 = data["CRSArrTime"] / 100
mins1 = data["CRSArrTime"] % 100
arrTime = hour1*60 + mins1

hour2 = data["CRSDepTime"] / 100
mins2 = data["CRSDepTime"] % 100
depTime = hour2*60 + mins2

# TODO: Replace this once list comprehension is supported. See PUBDEV-1286.
# data["TravelTime"] = [x if x > 0 else None for x in (arrTime - depTime)]
data["TravelTime"] = (arrTime-depTime > 0).ifelse((arrTime-depTime), h2o.H2OFrame([[None]] * data.nrow))
scatter_plot(data, "Distance", "TravelTime")


Parse progress: |█████████████████████████████████████████████████████████| 100%
glm Model Build progress: |███████████████████████████████████████████████| 100%

In [13]:
# Impute missing travel times and re-plot
data.impute(column = "Distance", by = ["Origin", "Dest"])
scatter_plot(data, "Distance", "TravelTime")


glm Model Build progress: |███████████████████████████████████████████████| 100%

In [14]:
# ----------
# 3- Fit a model on train; using test as validation

# Create test/train split
s = data["Year"].runif()
train = data[s <= 0.75]
test  = data[s > 0.75]

# Set predictor and response variables
myY = "IsDepDelayed"
myX = ["Origin", "Dest", "Year", "UniqueCarrier", "DayOfWeek", "Month", "Distance", "FlightNum"]

# Simple GLM - Predict Delays
data_glm = H2OGeneralizedLinearEstimator(family="binomial", standardize=True)
data_glm.train(x               =myX,
               y               =myY,
               training_frame  =train,
               validation_frame=test)

# Simple GBM
data_gbm = H2OGradientBoostingEstimator(balance_classes=True,
                                        ntrees         =3,
                                        max_depth      =1,
                                        distribution   ="bernoulli",
                                        learn_rate     =0.1,
                                        min_rows       =2)

data_gbm.train(x               =myX,
               y               =myY,
               training_frame  =train,
               validation_frame=test)

# Complex GBM
data_gbm2 = H2OGradientBoostingEstimator(balance_classes=True,
                                         ntrees         =50,
                                         max_depth      =5,
                                         distribution   ="bernoulli",
                                         learn_rate     =0.1,
                                         min_rows       =2)

data_gbm2.train(x               =myX,
                y               =myY,
                training_frame  =train,
                validation_frame=test)

# Simple Random Forest
data_rf = H2ORandomForestEstimator(ntrees         =5,
                                   max_depth      =2,
                                   balance_classes=True)

data_rf.train(x               =myX,
              y               =myY,
              training_frame  =train,
              validation_frame=test)

# Complex Random Forest
data_rf2 = H2ORandomForestEstimator(ntrees         =10,
                                    max_depth      =5,
                                    balance_classes=True)

data_rf2.train(x               =myX,
               y               =myY,
               training_frame  =train,
               validation_frame=test)

# Deep Learning with 5 epochs
data_dl = H2ODeepLearningEstimator(hidden              =[10,10],
                                   epochs              =5,
                                   variable_importances=True,
                                   balance_classes     =True,
                                   loss                ="Automatic")

data_dl.train(x               =myX,
              y               =myY,
              training_frame  =train,
              validation_frame=test)


glm Model Build progress: |███████████████████████████████████████████████| 100%
gbm Model Build progress: |███████████████████████████████████████████████| 100%
gbm Model Build progress: |███████████████████████████████████████████████| 100%
drf Model Build progress: |███████████████████████████████████████████████| 100%
drf Model Build progress: |███████████████████████████████████████████████| 100%
deeplearning Model Build progress: |██████████████████████████████████████| 100%

In [15]:
# Variable importances from each algorithm
# Calculate magnitude of normalized GLM coefficients
from six import iteritems
glm_varimp = data_glm.coef_norm()
for k,v in iteritems(glm_varimp):
    glm_varimp[k] = abs(glm_varimp[k])
    
# Sort in descending order by magnitude
glm_sorted = sorted(glm_varimp.items(), key = operator.itemgetter(1), reverse = True)
table = tabulate(glm_sorted, headers = ["Predictor", "Normalized Coefficient"], tablefmt = "orgtbl")
print("Variable Importances:\n\n" + table)

data_gbm.varimp()
data_rf.varimp()


Variable Importances:

| Predictor        |   Normalized Coefficient |
|------------------+--------------------------|
| Year.2008        |               2.12834    |
| Origin.HPN       |               1.78092    |
| Origin.LIH       |               1.72329    |
| Year.2003        |               1.66481    |
| Dest.LYH         |               1.48352    |
| Origin.TLH       |               1.4438     |
| Origin.MDW       |               1.42237    |
| Origin.LEX       |               1.37468    |
| Origin.CHO       |               1.37442    |
| Year.2007        |               1.33457    |
| Origin.TRI       |               1.1948     |
| Origin.LBB       |               1.11611    |
| Dest.PNS         |               1.09135    |
| Dest.HTS         |               1.03871    |
| UniqueCarrier.HP |               1.00414    |
| Origin.ERI       |               1.00131    |
| Year.2002        |               1.00045    |
| Year.2001        |               0.989799   |
| Origin.SRQ       |               0.982801   |
| Year.2004        |               0.962006   |
| Origin.MRY       |               0.959141   |
| Origin.HNL       |               0.944968   |
| Dest.ICT         |               0.935913   |
| Origin.CAE       |               0.932485   |
| Origin.SAV       |               0.923519   |
| Origin.MYR       |               0.888292   |
| Origin.IAH       |               0.834644   |
| Year.2006        |               0.828845   |
| UniqueCarrier.TW |               0.819648   |
| Origin.EYW       |               0.795584   |
| Origin.TUL       |               0.794152   |
| Origin.ATL       |               0.793512   |
| Origin.ACY       |               0.79148    |
| Origin.AUS       |               0.783774   |
| Dest.DAY         |               0.779332   |
| Origin.MLB       |               0.748394   |
| Origin.JAX       |               0.733829   |
| Dest.FLL         |               0.708209   |
| Origin.BOI       |               0.706706   |
| Origin.ORD       |               0.687341   |
| Year.1996        |               0.685849   |
| Origin.RNO       |               0.684905   |
| Year.1994        |               0.683743   |
| Origin.ROA       |               0.678766   |
| Origin.PBI       |               0.672746   |
| Dest.IAH         |               0.668079   |
| Origin.TYS       |               0.666651   |
| Origin.LYH       |               0.634583   |
| Dest.CHO         |               0.634583   |
| Origin.PSP       |               0.631683   |
| Origin.CRP       |               0.629172   |
| Origin.ABE       |               0.615234   |
| Origin.GSO       |               0.612062   |
| Dest.GEG         |               0.590951   |
| Origin.OKC       |               0.590465   |
| Dest.SFO         |               0.586402   |
| Origin.STL       |               0.586156   |
| Year.1997        |               0.584964   |
| Year.2005        |               0.582897   |
| Dest.TPA         |               0.581864   |
| Dest.ISP         |               0.578443   |
| Dest.PSP         |               0.57025    |
| Dest.KOA         |               0.56587    |
| Origin.BTV       |               0.564405   |
| Origin.MSY       |               0.563711   |
| Origin.ALB       |               0.555417   |
| Dest.GSO         |               0.553097   |
| Dest.OAJ         |               0.531787   |
| Dest.PBI         |               0.522437   |
| Dest.SDF         |               0.511014   |
| Origin.PIT       |               0.500839   |
| Dest.CHS         |               0.497163   |
| Dest.UCA         |               0.494345   |
| Dest.SAT         |               0.493991   |
| Origin.MCO       |               0.493147   |
| Dest.FAY         |               0.487934   |
| Year.1990        |               0.487001   |
| Origin.DAY       |               0.479214   |
| Origin.OAK       |               0.478934   |
| Dest.CLE         |               0.478929   |
| Origin.OMA       |               0.474536   |
| Origin.MIA       |               0.47264    |
| UniqueCarrier.WN |               0.459947   |
| Dest.ABQ         |               0.457894   |
| Dest.BDL         |               0.454467   |
| Dest.LBB         |               0.448505   |
| Origin.PWM       |               0.440791   |
| Origin.BUR       |               0.434567   |
| Origin.STX       |               0.430196   |
| Dest.SEA         |               0.429586   |
| Origin.FLL       |               0.412585   |
| Dest.STL         |               0.411415   |
| Origin.SAN       |               0.398132   |
| Dest.CAE         |               0.397591   |
| UniqueCarrier.PI |               0.39487    |
| Origin.SLC       |               0.394488   |
| Origin.SYR       |               0.384714   |
| Origin.IND       |               0.381832   |
| UniqueCarrier.CO |               0.376084   |
| Dest.IND         |               0.373964   |
| Origin.LGA       |               0.364869   |
| Origin.PHL       |               0.361748   |
| Origin.MDT       |               0.358408   |
| Dest.TUL         |               0.356359   |
| Origin.LAX       |               0.351487   |
| Origin.CRW       |               0.342324   |
| Origin.TUS       |               0.340778   |
| Origin.JFK       |               0.336137   |
| Year.1995        |               0.333443   |
| Dest.CMH         |               0.331511   |
| Year.1992        |               0.331424   |
| Year.1991        |               0.320566   |
| Origin.MCI       |               0.320389   |
| Dest.BGM         |               0.319531   |
| Dest.SLC         |               0.319182   |
| Origin.DFW       |               0.312263   |
| UniqueCarrier.US |               0.310796   |
| Origin.BDL       |               0.305355   |
| Dest.JAX         |               0.286775   |
| Origin.CLE       |               0.279886   |
| Origin.IAD       |               0.279167   |
| Dest.MIA         |               0.277144   |
| Origin.PHF       |               0.274691   |
| Origin.SMF       |               0.274546   |
| DayOfWeek.5      |               0.266387   |
| Year.1989        |               0.264867   |
| Dest.PDX         |               0.262286   |
| Dest.BUF         |               0.257864   |
| Dest.MCO         |               0.257      |
| Origin.ROC       |               0.25666    |
| Dest.LAX         |               0.233518   |
| Dest.IAD         |               0.228351   |
| Dest.MKE         |               0.226911   |
| Origin.BUF       |               0.225691   |
| Origin.PHX       |               0.225306   |
| Year.2000        |               0.224469   |
| Origin.BGM       |               0.222624   |
| Dest.CLT         |               0.215434   |
| Year.1987        |               0.213982   |
| Month.10         |               0.213982   |
| Origin.BOS       |               0.212845   |
| Dest.PVD         |               0.206415   |
| Origin.EWR       |               0.19826    |
| Dest.ROC         |               0.195916   |
| Origin.MKE       |               0.192487   |
| Dest.DTW         |               0.191773   |
| UniqueCarrier.UA |               0.18491    |
| Origin.BWI       |               0.184507   |
| Origin.TPA       |               0.184506   |
| Dest.AUS         |               0.181474   |
| Dest.ONT         |               0.18135    |
| Origin.DAL       |               0.176221   |
| DayOfWeek.6      |               0.174946   |
| UniqueCarrier.AA |               0.170341   |
| DayOfWeek.2      |               0.169405   |
| Origin.SJC       |               0.168895   |
| Dest.BUR         |               0.166759   |
| Dest.BTV         |               0.162648   |
| Dest.SMF         |               0.158714   |
| Dest.SJC         |               0.152107   |
| Distance         |               0.152086   |
| Dest.ORD         |               0.150718   |
| Dest.TUS         |               0.14548    |
| Origin.CLT       |               0.143246   |
| Origin.MSP       |               0.139783   |
| Origin.COS       |               0.137617   |
| Dest.BWI         |               0.134686   |
| DayOfWeek.4      |               0.134527   |
| Origin.LAS       |               0.133967   |
| Dest.FAT         |               0.13061    |
| Dest.PHL         |               0.127272   |
| Dest.JFK         |               0.124315   |
| Origin.DCA       |               0.121861   |
| Origin.SFO       |               0.120471   |
| Dest.SNA         |               0.117144   |
| Month.1          |               0.112903   |
| Dest.ALB         |               0.112647   |
| Dest.PHX         |               0.105518   |
| Year.1999        |               0.102472   |
| Year.1993        |               0.100089   |
| Dest.MHT         |               0.0965757  |
| Dest.OAK         |               0.0947442  |
| Origin.MEM       |               0.0924573  |
| Origin.SWF       |               0.0921668  |
| Dest.MSP         |               0.0911779  |
| Dest.LAS         |               0.0899772  |
| Origin.PDX       |               0.0880033  |
| Dest.OMA         |               0.0875835  |
| Dest.MDW         |               0.0836909  |
| Dest.MCI         |               0.0834905  |
| Dest.MYR         |               0.0817729  |
| Dest.RSW         |               0.0806226  |
| Dest.BNA         |               0.0729002  |
| Origin.BNA       |               0.0721146  |
| Dest.ORF         |               0.0693907  |
| Dest.BOI         |               0.0686461  |
| Intercept        |               0.0685957  |
| Origin.DSM       |               0.0673692  |
| Dest.ILM         |               0.0645363  |
| Origin.DEN       |               0.0645087  |
| Origin.EGE       |               0.0604727  |
| Origin.AVP       |               0.057963   |
| Origin.PVD       |               0.0572925  |
| Dest.HOU         |               0.0555928  |
| Origin.ABQ       |               0.0545723  |
| Dest.DCA         |               0.0513871  |
| Dest.ATL         |               0.0474207  |
| Origin.ELP       |               0.0459049  |
| Origin.RDU       |               0.0457665  |
| Origin.GEG       |               0.0439692  |
| DayOfWeek.3      |               0.0425205  |
| Origin.RSW       |               0.0382602  |
| Origin.SNA       |               0.0293943  |
| Origin.HOU       |               0.0278525  |
| Dest.DEN         |               0.02111    |
| Dest.RNO         |               0.0147958  |
| FlightNum        |               0.0142613  |
| Dest.LGA         |               0.0138716  |
| Origin.ONT       |               0.0121315  |
| Dest.SYR         |               0.0103941  |
| DayOfWeek.7      |               0.0102764  |
| Dest.SBN         |               0.00860861 |
| Origin.SAT       |               0.00572206 |
| Origin.DTW       |               0.00555071 |
| Origin.BHM       |               0.00399109 |
| Origin.CHS       |               0.00323463 |
| Dest.ABE         |               0.00184446 |
| Year.1998        |               0.00144204 |
| Dest.LIT         |               0          |
| Dest.OKC         |               0          |
| Dest.HPN         |               0          |
| Dest.ORH         |               0          |
| Origin.ANC       |               0          |
| Dest.LIH         |               0          |
| Dest.SRQ         |               0          |
| Dest.MRY         |               0          |
| Dest.SJU         |               0          |
| UniqueCarrier.DL |               0          |
| Dest.MSY         |               0          |
| Origin.GRR       |               0          |
| Year.1988        |               0          |
| Dest.PIT         |               0          |
| Dest.DSM         |               0          |
| Origin.CVG       |               0          |
| Dest.HNL         |               0          |
| Origin.SCK       |               0          |
| Dest.PWM         |               0          |
| Origin.ISP       |               0          |
| Origin.RIC       |               0          |
| Origin.CMH       |               0          |
| Origin.LAN       |               0          |
| Origin.ICT       |               0          |
| Origin.LIT       |               0          |
| Dest.GRR         |               0          |
| Dest.SWF         |               0          |
| Origin.KOA       |               0          |
| Dest.DAL         |               0          |
| Origin.UCA       |               0          |
| Dest.SAN         |               0          |
| Origin.AMA       |               0          |
| Origin.SEA       |               0          |
| Dest.DFW         |               0          |
| Dest.ERI         |               0          |
| Dest.ACY         |               0          |
| Dest.FNT         |               0          |
| Dest.GSP         |               0          |
| Dest.CRP         |               0          |
| Origin.MFR       |               0          |
| Origin.MHT       |               0          |
| Dest.JAN         |               0          |
| Origin.STT       |               0          |
| Dest.EWR         |               0          |
| Dest.ELP         |               0          |
| Dest.BHM         |               0          |
| Dest.ELM         |               0          |
| Dest.CHA         |               0          |
| Origin.SBN       |               0          |
| Origin.SJU       |               0          |
| Origin.ORF       |               0          |
| Origin.OGG       |               0          |
| Dest.MDT         |               0          |
| Origin.HRL       |               0          |
| Origin.GNV       |               0          |
| Dest.AMA         |               0          |
| Dest.CAK         |               0          |
| Dest.LEX         |               0          |
| Dest.BOS         |               0          |
| Dest.RIC         |               0          |
| Origin.JAN       |               0          |
| Dest.ROA         |               0          |
| Dest.OGG         |               0          |
| UniqueCarrier.PS |               0          |
| Origin.SDF       |               0          |
| Dest.AVL         |               0          |
| Dest.MAF         |               0          |
| Dest.PHF         |               0          |
| Dest.TOL         |               0          |
| Dest.ANC         |               0          |
| Dest.HRL         |               0          |
| Dest.AVP         |               0          |
| Dest.RDU         |               0          |
| DayOfWeek.1      |               0          |
| Dest.STT         |               0          |
| Dest.EYW         |               0          |
| Origin.MAF       |               0          |
| Dest.EUG         |               0          |
| Dest.SCK         |               0          |
| Origin.BIL       |               0          |
| Dest.CVG         |               0          |
| Dest.COS         |               0          |
Out[15]:
[(u'Year', 1041.415283203125, 1.0, 0.547312376507354),
 (u'Origin', 523.938720703125, 0.5031025846784432, 0.27535427124735096),
 (u'FlightNum', 101.34282684326172, 0.09731259803635424, 0.053260389295381905),
 (u'DayOfWeek', 93.48446655273438, 0.08976675113236311, 0.04913045389359786),
 (u'Distance', 64.25371551513672, 0.06169845646734591, 0.03376832883597861),
 (u'Dest', 55.47443771362305, 0.05326831534774291, 0.029154408265516324),
 (u'UniqueCarrier',
  22.870986938476562,
  0.021961447375855002,
  0.012019771954820393),
 (u'Month', 0.0, 0.0, 0.0)]

In [16]:
# Model performance of GBM model on test data
data_gbm2.model_performance(test)


ModelMetricsBinomial: gbm
** Reported on test data. **

MSE: 0.204516957162
RMSE: 0.452235510726
LogLoss: 0.595705258221
Mean Per-Class Error: 0.314546559781
AUC: 0.745583363998
Gini: 0.491166727997
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.378769592765: 
NO YES Error Rate
NO 2300.0 2967.0 0.5633 (2967.0/5267.0)
YES 814.0 4942.0 0.1414 (814.0/5756.0)
Total 3114.0 7909.0 0.343 (3781.0/11023.0)
Maximum Metrics: Maximum metrics at their respective thresholds

metric threshold value idx
max f1 0.3787696 0.7233077 284.0
max f2 0.1839260 0.8460443 372.0
max f0point5 0.5536517 0.7028692 193.0
max accuracy 0.5048457 0.6860201 220.0
max precision 0.9688600 1.0 0.0
max recall 0.0689509 1.0 398.0
max specificity 0.9688600 1.0 0.0
max absolute_mcc 0.5048457 0.3708690 220.0
max min_per_class_accuracy 0.5163738 0.6829315 214.0
max mean_per_class_accuracy 0.5048457 0.6854534 220.0
Gains/Lift Table: Avg response rate: 52.22 %

group cumulative_data_fraction lower_threshold lift cumulative_lift response_rate cumulative_response_rate capture_rate cumulative_capture_rate gain cumulative_gain
1 0.0100699 0.9336374 1.7597712 1.7597712 0.9189189 0.9189189 0.0177206 0.0177206 75.9771238 75.9771238
2 0.0200490 0.9086334 1.6887217 1.7244072 0.8818182 0.9004525 0.0168520 0.0345726 68.8721650 72.4407190
3 0.0301188 0.8954846 1.7080133 1.7189261 0.8918919 0.8975904 0.0171994 0.0517721 70.8013260 71.8926087
4 0.0400980 0.8821818 1.7757692 1.7330726 0.9272727 0.9049774 0.0177206 0.0694927 77.5769158 73.3072552
5 0.0500771 0.8675461 1.6190836 1.7103574 0.8454545 0.8931159 0.0161571 0.0856498 61.9083644 71.0357371
6 0.1002449 0.8094473 1.6068372 1.6585504 0.8390597 0.8660633 0.0806115 0.1662613 60.6837177 65.8550433
7 0.1505035 0.7670999 1.5209745 1.6126089 0.7942238 0.8420735 0.0764420 0.2427033 52.0974503 61.2608862
8 0.2001270 0.7295836 1.4494126 1.5721427 0.7568556 0.8209429 0.0719249 0.3146282 44.9412615 57.2142703
9 0.3000091 0.6659620 1.3775802 1.5073670 0.7193460 0.7871182 0.1375956 0.4522238 37.7580177 50.7366973
10 0.3999819 0.5947748 1.1625819 1.4211902 0.6070780 0.7421184 0.1162265 0.5684503 16.2581868 42.1190247
11 0.5000454 0.5214679 1.0590912 1.3487310 0.5530372 0.7042816 0.1059764 0.6744267 5.9091164 34.8731014
12 0.6000181 0.4510046 0.9436203 1.2812330 0.4927405 0.6690354 0.0943363 0.7687630 -5.6379739 28.1232972
13 0.7000816 0.3888302 0.7413638 1.2040688 0.3871260 0.6287417 0.0741835 0.8429465 -25.8636185 20.4068831
14 0.7999637 0.3223049 0.6505240 1.1349542 0.3396912 0.5926514 0.0649757 0.9079222 -34.9476028 13.4954191
15 0.9004808 0.2554322 0.5202424 1.0663364 0.2716606 0.5568205 0.0522933 0.9602154 -47.9757585 6.6336355
16 1.0 0.0578914 0.3997679 1.0 0.2087511 0.5221809 0.0397846 1.0 -60.0232139 0.0

Out[16]: