In [1]:
import h2o
import pandas
import pprint
import operator
import matplotlib
from tabulate import tabulate
In [2]:
# Connect to a cluster
# h2o.init() with no arguments attaches to an H2O instance at the default
# local address (starting one if none is running, per the h2o docs).
h2o.init()
In [3]:
# set this to True if interactive (matplotlib) plots are desired
interactive = False
# Select the non-interactive Agg backend BEFORE pyplot is imported so that
# plots render off-screen (e.g. in headless/batch runs).
# NOTE: matplotlib.use()'s `warn` keyword was deprecated in matplotlib 3.0
# and removed in 3.1, so it must not be passed.
if not interactive: matplotlib.use('Agg')
import matplotlib.pyplot as plt
In [4]:
from h2o.utils.shared_utils import _locate # private function. used to find files within h2o git project directory.
# Alternative (larger) datasets, kept for reference:
# air_path = [_locate("bigdata/laptop/airlines_all.05p.csv")]
# air_path = [_locate("bigdata/laptop/flights-nyc/flights14.csv.zip")]
air_path = [_locate("smalldata/airlines/allyears2k_headers.zip")]
# ----------
# 1- Load data - 1 row per flight. Has columns showing the origin,
# destination, departure and arrival time, carrier information, and
# whether the flight was delayed.
print("Import and Parse airlines data")
# Import the zipped CSV into the H2O cluster and parse it into an H2OFrame.
data = h2o.import_file(path=air_path)
# Show per-column summary statistics of the parsed frame.
data.describe()
In [5]:
# ----------
# 2- Data exploration and munging. Generate scatter plots
# of various columns and plot fitted GLM model.
# Function to fit a GLM model and plot the fitted (x,y) values
def scatter_plot(data, x, y, max_points = 1000, fit = True):
    """Plot a random ~max_points-row sample of (x, y) from an H2OFrame.

    If `x` is a string column, a box-and-whisker plot of y grouped by x is
    drawn; otherwise a scatter plot.  When fit=True a one-predictor gaussian
    GLM (y ~ x) is fitted on the full frame and its regression line is
    overlaid.  Plots are only shown when the module-level `interactive`
    flag is True.

    :param data: H2OFrame containing columns `x` and `y`.
    :param x: name of the predictor column.
    :param y: name of the response column.
    :param max_points: approximate number of rows to sample for plotting.
    :param fit: fit and draw a gaussian GLM line (numeric x only).
    """
    if(fit):
        # Fit a simple gaussian GLM and keep its coefficient table.
        lr = h2o.glm(x = data[[x]], y = data[y], family = "gaussian")
        coeff = lr.coef()
    df = data[[x,y]]
    # Uniform random column; rows with runif below max_points/nrow form
    # an (approximately) max_points-sized sample.
    runif = df[y].runif()
    df_subset = df[runif < float(max_points)/data.nrow]
    # Pull the sample into client memory for local plotting.
    # NOTE(review): assumes h2o.as_list returns a pandas DataFrame (true for
    # the h2o version this demo targets) — confirm, since later versions may
    # return a list of lists, which would break the .dtype access below.
    df_py = h2o.as_list(df_subset)
    if(fit): h2o.remove(lr._id)  # free the temporary GLM on the cluster
    # If x variable is string, generate box-and-whisker plot
    if(df_py[x].dtype == "object"):
        if interactive: df_py.boxplot(column = y, by = x)
    # Otherwise, generate a scatter plot
    else:
        if interactive: df_py.plot(x = x, y = y, kind = "scatter")
    if(fit):
        # Draw the fitted line between the sampled x extremes:
        # y = Intercept + coef(x) * x.
        x_min = min(df_py[x])
        x_max = max(df_py[x])
        y_min = coeff["Intercept"] + coeff[x]*x_min
        y_max = coeff["Intercept"] + coeff[x]*x_max
        plt.plot([x_min, x_max], [y_min, y_max], "k-")
    if interactive: plt.show()
scatter_plot(data, "Distance", "AirTime", fit = True)
scatter_plot(data, "UniqueCarrier", "ArrDelay", max_points = 5000, fit = False)
In [6]:
# Group flights by month
grouped = data.group_by("Month")
# Per-month row count plus the sum of the Cancelled indicator column.
bpd = grouped.count().sum("Cancelled").frame
bpd.show()
bpd.describe()
bpd.dim  # NOTE(review): bare expression — its value is discarded when run as a script (only echoed if last in a notebook cell)
# Convert columns to factors (categoricals) so the models below treat them
# as discrete levels rather than numbers.
data["Year"] = data["Year"] .asfactor()
data["Month"] = data["Month"] .asfactor()
data["DayOfWeek"] = data["DayOfWeek"].asfactor()
data["Cancelled"] = data["Cancelled"].asfactor()
In [7]:
# Calculate and plot travel time
# Scheduled times are encoded as hhmm integers (e.g. 1430 == 2:30 pm);
# split into hours/minutes and convert to minutes past midnight.
# NOTE(review): if `/ 100` performs true (float) division on the H2O backend,
# hour1/hour2 keep a fractional part and the computed times are skewed —
# confirm whether floor division (hhmm // 100) was intended.
hour1 = data["CRSArrTime"] / 100
mins1 = data["CRSArrTime"] % 100
arrTime = hour1*60 + mins1
hour2 = data["CRSDepTime"] / 100
mins2 = data["CRSDepTime"] % 100
depTime = hour2*60 + mins2
# TODO: Replace this once list comprehension is supported. See PUBDEV-1286.
# data["TravelTime"] = [x if x > 0 else None for x in (arrTime - depTime)]
# Keep only positive durations; non-positive differences (e.g. overnight
# flights or bad data) become missing values.
data["TravelTime"] = (arrTime-depTime > 0).ifelse((arrTime-depTime), h2o.H2OFrame([[None]]*data.nrow))
scatter_plot(data, "Distance", "TravelTime")
In [8]:
# Impute missing travel times and re-plot
# NOTE(review): the comment above says "travel times" but the code imputes the
# "Distance" column (grouped by Origin/Dest) — confirm which column was meant;
# imputing "TravelTime" would match the narrative.
data.impute(column = "Distance", by = ["Origin", "Dest"])
scatter_plot(data, "Distance", "TravelTime")
In [9]:
# ----------
# 3- Fit a model on train; using test as validation
# NOTE(review): h2o.glm / h2o.gbm / h2o.random_forest / h2o.deeplearning with
# x=<frame>, y=<frame>, validation_x/validation_y is the legacy h2o Python
# wrapper API; newer h2o releases use H2O...Estimator classes with .train().
# Confirm the pinned h2o version still provides these wrappers.
# Create test/train split
s = data["Year"].runif()  # one uniform [0,1) draw per row
train = data[s <= 0.75]   # ~75% of rows
test = data[s > 0.75]     # remaining ~25%
# Set predictor and response variables
myY = "IsDepDelayed"
myX = ["Origin", "Dest", "Year", "UniqueCarrier", "DayOfWeek", "Month", "Distance", "FlightNum"]
# Simple GLM - Predict Delays (binomial = logistic regression,
# with predictors standardized before fitting)
data_glm = h2o.glm(x =train[myX],
y =train[myY],
validation_x=test [myX],
validation_y=test [myY],
family ="binomial",
standardize =True)
# Simple GBM (intentionally tiny: 3 trees, depth 1 — a weak baseline)
data_gbm = h2o.gbm(x =train[myX],
y =train[myY],
validation_x =test [myX],
validation_y =test [myY],
balance_classes=True,
ntrees =3,
max_depth =1,
distribution ="bernoulli",
learn_rate =0.1,
min_rows =2)
# Complex GBM (same settings but 50 trees, depth 5)
data_gbm2 = h2o.gbm(x =train[myX],
y =train[myY],
validation_x =test [myX],
validation_y =test [myY],
balance_classes=True,
ntrees =50,
max_depth =5,
distribution ="bernoulli",
learn_rate =0.1,
min_rows =2)
# Simple Random Forest (5 trees, depth 2)
data_rf = h2o.random_forest(x =train[myX],
y =train[myY],
validation_x =test [myX],
validation_y =test [myY],
ntrees =5,
max_depth =2,
balance_classes=True)
# Complex Random Forest (10 trees, depth 5)
data_rf2 = h2o.random_forest(x =train[myX],
y =train[myY],
validation_x =test [myX],
validation_y =test [myY],
ntrees =10,
max_depth =5,
balance_classes=True)
# Deep Learning with 5 epochs (two hidden layers of 10 units;
# variable_importances=True so importances can be queried later)
data_dl = h2o.deeplearning(x =train[myX],
y =train[myY],
validation_x =test [myX],
validation_y =test [myY],
hidden =[10,10],
epochs =5,
variable_importances=True,
balance_classes =True,
loss ="Automatic")
In [11]:
# Variable importances from each algorithm
# Calculate magnitude of normalized GLM coefficients: for a standardized GLM,
# |coefficient| is a rough proxy for variable importance.
glm_varimp = data_glm.coef_norm()
# Dict comprehension replaces the old `from six import iteritems` loop — it
# behaves identically on Python 2.7+ and 3.x, drops the third-party `six`
# dependency, and avoids mutating the dict while iterating over it.
glm_varimp = {k: abs(v) for k, v in glm_varimp.items()}
# Sort in descending order by magnitude
glm_sorted = sorted(glm_varimp.items(), key = operator.itemgetter(1), reverse = True)
table = tabulate(glm_sorted, headers = ["Predictor", "Normalized Coefficient"], tablefmt = "orgtbl")
print("Variable Importances:\n\n" + table)
# Tree-based models expose their importances directly.
data_gbm.varimp()
data_rf.varimp()
Out[11]:
In [12]:
# Model performance of GBM model on test data
# Scores the complex GBM (data_gbm2: 50 trees, depth 5) against the held-out
# test frame; the returned metrics object is echoed as the cell output.
data_gbm2.model_performance(test)
Out[12]: