In [1]:
import h2o
In [3]:
h2o.connect(ip="35.196.153.55")
Out[3]:
In [5]:
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
In [7]:
air = h2o.import_file("https://raw.github.com/0xdata/h2o/master/smalldata/airlines/allyears2k_headers.zip")
In [8]:
# Set this to True if interactive (matplotlib) plots are desired
interactive = False
import matplotlib
if not interactive: matplotlib.use('Agg')
import matplotlib.pyplot as plt
In [9]:
air_path = "https://raw.github.com/0xdata/h2o/master/smalldata/airlines/allyears2k_headers.zip"
# ----------
# 1- Load data - 1 row per flight. Has columns showing the origin,
# destination, departure and arrival time, carrier information, and
# whether the flight was delayed.
print("Import and Parse airlines data")
data = h2o.import_file(path=air_path)
data.describe()
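# Optional sanity check of the columns this demo relies on. The names below
# are the ones referenced throughout the notebook; adjust the list if your
# copy of the dataset differs.
cols_of_interest = ["Year", "Month", "Origin", "Dest", "UniqueCarrier",
                    "Distance", "ArrDelay", "IsDepDelayed", "Cancelled"]
data[cols_of_interest].head()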
In [10]:
# ----------
# 2- Data exploration and munging. Generate scatter plots
# of various columns and plot fitted GLM model.
# Function to fit a GLM model and plot the fitted (x,y) values
def scatter_plot(data, x, y, max_points = 1000, fit = True):
    if fit:
        lr = H2OGeneralizedLinearEstimator(family = "gaussian")
        lr.train(x=x, y=y, training_frame=data)
        coeff = lr.coef()
    df = data[[x, y]]
    runif = df[y].runif()
    df_subset = df[runif < float(max_points)/data.nrow]
    df_py = h2o.as_list(df_subset)
    if fit: h2o.remove(lr._id)

    # If x variable is string, generate box-and-whisker plot
    if df_py[x].dtype == "object":
        if interactive: df_py.boxplot(column = y, by = x)
    # Otherwise, generate a scatter plot
    else:
        if interactive: df_py.plot(x = x, y = y, kind = "scatter")

    if fit:
        x_min = min(df_py[x])
        x_max = max(df_py[x])
        y_min = coeff["Intercept"] + coeff[x]*x_min
        y_max = coeff["Intercept"] + coeff[x]*x_max
        plt.plot([x_min, x_max], [y_min, y_max], "k-")
    if interactive: plt.show()
scatter_plot(data, "Distance", "AirTime", fit = True)
scatter_plot(data, "UniqueCarrier", "ArrDelay", max_points = 5000, fit = False)
In [11]:
# Group flights by month
grouped = data.group_by("Month")
bpd = grouped.count().sum("Cancelled").frame
bpd.show()
bpd.describe()
bpd.dim
# Convert columns to factors
data["Year"] = data["Year"] .asfactor()
data["Month"] = data["Month"] .asfactor()
data["DayOfWeek"] = data["DayOfWeek"].asfactor()
data["Cancelled"] = data["Cancelled"].asfactor()
In [12]:
# Calculate and plot travel time
hour1 = data["CRSArrTime"] / 100
mins1 = data["CRSArrTime"] % 100
arrTime = hour1*60 + mins1
hour2 = data["CRSDepTime"] / 100
mins2 = data["CRSDepTime"] % 100
depTime = hour2*60 + mins2
# TODO: Replace this once list comprehension is supported. See PUBDEV-1286.
# data["TravelTime"] = [x if x > 0 else None for x in (arrTime - depTime)]
data["TravelTime"] = (arrTime-depTime > 0).ifelse((arrTime-depTime), h2o.H2OFrame([[None]] * data.nrow))
scatter_plot(data, "Distance", "TravelTime")
In [13]:
# Impute missing Distance values (grouped by Origin and Dest) and re-plot travel time against distance
data.impute(column = "Distance", by = ["Origin", "Dest"])
scatter_plot(data, "Distance", "TravelTime")
In [14]:
# ----------
# 3- Fit models on the training set, using the test set for validation
# Create test/train split
s = data["Year"].runif()
train = data[s <= 0.75]
test = data[s > 0.75]
# Set predictor and response variables
myY = "IsDepDelayed"
myX = ["Origin", "Dest", "Year", "UniqueCarrier", "DayOfWeek", "Month", "Distance", "FlightNum"]
# Simple GLM - Predict Delays
data_glm = H2OGeneralizedLinearEstimator(family="binomial", standardize=True)
data_glm.train(x=myX, y=myY, training_frame=train, validation_frame=test)

# Simple GBM
data_gbm = H2OGradientBoostingEstimator(balance_classes=True,
                                        ntrees=3,
                                        max_depth=1,
                                        distribution="bernoulli",
                                        learn_rate=0.1,
                                        min_rows=2)
data_gbm.train(x=myX, y=myY, training_frame=train, validation_frame=test)

# Complex GBM
data_gbm2 = H2OGradientBoostingEstimator(balance_classes=True,
                                         ntrees=50,
                                         max_depth=5,
                                         distribution="bernoulli",
                                         learn_rate=0.1,
                                         min_rows=2)
data_gbm2.train(x=myX, y=myY, training_frame=train, validation_frame=test)

# Simple Random Forest
data_rf = H2ORandomForestEstimator(ntrees=5, max_depth=2, balance_classes=True)
data_rf.train(x=myX, y=myY, training_frame=train, validation_frame=test)

# Complex Random Forest
data_rf2 = H2ORandomForestEstimator(ntrees=10, max_depth=5, balance_classes=True)
data_rf2.train(x=myX, y=myY, training_frame=train, validation_frame=test)

# Deep Learning with 5 epochs
data_dl = H2ODeepLearningEstimator(hidden=[10, 10],
                                   epochs=5,
                                   variable_importances=True,
                                   balance_classes=True,
                                   loss="Automatic")
data_dl.train(x=myX, y=myY, training_frame=train, validation_frame=test)
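# Quick comparison of the six fitted models on the validation split.
# auc(valid=True) is available because IsDepDelayed makes these binomial
# models; model_id is the key H2O assigned to each model.
for model in [data_glm, data_gbm, data_gbm2, data_rf, data_rf2, data_dl]:
    print(model.model_id, model.auc(valid=True))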
In [15]:
# Variable importances from each algorithm
# Calculate magnitude of normalized GLM coefficients
import operator
from six import iteritems
from tabulate import tabulate

glm_varimp = data_glm.coef_norm()
for k, v in iteritems(glm_varimp):
    glm_varimp[k] = abs(glm_varimp[k])

# Sort in descending order by magnitude
glm_sorted = sorted(glm_varimp.items(), key=operator.itemgetter(1), reverse=True)
table = tabulate(glm_sorted, headers=["Predictor", "Normalized Coefficient"], tablefmt="orgtbl")
print("Variable Importances:\n\n" + table)

data_gbm.varimp()
data_rf.varimp()
Out[15]:
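# The tree-based importances above print as tables; passing use_pandas=True
# returns them as pandas DataFrames instead, which is handier for sorting or
# plotting (sketch; requires pandas to be installed).
gbm_vi = data_gbm.varimp(use_pandas=True)
rf_vi  = data_rf.varimp(use_pandas=True)
print(gbm_vi.head())
print(rf_vi.head())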
In [16]:
# Model performance of the complex GBM model (data_gbm2) on test data
data_gbm2.model_performance(test)
Out[16]:
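# The performance object above bundles several metrics; individual ones can
# be pulled out directly (sketch):
perf = data_gbm2.model_performance(test)
print("AUC:     ", perf.auc())
print("LogLoss: ", perf.logloss())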