In [2]:
import h2o
import pandas
import pprint
import operator
import matplotlib
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from tabulate import tabulate
In [3]:
# Connect to an H2O cluster (starts a local single-node cluster if none is
# already running on the default host/port).
h2o.init()
In [4]:
# set this to True if interactive (matplotlib) plots are desired
interactive = False
if not interactive:
    # Select the non-interactive Agg backend so figures render without a
    # display. NOTE: the `warn` kwarg was deprecated in matplotlib 2.2 and
    # removed in 3.1, so it must no longer be passed.
    matplotlib.use('Agg')
# pyplot must be imported *after* the backend is chosen for use() to take effect.
import matplotlib.pyplot as plt
In [5]:
# NOTE(review): _locate is a private h2o helper that resolves paths inside the
# h2o git checkout; it may move or disappear between h2o versions — confirm
# against the installed h2o release.
from h2o.h2o import _locate # private function. used to find files within h2o git project directory.
# Alternative (larger) datasets, kept for easy switching:
# air_path = [_locate("bigdata/laptop/airlines_all.05p.csv")]
# air_path = [_locate("bigdata/laptop/flights-nyc/flights14.csv.zip")]
air_path = [_locate("smalldata/airlines/allyears2k_headers.zip")]
# ----------
# 1- Load data - 1 row per flight. Has columns showing the origin,
# destination, departure and arrival time, carrier information, and
# whether the flight was delayed.
# Use print() so the script runs under both Python 2 and Python 3.
print("Import and Parse airlines data")
data = h2o.import_file(path=air_path)
data.describe()
In [6]:
# ----------
# 2- Data exploration and munging. Generate scatter plots
# of various columns and plot fitted GLM model.
# NOTE(review): the notebook export stripped all indentation from this
# function; the structure below is reconstructed from the original syntax.
def scatter_plot(data, x, y, max_points=1000, fit=True):
    """Plot column ``y`` against column ``x`` of the H2OFrame ``data``.

    A random subset of at most roughly ``max_points`` rows is pulled into
    pandas for plotting.  When ``fit`` is True, a univariate gaussian GLM is
    trained on the full frame and its fitted line is overlaid on the scatter
    plot.  If ``x`` is a string column a box-and-whisker plot is drawn
    instead.  Plots are only shown when the module-level ``interactive``
    flag is True.
    """
    if fit:
        lr = H2OGeneralizedLinearEstimator(family="gaussian")
        lr.train(x=x, y=y, training_frame=data)
        coeff = lr.coef()
    df = data[[x, y]]
    # Down-sample so at most ~max_points rows are transferred to the client.
    runif = df[y].runif()
    df_subset = df[runif < float(max_points) / data.nrow]
    df_py = h2o.as_list(df_subset)
    if fit:
        h2o.remove(lr._id)  # free the temporary model on the cluster
    # If x variable is string, generate box-and-whisker plot
    if df_py[x].dtype == "object":
        if interactive: df_py.boxplot(column=y, by=x)
    # Otherwise, generate a scatter plot
    else:
        if interactive: df_py.plot(x=x, y=y, kind="scatter")
    if fit:
        # Overlay the fitted regression line between the observed x extremes.
        x_min = min(df_py[x])
        x_max = max(df_py[x])
        y_min = coeff["Intercept"] + coeff[x] * x_min
        y_max = coeff["Intercept"] + coeff[x] * x_max
        plt.plot([x_min, x_max], [y_min, y_max], "k-")
    if interactive: plt.show()

scatter_plot(data, "Distance", "AirTime", fit=True)
scatter_plot(data, "UniqueCarrier", "ArrDelay", max_points=5000, fit=False)
In [7]:
# Group flights by month and count flights / sum cancellations per month.
grouped = data.group_by("Month")
bpd = grouped.count().sum("Cancelled").frame
bpd.show()
bpd.describe()
# A bare `bpd.dim` in the middle of a cell displays nothing (only the last
# expression of a cell is echoed); print it so the dimensions are reported.
print(bpd.dim)
# Convert columns to factors (categoricals) so the models treat them as
# nominal rather than numeric predictors.
data["Year"] = data["Year"].asfactor()
data["Month"] = data["Month"].asfactor()
data["DayOfWeek"] = data["DayOfWeek"].asfactor()
data["Cancelled"] = data["Cancelled"].asfactor()
In [8]:
# Calculate and plot travel time.
# CRSArrTime / CRSDepTime are scheduled times encoded as hhmm integers, so
# dividing by 100 extracts the hour and the remainder gives the minutes;
# both are converted to minutes-since-midnight below.
hour1 = data["CRSArrTime"] / 100
mins1 = data["CRSArrTime"] % 100
arrTime = hour1*60 + mins1
hour2 = data["CRSDepTime"] / 100
mins2 = data["CRSDepTime"] % 100
depTime = hour2*60 + mins2
# TODO: Replace this once list comprehension is supported. See PUBDEV-1286.
# data["TravelTime"] = [x if x > 0 else None for x in (arrTime - depTime)]
# Non-positive differences (presumably flights crossing midnight) are mapped
# to missing values via an H2OFrame of Nones — TODO confirm this handling.
data["TravelTime"] = h2o.ifelse((arrTime-depTime) > 0, (arrTime-depTime), h2o.H2OFrame(python_obj=[[None] * data.nrow]))
scatter_plot(data, "Distance", "TravelTime")
In [9]:
# Impute missing values of "Distance" (grouped by Origin/Dest) and re-plot.
# NOTE(review): the original comment said "Impute missing travel times", but
# the code imputes the "Distance" column — confirm which column was intended.
data.impute(column = "Distance", by = ["Origin", "Dest"])
scatter_plot(data, "Distance", "TravelTime")
In [10]:
# ----------
# 3- Fit a model on train; using test as validation
# Create test/train split (~75% train / ~25% test via a uniform random column).
s = data["Year"].runif()
train = data[s <= 0.75]
test = data[s > 0.75]
# Set predictor and response variables
myY = "IsDepDelayed"
myX = ["Origin", "Dest", "Year", "UniqueCarrier", "DayOfWeek", "Month", "Distance", "FlightNum"]

def _fit(model):
    """Train `model` on the shared predictors/response and train/test split, and return it."""
    model.train(x=myX, y=myY, training_frame=train, validation_frame=test)
    return model

# Simple GLM - Predict Delays
data_glm = _fit(H2OGeneralizedLinearEstimator(family="binomial", standardize=True))
# Simple GBM (shallow, few trees)
data_gbm = _fit(H2OGradientBoostingEstimator(balance_classes=True, ntrees=3, max_depth=1,
                                             distribution="bernoulli", learn_rate=0.1, min_rows=2))
# Complex GBM (deeper, more trees)
data_gbm2 = _fit(H2OGradientBoostingEstimator(balance_classes=True, ntrees=50, max_depth=5,
                                              distribution="bernoulli", learn_rate=0.1, min_rows=2))
# Simple Random Forest
data_rf = _fit(H2ORandomForestEstimator(ntrees=5, max_depth=2, balance_classes=True))
# Complex Random Forest
data_rf2 = _fit(H2ORandomForestEstimator(ntrees=10, max_depth=5, balance_classes=True))
# Deep Learning with 5 epochs
data_dl = _fit(H2ODeepLearningEstimator(hidden=[10, 10], epochs=5, variable_importances=True,
                                        balance_classes=True, loss="Automatic"))
In [11]:
# Variable importances from each algorithm
# Calculate magnitude of normalized GLM coefficients.
# Use .items() instead of the Python-2-only .iteritems() so this runs on
# both Python 2 and Python 3.
glm_varimp = {k: abs(v) for k, v in data_glm.coef_norm().items()}
# Sort in descending order by magnitude
glm_sorted = sorted(glm_varimp.items(), key=operator.itemgetter(1), reverse=True)
table = tabulate(glm_sorted, headers=["Predictor", "Normalized Coefficient"], tablefmt="orgtbl")
# print() works on both Python 2 (single argument) and Python 3.
print("Variable Importances:\n\n" + table)
data_gbm.varimp()
data_rf.varimp()
In [12]:
# Model performance of the complex GBM model on the held-out test data.
# As the last expression in the cell, its metrics are displayed in the notebook.
data_gbm2.model_performance(test)
Out[12]: