In [1]:
import h2o
import pandas
import pprint
import operator
import matplotlib.pyplot as plt
from tabulate import tabulate
In [2]:
# Connect to an H2O cluster (starts a local one if none is already running)
h2o.init()
In [3]:
# Alternative, larger datasets (kept for provenance; require the bigdata checkout):
# air_path = [h2o.locate("bigdata/laptop/airlines_all.05p.csv")]
# air_path = [h2o.locate("bigdata/laptop/flights-nyc/flights14.csv.zip")]
air_path = [h2o.locate("smalldata/airlines/allyears2k_headers.zip")]
# ----------
# 1- Load data - 1 row per flight. Has columns showing the origin,
# destination, departure and arrival time, carrier information, and
# whether the flight was delayed.
# print(...) with a single string argument behaves identically under
# Python 2 and 3, unlike the Python-2-only `print "..."` statement.
print("Import and Parse airlines data")
data = h2o.import_frame(path=air_path)
data.describe()
In [4]:
# ----------
# 2- Data exploration and munging. Generate scatter plots
# of various columns and plot fitted GLM model.
def scatter_plot(data, x, y, max_points = 1000, fit = True):
    """Plot column y against column x on a random subsample of `data`.

    Parameters:
      data       -- H2O frame containing columns `x` and `y`
      x, y       -- column names (strings)
      max_points -- approximate number of rows to sample for plotting
      fit        -- when True, fit a one-predictor gaussian GLM of y on x
                    and overlay the fitted line on the scatter plot

    Side effects: renders a matplotlib figure via plt.show(); when
    fit=True, builds (and afterwards removes) a temporary GLM model on
    the cluster.
    """
    if fit:
        # One-predictor gaussian GLM gives the least-squares line y ~ x
        lr = h2o.glm(x = data[[x]], y = data[y], family = "gaussian")
        coeff = lr.coef()
    df = data[[x, y]]
    # Uniform random column used to down-sample to roughly max_points rows
    runif = df[y].runif()
    df_subset = df[runif < float(max_points)/data.nrow()]
    # Pull only the small subset into a local pandas frame for plotting
    df_py = h2o.as_list(df_subset)
    if fit: h2o.remove(lr._key)  # coefficients already copied; free cluster memory
    # If x variable is string, generate box-and-whisker plot
    if df_py[x].dtype == "object":
        df_py.boxplot(column = y, by = x)
    # Otherwise, generate a scatter plot
    else:
        df_py.plot(x = x, y = y, kind = "scatter")
    if fit:
        # Overlay the fitted regression line across the sampled x-range
        x_min = min(df_py[x])
        x_max = max(df_py[x])
        y_min = coeff["Intercept"] + coeff[x]*x_min
        y_max = coeff["Intercept"] + coeff[x]*x_max
        plt.plot([x_min, x_max], [y_min, y_max], "k-")
    plt.show()
# Numeric x ("Distance") yields a scatter plot with a fitted GLM line;
# string-typed x ("UniqueCarrier") yields a box-and-whisker plot instead.
scatter_plot(data, "Distance", "AirTime", fit = True)
scatter_plot(data, "UniqueCarrier", "ArrDelay", max_points = 5000, fit = False)
In [5]:
# Group flights by month: count rows and sum cancellations per month.
# Each aggregates entry is [aggregation op, column index, rows to include].
aggregates = {"Month": ["nrow", 0, "all"], "Cancelled": ["sum", 1, "all"]}
bpd = h2o.group_by(data, cols=["Month"], aggregates=aggregates)
bpd.show()
bpd.describe()
bpd.dim()
# Convert columns to factors so downstream models treat these numeric
# codes as discrete categorical levels rather than continuous values.
data["Year"] = data["Year"] .asfactor()
data["Month"] = data["Month"] .asfactor()
data["DayOfWeek"] = data["DayOfWeek"].asfactor()
data["Cancelled"] = data["Cancelled"].asfactor()
In [6]:
# Calculate and plot travel time.
# CRSArrTime / CRSDepTime are scheduled times encoded as HHMM integers
# (e.g. 1430 = 2:30pm); split them into hours and minutes and convert
# to minutes-past-midnight.
# NOTE(review): assumes `/ 100` on these H2O columns floors to whole
# hours (integer division) -- confirm; otherwise hour1 keeps a fraction.
hour1 = data["CRSArrTime"] / 100
mins1 = data["CRSArrTime"] % 100
arrTime = hour1*60 + mins1
hour2 = data["CRSDepTime"] / 100
mins2 = data["CRSDepTime"] % 100
depTime = hour2*60 + mins2
# Keep only positive durations; non-positive (e.g. overnight arrivals)
# become missing values.
# TODO: Replace this once list comprehension is supported. See PUBDEV-1286.
# data["TravelTime"] = [x if x > 0 else None for x in (arrTime - depTime)]
data["TravelTime"] = h2o.ifelse((arrTime-depTime) > 0, (arrTime-depTime), None)[0]
scatter_plot(data, "Distance", "TravelTime")
In [7]:
# Impute missing values and re-plot.
# NOTE(review): the code imputes "Distance" (grouped by Origin/Dest route
# means), not "TravelTime" as the original comment claimed -- confirm
# which column was intended.
h2o.impute(data = data, column = "Distance", by = ["Origin", "Dest"])
scatter_plot(data, "Distance", "TravelTime")
In [8]:
# ----------
# 3- Fit a model on train; using test as validation

# Random ~75/25 train/test split driven by a uniform column
s = data["Year"].runif()
train = data[s <= 0.75]
test = data[s > 0.75]

# Response and predictor columns shared by all models below
myY = "IsDepDelayed"
myX = ["Origin", "Dest", "Year", "UniqueCarrier", "DayOfWeek", "Month", "Distance", "FlightNum"]

# Simple GLM - Predict Delays (binomial logistic regression,
# standardized predictors, test split used as validation)
data_glm = h2o.glm(x=train[myX], y=train[myY],
                   validation_x=test[myX], validation_y=test[myY],
                   family="binomial", standardize=True)
# Simple GBM: a quick baseline with 3 stumps (depth-1 trees)
data_gbm = h2o.gbm(x=train[myX], y=train[myY],
                   validation_x=test[myX], validation_y=test[myY],
                   balance_classes=True, distribution="bernoulli",
                   ntrees=3, max_depth=1,
                   learn_rate=0.1, min_rows=2)

# Complex GBM: 50 deeper trees, otherwise the same settings
data_gbm2 = h2o.gbm(x=train[myX], y=train[myY],
                    validation_x=test[myX], validation_y=test[myY],
                    balance_classes=True, distribution="bernoulli",
                    ntrees=50, max_depth=5,
                    learn_rate=0.1, min_rows=2)
# Simple Random Forest: 5 shallow trees as a baseline
data_rf = h2o.random_forest(x=train[myX], y=train[myY],
                            validation_x=test[myX], validation_y=test[myY],
                            ntrees=5, max_depth=2,
                            balance_classes=True)

# Complex Random Forest: more and deeper trees
data_rf2 = h2o.random_forest(x=train[myX], y=train[myY],
                             validation_x=test[myX], validation_y=test[myY],
                             ntrees=10, max_depth=5,
                             balance_classes=True)
# Deep Learning: two hidden layers of 10 units, 5 epochs,
# with per-variable importances computed for the comparison below
data_dl = h2o.deeplearning(x=train[myX], y=train[myY],
                           validation_x=test[myX], validation_y=test[myY],
                           hidden=[10, 10], epochs=5,
                           variable_importances=True,
                           balance_classes=True,
                           loss="Automatic")
In [19]:
# Variable importances from each algorithm.
# For GLM, use the magnitude of the normalized coefficients as a proxy
# for predictor importance.
glm_varimp = data_glm.coef_norm()
# Dict comprehension replaces the Python-2-only iteritems() loop and is
# equivalent under both Python 2.7 and 3.
glm_varimp = {k: abs(v) for k, v in glm_varimp.items()}
# Sort in descending order by magnitude
glm_sorted = sorted(glm_varimp.items(), key = operator.itemgetter(1), reverse = True)
table = tabulate(glm_sorted, headers = ["Predictor", "Normalized Coefficient"], tablefmt = "orgtbl")
print("Variable Importances:\n\n" + table)
# Tree-based models report their own importance tables
data_gbm.varimp()
data_rf.varimp()
In [20]:
# Performance metrics of the complex GBM on the held-out test split
data_gbm2.model_performance(test)
Out[20]:
In [ ]: