In [1]:
#----------------------------------------------------------------------
# Purpose: Condition the Airlines dataset by filtering out rows whose
# departure delay (DepDelay) is NA, i.e. unknown in the input dataset.
#
# Then treat any departure delay longer than minutesOfDelayWeTolerate
# minutes as delayed.
#----------------------------------------------------------------------
In [2]:
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
In [3]:
# Initialise h2o before any data is imported (the next cell calls h2o.import_file).
h2o.init()
In [4]:
# `_locate` is a private h2o function; it is used here to find files within
# the h2o git project directory.
from h2o.utils.shared_utils import _locate

# Resolve the bundled airlines sample first, then load it as the `air` frame.
airlines_path = _locate("smalldata/airlines/allyears2k_headers.zip")
air = h2o.import_file(airlines_path)
In [5]:
# Predictor columns and the name of the response column that is synthesized
# later in the notebook.
x_cols = [
    "Month",
    "DayofMonth",
    "DayOfWeek",
    "CRSDepTime",
    "CRSArrTime",
    "UniqueCarrier",
    "CRSElapsedTime",
    "Origin",
    "Dest",
    "Distance",
]
y_col = "SynthDepDelayed"

# Report the size of the frame as loaded. `numCols` is reused by a later cell
# to address the column appended after the original ones.
numRows, numCols = air.dim
print("Original dataset rows: {0}, columns: {1}".format(numRows, numCols))

# Keep only the rows where DepDelay is not NA, then report the new size.
hasDepDelay = air["DepDelay"].isna() == 0
noDepDelayedNAs = air[hasDepDelay]
rows, cols = noDepDelayedNAs.dim
print("New dataset rows: {0}, columns: {1}".format(rows, cols))
In [6]:
# Departure delays strictly greater than this many minutes count as "delayed".
minutesOfDelayWeTolerate = 15

# Build the binary response from DepDelay and convert it to a factor *before*
# appending it, so the column that ends up as the response is the thresholded
# DepDelay itself and no positional lookup of the source column is needed.
synthDepDelayed = (noDepDelayedNAs["DepDelay"] > minutesOfDelayWeTolerate).asfactor()
noDepDelayedNAs = noDepDelayedNAs.cbind(synthDepDelayed)

# Column indices are 0-based, so the appended column sits at index `numCols`
# (the column count of the frame before the cbind). Give it the response name.
# NOTE: this cell appends a column each time it runs; re-run the filtering
# cell above first if it has to be executed again.
noDepDelayedNAs.set_name(numCols, y_col)
Out[6]:
In [7]:
# Train a GBM with a bernoulli distribution on the synthesized delay label.
# The estimator API is used instead of the legacy module-level `h2o.gbm`
# helper: predictors and response are passed by column name and the data is
# supplied once as `training_frame`.
gbm = H2OGradientBoostingEstimator(distribution="bernoulli")
gbm.train(x=x_cols, y=y_col, training_frame=noDepDelayedNAs)

# Print the model summary and training metrics.
gbm.show()