In [ ]:
import h2o
import time
import numpy as np

In [ ]:
# Connect to a running H2O cluster (or start a local one if none is found)
h2o.init()

In [ ]:
# Import and parse ACS 2013 5-year DP02 demographic data.
# Column layout: 1 categorical ZIP Code Tabulation Area id ("ZCTA5")
# followed by 149 numeric demographic features.
acs_orig = h2o.upload_file(path=h2o.locate("bigdata/laptop/census/ACS_13_5YR_DP02_cleaned.zip"), col_types = (["enum"] + ["numeric"]*149))
acs_orig.describe()
# Set the ZCTA5 id column aside so GLRM runs on numeric features only;
# it is re-attached later to key the merge with the WHD data.
acs_zcta_col = acs_orig["ZCTA5"]
acs_full = acs_orig.drop("ZCTA5")

In [ ]:
# Import and parse WHD 2014-2015 labor violations data.
# Column layout: 7 categorical columns (including the zcta5_cd merge key and
# the flsa_repeat_violator response) followed by 97 numeric columns.
whd_zcta = h2o.upload_file(path=h2o.locate("bigdata/laptop/census/whd_zcta_cleaned.zip"), col_types = (["enum"]*7 + ["numeric"]*97))
whd_zcta.describe()

In [ ]:
# (Currently disabled) Optionally create validation data with 20% missing entries
# acs_miss = h2o.upload_file(path=h2o.locate("bigdata/laptop/census/ACS_13_5YR_DP02_cleaned.zip"), col_types = (["enum"] + ["numeric"]*149))
# acs_miss = acs_miss.drop("ZCTA5")
# acs_miss = acs_miss.insert_missing_values(fraction=0.2)
# acs_miss.describe()

In [ ]:
# Run GLRM to reduce ZCTA demographics to k = 5 archetypes.
# Features are standardized first; X (row embeddings) gets quadratic
# regularization and Y (archetype-to-feature map) gets L1 for sparsity.
acs_model = h2o.glrm(x = acs_full, 
                     k = 5,
                     transform = "STANDARDIZE",
                     init = "PlusPlus",
                     loss = "Quadratic",
                     max_iterations = 100,
                     regularization_x = "Quadratic",
                     regularization_y = "L1",
                     gamma_x = 0.25,
                     gamma_y = 0.5)
print acs_model

In [ ]:
# Embedding of ZCTAs into archetypes (the GLRM X matrix): one row per ZCTA,
# one column per archetype. Fetched by its frame key from the model output.
zcta_arch_x = h2o.get_frame(acs_model._model_json["output"]["loading_key"]["name"])
arch_x_head = zcta_arch_x.head(show = True)

In [ ]:
# Archetypes-to-feature mapping (the GLRM Y matrix): one row per archetype,
# one column per original demographic feature.
arch_feat_y = acs_model._model_json["output"]["archetypes"]
print arch_feat_y

In [ ]:
# Split WHD data into test/train with 20/80 ratio.
# A fixed seed makes the split reproducible; `split` is reused below so the
# original and archetype-augmented models are compared on identical row sets.
split = whd_zcta["flsa_repeat_violator"].runif(seed = 1234)
train = whd_zcta[split <= 0.8]
test = whd_zcta[split > 0.8]

# Build a GBM model to predict repeat violators and score it, timing the run
# for the runtime comparison at the end of the notebook.
s = time.time()
gbm_orig = h2o.gbm(x = train.drop("flsa_repeat_violator"),
                   y = train["flsa_repeat_violator"],
                   validation_x = test.drop("flsa_repeat_violator"),
                   validation_y = test["flsa_repeat_violator"],
                   ntrees = 10, 
                   max_depth = 6, 
                   distribution = "multinomial")
orig_elapsed = time.time() - s

In [ ]:
# Replace zcta5_cd column in WHD data with GLRM archetypes:
# attach the ZCTA id to the X embedding, then left-merge onto the WHD frame
# on the shared zcta5_cd column. (`allRite` is the legacy H2O API's spelling
# of the right-outer-join flag.)
zcta_arch_x["zcta5_cd"] = acs_zcta_col
whd_arch = whd_zcta.merge(zcta_arch_x, allLeft = True, allRite = False)
# The id column has served its purpose as the merge key; drop it so only
# the archetype features remain.
whd_arch = whd_arch.drop("zcta5_cd")
whd_arch.describe()

In [ ]:
# Split archetype-augmented WHD data into test/train with 20/80 ratio.
# Reuses the same `split` vector as the original model so both GBMs see the
# same rows, making the performance comparison apples-to-apples.
train_mod = whd_arch[split <= 0.8]
test_mod = whd_arch[split > 0.8]

# Build a GBM model on the reduced (archetype) features and score it,
# timing the run for the runtime comparison below.
s = time.time()
gbm_mod = h2o.gbm(x = train_mod.drop("flsa_repeat_violator"),
                   y = train_mod["flsa_repeat_violator"],
                   validation_x = test_mod.drop("flsa_repeat_violator"),
                   validation_y = test_mod["flsa_repeat_violator"],
                   ntrees = 10, 
                   max_depth = 6, 
                   distribution = "multinomial")
mod_elapsed = time.time() - s

In [ ]:
# Model performance comparison.
# Each model is scored on its own frames: gbm_orig on the raw WHD split,
# gbm_mod on the archetype-augmented split (train_mod/test_mod), since the
# two models were trained on different column sets.
train_mse_orig = gbm_orig.model_performance(train).mse()
test_mse_orig  = gbm_orig.model_performance(test ).mse()
# Fixed: previously gbm_mod was scored on `train` (which lacks the archetype
# columns) and the "Reduced" test MSE was copy-pasted from gbm_orig on `test`,
# so the reported comparison was meaningless.
train_mse_mod  = gbm_mod.model_performance(train_mod).mse()
test_mse_mod   = gbm_mod.model_performance(test_mod ).mse()

# Print results in pretty HTML table
header = ["Metric"   , "Original"    , "Reduced"    ]
table = [
         ["Runtime"  , orig_elapsed  , mod_elapsed  ],
         ["Train MSE", train_mse_orig, train_mse_mod],
         ["Test MSE" , test_mse_orig , test_mse_mod ],
        ]
h2o.H2ODisplay(table, header)

In [ ]: