In [1]:
import h2o
import csv
import time
import numpy as np
import matplotlib.pyplot as plt
from h2o.estimators.glrm import H2OGeneralizedLowRankEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator

In [2]:
h2o.init()


Warning: Version mismatch. H2O is version 3.5.0.99999, but the python package is version UNKNOWN.
H2O cluster uptime: 2 hours 50 minutes 5 seconds 841 milliseconds
H2O cluster version: 3.5.0.99999
H2O cluster name: ludirehak
H2O cluster total nodes: 1
H2O cluster total memory: 4.44 GB
H2O cluster total cores: 8
H2O cluster allowed cores: 8
H2O cluster healthy: True
H2O Connection ip: 127.0.0.1
H2O Connection port: 54321

In [3]:
from h2o.h2o import _locate # private function. used to find files within h2o git project directory.

# Import and parse ACS 2013 5-year DP02 demographic data
acs_orig = h2o.upload_file(path=_locate("bigdata/laptop/census/ACS_13_5YR_DP02_cleaned.zip"), col_types = (["enum"] + ["numeric"]*149))
acs_orig.describe()

acs_zcta_col = acs_orig["ZCTA5"].asfactor()
acs_full = acs_orig.drop("ZCTA5")


Parse Progress: [##################################################] 100%
Uploaded py519e0629-b8ed-463a-af40-955210ca2d9b into cluster with 32,989 rows and 150 cols
Rows: 32,989 Cols: 150

Chunk compression summary:
chunk_type chunk_name count count_percentage size size_percentage
C2 2-Byte Integers 104 69.333336 6.6 MB 60.1219
C2S 2-Byte Fractions 23 15.333334 1.4 MB 13.29941
C4 4-Byte Integers 23 15.333334 2.9 MB 26.578688
Frame distribution summary:
size number_of_rows number_of_chunks_per_column number_of_chunks
192.168.1.76:54321 10.9 MB 32989.0 1.0 150.0
mean 10.9 MB 32989.0 1.0 150.0
min 10.9 MB 32989.0 1.0 150.0
max 10.9 MB 32989.0 1.0 150.0
stddev 0 B 0.0 0.0 0.0
total 10.9 MB 32989.0 1.0 150.0
Column-by-Column Summary:

ZCTA5 total_households family_households families_w_children_under_18 married_families married_w_children_under_18 male_no_wife male_no_wife_w_children_under_18 female_no_husband female_no_husband_w_children_under_18 nonfamily_households living_alone living_alone_65_and_over households_w_people_under_18 households_w_people_over_65 average_household_size average_family_size pop_in_households pop_householder pop_spouse pop_child pop_other_relatives pop_nonrelatives pop_unmarried_partner males_15_and_over males_15_and_over_unmarried males_15_and_over_married males_15_and_over_separated males_15_and_over_widowed males_15_and_over_divorced females_15_and_over females_15_and_over_unmarried females_15_and_over_married females_15_and_over_separated females_15_and_over_widowed females_15_and_over_divorced women_birth_past_year unmarried_women_birth_past_year unmarried_women_birth_past_year_per_1000 women_birth_past_year_per_1000 women_15_to_19_birth_past_year_per_1000 women_20_to_34_birth_past_year_per_1000 women_35_to_50_birth_past_year_per_1000 grdparents_live_w_grdchildren grdparents_live_w_resp_grdchildren resp_grdchildren_less_1yr resp_grdchildren_1_or_2yrs resp_grdchildren_3_or_4yrs resp_grdchildren_5yrs_or_more grdparents_resp_grdchildren female_grdparents_resp_grdchildren married_grdparents_resp_grdchildren pop_3_and_over_in_school nursery_and_preschool kindergarten elementary_school high_school college_or_graduate_school pop_25_years_and_over less_than_9th_grade high_school_no_diploma high_school_graduate college_no_degree associate_degree bachelors_degree graduate_or_prof_degree pct_high_school_grad_or_higher pct_bachelors_degree_or_higher civilian_18_and_over civilian_veterans pop_civilian_noninst pop_civilian_w_disability age_under_18 age_under_18_w_disability age_18_to_64 age_18_to_64_w_disability age_65_and_over age_65_and_over_w_disability pop_1_and_over res_same_house res_diff_house_in_us res_same_county res_diff_county res_diff_county_same_state res_diff_county_and_state res_abroad total_pop_place_of_birth pop_native native_born_in_us born_state_of_residence born_different_state born_islands_american_abroad total_pop_foreign_born pop_foreign_born foreign_born_naturalized_us_citizen foreign_born_not_us_citizen pop_born_outside_us native native_entered_2010_or_later native_entered_before_2010 foreign_born foreign_born_entered_2010_or_later foreign_born_entered_before_2010 foreign_born_excl_born_at_sea foreign_born_europe foreign_born_asia foreign_born_africa foreign_born_oceania foreign_born_latin_america foreign_born_northern_america pop_5_and_over english_language_only non_english_language non_english_poor_english spanish_language spanish_poor_english other_indo_euro_language other_indo_euro_poor_english asian_language asian_poor_english other_language english_less_than_v_well total_pop_ancestory pop_american pop_arab pop_czech pop_danish pop_dutch pop_english pop_french pop_french_canadian pop_german pop_greek pop_hungarian pop_irish pop_italian pop_lithuanian pop_norwegian pop_polish pop_portuguese pop_russian pop_scotch_irish pop_scottish pop_slovak pop_subsaharan_africa pop_swedish pop_swiss pop_ukrainian pop_welsh pop_west_indian
type int int int int int int int int int int int int int int int real real int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int real real int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int int
mins 1001.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
maxs 99929.0 43421.0 28649.0 15305.0 19802.0 11351.0 4083.0 2157.0 12898.0 7366.0 26569.0 19593.0 7810.0 17240.0 21794.0 14.53 14.53 114353.0 43421.0 19783.0 45713.0 21866.0 15961.0 4124.0 46255.0 23453.0 22038.0 1711.0 1374.0 4419.0 47542.0 23297.0 21644.0 3303.0 5493.0 6335.0 3677.0 2527.0 1000.0 1000.0 1000.0 1000.0 1000.0 7001.0 2187.0 604.0 510.0 556.0 796.0 2187.0 1368.0 1548.0 39977.0 3236.0 3221.0 16848.0 9254.0 36548.0 73169.0 24006.0 13228.0 24509.0 16996.0 6323.0 24256.0 28318.0 100.0 100.0 83817.0 12850.0 113821.0 14673.0 40691.0 2422.0 72738.0 10304.0 34581.0 9605.0 113555.0 100211.0 27326.0 16632.0 13287.0 11849.0 10227.0 3928.0 114734.0 88228.0 86214.0 70376.0 45677.0 12507.0 69424.0 69424.0 32541.0 47531.0 70775.0 12507.0 1174.0 11333.0 69424.0 3737.0 66787.0 69424.0 28823.0 48809.0 7626.0 2614.0 55018.0 2168.0 105696.0 66897.0 84946.0 54104.0 82723.0 44842.0 40436.0 26483.0 46350.0 35889.0 29294.0 12035.0 114734.0 16031.0 32701.0 3110.0 3757.0 15498.0 16337.0 12731.0 9529.0 27423.0 5776.0 10122.0 19562.0 30502.0 1790.0 14974.0 19492.0 14824.0 13310.0 3258.0 2956.0 3733.0 12233.0 8120.0 3090.0 5320.0 2373.0 46449.0
mean 49860.5 3504.5 2326.3 1035.7 1706.8 699.8 164.8 79.5 454.8 256.3 1178.1 963.3 342.4 1154.3 892.3 2.5 3.0 9200.2 3504.5 1705.8 2811.0 650.4 528.5 206.8 3698.0 1310.7 1872.2 68.0 94.2 352.8 3892.1 1133.3 1830.8 97.9 359.9 470.1 123.4 43.9 35.0 52.8 21.4 104.2 18.7 212.9 82.9 17.8 19.7 13.8 31.6 82.9 51.9 58.4 2504.5 151.9 127.6 994.2 520.0 711.0 6262.1 372.0 503.6 1760.6 1330.6 489.1 1130.2 675.9 84.7 21.9 7171.1 644.5 9289.3 1126.7 2235.1 90.1 5825.1 588.2 1229.0 448.4 9329.8 7920.2 1354.6 848.8 505.8 294.7 211.1 55.0 9443.2 8220.3 8088.3 5547.3 2541.0 132.1 1222.8 1222.8 551.9 670.9 1354.9 132.1 5.7 126.4 1222.8 57.0 1165.9 1222.8 146.1 352.4 50.4 6.8 642.2 24.8 8835.3 7005.7 1829.6 762.3 1135.4 495.4 325.5 103.8 289.1 138.5 79.6 24.5 9443.2 656.5 51.9 45.9 41.3 137.4 782.3 263.1 62.5 1446.3 39.4 44.5 1048.4 528.5 20.4 136.1 290.9 42.6 89.8 97.4 166.3 22.9 88.9 123.3 28.8 29.1 55.2 81.8
sigma 27446.6 5020.6 3324.7 1601.3 2416.2 1107.4 276.1 135.7 813.5 469.4 1907.0 1523.3 519.6 1782.8 1276.1 0.6 0.7 13600.8 5020.6 2414.6 4454.7 1351.9 1013.9 337.4 5342.9 2177.8 2674.3 126.3 138.4 503.0 5713.0 1992.9 2619.2 193.4 529.8 716.3 211.9 93.7 67.9 66.0 71.4 124.2 42.5 411.8 149.9 38.4 41.7 31.6 63.0 149.9 97.2 107.1 3981.1 248.4 212.2 1572.4 825.2 1501.2 9033.6 965.9 877.9 2464.0 1975.2 731.6 2014.4 1368.6 14.1 16.2 10427.7 937.2 13696.0 1595.5 3534.4 151.9 8737.6 868.1 1765.9 641.1 13678.4 11564.1 2295.6 1552.5 892.7 563.1 441.5 152.7 13858.0 11405.6 11183.4 7904.8 4262.7 399.1 3573.6 3573.6 1684.7 2055.8 3813.8 399.1 26.7 380.6 3573.6 182.6 3430.9 3573.6 494.9 1370.1 222.8 45.9 2447.2 66.1 12903.5 9609.9 5300.9 2554.9 4101.5 1992.2 1040.9 476.2 1280.6 733.9 342.1 135.0 13858.0 1101.1 259.7 120.7 124.5 317.9 1218.3 521.6 193.2 2326.2 112.6 144.4 1620.3 1212.6 60.7 446.7 702.0 281.2 291.0 181.8 278.3 104.7 350.1 280.1 73.4 103.8 108.2 755.5
zero_count 0 593 750 1686 865 2130 4977 8106 3238 5267 1191 1326 2487 1519 1201 708 750 593 593 865 1250 3474 3372 4563 390 1272 690 9847 5422 2454 423 1790 746 9038 2505 2684 6761 13376 13388 6762 23829 7942 16228 6304 9630 18938 18654 20710 15312 9630 10322 11880 1162 6446 6563 2191 2764 3151 378 3578 2199 801 1034 2300 1926 3406 407 1479 353 1403 426 987 1382 7746 523 1660 1152 1997 335 363 2471 4125 3914 5570 7893 17415 335 338 339 447 816 10813 6524 6524 8941 9728 5353 10813 27722 10904 6524 20522 6684 6524 12195 14122 22774 27741 12560 19146 335 360 4207 8809 7053 12474 8888 16544 15905 19438 19300 24678 335 2730 20303 14693 15462 6056 1878 4808 14166 1377 18470 16858 1618 5907 20996 10745 7657 21142 14324 7645 5766 21935 19844 8994 16092 20448 11410 22013
missing_count 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 441 441 441 441 441 441 441 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [4]:
# Import and parse WHD 2014-2015 labor violations data
whd_zcta = h2o.upload_file(path=_locate("bigdata/laptop/census/whd_zcta_cleaned.zip"), col_types = (["enum"]*7 + ["numeric"]*97))
whd_zcta["zcta5_cd"] = whd_zcta["zcta5_cd"].asfactor()
whd_zcta.describe()


Parse Progress: [##################################################] 100%
Uploaded py33d01288-0d61-489a-ade1-9781c19378c2 into cluster with 208,806 rows and 104 cols
Rows: 208,806 Cols: 104

Chunk compression summary:
chunk_type chunk_name count count_percentage size size_percentage
C0L Constant Integers 40 19.23077 3.1 KB 0.0
CX0 Sparse Bits 7 3.3653848 674 B 0.0
CXI Sparse Integers 69 33.173077 213.7 KB 1.4656583
C1 1-Byte Integers 4 1.923077 408.1 KB 2.7992976
C1N 1-Byte Integers (w/o NAs) 3 1.4423077 308.8 KB 2.1181762
C2 2-Byte Integers 13 6.25 2.6 MB 17.927673
C4 4-Byte Integers 14 6.7307696 5.6 MB 39.470284
C4S 4-Byte Fractions 12 5.769231 4.8 MB 33.576458
CXD Sparse Reals 46 22.115383 381.4 KB 2.6165023
Frame distribution summary:
size number_of_rows number_of_chunks_per_column number_of_chunks
192.168.1.76:54321 14.2 MB 208806.0 2.0 208.0
mean 14.2 MB 208806.0 2.0 208.0
min 14.2 MB 208806.0 2.0 208.0
max 14.2 MB 208806.0 2.0 208.0
stddev 0 B 0.0 0.0 0.0
total 14.2 MB 208806.0 2.0 208.0
Column-by-Column Summary:

case_id trade_nm cty_nm st_cd zcta5_cd naic_cd flsa_repeat_violator case_violtn_cnt cmp_assd_cnt ee_violtd_cnt bw_atp_amt ee_atp_cnt flsa_violtn_cnt flsa_bw_atp_amt flsa_ee_atp_cnt flsa_mw_bw_atp_amt flsa_ot_bw_atp_amt flsa_15a3_bw_atp_amt flsa_cmp_assd_amt sca_violtn_cnt sca_bw_atp_amt sca_ee_atp_cnt mspa_violtn_cnt mspa_bw_atp_amt mspa_ee_atp_cnt mspa_cmp_assd_amt h1b_violtn_cnt h1b_bw_atp_amt h1b_ee_atp_cnt h1b_cmp_assd_amt fmla_violtn_cnt fmla_bw_atp_amt fmla_ee_atp_cnt fmla_cmp_assd_amt flsa_cl_violtn_cnt flsa_cl_minor_cnt flsa_cl_cmp_assd_amt dbra_cl_violtn_cnt dbra_bw_atp_amt dbra_ee_atp_cnt h2a_violtn_cnt h2a_bw_atp_amt h2a_ee_atp_cnt h2a_cmp_assd_amt flsa_smw14_violtn_cnt flsa_smw14_bw_amt flsa_smw14_ee_atp_cnt cwhssa_violtn_cnt cwhssa_bw_amt cwhssa_ee_cnt osha_violtn_cnt osha_bw_atp_amt osha_ee_atp_cnt osha_cmp_assd_amt eppa_violtn_cnt eppa_bw_atp_amt eppa_ee_cnt eppa_cmp_assd_amt h1a_violtn_cnt h1a_bw_atp_amt h1a_ee_atp_cnt h1a_cmp_assd_amt crew_violtn_cnt crew_bw_atp_amt crew_ee_atp_cnt crew_cmp_assd_amt ccpa_violtn_cnt ccpa_bw_atp_amt ccpa_ee_atp_cnt flsa_smwpw_violtn_cnt flsa_smwpw_bw_atp_amt flsa_smwpw_ee_atp_cnt flsa_hmwkr_violtn_cnt flsa_hmwkr_bw_atp_amt flsa_hmwkr_ee_atp_cnt flsa_hmwkr_cmp_assd_amt ca_violtn_cnt ca_bw_atp_amt ca_ee_atp_cnt pca_violtn_cnt pca_bw_atp_amt pca_ee_atp_cnt flsa_smwap_violtn_cnt flsa_smwap_bw_atp_amt flsa_smwap_ee_atp_cnt flsa_smwft_violtn_cnt flsa_smwft_bw_atp_amt flsa_smwft_ee_atp_cnt flsa_smwl_violtn_cnt flsa_smwl_bw_atp_amt flsa_smwl_ee_atp_cnt flsa_smwmg_violtn_cnt flsa_smwmg_bw_atp_amt flsa_smwmg_ee_atp_cnt flsa_smwsl_violtn_cnt flsa_smwsl_bw_atp_amt flsa_smwsl_ee_atp_cnt eev_violtn_cnt h2b_violtn_cnt h2b_bw_atp_amt h2b_ee_atp_cnt sraw_violtn_cnt sraw_bw_atp_amt sraw_ee_atp_cnt
type int enum enum enum enum int enum int real int real int int real int real real real real int real int int real int real int real int real int real int int int int real int real int int real int real int real int int real int int int int real int real int int int real int int int int int int int real int int real int int real int int int real int int real int int int int int real int int int int int int int int real int int int real int int int int
mins 1918.0 0.0 0.0 0.0 0.0 9.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
maxs 1081028538.0 178147.0 11043.0 55.0 19358.0 928120.0 3.0 530002.0 3129900.0 76664.0 30857204.7 76664.0 76664.0 30857204.7 76664.0 1426727.4 30857204.7 103847.31 708950.0 13667.0 18598525.9 7170.0 905.0 123330.54 903.0 97225.0 1742.0 4222146.3 1046.0 1740000.0 530002.0 118250.0 105.0 100.0 766.0 764.0 287980.0 752.0 5745385.3 747.0 2661.0 2338699.6 1365.0 3129900.0 544.0 299592.9 544.0 6171.0 3213935.4 2153.0 32.0 0.0 0.0 37800.0 166.0 20750.78 2.0 274000.0 119.0 1041759.0 119.0 40000.0 2.0 0.0 0.0 165000.0 27.0 25000.0 27.0 220.0 188895.49 191.0 43.0 15945.66 33.0 500.0 45.0 25973.23 45.0 224.0 360000.0 224.0 0.0 0.0 0.0 7.0 1254.91 7.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 382.05 2.0 1.0 194.0 93507.0 159.0 0.0 0.0 0.0
mean 1554203.5 NaN NaN NaN NaN 386060.0 NaN 26.7 539.6 13.8 10330.6 12.5 13.3 7610.0 10.9 1092.4 6512.0 2.5 209.4 1.2 1262.5 0.7 0.3 21.3 0.1 45.4 0.1 347.5 0.0 35.9 10.1 48.9 0.0 0.0 0.2 0.1 154.2 0.5 781.1 0.3 0.3 63.1 0.1 76.6 0.3 44.2 0.3 0.3 135.7 0.3 0.0 0.0 0.0 3.1 0.0 0.7 0.0 6.3 0.0 5.0 0.0 0.2 0.0 0.0 0.0 1.4 0.0 0.5 0.0 0.0 2.2 0.0 0.0 0.2 0.0 0.0 0.0 0.3 0.0 0.0 3.1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 4.1 0.0 0.0 0.0 0.0
sigma 2365752.3 NaN NaN NaN NaN 269790.4 NaN 1404.6 10479.0 198.0 105051.0 197.4 199.2 84369.4 196.0 12170.2 83106.5 292.7 4025.9 37.1 51171.7 21.2 5.9 804.5 4.9 695.1 4.9 14265.2 3.0 4924.4 1389.4 981.7 0.3 0.4 2.6 2.4 1912.5 5.8 19590.9 3.9 13.5 5981.3 5.8 8009.1 6.1 1455.2 5.8 15.8 9880.6 9.3 0.3 0.0 0.0 175.0 0.6 94.5 0.0 781.2 0.3 2279.8 0.3 87.5 0.0 0.0 0.0 420.5 0.1 63.2 0.1 0.8 476.4 0.8 0.2 48.2 0.1 1.1 0.1 65.1 0.1 0.6 895.5 0.5 0.0 0.0 0.0 0.0 2.8 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.8 422.3 0.6 0.0 0.0 0.0
zero_count 0 1 1 401 18 0 197677 52029 191181 66849 91683 92486 79949 107455 107941 176733 116858 208707 202664 203548 203885 204044 200862 207890 207896 204455 207098 207374 207423 208521 200531 207013 207050 208802 199621 200240 202775 202274 202881 202915 207246 208016 208026 207926 207180 207252 207268 204894 205041 205075 207983 208806 208806 208472 208638 208783 208784 208738 208805 208805 208805 208805 208803 208806 208806 208803 208663 208697 208698 208750 208761 208761 208757 208795 208796 208805 208791 208793 208793 208790 208794 208794 208806 208806 208806 208804 208804 208804 208806 208806 208806 208806 208806 208806 208798 208798 208798 208790 208673 208747 208747 208806 208806 208806
missing_count 0 0 4 3 1353 180 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [5]:
# Run GLRM to reduce ZCTA demographics to 10 archetypes
acs_model = H2OGeneralizedLowRankEstimator(k = 10,
                                           transform = "STANDARDIZE",
                                           loss = "Quadratic",
                                           regularization_x = "Quadratic",
                                           regularization_y = "L1",
                                           gamma_x = 0.25,
                                           gamma_y = 0.5,
                                           max_iterations = 100)
acs_model.train(x = acs_full.names, training_frame= acs_full)
print acs_model


glrm Model Build Progress: [##################################################] 100%
Model Details
=============
H2OGeneralizedLowRankEstimator :  Beta - Generalized Low Rank Model
Model Key:  GLRM_model_python_1445611374604_10

Model Summary:
number_of_iterations final_step_size final_objective_value
100.0 0.0 1252607.1

ModelMetricsGLRM: glrm
** Reported on train data. **

MSE: NaN
Sum of Squared Error (Numeric): 1246609.52854
Misclassification Error (Categorical): 0.0

Scoring History:
timestamp duration iteration step_size objective
2015-10-23 10:33:24 17.873 sec 0.0 0.7 2402051.2
2015-10-23 10:33:26 19.923 sec 1.0 0.4 2402051.2
2015-10-23 10:33:28 21.946 sec 2.0 0.2 2402051.2
2015-10-23 10:33:30 24.066 sec 3.0 0.1 2402051.2
2015-10-23 10:33:32 26.152 sec 4.0 0.0 2402051.2
--- --- --- --- --- ---
2015-10-23 10:36:35 3 min 28.583 sec 95.0 0.0 1255318.5
2015-10-23 10:36:37 3 min 30.553 sec 96.0 0.0 1254232.9
2015-10-23 10:36:38 3 min 32.527 sec 97.0 0.0 1253254.0
2015-10-23 10:36:40 3 min 34.504 sec 98.0 0.0 1252607.1
2015-10-23 10:36:42 3 min 36.466 sec 99.0 0.0 1252607.1


In [6]:
# Plot objective function value each iteration
acs_model_score = acs_model.score_history()
plt.xlabel("Iteration")
plt.ylabel("Objective")
plt.title("Objective Function Value per Iteration")
plt.plot(acs_model_score["iteration"], acs_model_score["objective"])
plt.show()

In [7]:
# Embedding of ZCTAs into archetypes (X)
zcta_arch_x = h2o.get_frame(acs_model._model_json["output"]["representation_name"])
zcta_arch_x.head()


H2OFrame with 32989 rows and 10 columns: 
Arch1 0.1 0.1 0.3 0.4 0.2 0.9 0.9 0.7 0.9 0.9
Arch2 -0.0 -0.0 -0.0 -0.0 -0.0 -0.0 -0.0 -0.0 -0.0 -0.0
Arch3 0.1 -0.0 -0.0 0.0 0.0 0.0 0.0 -0.0 0.0 -0.0
Arch4 0.0 0.2 0.0 -0.0 -0.0 -0.0 0.0 0.0 0.0 -0.0
Arch5 0.3 0.2 0.1 0.1 0.3 0.0 0.0 0.1 0.0 0.0
Arch6 0.0 0.2 -0.0 0.0 0.1 0.0 0.0 0.0 0.0 0.0
Arch7 0.2 0.1 0.3 0.4 0.3 0.1 0.0 0.2 0.0 0.1
Arch8 0.2 0.1 0.0 0.0 0.1 0.0 0.0 0.0 0.0 0.0
Arch9 -0.1 -0.2 -0.1 -0.0 -0.0 -0.0 -0.0 -0.0 -0.0 -0.0
Arch10 0.1 0.3 0.1 -0.0 0.0 0.0 0.0 -0.0 0.0 0.0

In [8]:
# BUG: The or command isn't working, idx only contains indicator of "10065"
# Plot a few ZCTAs on the first two archetypes
idx = ((acs_zcta_col == "10065") |   # Manhattan, NY (Upper East Side)
       (acs_zcta_col == "11219") |   # Manhattan, NY (East Harlem)
       (acs_zcta_col == "66753") |   # McCune, KS
       (acs_zcta_col == "84104") |   # Salt Lake City, UT
       (acs_zcta_col == "94086") |   # Sunnyvale, CA
       (acs_zcta_col == "95014"))    # Cupertino, CA

city_arch = np.array(h2o.as_list(zcta_arch_x[idx,[0,1]]))
plt.xlabel("First Archetype")
plt.ylabel("Second Archetype")
plt.title("Archetype Representation of Zip Code Tabulation Areas")
plt.plot(city_arch[:,0], city_arch[:,1], "o")

# Label city names corresponding to ZCTAs
city_names = ["Upper East Side", "East Harlem", "McCune", "Salt Lake City", "Sunnyvale", "Cupertino"]
for i, txt in enumerate(city_names):
   plt.annotate(txt, (city_arch[i,0], city_arch[i,1]))
plt.show()

In [9]:
# Archetypes to full feature mapping (Y)
arch_feat_y = acs_model._model_json["output"]["archetypes"]
print arch_feat_y


Archetypes:
total_households family_households families_w_children_under_18 married_families married_w_children_under_18 male_no_wife male_no_wife_w_children_under_18 female_no_husband female_no_husband_w_children_under_18 nonfamily_households living_alone living_alone_65_and_over households_w_people_under_18 households_w_people_over_65 average_household_size average_family_size pop_in_households pop_householder pop_spouse pop_child pop_other_relatives pop_nonrelatives pop_unmarried_partner males_15_and_over males_15_and_over_unmarried males_15_and_over_married males_15_and_over_separated males_15_and_over_widowed males_15_and_over_divorced females_15_and_over females_15_and_over_unmarried females_15_and_over_married females_15_and_over_separated females_15_and_over_widowed females_15_and_over_divorced women_birth_past_year unmarried_women_birth_past_year unmarried_women_birth_past_year_per_1000 women_birth_past_year_per_1000 women_15_to_19_birth_past_year_per_1000 women_20_to_34_birth_past_year_per_1000 women_35_to_50_birth_past_year_per_1000 grdparents_live_w_grdchildren grdparents_live_w_resp_grdchildren resp_grdchildren_less_1yr resp_grdchildren_1_or_2yrs resp_grdchildren_3_or_4yrs resp_grdchildren_5yrs_or_more grdparents_resp_grdchildren female_grdparents_resp_grdchildren married_grdparents_resp_grdchildren pop_3_and_over_in_school nursery_and_preschool kindergarten elementary_school high_school college_or_graduate_school pop_25_years_and_over less_than_9th_grade high_school_no_diploma high_school_graduate college_no_degree associate_degree bachelors_degree graduate_or_prof_degree pct_high_school_grad_or_higher pct_bachelors_degree_or_higher civilian_18_and_over civilian_veterans pop_civilian_noninst pop_civilian_w_disability age_under_18 age_under_18_w_disability age_18_to_64 age_18_to_64_w_disability age_65_and_over age_65_and_over_w_disability pop_1_and_over res_same_house res_diff_house_in_us res_same_county res_diff_county res_diff_county_same_state res_diff_county_and_state res_abroad total_pop_place_of_birth pop_native native_born_in_us born_state_of_residence born_different_state born_islands_american_abroad total_pop_foreign_born pop_foreign_born foreign_born_naturalized_us_citizen foreign_born_not_us_citizen pop_born_outside_us native native_entered_2010_or_later native_entered_before_2010 foreign_born foreign_born_entered_2010_or_later foreign_born_entered_before_2010 foreign_born_excl_born_at_sea foreign_born_europe foreign_born_asia foreign_born_africa foreign_born_oceania foreign_born_latin_america foreign_born_northern_america pop_5_and_over english_language_only non_english_language non_english_poor_english spanish_language spanish_poor_english other_indo_euro_language other_indo_euro_poor_english asian_language asian_poor_english other_language english_less_than_v_well total_pop_ancestory pop_american pop_arab pop_czech pop_danish pop_dutch pop_english pop_french pop_french_canadian pop_german pop_greek pop_hungarian pop_irish pop_italian pop_lithuanian pop_norwegian pop_polish pop_portuguese pop_russian pop_scotch_irish pop_scottish pop_slovak pop_subsaharan_africa pop_swedish pop_swiss pop_ukrainian pop_welsh pop_west_indian
Arch1 -0.8 -0.8 -0.7 -0.8 -0.7 -0.7 -0.7 -0.6 -0.6 -0.7 -0.7 -0.7 -0.7 -0.8 -0.2 -0.2 -0.8 -0.8 -0.8 -0.7 -0.5 -0.6 -0.7 -0.8 -0.7 -0.8 -0.6 -0.8 -0.8 -0.8 -0.6 -0.8 -0.6 -0.8 -0.8 -0.7 -0.6 -0.2 -0.3 -0.2 -0.2 -0.2 -0.6 -0.6 -0.5 -0.5 -0.5 -0.6 -0.6 -0.6 -0.6 -0.7 -0.7 -0.7 -0.7 -0.7 -0.5 -0.8 -0.4 -0.6 -0.8 -0.8 -0.8 -0.7 -0.6 0.0 -0.4 -0.8 -0.8 -0.8 -0.8 -0.7 -0.7 -0.7 -0.8 -0.8 -0.8 -0.8 -0.8 -0.7 -0.6 -0.6 -0.6 -0.5 -0.4 -0.8 -0.8 -0.8 -0.8 -0.7 -0.4 -0.4 -0.4 -0.3 -0.3 -0.4 -0.4 -0.2 -0.4 -0.4 -0.3 -0.4 -0.4 -0.4 -0.2 -0.2 -0.2 -0.3 -0.4 -0.8 -0.8 -0.4 -0.3 -0.3 -0.3 -0.4 -0.3 -0.3 -0.2 0.1 0.1 -0.8 -0.7 0.2 -0.4 -0.3 -0.5 -0.7 -0.6 -0.4 -0.7 -0.4 -0.4 -0.8 -0.5 -0.4 -0.3 -0.5 -0.2 -0.4 -0.6 -0.7 -0.3 -0.3 -0.5 -0.4 -0.3 -0.6 -0.1
Arch2 1.9 2.1 3.0 2.4 3.9 2.3 1.7 0.9 0.7 1.3 1.5 2.1 2.9 1.8 2.0 2.4 2.8 1.9 2.4 4.2 3.3 0.4 0.3 2.4 2.1 2.7 1.4 2.5 0.9 2.3 1.9 2.7 0.8 2.1 1.2 4.1 0.7 -0.2 0.5 -0.3 0.3 0.9 2.7 2.7 2.6 4.5 1.2 1.3 2.7 2.5 3.1 3.9 2.9 3.4 4.4 3.3 2.9 2.1 4.7 3.0 1.7 2.0 0.9 1.1 0.8 -1.2 -0.1 2.3 0.4 2.8 3.3 4.2 4.4 2.4 3.6 1.6 2.3 2.8 2.8 2.3 3.3 0.4 -0.0 0.8 3.2 2.8 1.7 1.6 2.1 0.2 5.7 5.3 5.3 7.5 3.1 5.6 5.7 9.7 5.3 5.3 4.9 5.3 5.3 1.7 12.5 4.5 0.1 -0.1 2.2 2.7 0.4 5.7 4.9 -0.1 -0.1 1.5 1.3 0.1 -0.1 85.5 89.0 2.8 0.3 125.4 -0.1 -0.1 -0.1 -0.0 -0.2 0.3 -0.0 0.6 1.4 0.1 0.4 1.2 -0.2 2.0 -0.2 -0.1 0.0 0.2 -0.0 1.3 0.1 -0.0 0.9 0.1 0.0
Arch3 1.7 1.6 1.6 0.9 0.7 2.2 2.1 3.1 3.1 1.7 1.9 1.6 1.8 1.5 0.7 1.0 1.7 1.7 0.9 1.8 2.3 1.4 1.7 1.6 1.8 1.0 2.8 2.1 2.5 1.7 2.0 1.0 2.7 2.1 2.3 2.3 3.4 1.0 0.7 0.9 0.4 0.1 2.6 3.9 3.1 3.3 3.1 3.6 3.9 4.0 3.4 1.7 1.4 2.0 1.8 1.7 1.1 1.6 1.3 2.9 2.4 2.1 1.3 0.1 -0.3 -1.3 -1.6 1.7 1.9 1.7 2.8 1.9 2.7 1.6 3.2 1.4 2.0 1.7 1.6 2.1 2.3 1.4 1.5 1.0 0.4 1.7 2.0 2.0 2.2 1.2 -0.4 0.4 0.4 0.2 0.5 0.3 -0.4 -0.5 -0.4 0.4 0.4 0.4 0.4 -0.4 0.5 1.3 0.4 0.3 -0.6 1.7 2.0 0.4 0.4 0.3 0.2 0.1 0.2 0.5 0.6 0.9 0.8 1.7 2.1 0.1 -0.5 -0.7 -0.0 0.2 0.2 -0.3 -0.2 -0.4 -0.5 0.1 -0.6 -0.9 -0.7 -0.8 -0.4 -0.6 1.0 0.1 -0.5 2.2 -0.9 -0.7 -0.4 -0.3 0.5
Arch4 2.6 2.4 2.6 1.4 1.3 3.3 3.2 4.6 4.9 2.6 2.7 2.1 2.7 1.9 0.4 0.7 2.7 2.6 1.4 2.9 3.2 2.9 3.4 2.5 3.4 1.6 4.0 2.0 2.5 2.7 3.7 1.5 4.7 2.3 2.7 3.2 4.3 0.2 -0.0 0.2 -0.1 0.0 2.8 2.8 2.2 2.4 2.1 2.7 2.8 3.1 2.2 2.8 2.6 2.8 2.8 2.7 2.1 2.5 2.7 3.8 3.1 2.1 2.2 1.1 0.7 -0.6 -0.6 2.6 1.8 2.7 3.5 2.8 4.5 2.7 4.0 1.8 2.3 2.7 2.6 2.7 2.8 2.1 1.5 2.4 3.8 2.7 2.6 2.1 1.9 2.1 13.8 2.2 2.2 2.0 2.3 3.5 13.8 11.5 13.6 2.2 3.0 2.2 2.2 1.3 0.1 2.3 0.1 2.7 0.8 2.7 1.8 3.3 2.9 3.7 3.3 1.8 1.7 -0.0 0.0 1.4 1.4 2.7 1.2 0.8 -0.0 -0.4 0.1 0.2 0.5 0.5 0.3 0.6 1.5 0.8 1.2 0.8 -0.4 1.1 -0.2 0.5 0.2 0.2 0.5 2.4 -0.3 0.1 0.8 0.4 2.8
Arch5 1.5 1.4 1.2 1.7 1.5 0.6 0.7 0.3 0.3 1.5 1.5 1.8 1.0 1.7 -0.4 -0.3 1.2 1.5 1.7 0.9 -0.0 0.8 1.1 1.3 0.8 1.6 0.2 1.3 1.1 1.3 0.7 1.6 0.1 1.3 1.2 0.6 -0.1 -0.5 -0.2 -0.3 -0.3 0.3 -0.0 -0.3 -0.2 -0.3 -0.4 -0.3 -0.3 -0.4 -0.2 1.0 1.3 0.8 1.0 1.0 0.9 1.4 -0.1 0.0 0.8 1.2 1.4 2.0 2.1 1.2 2.3 1.3 1.5 1.2 0.8 0.9 0.5 1.2 0.5 1.7 1.3 1.2 1.3 0.9 0.7 1.1 0.9 1.1 0.5 1.2 1.4 1.4 1.2 1.6 0.4 0.2 0.2 0.2 0.1 0.2 0.4 0.2 0.4 0.2 0.2 0.2 0.2 0.6 -0.1 -0.2 -0.0 0.1 1.6 1.2 1.6 0.1 -0.0 0.2 0.2 0.2 -0.2 -0.3 -0.4 0.3 0.2 1.2 0.9 0.9 1.8 1.5 1.5 2.2 1.5 1.0 2.5 1.4 1.4 2.3 1.9 1.8 1.6 2.0 -0.7 1.2 1.7 2.2 1.2 -0.4 2.1 1.8 0.9 2.1 -0.2
Arch6 3.2 3.2 3.1 3.6 3.4 1.9 2.3 1.6 1.8 2.8 2.7 2.6 2.9 2.7 0.0 0.0 2.9 3.2 3.6 2.6 0.8 2.5 2.8 3.0 2.3 3.4 1.2 2.3 3.0 3.0 2.2 3.4 1.0 2.4 3.1 2.4 1.1 -0.9 -0.9 -0.7 -0.6 -0.4 1.2 1.4 1.5 1.2 1.1 1.0 1.4 1.2 1.5 3.0 2.9 2.6 2.8 2.7 2.7 2.9 -0.2 1.0 2.5 3.5 3.4 3.3 2.8 1.6 1.0 3.0 3.8 2.9 2.4 2.8 2.3 2.9 2.3 2.8 2.4 3.0 2.9 3.1 2.7 3.4 3.0 3.1 1.1 2.9 3.6 3.6 3.0 3.9 0.5 0.0 0.0 0.0 0.0 0.1 0.5 -0.0 0.6 0.0 0.2 0.0 0.0 0.6 0.2 0.3 0.6 -0.3 2.6 3.0 4.0 0.1 -0.4 -0.0 -0.3 0.2 -0.5 0.0 -0.3 0.7 0.5 2.9 2.9 1.3 2.7 3.3 3.0 4.5 2.9 1.6 4.5 1.9 1.9 4.0 2.3 1.9 2.7 2.5 -0.5 1.2 3.8 4.5 1.6 0.5 3.6 2.9 1.4 4.2 -0.4
Arch7 -0.0 0.0 -0.0 0.1 0.0 -0.0 -0.0 -0.1 -0.1 -0.2 -0.1 -0.0 -0.0 0.1 0.3 0.3 -0.0 -0.0 0.1 0.0 0.0 -0.2 -0.1 0.0 -0.1 0.1 0.1 0.1 0.1 -0.0 -0.1 0.1 0.0 0.1 -0.1 -0.0 0.0 0.7 0.8 0.6 0.7 0.5 0.1 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.3 -0.1 -0.1 0.0 0.0 0.0 -0.2 0.0 0.1 0.2 0.2 -0.0 0.0 -0.2 -0.2 -0.4 0.2 -0.0 0.1 -0.0 0.2 0.0 0.1 -0.0 0.2 0.1 0.2 -0.0 0.0 -0.2 -0.2 -0.0 0.0 -0.1 -0.2 -0.0 0.0 0.0 0.1 -0.1 -0.0 -0.1 -0.1 -0.1 -0.1 -0.1 -0.0 -0.0 -0.0 -0.1 -0.2 -0.1 -0.1 -0.1 -0.0 -0.2 -0.1 -0.1 -0.2 -0.0 0.0 -0.1 -0.1 -0.1 -0.1 -0.1 -0.1 -0.0 -0.0 0.1 0.2 -0.0 0.4 0.2 -0.2 -0.2 0.1 0.0 -0.0 -0.0 -0.0 -0.1 -0.1 0.1 -0.0 -0.1 -0.2 -0.1 -0.3 -0.1 0.0 -0.1 -0.0 -0.2 -0.2 -0.0 -0.0 -0.1 -0.0
Arch8 1.3 1.2 1.0 1.0 0.7 1.3 1.2 1.3 1.2 1.3 1.3 1.8 1.0 1.6 -0.2 -0.0 1.1 1.3 1.0 1.0 0.7 0.9 1.7 1.2 1.2 1.0 1.0 1.6 1.3 1.2 1.1 1.0 1.0 1.7 1.3 0.9 1.1 -0.0 -0.2 -0.2 -0.2 0.1 0.7 0.4 0.2 0.5 0.1 0.4 0.4 0.4 0.3 0.7 0.9 0.6 0.8 0.9 0.4 1.3 1.6 1.4 1.7 0.9 1.3 0.6 0.3 -0.4 -0.1 1.2 0.9 1.1 1.6 0.8 1.3 1.1 1.6 1.5 1.6 1.1 1.2 0.5 0.7 0.1 0.0 0.2 0.0 1.1 1.0 1.0 1.3 0.2 1.0 1.0 1.0 1.2 0.7 1.0 1.0 0.7 1.0 1.0 0.3 1.0 1.0 4.7 -0.2 2.0 0.2 0.4 0.7 1.1 1.0 1.0 0.9 0.2 0.2 4.5 4.5 -0.3 -0.3 0.2 0.3 1.1 -0.2 0.7 -0.4 -0.4 -0.3 1.2 5.8 6.6 -0.4 1.6 0.1 2.0 2.2 1.5 -0.3 1.6 24.9 0.5 -0.3 0.7 -0.1 2.8 0.4 -0.4 0.7 -0.3 0.4
Arch9 2.9 3.6 4.2 2.8 3.5 5.4 5.0 4.6 4.4 1.4 1.3 1.3 4.4 2.3 2.3 2.0 4.1 2.9 2.8 4.9 6.9 3.4 3.4 3.9 4.3 3.3 4.9 2.7 2.8 3.7 4.1 3.2 5.6 2.8 2.9 4.7 5.0 0.2 0.0 0.2 -0.0 0.0 6.6 5.3 4.4 4.6 4.1 4.9 5.3 5.2 5.4 4.3 3.7 4.7 4.9 5.0 2.2 3.6 8.1 6.0 3.7 3.1 2.6 1.1 0.1 -1.7 -1.2 3.7 1.2 4.1 3.4 4.9 3.8 3.9 3.4 2.3 2.9 4.0 4.2 3.0 3.9 0.9 1.2 0.3 2.1 4.1 2.8 2.8 3.5 0.7 1.6 6.9 6.9 5.3 7.6 6.6 1.6 0.8 1.6 6.9 3.5 7.0 6.9 0.1 2.1 1.6 0.9 8.7 0.2 4.0 1.2 7.4 7.7 8.5 8.7 1.0 1.1 2.3 2.3 1.3 1.3 4.1 0.8 0.8 0.0 0.3 0.1 0.1 0.1 -0.4 -0.1 -0.3 -0.4 -0.2 -0.2 -0.5 -0.1 -0.4 0.6 -0.5 0.0 0.0 -0.6 1.9 -0.0 -0.1 -0.6 -0.1 2.3
Arch10 3.1 2.8 2.7 3.0 3.2 2.4 1.3 1.7 1.1 3.2 3.1 2.6 2.5 2.9 0.3 0.4 3.0 3.1 3.0 2.5 2.8 3.2 2.2 3.2 3.3 3.2 1.7 2.2 1.5 3.2 3.3 3.2 1.9 2.7 2.1 2.4 0.4 -0.7 -0.3 -0.5 -0.6 0.6 2.1 0.4 0.1 0.3 0.5 0.4 0.4 0.5 0.4 2.8 2.8 2.2 2.1 2.2 3.2 3.3 2.1 1.3 1.7 2.1 2.6 4.7 4.9 0.7 2.6 3.2 0.9 3.0 1.5 2.2 0.3 3.3 0.8 2.9 2.5 3.0 3.1 2.2 2.1 2.0 1.6 2.0 5.3 3.0 1.9 1.8 1.7 1.6 2.2 5.8 5.8 6.8 4.5 5.6 2.2 1.5 2.2 5.8 6.1 5.7 5.8 6.3 7.9 3.6 3.0 2.2 3.5 3.0 1.6 4.6 4.4 1.5 1.4 7.0 6.4 7.3 6.9 3.4 3.0 3.0 0.2 2.5 1.1 0.7 0.3 0.8 0.5 0.1 0.7 4.0 2.8 1.6 3.1 2.5 0.4 2.6 -0.2 5.8 0.3 1.1 0.8 2.1 1.0 0.9 4.5 0.9 1.7


In [10]:
# Split WHD data into test/train with 20/80 ratio
split = whd_zcta["flsa_repeat_violator"].runif()
train = whd_zcta[split <= 0.8]
test = whd_zcta[split > 0.8]

# Build a DL model to predict repeat violators and score
s = time.time()
dl_orig = H2ODeepLearningEstimator(epochs = 0.1, hidden = [50,50,50], distribution = "multinomial")
idx_x = train.names
idx_x.remove("flsa_repeat_violator")
idx_x = idx_x[4:]
dl_orig.train(x               =idx_x,
              y               ="flsa_repeat_violator",
              training_frame  =train,
              validation_frame=test)
orig_elapsed = time.time() - s


deeplearning Model Build Progress: [##################################################] 100%

In [11]:
# Replace zcta5_cd column in WHD data with GLRM archetypes
zcta_arch_x["zcta5_cd"] = acs_zcta_col
whd_arch = whd_zcta.merge(zcta_arch_x, allLeft = True, allRite = False)
whd_arch = whd_arch.drop("zcta5_cd")
whd_arch.describe()


Rows: 208,806 Cols: 113

Chunk compression summary:
chunk_type chunk_name count count_percentage size size_percentage
C0L Constant Integers 40 17.699116 3.1 KB 0.0
CX0 Sparse Bits 7 3.097345 674 B 0.0
CXI Sparse Integers 69 30.530973 213.7 KB 0.6039514
C1 1-Byte Integers 2 0.9 204.0 KB 0.6
C1N 1-Byte Integers (w/o NAs) 5 2.2123895 512.8 KB 1.4495845
C2 2-Byte Integers 11 4.8672566 2.2 MB 6.2343006
C4 4-Byte Integers 14 6.19469 5.6 MB 16.264456
CXD Sparse Reals 46 20.353981 381.4 KB 1.0781779
C8D 64-bit Reals 32 14.159292 25.5 MB 73.78209
Frame distribution summary:
size number_of_rows number_of_chunks_per_column number_of_chunks
192.168.1.76:54321 34.5 MB 208806.0 2.0 226.0
mean 34.5 MB 208806.0 2.0 226.0
min 34.5 MB 208806.0 2.0 226.0
max 34.5 MB 208806.0 2.0 226.0
stddev 0 B 0.0 0.0 0.0
total 34.5 MB 208806.0 2.0 226.0
Column-by-Column Summary:

trade_nm cty_nm st_cd case_id naic_cd flsa_repeat_violator case_violtn_cnt cmp_assd_cnt ee_violtd_cnt bw_atp_amt ee_atp_cnt flsa_violtn_cnt flsa_bw_atp_amt flsa_ee_atp_cnt flsa_mw_bw_atp_amt flsa_ot_bw_atp_amt flsa_15a3_bw_atp_amt flsa_cmp_assd_amt sca_violtn_cnt sca_bw_atp_amt sca_ee_atp_cnt mspa_violtn_cnt mspa_bw_atp_amt mspa_ee_atp_cnt mspa_cmp_assd_amt h1b_violtn_cnt h1b_bw_atp_amt h1b_ee_atp_cnt h1b_cmp_assd_amt fmla_violtn_cnt fmla_bw_atp_amt fmla_ee_atp_cnt fmla_cmp_assd_amt flsa_cl_violtn_cnt flsa_cl_minor_cnt flsa_cl_cmp_assd_amt dbra_cl_violtn_cnt dbra_bw_atp_amt dbra_ee_atp_cnt h2a_violtn_cnt h2a_bw_atp_amt h2a_ee_atp_cnt h2a_cmp_assd_amt flsa_smw14_violtn_cnt flsa_smw14_bw_amt flsa_smw14_ee_atp_cnt cwhssa_violtn_cnt cwhssa_bw_amt cwhssa_ee_cnt osha_violtn_cnt osha_bw_atp_amt osha_ee_atp_cnt osha_cmp_assd_amt eppa_violtn_cnt eppa_bw_atp_amt eppa_ee_cnt eppa_cmp_assd_amt h1a_violtn_cnt h1a_bw_atp_amt h1a_ee_atp_cnt h1a_cmp_assd_amt crew_violtn_cnt crew_bw_atp_amt crew_ee_atp_cnt crew_cmp_assd_amt ccpa_violtn_cnt ccpa_bw_atp_amt ccpa_ee_atp_cnt flsa_smwpw_violtn_cnt flsa_smwpw_bw_atp_amt flsa_smwpw_ee_atp_cnt flsa_hmwkr_violtn_cnt flsa_hmwkr_bw_atp_amt flsa_hmwkr_ee_atp_cnt flsa_hmwkr_cmp_assd_amt ca_violtn_cnt ca_bw_atp_amt ca_ee_atp_cnt pca_violtn_cnt pca_bw_atp_amt pca_ee_atp_cnt flsa_smwap_violtn_cnt flsa_smwap_bw_atp_amt flsa_smwap_ee_atp_cnt flsa_smwft_violtn_cnt flsa_smwft_bw_atp_amt flsa_smwft_ee_atp_cnt flsa_smwl_violtn_cnt flsa_smwl_bw_atp_amt flsa_smwl_ee_atp_cnt flsa_smwmg_violtn_cnt flsa_smwmg_bw_atp_amt flsa_smwmg_ee_atp_cnt flsa_smwsl_violtn_cnt flsa_smwsl_bw_atp_amt flsa_smwsl_ee_atp_cnt eev_violtn_cnt h2b_violtn_cnt h2b_bw_atp_amt h2b_ee_atp_cnt sraw_violtn_cnt sraw_bw_atp_amt sraw_ee_atp_cnt Arch2 Arch3 Arch4 Arch5 Arch6 Arch7 Arch8 Arch9 Arch10 Arch1
type enum enum enum int int enum int real int real int int real int real real real real int real int int real int real int real int real int real int int int int real int real int int real int real int real int int real int int int int real int real int int int real int int int int int int int real int int real int int real int int int real int int real int int int int int real int int int int int int int int real int int int real int int int int real real real real real real real real real real
mins 0.0 0.0 0.0 1918.0 9.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 -0.0 -0.4 -0.4 -0.4 -0.7 -0.0 -0.2 -0.4 -0.3 -0.1
maxs 178147.0 11043.0 55.0 1081028538.0 928120.0 3.0 530002.0 3129900.0 76664.0 30857204.7 76664.0 76664.0 30857204.7 76664.0 1426727.4 30857204.7 103847.31 708950.0 13667.0 18598525.9 7170.0 905.0 123330.54 903.0 97225.0 1742.0 4222146.3 1046.0 1740000.0 530002.0 118250.0 105.0 100.0 766.0 764.0 287980.0 752.0 5745385.3 747.0 2661.0 2338699.6 1365.0 3129900.0 544.0 299592.9 544.0 6171.0 3213935.4 2153.0 32.0 0.0 0.0 37800.0 166.0 20750.78 2.0 274000.0 119.0 1041759.0 119.0 40000.0 2.0 0.0 0.0 165000.0 27.0 25000.0 27.0 220.0 188895.49 191.0 43.0 15945.66 33.0 500.0 45.0 25973.23 45.0 224.0 360000.0 224.0 0.0 0.0 0.0 7.0 1254.91 7.0 0.0 0.0 0.0 0.0 0.0 0.0 3.0 382.05 2.0 1.0 194.0 93507.0 159.0 0.0 0.0 0.0 1.0 1.0 2.3 0.6 0.9 0.8 2.0 2.0 2.9 1.0
mean NaN NaN NaN 1554203.5 386060.0 NaN 26.7 539.6 13.8 10330.6 12.5 13.3 7610.0 10.9 1092.4 6512.0 2.5 209.4 1.2 1262.5 0.7 0.3 21.3 0.1 45.4 0.1 347.5 0.0 35.9 10.1 48.9 0.0 0.0 0.2 0.1 154.2 0.5 781.1 0.3 0.3 63.1 0.1 76.6 0.3 44.2 0.3 0.3 135.7 0.3 0.0 0.0 0.0 3.1 0.0 0.7 0.0 6.3 0.0 5.0 0.0 0.2 0.0 0.0 0.0 1.4 0.0 0.5 0.0 0.0 2.2 0.0 0.0 0.2 0.0 0.0 0.0 0.3 0.0 0.0 3.1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 4.1 0.0 0.0 0.0 0.0 -0.0 0.1 0.0 0.2 0.1 0.2 0.0 0.1 0.1 0.2
sigma NaN NaN NaN 2365752.3 269790.4 NaN 1404.6 10479.0 198.0 105051.0 197.4 199.2 84369.4 196.0 12170.2 83106.5 292.7 4025.9 37.1 51171.7 21.2 5.9 804.5 4.9 695.1 4.9 14265.2 3.0 4924.4 1389.4 981.7 0.3 0.4 2.6 2.4 1912.5 5.8 19590.9 3.9 13.5 5981.3 5.8 8009.1 6.1 1455.2 5.8 15.8 9880.6 9.3 0.3 0.0 0.0 175.0 0.6 94.5 0.0 781.2 0.3 2279.8 0.3 87.5 0.0 0.0 0.0 420.5 0.1 63.2 0.1 0.8 476.4 0.8 0.2 48.2 0.1 1.1 0.1 65.1 0.1 0.6 895.5 0.5 0.0 0.0 0.0 0.0 2.8 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.8 422.3 0.6 0.0 0.0 0.0 0.0 0.2 0.1 0.1 0.2 0.2 0.1 0.2 0.2 0.2
zero_count 1 1 401 0 0 197677 52029 191181 66849 91683 92486 79949 107455 107941 176733 116858 208707 202664 203548 203885 204044 200862 207890 207896 204455 207098 207374 207423 208521 200531 207013 207050 208802 199621 200240 202775 202274 202881 202915 207246 208016 208026 207926 207180 207252 207268 204894 205041 205075 207983 208806 208806 208472 208638 208783 208784 208738 208805 208805 208805 208805 208803 208806 208806 208803 208663 208697 208698 208750 208761 208761 208757 208795 208796 208805 208791 208793 208793 208790 208794 208794 208806 208806 208806 208804 208804 208804 208806 208806 208806 208806 208806 208806 208798 208798 208798 208790 208673 208747 208747 208806 208806 208806 0 0 0 0 0 0 0 0 0 0
missing_count 0 4 3 0 180 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3628 3628 3628 3628 3628 3628 3628 3628 3628 3628

In [12]:
# Split WHD data into test/train with 20/80 ratio
train_mod = whd_arch[split <= 0.8]
test_mod = whd_arch[split > 0.8]

# Build a GBM model to predict repeat violators and score
s = time.time()
dl_mod = H2ODeepLearningEstimator(epochs = 0.1, hidden = [50,50,50], distribution = "multinomial")

dl_mod.train(x               =idx_x,
             y               ="flsa_repeat_violator",
             training_frame  =train,
             validation_frame=test)

mod_elapsed = time.time() - s


deeplearning Model Build Progress: [##################################################] 100%

In [13]:
# Model performance comparison
train_ll_orig = dl_orig.model_performance(train).logloss()
test_ll_orig  = dl_orig.model_performance(test ).logloss()
train_ll_mod  = dl_mod .model_performance(train).logloss()
test_ll_mod   = dl_mod .model_performance(test ).logloss()

# Print results in pretty HTML table
header = ["Metric"   , "Original"    , "Reduced"    ]
table = [
         ["Runtime"  , orig_elapsed  , mod_elapsed  ],
         ["Train LogLoss", train_ll_orig, train_ll_mod],
         ["Test LogLoss" , test_ll_orig , test_ll_mod ],
        ]
h2o.H2ODisplay(table,header)


Metric Original Reduced
Runtime 30.0 30.1
Train LogLoss 0.2 0.2
Test LogLoss 0.2 0.2
Out[13]:
Metric Original Reduced
Runtime 30.0 30.1
Train LogLoss 0.2 0.2
Test LogLoss 0.2 0.2

In [ ]: