In [1]:
cd ~


/home/samuel_harrold

In [2]:
# Import standard packages.
import os
import subprocess
import sys
# Import installed packages.
import numpy as np
import pandas as pd
# Import local packages.
# `dsdemos` version: https://github.com/stharrold/dsdemos/releases/tag/v0.0.3
# Prepend `./dsdemos` (relative to the home directory after the `cd ~` above)
# to the module search path so the local checkout shadows any installed copy.
# NOTE(review): assumes `./dsdemos` contains the importable `dsdemos`
# package directory — confirm the checkout layout.
sys.path.insert(0, os.path.join(os.path.curdir, r'dsdemos'))
import dsdemos as dsd

In [3]:
# File paths.
# Local mirror of http://www2.census.gov/programs-surveys/acs/
path_acs = r'/mnt/disk-20151227t211000z/www2-census-gov/programs-surveys/acs/'

def _acs_path(relpath):
    """Return `relpath` joined onto the local ACS mirror's base path."""
    return os.path.join(path_acs, relpath)

# 2013 5-year PUMS data dictionary.
path_dtxt = _acs_path(r'tech_docs/pums/data_dict/PUMS_Data_Dictionary_2009-2013.txt')
# 2013 5-year PUMS housing records for Washington DC.
# NOTE(review): the original comment said these were extracted from
# csv_pdc.zip, but `ss13hdc.csv` is a housing ('h') file — the archive is
# presumably csv_hdc.zip; confirm against the Census download site.
path_hcsv = _acs_path(r'data/pums/2013/5-Year/ss13hdc.csv')
# 2013 5-year PUMS user verification estimates.
path_ecsv = _acs_path(r'tech_docs/pums/estimates/pums_estimates_9_13.csv')

In [4]:
# Parse the PUMS data dictionary and preview it.
ddict = dsd.census.parse_pumsdatadict(path=path_dtxt)
print("`ddict`, `dfd`: Convert the nested `dict` into a hierarchical data frame.")
# Build one DataFrame per record type, then stack them into a single frame
# keyed by a (record_type, var_name) MultiIndex.
record_frames = {
    rtype: pd.DataFrame.from_dict(rvars, orient='index')
    for rtype, rvars in ddict['record_types'].items()}
dfd = pd.concat(record_frames, names=['record_type', 'var_name'])
dfd.head()


`ddict`, `dfd`: Convert the nested `dict` into a hierarchical data frame.
Out[4]:
length description var_codes notes
record_type var_name
HOUSING RECORD ACR 1 Lot size {'b': 'N/A (GQ/not a one-family house or mobil... NaN
ADJHSG 7 Adjustment factor for housing dollar amounts (... {'1086032': '2009 factor', '1068395': '2010 fa... [Note: The values of ADJHSG inflation-adjusts ...
ADJINC 7 Adjustment factor for income and earnings doll... {'1085467': '2009 factor (0.999480 * 1.0860317... [Note: The values of ADJINC inflation-adjusts ...
AGS 1 Sales of Agriculture Products (Yearly sales) {'b': 'N/A (GQ/vacant/not a one-family house o... [Note: No adjustment factor is applied to AGS.]
BATH 1 Bathtub or shower {'b': 'N/A (GQ)', '1': 'Yes', '2': 'No'} NaN

In [5]:
# Read the DC housing records and preview a small slice:
# the first 5 rows of the first 10 columns.
print("`dfh`: First 5 housing records and first 10 columns.")
dfh = pd.read_csv(path_hcsv)
dfh.head().iloc[:, :10]


`dfh`: First 5 housing records and first 10 columns.
Out[5]:
insp RT SERIALNO DIVISION PUMA00 PUMA10 REGION ST ADJHSG ADJINC
0 600 H 2009000000403 5 102 -9 3 11 1086032 1085467
1 NaN H 2009000001113 5 103 -9 3 11 1086032 1085467
2 480 H 2009000001978 5 103 -9 3 11 1086032 1085467
3 NaN H 2009000002250 5 105 -9 3 11 1086032 1085467
4 2500 H 2009000002985 5 101 -9 3 11 1086032 1085467

In [6]:
# Read the user-verification estimates and narrow them to Washington DC.
# The row at index 310 holds the characteristic verified below,
# 'Owner occupied units (TEN in 1,2)'.
dfe = pd.read_csv(path_ecsv)
is_dc = dfe['state'].eq('District of Columbia')
dfe_dc = dfe.loc[is_dc]
print("`dfe_dc`: This example verifies these quantities.")
dfe_dc.loc[[310]]


`dfe_dc`: This example verifies these quantities.
Out[6]:
st state characteristic pums_est_09_to_13 pums_se_09_to_13 pums_moe_09_to_13
310 11 District of Columbia Owner occupied units (TEN in 1,2) 110,362 1363 2242

In [7]:
# Verify the estimates following
# https://www.census.gov/programs-surveys/acs/
#     technical-documentation/pums/documentation.2013.html
#     tech_docs/pums/accuracy/2009_2013AccuracyPUMS.pdf
# Define the column names for the housing weights.
# Select the reference verification data for the characteristic,
# and select the records for the characteristic.
hwt = 'WGTP'
hwts = [hwt+str(inum) for inum in range(1, 81)] # ['WGTP1', ..., 'WGTP80']
char = 'Owner occupied units (TEN in 1,2)'
print("'{char}'".format(char=char))
tfmask_ref = dfe_dc['characteristic'] == char
tfmask_test = np.logical_or(dfh['TEN'] == 1, dfh['TEN'] == 2)

# Calculate and verify the estimate ('est') for the characteristic.
# The estimate is the sum of the sample weights 'WGTP'.
col = 'pums_est_09_to_13'
print("    '{col}':".format(col=col), end=' ')
ref_est = int(dfe_dc.loc[tfmask_ref, col].values[0].replace(',', ''))
test_est = dfh.loc[tfmask_test, hwt].sum()
assert np.isclose(ref_est, test_est, rtol=0, atol=1)
print("(ref, test) = {tup}".format(tup=(ref_est, test_est)))

# Calculate and verify the "direct standard error" ('se') of the estimate.
# The direct standard error is a modified root-mean-square deviation
# using the "replicate weights" 'WGTP[1-80]'.
col = 'pums_se_09_to_13'
print("    '{col}' :".format(col=col), end=' ')
ref_se = dfe_dc.loc[tfmask_ref, col].values[0]
test_se = ((4/80)*((dfh.loc[tfmask_test, hwts].sum() - test_est)**2).sum())**0.5
assert np.isclose(ref_se, test_se, rtol=0, atol=1)
print("(ref, test) = {tup}".format(tup=(ref_se, test_se)))

# Calculate and verify the margin of error ('moe') at the
# 90% confidence level (+/- 1.645 standard errors).
col = 'pums_moe_09_to_13'
print("    '{col}':".format(col=col), end=' ')
ref_moe = dfe_dc.loc[tfmask_ref, col].values[0]
test_moe = 1.645*test_se
assert np.isclose(ref_moe, test_moe, rtol=0, atol=1)
print("(ref, test) = {tup}".format(tup=(ref_moe, test_moe)))


'Owner occupied units (TEN in 1,2)'
    'pums_est_09_to_13': (ref, test) = (110362, 110362)
    'pums_se_09_to_13' : (ref, test) = (1363, 1363.1910174293257)
    'pums_moe_09_to_13': (ref, test) = (2242, 2242.449223671241)

In [8]:
# Render this notebook to HTML with nbconvert, once per template.
path_static = os.path.join(os.path.expanduser(r'~'), r'stharrold.github.io/content/static')
basename = r'20160110-etl-census-with-python'
filename = r'example'
path_ipynb = os.path.join(path_static, basename, filename+'.ipynb')
for template in ['basic', 'full']:
    # Output file: same stem as the notebook, suffixed with the template name.
    (stem, _ext) = os.path.splitext(path_ipynb)
    path_html = stem + '-' + template + '.html'
    cmd = ['jupyter', 'nbconvert', '--to', 'html', '--template', template,
           path_ipynb, '--output', path_html]
    print(' '.join(cmd))
    subprocess.run(args=cmd, check=True)
    print()


jupyter nbconvert --to html --template basic /home/samuel_harrold/stharrold.github.io/content/static/20160110-etl-census-with-python/example.ipynb --output /home/samuel_harrold/stharrold.github.io/content/static/20160110-etl-census-with-python/example-basic.html

jupyter nbconvert --to html --template full /home/samuel_harrold/stharrold.github.io/content/static/20160110-etl-census-with-python/example.ipynb --output /home/samuel_harrold/stharrold.github.io/content/static/20160110-etl-census-with-python/example-full.html