In [1]:
cd ~
In [2]:
# Import standard packages.
import os
import subprocess
import sys
# Import installed packages.
import numpy as np
import pandas as pd
# Import local packages.
# Insert current directory into module search path.
# `dsdemos` version: https://github.com/stharrold/dsdemos/releases/tag/v0.0.3
# NOTE(review): this assumes the current working directory (home, after the
# `cd ~` cell above) contains a `dsdemos/` checkout — confirm before running.
sys.path.insert(0, os.path.join(os.path.curdir, r'dsdemos'))
import dsdemos as dsd
In [3]:
# File paths
# Base path to ACS:
# http://www2.census.gov/programs-surveys/acs/
# 2013 5-year PUMS data dictionary:
# tech_docs/pums/data_dict/PUMS_Data_Dictionary_2009-2013.txt
# 2013 5-year PUMS housing records for Washington DC
# (ss13hdc.csv, extracted from the housing archive csv_hdc.zip):
# data/pums/2013/5-Year/csv_hdc.zip
# 2013 5-year PUMS user verification estimates:
# tech_docs/pums/estimates/pums_estimates_9_13.csv
path_acs = r'/mnt/disk-20151227t211000z/www2-census-gov/programs-surveys/acs/'
relpath_dtxt = r'tech_docs/pums/data_dict/PUMS_Data_Dictionary_2009-2013.txt'
relpath_hcsv = r'data/pums/2013/5-Year/ss13hdc.csv'
relpath_ecsv = r'tech_docs/pums/estimates/pums_estimates_9_13.csv'
path_dtxt = os.path.join(path_acs, relpath_dtxt)
path_hcsv = os.path.join(path_acs, relpath_hcsv)
path_ecsv = os.path.join(path_acs, relpath_ecsv)
In [4]:
# Load and display the data dictionary.
ddict = dsd.census.parse_pumsdatadict(path=path_dtxt)
print("`ddict`, `dfd`: Convert the nested `dict` into a hierarchical data frame.")
# One frame per record type, stacked under a hierarchical
# (record_type, var_name) index.
frames_by_rectype = {
    rectype: pd.DataFrame.from_dict(rectype_vars, orient='index')
    for rectype, rectype_vars in ddict['record_types'].items()}
dfd = pd.concat(frames_by_rectype, names=['record_type', 'var_name'])
dfd.head()
Out[4]:
In [5]:
# Load the housing records and preview the first rows/columns.
dfh = pd.read_csv(path_hcsv)
print("`dfh`: First 5 housing records and first 10 columns.")
dfh.iloc[:5, :10]
Out[5]:
In [6]:
# Load the verification estimates, keep only the Washington DC rows,
# then display the record for the characteristic
# 'Owner occupied units (TEN in 1,2)'.
dfe = pd.read_csv(path_ecsv)
tfmask_dc = dfe['state'].eq('District of Columbia')
dfe_dc = dfe.loc[tfmask_dc]
print("`dfe_dc`: This example verifies these quantities.")
# NOTE(review): 310 is a hard-coded index label for the owner-occupied-units
# row — fragile if the source CSV ever changes; confirm against the file.
dfe_dc.loc[[310]]
Out[6]:
In [7]:
# Verify the estimates following
# https://www.census.gov/programs-surveys/acs/
# technical-documentation/pums/documentation.2013.html
# tech_docs/pums/accuracy/2009_2013AccuracyPUMS.pdf
# Define the column names for the housing weights.
# Select the reference verification data for the characteristic,
# and select the records for the characteristic.
hwt = 'WGTP'  # sample-weight column
hwts = [hwt+str(inum) for inum in range(1, 81)]  # ['WGTP1', ..., 'WGTP80']
char = 'Owner occupied units (TEN in 1,2)'
print("'{char}'".format(char=char))
tfmask_ref = dfe_dc['characteristic'] == char
tfmask_test = np.logical_or(dfh['TEN'] == 1, dfh['TEN'] == 2)

def _verify(label, ref, test):
    """Print `label`, assert `test` matches `ref` within +/-1
    (published reference values are rounded), then print both values.
    """
    print(label, end=' ')
    assert np.isclose(ref, test, rtol=0, atol=1)
    print("(ref, test) = {tup}".format(tup=(ref, test)))

# Estimate ('est') for the characteristic:
# the sum of the sample weights 'WGTP'.
col = 'pums_est_09_to_13'
ref_est = int(dfe_dc.loc[tfmask_ref, col].values[0].replace(',', ''))
test_est = dfh.loc[tfmask_test, hwt].sum()
_verify(" '{col}':".format(col=col), ref_est, test_est)

# "Direct standard error" ('se') of the estimate:
# a modified root-mean-square deviation over the 80 replicate
# estimates computed from the "replicate weights" 'WGTP[1-80]'.
col = 'pums_se_09_to_13'
ref_se = dfe_dc.loc[tfmask_ref, col].values[0]
test_se = ((4/80)*((dfh.loc[tfmask_test, hwts].sum() - test_est)**2).sum())**0.5
_verify(" '{col}' :".format(col=col), ref_se, test_se)

# Margin of error ('moe') at the 90% confidence level
# (+/- 1.645 standard errors).
col = 'pums_moe_09_to_13'
ref_moe = dfe_dc.loc[tfmask_ref, col].values[0]
test_moe = 1.645*test_se
_verify(" '{col}':".format(col=col), ref_moe, test_moe)
In [8]:
# Export this notebook to HTML via nbconvert, once per template.
path_static = os.path.join(os.path.expanduser(r'~'), r'stharrold.github.io/content/static')
basename = r'20160110-etl-census-with-python'
filename = r'example'
path_ipynb = os.path.join(path_static, basename, filename+'.ipynb')
# Loop-invariant: the output root is the notebook path without its extension.
path_root = os.path.splitext(path_ipynb)[0]
for tpl in ['basic', 'full']:
    path_html = '{root}-{tpl}.html'.format(root=path_root, tpl=tpl)
    cmd = ['jupyter', 'nbconvert', '--to', 'html', '--template', tpl, path_ipynb, '--output', path_html]
    print(' '.join(cmd))
    subprocess.run(args=cmd, check=True)
    print()