In [1]:
cd ~


/home/samuel_harrold

In [2]:
# Import standard packages.
import os
import subprocess
import sys
# Import installed packages.
import numpy as np
import pandas as pd
# Import local packages.
# `dsdemos` version: https://github.com/stharrold/dsdemos/releases/tag/v0.0.3
# Prepend `./dsdemos` (relative to the home directory after the `cd ~` above)
# to the module search path so the local checkout shadows any installed copy.
# NOTE(review): assumes `./dsdemos` contains the importable `dsdemos`
# package directory — confirm the checkout layout.
sys.path.insert(0, os.path.join(os.path.curdir, r'dsdemos'))
import dsdemos as dsd

In [3]:
# File paths.
# Local mirror of http://www2.census.gov/programs-surveys/acs/
path_acs = r'/mnt/disk-20151227t211000z/www2-census-gov/programs-surveys/acs/'

def _acs_path(relpath):
    """Return `relpath` joined onto the local ACS mirror's base path."""
    return os.path.join(path_acs, relpath)

# 2013 5-year PUMS data dictionary.
path_dtxt = _acs_path(r'tech_docs/pums/data_dict/PUMS_Data_Dictionary_2009-2013.txt')
# 2013 5-year PUMS housing records for Washington DC.
# NOTE(review): the original comment said these were extracted from
# csv_pdc.zip, but `ss13hdc.csv` is a housing ('h') file — the archive is
# presumably csv_hdc.zip; confirm against the Census download site.
path_hcsv = _acs_path(r'data/pums/2013/5-Year/ss13hdc.csv')
# 2013 5-year PUMS user verification estimates.
path_ecsv = _acs_path(r'tech_docs/pums/estimates/pums_estimates_9_13.csv')

In [4]:
# Parse the PUMS data dictionary and preview it.
ddict = dsd.census.parse_pumsdatadict(path=path_dtxt)
print("`ddict`, `dfd`: Convert the nested `dict` into a hierarchical data frame.")
# Build one DataFrame per record type, then stack them into a single frame
# keyed by a (record_type, var_name) MultiIndex.
record_frames = {
    rtype: pd.DataFrame.from_dict(rvars, orient='index')
    for rtype, rvars in ddict['record_types'].items()}
dfd = pd.concat(record_frames, names=['record_type', 'var_name'])
dfd.head()


`ddict`, `dfd`: Convert the nested `dict` into a hierarchical data frame.
Out[4]:
length description var_codes notes
record_type var_name
HOUSING RECORD ACR 1 Lot size {'b': 'N/A (GQ/not a one-family house or mobil... NaN
ADJHSG 7 Adjustment factor for housing dollar amounts (... {'1086032': '2009 factor', '1068395': '2010 fa... [Note: The values of ADJHSG inflation-adjusts ...
ADJINC 7 Adjustment factor for income and earnings doll... {'1085467': '2009 factor (0.999480 * 1.0860317... [Note: The values of ADJINC inflation-adjusts ...
AGS 1 Sales of Agriculture Products (Yearly sales) {'b': 'N/A (GQ/vacant/not a one-family house o... [Note: No adjustment factor is applied to AGS.]
BATH 1 Bathtub or shower {'b': 'N/A (GQ)', '1': 'Yes', '2': 'No'} NaN

In [5]:
# Read the DC housing records and preview a small slice:
# the first 5 rows of the first 10 columns.
print("`dfh`: First 5 housing records and first 10 columns.")
dfh = pd.read_csv(path_hcsv)
dfh.head().iloc[:, :10]


`dfh`: First 5 housing records and first 10 columns.
Out[5]:
insp RT SERIALNO DIVISION PUMA00 PUMA10 REGION ST ADJHSG ADJINC
0 600 H 2009000000403 5 102 -9 3 11 1086032 1085467
1 NaN H 2009000001113 5 103 -9 3 11 1086032 1085467
2 480 H 2009000001978 5 103 -9 3 11 1086032 1085467
3 NaN H 2009000002250 5 105 -9 3 11 1086032 1085467
4 2500 H 2009000002985 5 101 -9 3 11 1086032 1085467

In [6]:
# Read the user-verification estimates and narrow them to Washington DC.
# The row at index 310 holds the characteristic verified below,
# 'Owner occupied units (TEN in 1,2)'.
dfe = pd.read_csv(path_ecsv)
is_dc = dfe['state'].eq('District of Columbia')
dfe_dc = dfe.loc[is_dc]
print("`dfe_dc`: This example verifies these quantities.")
dfe_dc.loc[[310]]


`dfe_dc`: This example verifies these quantities.
Out[6]:
st state characteristic pums_est_09_to_13 pums_se_09_to_13 pums_moe_09_to_13
310 11 District of Columbia Owner occupied units (TEN in 1,2) 110,362 1363 2242

In [7]:
# Verify the estimates following
# https://www.census.gov/programs-surveys/acs/
#     technical-documentation/pums/documentation.2013.html
#     tech_docs/pums/accuracy/2009_2013AccuracyPUMS.pdf
# Define the column names for the housing weights.
# Select the reference verification data for the characteristic,
# and select the records for the characteristic.
hwt = 'WGTP'
hwts = [hwt+str(inum) for inum in range(1, 81)] # ['WGTP1', ..., 'WGTP80']
char = 'Owner occupied units (TEN in 1,2)'
print("'{char}'".format(char=char))
tfmask_ref = dfe_dc['characteristic'] == char
tfmask_test = np.logical_or(dfh['TEN'] == 1, dfh['TEN'] == 2)

# Calculate and verify the estimate ('est') for the characteristic.
# The estimate is the sum of the sample weights 'WGTP'.
col = 'pums_est_09_to_13'
print("    '{col}':".format(col=col), end=' ')
ref_est = int(dfe_dc.loc[tfmask_ref, col].values[0].replace(',', ''))
test_est = dfh.loc[tfmask_test, hwt].sum()
assert np.isclose(ref_est, test_est, rtol=0, atol=1)
print("(ref, test) = {tup}".format(tup=(ref_est, test_est)))

# Calculate and verify the "direct standard error" ('se') of the estimate.
# The direct standard error is a modified root-mean-square deviation
# using the "replicate weights" 'WGTP[1-80]'.
col = 'pums_se_09_to_13'
print("    '{col}' :".format(col=col), end=' ')
ref_se = dfe_dc.loc[tfmask_ref, col].values[0]
test_se = ((4/80)*((dfh.loc[tfmask_test, hwts].sum() - test_est)**2).sum())**0.5
assert np.isclose(ref_se, test_se, rtol=0, atol=1)
print("(ref, test) = {tup}".format(tup=(ref_se, test_se)))

# Calculate and verify the margin of error ('moe') at the
# 90% confidence level (+/- 1.645 standard errors).
col = 'pums_moe_09_to_13'
print("    '{col}':".format(col=col), end=' ')
ref_moe = dfe_dc.loc[tfmask_ref, col].values[0]
test_moe = 1.645*test_se
assert np.isclose(ref_moe, test_moe, rtol=0, atol=1)
print("(ref, test) = {tup}".format(tup=(ref_moe, test_moe)))


'Owner occupied units (TEN in 1,2)'
    'pums_est_09_to_13': (ref, test) = (110362, 110362)
    'pums_se_09_to_13' : (ref, test) = (1363, 1363.1910174293257)
    'pums_moe_09_to_13': (ref, test) = (2242, 2242.449223671241)

In [8]:
# Render this notebook to HTML with nbconvert, once per template.
path_static = os.path.join(os.path.expanduser(r'~'), r'stharrold.github.io/content/static')
basename = r'20160110-etl-census-with-python'
filename = r'example'
path_ipynb = os.path.join(path_static, basename, filename+'.ipynb')
for template in ['basic', 'full']:
    # Output file: same stem as the notebook, suffixed with the template name.
    (stem, _ext) = os.path.splitext(path_ipynb)
    path_html = stem + '-' + template + '.html'
    cmd = ['jupyter', 'nbconvert', '--to', 'html', '--template', template,
           path_ipynb, '--output', path_html]
    print(' '.join(cmd))
    subprocess.run(args=cmd, check=True)
    print()


jupyter nbconvert --to html --template basic /home/samuel_harrold/stharrold.github.io/content/static/20160110-etl-census-with-python/example.ipynb --output /home/samuel_harrold/stharrold.github.io/content/static/20160110-etl-census-with-python/example-basic.html

jupyter nbconvert --to html --template full /home/samuel_harrold/stharrold.github.io/content/static/20160110-etl-census-with-python/example.ipynb --output /home/samuel_harrold/stharrold.github.io/content/static/20160110-etl-census-with-python/example-full.html