In [6]:
import io
import pathlib
import urllib.request
import tempfile
import zipfile

import numpy as np
import pandas as pd

Background (links good as of 2019-12-10)

This takes a fair amount of RAM and download time

This script fetches ~50 .sf1 state and territory zip files from the Census server (the URL template appears in the code below).

Each of these zip files contains (among other things) a fixed-width geographic file and a CSV population file.

The fixed-width geo file is useful because it ties a "Logical Record" to ZCTA codes. The CSV population file is useful because it contains the actual data we need for the logical records.

Specifically, we are pulling our data from population table 5 ("P5"; see the Summary File 1 documentation, page 184).

P5. HISPANIC OR LATINO ORIGIN BY RACE Universe: Total population (17)

The data for this table is as follows:

P5 data:
    Total:
        Not Hispanic or Latino:
            White alone
            Black or African American alone
            American Indian and Alaska Native alone
            Asian alone
            Native Hawaiian and Other Pacific Islander alone 
            Some Other Race alone
            Two or More Races
        Hispanic or Latino:
            White alone
            Black or African American alone 
            American Indian and Alaska Native alone
            Asian alone
            Native Hawaiian and Other Pacific Islander alone 
            Some Other Race alone
            Two or More Races

Notes:

  • The geo table must be filtered to summary level == 871 (State-5-Digit ZIP Code Tabulation Area) so that it is limited to ZCTA records.
  • The cut-down geo table is joined to the population table using LOGRECNO (Logical Record Number).
  • All Hispanic groups are condensed into a single group for consistency with the surname data.
  • Asian and Native Hawaiian/Pacific Islander counts are combined into a single API group, also for consistency with the surname data.
  • "Some Other Race" is apportioned among the remaining groups (see the toy sketch below).

For more details, see the Census Bureau's technical documentation for Summary File 1.


In [41]:
# These are the 1-based start/stop indices for the fixed-width geo file.
# They allow the file to be read directly into a dataframe.
GEO_MAP_2010 = {
    'FILEID'  : (1  , 7  ),
    'STUSAB'  : (7  , 9  ),
    'SUMLEV'  : (9  , 12 ),
    'GEOCOMP' : (12 , 14 ),
    'CHARITER': (14 , 17 ),
    'CIFSN'   : (17 , 19 ),
    'LOGRECNO': (19 , 26 ),
    'REGION'  : (26 , 27 ),
    'DIVISION': (27 , 28 ),
    'STATE'   : (28 , 30 ),
    'COUNTY'  : (30 , 33 ),
    'COUNTYCC': (33 , 35 ),
    'COUNTYSC': (35 , 37 ),
    'COUSUB'  : (37 , 42 ),
    'COUSUBCC': (42 , 44 ),
    'COUSUBSC': (44 , 46 ),
    'PLACE'   : (46 , 51 ),
    'PLACECC' : (51 , 53 ),
    'PLACESC' : (53 , 55 ),
    'TRACT'   : (55 , 61 ),
    'BLKGRP'  : (61 , 62 ),
    'BLOCK'   : (62 , 66 ),
    'IUC'     : (66 , 68 ),
    'CONCIT'  : (68 , 73 ),
    'CONCITCC': (73 , 75 ),
    'CONCITSC': (75 , 77 ),
    'AIANHH'  : (77 , 81 ),
    'AIANHHFP': (81 , 86 ),
    'AIANHHCC': (86 , 88 ),
    'AIHHTLI' : (88 , 89 ),
    'AITSCE'  : (89 , 92 ),
    'AITS'    : (92 , 97 ),
    'AITSCC'  : (97 , 99 ),
    'TTRACT'  : (99 , 105),
    'TBLKGRP' : (105, 106),
    'ANRC'    : (106, 111),
    'ANRCCC'  : (111, 113),
    'CBSA'    : (113, 118),
    'CBSASC'  : (118, 120),
    'METDIV'  : (120, 125),
    'CSA'     : (125, 128),
    'NECTA'   : (128, 133),
    'NECTASC' : (133, 135),
    'NECTADIV': (135, 140),
    'CNECTA'  : (140, 143),
    'CBSAPCI' : (143, 144),
    'NECTAPCI': (144, 145),
    'UA'      : (145, 150),
    'UASC'    : (150, 152),
    'UATYPE'  : (152, 153),
    'UR'      : (153, 154),
    'CD'      : (154, 156),
    'SLDU'    : (156, 159),
    'SLDL'    : (159, 162),
    'VTD'     : (162, 168),
    'VTDI'    : (168, 169),
    'RESERVE2': (169, 172),
    'ZCTA5'   : (172, 177),
    'SUBMCD'  : (177, 182),
    'SUBMCDCC': (182, 184),
    'SDELM'   : (184, 189),
    'SDSEC'   : (189, 194),
    'SDUNI'   : (194, 199),
    'AREALAND': (199, 213),
    'AREAWATR': (213, 227),
    'NAME'    : (227, 317),
    'FUNCSTAT': (317, 318),
    'GCUNI'   : (318, 319),
    'POP100'  : (319, 328),
    'HU100'   : (328, 337),
    'INTPTLAT': (337, 348),
    'INTPTLON': (348, 360),
    'LSADC'   : (360, 362),
    'PARTFLAG': (362, 363),
    'RESERVE3': (363, 369),
    'UGA'     : (369, 374),
    'STATENS' : (374, 382),
    'COUNTYNS': (382, 390),
    'COUSUBNS': (390, 398),
    'PLACENS' : (398, 406),
    'CONCITNS': (406, 414),
    'AIANHHNS': (414, 422),
    'AITSNS'  : (422, 430),
    'ANRCNS'  : (430, 438),
    'SUBMCDNS': (438, 446),
    'CD113'   : (446, 448),
    'CD114'   : (448, 450),
    'CD115'   : (450, 452),
    'SLDU2'   : (452, 455),
    'SLDU3'   : (455, 458),
    'SLDU4'   : (458, 461),
    'SLDL2'   : (461, 464),
    'SLDL3'   : (464, 467),
    'SLDL4'   : (467, 470),
    'AIANHHSC': (470, 472),
    'CSASC'   : (472, 474),
    'CNECTASC': (474, 476),
    'MEMI'    : (476, 477),
    'NMEMI'   : (477, 478),
    'PUMA'    : (478, 483),
    'RESERVED': (483, 501),
}

# The zip download URLs can be recreated from these state names and abbreviations.
STATES = {
    'AL': 'Alabama',
    'AK': 'Alaska',
    'AZ': 'Arizona',
    'AR': 'Arkansas',
    'CA': 'California',
    'CO': 'Colorado',
    'CT': 'Connecticut',
    'DE': 'Delaware',
    'DC': 'District_of_Columbia',
    'FL': 'Florida',
    'GA': 'Georgia',
    'HI': 'Hawaii',
    'ID': 'Idaho',
    'IL': 'Illinois',
    'IN': 'Indiana',
    'IA': 'Iowa',
    'KS': 'Kansas',
    'KY': 'Kentucky',
    'LA': 'Louisiana',
    'ME': 'Maine',
    'MD': 'Maryland',
    'MA': 'Massachusetts',
    'MI': 'Michigan',
    'MN': 'Minnesota',
    'MS': 'Mississippi',
    'MO': 'Missouri',
    'MT': 'Montana',
    'NE': 'Nebraska',
    'NV': 'Nevada',
    'NH': 'New_Hampshire',
    'NJ': 'New_Jersey',
    'NM': 'New_Mexico',
    'NY': 'New_York',
    'NC': 'North_Carolina',
    'ND': 'North_Dakota',
    'OH': 'Ohio',
    'OK': 'Oklahoma',
    'OR': 'Oregon',
    'PA': 'Pennsylvania',
    'PR': 'Puerto_Rico',
    'RI': 'Rhode_Island',
    'SC': 'South_Carolina',
    'SD': 'South_Dakota',
    'TN': 'Tennessee',
    'TX': 'Texas',
    'UT': 'Utah',
    'VT': 'Vermont',
    'VA': 'Virginia',
    'WA': 'Washington',
    'WV': 'West_Virginia',
    'WI': 'Wisconsin',
    'WY': 'Wyoming',    
}

# This is the template for the URLs
URL_TEMPLATE_ZIP = 'https://www2.census.gov/census_2010/04-Summary_File_1/{state}/{state_abbrev}2010.sf1.zip'

TEMP_DIR = pathlib.Path(tempfile.gettempdir()) / 'surgeo_temp'
TEMP_DIR.mkdir(exist_ok=True)

# This creates a URL for every state in the STATES dictionary
urls = [
    URL_TEMPLATE_ZIP.format(state_abbrev=code.lower(), state=name)
    for code, name
    in STATES.items()
]
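
A quick note on the index convention (a sketch, not part of the original notebook; the sample record below is made up): the (start, stop) pairs above are 1-based, which is why make_geo_df further down subtracts one from each endpoint to produce pandas' 0-based colspecs.

# Hypothetical 500-character geo record; per GEO_MAP_2010, 'FILEID'
# is (1, 7), i.e. characters 1-6 in 1-based terms, or line[0:6] in
# Python terms once both endpoints are shifted down by one.
line = 'SF1ST AL' + ' ' * 492
start, stop = GEO_MAP_2010['FILEID']
assert line[start - 1:stop - 1] == 'SF1ST '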

In [42]:
def request_data(url, retries):
    '''Helper that attempts to fetch a file, retrying a number of times'''
    tries = 0
    while True:
        try:
            with urllib.request.urlopen(url) as r:
                data = r.read()
                return data
        except Exception:
            tries += 1
            if tries >= retries:
                raise
        print('Retrying {url}...'.format(url=url))

def dl_file(url, file_path):
    '''Helper func: downloads zip from URL and stores it in a local folder'''
    # If the file is already cached, do nothing
    if file_path.exists():
        print('{} is already present. Processing ...'.format(file_path))
        return
    # Otherwise download the file into the temp dir
    data = request_data(url, 3)
    file_path.write_bytes(data)

def make_geo_df(file_path):
    '''Helper func: takes zip and creates a geographic file from data'''
    # Read zip data
    with zipfile.ZipFile(file_path) as zf:
        # Select the ZipInfo for the fixed-width geo file
        # (the first member of the archive)
        target = zf.filelist[0]
        # Read the geo file into a BytesIO object
        geo_data = io.BytesIO(zf.read(target))
        # Read fixed-width file into dataframe
        geo_df = pd.read_fwf(
            geo_data, 
            header=None,
            # Convert the 1-based GEO_MAP indices to 0-based colspecs
            colspecs=[
                (tuple_[0] - 1, tuple_[1] - 1)
                for tuple_
                in GEO_MAP_2010.values()
            ],
            dtype=str
        )
    # Give names to columns
    geo_df.columns = tuple(GEO_MAP_2010.keys())
    # Filter out all records that are not related to ZCTAs only
    # e.g. get rid of census block data
    geo_df = geo_df.loc[geo_df.SUMLEV == '871']
    # Keep STUSAB (state), LOGRECNO (join key), and ZCTA5 (ZIP code proxy)
    geo_df = geo_df[['STUSAB', 'LOGRECNO', 'ZCTA5']]
    return geo_df

def make_pop_df(file_path):
    '''Helper func: Takes a zip and creates population df'''
    # Read zip data
    with zipfile.ZipFile(file_path) as zf:
        # Select the ZipInfo for the CSV data file we want
        # This contains Table P5
        target = zf.filelist[3]
        # Read that CSV into BytesIO object
        pop_data = io.BytesIO(zf.read(target))
        pop_df = pd.read_csv(
            pop_data, 
            header=None,
            dtype=str
        )
        # Keep only a subset of columns and rename them
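        # (Assuming the standard SF1 segment layout: cols 0-4 are FILEID,
        # STUSAB, CHARITER, CIFSN, and LOGRECNO, followed by P3's 8 cells
        # and P4's 3 cells, so P5 starts at col 16. Cols 18-25 are then
        # the seven non-Hispanic race counts, P0050003-P0050009, plus the
        # Hispanic total, P0050010.)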
        pop_df = pop_df[[1, 4, 18, 19, 20, 21, 22, 23, 24, 25]]
        pop_df.columns = [
            'STUSAB',
            'LOGRECNO',
            'white',
            'black',
            'native',
            'asian',
            'pi',
            'other',
            'multiple',
            'hispanic',
        ]
        return pop_df

def merge_frames(geo_df, pop_df):
    '''Merges our GEO and POP frames'''
    # Merges common STUSAB and LOGRECNO fields
    merged = geo_df.merge(pop_df)
    # Rename ZCTA5 to lowercase zcta5
    merged = merged.rename(columns={'ZCTA5': 'zcta5'})
    # Set index to ZCTA5 and sort
    merged = merged.set_index('zcta5')
    merged = merged.sort_index()
    return merged
    
def create_df(url, temp_dir):
    '''Main function to download, join, and clean data for single state'''
    print(url)
    file_name = url.rpartition('/')[2]
    file_path = temp_dir / file_name 
    # Download (no-op if the zip is already cached)
    dl_file(url, file_path)
    # Make dfs
    geo_df = make_geo_df(file_path)
    pop_df = make_pop_df(file_path)
    # Join the dfs, sort, trim, and process
    df = merge_frames(geo_df, pop_df)
    # Drop STUSAB and LOGRECNO, keeping only the population counts
    df = df.iloc[:, 2:]
    df = df.astype(np.float64)
    return df
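
Before looping over every state, one could smoke-test a single small download (a sketch, not part of the original run; Delaware chosen arbitrarily):

# Hypothetical smoke test on one small state
test_url = URL_TEMPLATE_ZIP.format(state_abbrev='de', state='Delaware')
test_df = create_df(test_url, TEMP_DIR)
print(test_df.head())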

In [43]:
print('Starting download.')

# Create a dataframe for each URL and store in list
data = [
    create_df(url, TEMP_DIR)
    for url
    in urls
]

print('Download complete.')


Starting download.
https://www2.census.gov/census_2010/04-Summary_File_1/Alabama/al2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\al2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Alaska/ak2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\ak2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Arizona/az2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\az2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Arkansas/ar2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\ar2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/California/ca2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\ca2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Colorado/co2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\co2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Connecticut/ct2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\ct2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Delaware/de2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\de2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/District_of_Columbia/dc2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\dc2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Florida/fl2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\fl2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Georgia/ga2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\ga2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Hawaii/hi2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\hi2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Idaho/id2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\id2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Illinois/il2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\il2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Indiana/in2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\in2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Iowa/ia2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\ia2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Kansas/ks2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\ks2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Kentucky/ky2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\ky2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Louisiana/la2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\la2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Maine/me2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\me2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Maryland/md2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\md2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Massachusetts/ma2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\ma2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Michigan/mi2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\mi2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Minnesota/mn2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\mn2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Mississippi/ms2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\ms2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Missouri/mo2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\mo2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Montana/mt2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\mt2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Nebraska/ne2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\ne2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Nevada/nv2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\nv2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/New_Hampshire/nh2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\nh2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/New_Jersey/nj2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\nj2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/New_Mexico/nm2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\nm2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/New_York/ny2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\ny2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/North_Carolina/nc2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\nc2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/North_Dakota/nd2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\nd2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Ohio/oh2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\oh2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Oklahoma/ok2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\ok2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Oregon/or2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\or2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Pennsylvania/pa2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\pa2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Puerto_Rico/pr2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\pr2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Rhode_Island/ri2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\ri2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/South_Carolina/sc2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\sc2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/South_Dakota/sd2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\sd2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Tennessee/tn2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\tn2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Texas/tx2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\tx2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Utah/ut2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\ut2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Vermont/vt2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\vt2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Virginia/va2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\va2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Washington/wa2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\wa2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/West_Virginia/wv2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\wv2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Wisconsin/wi2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\wi2010.sf1.zip is already present. Processing ...
https://www2.census.gov/census_2010/04-Summary_File_1/Wyoming/wy2010.sf1.zip
C:\Users\theon\AppData\Local\Temp\surgeo_temp\wy2010.sf1.zip is already present. Processing ...
Download complete.

In [44]:
# Join all data into single dataframe and sort index
df = pd.concat(data)
df = df.sort_index()

In [47]:
# Check a ZCTA that crosses state lines
df.loc['69201']


Out[47]:
white black native asian pi other multiple hispanic
zcta5
69201 3291.0 4.0 265.0 20.0 0.0 0.0 97.0 61.0
69201 36.0 2.0 196.0 5.0 0.0 0.0 19.0 9.0

In [48]:
# https://github.com/theonaunheim/surgeo/issues/10
# Certain ZCTAs cross state lines and must be added together.
df = df.groupby(df.index).sum()

df.head()


Out[48]:
white black native asian pi other multiple hispanic
zcta5
00601 80.0 2.0 1.0 1.0 0.0 0.0 0.0 18486.0
00602 216.0 13.0 0.0 15.0 0.0 4.0 7.0 41265.0
00603 628.0 101.0 2.0 48.0 2.0 9.0 22.0 53877.0
00606 32.0 3.0 0.0 3.0 1.0 0.0 1.0 6575.0
00610 187.0 22.0 0.0 8.0 0.0 5.0 5.0 28789.0

In [50]:
# Recheck zip that crosses state lines
df.loc['69201']


Out[50]:
white       3327.0
black          6.0
native       461.0
asian         25.0
pi             0.0
other          0.0
multiple     116.0
hispanic      70.0
Name: 69201, dtype: float64

In [51]:
# Store row totals (the total population of each ZCTA)
totals = df.sum(axis=1)
totals.head()


Out[51]:
zcta5
00601    18570.0
00602    41520.0
00603    54689.0
00606     6615.0
00610    29016.0
dtype: float64

In [52]:
# Store 'Some Other Race' so it can be divvied up among the other groups
other = df['other']
other.head()


Out[52]:
zcta5
00601    0.0
00602    4.0
00603    9.0
00606    0.0
00610    5.0
Name: other, dtype: float64

In [53]:
# Create Asian or Pacific Islander (this is what the surname data uses)
df['api'] = df['asian'] + df['pi']
df.head()


Out[53]:
white black native asian pi other multiple hispanic api
zcta5
00601 80.0 2.0 1.0 1.0 0.0 0.0 0.0 18486.0 1.0
00602 216.0 13.0 0.0 15.0 0.0 4.0 7.0 41265.0 15.0
00603 628.0 101.0 2.0 48.0 2.0 9.0 22.0 53877.0 50.0
00606 32.0 3.0 0.0 3.0 1.0 0.0 1.0 6575.0 4.0
00610 187.0 22.0 0.0 8.0 0.0 5.0 5.0 28789.0 8.0

In [54]:
# Drop columns we will no longer use
df = df.drop(columns=['other', 'asian', 'pi'])
df.head()


Out[54]:
white black native multiple hispanic api
zcta5
00601 80.0 2.0 1.0 0.0 18486.0 1.0
00602 216.0 13.0 0.0 7.0 41265.0 15.0
00603 628.0 101.0 2.0 22.0 53877.0 50.0
00606 32.0 3.0 0.0 1.0 6575.0 4.0
00610 187.0 22.0 0.0 5.0 28789.0 8.0

In [55]:
# Determine what share of each row's total each item makes up
# (the totals still include 'other', so rows sum to just under 1)
percentages = df.divide(totals, axis='rows')
percentages.head()


Out[55]:
white black native multiple hispanic api
zcta5
00601 0.004308 0.000108 0.000054 0.000000 0.995477 0.000054
00602 0.005202 0.000313 0.000000 0.000169 0.993858 0.000361
00603 0.011483 0.001847 0.000037 0.000402 0.985152 0.000914
00606 0.004837 0.000454 0.000000 0.000151 0.993953 0.000605
00610 0.006445 0.000758 0.000000 0.000172 0.992177 0.000276

In [56]:
# Split 'other' among the remaining groups based on the percentages above
apportioned_other = percentages.multiply(other, axis='rows')
apportioned_other.head()


Out[56]:
white black native multiple hispanic api
zcta5
00601 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
00602 0.020809 0.001252 0.000000 0.000674 3.975434 0.001445
00603 0.103348 0.016621 0.000329 0.003620 8.866372 0.008228
00606 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
00610 0.032224 0.003791 0.000000 0.000862 4.960884 0.001379

In [57]:
# Impute 'other' to the remaining groups based on percentage makeup
# (quasi iterative proportional fitting / matrix raking over a single dimension)
df += apportioned_other
df.head()


Out[57]:
white black native multiple hispanic api
zcta5
00601 80.000000 2.000000 1.000000 0.000000 18486.000000 1.000000
00602 216.020809 13.001252 0.000000 7.000674 41268.975434 15.001445
00603 628.103348 101.016621 2.000329 22.003620 53885.866372 50.008228
00606 32.000000 3.000000 0.000000 1.000000 6575.000000 4.000000
00610 187.032224 22.003791 0.000000 5.000862 28793.960884 8.001379
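
A sanity check on this step (a sketch, not part of the original run): because the percentages were computed against totals that still included 'other', each row comes up short of its original total by exactly other**2 / total, a negligible amount that the row and column normalizations below absorb.

# Verify the apportionment: each row now sums to totals - other**2 / totals
# (a tiny shortfall, e.g. ~0.0004 of a person for ZCTA 00602)
nonzero = totals > 0
shortfall = (totals - df.sum(axis=1))[nonzero]
assert np.allclose(shortfall, (other ** 2 / totals)[nonzero])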

In [58]:
# Convert counts to per-column ratios (probability of ZCTA given race)
column_totals = df.sum(axis=0)
ratio_by_column = df.divide(column_totals, axis='columns').copy()

In [59]:
# Reorder columns
ratio_by_column = ratio_by_column[[
    'white',
    'black',
    'api',
    'native',
    'multiple',
    'hispanic'
]]

In [60]:
# Convert counts to per-row ratios (probability of race given ZCTA)
row_totals = df.sum(axis=1)
ratio_by_row = df.divide(row_totals, axis='index').copy()

In [61]:
# Reorder columns
ratio_by_row = ratio_by_row[[
    'white',
    'black',
    'api',
    'native',
    'multiple',
    'hispanic'
]]
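
As a final check (again a sketch, not in the original run): each column of ratio_by_column should sum to 1 (the probability of a ZCTA given a race), and each row of ratio_by_row should sum to 1 (the probability of a race given a ZCTA).

# Columns of P(zcta | race) and rows of P(race | zcta) each sum to 1
# (dropna guards against any zero-population rows)
assert np.allclose(ratio_by_column.sum(axis=0), 1.0)
assert np.allclose(ratio_by_row.sum(axis=1).dropna(), 1.0)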

Write data to module as CSV


In [62]:
current_directory = pathlib.Path().cwd()
project_directory = current_directory.parents[0]
data_directory    = project_directory / 'surgeo' / 'data'

In [63]:
# Prob zcta given race
rbc_path = data_directory / 'prob_zcta_given_race_2010.csv'
ratio_by_column.to_csv(rbc_path)

In [64]:
# Prob race given zcta
rbr_path = data_directory / 'prob_race_given_zcta_2010.csv'
ratio_by_row.to_csv(rbr_path)

Clean up TEMP_DIR

If you comment this cell out, the census files stay cached locally and you do not have to re-download everything (the Census FTP server drops connections like nobody's business). This comes at the expense of ~10 GB of disk space.


In [ ]:
# Delete files
for path in TEMP_DIR.rglob('*'):
    path.unlink()

# Delete dir
TEMP_DIR.rmdir()