In [6]:
import io
import pathlib
import urllib.request
import tempfile
import zipfile
import numpy as np
import pandas as pd
This script fetches ~50 .sf1 state and territory zip files from the census server (the URL template is defined below as URL_TEMPLATE_ZIP).
Each of these zip files contains (among other things) a fixed-width geographic file and a CSV population file.
The fixed-width geo file is useful because it ties a "Logical Record" number (LOGRECNO) to ZCTA codes. The CSV population file is useful because it contains the actual data for those logical records.
Specifically, we are pulling our data from population table 5 ("P5" ... see Summary File 1 documentation page 184).
P5. HISPANIC OR LATINO ORIGIN BY RACE
Universe: Total population (17 cells)
The data for this table is as follows:
P5 data:
    Total:
        Not Hispanic or Latino:
            White alone
            Black or African American alone
            American Indian and Alaska Native alone
            Asian alone
            Native Hawaiian and Other Pacific Islander alone
            Some Other Race alone
            Two or More Races
        Hispanic or Latino:
            White alone
            Black or African American alone
            American Indian and Alaska Native alone
            Asian alone
            Native Hawaiian and Other Pacific Islander alone
            Some Other Race alone
            Two or More Races
For more details, consult the Summary File 1 technical documentation on the Census Bureau's website.
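To make the LOGRECNO join concrete before diving in, here is a minimal sketch with made-up rows (not real census data): the geo records map logical record numbers to ZCTAs, and merging on the shared keys attaches the population counts.

# Hypothetical geo rows: LOGRECNO ties each state record to a ZCTA
toy_geo = pd.DataFrame({
    'STUSAB':   ['NE', 'SD'],
    'LOGRECNO': ['0000101', '0000202'],
    'ZCTA5':    ['69201', '69201'],
})
# Hypothetical population rows keyed by the same STUSAB/LOGRECNO pair
toy_pop = pd.DataFrame({
    'STUSAB':   ['NE', 'SD'],
    'LOGRECNO': ['0000101', '0000202'],
    'white':    ['4000', '150'],
})
toy_geo.merge(toy_pop)  # one row per state portion of the ZCTA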
In [41]:
# Start/stop indices (1-based) for each field in the fixed-width geo file.
# These allow the file to be converted easily to a dataframe.
GEO_MAP_2010 = {
'FILEID' : (1 , 7 ),
'STUSAB' : (7 , 9 ),
'SUMLEV' : (9 , 12 ),
'GEOCOMP' : (12 , 14 ),
'CHARITER': (14 , 17 ),
'CIFSN' : (17 , 19 ),
'LOGRECNO': (19 , 26 ),
'REGION' : (26 , 27 ),
'DIVISION': (27 , 28 ),
'STATE' : (28 , 30 ),
'COUNTY' : (30 , 33 ),
'COUNTYCC': (33 , 35 ),
'COUNTYSC': (35 , 37 ),
'COUSUB' : (37 , 42 ),
'COUSUBCC': (42 , 44 ),
'COUSUBSC': (44 , 46 ),
'PLACE' : (46 , 51 ),
'PLACECC' : (51 , 53 ),
'PLACESC' : (53 , 55 ),
'TRACT' : (55 , 61 ),
'BLKGRP' : (61 , 62 ),
'BLOCK' : (62 , 66 ),
'IUC' : (66 , 68 ),
'CONCIT' : (68 , 73 ),
'CONCITCC': (73 , 75 ),
'CONCITSC': (75 , 77 ),
'AIANHH' : (77 , 81 ),
'AIANHHFP': (81 , 86 ),
'AIANHHCC': (86 , 88 ),
'AIHHTLI' : (88 , 89 ),
'AITSCE' : (89 , 92 ),
'AITS' : (92 , 97 ),
'AITSCC' : (97 , 99 ),
'TTRACT' : (99 , 105),
'TBLKGRP' : (105, 106),
'ANRC' : (106, 111),
'ANRCCC' : (111, 113),
'CBSA' : (113, 118),
'CBSASC' : (118, 120),
'METDIV' : (120, 125),
'CSA' : (125, 128),
'NECTA' : (128, 133),
'NECTASC' : (133, 135),
'NECTADIV': (135, 140),
'CNECTA' : (140, 143),
'CBSAPCI' : (143, 144),
'NECTAPCI': (144, 145),
'UA' : (145, 150),
'UASC' : (150, 152),
'UATYPE' : (152, 153),
'UR' : (153, 154),
'CD' : (154, 156),
'SLDU' : (156, 159),
'SLDL' : (159, 162),
'VTD' : (162, 168),
'VTDI' : (168, 169),
'RESERVE2': (169, 172),
'ZCTA5' : (172, 177),
'SUBMCD' : (177, 182),
'SUBMCDCC': (182, 184),
'SDELM' : (184, 189),
'SDSEC' : (189, 194),
'SDUNI' : (194, 199),
'AREALAND': (199, 213),
'AREAWATR': (213, 227),
'NAME' : (227, 317),
'FUNCSTAT': (317, 318),
'GCUNI' : (318, 319),
'POP100' : (319, 328),
'HU100' : (328, 337),
'INTPTLAT': (337, 348),
'INTPTLON': (348, 360),
'LSADC' : (360, 362),
'PARTFLAG': (362, 363),
'RESERVE3': (363, 369),
'UGA' : (369, 374),
'STATENS' : (374, 382),
'COUNTYNS': (382, 390),
'COUSUBNS': (390, 398),
'PLACENS' : (398, 406),
'CONCITNS': (406, 414),
'AIANHHNS': (414, 422),
'AITSNS' : (422, 430),
'ANRCNS' : (430, 438),
'SUBMCDNS': (438, 446),
'CD113' : (446, 448),
'CD114' : (448, 450),
'CD115' : (450, 452),
'SLDU2' : (452, 455),
'SLDU3' : (455, 458),
'SLDU4' : (458, 461),
'SLDL2' : (461, 464),
'SLDL3' : (464, 467),
'SLDL4' : (467, 470),
'AIANHHSC': (470, 472),
'CSASC'   : (472, 474),
'CNECTASC': (474, 476),
'MEMI'    : (476, 477),
'NMEMI'   : (477, 478),
'PUMA' : (478, 483),
'RESERVED': (483, 501),
}
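# As a quick sanity check (not part of the original pipeline), the field
# positions above should be contiguous: each field starts where the
# previous one ends.
_spans = list(GEO_MAP_2010.values())
for (_, _prev_stop), (_start, _) in zip(_spans, _spans[1:]):
    assert _start == _prev_stop, (_prev_stop, _start)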
# The zip download URLs can be reconstructed from state names and abbreviations.
STATES = {
'AL': 'Alabama',
'AK': 'Alaska',
'AZ': 'Arizona',
'AR': 'Arkansas',
'CA': 'California',
'CO': 'Colorado',
'CT': 'Connecticut',
'DE': 'Delaware',
'DC': 'District_of_Columbia',
'FL': 'Florida',
'GA': 'Georgia',
'HI': 'Hawaii',
'ID': 'Idaho',
'IL': 'Illinois',
'IN': 'Indiana',
'IA': 'Iowa',
'KS': 'Kansas',
'KY': 'Kentucky',
'LA': 'Louisiana',
'ME': 'Maine',
'MD': 'Maryland',
'MA': 'Massachusetts',
'MI': 'Michigan',
'MN': 'Minnesota',
'MS': 'Mississippi',
'MO': 'Missouri',
'MT': 'Montana',
'NE': 'Nebraska',
'NV': 'Nevada',
'NH': 'New_Hampshire',
'NJ': 'New_Jersey',
'NM': 'New_Mexico',
'NY': 'New_York',
'NC': 'North_Carolina',
'ND': 'North_Dakota',
'OH': 'Ohio',
'OK': 'Oklahoma',
'OR': 'Oregon',
'PA': 'Pennsylvania',
'PR': 'Puerto_Rico',
'RI': 'Rhode_Island',
'SC': 'South_Carolina',
'SD': 'South_Dakota',
'TN': 'Tennessee',
'TX': 'Texas',
'UT': 'Utah',
'VT': 'Vermont',
'VA': 'Virginia',
'WA': 'Washington',
'WV': 'West_Virginia',
'WI': 'Wisconsin',
'WY': 'Wyoming',
}
# This is the template for the URLs
URL_TEMPLATE_ZIP = 'https://www2.census.gov/census_2010/04-Summary_File_1/{state}/{state_abbrev}2010.sf1.zip'
TEMP_DIR = pathlib.Path(tempfile.gettempdir()) / 'surgeo_temp'
TEMP_DIR.mkdir(exist_ok=True)
# This creates a URL for each state in the STATES dictionary
urls = [
URL_TEMPLATE_ZIP.format(state_abbrev=code.lower(), state=name)
for code, name
in STATES.items()
]
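# For example, the first generated URL (Alabama) expands to:
# https://www2.census.gov/census_2010/04-Summary_File_1/Alabama/al2010.sf1.zip
print(urls[0])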
In [42]:
def request_data(url, retries):
    '''Helper func: attempts to fetch a URL, retrying on failure'''
    tries = 0
    while True:
        try:
            with urllib.request.urlopen(url) as r:
                data = r.read()
                return data
        except Exception:
            tries += 1
            if tries >= retries:
                raise
            print('Retrying {url}...'.format(url=url))
def dl_file(url, file_path):
    '''Helper func: downloads zip from URL and stores it in a local folder'''
    # If it exists, do nothing
    if file_path.exists():
        print('{} is already present. Processing ...'.format(file_path))
    # Otherwise download the file to the temp dir
    else:
        # Open request
        data = request_data(url, 3)
        file_path.touch()
        file_path.write_bytes(data)
def make_geo_df(file_path):
    '''Helper func: takes a zip and creates a geographic dataframe'''
    # Read zip data
    with zipfile.ZipFile(file_path) as zf:
        # Take the first member, which is the fixed-width geo file
        target = zf.filelist[0]
        # Read that file into a BytesIO object
        geo_data = io.BytesIO(zf.read(target))
    # Read fixed-width file into dataframe
    geo_df = pd.read_fwf(
        geo_data,
        header=None,
        # Convert the GEO map's 1-based positions to 0-based colspecs
        colspecs=[
            (tuple_[0] - 1, tuple_[1] - 1)
            for tuple_
            in GEO_MAP_2010.values()
        ],
        dtype=str
    )
    # Give names to columns
    geo_df.columns = tuple(GEO_MAP_2010.keys())
    # Keep only ZCTA-level records (summary level 871);
    # e.g. get rid of census block data
    geo_df = geo_df.loc[geo_df.SUMLEV == '871']
    # Keep the STUSAB (state), LOGRECNO (join key), and ZCTA5 (zip code proxy)
    geo_df = geo_df[['STUSAB', 'LOGRECNO', 'ZCTA5']]#.dropna(subset=['ZCTA5'])
    return geo_df
def make_pop_df(file_path):
    '''Helper func: takes a zip and creates a population dataframe'''
    # Read zip data
    with zipfile.ZipFile(file_path) as zf:
        # Take the fourth member, the file segment containing Table P5
        target = zf.filelist[3]
        # Read that CSV into a BytesIO object
        pop_data = io.BytesIO(zf.read(target))
    pop_df = pd.read_csv(
        pop_data,
        header=None,
        dtype=str
    )
    # Keep only a subset of columns and rename them.
    # Columns 1 and 4 are STUSAB and LOGRECNO; columns 18-25 hold the
    # P5 cells for the seven non-Hispanic race groups plus the
    # Hispanic or Latino total.
    pop_df = pop_df[[1, 4, 18, 19, 20, 21, 22, 23, 24, 25]]
    pop_df.columns = [
        'STUSAB',
        'LOGRECNO',
        'white',
        'black',
        'native',
        'asian',
        'pi',
        'other',
        'multiple',
        'hispanic',
    ]
    return pop_df
def merge_frames(geo_df, pop_df):
    '''Merges our GEO and POP frames'''
    # Merge on the common STUSAB and LOGRECNO fields
    merged = geo_df.merge(pop_df)
    # Rename ZCTA5 to lowercase zcta5
    merged = merged.rename(columns={'ZCTA5': 'zcta5'})
    # Set index to zcta5 and sort
    merged = merged.set_index('zcta5')
    merged = merged.sort_index()
    return merged
def create_df(url, temp_dir):
    '''Main function to download, join, and clean data for a single state'''
    print(url)
    file_name = url.rpartition('/')[2]
    file_path = temp_dir / file_name
    # Download (a no-op if the file is already present)
    dl_file(url, file_path)
    # Make dfs
    geo_df = make_geo_df(file_path)
    pop_df = make_pop_df(file_path)
    # Join DFs, sort, trim, and process
    df = merge_frames(geo_df, pop_df)
    # Drop the STUSAB and LOGRECNO join columns, keeping only counts
    df = df.iloc[:, 2:]
    df = df.astype(np.float64)
    return df
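Before the full ~50-file run in the next cell, the pipeline can be smoke-tested on a single state; a minimal sketch (this downloads one zip):

sample_df = create_df(urls[0], TEMP_DIR)
sample_df.head()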
In [43]:
print('Starting download.')
# Create a dataframe for each URL and store in list
data = [
create_df(url, TEMP_DIR)
for url
in urls
]
print('Download complete.')
In [44]:
# Join all data into single dataframe and sort index
df = pd.concat(data)
df = df.sort_index()
In [47]:
# Check a ZCTA that crosses state lines
df.loc['69201']
Out[47]:
In [48]:
# https://github.com/theonaunheim/surgeo/issues/10
# Certain ZCTAs cross state lines and must be added together.
df = df.groupby(level=0).sum()
df.head()
Out[48]:
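A toy illustration (made-up numbers) of why the groupby is needed: the two state portions of a cross-border ZCTA arrive as separate rows and collapse into one.

toy = pd.DataFrame(
    {'white': [4000.0, 150.0], 'black': [10.0, 2.0]},
    index=['69201', '69201'],
)
toy.groupby(level=0).sum()  # one row: white=4150.0, black=12.0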
In [50]:
# Recheck the ZCTA that crosses state lines
df.loc['69201']
Out[50]:
In [51]:
# Store row totals (total population per ZCTA)
totals = df.sum(axis=1)
totals.head()
Out[51]:
In [52]:
# Store 'some other race' so it can be divvied up among the other groups
other = df['other']
other.head()
Out[52]:
In [53]:
# Create Asian or Pacific Islander (this is the category the surname data uses)
df['api'] = df['asian'] + df['pi']
df.head()
Out[53]:
In [54]:
# Drop columns we will no longer use
df = df.drop(columns=['other', 'asian', 'pi'])
df.head()
Out[54]:
In [55]:
# Now determine what percent of the row each item makes up.
# (Note: the totals still include 'other', so these shares sum to slightly less than 1.)
percentages = df.divide(totals, axis='rows')
percentages.head()
Out[55]:
In [56]:
# Split 'other' among the remaining groups based on the percentages above
apportioned_other = percentages.multiply(other, axis='rows')
apportioned_other.head()
Out[56]:
In [57]:
# Impute 'other' to the remaining groups based on their percentage makeup
# (a quasi iterative proportional fit / matrix raking over a single dimension)
df += apportioned_other
df.head()
Out[57]:
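With made-up numbers, the arithmetic for a single row works out like this: if a ZCTA has total=100 with white=60, black=20, other=20, white gets 60/100 × 20 = 12 extra and black gets 20/100 × 20 = 4 extra. Because the shares are taken against the full total, slightly less than all of 'other' is redistributed, hence "quasi":

toy_total = 100.0
toy_other = 20.0
toy_white, toy_black = 60.0, 20.0
toy_white += (toy_white / toy_total) * toy_other   # 60 + 12 = 72
toy_black += (toy_black / toy_total) * toy_other   # 20 + 4 = 24
print(toy_white, toy_black)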
In [58]:
# Normalize each column to get the probability of a ZCTA given race
column_totals = df.sum(axis=0)
ratio_by_column = df.divide(column_totals, axis='columns').copy()
In [59]:
# Reorder columns
ratio_by_column = ratio_by_column[[
'white',
'black',
'api',
'native',
'multiple',
'hispanic'
]]
In [60]:
# Normalize each row to get the probability of a race given a ZCTA
row_totals = df.sum(axis=1)
ratio_by_row = df.divide(row_totals, axis='index').copy()
In [61]:
# Reorder columns
ratio_by_row = ratio_by_row[[
'white',
'black',
'api',
'native',
'multiple',
'hispanic'
]]
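# Sanity check (not in the original notebook): each column of
# ratio_by_column and each row of ratio_by_row should sum to ~1
# (rows with zero population come out as NaN).
print(ratio_by_column.sum(axis=0))
print(ratio_by_row.sum(axis=1).describe())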
In [62]:
current_directory = pathlib.Path().cwd()
project_directory = current_directory.parents[0]
data_directory = project_directory / 'surgeo' / 'data'
In [63]:
# Prob zcta given race
rbc_path = data_directory / 'prob_zcta_given_race_2010.csv'
ratio_by_column.to_csv(rbc_path)
In [64]:
# Prob race given zcta
rbr_path = data_directory / 'prob_race_given_zcta_2010.csv'
ratio_by_row.to_csv(rbr_path)
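Downstream consumers can read these tables back with the ZCTA as a string index; a minimal sketch:

lookup = pd.read_csv(rbr_path, dtype={'zcta5': str}, index_col='zcta5')
lookup.loc['69201']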
In [ ]:
# Delete files
for path in TEMP_DIR.rglob('*'):
    path.unlink()
# Delete dir
TEMP_DIR.rmdir()
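# Equivalently, since TEMP_DIR holds only the downloaded zips, shutil.rmtree
# removes the files and the directory in a single call.
import shutil
shutil.rmtree(TEMP_DIR, ignore_errors=True)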