In [1]:
import io
import pathlib
import urllib.request
import zipfile

import numpy as np
import pandas as pd

Background (links good as of 2019-12-07)

This fetches data from the following zipfiles:

https://www2.census.gov/topics/genealogy/2010surnames/names.zip

These zipfiles contain CSVs with race data based on the surnames. The script creates dataframes from the CSVs in these zipfiles. It then cleans them and imputes data to remove anonymization in accordance with the rules laid out below.

Information about the contents of these files may be found here:

https://www2.census.gov/topics/genealogy/2010surnames/surnames.pdf

It then writes the data to disk for use in Surgeo calculations.

For transformations, see: https://www2.census.gov/topics/genealogy/2000surnames/surnames.pdf

https://www2.census.gov/topics/genealogy/2010surnames/surnames.pdf

Constants


In [2]:
# 2000 data is also available at:
# https://www2.census.gov/topics/genealogy/2000surnames/names.zip

# Location of the 2010 Census surname zip archive
CENSUS_URL_2010 = 'https://www2.census.gov/topics/genealogy/2010surnames/names.zip'

# Friendly names for the raw CSV columns, in file order
# (name, rank, count, prop100k, cum_prop100k, then the six race percentages)
CENSUS_SURNAME_COLUMNS = [
    'name',
    'rank',
    'count',
    'proportion',
    'cum_proportion',
    'white',
    'black',
    'api',
    'native',
    'multiple',
    'hispanic',
]

# Columns kept for the final output: the surname plus the six race
# percentage columns (the last six entries of CENSUS_SURNAME_COLUMNS)
TARGET_SURNAME_COLUMNS = ['name', *CENSUS_SURNAME_COLUMNS[5:]]

Download Surname Data


In [3]:
def url_to_df(url):
    '''Download a Census surname zip archive and return its CSV as a DataFrame.

    Parameters
    ----------
    url : str
        URL of a zip archive containing a surname CSV (the first member
        whose name ends in '.csv' is used).

    Returns
    -------
    pandas.DataFrame
        The CSV contents. Cells suppressed by the Census, marked '(S)',
        are read as NaN; default NA strings are NOT converted, presumably
        so literal surname strings such as 'NULL' survive intact.

    Raises
    ------
    ValueError
        If the archive contains no '.csv' member.

    Note: it appears the Census webservers rate limit this so it
    may take some time.
    '''
    # Download the archive into memory; no need to touch the filesystem
    with urllib.request.urlopen(url) as response:
        zip_data = io.BytesIO(response.read())
    with zipfile.ZipFile(zip_data) as zf:
        # Match on the suffix (not a substring) so names like
        # 'foo.csv.bak' are not picked up by mistake
        csv_names = [name for name in zf.namelist() if name.endswith('.csv')]
        if not csv_names:
            raise ValueError(f'No CSV member found in archive at {url}')
        raw_data = io.BytesIO(zf.read(csv_names[0]))
    # keep_default_na=False limits NaN conversion to the explicit
    # '(S)' suppression marker only
    return pd.read_csv(raw_data, na_values='(S)', keep_default_na=False)

In [4]:
# Fetch the 2010 surname CSV (may be slow; see url_to_df docstring)
df_2010 = url_to_df(CENSUS_URL_2010)

In [5]:
# Peek at the raw frame; suppressed '(S)' cells already show as NaN
df_2010.tail()


Out[5]:
name rank count prop100k cum_prop100k pctwhite pctblack pctapi pctaian pct2prace pcthispanic
162249 DIETZMANN 160975 100 0.03 90062.93 96.00 0.00 0.00 NaN 0.00 NaN
162250 DOKAS 160975 100 0.03 90062.96 94.00 NaN 0.00 0.00 NaN NaN
162251 DONLEA 160975 100 0.03 90062.99 94.00 0.00 0.00 0.00 0.00 6.00
162252 DORIOTT 160975 100 0.03 90063.03 89.00 0.00 NaN 0.00 5.00 NaN
162253 ALL OTHER NAMES 0 29312001 9936.97 9936.97 66.65 8.53 7.97 0.86 2.32 13.67

Clean Data


In [6]:
def clean_df(df):
    '''Rename columns, keep race data, index by surname, scale to 0-1.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw surname frame with columns in CENSUS_SURNAME_COLUMNS order.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by surname (sorted) with only the race percentage
        columns, converted from 0-100 percentages to 0-1 fractions.
    '''
    # Build the result as a chain so the caller's frame is never
    # mutated (the original assigned df.columns in place, silently
    # renaming the caller's object as a side effect).
    return (
        df
        # Replace the raw CSV headers with friendly names
        .set_axis(CENSUS_SURNAME_COLUMNS, axis=1)
        # Keep only the surname and the six race percentage columns
        [TARGET_SURNAME_COLUMNS]
        # Index by surname and sort for fast, predictable lookups
        .set_index('name')
        .sort_index()
        # Convert percentages (0-100) to fractions (0-1)
        .div(100)
    )

In [7]:
# Rename/filter columns, index by surname, convert percentages to fractions
df_2010 = clean_df(df_2010)

In [8]:
# Spot-check the cleaned frame; NaNs remain where values were suppressed
df_2010.tail()


Out[8]:
white black api native multiple hispanic
name
ZYSK 0.9873 0.0 NaN NaN 0.0000 NaN
ZYSKOWSKI 0.9655 NaN NaN 0.0 0.0164 0.0127
ZYSMAN 0.9457 NaN 0.0 0.0 NaN NaN
ZYWICKI 0.9552 NaN NaN 0.0 0.0125 0.0233
ZYWIEC 0.9829 NaN 0.0 0.0 0.0000 NaN

Unsuppress / impute anonymized data


In [9]:
def unsuppress_row(row):
    '''Apply function to impute suppressed data on a row basis.

    If a percentage falls beneath a certain threshold, the US Census
    quasi-anonymizes it by suppressing the value. To impute new values,
    we take the unallocated probability mass (1 minus the sum of the
    known fractions) and divide it evenly among the suppressed fields.

    Parameters
    ----------
    row : pandas.Series
        A row of race fractions in [0, 1], possibly containing NaNs.

    Returns
    -------
    pandas.Series
        The row with NaNs replaced by the imputed value, rounded to
        4 decimal places; rows without NaNs are returned unchanged
        (and unrounded).

    NOTE(review): if the known fractions already sum to more than 1,
    the imputed value goes negative -- presumably the Census data never
    does this, but it is not guarded against here.
    '''
    # Compute the NaN mask once instead of twice (was isna().sum() x2)
    na_mask = row.isna()
    na_count = na_mask.sum()
    # Guard clause: nothing suppressed, nothing to impute
    if na_count == 0:
        return row
    # Spread the unallocated fraction evenly across the suppressed fields
    na_value = (1 - row.sum()) / na_count
    return row.fillna(na_value).round(4)

In [10]:
# Boolean mask of rows that contain at least one suppressed (NaN) value
target_2010 = df_2010.isna().any(axis=1)
# Row-wise .apply is slow, so run this inefficient operation only on
# the subset of rows that actually contain NaNs
df_2010.loc[target_2010] = (
    df_2010.loc[target_2010].apply(unsuppress_row, axis=1)
)

In [11]:
# Round everything to 4 decimal places so imputed and untouched rows
# share a consistent precision in the output CSV
df_2010 = df_2010.round(4)

In [12]:
# Final spot-check: previously suppressed cells now hold imputed values
df_2010.tail()


Out[12]:
white black api native multiple hispanic
name
ZYSK 0.9873 0.0000 0.0042 0.0042 0.0000 0.0042
ZYSKOWSKI 0.9655 0.0027 0.0027 0.0000 0.0164 0.0127
ZYSMAN 0.9457 0.0181 0.0000 0.0000 0.0181 0.0181
ZYWICKI 0.9552 0.0045 0.0045 0.0000 0.0125 0.0233
ZYWIEC 0.9829 0.0085 0.0000 0.0000 0.0000 0.0085

Write data to module as CSV


In [13]:
# Write the final table to <project root>/surgeo/data/ as CSV.
# Path.cwd() is a classmethod -- no need to instantiate Path() first.
current_directory = pathlib.Path.cwd()
# The notebook is assumed to live one level below the project root
project_directory = current_directory.parent
data_directory    = project_directory / 'surgeo' / 'data'
# Ensure the target directory exists so to_csv cannot fail on a fresh checkout
data_directory.mkdir(parents=True, exist_ok=True)
path_2010         = data_directory / 'prob_race_given_surname_2010.csv'
df_2010.to_csv(path_2010)