In [1]:
import io
import pathlib
import urllib.request
import zipfile

import numpy as np
import pandas as pd

Background (links good as of 2019-12-07)

This fetches data from the following zipfiles:

https://www2.census.gov/topics/genealogy/2010surnames/names.zip

These zipfiles contain CSVs with race data based on the surnames. The script creates dataframes from the CSVs in these zipfiles. It then cleans them and imputes data to remove anonymization in accordance with the rules laid out below.

Information about the contents of these files may be found here:

https://www2.census.gov/topics/genealogy/2010surnames/surnames.pdf

It then writes the data to disk for use in Surgeo calculations.

For transformations, see: https://www2.census.gov/topics/genealogy/2000surnames/surnames.pdf

https://www2.census.gov/topics/genealogy/2010surnames/surnames.pdf

Constants


In [2]:
# 2000 data is also available at:
# https://www2.census.gov/topics/genealogy/2000surnames/names.zip

# Location of the 2010 Census surname zip archive
CENSUS_URL_2010 = 'https://www2.census.gov/topics/genealogy/2010surnames/names.zip'

# Friendly names for the raw CSV columns, in file order
# (name, rank, count, prop100k, cum_prop100k, then the six race percentages)
CENSUS_SURNAME_COLUMNS = [
    'name',
    'rank',
    'count',
    'proportion',
    'cum_proportion',
    'white',
    'black',
    'api',
    'native',
    'multiple',
    'hispanic',
]

# Columns kept for the final output: the surname plus the six race
# percentage columns (the last six entries of CENSUS_SURNAME_COLUMNS)
TARGET_SURNAME_COLUMNS = ['name', *CENSUS_SURNAME_COLUMNS[5:]]

Download Surname Data


In [3]:
def url_to_df(url):
    '''Download a Census surname zip archive and return its CSV as a DataFrame.

    Parameters
    ----------
    url : str
        URL of a zip archive containing a surname CSV (the first member
        whose name ends in '.csv' is used).

    Returns
    -------
    pandas.DataFrame
        The CSV contents. Cells suppressed by the Census, marked '(S)',
        are read as NaN; default NA strings are NOT converted, presumably
        so literal surname strings such as 'NULL' survive intact.

    Raises
    ------
    ValueError
        If the archive contains no '.csv' member.

    Note: it appears the Census webservers rate limit this so it
    may take some time.
    '''
    # Download the archive into memory; no need to touch the filesystem
    with urllib.request.urlopen(url) as response:
        zip_data = io.BytesIO(response.read())
    with zipfile.ZipFile(zip_data) as zf:
        # Match on the suffix (not a substring) so names like
        # 'foo.csv.bak' are not picked up by mistake
        csv_names = [name for name in zf.namelist() if name.endswith('.csv')]
        if not csv_names:
            raise ValueError(f'No CSV member found in archive at {url}')
        raw_data = io.BytesIO(zf.read(csv_names[0]))
    # keep_default_na=False limits NaN conversion to the explicit
    # '(S)' suppression marker only
    return pd.read_csv(raw_data, na_values='(S)', keep_default_na=False)

In [4]:
# Fetch the 2010 surname CSV (may be slow; see url_to_df docstring)
df_2010 = url_to_df(CENSUS_URL_2010)

In [5]:
# Peek at the raw frame; suppressed '(S)' cells already show as NaN
df_2010.tail()


Out[5]:
name rank count prop100k cum_prop100k pctwhite pctblack pctapi pctaian pct2prace pcthispanic
162249 DIETZMANN 160975 100 0.03 90062.93 96.00 0.00 0.00 NaN 0.00 NaN
162250 DOKAS 160975 100 0.03 90062.96 94.00 NaN 0.00 0.00 NaN NaN
162251 DONLEA 160975 100 0.03 90062.99 94.00 0.00 0.00 0.00 0.00 6.00
162252 DORIOTT 160975 100 0.03 90063.03 89.00 0.00 NaN 0.00 5.00 NaN
162253 ALL OTHER NAMES 0 29312001 9936.97 9936.97 66.65 8.53 7.97 0.86 2.32 13.67

Clean Data


In [6]:
def clean_df(df):
    '''Rename columns, keep race data, index by surname, scale to 0-1.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw surname frame with columns in CENSUS_SURNAME_COLUMNS order.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by surname (sorted) with only the race percentage
        columns, converted from 0-100 percentages to 0-1 fractions.
    '''
    # Build the result as a chain so the caller's frame is never
    # mutated (the original assigned df.columns in place, silently
    # renaming the caller's object as a side effect).
    return (
        df
        # Replace the raw CSV headers with friendly names
        .set_axis(CENSUS_SURNAME_COLUMNS, axis=1)
        # Keep only the surname and the six race percentage columns
        [TARGET_SURNAME_COLUMNS]
        # Index by surname and sort for fast, predictable lookups
        .set_index('name')
        .sort_index()
        # Convert percentages (0-100) to fractions (0-1)
        .div(100)
    )

In [7]:
# Rename/filter columns, index by surname, convert percentages to fractions
df_2010 = clean_df(df_2010)

In [8]:
# Spot-check the cleaned frame; NaNs remain where values were suppressed
df_2010.tail()


Out[8]:
white black api native multiple hispanic
name
ZYSK 0.9873 0.0 NaN NaN 0.0000 NaN
ZYSKOWSKI 0.9655 NaN NaN 0.0 0.0164 0.0127
ZYSMAN 0.9457 NaN 0.0 0.0 NaN NaN
ZYWICKI 0.9552 NaN NaN 0.0 0.0125 0.0233
ZYWIEC 0.9829 NaN 0.0 0.0 0.0000 NaN

Unsuppress / impute anonymized data


In [9]:
def unsuppress_row(row):
    '''Apply function to impute suppressed data on a row basis.

    If a percentage falls beneath a certain threshold, the US Census
    quasi-anonymizes it by suppressing the value. To impute new values,
    we take the unallocated probability mass (1 minus the sum of the
    known fractions) and divide it evenly among the suppressed fields.

    Parameters
    ----------
    row : pandas.Series
        A row of race fractions in [0, 1], possibly containing NaNs.

    Returns
    -------
    pandas.Series
        The row with NaNs replaced by the imputed value, rounded to
        4 decimal places; rows without NaNs are returned unchanged
        (and unrounded).

    NOTE(review): if the known fractions already sum to more than 1,
    the imputed value goes negative -- presumably the Census data never
    does this, but it is not guarded against here.
    '''
    # Compute the NaN mask once instead of twice (was isna().sum() x2)
    na_mask = row.isna()
    na_count = na_mask.sum()
    # Guard clause: nothing suppressed, nothing to impute
    if na_count == 0:
        return row
    # Spread the unallocated fraction evenly across the suppressed fields
    na_value = (1 - row.sum()) / na_count
    return row.fillna(na_value).round(4)

In [10]:
# Boolean mask of rows that contain at least one suppressed (NaN) value
target_2010 = df_2010.isna().any(axis=1)
# Row-wise .apply is slow, so run this inefficient operation only on
# the subset of rows that actually contain NaNs
df_2010.loc[target_2010] = (
    df_2010.loc[target_2010].apply(unsuppress_row, axis=1)
)

In [11]:
# Round everything to 4 decimal places so imputed and untouched rows
# share a consistent precision in the output CSV
df_2010 = df_2010.round(4)

In [12]:
# Final spot-check: previously suppressed cells now hold imputed values
df_2010.tail()


Out[12]:
white black api native multiple hispanic
name
ZYSK 0.9873 0.0000 0.0042 0.0042 0.0000 0.0042
ZYSKOWSKI 0.9655 0.0027 0.0027 0.0000 0.0164 0.0127
ZYSMAN 0.9457 0.0181 0.0000 0.0000 0.0181 0.0181
ZYWICKI 0.9552 0.0045 0.0045 0.0000 0.0125 0.0233
ZYWIEC 0.9829 0.0085 0.0000 0.0000 0.0000 0.0085

Write data to module as CSV


In [13]:
# Write the final table to <project root>/surgeo/data/ as CSV.
# Path.cwd() is a classmethod -- no need to instantiate Path() first.
current_directory = pathlib.Path.cwd()
# The notebook is assumed to live one level below the project root
project_directory = current_directory.parent
data_directory    = project_directory / 'surgeo' / 'data'
# Ensure the target directory exists so to_csv cannot fail on a fresh checkout
data_directory.mkdir(parents=True, exist_ok=True)
path_2010         = data_directory / 'prob_race_given_surname_2010.csv'
df_2010.to_csv(path_2010)