In [1]:
import pandas as pd
import censusdata
# Keep wide DataFrames on a single row block instead of wrapping columns across multiple "pages".
pd.set_option('display.expand_frame_repr', False)
# Show floating-point values with two digits after the decimal point.
pd.set_option('display.precision', 2)
import statsmodels.formula.api as sm
We begin by downloading data on some basic socioeconomic characteristics for all U.S. states:
In [2]:
# Request the 'acs5' source for 2015 at the state level ('*' = every state-level geography).
statedata = censusdata.download(
    'acs5',
    2015,
    censusdata.censusgeo([('state', '*')]),
    [
        # Later renamed to population_size, median_HH_income and gini_index.
        'B01001_001E', 'B19013_001E', 'B19083_001E',
        # Inputs to percent_below_125_poverty (001E is the denominator).
        'C17002_001E', 'C17002_002E', 'C17002_003E', 'C17002_004E',
        # Inputs to the race/ethnicity percentages (001E is the denominator).
        'B03002_001E', 'B03002_003E', 'B03002_004E', 'B03002_012E',
    ],
)
We then link data on the percent of voters in each state voting Democratic in the 2016 U.S. presidential election:
In [3]:
# Percent voting Democratic in the 2016 presidential election, keyed by a
# state-level censusgeo built from the two-digit state code so the values
# line up with the index of `statedata`.
voting2016 = {
    censusdata.censusgeo((('state', state_code),)): percent_democratic
    for state_code, percent_democratic in [
        ('01', 34.6),
        ('02', 37.7),
        ('04', 45.4),
        ('05', 33.8),
        ('06', 61.6),
        ('08', 47.2),
        ('09', 54.5),
        ('10', 53.4),
        ('11', 92.8),
        ('12', 47.8),
        ('13', 45.6),
        ('15', 62.3),
        ('16', 27.6),
        ('17', 55.4),
        ('18', 37.9),
        ('19', 42.2),
        ('20', 36.2),
        ('21', 32.7),
        ('22', 38.4),
        ('23', 47.9),
        ('24', 60.5),
        ('25', 60.8),
        ('26', 47.3),
        ('27', 46.9),
        ('28', 39.7),
        ('29', 38),
        ('30', 36),
        ('31', 34),
        ('32', 47.9),
        ('33', 47.6),
        ('34', 55),
        ('35', 48.3),
        ('36', 58.8),
        ('37', 46.7),
        ('38', 27.8),
        ('39', 43.5),
        ('40', 28.9),
        ('41', 51.7),
        ('42', 47.6),
        ('44', 55.4),
        ('45', 40.8),
        ('46', 31.7),
        ('47', 34.9),
        ('48', 43.4),
        ('49', 27.8),
        ('50', 61.1),
        ('51', 49.9),
        ('53', 54.4),
        ('54', 26.5),
        ('55', 46.9),
        ('56', 22.5),
    ]
}

# One row per geography, single value column; assigning it to `statedata`
# aligns on the index, so geographies absent from the table get NaN.
voting2016 = pd.DataFrame.from_dict(voting2016, orient='index')
statedata['percent_democratic_pres_2016'] = voting2016
We then rename columns, compute some additional variables, and rescale some variables to make regression coefficients more easily interpretable:
In [4]:
# Give the three directly used ACS variables readable names in a single pass.
statedata = statedata.rename(columns={
    'B01001_001E': 'population_size',
    'B19013_001E': 'median_HH_income',
    'B19083_001E': 'gini_index',
})

# Rescale: population in units of 100,000, income in units of 1,000,
# and the Gini index multiplied by 100.
statedata['population_size'] = statedata['population_size'] / 100000
statedata['median_HH_income'] = statedata['median_HH_income'] / 1000
statedata['gini_index'] = statedata['gini_index'] * 100

# Sum of the three C17002 counts as a percentage of C17002_001E.
below_125_count = (
    statedata['C17002_002E'] + statedata['C17002_003E'] + statedata['C17002_004E']
)
statedata['percent_below_125_poverty'] = below_125_count / statedata['C17002_001E'] * 100

# Each race/ethnicity count as a percentage of B03002_001E.
for share_column, count_column in [
    ('percent_nonhisp_white', 'B03002_003E'),
    ('percent_nonhisp_black', 'B03002_004E'),
    ('percent_hispanic', 'B03002_012E'),
]:
    statedata[share_column] = statedata[count_column] / statedata['B03002_001E'] * 100
We run a quick check on the data and then delete variables we no longer need:
In [5]:
# Consistency check: the rescaled population must match the B03002_001E
# total put on the same scale (per 100,000) for every row.
assert statedata['population_size'].eq(statedata['B03002_001E'] / 100000).all()

# Remove the raw ACS inputs from the existing frame now that the derived
# percentages have been computed.
statedata.drop(
    columns=[
        'C17002_001E', 'C17002_002E', 'C17002_003E', 'C17002_004E',
        'B03002_001E', 'B03002_003E', 'B03002_004E', 'B03002_012E',
    ],
    inplace=True,
)
We are only interested in the 50 states + DC, so we drop Puerto Rico:
In [6]:
# State code '72' is the Puerto Rico row; remove it by index label.
statedata = statedata.drop(
    index=[censusdata.censusgeo([('state', '72')])],
)
Finally, we reorder the variables and run simple descriptives:
In [7]:
# Put the outcome first, followed by the explanatory variables.
statedata = statedata.reindex(
    columns=[
        'percent_democratic_pres_2016',
        'population_size',
        'median_HH_income',
        'percent_below_125_poverty',
        'gini_index',
        'percent_nonhisp_white',
        'percent_nonhisp_black',
        'percent_hispanic',
    ]
)
# Summary statistics for every column (displayed as the cell output).
statedata.describe()
Out[7]:
Then we examine bivariate correlations prior to running a linear regression model:
In [8]:
# Pairwise correlation matrix across all columns of statedata (displayed as the cell output).
statedata.corr()
Out[8]:
In [9]:
# Specify the regression first, then fit it: Democratic vote share as the
# outcome, with population size, median household income, % non-Hispanic
# black and % Hispanic as predictors.
ols_model = sm.ols(
    formula=(
        "percent_democratic_pres_2016 ~ population_size + median_HH_income"
        "+ percent_nonhisp_black + percent_hispanic"
    ),
    data=statedata,
)
result = ols_model.fit()
# Regression table (displayed as the cell output).
result.summary()
Out[9]:
In this simple model, the percentage voting Democratic is not significantly associated with population size or % Hispanic at the p<.05 level. It is significantly associated with median household income and % non-Hispanic black. Every $1,000 increase in median household income is associated with an increase of just under 1 percentage point in the Democratic vote. Every one-percentage-point increase in % non-Hispanic black is associated with an increase of about half a percentage point in the Democratic vote. Of course,