I wanted to see how mean income (normalized to 2015 USD values) changed over time. In particular, I had two main questions:
By visualizing mean income versus population on a county-scale over time, I found that income has risen overall (even for the poorest counties) but the inequality is also slightly more severe than it has been (though this has been tempered by the 2008 economic crisis).
Much more analysis, and some nice visualizations using kernel density estimation (KDE) can be found in the README from my original github repository where I first posted this code.
Here's an example of the KDE visualization:
...and some plots that show how things have changed for the nation as a whole:
...and for the counties which make up the NYC metro area:
The following demonstration is taken from that repository as well, but with some minor modifications to make use of the supplied pickles and other data so that users will not have to download any external material or wait for the aggregation process (which takes several minutes).
In [1]:
import analysis
import numpy as np
import pandas as pd
from bokeh.plotting import figure, show
from bokeh.io import output_notebook, gridplot
from bokeh.palettes import brewer
In [2]:
# call this to set up bokeh integration
output_notebook()
In [3]:
PCPI_data = pd.read_pickle('PCPI_agg.p')
POP_data = pd.read_pickle('POP_agg.p')
counties = analysis.get_county_dict('national_county.txt')
In [4]:
counties
# Format of {FIPS Code : Name}
Out[4]:
In [5]:
PCPI_data.head()
# In units of dollars, not adjusted for inflation
Out[5]:
In [6]:
POP_data.head()
# In units of thousands
Out[6]:
In [7]:
# Normalize PCPI values to 2015 USD and convert to units of thousands
PCPI_norm = analysis.deflate_mod(PCPI_data/1e3, norm_filename='CPIAUCSL.csv', base='2015')
# Take logarithm of POP data, then convert from thousands (add 3)
POP_log = POP_data.apply(np.log10) + 3
In [8]:
year1 = '1970'
year2 = '2013'
f = figure(width = 900, height = 540, x_range=[0, 100], y_range=[0, 8], title='PCPI vs POP', \
x_axis_label='Per Capita Personal Income (2015 Dollars, Thousands)', y_axis_label='Log County Population')
f.circle(PCPI_norm[year1].values[0], POP_log[year1].values[0], alpha=0.1, color='royalblue', legend=year1)
f.circle(PCPI_norm[year2].values[0], POP_log[year2].values[0], alpha=0.1, color='firebrick', legend=year2)
show(f)
In [9]:
# Double check that the names of counties uniquely identify each position
# (otherwise we'll have errors which won't raise exceptions)
len(set(counties.keys())) == len(set(counties.values()))
Out[9]:
In [10]:
# We're in the clear, so let's invert the dictionary
counties_inv = {name : FIPS for FIPS, name in counties.items()}
# and test it out
print counties_inv['Hennepin County, MN']
print counties[counties_inv['Hennepin County, MN']]
In [11]:
names = [
'Hennepin County, MN',
'New York County, NY',
'Los Angeles County, CA'
]
colors = [
'royalblue',
'firebrick',
'forestgreen'
]
f1 = figure(width = 900, height = 540, x_range=[1970, 2013], title='PCPI for counties in which the author has lived',\
x_axis_label='Year', y_axis_label='PCPI (2015 Dollars, Thousands)')
f2 = figure(width = 900, height = 540, x_range=f1.x_range, title='Population of counties in which the author has lived',\
x_axis_label='Year', y_axis_label='Log County Population')
x = PCPI_norm.index.year
for color, name in zip(colors, names):
FIPS = counties_inv[name]
f1.line(x, PCPI_norm[FIPS+'_PCPI'], color=color, legend=name)
f2.line(x, POP_log[FIPS+'_POP'], color=color, legend=name)
f = gridplot([[f1], [f2]])
show(f)
In [12]:
state = 'MN'
counties_in_state = [name for name in counties_inv.keys() if name.split(',')[1].strip() == state]
f = figure(width = 900, height = 540, x_range=[1970, 2013], y_range=[0, 100], title='PCPI in '+state, \
x_axis_label='Year', y_axis_label='PCPI (2015 Dollars, Thousands)')
for c in counties_in_state:
try:
column_name = counties_inv[c]+'_PCPI'
f.line(PCPI_norm.index.year, PCPI_norm[column_name].values)
except KeyError:
pass
show(f)
In [13]:
# first, let's get a list of FIPS codes with valid data so that we dont raise KeyErrors
valid_FIPS = [col[:5] for col in POP_data.columns]
In [14]:
n = 10 # if using brewer below, n must be between 2 and 10
state = 'MN'
counties_in_state = [name for name in counties_inv.keys() if name.split(',')[1].strip() == state]
# Let's make a dataframe from the POP_log dataframe which only has the state of interest
# and then take the n highest populated counties at 2013
FIPS_in_state = [FIPS for FIPS, name in counties.items() if name.split(',')[1].strip() == state]
top = POP_log[[FIPS+'_POP' for FIPS in FIPS_in_state if FIPS in valid_FIPS]]['2013'].max().sort_values(ascending=False).index[:n]
# Let's make another new dataframe, this time for PCPI
# We want the top n counties and the national average
top_PCPI = PCPI_norm[[top_i[:5]+'_PCPI' for top_i in top]]
top_PCPI.columns = [counties[colname[:5]] for colname in top_PCPI.columns] # renaming is easier before we add natl_avg
natl_avg = PCPI_norm.mean(axis = 1)
natl_avg.name = "National Average"
top_PCPI = top_PCPI.join(natl_avg)
# We'll sort the data by the maximum PCPI, so data will be ordered
# in legend and coloring by highest to lowest
cols_in_order = top_PCPI.max().sort_values(ascending=False).index
x = top_PCPI.index.year
colors = brewer['Spectral'][n+1][::-1]
fig_title = 'PCPI of {} most populous counties in {}'.format(n, state)
f = figure(width = 900, height = 540, x_range=[1970, 2013], y_range=[0, 100], title=fig_title, \
x_axis_label='Year', y_axis_label='PCPI (2015 Dollars, Thousands)')
for color, column in zip(colors, cols_in_order):
f.circle(x, top_PCPI[column].values, fill_color=color, line_color='black', size=8, legend=column)
f.legend.orientation = 'top_left'
show(f)
In [15]:
n = 10 # if using brewer below, n must be between 2 and 10
state = 'MN'
counties_in_state = [name for name in counties_inv.keys() if name.split(',')[1].strip() == state]
# Let's make a dataframe from the POP_log dataframe which only has the state of interest
# and then take the n highest populated counties at 2013
FIPS_in_state = [FIPS for FIPS, name in counties.items() if name.split(',')[1].strip() == state]
top = POP_log[[FIPS+'_POP' for FIPS in FIPS_in_state if FIPS in valid_FIPS]]['2013'].max().sort_values(ascending=False).index[:n]
# Let's make another new dataframe, this time for PCPI
# We want the top n counties and the national average
top_PCPI = PCPI_norm[[top_i[:5]+'_PCPI' for top_i in top]]
top_PCPI.columns = [counties[colname[:5]] for colname in top_PCPI.columns] # renaming is easier before we add natl_avg
natl_avg = PCPI_norm.mean(axis = 1)
natl_avg.name = "National Average"
top_PCPI = top_PCPI.join(natl_avg)
# We'll sort the data by the maximum PCPI, so data will be ordered
# in legend and coloring by highest to lowest
cols_in_order = top_PCPI.max().sort_values(ascending=False).index
x = top_PCPI.index.year
colors = brewer['Spectral'][n+1][::-1]
fig_title = 'Growth of mean income in {} most populous counties in {}'.format(n, state)
f = figure(width = 900, height = 540, x_range=[1970, 2013], y_range=[0, 2.5], title=fig_title, \
x_axis_label='Year', y_axis_label='Normalized Income Growth since 1970')
for color, column in zip(colors, cols_in_order):
y = top_PCPI[column].values / top_PCPI[column].values[0]
f.circle(x, y, fill_color=color, line_color='black', size=8, legend=column)
f.legend.orientation = 'bottom_right'
show(f)
In [16]:
state = 'MN'
counties_in_state = [name for name in counties_inv.keys() if name.split(',')[1].strip() == state]
f = figure(width = 900, height = 540, x_range=[1971, 1975], y_range=[0, 60], title='PCPI in '+state, \
x_axis_label='Year', y_axis_label='PCPI (2015 Dollars, Thousands)')
for c in counties_in_state:
try:
column_name = counties_inv[c]+'_PCPI'
f.line(PCPI_norm.index.year, PCPI_norm[column_name].values)
except KeyError:
pass
show(f)
In [17]:
state = 'MN'
FIPS_in_state = [FIPS for FIPS, name in counties.items() if name.split(',')[1].strip() == state]
in_state = PCPI_norm[[FIPS+'_PCPI' for FIPS in FIPS_in_state]]
in_state.columns = [counties[colname[:5]] for colname in in_state.columns]
natl_avg = PCPI_norm.mean(axis = 1)
natl_avg.name = "National Average"
in_state = in_state.join(natl_avg)
growth = in_state.diff() / in_state
# convert single row from in_state to series
growth['1973'].squeeze().sort_values(ascending=False)
Out[17]: