This notebook creates a few visualizations that illustrate trends in US income inequality over the past few decades. The data come from the Census Bureau report "Income and Poverty in the United States: 2018" (tables A-2 and A-4, among others), available here: https://www.census.gov/data/tables/2019/demo/income-poverty/p60-266.html
In [16]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Apply seaborn's 'whitegrid' style to the figures drawn below
sns.set_style(style='whitegrid')
In [2]:
# Read the income-inequality metrics sheet and transpose it, so that the
# spreadsheet's first column supplies the column labels and its header row
# (the year labels) becomes the index
df_ie = pd.read_excel(
    '/gh/data/census/income/2018_to_1967_income_inequality_metrics.xls',
    skiprows=6,
    nrows=10,
    index_col=0,
).T

# Relabel every column with the integer formed by the digits in its name
col_names = list(df_ie.columns)
col_numbers = [int(''.join(filter(str.isdigit, name))) for name in col_names]
number_by_name = dict(zip(col_names, col_numbers))

# Move the year labels out of the index into a regular 'year' column
df_ie = (
    df_ie.rename(columns=number_by_name)
    .reset_index()
    .rename(columns={'index': 'year'})
)
# Keep only the first whitespace-separated token of each label, as an int
df_ie['year'] = df_ie['year'].map(lambda label: int(str(label).split()[0]))

# Name the column axis; pd.melt uses it as the name of the variable column
df_ie = df_ie.rename_axis('percentile', axis=1)
df_ie.head()
Out[2]:
In [3]:
# Percentiles to draw
pcs_plt = [10, 20, 50, 90, 95]

# Long format: one row per (year, percentile) pair
df_plt = pd.melt(df_ie, id_vars=['year'], value_vars=col_numbers)
# .copy() gives an independent frame, so the column assignments below do not
# write into a filtered selection (which raises SettingWithCopyWarning)
df_plt = df_plt[df_plt['percentile'].isin(pcs_plt)].copy()
df_plt['percentile'] = ['{}th %'.format(x) for x in df_plt['percentile']]
# Express income in thousands of dollars to match the y-axis label
df_plt['value'] = df_plt['value'] / 1000

plt.figure(figsize=(12,6))
# First pass: slightly thicker black lines that act as an outline for the
# colored lines drawn on top. One 'k' entry per percentile actually present.
n_lines = df_plt['percentile'].nunique()
sns.lineplot(data=df_plt, x='year', y='value', hue='percentile',
             palette=['k'] * n_lines, linewidth=4, legend=False)
# Second pass: the colored lines (and the legend entries)
g = sns.lineplot(data=df_plt, x='year', y='value', hue='percentile',
                 palette="coolwarm", linewidth=3)
plt.legend(fontsize=15, loc='upper left')
plt.xlabel('')
plt.ylim((0, 260))
plt.ylabel('Annual household income (x $1,000)\n(inflation-adjusted, 2018-equivalent USD)', size=20)
plt.yticks(np.arange(0, 280, 20), size=20)
plt.xticks(size=20)
plt.xlim((min(df_plt['year']), 2020))
plt.title('Income inequality is increasing', size=30)
plt.tight_layout()
# plt.show() takes no Axes argument; it renders the current figure
plt.show()
In [31]:
# For selected percentiles, print the 2008 and 2018 income and then the
# relative change between those two years
for pc in [10, 50, 90]:
    income_by_year = {}
    for year in [2008, 2018]:
        income_by_year[year] = df_ie.loc[df_ie['year'] == year, pc].values[0]
        print('{} income {}%: {}'.format(year, pc, income_by_year[year]))
    change = income_by_year[2018] / income_by_year[2008] - 1
    print('{}% income change over 10 years: {:.0%}\n'.format(pc, change))
In [4]:
# New column label -> (numerator percentile, denominator percentile)
ratio_columns = {
    '90% : 10%': (90, 10),
    '90% : 50%': (90, 50),
    '95% : 10%': (95, 10),
    '95% : 50%': (95, 50),
    '50% : 10%': (50, 10),
}
# Add each ratio of two percentile incomes as a new column of df_ie
for label, (upper, lower) in ratio_columns.items():
    df_ie[label] = df_ie[upper] / df_ie[lower]
df_ie.head()
Out[4]:
In [33]:
# Long format: one row per (year, ratio) pair. Naming the column axis makes
# pd.melt call the variable column 'percentile ratio'.
df_plt = pd.melt(df_ie.rename_axis('percentile ratio', axis=1),
                 id_vars=['year'],
                 value_vars=['90% : 10%', '90% : 50%', '50% : 10%'])

plt.figure(figsize=(12,5))
g = sns.lineplot(data=df_plt, x='year', y='value', hue='percentile ratio',
                 palette=['b', 'r', 'g'], linewidth=5)
plt.legend(fontsize=15, loc='upper left')
plt.xlabel('')
plt.ylim((1, 13))
plt.ylabel('Ratio of earnings', size=20)
plt.yticks(np.arange(1, 14), size=20)
plt.xticks(size=20)
plt.xlim((min(df_plt['year']), 2020))
plt.title('Income inequality is increasing', size=30)
plt.tight_layout()
# plt.show() takes no Axes argument; it renders the current figure
plt.show()
In [6]:
# Number of spreadsheet rows to skip before each section's header rows
n_rows_skip_map = {
    'all': 4,
    'white alone': 61,
    'white': 83,
    'white alone, not hispanic': 121,
    'white, not hispanic': 143,
    'black alone or in combination': 176,
    'black alone': 198,
    'black': 220,
    'asian alone or in combination': 258,
    'asian alone': 280,
    'asian and pacific islander': 302,
    'hispanic': 320,
}
# Boundary used as the "next section" of the final section
last_row = 371

# Number of rows to read per section: distance from a section's start to the
# nearest later start (or last_row), minus 3
n_rows_skip_vals = np.array(list(n_rows_skip_map.values()) + [last_row])
n_rows_map = {
    name: n_rows_skip_vals[n_rows_skip_vals > start].min() - start - 3
    for name, start in n_rows_skip_map.items()
}

# Spreadsheet column label -> short column name. Keys are stored with every
# space removed, so the labels are written with spaces here for readability
# and stripped when the mapping is built.
_labels_with_spaces = {
    'Number\n(thousands) Unnamed: 1_level_1': 'n',
    'Percent distribution Total': 'pc_total',
    'Percent distribution Under $15,000': '< 15k',
    'Percent distribution $15,000 to $24,999': '15-25k',
    'Percent distribution $25,000 to $34,999': '25-35k',
    'Percent distribution $35,000 to $49,999': '35-50k',
    'Percent distribution $50,000 to $74,999': '50-75k',
    'Percent distribution $75,000 to $99,999': '75-100k',
    'Percent distribution $100,000 to $149,999': '100-150k',
    'Percent distribution $150,000 to $199,999': '150-200k',
    'Percent distribution $200,000 and over': '> 200k',
    'Median income (dollars) Estimate': 'median',
    'Median income (dollars) Standard error': 'median se',
    'Mean income (dollars) Estimate': 'mean',
    'Mean income (dollars) Standard error': 'mean se',
}
col_names_map = {
    label.replace(' ', ''): short_name
    for label, short_name in _labels_with_spaces.items()
}
In [7]:
def load_race_table(table_name):
    """Load one section of the income-distribution-by-race spreadsheet.

    Parameters
    ----------
    table_name : str
        Section to load. Must be a key of both ``n_rows_skip_map`` (rows to
        skip before the section's header) and ``n_rows_map`` (rows to read).

    Returns
    -------
    pandas.DataFrame
        The section with short column names (see ``col_names_map``), an
        integer 'year' column, and at most one row per year (the first row
        is kept when a year occurs more than once). The 'n', 'pc_total',
        'median se' and 'mean se' columns are removed.

    Raises
    ------
    KeyError
        If ``table_name`` is not a known section, or if one of the columns
        to drop is missing after renaming.
    """
    # Load data
    df_section = pd.read_excel(
        '/gh/data/census/income/2018_to_1967_income_distribution_race.xls',
        skiprows=n_rows_skip_map[table_name],
        header=[0, 1],
        nrows=n_rows_map[table_name],
        index_col=0,
    )

    # Merge the two header rows into one space-free label per column, which
    # is the form the keys of col_names_map are stored in
    df_section.columns = [' '.join(col).strip().replace(' ', '')
                          for col in df_section.columns.values]

    # Clean table: short column names, then drop the columns that are not
    # needed (the original code applied this rename twice; once is enough)
    df_section = df_section.rename(columns=col_names_map)
    df_section = df_section.drop(['n', 'pc_total', 'median se', 'mean se'], axis=1)

    # Fix and de-dup year: keep only the first whitespace-separated token of
    # each index label as an int, then keep the first row for each year
    df_section = df_section.reset_index().rename(columns={'index': 'year'})
    df_section['year'] = [int(str(x).split()[0]) for x in df_section['year']]
    return df_section.drop_duplicates(subset=['year'])
In [8]:
# Load every section and tag its rows with the section name in a 'race' column
table_names = n_rows_skip_map.keys()
dfs = [load_race_table(table_name).assign(race=table_name)
       for table_name in table_names]
df_income_race = pd.concat(dfs)

# Fix null: 'N' entries in the mean column become NaN so the column can be float
mean_income = df_income_race['mean'].replace({'N': np.nan})
df_income_race['mean'] = mean_income.astype(float)
df_income_race.head()
Out[8]:
In [35]:
# Sections to plot, and the broad group each one is shown as
races_keep = ['white alone, not hispanic',
              'white, not hispanic', 'black alone or in combination',
              'black', 'asian alone or in combination',
              'asian and pacific islander', 'hispanic']
race_labels = {'white alone, not hispanic': 'white',
               'white, not hispanic': 'white',
               'black alone or in combination': 'black',
               'black': 'black',
               'asian alone or in combination': 'asian',
               'asian and pacific islander':'asian',
               'hispanic': 'hispanic'}

df_plt = df_income_race.copy()
# Express the median in thousands of dollars to match the y-axis label
df_plt['median'] = df_plt['median'] / 1000
# .copy() gives an independent frame, so the column assignments below do not
# write into a filtered selection (which raises SettingWithCopyWarning)
df_plt = df_plt[df_plt['race'].isin(races_keep)].copy()

# rename races, keeping the original section name in 'race_old'
df_plt['race_old'] = df_plt['race']
df_plt['race'] = df_plt['race_old'].map(race_labels)

plt.figure(figsize=(8,6))
sns.lineplot(data=df_plt, x='year', y='median', hue='race',
             linewidth=5)
plt.xlabel('')
plt.ylabel('Median annual household income\n(x $1,000), adjusted for inflation', size=20)
plt.yticks(np.arange(0, 100, 10), size=20)
plt.xticks(size=20)
plt.xlim(1987, 2020)
plt.ylim((0,90))
plt.legend(bbox_to_anchor=(1,.5), fontsize=15, loc='center left', title='')
# Lay out the figure last, once the labels and legend exist, as the other
# plotting cells in this notebook do
plt.tight_layout()
Out[35]:
In [44]:
# Per race group: print the 2008 and 2018 values of the 'median' column and
# then the relative change between those two years
for race in ['white', 'black', 'hispanic', 'asian']:
    print(race)
    race_rows = df_plt[df_plt['race'] == race]
    median_by_year = {}
    for year in [2008, 2018]:
        median_by_year[year] = race_rows.loc[race_rows['year'] == year, 'median'].values[0]
        print('{} median income: {}'.format(year, median_by_year[year]))
    change = median_by_year[2018] / median_by_year[2008] - 1
    print('Median income change over 10 years: {:.0%}\n'.format(change))
In [45]:
# Per race group: print the 2008 and 2018 values of the 'mean' column and
# then the relative change between those two years
for race in ['white', 'black', 'hispanic', 'asian']:
    print(race)
    race_rows = df_plt[df_plt['race'] == race]
    mean_by_year = {}
    for year in [2008, 2018]:
        mean_by_year[year] = race_rows.loc[race_rows['year'] == year, 'mean'].values[0]
        print('{} mean income: {}'.format(year, mean_by_year[year]))
    change = mean_by_year[2018] / mean_by_year[2008] - 1
    print('mean income change over 10 years: {:.0%}\n'.format(change))
In [10]:
# Group whose income-bracket distribution is drawn in this cell
race_plt = 'all'

# Section name -> broad group. Sections missing from this mapping map to NaN
# and are therefore never selected.
race_groups = {
    'white alone, not hispanic': 'white',
    'white, not hispanic': 'white',
    'black alone or in combination': 'black',
    'black': 'black',
    'asian alone or in combination': 'asian',
    'asian and pacific islander':'asian',
    'hispanic': 'hispanic',
    'all': 'all',
}
in_group = df_income_race['race'].map(race_groups) == race_plt

# Keep only the income-bracket columns, indexed by year
df_plt = (
    df_income_race[in_group]
    .drop(['mean', 'median', 'race'], axis=1)
    .set_index('year')
)

# Stacked area chart with one band per remaining column
plt.figure(figsize=(14, 8))
ax = plt.gca()
df_plt.plot.area(ax=ax, alpha=.5)
plt.xticks(size=20)
plt.yticks(np.arange(0, 110, 10), size=20)
plt.xlim((df_plt.index.min(), 2020))
plt.ylim((0, 100))
plt.xlabel('')
plt.ylabel('% of households in income bracket\n(inflation-adjusted 2018 USD)', size=20)
plt.legend(bbox_to_anchor=(1,.5), loc='center left', fontsize=15)
plt.tight_layout()
In [11]:
# Group whose income-bracket distribution is drawn in this cell
race_plt = 'black'

# Section name -> broad group. Sections missing from this mapping map to NaN
# and are therefore never selected.
race_groups = {
    'white alone, not hispanic': 'white',
    'white, not hispanic': 'white',
    'black alone or in combination': 'black',
    'black': 'black',
    'asian alone or in combination': 'asian',
    'asian and pacific islander':'asian',
    'hispanic': 'hispanic',
    'all': 'all',
}
in_group = df_income_race['race'].map(race_groups) == race_plt

# Keep only the income-bracket columns, indexed by year
df_plt = (
    df_income_race[in_group]
    .drop(['mean', 'median', 'race'], axis=1)
    .set_index('year')
)

# Stacked area chart with one band per remaining column
plt.figure(figsize=(14, 8))
ax = plt.gca()
df_plt.plot.area(ax=ax, alpha=.5)
plt.xticks(size=20)
plt.yticks(np.arange(0, 110, 10), size=20)
plt.xlim((df_plt.index.min(), 2020))
plt.ylim((0, 100))
plt.xlabel('')
plt.ylabel('% of households in income bracket\n(inflation-adjusted 2018 USD)', size=20)
plt.legend(bbox_to_anchor=(1,.5), loc='center left', fontsize=15)
plt.tight_layout()
In [12]:
# Group whose income-bracket distribution is drawn in this cell
race_plt = 'asian'

# Section name -> broad group. Sections missing from this mapping map to NaN
# and are therefore never selected.
race_groups = {
    'white alone, not hispanic': 'white',
    'white, not hispanic': 'white',
    'black alone or in combination': 'black',
    'black': 'black',
    'asian alone or in combination': 'asian',
    'asian and pacific islander':'asian',
    'hispanic': 'hispanic',
    'all': 'all',
}
in_group = df_income_race['race'].map(race_groups) == race_plt

# Keep only the income-bracket columns, indexed by year
df_plt = (
    df_income_race[in_group]
    .drop(['mean', 'median', 'race'], axis=1)
    .set_index('year')
)

# Stacked area chart with one band per remaining column
plt.figure(figsize=(14, 8))
ax = plt.gca()
df_plt.plot.area(ax=ax, alpha=.5)
plt.xticks(size=20)
plt.yticks(np.arange(0, 110, 10), size=20)
plt.xlim((df_plt.index.min(), 2020))
plt.ylim((0, 100))
plt.xlabel('')
plt.ylabel('% of households in income bracket\n(inflation-adjusted 2018 USD)', size=20)
plt.legend(bbox_to_anchor=(1,.5), loc='center left', fontsize=15)
plt.tight_layout()
In [13]:
# Read two columns of the poverty spreadsheet as (year, poverty_percent)
df = pd.read_excel(
    '/gh/data/census/income/Impact_Poverty.xls',
    skiprows=7,
    usecols=[0, 3],
    nrows=40,
    names=['year', 'poverty_percent'],
)
# The first four characters of each year label are the year itself
year_prefixes = [str(label)[:4] for label in df['year']]
df['year'] = np.array(year_prefixes, dtype=int)
# Keep the first row for any year that appears more than once
df = df.drop_duplicates(subset=['year'])

fig = plt.figure(figsize=(8,5))
plt.plot(df['year'], df['poverty_percent'], 'k', linewidth=3)
plt.ylim(10, 16)
plt.xlim((1980, 2020))
plt.xticks(size=15)
# One tick per whole percent, labeled with a trailing '%'
pct_ticks = np.arange(10, 17)
plt.yticks(pct_ticks, ['{}%'.format(x) for x in pct_ticks], size=15)
plt.ylabel('Poverty rate', size=20)
Out[13]:
In [14]:
# Read (state, poverty_rate), drop incomplete rows, strip the periods and
# surrounding whitespace from the state labels, and remove the row whose
# cleaned label is 'District of Columbia…'
df_pov_state = (
    pd.read_excel(
        '/gh/data/census/income/state.xls',
        usecols=[0, 1],
        names=['state', 'poverty_rate'],
        skiprows=10,
        nrows=55,
    )
    .dropna()
    .assign(state=lambda d: [str(x).replace('.', '').strip() for x in d['state']])
    .loc[lambda d: d['state'] != 'District of Columbia…']
    .reset_index(drop=True)
)

# State / territory name -> two-letter postal abbreviation
us_state_abbrev = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ',
    'Arkansas': 'AR', 'California': 'CA', 'Colorado': 'CO',
    'Connecticut': 'CT', 'Delaware': 'DE', 'District of Columbia': 'DC',
    'Florida': 'FL', 'Georgia': 'GA', 'Hawaii': 'HI',
    'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN',
    'Iowa': 'IA', 'Kansas': 'KS', 'Kentucky': 'KY',
    'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD',
    'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN',
    'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT',
    'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH',
    'New Jersey': 'NJ', 'New Mexico': 'NM', 'New York': 'NY',
    'North Carolina': 'NC', 'North Dakota': 'ND', 'Northern Mariana Islands':'MP',
    'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR',
    'Palau': 'PW', 'Pennsylvania': 'PA', 'Puerto Rico': 'PR',
    'Rhode Island': 'RI', 'South Carolina': 'SC', 'South Dakota': 'SD',
    'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT',
    'Vermont': 'VT', 'Virgin Islands': 'VI', 'Virginia': 'VA',
    'Washington': 'WA', 'West Virginia': 'WV', 'Wisconsin': 'WI',
    'Wyoming': 'WY',
}

# Names without an entry in the mapping get NaN as their abbreviation
df_pov_state['state_abbrev'] = df_pov_state['state'].map(us_state_abbrev)
df_pov_state.head()
Out[14]:
In [15]:
import plotly.graph_objects as go

# Choropleth trace: states are located by their two-letter abbreviation and
# colored by poverty rate, with the color range fixed to 6-20
state_layer = go.Choropleth(
    locations=df_pov_state['state_abbrev'],          # Spatial coordinates
    z=df_pov_state['poverty_rate'].astype(float),    # Data to be color-coded
    locationmode='USA-states',   # set of locations match entries in `locations`
    colorscale='viridis',
    colorbar_title="Poverty rate (%)",
    marker_line_color='white',
    zmin=6,
    zmax=20,
)
fig = go.Figure(data=state_layer)

# Limit the map scope to the USA
fig.update_layout(geo_scope='usa')
In [46]:
# First five rows after sorting by poverty rate in ascending order
df_pov_state.sort_values(by='poverty_rate', ascending=True).head(n=5)
Out[46]:
In [47]:
# Last five rows after sorting by poverty rate in ascending order
df_pov_state.sort_values(by='poverty_rate', ascending=True).tail(n=5)
Out[47]: