In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import statsmodels.formula.api as smf
matplotlib.style.use('ggplot')
%matplotlib inline
# Load the data on various variables
population = pd.read_excel('data/zh-population.xlsx')
population.rename(columns={'BFS_NR': 'municipality', 'GEBIET_NAME': 'municipality_name', 'INDIKATOR_JAHR': 'year'}, inplace=True)
income_and_wealth = pd.read_excel('data/zh-income-wealth.xlsx')
income_and_wealth.rename(columns={'BFS_NR': 'municipality', 'GEBIET_NAME': 'municipality_name', 'INDIKATOR_JAHR': 'year'}, inplace=True)
# helper function
def extract_group(df_grouped, group_id, column_name):
frame = df_grouped.get_group(group_id).rename(columns={'INDIKATOR_VALUE' : column_name})
return frame[['municipality', 'year', 'municipality_name', column_name]]
# Pick out the series we are interested in and discard columns we do not want
population_grouped = population.groupby('INDIKATOR_ID')
pop_20_to_39 = extract_group(population_grouped, 85, 'pct20_39')
pop_40_to_64 = extract_group(population_grouped, 87, 'pct40_64')
pop_65_to_79 = extract_group(population_grouped, 89, 'pct65_79')
pop_80_plus = extract_group(population_grouped, 91, 'pct80plus')
female = extract_group(population_grouped, 206, 'pctfemale')
foreign = extract_group(population_grouped, 205, 'pctforeign')
demographics = pop_20_to_39.merge(pop_40_to_64).merge(pop_65_to_79).merge(pop_80_plus).merge(female).merge(foreign)
income_and_wealth_grouped = income_and_wealth.groupby('INDIKATOR_ID')
median_income = extract_group(income_and_wealth_grouped, 316, 'income')
median_wealth = extract_group(income_and_wealth_grouped, 320, 'wealth')
median_income_and_wealth = median_income.merge(median_wealth, on=['municipality', 'year'], how="left")
# Drop the redundant columns
median_income_and_wealth = median_income_and_wealth[['municipality', 'year', 'income', 'wealth']]
# Load the data on voting
votes = pd.read_csv('data/2015-06-elections-partystrengths/analysis/output/partystrengths.csv')
parties = pd.read_csv('data/2015-06-elections-partystrengths/analysis/output/parties.csv')
parties.rename(columns={'ID': 'party', 'Abbr_D': 'party_name'}, inplace=True)
parties = parties[['party', 'party_name']]
# Create a frame that merges demographic, income, and voting data
df = demographics.merge(median_income_and_wealth,
on=['municipality', 'year'], how='inner').merge(votes,
on=['municipality', 'year'], how='inner').merge(parties,
on='party', how='left')
In [2]:
df.head()
Out[2]:
In [3]:
# Compute z-scores for income and party support for easier comparibility accross times
zscore = lambda x: (x - x.mean()) / x.std()
df['z_income'] = pd.Series(df.groupby(['year'])['income'].transform(zscore), index=df.index)
df['z_wealth'] = pd.Series(df.groupby(['year'])['wealth'].transform(zscore), index=df.index)
df['z_support'] = pd.Series(df.groupby(['year', 'party'])['support'].transform(zscore), index=df.index)
In [4]:
df.head()
Out[4]:
In [5]:
def plot_relationship(party, variable, ax):
# Get the data for the party/year
data = df.groupby('party_name').get_group(party)
# Filter out NaNs
data = data[np.isfinite(data[variable])]
# Create a model
formula = 'z_support ~ {}'.format(variable)
lm = smf.ols(formula=formula, data=data).fit()
rsquared = lm.rsquared
preds_input = pd.DataFrame({variable: [data[variable].min(), data[variable].max()]})
preds = lm.predict(preds_input)
# Plot the data
data.plot(x=variable, y='z_support', kind='scatter', ax=ax)
# Plot the model
ax.plot(preds_input, preds, c='red', linewidth=2)
ax.text(1.0, 0.05,'R^2 = {:.2f}'.format(rsquared), horizontalalignment='right',
verticalalignment='center', color='red',
transform=ax.transAxes)
def plot_monetary_relationships(party):
fig, axs = plt.subplots(1,2, sharey=True)
plot_relationship(party, 'z_income', axs[0])
plot_relationship(party, 'z_wealth', axs[1])
fig.suptitle(party + ' support by income and wealth', fontsize=16)
def plot_demographic_relationships(party):
fig, axs = plt.subplots(1,3, sharey=True, figsize=(10, 4))
plot_relationship(party, 'pct20_39', axs[0])
plot_relationship(party, 'pct40_64', axs[1])
plot_relationship(party, 'pct65_79', axs[2])
fig.suptitle(party + ' support by age', fontsize=16)
In [6]:
plot_monetary_relationships('FDP')
plot_monetary_relationships('SP')
plot_monetary_relationships('SVP')
plot_monetary_relationships('CVP')
plot_monetary_relationships('GPS')
plot_monetary_relationships('GLP')
# plot_monetary_relationships('BDP')
# plot_monetary_relationships('EVP')
In [7]:
plot_demographic_relationships('FDP')
plot_demographic_relationships('SP')
plot_demographic_relationships('SVP')
plot_demographic_relationships('CVP')
plot_demographic_relationships('GPS')
plot_demographic_relationships('GLP')
# plot_demographic_relationships('BDP')
# plot_demographic_relationships('EVP')
In [ ]: