In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression as LR
%matplotlib inline
In [24]:
# Load the per-region table from `data.csv` (path is relative to the working directory).
# Later cells read the columns 'Region', 'EC votes', '2010 Census',
# 'Rural population' and 'Urban population' from this frame.
data = pd.read_csv('data.csv')
In [25]:
# Display the loaded table as the cell's output (bare last expression).
data
Out[25]:
In [64]:
# Regions that get a text annotation on the plots, paired with the short label
# drawn next to their point. Keeping each pair together prevents the two lists
# from drifting out of step.
label_states, names = (
    list(column)
    for column in zip(
        ('California', 'CA'),
        ('New York', 'NY'),
        ('District of Columbia', 'DC'),
        ('Wyoming', 'WY'),
        ('Michigan', 'MI'),
    )
)

# Opacity used for the background scatter points in every figure.
alpha = 0.4
In [65]:
# Figure: electoral-college (EC) votes per person for every row of `data`,
# plotted against total, rural and urban population in three panels that
# share a y axis. Saved to 'ec_per_person.png'.
f, axes = plt.subplots(1, 3, figsize=(15, 5), sharey=True)
ax0, ax1, ax2 = axes

# Overall ratio: total EC votes over total census population.
# (`avg` is also read by the later per-state plotting cell.)
avg = data['EC votes'].sum() / data['2010 Census'].sum()
# Averages of the per-row ratio, weighted by rural / urban population.
rural = np.dot(data['Rural population'], data['EC votes'] / data['2010 Census']) / data['Rural population'].sum()
urban = np.dot(data['Urban population'], data['EC votes'] / data['2010 Census']) / data['Urban population'].sum()
# Per-row EC votes per person (a Series aligned with `data`).
y = data['EC votes'] / data['2010 Census']

keys = ['2010 Census', 'Rural population', 'Urban population']
colors = ['#1f77b4', '#ff7f0e', '#2ca02c']
xlabels = ['Population', 'Rural population', 'Urban population']

for ax, key, c, xlabel in zip(axes, keys, colors, xlabels):
    x = data[key]
    ax.scatter(x, y, c=c, alpha=alpha)
    ax.grid()

    # Horizontal reference lines. Each label now names the quantity actually
    # drawn: previously the `rural` line was labelled 'Urban' and vice versa,
    # which also contradicted the panel colours (orange = rural, green = urban).
    ax.plot([x.min(), x.max()], [avg, avg], c='#1f77b4', label='Total')
    ax.plot([x.min(), x.max()], [rural, rural], c='#ff7f0e', label='Rural')
    ax.plot([x.min(), x.max()], [urban, urban], c='#2ca02c', label='Urban')

    ax.set_xlabel(xlabel)
    ax.set_ylabel('EC votes per person')
    ax.ticklabel_format(style='sci', axis='y', scilimits=(0, 0))
    ax.ticklabel_format(style='sci', axis='x', scilimits=(0, 0))

    # Annotate and re-draw (fully opaque) the highlighted regions.
    for st, n in zip(label_states, names):
        # np.argmax on the boolean array yields a *position*, so the lookup
        # below uses .iloc rather than label-based indexing; this stays correct
        # even if `data` does not carry a default 0..n-1 index.
        # NOTE: if `st` is absent from 'Region', argmax returns 0.
        idx = np.argmax((data['Region'] == st).values)
        ax.annotate(n, (x.iloc[idx], y.iloc[idx]),
                    xytext=(5, -5), textcoords='offset points',
                    rotation=-45)
        ax.scatter(x.iloc[idx], y.iloc[idx], c=c)

# The y axis is shared, so setting the limit on the first panel is enough.
ax0.set_ylim(0, 1.1 * y.max())
ax0.legend(loc='best')
plt.savefig('ec_per_person.png')
plt.draw()
In [126]:
# Ratio of two entries of `y`, selected by the hard-coded keys 8 and 4.
# NOTE(review): which `y` is live here depends on cell execution order (this
# cell's execution counter is higher than every plotting cell's), and which
# regions 8 and 4 refer to is not visible from this cell — confirm.
y[8] / y[4]
Out[126]:
In [41]:
# Regions annotated on the per-state EC-vote figure, paired with the short
# label drawn next to each point.
label_states2, names2 = (
    list(column)
    for column in zip(
        ('California', 'CA'),
        ('New York', 'NY'),
        ('Wyoming', 'WY'),
        ('Michigan', 'MI'),
    )
)
In [56]:
# Figure: EC votes versus population per row of `data`, with the line that
# strictly proportional representation would give (slope `avg`, computed in
# the earlier "EC votes per person" cell) and a red segment from each point to
# that line. The left panel shows every row; the right panel shows only the
# `plot_first` least-populous rows. Saved to 'ec_per_state.png'.
f, axes = plt.subplots(1, 2, figsize=(10, 5), sharey=False)
plot_first = 45

# Sort everything by ascending population so that truncating to the first
# `plot_first` entries drops the most populous rows.
x = data['2010 Census'].values
idxs = np.argsort(x)
x = x[idxs]
y = data['EC votes'].values
y = y[idxs]
regions_sorted = data['Region'].values[idxs]  # loop-invariant, same order as x / y

for ax in axes:
    ax.scatter(x, y, c='#1f77b4', alpha=alpha)
    ax.grid()
    ax.plot([x.min(), x.max()], [x.min() * avg, x.max() * avg], c='#1f77b4',
            label='Proportional representation')
    # Vertical residual from each point to the proportional line.
    for xp, yp in zip(x, y):
        ax.plot([xp, xp], [yp, xp * avg], c='red', lw=.5)
    ax.set_xlabel('Population')
    ax.set_ylabel('EC votes')
    ax.ticklabel_format(style='sci', axis='x', scilimits=(0, 0))

    for st, n in zip(label_states2, names2):
        idx = np.argmax(regions_sorted == st)
        # On the truncated panel the most populous rows are no longer present;
        # skip them explicitly instead of relying on a swallowed IndexError.
        if idx >= len(x):
            continue
        xp, yp = x[idx], y[idx]
        residual = yp - (avg * xp)
        # Push the label 10 points further out on the side away from the line.
        # NOTE(review): `residual` is in data units (EC votes) but is used as an
        # offset in points — confirm this placement heuristic is intended.
        yloc = residual + 10 * np.sign(residual)
        ax.annotate(n, (xp, yp),
                    xytext=(0, yloc), textcoords='offset points',
                    rotation=0, ha='center', va='center')
        ax.scatter(xp, yp, c='#1f77b4')

    # Every panel after the first only shows the `plot_first` smallest rows.
    x = x[:plot_first]
    y = y[:plot_first]

# Hand-picked tick positions for the full (left) and zoomed (right) panels.
axes[0].set_xticks(np.arange(0, 5e7, 1e7))
axes[0].set_yticks(np.arange(0, 70, 10))
axes[1].set_xticks(np.arange(0, 1.5e7, .25e7))
axes[1].set_yticks(np.arange(0, 25, 5))
axes[0].legend(loc='best')
plt.savefig('ec_per_state.png')
plt.show()
In [63]:
# Figure: fraction of each row's population that is rural versus its total
# population, with an ordinary least-squares line fitted through the points.
# Saved to 'pop_vs_rural.png'.
f, ax = plt.subplots(1, figsize=(5, 5))

x = data['2010 Census'].values
idxs = np.argsort(x)
y = data['Rural population'].values / x  # rural fraction, computed before sorting
x = x[idxs]
y = y[idxs]

model = LR()
model.fit(x[:, np.newaxis], y[:, np.newaxis])

# scikit-learn's predict() expects a 2-D (n_samples, n_features) array; passing
# a bare scalar raises ValueError. Predict both line endpoints in one call and
# flatten the (2, 1) result for plotting.
x_ends = np.array([x.min(), x.max()])
y_ends = model.predict(x_ends[:, np.newaxis]).ravel()
ax.plot(x_ends, y_ends, label='Fit', c='#1f77b4')

ax.scatter(x, y, c='#1f77b4', alpha=alpha)
ax.grid()
ax.set_xlabel('Population')
ax.set_ylabel('Fraction Rural')
ax.ticklabel_format(style='sci', axis='x', scilimits=(0, 0))
ax.set_xticks(np.arange(0, 5e7, 1e7))
ax.set_yticks(np.arange(0, .8, .2))

# Annotate and re-draw (fully opaque) the highlighted regions.
for st, n in zip(label_states, names):
    # Position of `st` in the population-sorted order used by x / y.
    # NOTE: if `st` is absent from 'Region', argmax returns 0.
    idx = np.argmax(data['Region'].values[idxs] == st)
    ax.annotate(n, (x[idx], y[idx]),
                xytext=(0, 10), textcoords='offset points',
                rotation=0, ha='center', va='center')
    ax.scatter(x[idx], y[idx], c='#1f77b4')

ax.legend(loc='best')
f.tight_layout()
plt.savefig('pop_vs_rural.png')
plt.show()
In [113]:
# Sanity check: list the available columns, then print the column totals for
# EC votes and for the 2010 census population.
print(data.columns)
for column in ('EC votes', '2010 Census'):
    print(data[column].sum())