In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression as LR
%matplotlib inline

In [24]:
data = pd.read_csv('data.csv')

In [25]:
data


Out[25]:
Region EC votes 2016 Estimate 2010 Census Rural population Urban population
0 Alabama 9 4863300 4779736 1957932 2821804
1 Alaska 3 741894 710231 241338 468893
2 Arizona 11 6931071 6392017 651358 5740659
3 Arkansas 6 2988248 2915918 1278329 1637589
4 California 55 39250017 37253956 1880350 35373606
5 Colorado 9 5540545 5029196 696435 4332761
6 Connecticut 7 3576452 3574097 429155 3144942
7 Delaware 3 952065 897934 149985 747949
8 District of Columbia 3 681170 601723 0 601723
9 Florida 29 20612439 18801310 1661466 17139844
10 Georgia 16 10310371 9687653 2415502 7272151
11 Hawaii 4 1428557 1360301 109812 1250489
12 Idaho 4 1683140 1567582 461212 1106370
13 Illinois 20 12801539 12830632 1477079 11353553
14 Indiana 11 6633053 6483802 1786702 4697100
15 Iowa 6 3134693 3046355 1096099 1950256
16 Kansas 6 2907289 2853118 736157 2116961
17 Kentucky 8 4436974 4339367 1806024 2533343
18 Louisiana 8 4681666 4533372 1215567 3317805
19 Maine 4 1331479 1328361 814819 513542
20 Maryland 10 6016447 5773552 739221 5034331
21 Massachusetts 11 6811779 6547629 525640 6021989
22 Michigan 16 9928301 9883640 2513683 7369957
23 Minnesota 10 5519952 5303925 1417614 3886311
24 Mississippi 6 2988726 2967297 1503073 1464224
25 Missouri 10 6093000 5988927 1770556 4218371
26 Montana 3 1042520 989415 436401 553014
27 Nebraska 5 1907116 1826341 490655 1335686
28 Nevada 6 2940058 2700551 156754 2543797
29 New Hampshire 4 1334795 1316470 522598 793872
30 New Jersey 14 8944469 8791894 467768 8324126
31 New Mexico 5 2081015 2059179 464818 1594361
32 New York 29 19745289 19378102 2349997 17028105
33 North Carolina 15 10146788 9535483 3233727 6301756
34 North Dakota 3 757952 672591 269719 402872
35 Ohio 18 11614373 11536504 2546810 8989694
36 Oklahoma 7 3923561 3751351 1266322 2485029
37 Oregon 7 4093465 3831074 726692 3104382
38 Pennsylvania 20 12802503 12702379 2711092 9991287
39 Rhode Island 4 1056426 1052567 97524 955043
40 South Carolina 9 4961119 4625364 1557555 3067809
41 South Dakota 3 865454 814180 352933 461247
42 Tennessee 11 6651194 6346105 2132860 4213245
43 Texas 38 27862596 25145561 3847522 21298039
44 Utah 6 3051217 2763885 260290 2503595
45 Vermont 3 624594 625741 382356 243385
46 Virginia 13 8411808 8001024 1963930 6037094
47 Washington 12 7288000 6724540 1072671 5651869
48 West Virginia 5 1831102 1852994 950184 902810
49 Wisconsin 10 5778708 5686986 1697348 3989638
50 Wyoming 3 585501 563626 198633 364993

In [64]:
label_states = ['California', 'New York', 'District of Columbia', 'Wyoming', 'Michigan']
names = ['CA', 'NY', 'DC', 'WY', 'MI']
alpha = .4

In [65]:
f, axes = plt.subplots(1, 3, figsize=(15, 5), sharey=True)
ax0, ax1, ax2 = axes

avg = data['EC votes'].sum() / data['2010 Census'].sum()
rural = np.dot(data['Rural population'], data['EC votes'] / data['2010 Census']) / data['Rural population'].sum()
urban = np.dot(data['Urban population'], data['EC votes'] / data['2010 Census']) / data['Urban population'].sum()

y = data['EC votes'] / data['2010 Census']
keys = ['2010 Census', 'Rural population', 'Urban population']
colors = ['#1f77b4', '#ff7f0e', '#2ca02c']
xlabels = ['Population', 'Rural population', 'Urban population']
for ax, key, c, xlabel in zip(axes, keys, colors, xlabels):
    x = data[key]
    ax.scatter(x, y, c=c, alpha=alpha)
    ax.grid()
    ax.plot([x.min(), x.max()], [avg, avg], c='#1f77b4', label='Total')
    ax.plot([x.min(), x.max()], [rural, rural], c='#ff7f0e', label='Urban')
    ax.plot([x.min(), x.max()], [urban, urban], c='#2ca02c', label='Rural')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('EC votes per person')
    ax.ticklabel_format(style='sci', axis='y', scilimits=(0, 0))
    ax.ticklabel_format(style='sci', axis='x', scilimits=(0, 0))
    for st, n in zip(label_states, names):
        idx = np.argmax(data['Region'] == st)
        ax.annotate(n, (x[idx], y[idx]),
                    xytext=(5, -5), textcoords='offset points',
                   rotation=-45)
        ax.scatter(x[idx], y[idx], c=c)

ax0.set_ylim(0, 1.1 * y.max())
ax0.legend(loc='best')

plt.savefig('ec_per_person.png')
plt.draw()



In [126]:
y[8] / y[4]


Out[126]:
3.3770255809340237

In [41]:
label_states2 = ['California', 'New York', 'Wyoming', 'Michigan']
names2 = ['CA', 'NY', 'WY', 'MI']

In [56]:
f, axes = plt.subplots(1, 2, figsize=(10, 5), sharey=False)

plot_first = 45
x = data['2010 Census'].values
idxs = np.argsort(x)
x = x[idxs]
y = data['EC votes'].values
y = y[idxs]

for ax in axes:
    ax.scatter(x, y, c='#1f77b4', alpha=alpha)
    ax.grid()
    ax.plot([x.min(), x.max()], [x.min() * avg, x.max() * avg], c='#1f77b4',
            label='Proportional representation')
    for xp, yp in zip(x, y):
        ax.plot([xp, xp], [yp, xp * avg], c='red', lw=.5)
    ax.set_xlabel('Population')
    ax.set_ylabel('EC votes')
    ax.ticklabel_format(style='sci', axis='x', scilimits=(0, 0))

    for st, n in zip(label_states2, names2):
        idx = np.argmax(data['Region'].values[idxs] == st)
        try:
            xp, yp = x[idx], y[idx]
            yloc = yp - (avg * xp) + 10 * np.sign(yp - (avg * xp))
            print(yloc)
            ax.annotate(n, (xp, yp),
                        xytext=(0, yloc), textcoords='offset points',
                       rotation=0, ha='center', va='center')
            ax.scatter(x[idx], y[idx], c='#1f77b4')
        except IndexError:
            pass
    x = x[:plot_first]
    y = y[:plot_first]

axes[0].set_xticks(np.arange(0, 5e7, 1e7))
axes[0].set_yticks(np.arange(0, 70, 10))

axes[1].set_xticks(np.arange(0, 1.5e7, .25e7))
axes[1].set_yticks(np.arange(0, 25, 5))

axes[0].legend(loc='best')
plt.savefig('ec_per_state.png')
plt.show()


-19.9163335536
-14.7670268647
12.0178617966
-11.2225916347
12.0178617966
-11.2225916347

In [63]:
f, ax = plt.subplots(1, figsize=(5, 5))
x = data['2010 Census'].values
idxs = np.argsort(x)
y = data['Rural population'].values / x
x = x[idxs]
y = y[idxs]
model = LR()
model.fit(x[:, np.newaxis], y[:, np.newaxis])
ax.plot([x.min(), x.max()], [float(model.predict(x.min())), float(model.predict(x.max()))],
        label='Fit', c='#1f77b4')
ax.scatter(x, y, c='#1f77b4', alpha=alpha)
ax.grid()
ax.set_xlabel('Population')
ax.set_ylabel('Fraction Rural')
ax.ticklabel_format(style='sci', axis='x', scilimits=(0, 0))
ax.set_xticks(np.arange(0, 5e7, 1e7))
ax.set_yticks(np.arange(0, .8, .2))
for st, n in zip(label_states, names):
    idx = np.argmax(data['Region'].values[idxs] == st)
    ax.annotate(n, (x[idx], y[idx]),
                xytext=(0, 10), textcoords='offset points',
               rotation=0, ha='center', va='center')
    ax.scatter(x[idx], y[idx], c='#1f77b4')

ax.legend(loc='best')
f.tight_layout()

plt.savefig('pop_vs_rural.png')
plt.show()



In [113]:
print(data.columns)
print(data['EC votes'].sum())
print(data['2010 Census'].sum())


Index(['Region', 'EC votes', '2016 Estimate', '2010 Census',
       'Rural population', 'Urban population'],
      dtype='object')
538
308745538