In [1]:
# Static content
import matplotlib.pyplot as plt
import numpy as np
import pandas
import pandas.io
from sklearn import manifold
from sklearn.metrics import euclidean_distances


def simple_vote(V):
    """Collapse a single-digit roll-call code into -1, 0 or 1.

    Parameters
    ----------
    V : int
        Vote code taken from one character of the roll-call record.
        NOTE(review): presumably 1-3 are yea-type codes, 4-6 nay-type codes and
        0 / 7-9 mean no vote was cast -- confirm against the data file's codebook.

    Returns
    -------
    int or float
        1 for 1..3, -1 for 4..6, 0 for 0 and 7..9, NaN for anything else.
    """
    if 1 <= V <= 3:
        return 1
    if 3 < V <= 6:
        return -1
    if V == 0 or 6 < V <= 9:
        return 0
    # The bare name `nan` is only defined when the kernel runs in pylab mode;
    # float("nan") yields the same value without depending on that.
    return float("nan")

# Input locations: the roll-call matrix and the per-vote description table,
# both resolved relative to `datadir`.
datadir = "data"
datafilename, dictionaryfilename = (
    template % datadir
    for template in ("%s/congress/sen112kh.ord", "%s/congress/s112desc.csv")
)

In [2]:
# Read the vote description table and summarise its columns and row count.
votedescriptions = pandas.read_csv(dictionaryfilename)
column_names = ", ".join(votedescriptions.columns)
print("Columns: %s" % column_names)
print("Number of votes: %d" % votedescriptions.shape[0])


Columns: date, session, number, bill, question, result, description, yeatotal, naytotal 
Number of votes: 486

In [3]:
# Parse the fixed-width roll-call file: the first 36 characters of each line
# hold the member's identifying fields, the remainder holds one vote digit per
# roll call.
data = []
omitted = []
# `with` guarantees the file handle is closed once parsing finishes.
with open(datafilename, "r") as rollcall_file:
    for line in rollcall_file:
        voterecord = np.array([simple_vote(int(v)) for v in line[36:].strip()])
        # Non-zero entries are the roll calls mapped to +1 / -1 by simple_vote.
        total_votes = np.count_nonzero(voterecord)
        record = (
            line[:3],             # session
            line[3:8],            # icpsr
            line[8:10].strip(),   # stateid
            line[10:12].strip(),  # district
            line[12:20].strip(),  # state
            line[20:23],          # party
            line[23:25],          # occupancy
            line[25:36].strip(),  # name
            voterecord,
            total_votes,
        )
        # Records with stateid "99" are kept aside in `omitted` rather than
        # analysed with the rest.
        if record[2] == "99":
            omitted.append(record)
            continue
        data.append(record)
columns = ('session', 'icpsr', 'stateid', 'district', 'state', 'party', 'occupancy', 'name', 'votes', 'total_votes')
data = pandas.DataFrame.from_records(data, columns=columns)
# Translate the numeric party codes into readable labels.
data.party = data.party.replace({"100": "DEM", "200": "REP", "328": "IND"})
votes = data[['name','votes']]
party = data[['name','party']]
data.head()


Out[3]:
session icpsr stateid district state party occupancy name votes total_votes
0 112 49700 41 0 ALABAMA REP 01 SESSIONS [1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, 1, 1, ... 479
1 112 94659 41 0 ALABAMA REP 01 SHELBY [1, 1, 1, -1, -1, -1, -1, 1, 1, 1, -1, 1, 1, 1... 483
2 112 40300 81 0 ALASKA REP 01 MURKOWSKI [1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1,... 462
3 112 40900 81 0 ALASKA DEM 01 BEGICH [1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, 1... 476
4 112 15429 61 0 ARIZONA REP 01 KYL [1, 1, 1, -1, -1, -1, -1, 1, 1, 1, -1, 1, 1, 1... 482

In [4]:
# Number of members per party label.
print(party.groupby('party').count())


       name
party      
DEM      53
IND       1
REP      48

In [5]:
# Verify array lengths. (Senators excludes VP)
# Every member's vote vector should have one entry per described roll call.
counts = [len(ballots) for ballots in votes['votes']]
num_senators = len(votes)
num_votes = np.max(counts)
print("Max votes: %d" % num_votes)
print("Min votes: %d" % np.min(counts))
print("Senators: %d" % num_senators)
if num_votes != len(votedescriptions):
    print("Missing discriptions")


Max votes: 486
Min votes: 486
Senators: 102

In [6]:
# Make Matrices
# Stack every member's vote vector into one float array, one row per member.
votematrix = np.array(votes.votes.tolist(), dtype=float)
print(votematrix[0:5,0:5])

# Pairwise dot products of the vote rows.
correlation = np.dot(votematrix, votematrix.T)
print("Correlation sample subset")
print(correlation[0:5,0:5])

# Euclidean distance between the rows of the dot-product matrix.
distances = euclidean_distances(correlation)
print("\nDistances")
print(distances[0:5, 0:5])


[[ 1.  1. -1. -1. -1.]
 [ 1.  1.  1. -1. -1.]
 [ 1.  1.  1. -1. -1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1. -1. -1.]]
Correlation sample subset
[[ 479.  380.  167.  -81.  325.]
 [ 380.  483.  191.  -43.  319.]
 [ 167.  191.  462.  143.  214.]
 [ -81.  -43.  143.  476.  -72.]
 [ 325.  319.  214.  -72.  482.]]

Distances
[[    0.           327.02599285  1717.41841145  4113.06151182   332.6920498 ]
 [  327.02599285     0.          1472.44083073  3880.55292452
    338.06804049]
 [ 1717.41841145  1472.44083073     0.          2493.64813075
   1562.20997308]
 [ 4113.06151182  3880.55292452  2493.64813075     0.          3976.1570643 ]
 [  332.6920498    338.06804049  1562.20997308  3976.1570643      0.        ]]

In [7]:
# Example distances
def get_distance(sen1, sen2):
    """Return the entry of `distances` for the two named members.

    Each name is resolved to the index label of its first matching row in
    `data`; a name with no matching row raises IndexError.
    """
    row, column = (data.index[data.name == senator][0] for senator in (sen1, sen2))
    return distances[row, column]

print("McCain v Boxer")
print(get_distance('MCCAIN', 'BOXER'))
print("McCain v Graham")
print(get_distance('MCCAIN', 'GRAHAM'))

# Boolean mask selecting every off-diagonal cell, so the zero self-distances on
# the diagonal cannot win the "closest" search.
mask = ~np.eye(distances.shape[0], dtype=bool)
furthest = distances.max()
closest = distances[mask].min()
for (key, val) in (("Furthest", furthest), ("Closest", closest)):
    # argwhere yields real (row, column) pairs; each pair shows up twice in the
    # distance matrix, so keep only row < column to list it once.
    pairs = [(row, col) for (row, col) in np.argwhere(distances == val) if row < col]
    tmp = ["%s & %s" % (data['name'][row], data['name'][col]) for (row, col) in pairs]
    print("%s (%f): %s" % (key, val, ",".join(tmp)))


McCain v Boxer
3883.36735321
McCain v Graham
404.351332383
Furthest (4929.552211): CARDIN & LEE
Closest (33.090784): ENZI & BARASSO

In [8]:
# Calculate MDS
# A fixed RandomState keeps the 2-D embedding the same from run to run.
seed = np.random.RandomState(seed=3)
mds_settings = dict(
    n_components=2,
    max_iter=3000,
    eps=1e-9,
    random_state=seed,
    dissimilarity="precomputed",  # `distances` is supplied as a ready-made matrix
    n_jobs=1,
)
mds = manifold.MDS(**mds_settings)
pos = mds.fit_transform(distances)

In [9]:
# Plot MDS
# Rebuild `pos` from the fitted embedding instead of negating it in place, so
# re-running this cell always gives the same orientation.
pos = mds.embedding_ * np.array([-1.0, 1.0])  # Flip x to put DEMs on the left.
plt.figure(figsize=(24, 24))
plt.scatter(pos[:, 0], pos[:, 1], s=5, c='g')
# Label colour per party; any other party label falls back to green.
party_colors = {"DEM": "blue", "REP": "red"}
for (loc, name, p) in zip(pos, data['name'], data['party']):
    plt.annotate(name, loc, color=party_colors.get(p, "green"))



In [10]:
def _party_of(senator):
    """Return the party label of the first row in `party` matching the name."""
    return party.party[party['name'] == senator].iloc[0]


# Distances from SANDERS to a hand-picked set of members.
print("Distances of SANDERS (%s) to:" % _party_of('SANDERS'))
for senator in ('BOXER', 'SHELBY', "HAGAN", "MCCASKILL", "MCCONNELL", "RISCH", "SCHATZ"):
    separation = get_distance('SANDERS', senator)
    print("%s (%s): %f" % (senator, _party_of(senator), separation))

print("\n")
print("Example locations:")
# Embedding coordinates for a few members.
for senator in ("CARDIN", "LEE", "RISCH", "DEMINT", "LAUTENBERG", "SCHATZ"):
    rowid = data[data.name == senator].index.tolist()[0]
    print("Senator %s (%s) @ %s" % (senator, _party_of(senator), pos[rowid,:]))


Distances of SANDERS (IND) to:
BOXER (DEM): 301.222509
SHELBY (REP): 3976.619167
HAGAN (DEM): 618.792372
MCCASKILL (DEM): 890.065728
MCCONNELL (REP): 4156.430319
RISCH (REP): 4429.106456
SCHATZ (DEM): 2581.076326


Example locations:
Senator CARDIN (DEM) @ [ -585.53784642 -1937.93283201]
Senator LEE (REP) @ [ 1983.16518193  2285.93031216]
Senator RISCH (REP) @ [ 1123.92837614  2422.08895471]
Senator DEMINT (REP) @ [ 2022.54314406  2238.39561642]
Senator LAUTENBERG (DEM) @ [ -384.01192325 -1764.07782411]
Senator SCHATZ (DEM) @ [ 1466.46972464   149.75082168]

In [11]:
# What's up with Schatz?
schatz = data[data.name == "SCHATZ"]
# Position of the first roll call with a non-zero (i.e. recorded) vote.
first = np.min(np.where(schatz.votes.values[0] != 0))
# `schatz` is a one-row frame, so pull the scalar out explicitly rather than
# relying on implicit Series-to-int conversion inside "%d".
print("Schatz voted %d times." % schatz.total_votes.iloc[0])
print("Average number of votes is %d." % np.mean(data.total_votes))
print("First vote (one indexed): %d" % (first + 1))
# The label now spells the name the same way as the lookup key.
print("Distance to INOUYE (%s): %d" % (party.party[party.name == 'INOUYE'].iloc[0], get_distance('SCHATZ', 'INOUYE')))
print("\n")


# Least Voters
threshold = 400
print("People voting less than %d times. First and last are 1-indexed." % threshold)
for (index, row) in data.iterrows():
    if row.total_votes < threshold:
        # Indices of the roll calls on which this member has a non-zero entry.
        realvotes = np.where(row.votes != 0)
        first = np.min(realvotes) + 1
        last = np.max(realvotes) + 1
        print("%s (%s): Voted %d times. First on %d. Last on %d." % (row['name'], row.party, row.total_votes, first, last))
print("\n")

# Good voters
print("These people made every vote.")
# Members whose non-zero vote count equals the number of roll calls.
for (index, row) in data[data.total_votes == num_votes].iterrows():
    print("%s (%s)" % (row['name'], row.party))


Schatz voted 20 times.
Average number of votes is 461.
First vote (one indexed): 467
Distance to INOUTE (DEM): 2564


People voting less than 400 times. First and last are 1-indexed.
SCHATZ (DEM): Voted 20 times. First on 467. Last on 486.
KIRK (REP): Voted 223 times. First on 1. Last on 235.
ENSIGN (REP): Voted 57 times. First on 1. Last on 61.
HELLER (REP): Voted 390 times. First on 67. Last on 486.


These people made every vote.
GRASSLEY (REP)
MCCONNELL (REP)
COLLINS (REP)
LEVIN  CARL (DEM)