In [1]:
# Static content
import numpy as np
from sklearn.metrics import euclidean_distances
from sklearn import manifold
import pandas
import pandas.io
def simple_vote(V):
    """Collapse a raw vote code into a signed vote.

    Parameters
    ----------
    V : int
        Vote code as read from the roll-call file (one digit per vote).

    Returns
    -------
    int or float
        ``1`` for codes 1-3, ``-1`` for codes 4-6, ``0`` for code 0 and for
        codes 7-9, and ``nan`` for anything outside the 0-9 range.
        (Presumably +1/-1 are yea/nay and 0 is "no vote cast" -- confirm
        against the data dictionary.)
    """
    if 1 <= V <= 3:
        return 1
    if 3 < V <= 6:
        return -1
    if V == 0 or 6 < V <= 9:
        return 0
    # The bare name `nan` only exists when the kernel runs in pylab mode;
    # reference NumPy's constant explicitly so a fresh kernel does not raise
    # NameError on an out-of-range code.
    return np.nan
# Input locations, all relative to a single data directory.
datadir = "data"
datafilename, dictionaryfilename = (
    "%s/congress/sen112kh.ord" % datadir,   # roll-call records read in the parsing cell
    "%s/congress/s112desc.csv" % datadir,   # per-vote description table
)
In [2]:
# Load the per-vote description table and report its shape.
votedescriptions = pandas.read_csv(dictionaryfilename)
description_columns = ", ".join(votedescriptions.columns)
description_count = len(votedescriptions)
print("Columns: %s" % description_columns)
print("Number of votes: %d" % description_count)
In [3]:
# Parse the fixed-width roll-call file: one line per member, header fields in
# the first 36 characters followed by one vote-code digit per roll call.
records = []
omitted = []
# `with` guarantees the file handle is closed once parsing is done.
with open(datafilename, "r") as rollcall:
    for line in rollcall:
        if not line.strip():
            # A blank (e.g. trailing) line would otherwise become a member
            # with a zero-length vote array and break the matrix step.
            continue
        voterecord = np.array([simple_vote(int(v)) for v in line[36:].strip()])
        # Non-zero entries are the roll calls on which a +1/-1 vote was recorded.
        total_votes = np.count_nonzero(voterecord)
        record = (
            line[:3],               # session
            line[3:8],              # icpsr
            line[8:10].strip(),     # stateid
            line[10:12].strip(),    # district
            line[12:20].strip(),    # state
            line[20:23],            # party (numeric code, relabelled below)
            line[23:25],            # occupancy
            line[25:36].strip(),    # name
            voterecord,             # votes
            total_votes,            # total_votes
        )
        if record[2] == "99":
            # State code 99 is kept aside rather than analysed -- presumably
            # the non-senator (President/VP) row; confirm against the codebook.
            omitted.append(record)
            continue
        records.append(record)

columns = ('session', 'icpsr', 'stateid', 'district', 'state', 'party', 'occupancy', 'name', 'votes', 'total_votes')
data = pandas.DataFrame.from_records(records, columns=columns)
# Replace the numeric party codes with readable labels.
data["party"] = data["party"].replace({"100": "DEM", "200": "REP", "328": "IND"})
votes = data[['name', 'votes']]
party = data[['name', 'party']]
data.head()
Out[3]:
In [4]:
# Number of members per party label.
print(party.groupby('party').count())
In [5]:
# Verify array lengths. (Senators excludes VP)
# Every member's vote array must have the same length for the matrix step
# that follows, and that length should match the description table.
counts = [len(member_votes) for member_votes in votes['votes']]
num_senators = len(votes)
num_votes = np.max(counts)
min_votes = np.min(counts)
print("Max votes: %d" % num_votes)
print("Min votes: %d" % min_votes)
print("Senators: %d" % num_senators)
if min_votes != num_votes:
    # Ragged records cannot be stacked into a rectangular vote matrix.
    print("Vote records differ in length")
if num_votes != len(votedescriptions):
    print("Missing descriptions")
In [6]:
# Make Matrices
# Stack the per-member vote arrays into a (members x roll calls) matrix.
# np.vstack replaces filling an uninitialised np.ndarray row by row; the cast
# keeps the float dtype the original matrix had.
votematrix = np.vstack(votes['votes'].tolist()).astype(float)
print(votematrix[0:5, 0:5])

# Despite the name this is the raw Gram matrix (pairwise dot products of vote
# rows), not a normalised correlation coefficient.
correlation = votematrix.dot(votematrix.transpose())
print("Correlation sample subset")
print(correlation[0:5, 0:5])

# Pairwise Euclidean distance between the members' rows of that matrix.
distances = euclidean_distances(correlation)
print("\nDistances")
print(distances[0:5, 0:5])
In [7]:
# Example distances
def get_distance(sen1, sen2):
    """Return the precomputed distance between two members, looked up by name.

    Each name is matched against the ``name`` column of ``data``; the index
    label of the first match is used as the row/column into ``distances``.
    Raises ``IndexError`` when a name has no match.
    """
    first_label = data.loc[data.name == sen1].index[0]
    second_label = data.loc[data.name == sen2].index[0]
    return distances[first_label, second_label]
print("McCain v Boxer")
print(get_distance('MCCAIN', 'BOXER'))
print("McCain v Graham")
print(get_distance('MCCAIN', 'GRAHAM'))

# Extremes of the distance matrix. The diagonal (each member against
# themselves) is masked out so it cannot win "closest".
furthest = distances.max()
mask = np.identity(distances.shape[0]) == 0
closest = distances[mask].min()

for (key, val) in (("Furthest", furthest), ("Closest", closest)):
    # np.where returns (row_indices, col_indices); zip them into pairs and
    # keep row < col so each symmetric pair is listed once and the diagonal
    # never appears.
    rows, cols = np.where(distances == val)
    tmp = ["%s & %s" % (data['name'].iloc[r], data['name'].iloc[c])
           for r, c in zip(rows, cols) if r < c]
    print("%s (%f): %s" % (key, val, ",".join(tmp)))
In [8]:
# Calculate MDS
# Embed the members in two dimensions from the precomputed distance matrix,
# with a fixed random state so the layout is repeatable.
seed = np.random.RandomState(seed=3)
mds_options = dict(
    n_components=2,
    max_iter=3000,
    eps=1e-9,
    random_state=seed,
    dissimilarity="precomputed",
    n_jobs=1,
)
mds = manifold.MDS(**mds_options)
pos = mds.fit_transform(distances)
In [9]:
# Plot MDS
# `figure` and `plt` were previously picked up from pylab-mode globals; import
# pyplot explicitly so the cell also runs on a fresh kernel.
import matplotlib.pyplot as plt

# Put DEMs on the left. The flip is applied only when DEMs currently sit to
# the right of REPs, so re-running this cell does not flip the layout back.
is_dem = (data['party'] == "DEM").values
is_rep = (data['party'] == "REP").values
if is_dem.any() and is_rep.any() and pos[is_dem, 0].mean() > pos[is_rep, 0].mean():
    pos[:, 0] = -pos[:, 0]

plt.figure(figsize=(24, 24))
plt.scatter(pos[:, 0], pos[:, 1], s=5, c='g')

# Label every point with the member's name, coloured by party; any party
# other than DEM/REP is drawn in green.
party_colors = {"DEM": "blue", "REP": "red"}
for loc, name, p in zip(pos, data['name'], data['party']):
    plt.annotate(name, loc, color=party_colors.get(p, "green"))
In [10]:
def _party_of(senator):
    """Return the party label of the first row whose name matches `senator`."""
    return party.party[party['name'] == senator].iloc[0]


# Distances from SANDERS to a hand-picked set of colleagues.
print("Distances of SANDERS (%s) to:" % _party_of('SANDERS'))
for senator in ('BOXER', 'SHELBY', "HAGAN", "MCCASKILL", "MCCONNELL", "RISCH", "SCHATZ"):
    affiliation = _party_of(senator)
    gap = get_distance('SANDERS', senator)
    print("%s (%s): %f" % (senator, affiliation, gap))
print("\n")

# Coordinates of a few members in the MDS embedding.
print("Example locations:")
for senator in ("CARDIN", "LEE", "RISCH", "DEMINT", "LAUTENBERG", "SCHATZ"):
    rowid = data[data.name == senator].index.tolist()[0]
    affiliation = _party_of(senator)
    print("Senator %s (%s) @ %s" % (senator, affiliation, pos[rowid, :]))
In [11]:
# What's up with Schatz?
schatz = data[data.name == "SCHATZ"]
schatz_votes = schatz['votes'].iloc[0]
# Position of the first roll call with a non-zero (recorded) vote.
first = np.min(np.where(schatz_votes != 0))
# Pull the scalar out of the one-row selection instead of formatting a Series.
print("Schatz voted %d times." % schatz['total_votes'].iloc[0])
print("Average number of votes is %d." % np.mean(data.total_votes))
print("First vote (one indexed): %d" % (first + 1))
# The label now matches the member actually looked up (INOUYE), and the
# distance is printed as a float like the other distance reports.
print("Distance to INOUYE (%s): %f" % (party.party[party.name == 'INOUYE'].iloc[0], get_distance('SCHATZ', 'INOUYE')))
print("\n")

# Least Voters
threshold = 400
print("People voting less than %d times. First and last are 1-indexed." % threshold)
for (index, row) in data[data.total_votes < threshold].iterrows():
    realvotes = np.where(row['votes'] != 0)[0]
    if realvotes.size == 0:
        # No recorded vote at all: there is no first/last position to report.
        print("%s (%s): Voted 0 times." % (row['name'], row['party']))
        continue
    first = realvotes.min() + 1
    last = realvotes.max() + 1
    print("%s (%s): Voted %d times. First on %d. Last on %d." % (row['name'], row['party'], row['total_votes'], first, last))
print("\n")

# Good voters
print("These people made every vote.")
for (index, row) in data[data.total_votes == num_votes].iterrows():
    print("%s (%s)" % (row['name'], row['party']))