In [2]:
# Static content
import numpy as np
from sklearn.metrics import euclidean_distances
from sklearn import manifold
import pandas.io
def simple_vote(V):
    """Collapse a raw numeric vote code into a signed vote value.

    Args:
        V: Numeric vote code read from the roll-call file.

    Returns:
        1 for codes 1 through 3, -1 for codes above 3 up to 6, 0 for code 0
        and for codes above 6 up to 9, and ``np.nan`` for any other value.
        (Presumably yea / nay / not-voting respectively -- confirm against
        the data set's codebook.)
    """
    if V == 0:
        return 0
    if 1 <= V <= 3:
        return 1
    if 3 < V <= 6:
        return -1
    if 6 < V <= 9:
        return 0
    # Use the namespaced constant: a bare `nan` only exists when the kernel
    # has star-imported numpy (e.g. pylab mode) and is a NameError otherwise.
    return np.nan
# Root directory for input files; a relative path, so it is resolved against
# the kernel's current working directory.
datadir = "data"
# Roll-call vote records; read line by line and sliced at fixed column
# offsets in a later cell.
datafilename = "%s/congress/hou112kh.ord" % datadir
# CSV of per-vote descriptions; loaded with read_csv in a later cell.
dictionaryfilename = "%s/congress/h112desc.csv" % datadir
In [3]:
# Load the per-vote description table. pandas.read_csv is the documented
# public entry point; the pandas.io.parsers module path is an internal detail.
votedescriptions = pandas.read_csv(dictionaryfilename)
# str() keeps the join working even if a header cell was parsed as a number.
print("Columns: %s" % ", ".join(str(column) for column in votedescriptions.columns))
print("Number of votes: %d" % len(votedescriptions))
In [4]:
# Parse the fixed-width roll-call file into one tuple per non-blank line.
# Positions read by later cells: [3] is appended to duplicate names, [5] is
# the party code compared against '100' / '200', [7] is the name used as the
# dictionary key, and [-1] is the list of single-character vote codes.
data = []
vp = None
# The context manager closes the file even if a line fails to parse.
with open(datafilename, "r") as datafile:
    for line in datafile:
        if not line.strip():
            # A blank (e.g. trailing) line would otherwise become a member
            # with an empty name and no votes.
            continue
        record = (
            line[:3],
            line[3:8],
            line[8:10].strip(),
            line[10:12].strip(),
            line[12:20].strip(),
            line[20:23],
            line[23:25],
            line[25:36].strip(),
            list(line[36:].strip()),
        )
        # The first record whose third field is "99" is set aside in `vp`
        # instead of being treated as a member (presumably a non-member row;
        # confirm against the file's codebook). Later "99" rows are kept.
        if vp is None and record[2] == "99":
            vp = record
            continue
        data.append(record)
In [5]:
# Build two dictionaries keyed by a unique member name:
#   votes[name] -> numpy array of simplified votes (see simple_vote)
#   party[name] -> "DEM", "REP" or "OTH"
votes = {}
party = {}
party_labels = {'100': "DEM", "200": "REP"}
for person in data:
    voterecord = np.array([simple_vote(int(vote)) for vote in person[-1]])
    name = person[7]
    if name in votes:
        # First tie-breaker: append the record's fourth field.
        name += person[3]
    if name in votes:
        # Second tie-breaker: append the first unused counter. The previous
        # expression counted dictionary keys equal to `name`, which is always
        # 1, so a further collision silently overwrote an earlier member.
        base = name
        suffix = 1
        while "%s%d" % (base, suffix) in votes:
            suffix += 1
        name = "%s%d" % (base, suffix)
    votes[name] = voterecord
    # Any party code other than '100' / '200' is grouped as "OTH".
    party[name] = party_labels.get(person[5], "OTH")
In [6]:
# How many members carry each party label?
# dict.values() works on both Python 2 and 3 (iteritems() is Python 2 only),
# and list.count avoids depending on which `sum` is in the namespace.
party_values = list(party.values())
for name in ("DEM", "REP", "OTH"):
    print("%s: %d" % (name, party_values.count(name)))
In [7]:
# Verify array lengths. (The record set aside in `vp` is not part of `votes`,
# so it is not counted here.)
counts = [len(record) for record in votes.values()]
num_senators = len(votes)
num_votes = np.max(counts)
print("Max votes: %d" % num_votes)
print("Min votes: %d" % np.min(counts))
print("Senators: %d" % num_senators)
# The description table is expected to have one row per roll call.
if num_votes != len(votedescriptions):
    print("Missing descriptions")
In [8]:
# Make matrices.
# votematrix: one row per member, one column per roll call. np.zeros rather
# than np.ndarray so cells that are never written hold 0 instead of
# uninitialised memory.
votematrix = np.zeros((num_senators, num_votes))
senatorids = {}
idsenators = {}  # inverse of senatorids for convenience
total_votes = np.zeros((num_senators, 1))
# items() works on both Python 2 and 3 (iteritems() is Python 2 only).
for idx, (name, vote) in enumerate(votes.items()):
    senatorids[name] = idx
    idsenators[idx] = name
    # Records shorter than the longest one are left zero-padded instead of
    # raising a broadcasting error.
    votematrix[idx, :len(vote)] = vote
    # vote . vote is already a scalar: the number of entries equal to +1 or
    # -1. Wrapping it in sum() only worked with numpy's sum in scope.
    total_votes[idx, 0] = vote.dot(vote)

# Entry (i, j) is the dot product of member i's and member j's vote rows.
correlation = votematrix.dot(votematrix.transpose())
print("Correlation sample subset")
print(correlation[0:5, 0:5])
# Pairwise Euclidean distances between the rows of `correlation`.
distances = euclidean_distances(correlation)
print("\nDistances")
print(distances[0:5, 0:5])
In [9]:
# Example distances
def get_distance(sen1, sen2):
    """Return the entry of `distances` for the two members named `sen1` and `sen2`.

    Both names are translated to matrix indices through `senatorids`; an
    unknown name raises KeyError.
    """
    row = senatorids[sen1]
    col = senatorids[sen2]
    return distances[row, col]
print("Pelosi v Cantor")
print(get_distance('PELOSI', 'CANTOR'))
print("Ryan v Cantor")
print(get_distance('RYAN1', 'CANTOR'))


def _pairs_at(match):
    """Return "A & B" labels, one per unordered off-diagonal pair flagged in `match`."""
    rows, cols = np.where(match)
    # Normalise (i, j) / (j, i) to a single entry and drop the diagonal.
    unique_pairs = sorted({tuple(sorted((int(i), int(j))))
                           for i, j in zip(rows, cols) if i != j})
    return ["%s & %s" % (idsenators[i], idsenators[j]) for i, j in unique_pairs]


furthest = np.max(distances)
# Candidates for "closest" are the off-diagonal, non-NaN entries; the
# diagonal is each member's distance to themselves.
mask = ~np.eye(distances.shape[0], dtype=bool) & ~np.isnan(distances)
closest = np.min(distances[mask])

# np.where returns (row_indices, col_indices). Iterating that tuple directly
# walks the two index arrays, not the matching pairs, so it was only right
# for a single symmetric pair; halving the list with len(tmp)/2 is also a
# float slice on Python 3.
tmp = _pairs_at(distances == furthest)
print("Furthest (%f): %s" % (furthest, ",".join(tmp)))
tmp = _pairs_at((distances == closest) & mask)
print("Closest (%f): %s" % (closest, ",".join(tmp)))
In [10]:
# Calculate MDS: embed the members in two dimensions from the precomputed
# pairwise distance matrix. The RandomState is seeded so that repeated runs
# start from the same random state.
seed = np.random.RandomState(seed=3)
mds = manifold.MDS(
    n_components=2,
    max_iter=3000,
    eps=1e-9,
    random_state=seed,
    dissimilarity="precomputed",
    n_jobs=1,
)
pos = mds.fit_transform(distances)
In [11]:
# Plot MDS: one point per member, labelled with the member's name and
# coloured by party.
# plt.figure, matching the plt.scatter call below, instead of a bare
# `figure` that only exists after a star-import of pylab.
plt.figure(figsize=(24, 24))
plt.scatter(pos[:, 0], pos[:, 1], s=5, c='g')
party_colors = {"DEM": "blue", "REP": "red"}
for idx, loc in enumerate(pos):
    name = idsenators[idx]
    # Any label other than DEM / REP is drawn in green.
    color = party_colors.get(party[name], "green")
    plt.annotate(name, loc, color=color)
In [12]:
# Spot checks: distance from CANTOR to selected members, then the embedded
# coordinates of selected members.
print("Distances of CANTOR (%s) to:" % party['CANTOR'])
for senator in ('COHEN',):
    affiliation = party[senator]
    gap = get_distance('CANTOR', senator)
    print("%s (%s): %f" % (senator, affiliation, gap))
print("\n")
print("Example locations:")
for senator in ("BOEHNER",):
    coords = pos[senatorids[senator], :]
    print("Senator %s (%s) @ %s" % (senator, party[senator], coords))
In [13]:
# What's up with AMASH?
amash = senatorids['AMASH']
# total_votes has shape (n, 1); index the scalar explicitly rather than
# formatting a one-element array with %d.
print("AMASH voted %d times." % total_votes[amash, 0])
print("Average number of votes is %d." % np.mean(total_votes))
# np.flatnonzero gives the positions of the non-zero entries in order.
print("First vote (one indexed): %d" % (np.flatnonzero(votes['AMASH'])[0] + 1))
print("\n")

# Least voters. The cut-off previously existed twice (a printed value of 800
# and a hard-coded 400 in the filter); a single value now drives both. 400 is
# kept because it is what actually selected the rows.
threshold = 400
print("People voting less than %d times. First and last are 1-indexed." % threshold)
for idx in range(len(total_votes)):
    cast = total_votes[idx, 0]
    if cast >= threshold:
        continue
    name = idsenators[idx]
    indices = np.flatnonzero(votes[name])
    if indices.size == 0:
        # No non-zero entry at all: there is no first or last vote to report.
        print("%s (%s): Voted %d times." % (name, party[name], cast))
        continue
    print("%s (%s): Voted %d times. First on %d. Last on %d." % (
        name, party[name], cast, indices[0] + 1, indices[-1] + 1))
In [14]:
# Roll calls on which GIFFORDS has a non-zero entry, shown next to the vote.
voted = np.flatnonzero(votes['GIFFORDS'])
# .iloc selects by position; the .ix indexer was removed from pandas.
giffords = votedescriptions.iloc[voted]
# The vote column reuses giffords' index so concat(axis=1) lines each vote up
# with its own description row instead of aligning on a fresh 0..k index.
giffords_votes = pandas.DataFrame({"vote": votes['GIFFORDS'][voted]}, index=giffords.index)
pandas.concat([giffords, giffords_votes], axis=1)
Out[14]: