notebook.community

Edit and run



In [1]:

    
# Static content
import numpy as np
from sklearn.metrics import euclidean_distances
from sklearn import manifold
import pandas
import pandas.io


def simple_vote(V):
    """Collapse a single-digit roll-call code into a signed vote value.

    Mapping implemented here:
        0      -> 0
        1..3   -> 1
        4..6   -> -1   (strictly greater than 3, up to and including 6)
        7..9   -> 0    (strictly greater than 6, up to and including 9)
        other  -> NaN

    Presumably 1 / -1 stand for yea / nay and 0 for "did not vote" --
    confirm against the codebook of the .ord file.

    Parameters
    ----------
    V : int
        Numeric vote code.

    Returns
    -------
    int or float
        1, -1 or 0 for recognised codes, ``float("nan")`` otherwise.
    """
    if V == 0:
        return 0
    if 1 <= V <= 3:
        return 1
    if 3 < V <= 6:
        return -1
    if 6 < V <= 9:
        return 0
    # A bare `nan` only exists under a pylab-style star import; build the value
    # explicitly so an unexpected code yields NaN instead of a NameError.
    return float("nan")

# Input locations, all relative to the notebook's working directory.
datadir = "data"
# Per-roll-call description table (CSV).
dictionaryfilename = "%s/congress/s112desc.csv" % (datadir,)
# Fixed-width roll-call records, one line per member.
datafilename = "%s/congress/sen112kh.ord" % (datadir,)



In [2]:

    
# Load description
# Read the roll-call description table, then report its column names and the
# number of rows (one row per described vote).
votedescriptions = pandas.read_csv(dictionaryfilename)
column_listing = ", ".join(votedescriptions.columns)
print("Columns: %s" % column_listing)
print("Number of votes: %d" % votedescriptions.shape[0])









    



Columns: date, session, number, bill, question, result, description, yeatotal, naytotal 
Number of votes: 486



In [3]:

    
# Parse the fixed-width roll-call file into one record per line.
# Characters 0-35 of each line hold the identifying fields sliced below;
# everything from character 36 onwards is read as one vote-code digit per
# roll call.
records = []
omitted = []
with open(datafilename, "r") as rollcall_file:  # closed even if parsing fails
    for line in rollcall_file:
        if not line.strip():
            # A blank (e.g. trailing) line would otherwise become an empty record.
            continue
        voterecord = np.array([simple_vote(int(v)) for v in line[36:].strip()])
        # Every recorded yea/nay is +/-1, so the non-zero count is the number
        # of votes actually cast.
        total_votes = np.count_nonzero(voterecord)
        record = (
            line[:3],               # session
            line[3:8],              # icpsr
            line[8:10].strip(),     # stateid
            line[10:12].strip(),    # district
            line[12:20].strip(),    # state
            line[20:23],            # party code
            line[23:25],            # occupancy
            line[25:36].strip(),    # name
            voterecord,
            total_votes,
        )
        if record[2] == "99":
            # State id "99" rows are kept aside rather than analysed --
            # presumably the non-senator (president / VP) entries; confirm
            # against the data codebook.
            omitted.append(record)
            continue
        records.append(record)
columns = ('session', 'icpsr', 'stateid', 'district', 'state', 'party', 'occupancy', 'name', 'votes', 'total_votes')
data = pandas.DataFrame.from_records(records, columns=columns)
# Translate numeric party codes into short labels; other codes pass through.
data.party = data.party.replace({"100": "DEM", "200": "REP", "328": "IND"})
votes = data[['name','votes']]
party = data[['name','party']]
data.head()









    Out[3]:






  
    
      
      session
      icpsr
      stateid
      district
      state
      party
      occupancy
      name
      votes
      total_votes
    
  
  
    
      0
       112
       49700
       41
       0
       ALABAMA
       REP
       01
        SESSIONS
       [1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, 1, 1, ...
       479
    
    
      1
       112
       94659
       41
       0
       ALABAMA
       REP
       01
          SHELBY
       [1, 1, 1, -1, -1, -1, -1, 1, 1, 1, -1, 1, 1, 1...
       483
    
    
      2
       112
       40300
       81
       0
        ALASKA
       REP
       01
       MURKOWSKI
       [1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1,...
       462
    
    
      3
       112
       40900
       81
       0
        ALASKA
       DEM
       01
          BEGICH
       [1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, 1...
       476
    
    
      4
       112
       15429
       61
       0
       ARIZONA
       REP
       01
             KYL
       [1, 1, 1, -1, -1, -1, -1, 1, 1, 1, -1, 1, 1, 1...
       482



In [4]:

    
# Number of members carrying each party label. (The previous bare
# `party.groupby('party')` statement built a groupby object and discarded it.)
print(party.groupby('party').count())









    



       name
party      
DEM      53
IND       1
REP      48



In [5]:

    
# Verify array lengths. (Senators excludes VP)
# Every member's vote array should have the same length, and that length
# should equal the number of rows in the description table.
counts = [len(member_votes) for member_votes in votes['votes']]
num_senators = votes.shape[0]
num_votes = np.max(counts)
print("Max votes: %d" % num_votes)
print("Min votes: %d" % np.min(counts))
print("Senators: %d" % num_senators)
if len(votedescriptions) != num_votes:
    print("Missing discriptions")









    



Max votes: 486
Min votes: 486
Senators: 102



In [6]:

    
# Make Matrices
# Stack the per-member vote arrays into a (members x roll calls) float matrix.
# Building it in one call avoids starting from an uninitialised np.ndarray and
# the label-based `votes.votes[idx]` lookups of a manual index loop.
votematrix = np.array(votes['votes'].tolist(), dtype=float)
assert votematrix.shape == (num_senators, num_votes), "unexpected vote matrix shape"
print(votematrix[0:5,0:5])
# NOTE: despite the name this is the plain matrix of row inner products
# (no centring or normalisation is applied); entry [i, j] sums the products of
# member i's and member j's vote values over all roll calls.
correlation = votematrix.dot(votematrix.transpose())

print("Correlation sample subset")
print(correlation[0:5,0:5])

# Pairwise Euclidean distances between the rows of `correlation`, i.e. members
# are compared by their whole agreement profile against everyone else.
distances = euclidean_distances(correlation)
print("\nDistances")
print(distances[0:5, 0:5])









    



[[ 1.  1. -1. -1. -1.]
 [ 1.  1.  1. -1. -1.]
 [ 1.  1.  1. -1. -1.]
 [ 1.  1.  1.  1.  1.]
 [ 1.  1.  1. -1. -1.]]
Correlation sample subset
[[ 479.  380.  167.  -81.  325.]
 [ 380.  483.  191.  -43.  319.]
 [ 167.  191.  462.  143.  214.]
 [ -81.  -43.  143.  476.  -72.]
 [ 325.  319.  214.  -72.  482.]]

Distances
[[    0.           327.02599285  1717.41841145  4113.06151182   332.6920498 ]
 [  327.02599285     0.          1472.44083073  3880.55292452
    338.06804049]
 [ 1717.41841145  1472.44083073     0.          2493.64813075
   1562.20997308]
 [ 4113.06151182  3880.55292452  2493.64813075     0.          3976.1570643 ]
 [  332.6920498    338.06804049  1562.20997308  3976.1570643      0.        ]]



In [7]:

    
# Example distances
def get_distance(sen1, sen2):
    """Look up the entry of `distances` for two members identified by name.

    Each name is resolved to the index label of the first row of `data`
    whose `name` column equals it; an IndexError is raised if a name has no
    matching row.
    """
    row, column = [
        data.index[(data.name == who).values][0] for who in (sen1, sen2)
    ]
    return distances[row, column]

print("McCain v Boxer")
print(get_distance('MCCAIN', 'BOXER'))
print("McCain v Graham")
print(get_distance('MCCAIN', 'GRAHAM'))

# Boolean mask selecting everything except the diagonal, so the zero
# self-distances do not win the "closest" search.
mask = ~np.identity(distances.shape[0], dtype=bool)
furthest = distances.max()
closest = distances[mask].min()
for (key, val) in (("Furthest", furthest), ("Closest", closest)):
    # np.argwhere yields one (row, col) pair per matching cell. Each unordered
    # pair shows up as both (i, j) and (j, i), so normalise to (low, high) and
    # de-duplicate; i == j is dropped so a self-distance is never reported.
    pair_indices = sorted({
        (min(i, j), max(i, j))
        for (i, j) in np.argwhere(distances == val)
        if i != j
    })
    tmp = ["%s & %s" % (data.name.iloc[i], data.name.iloc[j]) for (i, j) in pair_indices]
    print("%s (%f): %s" % (key, val, ",".join(tmp)))









    



McCain v Boxer
3883.36735321
McCain v Graham
404.351332383
Furthest (4929.552211): CARDIN & LEE
Closest (33.090784): ENZI & BARASSO



In [8]:

    
# Calculate MDS
# Two-dimensional embedding computed from the precomputed distance matrix.
# A RandomState built from a fixed seed is handed to the estimator so the
# layout is repeatable between runs.
seed = np.random.RandomState(seed=3)
mds_options = dict(
    n_components=2,
    max_iter=3000,
    eps=1e-9,
    random_state=seed,
    dissimilarity="precomputed",
    n_jobs=1,
)
mds = manifold.MDS(**mds_options)
pos = mds.fit_transform(distances)



In [9]:

    
# Plot MDS
# NOTE(review): `plt` is not imported in any visible cell; this cell needs
# `import matplotlib.pyplot as plt` (or a pylab namespace) to be in effect.
# NOTE: the flip below mutates `pos` in place and later cells print the flipped
# coordinates, so running this cell twice flips the axis back.
pos[:,0] = -pos[:,0]  # Flip coordinate to put DEMs on the left.
plt.figure(figsize=(24, 24))
plt.scatter(pos[:, 0], pos[:, 1], s=5, c='g')
# Label colour per party; any label not listed falls back to green.
party_colors = {"DEM": "blue", "REP": "red"}
for (loc, name, p) in zip(pos, data['name'], data['party']):
    plt.annotate(name, loc, color=party_colors.get(p, "green"))



In [10]:

    
def _party_of(senator):
    """Return the party label of the first row of `party` with this name."""
    return party.party[party['name'] == senator].iloc[0]


# Distances from SANDERS to a hand-picked set of members.
print("Distances of SANDERS (%s) to:" % _party_of('SANDERS'))
for senator in ('BOXER', 'SHELBY', "HAGAN", "MCCASKILL", "MCCONNELL", "RISCH", "SCHATZ"):
    affiliation = _party_of(senator)
    separation = get_distance('SANDERS', senator)
    print("%s (%s): %f" % (senator, affiliation, separation))

print("\n")
print("Example locations:")
# Embedded coordinates of a few members, read from the row of `pos` that
# matches the member's index label in `data`.
for senator in ("CARDIN", "LEE", "RISCH", "DEMINT", "LAUTENBERG", "SCHATZ"):
    rowid = data.index[(data.name == senator).values][0]
    coordinates = pos[rowid, :]
    print("Senator %s (%s) @ %s" % (senator, _party_of(senator), coordinates))









    



Distances of SANDERS (IND) to:
BOXER (DEM): 301.222509
SHELBY (REP): 3976.619167
HAGAN (DEM): 618.792372
MCCASKILL (DEM): 890.065728
MCCONNELL (REP): 4156.430319
RISCH (REP): 4429.106456
SCHATZ (DEM): 2581.076326


Example locations:
Senator CARDIN (DEM) @ [ -585.53784642 -1937.93283201]
Senator LEE (REP) @ [ 1983.16518193  2285.93031216]
Senator RISCH (REP) @ [ 1123.92837614  2422.08895471]
Senator DEMINT (REP) @ [ 2022.54314406  2238.39561642]
Senator LAUTENBERG (DEM) @ [ -384.01192325 -1764.07782411]
Senator SCHATZ (DEM) @ [ 1466.46972464   149.75082168]



In [11]:

    
# What's up with Schatz?
schatz = data[data.name == "SCHATZ"]
# Position of the first non-zero entry in Schatz's vote array.
first = np.min(np.where(schatz.votes.values[0] != 0))
# `schatz` is a one-row frame: pull the scalar out explicitly instead of
# handing a Series to "%d" and relying on single-element Series-to-int coercion.
print("Schatz voted %d times." % schatz.total_votes.iloc[0])
print("Average number of votes is %d." % np.mean(data.total_votes))
print("First vote (one indexed): %d" % (first + 1))
# The label now spells the name the same way as the lookup ('INOUYE').
print("Distance to INOUYE (%s): %d" % (party.party[party.name == 'INOUYE'].iloc[0], get_distance('SCHATZ', 'INOUYE')))
print("\n")


# Least Voters
# Members whose non-zero vote count is below the threshold, with the 1-indexed
# positions of their first and last non-zero entries.
threshold = 400
print("People voting less than %d times. First and last are 1-indexed." % threshold)
for (index, row) in data.iterrows():
    if row.total_votes < threshold:
        realvotes = np.where(row.votes != 0)[0]
        first = np.min(realvotes) + 1
        last = np.max(realvotes) + 1
        # row['name'] rather than row.name: the attribute is the row's index label.
        print("%s (%s): Voted %d times. First on %d. Last on %d." % (row['name'], row.party, row.total_votes, first, last))
print("\n")

# Good voters
# Members whose non-zero vote count equals the number of roll calls.
print("These people made every vote.")
for (index, row) in data[data.total_votes == num_votes].iterrows():
    print("%s (%s)" % (row['name'], row.party))









    



Schatz voted 20 times.
Average number of votes is 461.
First vote (one indexed): 467
Distance to INOUTE (DEM): 2564


People voting less than 400 times. First and last are 1-indexed.
SCHATZ (DEM): Voted 20 times. First on 467. Last on 486.
KIRK (REP): Voted 223 times. First on 1. Last on 235.
ENSIGN (REP): Voted 57 times. First on 1. Last on 61.
HELLER (REP): Voted 390 times. First on 67. Last on 486.


These people made every vote.
GRASSLEY (REP)
MCCONNELL (REP)
COLLINS (REP)
LEVIN  CARL (DEM)

	session	icpsr	stateid	state	party	occupancy	name	votes	total_votes
0	112	49700	41	ALABAMA	REP	01	SESSIONS	[1, 1, -1, -1, -1, -1, -1, 1, 1, 1, -1, 1, 1, ...	479
1	112	94659	41	ALABAMA	REP	01	SHELBY	[1, 1, 1, -1, -1, -1, -1, 1, 1, 1, -1, 1, 1, 1...	483
2	112	40300	81	ALASKA	REP	01	MURKOWSKI	[1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 1, 1,...	462
3	112	40900	81	ALASKA	DEM	01	BEGICH	[1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, 1...	476
4	112	15429	61	ARIZONA	REP	01	KYL	[1, 1, 1, -1, -1, -1, -1, 1, 1, 1, -1, 1, 1, 1...	482