The aim of this notebook is to compute the voting profile of each member of parliament, i.e. to record how each person voted on each item put to a vote. We then investigate voting similarities by representing each person as a vector built from their votes, and by modelling the similarity between two people as the distance between their vectors. Finally, we compute and save a distance matrix containing the distance between every pair of people.
In [ ]:
import pandas as pd
import glob
import os
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import sklearn.ensemble
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split, cross_val_predict, learning_curve
import sklearn.metrics
%matplotlib inline
%load_ext autoreload
%autoreload 2
# The DataFrame has many columns, so raise the pandas display limit
# to let notebook output show up to 100 of them.
pd.options.display.max_columns = 100
In [ ]:
# Load the per-vote records (one row per person and vote) with their topic columns.
path = '../../datas/nlp_results/'
voting_df = pd.read_csv(path+'voting_with_topics.csv')
print('Entries in the DataFrame',voting_df.shape)
# Drop the 'Unnamed: 0' column (presumably an index saved by an earlier to_csv).
# The axis is passed by keyword: pandas 2.0 no longer accepts it positionally.
voting_df = voting_df.drop('Unnamed: 0', axis=1)
# Force numeric dtypes on the decision column and on every topic column.
# The leading/trailing spaces in the names are part of the CSV header.
num_cols = ['Decision', ' armée', ' asile / immigration', ' assurances', ' budget', ' dunno', ' entreprise/ finance',
            ' environnement', ' famille / enfants', ' imposition', ' politique internationale', ' retraite ']
voting_df[num_cols] = voting_df[num_cols].apply(pd.to_numeric)
# Insert the full name ("FirstName LastName") as the third column (position 2).
voting_df.insert(2,'Name', voting_df['FirstName'] + ' ' + voting_df['LastName'])
voting_df.head(3)
In [ ]:
# Keep one row per (text, person) pair: when a person has several rows for the
# same text, only the last one in file order is kept.
voting_df_copy = voting_df.drop_duplicates(['text', 'Name'], keep = 'last')
In [ ]:
# Distinct voted texts and distinct people, each in order of first appearance
# in the de-duplicated table.
texts = voting_df_copy['text'].unique()
people = voting_df_copy['Name'].unique()
print("{n} people in the parliament from 2009 to 2015".format(n=len(people)))
# Index by (Name, text) so that all votes of one person can be selected
# with voting_df_copy.loc[name].
voting_df_copy = voting_df_copy.set_index(['Name', 'text'])
voting_df_copy.head()
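In the next step, we create a new DataFrame with people as the index and every voted text (bill / business title) as a column. This profile matrix contains the voting decision of each person (one row per person) on each text (one column per text), with the following conventions: a `Decision` of 1 or 2 is stored as 0 or 1 respectively, any other `Decision` value is stored as 0.5, and -1 means that no vote is recorded for that person on that text.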
In [ ]:
def processVote(vote):
    """Encode a raw ``Decision`` code as a coordinate of the voting profile.

    Codes 1 and 2 are shifted down to 0 and 1; every other code is mapped to
    the midpoint 0.5.
    (Presumably 1 = yes and 2 = no; confirm against the data dictionary.)
    """
    return vote - 1 if vote in (1, 2) else 0.5
In [ ]:
# Profile matrix: one row per person, one column per voted text.
# -1.0 marks "no recorded vote". The frame is created as float from the start
# because the encoded votes include 0.5; creating it with the integer -1 would
# force a dtype change on the first assignment.
profile_df = pd.DataFrame(data = -1.0, index = people, columns = texts)
for p in people:
    # voting_df_copy is indexed by (Name, text): selecting on the name leaves
    # a frame indexed by text. Look it up once per person.
    person_votes = voting_df_copy.loc[p]
    profile_df.loc[p, person_votes.index] = [processVote(x) for x in person_votes.Decision]
profile_df.head()
In [ ]:
# Save the profile matrix as CSV (relative path, so in the current working directory).
profile_df.to_csv("profileMatrix.csv")
# Spot-check the profiles of two members; both hard-coded names must be present
# in the index of profile_df, otherwise .loc raises KeyError.
print(profile_df.loc['Brigitta M. Gadient'].values)
profile_df.loc['Duri Campell'].values
In [ ]:
# Show the raw profile vector of the first person in `people`.
profile_df.loc[people[0]].values
In [ ]:
def distance(p1, p2):
    """Return the root-mean-square difference between two voting profiles.

    Only positions where both profiles hold a recorded vote (a value other
    than -1) are compared. The result is symmetric in its two arguments.

    Parameters:
        p1, p2: one-dimensional sequences or arrays of equal length holding
            encoded votes, with -1 meaning "no recorded vote".

    Returns:
        ``sqrt(sum((p1 - p2) ** 2) / k)`` over the ``k`` positions voted on by
        both, or the sentinel 100 when the profiles share no vote.

    Raises:
        ValueError: if the two profiles do not have the same shape.
    """
    a = np.asarray(p1, dtype=float)
    b = np.asarray(p2, dtype=float)
    if a.shape != b.shape:
        raise ValueError("profiles must have the same length, got {s1} and {s2}".format(
            s1 = a.shape, s2 = b.shape))
    # Positions where both people have a recorded vote.
    common = (a != -1) & (b != -1)
    nCommonVotes = np.count_nonzero(common)
    if nCommonVotes == 0:
        # No overlap at all: return a value far outside the normal [0, 1] range.
        return 100
    diff = a[common] - b[common]
    return np.sqrt(np.dot(diff, diff) / nCommonVotes)
In [ ]:
n = people.shape[0]
distanceMatrix = np.zeros((n,n))
# Extract every profile from the DataFrame once instead of doing two .loc
# lookups per pair inside the double loop.
profiles = [profile_df.loc[person].values for person in people]
for i in range(n):
    if i % 10 == 0:
        print("Compute distances from person " + str(i))
    # distance() is symmetric, so compute the upper triangle (diagonal
    # included) and mirror it into the lower triangle.
    for j in range(i, n):
        distanceMatrix[i][j] = distanceMatrix[j][i] = distance(profiles[i], profiles[j])
In [ ]:
import networkx as nx
# Build a weighted graph whose adjacency matrix is the distance matrix.
# networkx 3.0 removed from_numpy_matrix in favour of from_numpy_array; fall
# back to the old name only when the new one is not available.
build_graph = getattr(nx, "from_numpy_array", None) or nx.from_numpy_matrix
G = build_graph(distanceMatrix)
nx.draw(G)
# NOTE: this rebinds the name `plt` to the pylab namespace from here on.
import pylab as plt
plt.show()
In [ ]:
# Average over every entry of the matrix, including the diagonal
# (each person's distance to themselves).
# NOTE(review): pairs without any common vote are stored as the sentinel 100
# by distance() and would also be included in this mean. Confirm whether
# such pairs exist in the data.
print("Mean distance : {d}".format(d = np.mean(distanceMatrix)))
In [ ]:
import pandas as pd
# Label the distance matrix with the people's names on both axes and save it
# as CSV (relative path, so in the current working directory).
df = pd.DataFrame(distanceMatrix, index = people, columns = people)
df.to_csv("distanceMatrix.csv")
df.head()
In [ ]:
# Numeric identifier assigned to each parliamentary group name.
groupId = {
    "Groupe conservateur-catholique" : 1,
    "Groupe socialiste" : 2,
    "Groupe des Paysans, Artisans et Bourgeois" : 3,
    "Groupe radical-démocratique" : 4,
    "Groupe écologiste" : 5,
    "Groupe BD" : 6,
    "Groupe vert'libéral" : 7,
    "Non inscrit" : 8,
}
# Despite its name, GroupPeople_df is a Series: it maps each person's name to
# the id of the group found on that person's last row in voting_df.
# A group name missing from groupId raises KeyError.
GroupPeople_df = (
    voting_df.drop_duplicates(['Name'], keep = 'last')
    .set_index('Name')['ParlGroupName']
    .apply(lambda group_name: groupId[group_name])
)
GroupPeople_df.to_json('GroupList.json')
GroupPeople_df
In [ ]:
# Reverse lookup (group id -> group name), also exported as JSON.
groupId_inv = {identifier: group_name for group_name, identifier in groupId.items()}
groupId_inv_df = pd.DataFrame.from_dict(groupId_inv, orient='index')
groupId_inv_df.to_json('GroupId.json')
In [ ]:
# Display the person -> group id mapping.
GroupPeople_df
In [ ]:
# One row per parliamentary group. Both statistics start at zero and are
# filled in by a later cell.
Group_df = pd.DataFrame(
    {'MeanDistance': 0.0, 'NumberOfPeople': 0},
    index = voting_df_copy['ParlGroupName'].unique(),
)
Group_df
In [ ]:
def meanDistance(peopleGroup):
    """Return the mean voting distance over all pairs of distinct members.

    Parameters:
        peopleGroup: list of names (index labels of ``profile_df``); each
            name is expected to appear once.

    Returns:
        The mean of ``distance`` over every pair of distinct positions in
        ``peopleGroup``, or NaN when the group has fewer than two members
        (no pair exists, so the mean is undefined).
    """
    from itertools import combinations

    # Look every profile up once rather than once per pair.
    profiles = [profile_df.loc[name].values for name in peopleGroup]
    # distance() is symmetric, so the mean over unordered pairs equals the
    # mean over ordered pairs at half the cost. Pairing by position also
    # avoids comparing name strings by identity.
    pair_distances = [distance(a, b) for a, b in combinations(profiles, 2)]
    if not pair_distances:
        return float('nan')
    return sum(pair_distances) / len(pair_distances)
In [ ]:
# Quick checks: size of one group and mean internal distance of another.
print(len(list(voting_df[voting_df['ParlGroupName'] == 'Groupe conservateur-catholique']['Name'].unique())))
print(meanDistance(list(voting_df[voting_df['ParlGroupName'] == 'Groupe BD']['Name'].unique())))
# Fill the per-group table: member count and mean internal distance.
for p in Group_df.index:
    print(p)
    # Distinct members of the group, computed once for both statistics.
    members = list(voting_df[voting_df['ParlGroupName'] == p]['Name'].unique())
    # DataFrame.set_value was removed in pandas 1.0; .at is the scalar setter.
    Group_df.at[p, 'NumberOfPeople'] = len(members)
    print(Group_df.loc[p]['NumberOfPeople'])
    Group_df.at[p, 'MeanDistance'] = meanDistance(members)
    print(Group_df.loc[p]['MeanDistance'])
In [ ]:
# Display the per-group table (mean internal distance and number of people).
Group_df
In [ ]:
# Mean pairwise distance over every distinct name in voting_df, regardless of group.
meanDistance(list(voting_df['Name'].unique()))
We observe that the average distance between all parliament members is 0.53, and that the average distance between members of the same parliamentary group is always smaller than 0.43. As expected, this indicates that members of the same group vote in a similar way.
We then want to observe the differences between groups. The distance between two groups is defined as the mean of the distances between every member of one group and every member of the other group (see the function `GroupDistance`).
In [ ]:
def GroupDistance(group1, group2):
    """Return the mean voting distance between members of two groups.

    Parameters:
        group1, group2: lists of names (index labels of ``profile_df``).

    Returns:
        The mean of ``distance`` over every (member of group1, member of
        group2) pair, or NaN when either group is empty (no pair exists).
        A person present in both groups is paired with themselves.
    """
    # Look every profile up once rather than once per pair.
    profiles1 = [profile_df.loc[name].values for name in group1]
    profiles2 = [profile_df.loc[name].values for name in group2]
    nbPairs = len(profiles1) * len(profiles2)
    if nbPairs == 0:
        return float('nan')
    total = sum(distance(a, b) for a in profiles1 for b in profiles2)
    return total / nbPairs
In [ ]:
# Compare one reference group against every group (itself included).
partite = "Groupe socialiste"
reference_members = list(voting_df[voting_df['ParlGroupName'] == partite]['Name'].unique())
for p in Group_df.index:
    other_members = list(voting_df[voting_df['ParlGroupName'] == p]['Name'].unique())
    dist = GroupDistance(reference_members, other_members)
    message = "Mean distance between partite {p1} and {p2} is : {d}"
    print(message.format(p1 = partite, p2 = p, d = dist))
We observe that the group furthest from the socialist group in terms of voting decisions is the "Groupe des Paysans, Artisans et Bourgeois", and that the closest one is the "Groupe écologiste".
In [ ]:
# Square table of mean distances between every pair of groups.
groups = Group_df.index
groupDistance_df = pd.DataFrame(index = groups, columns = groups)
# Build each group's member list once instead of once per pair of groups.
members_by_group = {g: list(voting_df[voting_df['ParlGroupName'] == g]['Name'].unique())
                    for g in groups}
for g1 in groups:
    for g2 in groups:
        # DataFrame.set_value was removed in pandas 1.0; .at is the scalar setter.
        groupDistance_df.at[g1, g2] = GroupDistance(members_by_group[g1], members_by_group[g2])
In [ ]:
# Display the group-to-group mean distance table.
groupDistance_df
We now want to detect people who vote very differently from their own group.
In [ ]:
# Display the group id -> group name mapping.
groupId_inv
In [ ]:
# Selecting one person's rows and de-duplicating the name yields a one-element
# list, the form in which a single person is passed to GroupDistance below.
list(voting_df[voting_df['Name'] == 'Didier Berberat']['Name'].unique())
In [ ]:
# Display the names indexing the person -> group id mapping.
GroupPeople_df.index
In [ ]:
# Find the member of one group whose votes are, on average, furthest from the
# group's members.
partite = "Groupe écologiste"
# The member list does not depend on the person, so build it once.
group_members = list(voting_df[voting_df['ParlGroupName'] == partite]['Name'].unique())
maxDist = 0
furthestPerson = ""
for p in GroupPeople_df[GroupPeople_df == groupId[partite]].index:
    # A single person is passed as a one-element group. The mean includes the
    # person's distance to themselves when they are in group_members.
    dist = GroupDistance(group_members, [p])
    if dist > maxDist:
        maxDist = dist
        furthestPerson = p
    print("Mean distance of {person} to its partite {part} : {d}".format(person = p, part = partite, d = dist))
print("The person in partite {part} which is the furthest of the others in terms of voting is : {p}"
      .format(part=partite, p = furthestPerson))
In [ ]:
# For every group, report the member whose votes are, on average, furthest
# from the group's members.
for partite in groupDistance_df.index:
    # The member list does not depend on the person, so build it once per group.
    group_members = list(voting_df[voting_df['ParlGroupName'] == partite]['Name'].unique())
    # Reset both trackers for each group. Otherwise a group in which no
    # distance exceeds 0 would report the previous group's person.
    maxDist = 0
    furthestPerson = ""
    for p in GroupPeople_df[GroupPeople_df == groupId[partite]].index:
        # A single person is passed as a one-element group.
        dist = GroupDistance(group_members, [p])
        if dist > maxDist:
            maxDist = dist
            furthestPerson = p
    print("The person in partite {part} which is the furthest of the others in terms of voting is : {p} with distance {d}"
          .format(part=partite, p = furthestPerson, d = maxDist))