In [ ]:
In [1]:
import pandas as pd
import glob
import os
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import sklearn.ensemble
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split, cross_val_predict, learning_curve
import sklearn.metrics
%matplotlib inline
%load_ext autoreload
%autoreload 2
# The DataFrame has a lot of columns,
# so we raise the display limit to be able to see more of them.
pd.options.display.max_columns = 100
Load the NLP results
In [2]:
# Read the NLP output (one row per text, with topic and sentiment columns).
path = '../../datas/nlp_results/'
nlp_results_file = os.path.join(path, 'voting_with_topics_unique_sentiment.csv')
voting_df = pd.read_csv(nlp_results_file)
print('Entries in the DataFrame', voting_df.shape)

# Disabled preprocessing, kept for reference because it is the only place where
# the `voting` frame (used by a later cell) was ever built:
#   - drop the useless column
#       voting_df = voting_df.drop('Unnamed: 0',1)
#   - convert the columns that should be numerical
#       num_cols = ['BillTitle', 'BusinessTitle','text','text_eng','FirstName','LastName']
#       voting = voting_df.drop(num_cols,axis=1).apply(pd.to_numeric)
#       voting['text'] = voting_df.text
#   - insert the full name at the second position
#       voting.insert(1,'Name', voting_df['FirstName'] + ' ' + voting_df['LastName'])
voting_df.head(3)
Out[2]:
Separate the votes of each deputy
In [ ]:
def split_df(df, field):
    """
    Split ``df`` into one sub-DataFrame per distinct value of ``field``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split.
    field : str
        Name of the column whose distinct values define the groups.

    Returns
    -------
    dict
        Maps each distinct value of ``df[field]`` (in order of first
        appearance) to the rows of ``df`` holding that value. A missing
        value (NaN/None), if present, is a key of its own and maps to the
        rows where ``field`` is missing.

    Raises
    ------
    KeyError
        If ``field`` is not a column of ``df``.
    """
    column = df[field]
    # Retrieve first all the unique entries of the field
    unique_field = column.unique()
    print('Number of unique entries in',field,':',len(unique_field))

    df_dict = {}
    for key in unique_field:
        if pd.api.types.is_scalar(key) and pd.isnull(key):
            # NaN never compares equal to itself, so `column == key` would
            # select nothing and the rows with a missing value would be lost.
            mask = column.isnull()
        else:
            mask = column == key
        df_dict[key] = df.loc[mask]
    return df_dict
# Build one DataFrame per distinct value of the 'Name' column.
# NOTE(review): `voting` is not assigned in any visible cell (the code that built
# it is commented out in the loading cell) — confirm where it should come from,
# otherwise this line raises NameError on a fresh kernel.
voting_dict = split_df(voting, 'Name')
In [ ]:
# For every deputy, count how many rows carry each 'Decision' value.
depute_dict = {}
for deputee, df_deputee in voting_dict.items():
    decision_counts = df_deputee.groupby('Decision')['Decision'].count()
    #decision_counts.plot(kind='bar',title=deputee)
    # NOTE(review): with the plot call above disabled, this loop draws no new figure.
    plt.show()
    depute_dict[deputee] = decision_counts
Stats on each deputy (yes/no/abstention) with regard to the topics
In [ ]:
# Running records, one slot per decision code (slot = code - 1).
# Assumes 'Decision' holds integer codes 1..7 — TODO confirm against the data.
max_ = [0 for _ in range(7)]
frequency = [0 for _ in range(7)]
winner_max = ['' for _ in range(7)]
winner_freq = ['' for _ in range(7)]
# Label printed for each slot; the fourth code has no label.
Decisions = ['Yes','No','Abstention','','Away','Excused','President']

for deputee_frame, counts in depute_dict.items():
    total_votes = counts.sum()
    for code in range(1, 8):
        if code not in counts.index:
            continue
        slot = code - 1
        vote_count = counts[code]
        vote_share = vote_count / total_votes
        # Strict comparisons: on a tie the deputy seen first keeps the record.
        if vote_count > max_[slot]:
            max_[slot] = vote_count
            winner_max[slot] = deputee_frame
        if vote_share > frequency[slot]:
            frequency[slot] = vote_share
            winner_freq[slot] = deputee_frame

# Report only the decisions that were observed at least once.
for slot, label in enumerate(Decisions):
    if max_[slot] == 0:
        continue
    print("for {0} :".format(label))
    print("{0} as the highest frequency:{1}".format(winner_freq[slot], frequency[slot]))
    print("{0} as the highest value:{1}".format(winner_max[slot], max_[slot]))
    print()
In [ ]:
# Load every scraped 'Session*.csv' file and stack them into a single DataFrame.
path = '../../datas/scrap/Voting'
# glob returns matches in arbitrary order; sort them so the row order of the
# concatenated frame is the same on every run.
allFiles = sorted(glob.glob(os.path.join(path, 'Session*.csv')))
if not allFiles:
    # Fail with the searched location instead of pandas' generic concat error.
    raise FileNotFoundError('No Session*.csv file found in ' + path)

dataset_tmp = []
for file_ in allFiles:
    print(file_)
    data_tmp = pd.read_csv(file_)
    print(data_tmp.shape)
    dataset_tmp.append(data_tmp)

# Drop the leftover CSV index column from the concatenated frame. The previous
# code dropped it in place from the last file's frame only, after the concat,
# so `data_frame` still contained it.
data_frame = pd.concat(dataset_tmp).drop('Unnamed: 0', axis=1, errors='ignore')
print(data_frame.shape)
In [ ]:
# Keep a subset of the vote columns and add a `text` column made of the bill
# title and the business title separated by a space.
columns = ['BillTitle','BusinessTitle','Canton','Decision','FirstName','LastName','ParlGroupName','VoteEnd']
# assign() returns a new frame, which avoids the SettingWithCopyWarning raised
# when a column is set on a selection of `data_frame`.
treated_data = data_frame[columns].assign(
    text=lambda frame: frame['BillTitle'] + ' ' + frame['BusinessTitle']
)
treated_data.head()
In [ ]:
#columns = ['Decision','Name','ParlGroupCode','positive','negative','neutral','compound','text']
# Columns that are not candidates for the row-wise maximum below.
columns = ['text','text_eng','positive','negative','neutral','compound']
# `axis` is passed by keyword: the positional form is deprecated and was
# removed in later pandas releases.
vote = voting_df.drop(columns, axis=1)
# `subject` is, for each row, the name of the remaining column holding the
# largest value. assign() builds a new frame instead of writing into a
# selection of `voting_df` (which raises SettingWithCopyWarning).
to_merge = voting_df[['text']].assign(subject=vote.idxmax(axis=1))
In [ ]:
# Preview the first rows of the text/subject table.
to_merge.head()
In [ ]:
# Attach the subject to every vote; `text` is the only column both frames share,
# so it is the join key (inner join, pandas' default).
data_for_viz = treated_data.merge(to_merge, how='inner', on='text')
In [ ]:
# Keep only the first 7 characters of each VoteEnd value
# (presumably the 'YYYY-MM' part of a timestamp string — TODO confirm).
# The vectorised .str accessor propagates missing values instead of raising
# TypeError on them like the previous Python-level loop did.
data_for_viz['VoteEnd'] = data_for_viz['VoteEnd'].str[:7]
In [ ]:
# Preview the merged frame; VoteEnd now holds only its first 7 characters.
data_for_viz.head()
In [ ]:
# Export the (truncated) VoteEnd column alone, as a one-column frame, to JSON.
vote_month = data_for_viz.loc[:, ['VoteEnd']]
vote_month.to_json('viz_data_vote_month.json')
In [ ]:
# Display the votes in chronological order (the result is not stored).
# `ascending` must be a bool (or a list of bools); the previous value ['True']
# was a list holding a string, which recent pandas versions reject.
data_for_viz.sort_values('VoteEnd', ascending=True)
In [ ]: