In [ ]:


In [1]:
import pandas as pd
import glob
import os
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import sklearn.ensemble
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split, cross_val_predict, learning_curve
import sklearn.metrics

%matplotlib inline
%load_ext autoreload
%autoreload 2

# There are a lot of columns in the DataFrame.
# Raise the display limit so that more of them are shown.
pd.options.display.max_columns = 100

Load the NLP results (topic weights and sentiment scores for each text)


In [2]:
# Read the NLP output: each row carries a text, its English translation,
# one weight per topic and the sentiment scores (see the preview below).
path = '../../datas/nlp_results/'
nlp_results_csv = os.path.join(path, 'voting_with_topics_unique_sentiment.csv')
voting_df = pd.read_csv(nlp_results_csv)
print('Entries in the DataFrame',voting_df.shape)

# --- Disabled preprocessing, kept for reference ------------------------------
# Drop the leftover index column:
#voting_df = voting_df.drop('Unnamed: 0',1)

# Cast every non-text column to a numeric dtype:

#num_cols = ['BillTitle', 'BusinessTitle','text','text_eng','FirstName','LastName']
#voting = voting_df.drop(num_cols,axis=1).apply(pd.to_numeric)
#voting['text'] = voting_df.text
# Insert the full name as the second column:
#voting.insert(1,'Name', voting_df['FirstName'] + ' ' + voting_df['LastName'])

# Preview the first three rows.
voting_df.head(3)


Entries in the DataFrame (3470, 17)
Out[2]:
armée asile / immigration assurances budget dunno entreprise/ finance environnement famille / enfants imposition politique internationale retraite text text_eng positive negative neutral compound
0 0.006995 0.930066 0.006993 0.006993 0.006993 0.006994 0.006993 0.006993 0.006993 0.006993 0.006993 Arrêté fédéral concernant la contribution de l... Federal decree on Switzerland's contribution i... 0.075 0.000 0.925 0.4404
1 0.018182 0.018185 0.018183 0.271557 0.564795 0.018183 0.018182 0.018183 0.018183 0.018183 0.018184 Renforcement du Traité sur la non-proliférati... Strengthening of the Treaty on the non-prolif... 0.227 0.206 0.567 0.0772
2 0.015152 0.389858 0.015152 0.232520 0.256405 0.015152 0.015152 0.015152 0.015152 0.015154 0.015152 Une zone exempte d'armes nucléaires au coeur ... A nuclear weapon free zone in the heart of Eu... 0.264 0.176 0.560 0.2732

Split the votes by deputy


In [ ]:
def split_df(df, field):
    """
        Splits the input df along a certain field.

        Parameters
        ----------
        df : pandas.DataFrame
            The frame to split.
        field : str
            Name of the column whose unique values define the groups.

        Returns
        -------
        dict
            Maps each unique value of ``df[field]`` (in order of first
            appearance) to the sub-DataFrame of the rows holding that value.

        Also prints the number of unique values found in ``field``.
    """
    column = df[field]
    unique_field = column.unique()
    print('Number of unique entries in',field,':',len(unique_field))
    # Build the mapping in one pass instead of seeding it with placeholders.
    # A missing value (NaN) never compares equal to itself, so such a key
    # maps to an empty frame.
    return {key: df.loc[column == key] for key in unique_field}

# NOTE(review): `voting` is not defined in any visible cell -- the only code
# that builds it is commented out in the loading cell. On a fresh kernel this
# line would raise NameError unless `voting` is created elsewhere; confirm
# which frame (with a 'Name' column) should be passed here.
voting_dict = split_df(voting, 'Name')

In [ ]:
# For every deputy, count how many of their rows fall under each value of
# the 'Decision' column. The result maps deputy name -> Series indexed by
# decision value.
# To inspect one deputy: depute_dict[name].plot(kind='bar', title=name)
depute_dict = {
    deputee: df_deputee.groupby('Decision')['Decision'].count()
    for deputee, df_deputee in voting_dict.items()
}

Statistics on each deputy's decisions (yes / no / abstention) with respect to the topics


In [ ]:
# Labels for decision codes 1..7 (code k is stored at position k-1).
# The fourth label is intentionally empty in the original data mapping.
Decisions = ['Yes','No','Abstention','','Away','Excused','President']
n_codes = len(Decisions)

# Per decision code: best absolute count / best share seen so far, and the
# deputy who holds it.
max_ = [0] * n_codes
frequency = [0] * n_codes
winner_max = [''] * n_codes
winner_freq = [''] * n_codes

for deputy_name, decision_counts in depute_dict.items():
    total_votes = decision_counts.sum()
    for slot in range(n_codes):
        code = slot + 1
        if code not in decision_counts.index:
            continue
        n_votes = decision_counts[code]
        share = n_votes / total_votes
        # Strict comparisons: on a tie the first deputy encountered is kept.
        if share > frequency[slot]:
            frequency[slot] = share
            winner_freq[slot] = deputy_name
        if n_votes > max_[slot]:
            max_[slot] = n_votes
            winner_max[slot] = deputy_name

# Report only the decision codes that were observed at least once.
summary = zip(Decisions, winner_freq, frequency, winner_max, max_)
for label, freq_holder, best_share, count_holder, best_count in summary:
    if best_count == 0:
        continue
    print("for {0} :".format(label))
    print("{0} as the highest frequency:{1}".format(freq_holder, best_share))
    print("{0} as the highest value:{1}".format(count_holder, best_count))
    print()

In [ ]:
# Load every per-session voting export and stack them into one DataFrame.
dataset_tmp = []
path = '../../datas/scrap/Voting'
# glob returns matches in arbitrary order; sort so the row order of the
# combined frame is reproducible from one run to the next.
allFiles = sorted(glob.glob(os.path.join(path, 'Session*.csv')))
if not allFiles:
    # Fail with an explicit message instead of the opaque error pd.concat
    # raises on an empty list.
    raise FileNotFoundError('No Session*.csv file found in ' + path)

for file_ in allFiles:
    print(file_)
    data_tmp = pd.read_csv(file_)
    print(data_tmp.shape)
    dataset_tmp.append(data_tmp)
data_frame = pd.concat(dataset_tmp)
print(data_frame.shape)
# Drop the leftover CSV index column from the combined frame. The previous
# in-place drop only touched the last file's frame, after the concat had
# already copied it, so `data_frame` was never cleaned.
data_frame = data_frame.drop(columns='Unnamed: 0', errors='ignore')

In [ ]:
# Keep only the columns needed downstream. Take an explicit copy so that
# adding 'text' writes to an independent frame rather than to a slice of
# `data_frame` (which triggers SettingWithCopyWarning).
columns = ['BillTitle','BusinessTitle','Canton','Decision','FirstName','LastName','ParlGroupName','VoteEnd']
treated_data = data_frame[columns].copy()
# 'text' is the bill title and the business title joined by a single space;
# it is NaN whenever either title is missing.
treated_data['text'] = treated_data['BillTitle'] + ' ' + treated_data['BusinessTitle']
treated_data.head()

In [ ]:
# Columns of `voting_df` that are not topic weights.
columns= ['text','text_eng','positive','negative','neutral','compound']
# Everything that remains after dropping them is treated as a topic column.
# `columns=` is passed by keyword: the positional axis form is rejected by
# pandas 2.x.
vote  = voting_df.drop(columns=columns)
# Copy the slice so the new column is written to an independent frame.
to_merge = voting_df[['text']].copy()
# 'subject' is the name of the column holding the largest value in each row.
to_merge['subject']= vote.idxmax(axis=1)

In [ ]:
# Preview the first rows of the text -> subject lookup table.
to_merge.head()

In [ ]:
# Attach the dominant subject to every vote. 'text' is the only column the
# two frames share, so it is the join key; rows without a match are dropped.
data_for_viz = treated_data.merge(to_merge, on='text', how='inner')

In [ ]:
# Keep only the first 7 characters of VoteEnd (presumably 'YYYY-MM' -- TODO
# confirm the timestamp format). The vectorised .str accessor leaves missing
# values as NaN instead of raising on them.
data_for_viz['VoteEnd'] = data_for_viz['VoteEnd'].str[:7]

In [ ]:
# Preview the merged frame used for the visualisation.
data_for_viz.head()

In [ ]:
# Export only the VoteEnd column (shortened to its first 7 characters in an
# earlier cell) as JSON for the visualisation.
vote_end_only = data_for_viz.loc[:, ['VoteEnd']]
vote_end_only.to_json('viz_data_vote_month.json')

In [ ]:
# Display the rows ordered by VoteEnd, earliest first. `ascending` must be a
# boolean: the previous value was a list holding the string 'True'. The
# sorted frame is only displayed, not stored.
data_for_viz.sort_values('VoteEnd', ascending=True)

In [ ]: