We now aim to predict the topics treated in a vote, given the short title of the law (BillTitle) and the BusinessTitle. We apply our model, saved at ../datas/lda/ldamodel, to the data from the Voting field, in order to prepare it for the machine learning we will do later on.
In [ ]:
import pandas as pd
import glob
import os
import numpy as np
from time import time
import logging
import gensim
import bz2
import re
from stop_words import get_stop_words
First of all, we define a function, getTopicForQuery, to obtain the topic probability distribution under the LDA model we are currently using. It will mostly serve to retrieve the topic probability distribution for the merged BillTitle and BusinessTitle attributes.
In [ ]:
def getTopicForQuery(question, stoplist, dictionary, lda):
    """
    Returns the topic probability distribution for a given input question, filtering with the stoplist
    and finding the matches in the dictionary of words we have from our topic modelling algorithm.
    @param question : The string from which we want to extract the topic
    @param stoplist : The list of common words for the language, that we want to exclude
    @param dictionary : The dictionary of all the words we find for a given lda model (associated to lda)
    @param lda : the lda (Latent Dirichlet Allocation) model that we want to infer the topics from
    @return the topic probability distribution for the given question
    """
    # 1. Question -> Lower case -> Split -> Exclude common words
    # (re.LOCALE is not valid with str patterns in Python 3, so we keep re.UNICODE only)
    words = re.findall(r'\w+', question.lower(), flags=re.UNICODE)
    important_words = [x for x in words if x not in stoplist]
    # 2. Find matches in the dictionary of words and get the topics
    ques_vec = dictionary.doc2bow(important_words)
    # Use the lda passed as an argument (not the global ldamodel) so the function is self-contained
    return lda.get_document_topics(ques_vec, minimum_probability=0)
Now we load the LDA model along with the stop words, so that they are available once and for all instead of being reloaded on every call to getTopicForQuery.
In [ ]:
stop_words_de = get_stop_words('de')
with open("../datas/stop_dictionaries/French_stop_words_changed.txt", "r") as myfile:
    stop_words = myfile.read()
stop_words = stop_words.split(',')
stop_words = stop_words_de + stop_words
ldamodel = gensim.models.LdaModel.load('../datas/lda/ldamodelFR.model', mmap='r')
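As a quick sanity check, we can inspect what getTopicForQuery returns. This is a minimal sketch: the sample sentence is made up, and we load here the same dictionary that insert_topic uses further below.
In [ ]:
dictionary = gensim.corpora.Dictionary.load('../datas/lda/ldaDictionaryFR.dict')
# Hypothetical French sentence, only to inspect the output format
sample = "Loi fédérale sur l'assurance-maladie"
for topic_id, prob in getTopicForQuery(sample, stop_words, dictionary, ldamodel):
    print(topic_id, round(prob, 3))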
In [ ]:
dataset_tmp = []
path = '../datas/scrap/Voting'
allFiles = glob.glob(os.path.join(path, 'Session*.csv'))
for file_ in allFiles:
    print(file_)
    data_tmp = pd.read_csv(file_)
    dataset_tmp += [data_tmp]
data_frame = pd.concat(dataset_tmp)
We first list the parliamentary groups present in the data; the numerical re-encoding of ParlGroupCode is kept commented out.
In [ ]:
parl = data_frame.ParlGroupCode.unique().tolist()
#for group in parl :
# data_frame.loc[data_frame.ParlGroupCode==group,'ParlGroupCode']= parl.index(group)
In [ ]:
data_frame.head()
In [ ]:
votation_frame = data_frame#[['BillTitle','BusinessTitle','FirstName','LastName','Decision','ParlGroupCode','VoteEnd']]
votation_frame = votation_frame.fillna(value='')
votation_frame['text'] = votation_frame['BillTitle']+' '+votation_frame['BusinessTitle']
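Filling the missing values with empty strings before concatenating matters: in pandas, adding a string to NaN yields NaN, so any row missing either title would otherwise lose its text entirely. A tiny illustration:
In [ ]:
# Why fillna('') comes first: NaN propagates through string concatenation
demo = pd.DataFrame({'BillTitle': ['Loi X', np.nan], 'BusinessTitle': ['Objet Y', 'Objet Z']})
print(demo.BillTitle + ' ' + demo.BusinessTitle)    # second row -> NaN
demo = demo.fillna(value='')
print(demo.BillTitle + ' ' + demo.BusinessTitle)    # second row -> ' Objet Z'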
We create a smaller DataFrame containing only the unique subjects that are voted on, instead of repeating the text for every person who votes. Hence we perform the NLP only once per unique entry.
In [ ]:
text_dict = {'text': votation_frame.text.unique()}
topic_learning_frame = pd.DataFrame(text_dict)
topic_learning_frame.head()
In [ ]:
def insert_topic(data_frame):
    """
    Builds a dict with the original text plus one key per topic name, holding the
    probability of that topic for each entry of data_frame['text'].
    """
    dict_ = {}
    dict_['text'] = data_frame['text'].values
    # Load the mapping from topic id to human-readable topic name
    with open("../datas/lda/topics.txt", "r") as myfile:
        s = myfile.read()
    topics = s.split('\n')
    topics_dic = {}
    for topic in topics:
        if len(topic) > 1:
            name = topic.split(':')
            topics_dic[name[0]] = name[1]
    dictionary = gensim.corpora.Dictionary.load('../datas/lda/ldaDictionaryFR.dict')
    for index, text in zip(data_frame.index, data_frame['text'].values):
        if index % 1000 == 0:
            print(index)
        for topic in getTopicForQuery(text, stop_words, dictionary, ldamodel):
            if topics_dic[str(topic[0])] in dict_:
                dict_[topics_dic[str(topic[0])]] += [topic[1]]
            else:
                dict_[topics_dic[str(topic[0])]] = [topic[1]]
    return dict_
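insert_topic assumes that ../datas/lda/topics.txt contains one id:label pair per line. A minimal sketch of the parsing it performs, on made-up labels:
In [ ]:
# Hypothetical excerpt of ../datas/lda/topics.txt
s = "0:economie\n1:politique etrangere\n"
topics_dic = {}
for topic in s.split('\n'):
    if len(topic) > 1:
        name = topic.split(':')
        topics_dic[name[0]] = name[1]
print(topics_dic)   # {'0': 'economie', '1': 'politique etrangere'}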
In [ ]:
if not os.path.exists("../datas/nlp_results"):
    os.makedirs("../datas/nlp_results")
Next, we translate each unique text into English with the Yandex Translate API (the result can optionally be attached to the topics frame), and then create a frame using the topics found by insert_topic.
In [ ]:
from yandex_translate import YandexTranslate
translate = YandexTranslate('trnsl.1.1.20161208T132730Z.fe490b34d7db4e4f.0a4c7781a0273d520073a1550b6a6624c1c3fd0a')
text_eng = []
for text in topic_learning_frame.text:
    s = translate.translate(text, 'fr-en')
    text_eng += [s['text'][0]]
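The loop above makes one API call per unique text, so a single network error or quota limit aborts it. A minimal, hypothetical guard (safe_translate is our own helper, not part of yandex_translate) that falls back to the original French text on failure:
In [ ]:
def safe_translate(text):
    """Translate text to English, keeping the original string on any API failure."""
    try:
        return translate.translate(text, 'fr-en')['text'][0]
    except Exception:   # network errors, quota limits, malformed responses, ...
        return text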
In [ ]:
topics_dict = insert_topic(topic_learning_frame)
topics_frame = pd.DataFrame(topics_dict)
#topics_frame['text_eng'] = text_eng
In [ ]:
topics_frame.head(5)
#topics_frame.to_csv('../datas/nlp_results/voting_with_topics_unique.csv',index=False)
Finally, we merge the topics back into the original frame containing the names and decisions of the parliamentarians.
In [ ]:
(pd.merge(votation_frame,topics_frame)).to_csv('../datas/nlp_results/voting_with_topics.csv',index=False)
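pd.merge with no on argument joins on the columns the two frames share, which here should be only text. The equivalent call with the join key spelled out:
In [ ]:
# Same merge as the cell above, with the join key made explicit
pd.merge(votation_frame, topics_frame, on='text').head(2)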
In [ ]:
def insert_topic_unique(data_frame):
    """
    Same as insert_topic, but keeps only the single most probable topic per entry,
    stored in a 'Topic' column.
    """
    dict_ = {}
    dict_['text'] = data_frame['text'].values
    dict_['Topic'] = []
    # Load the mapping from topic id to human-readable topic name
    with open("../datas/lda/topics.txt", "r") as myfile:
        s = myfile.read()
    topics = s.split('\n')
    topics_dic = {}
    for topic in topics:
        if len(topic) > 1:
            name = topic.split(':')
            topics_dic[name[0]] = name[1]
    # Use the same French dictionary as above (the model is ldamodelFR)
    dictionary = gensim.corpora.Dictionary.load('../datas/lda/ldaDictionaryFR.dict')
    for index, text in zip(data_frame.index, data_frame['text'].values):
        if index % 1000 == 0:
            print(index)
        # Keep the id of the most probable topic for this text
        topic_id, _ = max(getTopicForQuery(text, stop_words, dictionary, ldamodel),
                          key=lambda t: t[1])
        dict_['Topic'] += [topics_dic[str(topic_id)]]
    return dict_
In [ ]:
topics_s = insert_topic_unique(topic_learning_frame)
topics_ss = pd.DataFrame(topics_s)
topics_ss.head()
In [ ]:
topics_ss.to_csv("../datas/nlp_results/voting_single_topic.csv")
In [ ]:
data_complete = (pd.merge(votation_frame,topics_ss))
data_complete.head(2)
In [ ]:
# Keep only the date part (first 10 characters, YYYY-MM-DD) of the VoteEnd timestamp
data_complete.VoteEnd = [x[:10] for x in data_complete.VoteEnd]
data_complete.head(2)
In [ ]:
not_used = ['Unnamed: 0','BusinessNumber','BusinessShortNumber','CantonID','ID','IdLegislativePeriod',
'IdSession','IdVote','PersonNumber','RegistrationNumber','BillTitle','BusinessTitle','DecisionText',
'Language','MeaningNo','MeaningYes','ParlGroupColour','ParlGroupCode','ParlGroupNameAbbreviation',
'Subject','text']
data_used = data_complete.drop(not_used ,axis=1)
#data_used.head()
data_mean = data_used.set_index(['VoteEnd','Topic','FirstName'])
In [ ]:
data_mean.head(1000)
data_used.to_csv("../datas/nlp_results/voting_single_topic_not_unique.csv")
In [ ]:
for date in data_mean.index.get_level_values('VoteEnd').unique():
    for topic in data_mean.loc[date].index.get_level_values('Topic').unique():
        print("On {0}, the topic {1} had a mean decision of {2}".format(
            date, topic, data_mean.loc[date, topic].Decision.mean()))
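The same per-date, per-topic means can be computed in a single vectorised call; a sketch assuming Decision is numeric:
In [ ]:
# Equivalent aggregation to the loop above, in one groupby call
data_used.groupby(['VoteEnd', 'Topic']).Decision.mean().head()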