In [ ]:
import pandas as pd
import glob
import os
import numpy as np
from time import time
import logging
import gensim
import bz2
import re
from stop_words import get_stop_words
from TwitterSearch import *
import time
In [ ]:
# Load the voting-with-topics table into `df`.
# To work on the other export instead, point the call at
# '../datas/nlp_results/voting_with_topics.csv'.
df = pd.read_csv(
    filepath_or_buffer='../datas/nlp_results/voting_with_topics_unique.csv',
)
In [ ]:
# Notebook preview: show the first five rows of the freshly loaded table.
df.head(n=5)
In [ ]:
# Token separators: space, apostrophe, newline, parentheses, comma, semicolon,
# colon and hyphen.  The pattern is a raw string so that `\(` and `\)` reach
# the regex engine as written (in a plain string literal they are invalid
# escape sequences and trigger a warning on recent Python versions), and it is
# compiled once instead of being looked up for every document.
token_separator = re.compile(r" |'|\n|\(|\)|,|;|:|-")

# One token list per document: lower-case the text, split it on the
# separators, and keep tokens longer than one character that occur in `words`.
# NOTE(review): `words` is not defined in the visible cells — confirm it is
# created before this cell runs.
texts = [
    [
        word
        for word in token_separator.split(document.lower())
        if len(word) > 1 and (word in words)
    ]
    for document in df.text_eng
]
In [ ]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _shared_vader_analyzer():
    """Create the VADER analyzer on first use and hand back the same one afterwards."""
    # Imported lazily, as in the original cell, so defining the function does
    # not require NLTK to be importable yet.
    from nltk.sentiment import SentimentIntensityAnalyzer

    return SentimentIntensityAnalyzer()


def demo_vader_instance(text):
    """
    Return a VADER sentiment analyzer.

    Despite the name, this function does not score anything itself: callers
    obtain the polarity scores by calling ``polarity_scores(text)`` on the
    returned object.

    The analyzer is built once and reused on later calls, because callers
    invoke this function once per document and constructing a new analyzer
    each time is presumably the expensive step (per the NLTK documentation,
    construction loads the VADER lexicon).

    :param text: unused; kept so existing call sites keep working.
    :return: an ``nltk.sentiment.SentimentIntensityAnalyzer`` instance.
    """
    return _shared_vader_analyzer()
In [ ]:
# NOTE(review): these three imports are not used by the VADER scoring below;
# they are kept because other cells (presumably the disabled Liu-Hu lexicon
# variant) may still rely on them.
from collections import defaultdict
from nltk.corpus import opinion_lexicon
from nltk.tokenize import treebank

# Per-document VADER scores, in the same order as df.text_eng.
pos = []
neg = []
neu = []
compound = []

# Obtain the analyzer once, outside the loop: demo_vader_instance ignores its
# argument and only returns an analyzer, so requesting one per document just
# repeated the same setup work for every row.
analyzer = demo_vader_instance('')
total_texts = len(df.text_eng)

for i, text in enumerate(df.text_eng):
    # Progress indicator: percentage of documents handled so far, printed
    # every 500 documents.
    if i % 500 == 0:
        print('{0}%'.format((i * 100) / total_texts))

    polarity = analyzer.polarity_scores(text)
    pos.append(polarity['pos'])
    neg.append(polarity['neg'])
    neu.append(polarity['neu'])
    compound.append(polarity['compound'])
In [ ]:
# Attach the four score lists to the table, one new column per score.
score_columns = {
    'positive': pos,
    'negative': neg,
    'neutral': neu,
    'compound': compound,
}
for column_name, scores in score_columns.items():
    df[column_name] = scores
In [ ]:
# Optional cleanup, currently disabled:
#df = df.drop('sentiment',1)

# Notebook preview: first five rows, now including the score columns.
df.head(n=5)
In [ ]:
# Persist the table with its sentiment columns; the row index is not written.
# The other export target, currently unused, is
# '../datas/nlp_results/voting_with_topics.csv'.
df.to_csv(
    path_or_buf='../datas/nlp_results/voting_with_topics_unique_sentiment.csv',
    index=False,
)
In [ ]:
# Read every 'Session*.csv' file from the voting scrape directory and stack
# them into a single frame.
dataset_tmp = []
path = '../datas/scrap/Voting'

# glob returns matches in arbitrary, filesystem-dependent order.  Sorting makes
# the row order of `data_frame` reproducible between runs and machines, which
# matters because later cells derive integer group codes from the order in
# which values first appear.
allFiles = sorted(glob.glob(os.path.join(path, 'Session*.csv')))

for file_ in allFiles:
    print(file_)  # show which file is being loaded
    dataset_tmp.append(pd.read_csv(file_))

# Note: pd.concat raises ValueError when no file matched the pattern.
data_frame = pd.concat(dataset_tmp)
In [ ]:
# Replace each distinct ParlGroupCode value by its position in `parl`, the
# list of unique codes in order of first appearance.
# NOTE(review): values are overwritten in place while later groups are still
# being matched; this presumably relies on the original codes not being
# integers themselves — confirm against the data.
parl = data_frame.ParlGroupCode.unique().tolist()
for code_number, group in enumerate(parl):
    rows_in_group = data_frame.ParlGroupCode == group
    data_frame.loc[rows_in_group, 'ParlGroupCode'] = code_number
In [ ]:
# Keep only the columns needed for the vote table and blank out missing
# values so the title concatenation below works on every row.
votation_frame = data_frame.loc[
    :,
    ['BillTitle', 'BusinessTitle', 'FirstName', 'LastName', 'Decision', 'ParlGroupCode'],
].fillna(value='')

# 'text' is the bill title and the business title joined by a single space.
votation_frame['text'] = (
    votation_frame.BillTitle + ' ' + votation_frame.BusinessTitle
)
In [ ]:
# Notebook preview: first five rows of the per-vote table.
votation_frame.head(n=5)
In [ ]:
# Join the per-vote rows with the scored table and export the result without
# the row index.  No join keys are passed, so pandas chooses them itself (its
# default is the columns the two frames have in common).
votation_frame.merge(df).to_csv(
    path_or_buf='../datas/nlp_results/voting_with_topics_sentiment.csv',
    index=False,
)