notebook.community

Edit and run



In [ ]:

    
import pandas as pd
import glob
import os
import numpy as np
from time import time
import logging
import gensim
import bz2
import re
from stop_words import get_stop_words
from TwitterSearch import *
import time



In [ ]:

    
# Source table for the sentiment pass. This reads the '_unique' variant; the
# alternative input is '../datas/nlp_results/voting_with_topics.csv'.
topics_csv = '../datas/nlp_results/voting_with_topics_unique.csv'
df = pd.read_csv(topics_csv)



In [ ]:

    
# Display the first rows of the freshly loaded table as a sanity check.
df.head()



In [ ]:

    
# Split on the same single-character separators as before (space, apostrophe,
# newline, parentheses, comma, semicolon, colon, hyphen). The pattern is a raw
# string so that `\(` and `\)` reach the regex engine intact instead of being
# invalid escape sequences in a normal string literal, and it is compiled once
# up front.
token_separator = re.compile(r" |'|\n|\(|\)|,|;|:|-")

# NOTE(review): `words` is not defined in any visible cell -- it must already
# exist in the kernel (presumably a vocabulary collection) -- TODO confirm,
# otherwise this cell raises NameError on a fresh kernel.
#
# Each document is lower-cased and split; a token is kept only when it is
# longer than one character and is a member of `words`.
texts = [
    [token for token in token_separator.split(document.lower())
     if len(token) > 1 and (token in words)]
    for document in df.text_eng
]



In [ ]:

    
# Analyzer shared by every call to demo_vader_instance(); created on first use.
_VADER_ANALYZER = None


def demo_vader_instance(text):
    """
    Return a VADER ``SentimentIntensityAnalyzer`` instance.

    The analyzer is created on the first call and the same object is returned
    on every later call, so callers that invoke this once per document do not
    pay for a new analyzer each time. Scoring is left to the caller, e.g.
    ``demo_vader_instance(text).polarity_scores(text)``.

    :param text: unused; kept so existing call sites that pass the text to be
        evaluated keep working.
    :return: the shared ``SentimentIntensityAnalyzer`` instance.
    """
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        # Imported lazily so that defining this function does not require nltk.
        from nltk.sentiment import SentimentIntensityAnalyzer
        _VADER_ANALYZER = SentimentIntensityAnalyzer()
    return _VADER_ANALYZER



In [ ]:

    
from collections import defaultdict
from nltk.corpus import opinion_lexicon
from nltk.corpus import opinion_lexicon
from nltk.tokenize import treebank
# Per-document VADER scores, one list per score component. Each list receives
# one entry per element of df.text_eng, in iteration order.
pos = []
neg = []
neu = []
compound = []

documents = df.text_eng
total = len(documents)

# demo_vader_instance() ignores its argument and only hands back an analyzer,
# so request it once here instead of once per document.
s = demo_vader_instance('')

for i, text in enumerate(documents):
    # Report progress every 500 documents.
    if i % 500 == 0:
        print('{0}%'.format((i * 100) / total))
    polarity = s.polarity_scores(text)
    pos.append(polarity['pos'])
    neg.append(polarity['neg'])
    neu.append(polarity['neu'])
    compound.append(polarity['compound'])



In [ ]:

    
# Attach each list of per-document scores to the frame as its own column.
score_columns = {
    'positive': pos,
    'negative': neg,
    'neutral': neu,
    'compound': compound,
}
for column_name, scores in score_columns.items():
    df[column_name] = scores



In [ ]:

    
# Disabled: would drop a 'sentiment' column from the frame.
#df = df.drop('sentiment',1)
# Display the first rows of the frame.
df.head()



In [ ]:

    
# Write the frame to disk without the index column. The alternative target is
# '../datas/nlp_results/voting_with_topics.csv'.
sentiment_csv = '../datas/nlp_results/voting_with_topics_unique_sentiment.csv'
df.to_csv(sentiment_csv, index=False)



In [ ]:

    
# Read every 'Session*.csv' file under the voting scrape directory and stack
# them into a single frame.
dataset_tmp = []
path = '../datas/scrap/Voting'

# glob.glob() returns matches in arbitrary order; sorting makes the row order
# of the concatenated frame the same on every run.
allFiles = sorted(glob.glob(os.path.join(path, 'Session*.csv')))

for file_ in allFiles:
    print(file_)
    dataset_tmp.append(pd.read_csv(file_))

# NOTE: pd.concat raises ValueError when no file matched the pattern.
data_frame = pd.concat(dataset_tmp)



In [ ]:

    
# Replace each distinct ParlGroupCode value by its position in the list of
# unique values (order of first appearance).
#
# Membership is tested against a snapshot of the original column. Testing
# against the column while it is being rewritten would let rows that were
# already relabelled match a later original code that happens to equal one of
# the assigned integers.
original_codes = data_frame['ParlGroupCode'].copy()
parl = original_codes.unique().tolist()
for position, group in enumerate(parl):
    data_frame.loc[original_codes == group, 'ParlGroupCode'] = position



In [ ]:

    
# Keep only the vote-related columns, blank out missing values, and add a
# 'text' column made of the bill title and business title joined by a space.
vote_columns = [
    'BillTitle',
    'BusinessTitle',
    'FirstName',
    'LastName',
    'Decision',
    'ParlGroupCode',
]
votation_frame = (
    data_frame[vote_columns]
    .fillna(value='')
    .assign(text=lambda frame: frame['BillTitle'] + ' ' + frame['BusinessTitle'])
)



In [ ]:

    
# Display the first rows of the per-vote frame.
votation_frame.head()



In [ ]:

    
# Join the per-vote rows with the scored table and write the result to disk
# without the index column. No join keys are given, so pandas merges on the
# columns the two frames have in common.
votes_with_sentiment = pd.merge(votation_frame, df)
votes_with_sentiment.to_csv(
    '../datas/nlp_results/voting_with_topics_sentiment.csv',
    index=False,
)