# In[1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import nltk
import re
# NOTE(review): despite the name, this pattern matches DIGITS [0-9], not
# punctuation — read_data uses it to strip numbers from the headlines.
punctuation = re.compile(r'[0-9]')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# NOTE(review): requires the VADER lexicon to be present locally —
# presumably installed once via nltk.download('vader_lexicon'); confirm.
sid = SentimentIntensityAnalyzer()
def read_data(path):
    """Score the first 100 rows of the combined-news CSV with VADER.

    Parameters
    ----------
    path : str
        Path to a CSV whose first column is the row index, containing a
        'Label' column and 25 headline columns (tuple positions 2-26).

    Returns
    -------
    pandas.DataFrame
        The first 100 rows' 'Label' column plus 'pos', 'neg', 'compound'
        and 'neutral' VADER polarity scores computed over the
        concatenation of each row's 25 headlines.
    """
    # pd.DataFrame.from_csv was deprecated in pandas 0.21 and removed in
    # 1.0; read_csv with index_col=0 is the documented replacement
    # (take first column as index).
    old_data = pd.read_csv(path, index_col=0)
    train = old_data.head(n=100)
    train2 = train[['Label']].copy()

    pos = []
    neg = []
    compound = []
    neutral = []
    # Hoisted out of the loop: building the tokenizer is loop-invariant.
    tokenize = CountVectorizer().build_tokenizer()
    for row in train.itertuples():
        # Positions 2..26 of the row tuple are the 25 headline strings;
        # combine them into one document per trading day.
        example = " ".join(row[2:27])
        # Tokenize, then strip digits — the module-level `punctuation`
        # regex matches [0-9] despite its name.
        words = tokenize(example)
        cleaned = " ".join(punctuation.sub("", word) for word in words)
        result = sid.polarity_scores(cleaned)
        pos.append(result['pos'])
        neg.append(result['neg'])
        compound.append(result['compound'])
        neutral.append(result['neu'])

    train2['pos'] = pos
    train2['neg'] = neg
    train2['compound'] = compound
    train2['neutral'] = neutral
    return train2
# Run the pipeline: score the headlines and persist the result CSV
# alongside the raw data.
scored = read_data("./Documents/Cornell/Courses/MPS Project/Combined_News_DJIA.csv")
scored.to_csv("./Documents/Cornell/Courses/MPS Project/data_after_polarity.csv")
print("Done!")
# In[ ]: