In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import nltk
import re

from nltk.sentiment.vader import SentimentIntensityAnalyzer

# NOTE: despite its name, this pattern matches single ASCII digits (0-9),
# not punctuation. read_data() uses it to delete digits from each token.
punctuation = re.compile(r'[0-9]')

# One shared VADER analyzer, built once and reused for every scored row.
sid = SentimentIntensityAnalyzer()

def read_data(path, n_rows=100):
    """Score each row's combined headline text with VADER.

    The CSV at ``path`` is loaded with its first column as the index. For
    every retained row, the cells at ``itertuples()`` positions 2..26 are
    joined with spaces, tokenized with CountVectorizer's default tokenizer,
    stripped of digits via the module-level ``punctuation`` pattern, and
    scored with the module-level ``sid`` analyzer.

    Parameters
    ----------
    path : str
        Location of the input CSV file.
    n_rows : int or None, optional
        Number of leading rows to process (default 100, the value that was
        previously hard-coded). Pass ``None`` to process every row.

    Returns
    -------
    pandas.DataFrame
        A copy of the ``Label`` column for the processed rows, plus the
        columns ``pos``, ``neg``, ``compound`` and ``neutral`` holding the
        corresponding VADER scores (``neutral`` is VADER's ``'neu'``).

    Raises
    ------
    KeyError
        If the CSV has no ``Label`` column.
    IndexError
        If a row has fewer than 26 data columns.
    """
    # DataFrame.from_csv was deprecated and then removed from pandas;
    # read_csv with these arguments is its documented replacement.
    old_data = pd.read_csv(path, index_col=0, parse_dates=True)
    train = old_data if n_rows is None else old_data.head(n=n_rows)

    train2 = train[['Label']].copy()

    # The tokenizer does not depend on the row, so build it once.
    tokenize = CountVectorizer().build_tokenizer()

    pos = []
    neg = []
    compound = []
    neutral = []
    for row in train.itertuples():
        # row[0] is the index and row[1] the first data column (presumably
        # Label -- confirm against the CSV header); positions 2..26 are the
        # 25 text cells that get combined.
        cells = [row[i] for i in range(2, 27)]
        # Missing cells are read as NaN floats, which str.join() rejects
        # with a TypeError, so drop them before joining.
        example = " ".join(cell for cell in cells if pd.notnull(cell))

        tokens = tokenize(example)
        cleaned = " ".join(punctuation.sub("", word) for word in tokens)

        result = sid.polarity_scores(cleaned)
        pos.append(result['pos'])
        neg.append(result['neg'])
        compound.append(result['compound'])
        neutral.append(result['neu'])

    train2['pos'] = pos
    train2['neg'] = neg
    train2['compound'] = compound
    train2['neutral'] = neutral
    return train2

# Score the news CSV and save the resulting polarity table next to it.
input_csv = "./Documents/Cornell/Courses/MPS Project/Combined_News_DJIA.csv"
output_csv = "./Documents/Cornell/Courses/MPS Project/data_after_polarity.csv"

data = read_data(input_csv)
data.to_csv(output_csv)
print("Done!")


/Users/siqi/anaconda/lib/python2.7/site-packages/nltk/twitter/__init__.py:20: UserWarning: The twython library has not been installed. Some functionality from the twitter package will not be available.
  warnings.warn("The twython library has not been installed. "
Done!

In [ ]: