In [67]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import nltk
import re

# NOTE: despite its name, this pattern matches digit characters only
# ([0-9]); preprocessing() uses it to strip digits out of each token.
punctuation = re.compile(r'[0-9]')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Single shared analyzer instance, built once at module load and reused by
# read_data() for every headline it scores.
sid = SentimentIntensityAnalyzer()

def preprocessing(pre_text):
    """Return *pre_text* re-tokenized, with digits removed from every token.

    The text is split with the default tokenizer of a freshly built
    ``CountVectorizer``; each token then has all ``[0-9]`` characters
    removed via the module-level ``punctuation`` pattern, and the results
    are joined with a single space.

    A token made up only of digits becomes an empty string but is still
    joined, so the result may contain leading, trailing or consecutive
    spaces.
    """
    tokenize = CountVectorizer().build_tokenizer()
    digit_free = (punctuation.sub("", token) for token in tokenize(pre_text))
    return " ".join(digit_free)
        
def read_data(path):
    """Score every headline in a CSV with VADER and return the scores.

    The CSV is read with its first column as the (date-parsed) index. It
    must contain a ``Label`` column; every other column is treated as a
    column of headline text.

    For each headline column ``<col>`` four columns are appended to the
    result, in this order: ``pos<col>``, ``neg<col>``, ``neu<col>`` and
    ``com<col>`` (the VADER ``pos``, ``neg``, ``neu`` and ``compound``
    scores of the preprocessed headline).

    Parameters
    ----------
    path : str
        Location of the input CSV file.

    Returns
    -------
    pandas.DataFrame
        A copy of the ``Label`` column followed by the score columns,
        sharing the index of the input file.

    Raises
    ------
    KeyError
        If the file has no ``Label`` column.
    """
    # DataFrame.from_csv() was removed in pandas 1.0; read_csv with these
    # arguments reproduces its defaults (first column as a parsed-date index).
    old_data = pd.read_csv(path, index_col=0, parse_dates=True)
    headlines = old_data.drop(['Label'], axis=1)
    train3 = old_data[['Label']].copy()

    # Handle each headline column as a unit instead of detecting column
    # changes cell by cell. This removes the dependency on the first column
    # being named "Top1" and the last one "Top25".
    # .items() replaces .iteritems(), which was removed in pandas 2.0.
    for column, news in headlines.items():
        scores = [sid.polarity_scores(preprocessing(str(item)))
                  for item in news]
        suffix = str(column)
        train3['pos' + suffix] = [score['pos'] for score in scores]
        train3['neg' + suffix] = [score['neg'] for score in scores]
        train3['neu' + suffix] = [score['neu'] for score in scores]
        train3['com' + suffix] = [score['compound'] for score in scores]

    return train3

# Driver: score the combined news/DJIA file, save the per-headline polarity
# table next to it, and echo the table to the console.
source_csv = "./Documents/Cornell/Courses/MPS Project/Combined_News_DJIA.csv"
target_csv = "./Documents/Cornell/Courses/MPS Project/polarity_specific.csv"

data = read_data(source_csv)
data.to_csv(target_csv)
print(data)
print("Done!")


            Label  posTop1  negTop1  neuTop1  comTop1  posTop2  negTop2  \
Date                                                                      
2008-08-08      0    0.000    0.262    0.738  -0.5994    0.000    0.000   
2008-08-11      1    0.323    0.000    0.677   0.7964    0.000    0.277   
2008-08-12      0    0.166    0.161    0.674   0.0258    0.000    0.000   
2008-08-13      0    0.000    0.545    0.455  -0.7184    0.000    0.249   
2008-08-14      1    0.184    0.000    0.816   0.2023    0.000    0.328   
2008-08-15      1    0.000    0.315    0.685  -0.7481    0.381    0.000   
2008-08-18      0    0.043    0.489    0.468  -0.9246    0.278    0.000   
2008-08-19      0    0.000    0.348    0.652  -0.7906    0.097    0.000   
2008-08-20      1    0.000    0.114    0.886  -0.2732    0.086    0.180   
2008-08-21      1    0.211    0.263    0.526  -0.3182    0.000    0.360   
2008-08-22      1    0.091    0.000    0.909   0.3612    0.067    0.000   
2008-08-25      0    0.000    0.286    0.714  -0.5574    0.000    0.398   
2008-08-26      1    0.000    0.346    0.654  -0.8126    0.000    0.381   
2008-08-27      1    0.000    0.310    0.690  -0.6705    0.000    0.273   
2008-08-28      1    0.185    0.267    0.548  -0.2960    0.000    0.223   
2008-08-29      0    0.000    0.270    0.730  -0.6486    0.000    0.000   
2008-09-02      0    0.000    0.508    0.492  -0.9081    0.000    0.146   
2008-09-03      1    0.000    0.000    1.000   0.0000    0.000    0.000   
2008-09-04      0    0.130    0.000    0.870   0.3400    0.000    0.213   
2008-09-05      1    0.224    0.316    0.460  -0.7385    0.000    0.000   
2008-09-08      1    0.000    0.202    0.798  -0.5859    0.135    0.324   
2008-09-09      0    0.121    0.303    0.576  -0.6486    0.000    0.000   
2008-09-10      1    0.000    0.286    0.714  -0.5574    0.000    0.234   
2008-09-11      1    0.117    0.087    0.795   0.2023    0.000    0.140   
2008-09-12      0    0.000    0.000    1.000   0.0000    0.000    0.000   
2008-09-15      0    0.000    0.217    0.783  -0.3612    0.247    0.272   
2008-09-16      1    0.094    0.094    0.812   0.0000    0.000    0.000   
2008-09-17      0    0.000    0.552    0.448  -0.8126    0.000    0.200   
2008-09-18      1    0.000    0.674    0.326  -0.7351    0.000    0.000   
2008-09-19      1    0.000    0.310    0.690  -0.5574    0.000    0.620   
...           ...      ...      ...      ...      ...      ...      ...   
2016-05-20      1    0.000    0.000    1.000   0.0000    0.000    0.189   
2016-05-23      0    0.070    0.318    0.612  -0.9451    0.000    0.328   
2016-05-24      1    0.000    0.247    0.753  -0.5574    0.140    0.000   
2016-05-25      1    0.000    0.000    1.000   0.0000    0.000    0.279   
2016-05-26      0    0.000    0.000    1.000   0.0000    0.000    0.383   
2016-05-27      1    0.255    0.109    0.637   0.5423    0.000    0.179   
2016-05-31      0    0.167    0.061    0.772   0.5574    0.060    0.049   
2016-06-01      1    0.000    0.262    0.738  -0.7506    0.040    0.180   
2016-06-02      1    0.000    0.000    1.000   0.0000    0.206    0.186   
2016-06-03      0    0.068    0.221    0.711  -0.7003    0.203    0.173   
2016-06-06      1    0.000    0.000    1.000   0.0000    0.073    0.256   
2016-06-07      1    0.000    0.266    0.734  -0.6908    0.111    0.160   
2016-06-08      1    0.085    0.141    0.775  -0.1531    0.121    0.000   
2016-06-09      0    0.000    0.314    0.686  -0.7506    0.000    0.000   
2016-06-10      0    0.126    0.246    0.629  -0.4767    0.000    0.000   
2016-06-13      0    0.032    0.188    0.780  -0.8126    0.000    0.000   
2016-06-14      0    0.000    0.000    1.000   0.0000    0.352    0.000   
2016-06-15      0    0.000    0.000    1.000   0.0000    0.000    0.186   
2016-06-16      1    0.050    0.285    0.665  -0.8591    0.156    0.061   
2016-06-17      0    0.000    0.597    0.403  -0.8126    0.000    0.444   
2016-06-20      1    0.000    0.000    1.000   0.0000    0.096    0.000   
2016-06-21      1    0.000    0.059    0.941  -0.3400    0.000    0.167   
2016-06-22      0    0.173    0.346    0.481  -0.4215    0.000    0.321   
2016-06-23      1    0.175    0.075    0.750   0.3818    0.107    0.171   
2016-06-24      0    0.000    0.231    0.769  -0.3400    0.000    0.130   
2016-06-27      0    0.154    0.217    0.629  -0.2263    0.174    0.000   
2016-06-28      1    0.475    0.153    0.372   0.8316    0.073    0.134   
2016-06-29      1    0.000    0.000    1.000   0.0000    0.000    0.277   
2016-06-30      1    0.000    0.000    1.000   0.0000    0.106    0.226   
2016-07-01      1    0.000    0.110    0.890  -0.5574    0.000    0.146   

            neuTop2  comTop2  posTop3    ...     neuTop23  comTop23  posTop24  \
Date                                     ...                                    
2008-08-08    1.000   0.0000    0.000    ...        0.860   -0.2755     0.000   
2008-08-11    0.723  -0.3182    0.209    ...        0.753   -0.3182     0.263   
2008-08-12    1.000   0.0000    0.000    ...        0.785    0.3818     0.000   
2008-08-13    0.751  -0.8074    0.120    ...        0.515   -0.5719     0.000   
2008-08-14    0.672  -0.5994    0.384    ...        0.571    0.1779     0.000   
2008-08-15    0.619   0.5719    0.178    ...        0.761   -0.2960     0.000   
2008-08-18    0.722   0.4023    0.000    ...        0.909   -0.0516     0.211   
2008-08-19    0.903   0.1027    0.000    ...        0.598   -0.6908     0.000   
2008-08-20    0.734  -0.4767    0.000    ...        0.737   -0.3612     0.199   
2008-08-21    0.640  -0.6705    0.000    ...        0.755   -0.5994     0.000   
2008-08-22    0.933   0.2716    0.088    ...        1.000    0.0000     0.000   
2008-08-25    0.602  -0.5106    0.091    ...        0.319   -0.7506     0.000   
2008-08-26    0.619  -0.6486    0.000    ...        0.481   -0.6597     0.000   
2008-08-27    0.727  -0.4588    0.045    ...        0.602    0.5106     0.000   
2008-08-28    0.777  -0.3182    0.123    ...        0.630    0.4466     0.000   
2008-08-29    1.000   0.0000    0.000    ...        1.000    0.0000     0.000   
2008-09-02    0.854  -0.5574    0.126    ...        1.000    0.0000     0.000   
2008-09-03    1.000   0.0000    0.202    ...        1.000    0.0000     0.000   
2008-09-04    0.787  -0.5859    0.105    ...        0.517   -0.6808     0.193   
2008-09-05    1.000   0.0000    0.000    ...        0.812   -0.4215     0.083   
2008-09-08    0.541  -0.4767    0.185    ...        0.816   -0.6705     0.000   
2008-09-09    1.000   0.0000    0.000    ...        0.795   -0.4754     0.000   
2008-09-10    0.766  -0.7269    0.000    ...        0.526   -0.6597     0.000   
2008-09-11    0.860  -0.4767    0.000    ...        1.000    0.0000     0.213   
2008-09-12    1.000   0.0000    0.050    ...        0.798    0.0258     0.000   
2008-09-15    0.481  -0.4767    0.000    ...        0.301   -0.8750     0.000   
2008-09-16    1.000   0.0000    0.000    ...        0.652   -0.5859     0.000   
2008-09-17    0.800  -0.1280    0.000    ...        0.571   -0.6705     0.000   
2008-09-18    1.000   0.0000    0.000    ...        0.798   -0.5106     0.000   
2008-09-19    0.380  -0.8860    0.000    ...        1.000    0.0000     0.000   
...             ...      ...      ...    ...          ...       ...       ...   
2016-05-20    0.811  -0.6486    0.000    ...        0.612   -0.8974     0.000   
2016-05-23    0.672  -0.5261    0.000    ...        0.678   -0.5859     0.000   
2016-05-24    0.860   0.0772    0.096    ...        0.669    0.8360     0.094   
2016-05-25    0.721  -0.7003    0.134    ...        0.649   -0.4019     0.000   
2016-05-26    0.617  -0.7351    0.323    ...        0.474   -0.8750     0.124   
2016-05-27    0.821  -0.3400    0.000    ...        0.876   -0.5267     0.000   
2016-05-31    0.891   0.1280    0.100    ...        0.764    0.5267     0.123   
2016-06-01    0.780  -0.7096    0.000    ...        0.739   -0.4767     0.174   
2016-06-02    0.608   0.2484    0.113    ...        0.864   -0.2617     0.000   
2016-06-03    0.624   0.3612    0.000    ...        0.887   -0.1027     0.000   
2016-06-06    0.671  -0.6124    0.075    ...        0.542   -0.8225     0.000   
2016-06-07    0.729  -0.3400    0.138    ...        1.000    0.0000     0.000   
2016-06-08    0.879   0.0258    0.135    ...        0.543   -0.7096     0.041   
2016-06-09    1.000   0.0000    0.206    ...        0.680   -0.6808     0.000   
2016-06-10    1.000   0.0000    0.000    ...        0.758   -0.2500     0.268   
2016-06-13    1.000   0.0000    0.185    ...        0.631   -0.9081     0.000   
2016-06-14    0.648   0.5859    0.000    ...        0.813    0.3182     0.000   
2016-06-15    0.814  -0.4391    0.079    ...        1.000    0.0000     0.000   
2016-06-16    0.782   0.5267    0.000    ...        0.692   -0.5574     0.000   
2016-06-17    0.556  -0.8020    0.000    ...        0.801   -0.7351     0.407   
2016-06-20    0.904   0.2023    0.167    ...        0.816    0.2500     0.075   
2016-06-21    0.833  -0.3400    0.000    ...        0.622   -0.9241     0.000   
2016-06-22    0.679  -0.9231    0.000    ...        1.000    0.0000     0.000   
2016-06-23    0.722  -0.4802    0.000    ...        0.641    0.4215     0.000   
2016-06-24    0.870  -0.0516    0.000    ...        0.902   -0.0516     0.000   
2016-06-27    0.826   0.2732    0.000    ...        0.611   -0.6249     0.087   
2016-06-28    0.793  -0.4588    0.346    ...        1.000    0.0000     0.000   
2016-06-29    0.723  -0.6808    0.459    ...        0.667    0.2023     0.264   
2016-06-30    0.667  -0.6870    0.213    ...        0.798   -0.8720     0.000   
2016-07-01    0.854  -0.0516    0.252    ...        0.623   -0.9618     0.000   

            negTop24  neuTop24  comTop24  posTop25  negTop25  neuTop25  \
Date                                                                     
2008-08-08     0.650     0.350   -0.8519     0.303     0.247     0.449   
2008-08-11     0.323     0.414   -0.1832     0.000     0.000     1.000   
2008-08-12     0.000     1.000    0.0000     0.000     0.000     1.000   
2008-08-13     0.177     0.823   -0.4215     0.000     0.231     0.769   
2008-08-14     0.573     0.427   -0.6908     0.349     0.000     0.651   
2008-08-15     0.000     1.000    0.0000     0.000     0.000     1.000   
2008-08-18     0.000     0.789    0.3400     0.000     0.320     0.680   
2008-08-19     0.000     1.000    0.0000     0.000     0.000     1.000   
2008-08-20     0.199     0.602    0.2500     0.000     0.000     1.000   
2008-08-21     0.632     0.368   -0.8625     0.000     0.000     1.000   
2008-08-22     0.000     1.000    0.0000     0.437     0.000     0.563   
2008-08-25     0.540     0.460   -0.6908     0.000     0.213     0.787   
2008-08-26     0.505     0.495   -0.8807     0.258     0.430     0.313   
2008-08-27     0.000     1.000    0.0000     0.000     0.000     1.000   
2008-08-28     0.328     0.672   -0.5994     0.000     0.554     0.446   
2008-08-29     0.217     0.783   -0.7269     0.000     0.223     0.777   
2008-09-02     0.000     1.000    0.0000     0.000     0.550     0.450   
2008-09-03     0.000     1.000    0.0000     0.313     0.000     0.687   
2008-09-04     0.164     0.643    0.1027     0.000     0.432     0.568   
2008-09-05     0.273     0.645   -0.8926     0.485     0.000     0.515   
2008-09-08     0.494     0.506   -0.5994     0.000     0.000     1.000   
2008-09-09     0.000     1.000    0.0000     0.000     0.000     1.000   
2008-09-10     0.208     0.792   -0.6369     0.000     0.000     1.000   
2008-09-11     0.157     0.630    0.1779     0.000     0.000     1.000   
2008-09-12     0.149     0.851   -0.1027     0.209     0.171     0.620   
2008-09-15     0.000     1.000    0.0000     0.000     0.157     0.843   
2008-09-16     0.000     1.000    0.0000     0.000     0.239     0.761   
2008-09-17     0.559     0.441   -0.8225     0.000     0.333     0.667   
2008-09-18     0.000     1.000    0.0000     0.000     0.000     1.000   
2008-09-19     0.000     1.000    0.0000     0.000     0.561     0.439   
...              ...       ...       ...       ...       ...       ...   
2016-05-20     0.000     1.000    0.0000     0.000     0.527     0.473   
2016-05-23     0.495     0.505   -0.5994     0.000     0.583     0.417   
2016-05-24     0.177     0.729   -0.4939     0.000     0.168     0.832   
2016-05-25     0.000     1.000    0.0000     0.000     0.310     0.690   
2016-05-26     0.298     0.579   -0.4767     0.000     0.286     0.714   
2016-05-27     0.333     0.667   -0.4588     0.000     0.203     0.797   
2016-05-31     0.000     0.877    0.5994     0.000     0.000     1.000   
2016-06-01     0.000     0.826    0.2732     0.075     0.466     0.460   
2016-06-02     0.291     0.709   -0.5719     0.314     0.148     0.538   
2016-06-03     0.293     0.707   -0.4404     0.000     0.000     1.000   
2016-06-06     0.243     0.757   -0.6705     0.000     0.088     0.912   
2016-06-07     0.000     1.000    0.0000     0.000     0.000     1.000   
2016-06-08     0.056     0.902   -0.1531     0.123     0.374     0.503   
2016-06-09     0.000     1.000    0.0000     0.101     0.000     0.899   
2016-06-10     0.000     0.732    0.2960     0.000     0.145     0.855   
2016-06-13     0.244     0.756   -0.4404     0.359     0.000     0.641   
2016-06-14     0.000     1.000    0.0000     0.000     0.225     0.775   
2016-06-15     0.325     0.675   -0.6486     0.000     0.438     0.563   
2016-06-16     0.188     0.812   -0.6249     0.096     0.000     0.904   
2016-06-17     0.000     0.593    0.5859     0.000     0.438     0.563   
2016-06-20     0.043     0.882    0.3400     0.000     0.000     1.000   
2016-06-21     0.000     1.000    0.0000     0.088     0.181     0.731   
2016-06-22     0.198     0.802   -0.8779     0.000     0.000     1.000   
2016-06-23     0.000     1.000    0.0000     0.094     0.000     0.906   
2016-06-24     0.000     1.000    0.0000     0.024     0.170     0.805   
2016-06-27     0.047     0.866    0.2500     0.158     0.000     0.842   
2016-06-28     0.750     0.250   -0.7184     0.089     0.335     0.576   
2016-06-29     0.176     0.560    0.2732     0.000     0.000     1.000   
2016-06-30     0.231     0.769   -0.5423     0.000     0.526     0.474   
2016-07-01     0.278     0.722   -0.9432     0.000     0.073     0.927   

            comTop25  
Date                  
2008-08-08    0.1280  
2008-08-11    0.0000  
2008-08-12    0.0000  
2008-08-13   -0.3400  
2008-08-14    0.7096  
2008-08-15    0.0000  
2008-08-18   -0.5106  
2008-08-19    0.0000  
2008-08-20    0.0000  
2008-08-21    0.0000  
2008-08-22    0.4767  
2008-08-25   -0.4019  
2008-08-26   -0.4939  
2008-08-27    0.0000  
2008-08-28   -0.7351  
2008-08-29   -0.3182  
2008-09-02   -0.7269  
2008-09-03    0.4927  
2008-09-04   -0.5859  
2008-09-05    0.5719  
2008-09-08    0.0000  
2008-09-09    0.0000  
2008-09-10    0.0000  
2008-09-11    0.0000  
2008-09-12    0.1280  
2008-09-15   -0.3818  
2008-09-16   -0.2960  
2008-09-17   -0.5423  
2008-09-18    0.0000  
2008-09-19   -0.7506  
...              ...  
2016-05-20   -0.8316  
2016-05-23   -0.8689  
2016-05-24   -0.7564  
2016-05-25   -0.5574  
2016-05-26   -0.4939  
2016-05-27   -0.7506  
2016-05-31    0.0000  
2016-06-01   -0.7783  
2016-06-02    0.5719  
2016-06-03    0.0000  
2016-06-06   -0.4588  
2016-06-07    0.0000  
2016-06-08   -0.6705  
2016-06-09    0.5423  
2016-06-10   -0.3400  
2016-06-13    0.6808  
2016-06-14   -0.4939  
2016-06-15   -0.7906  
2016-06-16    0.1531  
2016-06-17   -0.7906  
2016-06-20    0.0000  
2016-06-21   -0.3400  
2016-06-22    0.0000  
2016-06-23    0.3167  
2016-06-24   -0.8316  
2016-06-27    0.1280  
2016-06-28   -0.9349  
2016-06-29    0.0000  
2016-06-30   -0.8750  
2016-07-01   -0.6249  

[1989 rows x 101 columns]
Done!