notebook.community

Edit and run



In [55]:

    
#*****************PROJECT YARK*********************/
#* Ariel Boris Dexter bad225@nyu.edu */
#* Kania Azrina ka1531@nyu.edu       */
#* Michael Rawson mr4209             */
#* Yixue Wang yw1819@nyu.edu         */
#**************************************************/

%matplotlib inline

#import required packages
import sys
import datetime
import csv
import math
import pandas as pd 
import numpy as np 
from scipy import stats 
import statsmodels.formula.api as sm
import matplotlib.pyplot as plt
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from mpltools import style
from mpltools import layout
from pandas.tools.plotting import autocorrelation_plot
import json



In [234]:

    
#import news from pig
news = pd.read_csv('../data/extracted_topics_refined', sep='	', names=['CountryID', 'SequenceID', 'Timestamp',
                                                          'Title','Story','Keywords','Country','Region', 'Count'],header=True)
news['Timestamp'] = pd.to_datetime(news['Timestamp'].str[:10], format = '%Y-%m-%d')
news['Timestamp'] = news['Timestamp'].values.astype('M8[D]')
news['Keywords'] = news['Keywords'].astype(str)
news['Keywords'] = news['Keywords'].apply(lambda x :x.split(" "))

#load news count
news_count = pd.read_csv('../data/keyword_list_sorted.csv', sep=',', names=['ID', 'Topic', 'Count'],header=True)

#load treasury data (and percent changes)
treasury = pd.read_csv('../data/treasury.csv', names=['Date', 'PercentChange'],header=True, parse_dates=True)
treasury['Date'] = pd.to_datetime(treasury['Date'].str[:10], format = '%Y-%m-%d')
treasury['PercentChange'] = treasury['PercentChange'].convert_objects(convert_numeric=True)
treasury = treasury.set_index(pd.DatetimeIndex(treasury['Date']))



In [235]:

    
treasury.head()









    Out[235]:






  
    
      
      Date
      PercentChange
    
  
  
    
      2000-01-04
      2000-01-04
      -1.36778
    
    
      2000-01-05
      2000-01-05
       2.00308
    
    
      2000-01-06
      2000-01-06
      -0.75529
    
    
      2000-01-07
      2000-01-07
      -0.76104
    
    
      2000-01-10
      2000-01-10
       0.76687



In [236]:

    
news.head()









    Out[236]:






  
    
      
      CountryID
      SequenceID
      Timestamp
      Title
      Story
      Keywords
      Country
      Region
      Count
    
  
  
    
      0
        27
      -16464
      1954-12-04
       NaN
       In Northern Bahr El Ghazal and Warrap States, ...
       [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 2...
               Sudan
        MENA
       38
    
    
      1
        27
       -3866
      1989-06-01
       NaN
       A joint UN and AU team led by Jean-Marie Guehe...
                                [28, 29, 30, 31, 32, 33]
               Sudan
        MENA
       17
    
    
      2
        60
        -582
      1998-05-29
       NaN
       Eritrea on May 28, 1998, mobilised thousands o...
                                            [34, 35, 36]
             Eritrea
       ESARO
        8
    
    
      3
       145
         298
      2000-10-26
       NaN
                                      militants du FPI."
                                                    [37]
       Cote d'Ivoire
       WCARO
        2
    
    
      4
       145
         991
      2002-09-19
       NaN
       Following the armed insurgency of 19 September...
                                    [38, 39, 40, 41, 42]
       Cote d'Ivoire
       WCARO
       14



In [251]:

    
keywords_list = news['Keywords'].tolist()
keywords_list_int = []

for keyword in keywords_list :
    key_list_int = []
    for k in keyword[:5] :  
        if (k != 'nan') :
            k = int(k)
        else :
            k = 0
        key_list_int.append(k)
    keywords_list_int.append(key_list_int)



In [251]:



In [252]:

    
#for each topic, get the dates where it occurs and its country
def getDateCount(topic) :
    dates = []
    for keywords in keywords_list_int:
        if topic in keywords :
            dates.append(news['Timestamp'].loc[keywords_list_int.index(keywords)])
    datesCount = {}
    for date in dates :
        if date not in datesCount:
            datesCount[date] = 1
        else :
            datesCount[date] += 1
    return datesCount



In [253]:

    
def getChangeAvg(dates):
    valueList = []
    for date in dates :
        newDate = date.strftime('%Y-%m-%d')   
        indexList = treasury[treasury['Date']== newDate].index.tolist()
        newTre = treasury.loc[indexList]
        newlist = newTre['PercentChange'].tolist()
        if (len(newlist)!=0):
            valueList.append(newlist[0])
    return np.nanmean(valueList)



In [ ]:

    
topic_avg = []
frequentTopics = news_count['ID'].tolist()

#for each topic, get the average percent changes
for topic in frequentTopics :
    avg = 0
    dates = getDateCount(topic)
    avg = getChangeAvg(dates)
    topic_avg.append(avg)



In [249]:



In [249]:

    
#store to csv
news_count_change = news_count[:100]
print len(news_count_change)
news_count_change['Changes'] = topic_avg
news_count_change = news_count_change

news_count_change.to_csv('../data/keyword_list_count.csv',index=False)









    



100






    



-c:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [ ]:

	Date	PercentChange
2000-01-04	2000-01-04	-1.36778
2000-01-05	2000-01-05	2.00308
2000-01-06	2000-01-06	-0.75529
2000-01-07	2000-01-07	-0.76104
2000-01-10	2000-01-10	0.76687

	CountryID	SequenceID	Timestamp	Title	Story	Keywords	Country	Region	Count
0	27	-16464	1954-12-04	NaN	In Northern Bahr El Ghazal and Warrap States, ...	[15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 2...	Sudan	MENA	38
1	27	-3866	1989-06-01	NaN	A joint UN and AU team led by Jean-Marie Guehe...	[28, 29, 30, 31, 32, 33]	Sudan	MENA	17
2	60	-582	1998-05-29	NaN	Eritrea on May 28, 1998, mobilised thousands o...	[34, 35, 36]	Eritrea	ESARO	8
3	145	298	2000-10-26	NaN	militants du FPI."	[37]	Cote d'Ivoire	WCARO	2
4	145	991	2002-09-19	NaN	Following the armed insurgency of 19 September...	[38, 39, 40, 41, 42]	Cote d'Ivoire	WCARO	14