In [55]:
#*****************PROJECT YARK*********************/
#* Ariel Boris Dexter bad225@nyu.edu */
#* Kania Azrina ka1531@nyu.edu       */
#* Michael Rawson mr4209             */
#* Yixue Wang yw1819@nyu.edu         */
#**************************************************/

%matplotlib inline

#import required packages
import sys
import datetime
import csv
import math
import pandas as pd 
import numpy as np 
from scipy import stats 
import statsmodels.formula.api as sm
import matplotlib.pyplot as plt
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from mpltools import style
from mpltools import layout
from pandas.tools.plotting import autocorrelation_plot
import json

In [234]:
# Load the three input tables used by the rest of the notebook.
#
# NOTE(review): the original calls passed header=True. A bool is not a valid
# header argument (newer pandas raises TypeError); on the pandas version this
# notebook was written for it was interpreted as header=1, i.e. physical
# line 1 is the header row and line 0 is skipped as well. header=1 keeps that
# exact behaviour explicitly. If the files only have a single header line,
# this drops the first data record -- confirm whether header=0 was intended.

#import news from pig (tab-separated export)
news = pd.read_csv('../data/extracted_topics_refined', sep='\t',
                   names=['CountryID', 'SequenceID', 'Timestamp',
                          'Title', 'Story', 'Keywords', 'Country', 'Region', 'Count'],
                   header=1)
# Keep only the leading YYYY-MM-DD part of the raw timestamp string.
news['Timestamp'] = pd.to_datetime(news['Timestamp'].str[:10], format = '%Y-%m-%d')
# Truncate to day resolution (already date-only after the slice above).
news['Timestamp'] = news['Timestamp'].values.astype('M8[D]')
# Keywords arrive as one space-separated string of topic ids; missing values
# become the string 'nan' here and are handled by the conversion cell below.
news['Keywords'] = news['Keywords'].astype(str)
news['Keywords'] = news['Keywords'].apply(lambda x :x.split(" "))

#load news count
news_count = pd.read_csv('../data/keyword_list_sorted.csv', sep=',', names=['ID', 'Topic', 'Count'], header=1)

#load treasury data (and percent changes)
treasury = pd.read_csv('../data/treasury.csv', names=['Date', 'PercentChange'], header=1, parse_dates=True)
treasury['Date'] = pd.to_datetime(treasury['Date'].str[:10], format = '%Y-%m-%d')
# Coerce PercentChange to numeric, turning unparseable entries into NaN.
# convert_objects() is deprecated/removed in newer pandas, so prefer
# pd.to_numeric when it exists and fall back to the legacy call otherwise.
if hasattr(pd, 'to_numeric'):
    treasury['PercentChange'] = pd.to_numeric(treasury['PercentChange'], errors='coerce')
else:
    treasury['PercentChange'] = treasury['PercentChange'].convert_objects(convert_numeric=True)
# Index by date (the 'Date' column is kept as well; getChangeAvg filters on it).
treasury = treasury.set_index(pd.DatetimeIndex(treasury['Date']))

In [235]:
# Preview the first rows of the treasury frame (date index, Date, PercentChange).
treasury.head()


Out[235]:
Date PercentChange
2000-01-04 2000-01-04 -1.36778
2000-01-05 2000-01-05 2.00308
2000-01-06 2000-01-06 -0.75529
2000-01-07 2000-01-07 -0.76104
2000-01-10 2000-01-10 0.76687

In [236]:
# Preview the first rows of the news frame; Keywords is now a list of id strings.
news.head()


Out[236]:
CountryID SequenceID Timestamp Title Story Keywords Country Region Count
0 27 -16464 1954-12-04 NaN In Northern Bahr El Ghazal and Warrap States, ... [15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 2... Sudan MENA 38
1 27 -3866 1989-06-01 NaN A joint UN and AU team led by Jean-Marie Guehe... [28, 29, 30, 31, 32, 33] Sudan MENA 17
2 60 -582 1998-05-29 NaN Eritrea on May 28, 1998, mobilised thousands o... [34, 35, 36] Eritrea ESARO 8
3 145 298 2000-10-26 NaN militants du FPI." [37] Cote d'Ivoire WCARO 2
4 145 991 2002-09-19 NaN Following the armed insurgency of 19 September... [38, 39, 40, 41, 42] Cote d'Ivoire WCARO 14

In [251]:
# Convert each story's keyword tokens to integer topic ids.
# Only the first five tokens of every story are kept, and the placeholder
# string 'nan' (what str() yields for a missing Keywords value) maps to 0.
keywords_list = news['Keywords'].tolist()
keywords_list_int = [
    [0 if token == 'nan' else int(token) for token in tokens[:5]]
    for tokens in keywords_list
]

In [251]:


In [252]:
#for each topic, get the dates where it occurs and how often per date
def getDateCount(topic) :
    """Count, per date, the stories whose keyword list contains `topic`.

    Reads the notebook-level `keywords_list_int` (one list of integer topic
    ids per story) and `news['Timestamp']` (the story dates, in the same row
    order).

    Parameters
    ----------
    topic : int
        Topic id to look for.

    Returns
    -------
    dict
        Maps each date on which the topic occurs to the number of stories
        on that date that mention it. Empty if the topic never occurs.
    """
    datesCount = {}
    # Walk stories and their dates in lockstep. The previous implementation
    # recovered the row with keywords_list_int.index(keywords), which returns
    # the FIRST story with an equal keyword list, so stories sharing the same
    # keywords were all attributed to the earliest one's date (and each
    # lookup rescanned the whole list).
    for timestamp, keywords in zip(news['Timestamp'], keywords_list_int):
        if topic in keywords :
            datesCount[timestamp] = datesCount.get(timestamp, 0) + 1
    return datesCount

In [253]:
def getChangeAvg(dates):
    """Average the treasury percent change over the given dates.

    Each date is reduced to its YYYY-MM-DD form and matched against the
    notebook-level `treasury` frame's 'Date' column. Dates with no treasury
    row are skipped; when several rows match, only the first is used.
    Iterating a dict passes its keys, so per-date counts are not used here.

    Returns the NaN-ignoring mean of the collected PercentChange values
    (np.nanmean of an empty list when nothing matched).
    """
    first_changes = []
    for day in dates:
        day_key = day.strftime('%Y-%m-%d')
        same_day_rows = treasury[treasury['Date'] == day_key]
        changes = same_day_rows['PercentChange'].tolist()
        if changes:
            first_changes.append(changes[0])
    return np.nanmean(first_changes)

In [ ]:
# Topic ids to evaluate, in the row order of news_count.
frequentTopics = news_count['ID'].tolist()

#for each topic, get the average percent changes
# (dates on which the topic occurs -> mean treasury change over those dates;
# one entry per topic, in the same order as frequentTopics)
topic_avg = [getChangeAvg(getDateCount(topic)) for topic in frequentTopics]

In [249]:


In [249]:
#store to csv
# Work on an explicit copy of the first 100 rows: assigning a new column to a
# bare slice of news_count triggers pandas' SettingWithCopyWarning.
news_count_change = news_count[:100].copy()
print(len(news_count_change))
# topic_avg holds one value per news_count row, in row order, so keep only
# the leading entries that correspond to the retained rows. Without the
# slice the assignment fails whenever news_count has more than 100 rows.
news_count_change['Changes'] = topic_avg[:len(news_count_change)]

news_count_change.to_csv('../data/keyword_list_count.csv',index=False)


100
-c:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [ ]: